@string{OSDI94 = "Proc. of the First Symposium on Operating Systems Design and Implementation"}

@InProceedings{Waldspurger:osdi94,
  author = "Carl A. Waldspurger and William E. Weihl",
  title = "Lottery Scheduling: Flexible Proportional-Share Resource
		  Management",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "1--11",
  abstract = "
    This paper presents lottery scheduling, a novel randomized resource
    allocation mechanism.  Lottery scheduling provides efficient,
    responsive control over the relative execution rates of
    computations. Such control is beyond the capabilities of
    conventional schedulers, and is desirable in systems that service
    requests of varying importance, such as databases, media-based
    applications, and networks. Lottery scheduling also supports
    modular resource management by enabling concurrent modules to
    insulate their resource allocation policies from one another. A
    currency abstraction is introduced to flexibly name, share, and
    protect resource rights. We also show that lottery scheduling can
    be generalized to manage many diverse resources, such as I/O
    bandwidth, memory, and access to locks. We have implemented a
    prototype lottery scheduler for the Mach 3.0 microkernel, and found
    that it provides flexible and responsive control over the relative
    execution rates of a wide range of applications. The overhead
    imposed by our unoptimized prototype is comparable to that of the
    standard Mach timesharing policy.
"
}

@InProceedings{Weiser:osdi94,
  author = "Mark Weiser and Alan Demers and Brent Welch and Scott Shenker",
  title = "Scheduling for Reduced {CPU} Energy",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "13--23",
  abstract = "
    The energy usage of computer systems is becoming more important,
    especially for battery operated systems. Displays, disks, and cpus,
    in that order, use the most energy. Reducing the energy used by
    displays and disks has been studied elsewhere; this paper considers
    a new method for reducing the energy used by the cpu.  We introduce
    a new metric for cpu energy performance,
    millions-of-instructions-per-joule (MIPJ). We examine a class of
    methods to reduce MIPJ that are characterized by dynamic control of
    system clock speed by the operating system scheduler. Reducing
    clock speed alone does not reduce MIPJ, since to do the same work
    the system must run longer. However, a number of methods are
    available for reducing energy with reduced clock-speed, such as
    reducing the voltage [Chandrakasan {\em et al| 1992] [Horowitz
    1993] or using reversible [Younis and Knight 1993] or adiabatic
    logic [Athas {\em et al} 1994].
    \par
    What are the right scheduling algonthms for taking advantage of
    reduced clock-speed, especially in the presence of applications
    demanding ever more instructions-per-second?  We consider several
    methods for varying the clock speed dynamically under control of
    the operating system, and examine the performance of these methods
    against workstation traces. The primary result is that by adjusting
    the clock speed at a fine grain, substantial CPU energy can be
    saved with a limited impact on performance.
"
}

@InProceedings{Douglis:osdi94,
  author = "Fred Douglis and Ramon Caceres and Frans Kaashoek and Kai Li
		  and Brian Marsh and Joshua Tauber",
  title = "Storage Alternatives for Mobile Computers",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "25--37",
  abstract = "
    Mobile computers such as notebooks, subnotebooks, and palmtops
    require low weight, low power consumption, and good interactive
    performance. These requirements impose many challenges on
    architectures and operating systems. This paper investigates three
    alternative storage devices for mobile computers: magnetic hard
    disks, flash memory disk emulators, and flash memory cards.
    \par
    We have used hardware measurements and trace-driven simulation to
    evaluate each of the alternative storage devices and their related
    design strategies. Hardware measurements on an HP OmniBook 300
    highlight differences in the performance of the three devices as
    used on the Omnibook, especially the poor performance of version
    2.00 of the Microsoft Flash File System [11] when accessing large
    files. The traces used in our study came from different
    environments, including mobile computers (Macintosh PowerBooks) and
    desktop computers (running Windows or HPUX), as well as synthetic
    workloads.  Our simulation study shows that flash memory can reduce
    energy consumption by an order of magnitude, compared to magnetic
    disk, while providing good read performance and acceptable write
    performance. These energy savings can translate into a 22\%
    extension of battery life. We also find that the amount of unused
    memory in a flash memory card has a substantial impact on energy
    consumption, performance, and endurance: compared to low storage
    utilizations (40\% full), running flash memory near its capacity
    (95\% full) can increase energy consumption by 70-190\%, degrade
    write response time by 30\%, and decrease the lifetime of the
    memory card by up to a third. For flash disks, asynchronous erasure
    can improve write response time by a factor of 2.5.
"
}

@InProceedings{OtooleShrira:osdi94,
  author = "James O'Toole and Liuba Shrira",
  title = "Opportunistic Log: Efficient Installation Reads in a
		  Reliable Storage Server",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "39--48",
  abstract = "
    In a distributed storage system clielit caches managed on the basis
    of small granularity objects can provide better memory utilization
    then page-based caches. However, object servers, unlike page
    servers, must perform additional disk reads. These {\em
    installation reads} are required to install modified objects onto
    their conresponding disk pages. The opportunistic log is a new
    technique that significantly reduces the cost of installation
    reads. It defers the installation reads, removing them from the
    modification commit path, and manages a large pool of pending
    installation reads that can be scheduled efficiently.
    \par
    Using simulations, we show that the opportunistic log substantially
    enhances the I/O performance of reliable storage servers. An object
    server without the opportunistic log requires much better client
    caching to outperform a page server. With an opportunistic log,
    only a small client cache improvement suffices.
    \par
    Our results imply that efficient scheduling of installation reads
    can substantially improve the performance of larege-scale storage
    systems and therefore introduces a new perfonnance tradeoff between
    page-based and object-based architectures.
"
}

@InProceedings{Ganger:osdi94,
  author = "Gregory R. Ganger and Yale N. Patt",
  title = "Metadata Update Performance in File Systems",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "49--60",
  abstract = "
    Structural changes, such as file creation and block allocation,
    have consistently been identified as file system performance
    problems in many user environments. We compare several
    implementations that maintain metadata integrity in the event of a
    system failure but do not require changes to the on-disk
    structures. In one set of schemes, the file system uses
    asynchronous writes and passes ordering requirements to the disk
    scheduler. These scheduler-enforced ordering schemes outperform the
    conventional approach (synchronous writes) by more than 30 percent
    for metadata update intensive benchmarks, but are suboptimal mainly
    due to their inability to safely use delayed writes when ordering
    is required. We therefore introduce soft updates, an implementation
    that asymptotically approaches memory-based file system performance
    (within 5 percent) while providing stronger integrity and security
    guarantees than most UNIX file systems.  For metadata update
    intensive benchmarks, this improves performance by more than a
    factor of two when compared to the conventional approach.
"
}

@InProceedings{Kotz:osdi94,
  author = "David Kotz",
  title = "Disk-directed {I/O} for {MIMD} Multiprocessors",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "61--74",
  abstract = "
    Many scientific applications that run on today's multiprocessors,
    such as weather forecasting and seismic analysis, are bottlenecked
    by their file-I/O needs. Even if the multiprocessor is configured
    with sufficient I/O hardware, the file-system software often fails
    to provide the available bandwidth to the application. Although
    libraries and enhanced file-system interfaces can make a
    significant improvement, we believe that fundamental changes are
    needed in the file-server software. We propose a new technique,
    disk-directed I/O, to allow the disk servers to determine the flow
    of data for maximum performance. Our simulations show that
    tremendous performance gains are possible. Indeed, disk-directed
    I/O provided consistent high periormance that was largely
    independent of data distribution, obtained up to 93\% of peak disk
    bandwidth, and was as much as 16 times faster than traditional
    parallel file systems.
"
}

@InProceedings{Koch:osdi94,
  author = "Povl T. Koch and Robert J. Fowler and Eric Jul",
  title = "Message-Driven Relaxed Consistency in a Software
		  Distributed Shared Memory",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "75--85",
  abstract = "
    Message-passing and distributed shared memory have their respective
    advantages and disadvantages in distributed parallel
    programming. We approach the problem of integrating both mechanisms
    into a single system by proposing a new {\em message-driven
    coherency} mechanism. Messages carrying explicit causality
    annotations are exchanged to trigger memory coherency actions. By
    adding annotations to standard message-based protocols, it is easy
    to construct efficient implementations of common synchronization
    and communication mechanisms.  Because these are user-level
    messages, the set of available primitives is extended easily with
    language- or application-specific mechanisms. CarlOS, an
    experimental prototype for evaluating this approach, is derived
    from the lazy release consistent memory of TreadMarks. We describe
    the message-driven coherency memory model used in CarlOS, and we
    examine the performance of several applications.
"
}

@InProceedings{Zekauskas:osdi94,
  author = "Matthew J. Zekauskas and Wayne A. Sawdon and Brian
		  N. Bershad",
  title = "Software Write Detection for Distributed Shared Memory",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "87--100",
  abstract = "
     Most software-hased distributed shared memory (DSM) systems rely
    on the operating system's virtual memory interface to detect writes
    to shared data. Strategies based on virtual memory page protection
    create two problems for a DSM system.  First, writes can have high
    overhead since they are detected with a page fault. As a result, a
    page must be written many times to amortize the cost of that
    fault. Second, the size at a virtual memory page is too big to serve
    as a unit of coherency, inducing false sharing. Mechanisms to
    handle false sharing can increase runtime overhead and may cause
    data to he unnecessarily communicated between processors.
    \par
    In this paper we present a new method for write detection that
    solves these problems. Our method relies on the compiler and
    runtime system to detect writes to shared data without invoking the
    operating system.  We measure and compare implementations of a
    distributed shared memory system using both strategies, virtual
    memory and compiler/runtime, running a range of applications on a
    small scale distributed memory multicomputer. We show that the new
    method has low average write latency and supports fine-grained
    sharing with low overhead. Further, we show that the dominant cost
    of write detection with either strategy is due to the mechanism
    used to handle fine-grain sharing.
"
}

@InProceedings{Scales:osdi94,
  author = "Daniel J. Scales and Monica S. Lam",
  title = "The Design and Evaluation of a Shared Object System for
		  Distributed Memory Machines",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "101--114",
  abstract = "
    This paper describes the design and evaluation of SAM, a shared
    object system for distributed memory machines. SAM is a portable
    run-time system that provides a global name space and automatic
    caching of shared data. SAM incorporates mechanisms to address the
    problem of high communication overheads on distributed memory
    machines; these mechanisms include tying synchronization to data
    access, chaotic access to data, prefetching of data, and pushing of
    data to remote processors. SAM has been implemented on the CM-5,
    Intel iPSC/860 and Paragon, IBM SPI, and networks of workstations
    running PVM. SAM applications run on all these platforms without
    modification.
    \par
    This paper provides an extensive analysis on several complex
    scientific algorithms written in SAM on a variety of hardware
    platforms. We find that the performance of these SAM applications
    depends fundamentally on the scalability of the underlying parallel
    algorithm, and whether the algorithm's communication requirements
    can be satisfied by the hardware.  Our experience suggests that SAM
    is successful in allowing programmers to use distributed memory
    machines effectively with much less programming effort than
    required today.
"
}

@InProceedings{Bailey:osdi94,
  author = "Mary L. Bailey and Burra Gopal and Michael A. Pagels and
		  Larry L. Peterson and Prasenjit Sarkar",
  title = "{PATHFINDER}: A Pattern-Based Packet Classifier",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "115--123",
  abstract = "
    This paper describes a pattern-based approach to building packet
    classifiers. One novelty of the approach is that it can be
    implemented efficiently in both software and hardware. A
    performance study shows that the software implementation is about
    twice as fast as existing mechanisms, and that the hardware
    implementation is currently able to keep up with OC- 12 (622Mbps)
    network links and is likely to operate at gigabit speeds in the
    near future.
"
}

@InProceedings{Nahum:osdi94,
  author = "Erich M. Nahum and David J. Yates and James F. Kurose and
		  Don Towsley",
  title = "Performance Issues in Parallelized Network Protocols",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "125--137",
  abstract = "
    Parallel processing has been proposed as a means of improving
    network protocol throughput. Several different strategies have been
    taken towards parallelizing protocols. A relatively popular
    approach is {\em packet-level parallelism,} where packets are
    distributed across processors.
    \par
    This paper provides an experimental performance study of
    packet-level parallelism on a contemporary shared memory
    multiprocessor. We examine several unexplored areas in packet-level
    parallelism and investigate how various protocol structuring and
    implementation techniques can affect performance. We study TCP/IP
    and UDP/IP protocol stacks implemented with a parallel version of
    the x-kernel running in user space on Silicon Graphics
    multiprocessors.
    \par
    Our results show that only limited packet-level parallelism can be
    achieved within a single connection under TCP but that using
    multiple connections can improve available parallelism.  We also
    demonstrate that packet ordering plays a key role in determining
    single-connection TCP performance, that careful use of locks is a
    necessity, and that selective exploitation of caching can improve
    throughput. We also describe experiments that compare parallel
    protocol performance on two generations of a parallel machine and
    show how computer architectural trends can influence performance.
"
}

@InProceedings{Unrau:osdi94,
  author = "Ronald C. Unrau and Orran Krieger and Benjamin Gamsa and
		  Michael Stumm",
  title = "Experiences with Locking in a {NUMA} Multiprocessor Operating
		  System Kernel",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "139--152",
  abstract = "
    We describe the locking architecture of a new operating system,
    HURRICANE, designed for large scale shared-memory
    multiprocessors. Many papers already describe kernel locking
    techniques, and some of the techniques we use have been previously
    described by others. However, our work is novel in the particular
    combination of techniques used, as well as several of the
    individual techniques themselves. Moreover, it is the way the
    techniques work together that is the source of our performance
    advantages and scalability. Briefly, we use:
    \begin{itemize}
    \item a hybrid coarse-grainlfine-grain locking strategy that has the
    low latency and space overhead of a coarsegrain locking strategy
    while having the high concurrency of a fine-grain locking strategy;
    \item replication of data structures to increase access bandwidth
    and improve concurrency;
    \item a clustered kernel that bounds the number of processors that
    can compete for a lock so as to reduce second order effects such as
    memory and interconnect contention;
    \item Distributed Locks to further reduce second order effects, with
    modifications that reduce the uncontended latency of these locks to
    close to that of spin locks.
    \end{itemize}
"
}

@InProceedings{Chao:osdi94,
  author = "Chao Hsien Lee and Meng Chang Chen and Ruei Chuan Chang",
  title = "{HiPEC}: High Performance External Virtual Memory Caching",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "153--164",
  abstract = "
    Traditional operating systems use a fixed LRU-like page replacement
    policy and centralized frame pool that cannot properly serve all
    types of memory access patterns of various applications. As a
    result, many memory-intensive applications, such as databases,
    multimedia applications and scientific simulators, induce excessive
    page faults and page replacement when running on top of existing
    operating systems.
    \par
    This paper presents a High Performance External virtual memory
    Caching mechanism (HiPEC) to provide applications with their own
    specific page replacement management. The user specific policy,
    programmed in the HiPEC command set, is stored in user address
    space. When a page fault occurs, the kernel fetches and interprets
    the corresponding policy commands to perform the user-specific page
    replacement management.  Experimental results show that HiPEC
    induces little overhead and can significantly improve performance
    for memory-intensive applications.
"
}

@InProceedings{Cao:osdi94,
  author = "Pei Cao and Edward W. Felten and Kai Li",
  title = "Implementation and Performance of Application-Controlled
		  File Caching",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "165--177",
  abstract = "
    Traditional file system implementations do not allow applications
    to control file caching replacement decisions. We have implemented
    two-level replacement, a scheme that allows applications to control
    their own cache replacement, while letting the kernel control the
    allocation of cache space among processes. We designed an interface
    to let applications exert control on replacement via a set of
    directives to the kernel. This is effective and requires low
    overhead.
    \par
    We demonstrate that for applications that do not perform well under
    traditional caching policies, the combination of good
    application-chosen replacement strategies and our kernel allocation
    policy LRU-SP, can reduce the number of block I/Os by up to 80\%,
    and can reduce the elapsed time by up to 45\%. We also show that
    LRU-SP is crucial to the performance improvement for multiple
    concurrent applications: LRU-SP fairly distributes cache blocks and
    offers protection against foolish applications.
"
}

@InProceedings{CheritonDuda:osdi94,
  author = "David R. Cheriton and Kenneth J. Duda",
  title = "A Caching Model of Operating System Kernel Functionality",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "179--193",
  abstract = "
    Operating system research has endeavored to develop micro-kernels
    that provide modularity, reliability and security improvements over
    conventional monolithic kernels.  However, the resulting kernels
    have been slower, larger and more error-prone than desired. These
    efforts have also failed to provide sufficient application control
    of resource management required by sophisticated applications.
    \par
    This paper describes a caching model of operating system
    functionality as implemented in the {\em Cache Kernel,} the
    supervisor-mode component of the V++ operating system. The Cache
    Kernel caches operating system objects such as threads and address
    spaces just as conventional hardware caches memory data. User-mode
    application kernels handle the loading and writehack of these
    objects, implementing application-specific management policies and
    mechanisms. Experience with implementing the Cache Kernel and
    measurements of its performance on a multiprocessor suggest that
    the caching model can provide competitive performance with
    conventional monolithic operating systems, yet provides
    application-level control of system resources, better
    modularity, better scalability, smaller size and a basis for fault
    containment. 
"
}

@InProceedings{Freeh:osdi94,
  author = "Vincent W. Freeh and David K. Lowenthal and Gregory R. Andrews",
  title = "Distributed Filaments: Efficient Fine-Grain Parallelism on
		  a Cluster of Workstations",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "201--213",
  abstract = "
    A fine-grain parallel program is one in which processes are
    typically small, ranging from a few to a few hundred
    instructions. Fine-grain parallelism arises naturally in many
    situations, such as iterative grid computations, recursive
    fork/join programs, the bodies of parallel FOR loops, and the
    implicit parallelism in functional or dataflow languages. It is
    useful both to describe massively parallel computations and as a
    target for code generation by compilers. However, fine-grain
    parallelism has long been thought to be inefficient due to the
    overheads of process creation, context switching, and
    synchronization. This paper describes a software kernel,
    Distributed Filaments (DF), that implements fine-grain parallelism
    both portably and efficiently on a workstation cluster. DF runs on
    existing, off-the-shelf hardware and software. It has a simple
    interface, so it is easy to use. DF achieves efficiency by using
    stateless threads on each node, overlapping communication and
    computation, employing a new reliable datagram communication
    protocol, and automatically balancing the work generated by
    fork/join computations.
"
}

@InProceedings{Feeley:osdi94,
  author = "Michael J. Feeley and Jeffrey S. Chase and Vivek
		  R. Narasayya and Henry M. Levy",
  title = "Integrating Coherency and Recoverability in Distributed
		  Systems",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "215--227",
  abstract = "
    We propose a technique for maintaining coherency of a transactional
    distributed shared memory, used by applications accessing a shared
    persistent store. Our goal is to improve support for fine-grained
    distributed data sharing in collaborative design applications, such
    as CAD systems and software development environments. In contrast,
    traditional research in distributed shared memory has focused on
    supporting parallel programs; in this paper, we show how
    distributed programs can benefit from this shared-memory
    abstraction as well.
    \par
    Our approach, called log-based coherency, integrates coherency
    support with a standard mechanism for ensuring recoverability of
    persistent data. In our system, transaction logs are the basis of
    both recoverability and coherency. We have prototyped log-based
    coherency as a set of extensions to RVM [Satyanarayanan et al. 94],
    a runtime package supporting recoverable virtual memory. Our
    prototype adds coherency support to RVM in a simple way that does
    not require changes to existing RVM applications. We report on our
    prototype and its performance, and discuss its relationship to
    other DSM systems.
"
}

@InProceedings{Ferreira:osdi94,
  author = "Paulo Ferreira and Marc Shapiro",
  title = "Garbage Collection and {DSM} Consistency",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "229--241",
  abstract = "
    This paper presents the design of a copying garbage collector for
    persistent distributed shared objects in a loosely coupled network
    with weakly consistent distributed shared memory (DSM).
    \par
    The main goal of the design for this garbage collector is to
    minimize the communication overhead due to collection between nodes
    of the system, and to avoid any interference with the DSM memory
    consistency protocol.
    \par
    Our design is based on the observation that, in a weakly consistent
    DSM system, the memory consistency requirements of the garbage
    collector are less strict than those of the applications. Thus, the
    garbage collector reclaims objects independently of other copies of
    the same objects without interfering with the DSM consistency
    protocol. Furthermore, our design does not require reliable
    communication support, and is capable of reclaiming distributed
    cycles of dead objects.
"
}

@InProceedings{Bala:osdi94,
  author = "Kavita Bala and M. Frans Kaashoek and William E. Weihl",
  title = "Software Prefetching and Caching for Translation Lookaside
		  Buffers",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "243--253",
  abstract = "
    A number of interacting trends in operating system structure,
    processor architecture, and memory systems are increasing both the
    rate of translation lookaside buffer (TLB) misses and the cost of
    servicing a miss. This paper presents two novel soflware schemes,
    implemented under Mach 3.0, to decrease both the number and the
    cost of kernel TLB misses (i.e., misses on kernel data structures,
    including user page tables). The first scheme is a new use of
    prefetching for TLB entries on the IPC path, and the second scheme
    is a new use of software caching of TLB entries for hierarchical
    page table organizations.
    \par
    For a range of applications, prefetching decreases the number of
    kernel TLB misses by 40\% to 50\%, and caching decreases TLB
    penalties by providing a fast path for over 90\% of the misses. Our
    caching scheme also decreases the number of nested TLB traps due to
    the page table hierarchy, reducing the number of kernel TLB miss
    traps for applications by 20\% to 40\%. Prefetching and caching,
    when used alone, each improve application performance by up to
    3.5\%; when used together, they improve application performance by
    up to 3\%. On synthetic benchmarks that involve frequent
    communication among several different address spaces (and thus put
    more pressure on the TLB), prefetching improves overall performance
    by about 6\%, caching improves overall performance by about 10\%,
    and the two used together improve overall perfor nance by about
    12\%.
    \par
    Our techniques are very effective in reducing kernel TLB penalties,
    which currently range from 1\% to 5\% of application runtime for
    the benchmarks studied. Since processor speeds continue to increase
    relative to memory speeds, our schemes should be even more
    effective in improving application performance in future
    architectures.
"
}

@InProceedings{Romer:osdi94,
  author = "Theodore H. Romer and Dennis Lee and Brian N. Bershad",
  title = "Dynamic Page Mapping Policies for Cache Conflict Resolution
		  on Standard Hardware",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "255--266",
  abstract = "
    In computer systems with large, physically-indexed, direct-mapped
    caches, a poor mapping from virtual to physical pages causes
    excessive cache conflict misses. In a previous paper we proposed a
    simple hardware device, the Cache Miss Lookaside (CML) Buffer,
    which identifies pages that are suffering from conflict misses. The
    operating system can use this information to implement a dynamic
    page mapping policy that resolves conflicts by performing an
    in-memory copy of one of the conflicting pages, and updating the
    virtual to physical mappings.  In this paper, we propose several
    dynamic page mapping policies that detect and resolve cache
    conflicts using hardware available in existing systems, such as a
    TLB and cache miss counter, to locate possible cache conflicts. We
    evaluate the simulated performance of a variety of mapping
    policies, and show that a dynamic page mapping policy using
    standard hardware can improve upon the performance of a static
    policy, but is not as effective as special-purpose hardware such as
    an associative cache or a CML buffer. We also describe the
    implementation and performance of a software-based dynamic policy
    on a DEC Alpha workstation running DEC OSF/1.
"
}

@InProceedings{Dahlin:osdi94,
  author = "Michael D. Dahlin and Thomas E. Anderson and David
		  A. Patterson and Randolph Y. Wang",
  title = "Cooperative Caching: Using Remote Client Memory to Improve
		  File System Performance",
  booktitle = OSDI94,
  year = 1994,
  month = nov,
  address = "Monterey, CA",
  organization = "USENIX Assoc.",
  pages = "267--280",
  abstract = "
    Emerging high-speed networks will allow machines to access remote
    data nearly as quickly as they can access local data. This trend
    motivates the use of cooperative caching: coordinating the file
    caches of many machines distributed on a LAN to form a more
    effective overall file cache. In this paper we examine four
    cooperative caching algorithms using a trace-driven simulation
    study. These simulations indicate that for the systems studied
    cooperative caching can halve the number of disk accesses,
    improving file system read response time by as much as 73\%. Based
    on these simulations we conclude that cooperative caching can
    significantly improve file system read response time and that
    relatively simple cooperative caching algorithms are sufficient to
    realize most of the potential performance gain.
"
}

