From 731946f4b90eb1173452dd30f1296dd825155d82 Mon Sep 17 00:00:00 2001
From: jbj
Date: Wed, 21 Mar 2001 18:33:35 +0000
Subject: Initial revision

CVS patchset: 4644
CVS date: 2001/03/21 18:33:35
---
 db/mp/Design        |  52 ++++
 db/mp/mp_alloc.c    | 152 +++++++++++
 db/mp/mp_bh.c       | 662 +++++++++++++++++++++++++++++++++++++++++++++
 db/mp/mp_fget.c     | 417 +++++++++++++++++++++++++++++
 db/mp/mp_fopen.c    | 756 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 db/mp/mp_fput.c     | 186 +++++++++++++
 db/mp/mp_fset.c     |  98 +++++++
 db/mp/mp_method.c   | 115 ++++++++
 db/mp/mp_region.c   | 357 +++++++++++++++++++++++++
 db/mp/mp_register.c |  85 ++++++
 db/mp/mp_stat.c     | 388 +++++++++++++++++++++++++++
 db/mp/mp_sync.c     | 658 +++++++++++++++++++++++++++++++++++++++++++++
 db/mp/mp_trickle.c  | 149 +++++++++++
 13 files changed, 4075 insertions(+)
 create mode 100644 db/mp/Design
 create mode 100644 db/mp/mp_alloc.c
 create mode 100644 db/mp/mp_bh.c
 create mode 100644 db/mp/mp_fget.c
 create mode 100644 db/mp/mp_fopen.c
 create mode 100644 db/mp/mp_fput.c
 create mode 100644 db/mp/mp_fset.c
 create mode 100644 db/mp/mp_method.c
 create mode 100644 db/mp/mp_region.c
 create mode 100644 db/mp/mp_register.c
 create mode 100644 db/mp/mp_stat.c
 create mode 100644 db/mp/mp_sync.c
 create mode 100644 db/mp/mp_trickle.c

diff --git a/db/mp/Design b/db/mp/Design
new file mode 100644
index 000000000..1b26aae6c
--- /dev/null
+++ b/db/mp/Design
@@ -0,0 +1,52 @@
+$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
+
+There are three ways we do locking in the mpool code:
+
+Locking a handle mutex to provide concurrency for DB_THREAD operations.
+Locking the region mutex to provide mutual exclusion while reading and
+    writing structures in the shared region.
+Locking buffer header mutexes during I/O.
+
+The first will not be further described here.  We use the shared mpool
+region lock to provide mutual exclusion while reading/modifying all of
+the data structures, including the buffer headers.  We use a per-buffer
+header lock to wait on buffer I/O.  The order of locking is as follows:
+
+Searching for a buffer:
+    Acquire the region lock.
+    Find the buffer header.
+    Increment the reference count (guarantee the buffer stays).
+    While the BH_LOCKED flag is set (I/O is going on) {
+        Release the region lock.
+        Explicitly yield the processor if it's not the first pass
+        through this loop; otherwise, we can simply spin because
+        we'll just be switching between the two locks.
+        Request the buffer lock.
+        The I/O will complete...
+        Acquire the buffer lock.
+        Release the buffer lock.
+        Acquire the region lock.
+    }
+    Return the buffer.
+
+Reading/writing a buffer:
+    Acquire the region lock.
+    Find/create the buffer header.
+    If reading, increment the reference count (guarantee the buffer stays).
+    Set the BH_LOCKED flag.
+    Acquire the buffer lock (guaranteed not to block).
+    Release the region lock.
+    Do the I/O and/or initialize the buffer contents.
+    Release the buffer lock.
+        At this point, the buffer lock is available, but the logical
+        operation (flagged by BH_LOCKED) is not yet completed.  For
+        this reason, among others, threads checking the BH_LOCKED flag
+        must loop around their test.
+    Acquire the region lock.
+    Clear the BH_LOCKED flag.
+    Release the region lock.
+    Return/discard the buffer.
+
+Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
+not reacquired when a region lock is reacquired because they couldn't
+have been closed/discarded and because they never move in memory.
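A minimal C sketch of the searching protocol above, using the
R_LOCK/R_UNLOCK and MUTEX_LOCK/MUTEX_UNLOCK macros that appear in the
files below; the real loop lives in mp_fget.c, and buffer_lookup() is a
hypothetical stand-in for the hash-chain walk (statistics and error
handling elided):

	R_LOCK(dbenv, dbmp->reginfo);	/* acquire the region lock */
	bhp = buffer_lookup(dbmp, pgno);	/* hypothetical helper */
	++bhp->ref;			/* pin: the buffer can't move */
	for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) {
		R_UNLOCK(dbenv, dbmp->reginfo);
		if (!first)		/* yield after the first pass */
			__os_yield(dbenv, 1);
		/* Request the buffer lock; it's granted once I/O ends. */
		MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
		MUTEX_UNLOCK(dbenv, &bhp->mutex);
		R_LOCK(dbenv, dbmp->reginfo);
	}
	/* The buffer is usable; drop the pin when done with it. */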
diff --git a/db/mp/mp_alloc.c b/db/mp/mp_alloc.c new file mode 100644 index 000000000..731f569f5 --- /dev/null +++ b/db/mp/mp_alloc.c @@ -0,0 +1,152 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_alloc.c,v 11.7 2000/04/20 21:14:18 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +/* + * __memp_alloc -- + * Allocate some space from a cache region. + * + * PUBLIC: int __memp_alloc __P((DB_MPOOL *, + * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); + */ +int +__memp_alloc(dbmp, memreg, mfp, len, offsetp, retp) + DB_MPOOL *dbmp; + REGINFO *memreg; + MPOOLFILE *mfp; + size_t len; + roff_t *offsetp; + void *retp; +{ + BH *bhp, *nbhp; + MPOOL *c_mp; + MPOOLFILE *bh_mfp; + size_t total; + int nomore, restart, ret, wrote; + void *p; + + c_mp = memreg->primary; + + /* + * If we're allocating a buffer, and the one we're discarding is the + * same size, we don't want to waste the time to re-integrate it into + * the shared memory free list. If the DB_MPOOLFILE argument isn't + * NULL, we'll compare the underlying page sizes of the two buffers + * before free-ing and re-allocating buffers. + */ + if (mfp != NULL) + len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; + + nomore = 0; +alloc: if ((ret = __db_shalloc(memreg->addr, len, MUTEX_ALIGN, &p)) == 0) { + if (offsetp != NULL) + *offsetp = R_OFFSET(memreg, p); + *(void **)retp = p; + return (0); + } + if (nomore) { + __db_err(dbmp->dbenv, + "Unable to allocate %lu bytes from mpool shared region: %s\n", + (u_long)len, db_strerror(ret)); + return (ret); + } + +retry: /* Find a buffer we can flush; pure LRU. */ + restart = total = 0; + for (bhp = + SH_TAILQ_FIRST(&c_mp->bhq, __bh); bhp != NULL; bhp = nbhp) { + nbhp = SH_TAILQ_NEXT(bhp, q, __bh); + + /* Ignore pinned or locked (I/O in progress) buffers. */ + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) + continue; + + /* Find the associated MPOOLFILE. */ + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* Write the page if it's dirty. */ + if (F_ISSET(bhp, BH_DIRTY)) { + ++bhp->ref; + if ((ret = __memp_bhwrite(dbmp, + bh_mfp, bhp, &restart, &wrote)) != 0) + return (ret); + --bhp->ref; + + /* + * Another process may have acquired this buffer and + * incremented the ref count after we wrote it. + */ + if (bhp->ref != 0) + goto retry; + + /* + * If we wrote the page, continue and free the buffer. + * We don't have to rewalk the list to acquire the + * buffer because it was never available for any other + * process to modify it. + * + * If we didn't write the page, but we discarded and + * reacquired the region lock, restart the list walk. + * + * If we neither wrote the buffer nor discarded the + * region lock, continue down the buffer list. + */ + if (wrote) + ++c_mp->stat.st_rw_evict; + else { + if (restart) + goto retry; + continue; + } + } else + ++c_mp->stat.st_ro_evict; + + /* + * Check to see if the buffer is the size we're looking for. + * If it is, simply reuse it. + */ + if (mfp != NULL && + mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { + __memp_bhfree(dbmp, bhp, 0); + + if (offsetp != NULL) + *offsetp = R_OFFSET(memreg, bhp); + *(void **)retp = bhp; + return (0); + } + + /* Note how much space we've freed, and free the buffer. 
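+ * For example, with 4KB pages, the "total >= 3 * len" test below only
+ * retries the allocation after roughly 12KB has been freed, i.e., once
+ * it's at least plausible that the shared allocator can coalesce a
+ * large-enough contiguous chunk.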
*/ + total += __db_shsizeof(bhp); + __memp_bhfree(dbmp, bhp, 1); + + /* + * Retry as soon as we've freed up sufficient space. If we + * have to coalesce of memory to satisfy the request, don't + * try until it's likely (possible?) that we'll succeed. + */ + if (total >= 3 * len) + goto alloc; + + /* Restart the walk if we discarded the region lock. */ + if (restart) + goto retry; + } + nomore = 1; + goto alloc; +} diff --git a/db/mp/mp_bh.c b/db/mp/mp_bh.c new file mode 100644 index 000000000..e802b165b --- /dev/null +++ b/db/mp/mp_bh.c @@ -0,0 +1,662 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#include +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" +#include "log.h" +#include "db_page.h" + +static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *)); + +/* + * __memp_bhwrite -- + * Write the page associated with a given bucket header. + * + * PUBLIC: int __memp_bhwrite + * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + int *restartp, *wrotep; +{ + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + int incremented, ret; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + incremented = 0; + + /* + * If the file has been removed or is a closed temporary file, Jump + * right ahead and pretend that we've found the file we want-- the + * page-write function knows how to handle the fact that we don't have + * (or need!) any real file descriptor information. + */ + if (F_ISSET(mfp, MP_DEADFILE)) { + dbmfp = NULL; + goto found; + } + + /* + * Walk the process' DB_MPOOLFILE list and find a file descriptor for + * the file. We also check that the descriptor is open for writing. + * If we find a descriptor on the file that's not open for writing, we + * try and upgrade it to make it writeable. If that fails, we're done. + */ + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) + if (dbmfp->mfp == mfp) { + if (F_ISSET(dbmfp, MP_READONLY) && + __memp_upgrade(dbmp, dbmfp, mfp)) { + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + return (0); + } + + /* + * Increment the reference count -- see the comment in + * memp_fclose(). + */ + ++dbmfp->ref; + incremented = 1; + break; + } + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + if (dbmfp != NULL) + goto found; + + /* + * !!! + * Don't try to attach to temporary files. There are two problems in + * trying to do that. First, if we have different privileges than the + * process that "owns" the temporary file, we might create the backing + * disk file such that the owning process couldn't read/write its own + * buffers, e.g., memp_trickle() running as root creating a file owned + * as root, mode 600. Second, if the temporary file has already been + * created, we don't have any way of finding out what its real name is, + * and, even if we did, it was already unlinked (so that it won't be + * left if the process dies horribly). 
This decision causes a problem, + * however: if the temporary file consumes the entire buffer cache, + * and the owner doesn't flush the buffers to disk, we could end up + * with resource starvation, and the memp_trickle() thread couldn't do + * anything about it. That's a pretty unlikely scenario, though. + * + * Note that we should never get here when the temporary file + * in question has already been closed in another process, in which + * case it should be marked MP_DEADFILE. + */ + if (F_ISSET(mfp, MP_TEMP)) { + DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE)); + return (0); + } + + /* + * It's not a page from a file we've opened. If the file requires + * input/output processing, see if this process has ever registered + * information as to how to write this type of file. If not, there's + * nothing we can do. + */ + if (mfp->ftype != 0) { + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) + if (mpreg->ftype == mfp->ftype) + break; + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + if (mpreg == NULL) + return (0); + } + + /* + * Try and open the file, attaching to the underlying shared area. + * Ignore any error, assume it's a permissions problem. + * + * XXX + * There's no negative cache, so we may repeatedly try and open files + * that we have previously tried (and failed) to open. + */ + if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off), + 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0) + return (0); + +found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep); + + if (incremented) { + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + --dbmfp->ref; + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + } + + return (ret); +} + +/* + * __memp_pgread -- + * Read a page from a file. + * + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pgread(dbmfp, bhp, can_create) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int can_create; +{ + DB_IO db_io; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + size_t len, pagesize; + size_t nr; + int created, ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mfp = dbmfp->mfp; + pagesize = mfp->stat.st_pagesize; + + F_SET(bhp, BH_LOCKED | BH_TRASH); + MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + R_UNLOCK(dbenv, dbmp->reginfo); + + /* + * Temporary files may not yet have been created. We don't create + * them now, we create them when the pages have to be flushed. + */ + nr = 0; + if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) { + /* + * Ignore read errors if we have permission to create the page. + * Assume that the page doesn't exist, and that we'll create it + * when we write it out. + * + * XXX + * Theoretically, we could overwrite a page of data if it were + * possible for a file to be successfully opened for reading + * and then for the read to fail. Shouldn't ever happen, but + * it might be worth checking to see if the offset is past the + * known end-of-file. + */ + db_io.fhp = &dbmfp->fh; + db_io.mutexp = dbmfp->mutexp; + db_io.pagesize = db_io.bytes = pagesize; + db_io.pgno = bhp->pgno; + db_io.buf = bhp->buf; + + ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr); + } else + ret = 0; + + created = 0; + if (nr < pagesize) { + if (can_create) + created = 1; + else { + /* + * If we had a short read, ret may be 0. This may not + * be an error -- in particular DB recovery processing + * may request pages that have never been written to + * disk, in which case we won't find the page. 
So, the + * caller must know how to handle the error. + */ + if (ret == 0) + ret = EIO; + goto err; + } + } + + /* + * Clear any bytes we didn't read that need to be cleared. If we're + * running in diagnostic mode, smash any bytes on the page that are + * unknown quantities for the caller. + */ + if (nr != pagesize) { + len = mfp->clear_len == 0 ? pagesize : mfp->clear_len; + if (nr < len) + memset(bhp->buf + nr, 0, len - nr); +#ifdef DIAGNOSTIC + if (nr > len) + len = nr; + if (len < pagesize) + memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); +#endif + } + + /* Call any pgin function. */ + ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); + + /* Unlock the buffer and reacquire the region lock. */ +err: MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + + /* + * If no errors occurred, the data is now valid, clear the BH_TRASH + * flag; regardless, clear the lock bit and let other threads proceed. + */ + F_CLR(bhp, BH_LOCKED); + if (ret == 0) { + F_CLR(bhp, BH_TRASH); + + /* Update the statistics. */ + if (created) + ++mfp->stat.st_page_create; + else + ++mfp->stat.st_page_in; + } + + return (ret); +} + +/* + * __memp_pgwrite -- + * Write a page to a file. + * + * PUBLIC: int __memp_pgwrite + * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *)); + */ +int +__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + BH *bhp; + int *restartp, *wrotep; +{ + DB_ENV *dbenv; + DB_IO db_io; + DB_LSN lsn; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + size_t nw; + int callpgin, dosync, ret, syncfail; + const char *fail; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + mfp = dbmfp == NULL ? NULL : dbmfp->mfp; + + if (restartp != NULL) + *restartp = 0; + if (wrotep != NULL) + *wrotep = 0; + callpgin = 0; + + /* + * Check the dirty bit -- this buffer may have been written since we + * decided to write it. + */ + if (!F_ISSET(bhp, BH_DIRTY)) { + if (wrotep != NULL) + *wrotep = 1; + return (0); + } + + MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + + /* + * If there were two writers, we may have just been waiting while the + * other writer completed I/O on this buffer. Check the dirty bit one + * more time. + */ + if (!F_ISSET(bhp, BH_DIRTY)) { + MUTEX_UNLOCK(dbenv, &bhp->mutex); + + if (wrotep != NULL) + *wrotep = 1; + return (0); + } + + F_SET(bhp, BH_LOCKED); + R_UNLOCK(dbenv, dbmp->reginfo); + + if (restartp != NULL) + *restartp = 1; + + /* + * It's possible that the underlying file doesn't exist, either + * because of an outright removal or because it was a temporary + * file that's been closed. + * + * !!! + * Once we pass this point, we know that dbmfp and mfp aren't NULL, + * and that we have a valid file reference. + */ + if (mfp == NULL || F_ISSET(mfp, MP_DEADFILE)) + goto file_dead; + + /* + * Ensure the appropriate log records are on disk. If the page is + * being written as part of a sync operation, the flush has already + * been done, unless it was written by the application *after* the + * sync was scheduled. + */ + if (LOGGING_ON(dbenv) && + (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) { + memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); + if ((ret = log_flush(dbenv, &lsn)) != 0) + goto err; + } + DB_ASSERT(!LOGGING_ON(dbenv) || + log_compare(&((LOG *)((DB_LOG *) + dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0); + + /* + * Call any pgout function. 
We set the callpgin flag so that we flag + * that the contents of the buffer will need to be passed through pgin + * before they are reused. + */ + if (mfp->ftype == 0) + ret = 0; + else { + callpgin = 1; + if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0) + goto err; + } + + /* Temporary files may not yet have been created. */ + if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) { + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) && + ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL, + DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP, + &dbmfp->fh, NULL)) != 0 || + !F_ISSET(&dbmfp->fh, DB_FH_VALID))) { + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + __db_err(dbenv, + "unable to create temporary backing file"); + goto err; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + } + + /* Write the page. */ + db_io.fhp = &dbmfp->fh; + db_io.mutexp = dbmfp->mutexp; + db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize; + db_io.pgno = bhp->pgno; + db_io.buf = bhp->buf; + if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) { + ret = __db_panic(dbenv, ret); + fail = "write"; + goto syserr; + } + if (nw != mfp->stat.st_pagesize) { + ret = EIO; + fail = "write"; + goto syserr; + } + +file_dead: + /* + * !!! + * Once we pass this point, dbmfp and mfp may be NULL, we may not have + * a valid file reference. + * + * Unlock the buffer and reacquire the region lock. + */ + MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Clean up the flags based on a successful write. + * + * If we rewrote the page, it will need processing by the pgin + * routine before reuse. + */ + if (callpgin) + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_DIRTY | BH_LOCKED); + + /* + * If we write a buffer for which a checkpoint is waiting, update + * the count of pending buffers (both in the mpool as a whole and + * for this file). If the count for this file goes to zero, set a + * flag so we flush the writes. + */ + dosync = 0; + if (F_ISSET(bhp, BH_SYNC)) { + F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); + + --mp->lsn_cnt; + if (mfp != NULL) + dosync = --mfp->lsn_cnt == 0 ? 1 : 0; + } + + /* Update the page clean/dirty statistics. */ + c_mp = BH_TO_CACHE(dbmp, bhp); + ++c_mp->stat.st_page_clean; + --c_mp->stat.st_page_dirty; + + /* Update I/O statistics. */ + if (mfp != NULL) + ++mfp->stat.st_page_out; + + /* + * Do the sync after everything else has been updated, so any incoming + * checkpoint doesn't see inconsistent information. + * + * XXX: + * Don't lock the region around the sync, fsync(2) has no atomicity + * issues. + * + * XXX: + * We ignore errors from the sync -- it makes no sense to return an + * error to the calling process, so set a flag causing the checkpoint + * to be retried later. There is a possibility, of course, that a + * subsequent checkpoint was started and that we're going to force it + * to fail. That should be unlikely, and fixing it would be difficult. + */ + if (dosync) { + R_UNLOCK(dbenv, dbmp->reginfo); + syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0; + R_LOCK(dbenv, dbmp->reginfo); + if (syncfail) + F_SET(mp, MP_LSN_RETRY); + } + + if (wrotep != NULL) + *wrotep = 1; + + return (0); + +syserr: __db_err(dbenv, "%s: %s failed for page %lu", + __memp_fn(dbmfp), fail, (u_long)bhp->pgno); + +err: /* Unlock the buffer and reacquire the region lock. */ + MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Clean up the flags based on a failure. + * + * The page remains dirty but we remove our lock. 
If we rewrote the + * page, it will need processing by the pgin routine before reuse. + */ + if (callpgin) + F_SET(bhp, BH_CALLPGIN); + F_CLR(bhp, BH_LOCKED); + + return (ret); +} + +/* + * __memp_pg -- + * Call the pgin/pgout routine. + * + * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pg(dbmfp, bhp, is_pgin) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int is_pgin; +{ + DBT dbt, *dbtp; + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + MPOOLFILE *mfp; + int ftype, ret; + + dbmp = dbmfp->dbmp; + mfp = dbmfp->mfp; + + MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp); + + ftype = mfp->ftype; + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) { + if (ftype != mpreg->ftype) + continue; + if (mfp->pgcookie_len == 0) + dbtp = NULL; + else { + dbt.size = mfp->pgcookie_len; + dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off); + dbtp = &dbt; + } + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + + if (is_pgin) { + if (mpreg->pgin != NULL && + (ret = mpreg->pgin(dbmp->dbenv, + bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + } else + if (mpreg->pgout != NULL && + (ret = mpreg->pgout(dbmp->dbenv, + bhp->pgno, bhp->buf, dbtp)) != 0) + goto err; + break; + } + + if (mpreg == NULL) + MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + + return (0); + +err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp); + __db_err(dbmp->dbenv, "%s: %s failed for page %lu", + __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno); + return (ret); +} + +/* + * __memp_bhfree -- + * Free a bucket header and its referenced data. + * + * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int)); + */ +void +__memp_bhfree(dbmp, bhp, free_mem) + DB_MPOOL *dbmp; + BH *bhp; + int free_mem; +{ + DB_HASHTAB *dbht; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + int n_bucket, n_cache; + + mp = dbmp->reginfo[0].primary; + c_mp = BH_TO_CACHE(dbmp, bhp); + n_cache = NCACHE(mp, bhp->pgno); + n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno); + dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + + /* Delete the buffer header from the hash bucket queue. */ + SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh); + + /* Delete the buffer header from the LRU queue. */ + SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh); + + /* Clear the mutex this buffer recorded */ + __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache], + (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off)); + /* + * Find the underlying MPOOLFILE and decrement its reference count. + * If this is its last reference, remove it. + */ + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) + __memp_mf_discard(dbmp, mfp); + + /* + * If we're not reusing it immediately, free the buffer header + * and data for real. + */ + if (free_mem) { + --c_mp->stat.st_page_clean; + __db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp); + } +} + +/* + * __memp_upgrade -- + * Upgrade a file descriptor from readonly to readwrite. + */ +static int +__memp_upgrade(dbmp, dbmfp, mfp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; +{ + DB_FH fh; + int ret; + char *rpath; + + /* + * !!! + * We expect the handle to already be locked. + */ + + /* Check to see if we've already upgraded. */ + if (F_ISSET(dbmfp, MP_UPGRADE)) + return (0); + + /* Check to see if we've already failed. */ + if (F_ISSET(dbmfp, MP_UPGRADE_FAIL)) + return (1); + + /* + * Calculate the real name for this file and try to open it read/write. 
+ * We know we have a valid pathname for the file because it's the only + * way we could have gotten a file descriptor of any kind. + */ + if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA, + NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0) + return (ret); + if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) { + F_SET(dbmfp, MP_UPGRADE_FAIL); + ret = 1; + } else { + /* Swap the descriptors and set the upgrade flag. */ + (void)__os_closehandle(&dbmfp->fh); + dbmfp->fh = fh; + F_SET(dbmfp, MP_UPGRADE); + ret = 0; + } + __os_freestr(rpath); + return (ret); +} diff --git a/db/mp/mp_fget.c b/db/mp/mp_fget.c new file mode 100644 index 000000000..1bff5e136 --- /dev/null +++ b/db/mp/mp_fget.c @@ -0,0 +1,417 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fget.c,v 11.28 2001/01/10 04:50:53 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_fget -- + * Get a page from the file. + */ +int +memp_fget(dbmfp, pgnoaddr, flags, addrp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; + u_int32_t flags; + void *addrp; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + DB_HASHTAB *dbht; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + size_t n_bucket, n_cache, mf_offset; + u_int32_t st_hsearch; + int b_incr, first, ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + mfp = dbmfp->mfp; +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fget(dbmfp, pgnoaddr, flags, addrp)); +#endif + + PANIC_CHECK(dbenv); + + /* + * Validate arguments. + * + * !!! + * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly + * files here, and create non-existent pages in readonly files if the + * flags are set, later. The reason is that the hash access method + * wants to get empty pages that don't really exist in readonly files. + * The only alternative is for hash to write the last "bucket" all the + * time, which we don't want to do because one of our big goals in life + * is to keep database files small. It's sleazy as hell, but we catch + * any attempt to actually write the file in memp_fput(). + */ +#define OKFLAGS \ + (DB_MPOOL_CREATE | DB_MPOOL_LAST | \ + DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP | DB_MPOOL_EXTENT) + if (flags != 0) { + if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0) + return (ret); + + switch (flags & ~DB_MPOOL_EXTENT) { + case DB_MPOOL_CREATE: + case DB_MPOOL_LAST: + case DB_MPOOL_NEW: + case DB_MPOOL_NEW_GROUP: + case 0: + break; + default: + return (__db_ferr(dbenv, "memp_fget", 1)); + } + } + +#ifdef DIAGNOSTIC + /* + * XXX + * We want to switch threads as often as possible. Yield every time + * we get a new page to ensure contention. + */ + if (DB_GLOBAL(db_pageyield)) + __os_yield(dbenv, 1); +#endif + + /* Initialize remaining local variables. */ + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + bhp = NULL; + st_hsearch = 0; + b_incr = ret = 0; + + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Check for the new, last or last + 1 page requests. + * + * Examine and update the file's last_pgno value. 
We don't care if + * the last_pgno value immediately changes due to another thread -- + * at this instant in time, the value is correct. We do increment the + * current last_pgno value if the thread is asking for a new page, + * however, to ensure that two threads creating pages don't get the + * same one. + * + * If we create a page, there is the potential that a page after it + * in the file will be written before it will be written. Recovery + * depends on pages that are "created" in the file by subsequent pages + * being written be zeroed out, not have random garbage. Ensure that + * the OS agrees. + * + * !!! + * DB_MPOOL_NEW_GROUP is undocumented -- the hash access method needs + * to allocate contiguous groups of pages in order to do subdatabases. + * We return the first page in the group, but the caller must put an + * LSN on the *last* page and write it, otherwise after a crash we may + * not create all of the pages we need to create. + */ + if (LF_ISSET(DB_MPOOL_LAST | DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { + if (LF_ISSET(DB_MPOOL_NEW)) { + if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = + __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, + 1, mfp->stat.st_pagesize)) != 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); + } + ++mfp->last_pgno; + } + if (LF_ISSET(DB_MPOOL_NEW_GROUP)) { + if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && (ret = + __os_fpinit(dbenv, &dbmfp->fh, mfp->last_pgno + 1, + (int)*pgnoaddr, mfp->stat.st_pagesize)) != 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); + } + mfp->last_pgno += *pgnoaddr; + } + *pgnoaddr = mfp->last_pgno; + } + + /* + * Determine the hash bucket where this page will live, and get local + * pointers to the cache and its hash table. + */ + n_cache = NCACHE(mp, *pgnoaddr); + c_mp = dbmp->reginfo[n_cache].primary; + n_bucket = NBUCKET(c_mp, mf_offset, *pgnoaddr); + dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + + if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) + goto alloc; + + /* + * If mmap'ing the file and the page is not past the end of the file, + * just return a pointer. + * + * The page may be past the end of the file, so check the page number + * argument against the original length of the file. If we previously + * returned pages past the original end of the file, last_pgno will + * have been updated to match the "new" end of the file, and checking + * against it would return pointers past the end of the mmap'd region. + * + * If another process has opened the file for writing since we mmap'd + * it, we will start playing the game by their rules, i.e. everything + * goes through the cache. All pages previously returned will be safe, + * as long as the correct locking protocol was observed. + * + * XXX + * We don't discard the map because we don't know when all of the + * pages will have been discarded from the process' address space. + * It would be possible to do so by reference counting the open + * pages from the mmap, but it's unclear to me that it's worth it. + */ + if (dbmfp->addr != NULL && F_ISSET(mfp, MP_CAN_MMAP)) { + if (*pgnoaddr > mfp->orig_last_pgno) { + /* + * !!! + * See the comment above about non-existent pages and + * the hash access method. + */ + if (!LF_ISSET(DB_MPOOL_CREATE)) { + if (!LF_ISSET(DB_MPOOL_EXTENT)) + __db_err(dbenv, + "%s: page %lu doesn't exist", + __memp_fn(dbmfp), (u_long)*pgnoaddr); + ret = EINVAL; + goto err; + } + } else { + *(void **)addrp = + R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize); + ++mfp->stat.st_map; + goto done; + } + } + + /* Search the hash chain for the page. 
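+ * A buffer is identified by the (mf_offset, pgno) pair, not by the
+ * page number alone: pages from different files land in the same hash
+ * buckets, so the walk below matches both fields before treating a
+ * header as a hit.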
*/ + for (bhp = SH_TAILQ_FIRST(&dbht[n_bucket], __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { + ++st_hsearch; + if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) + continue; + + /* Increment the reference count. */ + if (bhp->ref == UINT16_T_MAX) { + __db_err(dbenv, + "%s: page %lu: reference count overflow", + __memp_fn(dbmfp), (u_long)bhp->pgno); + ret = EINVAL; + goto err; + } + + /* + * Increment the reference count. We may discard the region + * lock as we evaluate and/or read the buffer, so we need to + * ensure that it doesn't move and that its contents remain + * unchanged. + */ + ++bhp->ref; + b_incr = 1; + + /* + * Any buffer we find might be trouble. + * + * BH_LOCKED -- + * I/O is in progress. Because we've incremented the buffer + * reference count, we know the buffer can't move. Unlock + * the region lock, wait for the I/O to complete, and reacquire + * the region. + */ + for (first = 1; F_ISSET(bhp, BH_LOCKED); first = 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + + /* + * Explicitly yield the processor if it's not the first + * pass through this loop -- if we don't, we might end + * up running to the end of our CPU quantum as we will + * simply be swapping between the two locks. + */ + if (!first) + __os_yield(dbenv, 1); + + MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp); + /* Wait for I/O to finish... */ + MUTEX_UNLOCK(dbenv, &bhp->mutex); + R_LOCK(dbenv, dbmp->reginfo); + } + + /* + * BH_TRASH -- + * The contents of the buffer are garbage. Shouldn't happen, + * and this read is likely to fail, but might as well try. + */ + if (F_ISSET(bhp, BH_TRASH)) + goto reread; + + /* + * BH_CALLPGIN -- + * The buffer was converted so it could be written, and the + * contents need to be converted again. + */ + if (F_ISSET(bhp, BH_CALLPGIN)) { + if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0) + goto err; + F_CLR(bhp, BH_CALLPGIN); + } + + ++mfp->stat.st_cache_hit; + *(void **)addrp = bhp->buf; + goto done; + } + +alloc: /* Allocate new buffer header and data space. */ + if ((ret = __memp_alloc(dbmp, + &dbmp->reginfo[n_cache], mfp, 0, NULL, &bhp)) != 0) + goto err; + + ++c_mp->stat.st_page_clean; + + /* + * Initialize the BH fields so that we can call the __memp_bhfree + * routine if an error occurs. + */ + memset(bhp, 0, sizeof(BH)); + bhp->ref = 1; + bhp->pgno = *pgnoaddr; + bhp->mf_offset = mf_offset; + + /* Increment the count of buffers referenced by this MPOOLFILE. */ + ++mfp->block_cnt; + + /* + * Prepend the bucket header to the head of the appropriate MPOOL + * bucket hash list. Append the bucket header to the tail of the + * MPOOL LRU chain. + */ + SH_TAILQ_INSERT_HEAD(&dbht[n_bucket], bhp, hq, __bh); + SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + +#ifdef DIAGNOSTIC + if ((db_alignp_t)bhp->buf & (sizeof(size_t) - 1)) { + __db_err(dbenv, "Internal error: BH data NOT size_t aligned."); + ret = EINVAL; + __memp_bhfree(dbmp, bhp, 1); + goto err; + } +#endif + + if ((ret = __db_shmutex_init(dbenv, &bhp->mutex, + R_OFFSET(dbmp->reginfo, &bhp->mutex) + DB_FCNTL_OFF_MPOOL, + 0, &dbmp->reginfo[n_cache], + (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], c_mp->maint_off))) + != 0) { + __memp_bhfree(dbmp, bhp, 1); + goto err; + } + + /* + * If we created the page, zero it out and continue. + * + * !!! + * Note: DB_MPOOL_NEW specifically doesn't call the pgin function. + * If DB_MPOOL_CREATE is used, then the application's pgin function + * has to be able to handle pages of 0's -- if it uses DB_MPOOL_NEW, + * it can detect all of its page creates, and not bother. 
+ * + * If we're running in diagnostic mode, smash any bytes on the + * page that are unknown quantities for the caller. + * + * Otherwise, read the page into memory, optionally creating it if + * DB_MPOOL_CREATE is set. + */ + if (LF_ISSET(DB_MPOOL_NEW | DB_MPOOL_NEW_GROUP)) { + if (mfp->clear_len == 0) + memset(bhp->buf, 0, mfp->stat.st_pagesize); + else { + memset(bhp->buf, 0, mfp->clear_len); +#ifdef DIAGNOSTIC + memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + } + + ++mfp->stat.st_page_create; + } else { + /* + * It's possible for the read function to fail, which means + * that we fail as well. Note, the __memp_pgread() function + * discards the region lock, so the buffer must be pinned + * down so that it cannot move and its contents are unchanged. + */ +reread: if ((ret = __memp_pgread(dbmfp, + bhp, LF_ISSET(DB_MPOOL_CREATE|DB_MPOOL_EXTENT))) != 0) { + /* + * !!! + * Discard the buffer unless another thread is waiting + * on our I/O to complete. Regardless, the header has + * the BH_TRASH flag set. + */ + if (bhp->ref == 1) + __memp_bhfree(dbmp, bhp, 1); + goto err; + } + + ++mfp->stat.st_cache_miss; + } + + /* + * If we're returning a page after our current notion of the last-page, + * update our information. Note, there's no way to un-instantiate this + * page, it's going to exist whether it's returned to us dirty or not. + */ + if (bhp->pgno > mfp->last_pgno) + mfp->last_pgno = bhp->pgno; + + *(void **)addrp = bhp->buf; + +done: /* Update the chain search statistics. */ + if (st_hsearch) { + ++c_mp->stat.st_hash_searches; + if (st_hsearch > c_mp->stat.st_hash_longest) + c_mp->stat.st_hash_longest = st_hsearch; + c_mp->stat.st_hash_examined += st_hsearch; + } + + ++dbmfp->pinref; + + R_UNLOCK(dbenv, dbmp->reginfo); + + return (0); + +err: /* Discard our reference. */ + if (b_incr) + --bhp->ref; + R_UNLOCK(dbenv, dbmp->reginfo); + + *(void **)addrp = NULL; + return (ret); +} diff --git a/db/mp/mp_fopen.c b/db/mp/mp_fopen.c new file mode 100644 index 000000000..3611ded18 --- /dev/null +++ b/db/mp/mp_fopen.c @@ -0,0 +1,756 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fopen.c,v 11.41 2001/01/10 04:50:53 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static int __memp_mf_open __P((DB_MPOOL *, const char *, + size_t, db_pgno_t, DB_MPOOL_FINFO *, u_int32_t, MPOOLFILE **)); + +/* + * MEMP_FREMOVE -- + * Discard an MPOOLFILE and any buffers it references: update the flags + * so we never try to write buffers associated with the file, nor can we + * find it when looking for files to join. In addition, clear the ftype + * field, there's no reason to post-process pages, they can be discarded + * by any thread. + */ +#define MEMP_FREMOVE(mfp) { \ + mfp->ftype = 0; \ + F_SET(mfp, MP_DEADFILE); \ +} + +/* + * memp_fopen -- + * Open a backing file for the memory pool. 
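+ *
+ * An illustrative call, using hypothetical caller variables, that
+ * creates the file if necessary with 8KB pages:
+ *
+ *	DB_MPOOLFILE *mpf;
+ *
+ *	if ((ret = memp_fopen(dbenv,
+ *	    "file.db", DB_CREATE, 0660, 8 * 1024, NULL, &mpf)) != 0)
+ *		return (ret);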
+ */ +int +memp_fopen(dbenv, path, flags, mode, pagesize, finfop, retp) + DB_ENV *dbenv; + const char *path; + u_int32_t flags; + int mode; + size_t pagesize; + DB_MPOOL_FINFO *finfop; + DB_MPOOLFILE **retp; +{ + DB_MPOOL *dbmp; + int ret; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fopen(dbenv, path, flags, + mode, pagesize, finfop, retp)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "memp_fopen", flags, + DB_CREATE | + DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + return (ret); + + /* Require a non-zero pagesize. */ + if (pagesize == 0 || + (finfop != NULL && finfop->clear_len > pagesize)) { + __db_err(dbenv, "memp_fopen: illegal page size."); + return (EINVAL); + } + + return (__memp_fopen(dbmp, + NULL, path, flags, mode, pagesize, 1, finfop, retp)); +} + +/* + * __memp_set_unlink -- set unlink on last close flag. + * + * PUBLIC: void __memp_set_unlink __P((DB_MPOOLFILE *)); + */ +void +__memp_set_unlink(dbmpf) + DB_MPOOLFILE *dbmpf; +{ + DB_MPOOL *dbmp; + dbmp = dbmpf->dbmp; + + R_LOCK(dbmp->dbenv, dbmp->reginfo); + F_SET(dbmpf->mfp, MP_UNLINK); + R_UNLOCK(dbmp->dbenv, dbmp->reginfo); +} + +/* + * __memp_clear_unlink -- clear unlink on last close flag. + * + * PUBLIC: void __memp_clear_unlink __P((DB_MPOOLFILE *)); + */ +void +__memp_clear_unlink(dbmpf) + DB_MPOOLFILE *dbmpf; +{ + DB_MPOOL *dbmp; + dbmp = dbmpf->dbmp; + + /* + * This bit is protected in the queue code because the metapage + * is locked so we can avoid geting the region lock. + * If this gets used from other than the queue code, we cannot. + */ + if (!F_ISSET(dbmpf->mfp, MP_UNLINK)) + return; + R_LOCK(dbmp->dbenv, dbmp->reginfo); + F_CLR(dbmpf->mfp, MP_UNLINK); + R_UNLOCK(dbmp->dbenv, dbmp->reginfo); +} + +/* + * __memp_fopen -- + * Open a backing file for the memory pool; internal version. + * + * PUBLIC: int __memp_fopen __P((DB_MPOOL *, MPOOLFILE *, const char *, + * PUBLIC: u_int32_t, int, size_t, int, DB_MPOOL_FINFO *, DB_MPOOLFILE **)); + */ +int +__memp_fopen(dbmp, mfp, path, flags, mode, pagesize, needlock, finfop, retp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + const char *path; + u_int32_t flags; + int mode, needlock; + size_t pagesize; + DB_MPOOL_FINFO *finfop; + DB_MPOOLFILE **retp; +{ + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_FINFO finfo; + db_pgno_t last_pgno; + size_t maxmap; + u_int32_t mbytes, bytes, oflags; + int ret; + u_int8_t idbuf[DB_FILE_ID_LEN]; + char *rpath; + + dbenv = dbmp->dbenv; + ret = 0; + rpath = NULL; + + /* + * If mfp is provided, we take the DB_MPOOL_FINFO information from + * the mfp. We don't bother initializing everything, because some + * of them are expensive to acquire. If no mfp is provided and the + * finfop argument is NULL, we default the values. + */ + if (finfop == NULL) { + memset(&finfo, 0, sizeof(finfo)); + if (mfp != NULL) { + finfo.ftype = mfp->ftype; + finfo.pgcookie = NULL; + finfo.fileid = NULL; + finfo.lsn_offset = mfp->lsn_off; + finfo.clear_len = mfp->clear_len; + } else { + finfo.ftype = 0; + finfo.pgcookie = NULL; + finfo.fileid = NULL; + finfo.lsn_offset = -1; + finfo.clear_len = 0; + } + finfop = &finfo; + } + + /* Allocate and initialize the per-process structure. 
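+ * The DB_MPOOLFILE allocated here is private to this process; it only
+ * references the shared MPOOLFILE, which is found or created below by
+ * __memp_mf_open().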
*/ + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) + return (ret); + dbmfp->dbmp = dbmp; + dbmfp->ref = 1; + if (LF_ISSET(DB_RDONLY)) + F_SET(dbmfp, MP_READONLY); + + if (path == NULL) { + if (LF_ISSET(DB_RDONLY)) { + __db_err(dbenv, + "memp_fopen: temporary files can't be readonly"); + ret = EINVAL; + goto err; + } + last_pgno = 0; + } else { + /* Get the real name for this file and open it. */ + if ((ret = __db_appname(dbenv, + DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0) + goto err; + oflags = 0; + if (LF_ISSET(DB_CREATE)) + oflags |= DB_OSO_CREATE; + if (LF_ISSET(DB_RDONLY)) + oflags |= DB_OSO_RDONLY; + if ((ret = + __os_open(dbenv, rpath, oflags, mode, &dbmfp->fh)) != 0) { + if (!LF_ISSET(DB_EXTENT)) + __db_err(dbenv, + "%s: %s", rpath, db_strerror(ret)); + goto err; + } + + /* + * Don't permit files that aren't a multiple of the pagesize, + * and find the number of the last page in the file, all the + * time being careful not to overflow 32 bits. + * + * !!! + * We can't use off_t's here, or in any code in the mainline + * library for that matter. (We have to use them in the os + * stubs, of course, as there are system calls that take them + * as arguments.) The reason is that some customers build in + * environments where an off_t is 32-bits, but still run where + * offsets are 64-bits, and they pay us a lot of money. + */ + if ((ret = __os_ioinfo(dbenv, rpath, + &dbmfp->fh, &mbytes, &bytes, NULL)) != 0) { + __db_err(dbenv, "%s: %s", rpath, db_strerror(ret)); + goto err; + } + + /* + * If we're doing a verify, we might have to cope with + * a truncated file; if the file size is not a multiple + * of the page size, round down to a page--we'll + * take care of the partial page outside the memp system. + */ + + /* Page sizes have to be a power-of-two, ignore mbytes. */ + if (bytes % pagesize != 0) { + if (LF_ISSET(DB_ODDFILESIZE)) + /* + * If we're doing a verify, we might + * have to cope with a truncated file; + * round down, we'll worry about the partial + * page outside the memp system. + */ + bytes -= (bytes % pagesize); + else { + __db_err(dbenv, + "%s: file size not a multiple of the pagesize", + rpath); + ret = EINVAL; + goto err; + } + } + + last_pgno = mbytes * (MEGABYTE / pagesize); + last_pgno += bytes / pagesize; + + /* Correction: page numbers are zero-based, not 1-based. */ + if (last_pgno != 0) + --last_pgno; + + /* + * Get the file id if we weren't given one. Generated file id's + * don't use timestamps, otherwise there'd be no chance of any + * other process joining the party. + */ + if (finfop->fileid == NULL) { + if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0) + goto err; + finfop->fileid = idbuf; + } + } + + /* + * If we weren't provided an underlying shared object to join with, + * find/allocate the shared file objects. Also allocate space for + * for the per-process thread lock. + */ + if (needlock) + R_LOCK(dbenv, dbmp->reginfo); + if (mfp == NULL) + ret = __memp_mf_open( + dbmp, path, pagesize, last_pgno, finfop, flags, &mfp); + else { + ++mfp->mpf_cnt; + ret = 0; + } + if (needlock) + R_UNLOCK(dbenv, dbmp->reginfo); + if (ret != 0) + goto err; + + if (F_ISSET(dbenv, DB_ENV_THREAD)) { + if ((ret = __db_mutex_alloc( + dbenv, dbmp->reginfo, &dbmfp->mutexp)) != 0) + goto err; + if ((ret = __db_mutex_init( + dbenv, dbmfp->mutexp, 0, MUTEX_THREAD)) != 0) + goto err; + + /* XXX: KEITH: CLOSE THE FILE ON FAILURE? 
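+ * (Note: the err: label below does close dbmfp->fh when it's valid,
+ * so the descriptor at least isn't leaked on this path.)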
*/ + } + + dbmfp->mfp = mfp; + + /* + * If a file: + * + is read-only + * + isn't temporary + * + doesn't require any pgin/pgout support + * + the DB_NOMMAP flag wasn't set (in either the file open or + * the environment in which it was opened) + * + and is less than mp_mmapsize bytes in size + * + * we can mmap it instead of reading/writing buffers. Don't do error + * checking based on the mmap call failure. We want to do normal I/O + * on the file if the reason we failed was because the file was on an + * NFS mounted partition, and we can fail in buffer I/O just as easily + * as here. + * + * XXX + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ +#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */ + if (F_ISSET(mfp, MP_CAN_MMAP)) { + if (!F_ISSET(dbmfp, MP_READONLY)) + F_CLR(mfp, MP_CAN_MMAP); + if (path == NULL) + F_CLR(mfp, MP_CAN_MMAP); + if (finfop->ftype != 0) + F_CLR(mfp, MP_CAN_MMAP); + if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) + F_CLR(mfp, MP_CAN_MMAP); + maxmap = dbenv->mp_mmapsize == 0 ? + DB_MAXMMAPSIZE : dbenv->mp_mmapsize; + if (mbytes > maxmap / MEGABYTE || + (mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE)) + F_CLR(mfp, MP_CAN_MMAP); + } + dbmfp->addr = NULL; + if (F_ISSET(mfp, MP_CAN_MMAP)) { + dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; + if (__os_mapfile(dbenv, rpath, + &dbmfp->fh, dbmfp->len, 1, &dbmfp->addr) != 0) { + dbmfp->addr = NULL; + F_CLR(mfp, MP_CAN_MMAP); + } + } + if (rpath != NULL) + __os_freestr(rpath); + + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + *retp = dbmfp; + return (0); + +err: /* + * Note that we do not have to free the thread mutex, because we + * never get to here after we have successfully allocated it. + */ + if (rpath != NULL) + __os_freestr(rpath); + if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) + (void)__os_closehandle(&dbmfp->fh); + if (dbmfp != NULL) { + if (dbmfp->mutexp != NULL) + __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); + __os_free(dbmfp, sizeof(DB_MPOOLFILE)); + } + return (ret); +} + +/* + * __memp_mf_open -- + * Open an MPOOLFILE. + */ +static int +__memp_mf_open(dbmp, path, pagesize, last_pgno, finfop, flags, retp) + DB_MPOOL *dbmp; + const char *path; + size_t pagesize; + db_pgno_t last_pgno; + DB_MPOOL_FINFO *finfop; + u_int32_t flags; + MPOOLFILE **retp; +{ + MPOOL *mp; + MPOOLFILE *mfp; + int ret; + void *p; + +#define ISTEMPORARY (path == NULL) + + /* + * If not creating a temporary file, walk the list of MPOOLFILE's, + * looking for a matching file. Files backed by temporary files + * or previously removed files can't match. + * + * DB_TRUNCATE support. + * + * The fileID is a filesystem unique number (e.g., a UNIX dev/inode + * pair) plus a timestamp. If files are removed and created in less + * than a second, the fileID can be repeated. The problem with + * repetition happens when the file that previously had the fileID + * value still has pages in the pool, since we don't want to use them + * to satisfy requests for the new file. 
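+ * For example, an application that removes a file and immediately
+ * re-creates it can see the new file inherit the old dev/inode pair
+ * within the same second, and the old file's stale pages would then
+ * satisfy page requests for the new file.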
+ * + * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated + * opens with that flag set guarantees matching fileIDs when the + * machine can open a file and then re-open with truncate within a + * second. For this reason, we pass that flag down, and, if we find + * a matching entry, we ensure that it's never found again, and we + * create a new entry for the current request. + */ + if (!ISTEMPORARY) { + mp = dbmp->reginfo[0].primary; + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + if (memcmp(finfop->fileid, R_ADDR(dbmp->reginfo, + mfp->fileid_off), DB_FILE_ID_LEN) == 0) { + if (LF_ISSET(DB_TRUNCATE)) { + MEMP_FREMOVE(mfp); + continue; + } + if (finfop->clear_len != mfp->clear_len || + pagesize != mfp->stat.st_pagesize) { + __db_err(dbmp->dbenv, + "%s: page size or clear length changed", + path); + return (EINVAL); + } + + /* + * It's possible that our needs for pre- and + * post-processing are changing. For example, + * an application created a hash subdatabase + * in a database that was previously all btree. + */ + if (finfop->ftype != 0) + mfp->ftype = finfop->ftype; + + ++mfp->mpf_cnt; + + *retp = mfp; + return (0); + } + } + } + + /* Allocate a new MPOOLFILE. */ + if ((ret = __memp_alloc( + dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto mem_err; + *retp = mfp; + + /* Initialize the structure. */ + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->mpf_cnt = 1; + mfp->ftype = finfop->ftype; + mfp->lsn_off = finfop->lsn_offset; + mfp->clear_len = finfop->clear_len; + + /* + * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget, + * we have to know the last page in the file. Figure it out and save + * it away. + */ + mfp->stat.st_pagesize = pagesize; + mfp->orig_last_pgno = mfp->last_pgno = last_pgno; + + if (ISTEMPORARY) + F_SET(mfp, MP_TEMP); + else { + /* Copy the file path into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, path, strlen(path) + 1); + + /* Copy the file identification string into shared memory. */ + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, finfop->fileid, DB_FILE_ID_LEN); + + F_SET(mfp, MP_CAN_MMAP); + } + + /* Copy the page cookie into shared memory. */ + if (finfop->pgcookie == NULL || finfop->pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, finfop->pgcookie->size, &mfp->pgcookie_off, &p)) != 0) + goto err; + memcpy(p, finfop->pgcookie->data, finfop->pgcookie->size); + mfp->pgcookie_len = finfop->pgcookie->size; + } + + /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */ + mp = dbmp->reginfo[0].primary; + SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile); + + if (0) { +err: if (mfp->path_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + if (mfp != NULL) + __db_shalloc_free(dbmp->reginfo[0].addr, mfp); +mem_err: __db_err(dbmp->dbenv, + "Unable to allocate memory for mpool file"); + } + return (ret); +} + +/* + * memp_fclose -- + * Close a backing file for the memory pool. 
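+ *
+ * The complement of memp_fopen(); e.g., for the hypothetical handle
+ * opened above:
+ *
+ *	if ((ret = memp_fclose(mpf)) != 0)
+ *		return (ret);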
+ */ +int +memp_fclose(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + char *rpath; + int ret, t_ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + ret = 0; + + PANIC_CHECK(dbenv); + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fclose(dbmfp)); +#endif + + /* + * Remove the DB_MPOOLFILE from the queue. This has to happen before + * we perform any action that can fail, otherwise __memp_close may + * loop infinitely when calling us to discard all of the DB_MPOOLFILEs. + */ + for (;;) { + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + + /* + * We have to reference count DB_MPOOLFILE structures as other + * threads may be using them. The problem only happens if the + * application makes a bad design choice. Here's the path: + * + * Thread A opens a database. + * Thread B uses thread A's DB_MPOOLFILE to write a buffer + * in order to free up memory in the mpool cache. + * Thread A closes the database while thread B is using the + * DB_MPOOLFILE structure. + * + * By opening all databases before creating the threads, and + * closing them after the threads have exited, applications + * get better performance and avoid the problem path entirely. + * + * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer + * is a short-term lock, even in worst case, since we better be + * the only thread of control using the DB_MPOOLFILE structure + * to read pages *into* the cache. Wait until we're the only + * reference holder and remove the DB_MPOOLFILE structure from + * the list, so nobody else can even find it. + */ + if (dbmfp->ref == 1) { + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + break; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + (void)__os_sleep(dbenv, 1, 0); + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + /* Complain if pinned blocks never returned. */ + if (dbmfp->pinref != 0) + __db_err(dbenv, "%s: close: %lu blocks left pinned", + __memp_fn(dbmfp), (u_long)dbmfp->pinref); + + /* Discard any mmap information. */ + if (dbmfp->addr != NULL && + (ret = __os_unmapfile(dbenv, dbmfp->addr, dbmfp->len)) != 0) + __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(ret)); + + /* Close the file; temporary files may not yet have been created. */ + if (F_ISSET(&dbmfp->fh, DB_FH_VALID) && + (t_ret = __os_closehandle(&dbmfp->fh)) != 0) { + __db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret)); + if (ret != 0) + t_ret = ret; + } + + /* Discard the thread mutex. */ + if (dbmfp->mutexp != NULL) + __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp); + + /* + * Discard our reference on the the underlying MPOOLFILE, and close + * it if it's no longer useful to anyone. + * + * If we're not discarding it, and it's a temp file, this means + * all the outstanding references belong to unflushed buffers. + * (A temp file can only be referenced by one DB_MPOOLFILE). + * We don't care about preserving any of those buffers, so mark + * the MPOOLFILE as dead so that when we try to flush them, + * even the dirty ones just get discarded. 
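+ * To summarize the reference counting: mpf_cnt counts open
+ * DB_MPOOLFILE handles on the MPOOLFILE and block_cnt counts its
+ * buffers still in the pool; the shared structure is discarded only
+ * when both reach zero, either here or in __memp_bhfree().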
+ */ + R_LOCK(dbenv, dbmp->reginfo); + mfp = dbmfp->mfp; + if (--mfp->mpf_cnt == 0) { + if (F_ISSET(mfp, MP_UNLINK)) { + MEMP_FREMOVE(mfp); + if ((t_ret = __db_appname(dbmp->dbenv, + DB_APP_DATA, NULL, R_ADDR(dbmp->reginfo, + mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0) + ret = t_ret; + if (t_ret == 0 && (t_ret = + __os_unlink(dbmp->dbenv, rpath) != 0 && ret == 0)) + ret = t_ret; + __os_free(rpath, 0); + } + if (mfp->block_cnt == 0) + __memp_mf_discard(dbmp, mfp); + } + else if (F_ISSET(mfp, MP_TEMP)) + MEMP_FREMOVE(mfp); + R_UNLOCK(dbenv, dbmp->reginfo); + + /* Discard the DB_MPOOLFILE structure. */ + __os_free(dbmfp, sizeof(DB_MPOOLFILE)); + + return (ret); +} + +/* + * __memp_mf_discard -- + * Discard an MPOOLFILE. + * + * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); + */ +void +__memp_mf_discard(dbmp, mfp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; +{ + MPOOL *mp; + + mp = dbmp->reginfo[0].primary; + + /* Delete from the list of MPOOLFILEs. */ + SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile); + + /* Free the space. */ + if (mfp->path_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + if (mfp->pgcookie_off != 0) + __db_shalloc_free(dbmp->reginfo[0].addr, + R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); + __db_shalloc_free(dbmp->reginfo[0].addr, mfp); +} + +/* + * __memp_fremove -- + * Remove an underlying file from the system. + * + * PUBLIC: int __memp_fremove __P((DB_MPOOLFILE *)); + */ +int +__memp_fremove(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mfp = dbmfp->mfp; + + PANIC_CHECK(dbenv); + + R_LOCK(dbenv, dbmp->reginfo); + + MEMP_FREMOVE(mfp); + + R_UNLOCK(dbenv, dbmp->reginfo); + + return (0); +} + +/* + * __memp_fn -- + * On errors we print whatever is available as the file name. + * + * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *)); + */ +char * +__memp_fn(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + return (__memp_fns(dbmfp->dbmp, dbmfp->mfp)); +} + +/* + * __memp_fns -- + * On errors we print whatever is available as the file name. + * + * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *)); + * + */ +char * +__memp_fns(dbmp, mfp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; +{ + if (mfp->path_off == 0) + return ((char *)"temporary"); + + return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off)); +} diff --git a/db/mp/mp_fput.c b/db/mp/mp_fput.c new file mode 100644 index 000000000..be03b721f --- /dev/null +++ b/db/mp/mp_fput.c @@ -0,0 +1,186 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fput.c,v 11.16 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_fput -- + * Mpool file put function. 
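+ *
+ * An illustrative put of a page previously returned by memp_fget(),
+ * using hypothetical caller variables and marking the page dirty so
+ * that it is eventually written back:
+ *
+ *	if ((ret = memp_fput(mpf, pagep, DB_MPOOL_DIRTY)) != 0)
+ *		return (ret);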
+ */ +int +memp_fput(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_int32_t flags; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + int ret, wrote; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fput(dbmfp, pgaddr, flags)); +#endif + + PANIC_CHECK(dbenv); + + /* Validate arguments. */ + if (flags) { + if ((ret = __db_fchk(dbenv, "memp_fput", flags, + DB_MPOOL_CLEAN | DB_MPOOL_DIRTY | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbenv, "memp_fput", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbenv, + "%s: dirty flag set for readonly file page", + __memp_fn(dbmfp)); + return (EACCES); + } + } + + R_LOCK(dbenv, dbmp->reginfo); + + /* Decrement the pinned reference count. */ + if (dbmfp->pinref == 0) { + __db_err(dbenv, + "%s: more pages returned than retrieved", __memp_fn(dbmfp)); + R_UNLOCK(dbenv, dbmp->reginfo); + return (EINVAL); + } else + --dbmfp->pinref; + + /* + * If we're mapping the file, there's nothing to do. Because we can + * stop mapping the file at any time, we have to check on each buffer + * to see if the address we gave the application was part of the map + * region. + */ + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + /* Convert the buffer header to a cache. */ + c_mp = BH_TO_CACHE(dbmp, bhp); + +/* UNLOCK THE REGION, LOCK THE CACHE. */ + + /* Set/clear the page bits. */ + if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { + ++c_mp->stat.st_page_clean; + --c_mp->stat.st_page_dirty; + F_CLR(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { + --c_mp->stat.st_page_clean; + ++c_mp->stat.st_page_dirty; + F_SET(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + /* + * If the page is dirty and being scheduled to be written as part of + * a checkpoint, we no longer know that the log is up-to-date. + */ + if (F_ISSET(bhp, BH_DIRTY) && F_ISSET(bhp, BH_SYNC)) + F_SET(bhp, BH_SYNC_LOGFLSH); + + /* + * Check for a reference count going to zero. This can happen if the + * application returns a page twice. + */ + if (bhp->ref == 0) { + __db_err(dbenv, "%s: page %lu: unpinned page returned", + __memp_fn(dbmfp), (u_long)bhp->pgno); + R_UNLOCK(dbenv, dbmp->reginfo); + return (EINVAL); + } + + /* + * If more than one reference to the page, we're done. Ignore the + * discard flags (for now) and leave it at its position in the LRU + * chain. The rest gets done at last reference close. + */ + if (--bhp->ref > 0) { + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } + + /* + * Move the buffer to the head/tail of the LRU chain. We do this + * before writing the buffer for checkpoint purposes, as the write + * can discard the region lock and allow another process to acquire + * buffer. We could keep that from happening, but there seems no + * reason to do so. 
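+ *
+ * For example (describing the behavior below, not new code): a
+ * buffer returned with DB_MPOOL_DISCARD goes to the head of the
+ * queue and is the first candidate the next time __memp_alloc needs
+ * space, while a normally returned buffer goes to the tail and ages
+ * out last.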
+ */ + SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh); + if (F_ISSET(bhp, BH_DISCARD)) + SH_TAILQ_INSERT_HEAD(&c_mp->bhq, bhp, q, __bh); + else + SH_TAILQ_INSERT_TAIL(&c_mp->bhq, bhp, q); + + /* + * If this buffer is scheduled for writing because of a checkpoint, we + * need to write it (if it's dirty), or update the checkpoint counters + * (if it's not dirty). If we try to write it and can't, that's not + * necessarily an error as it's not completely unreasonable that the + * application have permission to write the underlying file, but set a + * flag so that the next time the memp_sync function is called we try + * writing it there, as the checkpoint thread of control better be able + * to write all of the files. + */ + if (F_ISSET(bhp, BH_SYNC)) { + if (F_ISSET(bhp, BH_DIRTY)) { + if (__memp_bhwrite(dbmp, + dbmfp->mfp, bhp, NULL, &wrote) != 0 || !wrote) + F_SET(mp, MP_LSN_RETRY); + } else { + F_CLR(bhp, BH_SYNC); + + --mp->lsn_cnt; + --dbmfp->mfp->lsn_cnt; + } + } + + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); +} diff --git a/db/mp/mp_fset.c b/db/mp/mp_fset.c new file mode 100644 index 000000000..08313c9b6 --- /dev/null +++ b/db/mp/mp_fset.c @@ -0,0 +1,98 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_fset.c,v 11.13 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_fset -- + * Mpool page set-flag routine. + */ +int +memp_fset(dbmfp, pgaddr, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + u_int32_t flags; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + int ret; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fset(dbmfp, pgaddr, flags)); +#endif + + PANIC_CHECK(dbenv); + + /* Validate arguments. */ + if (flags == 0) + return (__db_ferr(dbenv, "memp_fset", 1)); + + if ((ret = __db_fchk(dbenv, "memp_fset", flags, + DB_MPOOL_DIRTY | DB_MPOOL_CLEAN | DB_MPOOL_DISCARD)) != 0) + return (ret); + if ((ret = __db_fcchk(dbenv, "memp_fset", + flags, DB_MPOOL_CLEAN, DB_MPOOL_DIRTY)) != 0) + return (ret); + + if (LF_ISSET(DB_MPOOL_DIRTY) && F_ISSET(dbmfp, MP_READONLY)) { + __db_err(dbenv, "%s: dirty flag set for readonly file page", + __memp_fn(dbmfp)); + return (EACCES); + } + + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + /* Convert the buffer header to a cache. */ + c_mp = BH_TO_CACHE(dbmp, bhp); + + R_LOCK(dbenv, dbmp->reginfo); + + if (LF_ISSET(DB_MPOOL_CLEAN) && F_ISSET(bhp, BH_DIRTY)) { + ++c_mp->stat.st_page_clean; + --c_mp->stat.st_page_dirty; + F_CLR(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DIRTY) && !F_ISSET(bhp, BH_DIRTY)) { + --c_mp->stat.st_page_clean; + ++c_mp->stat.st_page_dirty; + F_SET(bhp, BH_DIRTY); + } + if (LF_ISSET(DB_MPOOL_DISCARD)) + F_SET(bhp, BH_DISCARD); + + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); +} diff --git a/db/mp/mp_method.c b/db/mp/mp_method.c new file mode 100644 index 000000000..85a6239b0 --- /dev/null +++ b/db/mp/mp_method.c @@ -0,0 +1,115 @@ +/*- + * See the file LICENSE for redistribution information. 
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: mp_method.c,v 11.10 2000/04/04 20:12:04 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "mp.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int));
+static int __memp_set_mp_mmapsize __P((DB_ENV *, size_t));
+
+/*
+ * __memp_dbenv_create --
+ * Mpool specific creation of the DB_ENV structure.
+ *
+ * PUBLIC: void __memp_dbenv_create __P((DB_ENV *));
+ */
+void
+__memp_dbenv_create(dbenv)
+ DB_ENV *dbenv;
+{
+ /*
+ * We default to 32 8K pages. We don't default to a flat 256K, because
+ * some systems require significantly more memory to hold 32 pages than
+ * others. For example, HP-UX with POSIX pthreads needs 88 bytes for
+ * a POSIX pthread mutex and almost 200 bytes per buffer header, while
+ * Solaris needs 24 and 52 bytes for the same structures.
+ */
+ dbenv->mp_bytes = 32 * ((8 * 1024) + sizeof(BH));
+ dbenv->mp_ncache = 1;
+
+ dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize;
+ dbenv->set_cachesize = __memp_set_cachesize;
+
+#ifdef HAVE_RPC
+ /*
+ * If we have a client, overwrite what we just set up to
+ * point to the client functions.
+ */
+ if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
+ dbenv->set_cachesize = __dbcl_env_cachesize;
+ dbenv->set_mp_mmapsize = __dbcl_set_mp_mmapsize;
+ }
+#endif
+
+}
+
+/*
+ * __memp_set_cachesize --
+ * Initialize the cache size.
+ */
+static int
+__memp_set_cachesize(dbenv, gbytes, bytes, ncache)
+ DB_ENV *dbenv;
+ u_int32_t gbytes, bytes;
+ int ncache;
+{
+ ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_cachesize");
+
+ dbenv->mp_gbytes = gbytes + bytes / GIGABYTE;
+ dbenv->mp_bytes = bytes % GIGABYTE;
+ dbenv->mp_ncache = ncache == 0 ? 1 : ncache;
+
+ /*
+ * If the application requested less than 500Mb, increase the
+ * cachesize by 25% to account for our overhead. (I'm guessing
+ * that caches over 500Mb are specifically sized, i.e., it's
+ * a large server and the application actually knows how much
+ * memory is available.)
+ *
+ * There is a minimum cache size, regardless.
+ */
+ if (dbenv->mp_gbytes == 0) {
+ if (dbenv->mp_bytes < 500 * MEGABYTE)
+ dbenv->mp_bytes += dbenv->mp_bytes / 4;
+ if (dbenv->mp_bytes < DB_CACHESIZE_MIN)
+ dbenv->mp_bytes = DB_CACHESIZE_MIN;
+ }
+
+ return (0);
+}
+
+/*
+ * __memp_set_mp_mmapsize --
+ * Set the maximum mapped file size.
+ */
+static int
+__memp_set_mp_mmapsize(dbenv, mp_mmapsize)
+ DB_ENV *dbenv;
+ size_t mp_mmapsize;
+{
+ dbenv->mp_mmapsize = mp_mmapsize;
+ return (0);
+}
diff --git a/db/mp/mp_region.c b/db/mp/mp_region.c
new file mode 100644
index 000000000..4b85466ce
--- /dev/null
+++ b/db/mp/mp_region.c
@@ -0,0 +1,357 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_region.c,v 11.26 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +static int __mpool_init __P((DB_ENV *, DB_MPOOL *, int, int)); +#ifdef MUTEX_SYSTEM_RESOURCES +static size_t __mpool_region_maint __P((REGINFO *)); +#endif + +/* + * __memp_open -- + * Internal version of memp_open: only called from DB_ENV->open. + * + * PUBLIC: int __memp_open __P((DB_ENV *)); + */ +int +__memp_open(dbenv) + DB_ENV *dbenv; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + REGINFO reginfo; + roff_t reg_size, *regids; + u_int32_t i; + int htab_buckets, ret; + + /* Figure out how big each cache region is. */ + reg_size = (dbenv->mp_gbytes / dbenv->mp_ncache) * GIGABYTE; + reg_size += ((dbenv->mp_gbytes % + dbenv->mp_ncache) * GIGABYTE) / dbenv->mp_ncache; + reg_size += dbenv->mp_bytes / dbenv->mp_ncache; + + /* + * Figure out how many hash buckets each region will have. Assume we + * want to keep the hash chains with under 10 pages on each chain. We + * don't know the pagesize in advance, and it may differ for different + * files. Use a pagesize of 1K for the calculation -- we walk these + * chains a lot, they must be kept short. + */ + htab_buckets = __db_tablesize((reg_size / (1 * 1024)) / 10); + + /* Create and initialize the DB_MPOOL structure. */ + if ((ret = __os_calloc(dbenv, 1, sizeof(*dbmp), &dbmp)) != 0) + return (ret); + LIST_INIT(&dbmp->dbregq); + TAILQ_INIT(&dbmp->dbmfq); + dbmp->dbenv = dbenv; + + /* Join/create the first mpool region. */ + memset(®info, 0, sizeof(REGINFO)); + reginfo.type = REGION_TYPE_MPOOL; + reginfo.id = INVALID_REGION_ID; + reginfo.mode = dbenv->db_mode; + reginfo.flags = REGION_JOIN_OK; + if (F_ISSET(dbenv, DB_ENV_CREATE)) + F_SET(®info, REGION_CREATE_OK); + if ((ret = __db_r_attach(dbenv, ®info, reg_size)) != 0) + goto err; + + /* + * If we created the region, initialize it. Create or join any + * additional regions. + */ + if (F_ISSET(®info, REGION_CREATE)) { + /* + * We define how many regions there are going to be, allocate + * the REGINFO structures and create them. Make sure we don't + * clear the wrong entries on error. + */ + dbmp->nreg = dbenv->mp_ncache; + if ((ret = __os_calloc(dbenv, + dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) + goto err; + /* Make sure we don't clear the wrong entries on error. */ + for (i = 0; i < dbmp->nreg; ++i) + dbmp->reginfo[i].id = INVALID_REGION_ID; + dbmp->reginfo[0] = reginfo; + + /* Initialize the first region. */ + if ((ret = __mpool_init(dbenv, dbmp, 0, htab_buckets)) != 0) + goto err; + + /* + * Create/initialize remaining regions and copy their IDs into + * the first region. + */ + mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary); + regids = R_ADDR(dbmp->reginfo, mp->regids); + for (i = 1; i < dbmp->nreg; ++i) { + dbmp->reginfo[i].type = REGION_TYPE_MPOOL; + dbmp->reginfo[i].id = INVALID_REGION_ID; + dbmp->reginfo[i].mode = dbenv->db_mode; + dbmp->reginfo[i].flags = REGION_CREATE_OK; + if ((ret = __db_r_attach( + dbenv, &dbmp->reginfo[i], reg_size)) != 0) + goto err; + if ((ret = + __mpool_init(dbenv, dbmp, i, htab_buckets)) != 0) + goto err; + R_UNLOCK(dbenv, &dbmp->reginfo[i]); + + regids[i] = dbmp->reginfo[i].id; + } + } else { + /* + * Determine how many regions there are going to be, allocate + * the REGINFO structures and fill in local copies of that + * information. 
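+ *
+ * For example (invented numbers): if the creating process ran
+ * with three caches, mp->nreg is 3 here, and regids[] names the
+ * two additional regions this process still has to join.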
+ */ + mp = R_ADDR(®info, reginfo.rp->primary); + dbmp->nreg = mp->nreg; + if ((ret = __os_calloc(dbenv, + dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) + goto err; + /* Make sure we don't clear the wrong entries on error. */ + for (i = 0; i < dbmp->nreg; ++i) + dbmp->reginfo[i].id = INVALID_REGION_ID; + dbmp->reginfo[0] = reginfo; + + /* Join remaining regions. */ + regids = R_ADDR(dbmp->reginfo, mp->regids); + for (i = 1; i < dbmp->nreg; ++i) { + dbmp->reginfo[i].type = REGION_TYPE_MPOOL; + dbmp->reginfo[i].id = regids[i]; + dbmp->reginfo[i].mode = 0; + dbmp->reginfo[i].flags = REGION_JOIN_OK; + if ((ret = __db_r_attach( + dbenv, &dbmp->reginfo[i], 0)) != 0) + goto err; + R_UNLOCK(dbenv, &dbmp->reginfo[i]); + } + } + + /* Set the local addresses for the regions. */ + for (i = 0; i < dbmp->nreg; ++i) + dbmp->reginfo[i].primary = + R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); + + /* If the region is threaded, allocate a mutex to lock the handles. */ + if (F_ISSET(dbenv, DB_ENV_THREAD)) { + if ((ret = __db_mutex_alloc( + dbenv, dbmp->reginfo, &dbmp->mutexp)) != 0) { + goto err; + } + if ((ret = + __db_mutex_init(dbenv, dbmp->mutexp, 0, MUTEX_THREAD)) != 0) + goto err; + } + + R_UNLOCK(dbenv, dbmp->reginfo); + + dbenv->mp_handle = dbmp; + return (0); + +err: if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { + if (F_ISSET(dbmp->reginfo, REGION_CREATE)) + ret = __db_panic(dbenv, ret); + + R_UNLOCK(dbenv, dbmp->reginfo); + + for (i = 0; i < dbmp->nreg; ++i) + if (dbmp->reginfo[i].id != INVALID_REGION_ID) + (void)__db_r_detach( + dbenv, &dbmp->reginfo[i], 0); + __os_free(dbmp->reginfo, + dbmp->nreg * sizeof(*dbmp->reginfo)); + } + if (dbmp->mutexp != NULL) + __db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp); + __os_free(dbmp, sizeof(*dbmp)); + return (ret); +} + +/* + * __mpool_init -- + * Initialize a MPOOL structure in shared memory. + */ +static int +__mpool_init(dbenv, dbmp, reginfo_off, htab_buckets) + DB_ENV *dbenv; + DB_MPOOL *dbmp; + int reginfo_off, htab_buckets; +{ + DB_HASHTAB *htab; + MPOOL *mp; + REGINFO *reginfo; +#ifdef MUTEX_SYSTEM_RESOURCES + size_t maint_size; +#endif + int ret; + void *p; + + mp = NULL; + + reginfo = &dbmp->reginfo[reginfo_off]; + if ((ret = __db_shalloc(reginfo->addr, + sizeof(MPOOL), MUTEX_ALIGN, ®info->primary)) != 0) + goto mem_err; + reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary); + mp = reginfo->primary; + memset(mp, 0, sizeof(*mp)); + +#ifdef MUTEX_SYSTEM_RESOURCES + maint_size = __mpool_region_maint(reginfo); + /* Allocate room for the maintenance info and initialize it. */ + if ((ret = __db_shalloc(reginfo->addr, + sizeof(REGMAINT) + maint_size, 0, &p)) != 0) + goto mem_err; + __db_maintinit(reginfo, p, maint_size); + mp->maint_off = R_OFFSET(reginfo, p); +#endif + + if (reginfo_off == 0) { + SH_TAILQ_INIT(&mp->mpfq); + + if ((ret = __db_shmutex_init(dbenv, &mp->sync_mutex, + R_OFFSET(dbmp->reginfo, &mp->sync_mutex) + + DB_FCNTL_OFF_MPOOL, 0, dbmp->reginfo, + (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off))) != 0) + goto err; + + ZERO_LSN(mp->lsn); + mp->lsn_cnt = 0; + + mp->nreg = dbmp->nreg; + if ((ret = __db_shalloc(dbmp->reginfo[0].addr, + dbmp->nreg * sizeof(int), 0, &p)) != 0) + goto mem_err; + mp->regids = R_OFFSET(dbmp->reginfo, p); + } + + SH_TAILQ_INIT(&mp->bhq); + + /* Allocate hash table space and initialize it. 
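+ * As a worked illustration (invented numbers): a 4MB region
+ * gives (4194304 / 1024) / 10 == 409 as the argument to
+ * __db_tablesize in __memp_open, aiming at chains of roughly
+ * ten 1KB pages each.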
*/ + if ((ret = __db_shalloc(reginfo->addr, + htab_buckets * sizeof(DB_HASHTAB), 0, &htab)) != 0) + goto mem_err; + __db_hashinit(htab, htab_buckets); + mp->htab = R_OFFSET(reginfo, htab); + mp->htab_buckets = htab_buckets; + + return (0); + +mem_err:__db_err(dbenv, "Unable to allocate memory for mpool region"); +err: if (reginfo->primary != NULL) + __db_shalloc_free(reginfo->addr, reginfo->primary); + return (ret); +} + +/* + * __memp_close -- + * Internal version of memp_close: only called from DB_ENV->close. + * + * PUBLIC: int __memp_close __P((DB_ENV *)); + */ +int +__memp_close(dbenv) + DB_ENV *dbenv; +{ + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + u_int32_t i; + int ret, t_ret; + + ret = 0; + dbmp = dbenv->mp_handle; + + /* Discard DB_MPREGs. */ + while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { + LIST_REMOVE(mpreg, q); + __os_free(mpreg, sizeof(DB_MPREG)); + } + + /* Discard DB_MPOOLFILEs. */ + while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) + if ((t_ret = memp_fclose(dbmfp)) != 0 && ret == 0) + ret = t_ret; + + /* Discard the thread mutex. */ + if (dbmp->mutexp != NULL) + __db_mutex_free(dbenv, dbmp->reginfo, dbmp->mutexp); + + /* Detach from the region(s). */ + for (i = 0; i < dbmp->nreg; ++i) + if ((t_ret = __db_r_detach( + dbenv, &dbmp->reginfo[i], 0)) != 0 && ret == 0) + ret = t_ret; + + __os_free(dbmp->reginfo, dbmp->nreg * sizeof(*dbmp->reginfo)); + __os_free(dbmp, sizeof(*dbmp)); + + dbenv->mp_handle = NULL; + return (ret); +} + +#ifdef MUTEX_SYSTEM_RESOURCES +/* + * __mpool_region_maint -- + * Return the amount of space needed for region maintenance info. + * + */ +static size_t +__mpool_region_maint(infop) + REGINFO *infop; +{ + size_t s; + int numlocks; + + /* + * For mutex maintenance we need one mutex per possible page. + * Compute the maximum number of pages this cache can have. + * Also add in an mpool mutex. + */ + numlocks = ((infop->rp->size / DB_MIN_PGSIZE) + 1); + s = sizeof(roff_t) * numlocks; + return (s); +} +#endif + +/* + * __mpool_region_destroy + * Destroy any region maintenance info. + * + * PUBLIC: void __mpool_region_destroy __P((DB_ENV *, REGINFO *)); + */ +void +__mpool_region_destroy(dbenv, infop) + DB_ENV *dbenv; + REGINFO *infop; +{ + MPOOL *mp; + + COMPQUIET(dbenv, NULL); + mp = R_ADDR(infop, infop->rp->primary); + + __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, mp->maint_off)); + return; +} diff --git a/db/mp/mp_register.c b/db/mp/mp_register.c new file mode 100644 index 000000000..27859f69d --- /dev/null +++ b/db/mp/mp_register.c @@ -0,0 +1,85 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_register.c,v 11.12 2000/11/15 19:25:39 sue Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +/* + * memp_register -- + * Register a file type's pgin, pgout routines. 
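+ *
+ * An illustrative registration (my_pgin and the ftype value 42 are
+ * invented for the example):
+ *
+ *	int my_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ *
+ *	ret = memp_register(dbenv, 42, my_pgin, NULL);
+ *
+ * where my_pgin might byte-swap a page each time it is read into
+ * the cache, and the NULL pgout means no conversion is needed on
+ * write.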
+ */ +int +memp_register(dbenv, ftype, pgin, pgout) + DB_ENV *dbenv; + int ftype; + int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); + int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *)); +{ + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + int ret; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_register(dbenv, ftype, pgin, pgout)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + + /* + * Chances are good that the item has already been registered, as the + * DB access methods are the folks that call this routine. If already + * registered, just update the entry, although it's probably unchanged. + */ + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + for (mpreg = LIST_FIRST(&dbmp->dbregq); + mpreg != NULL; mpreg = LIST_NEXT(mpreg, q)) + if (mpreg->ftype == ftype) { + mpreg->pgin = pgin; + mpreg->pgout = pgout; + break; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if (mpreg != NULL) + return (0); + + /* New entry. */ + if ((ret = __os_malloc(dbenv, sizeof(DB_MPREG), NULL, &mpreg)) != 0) + return (ret); + + mpreg->ftype = ftype; + mpreg->pgin = pgin; + mpreg->pgout = pgout; + + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + LIST_INSERT_HEAD(&dbmp->dbregq, mpreg, q); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + + return (0); +} diff --git a/db/mp/mp_stat.c b/db/mp/mp_stat.c new file mode 100644 index 000000000..798251344 --- /dev/null +++ b/db/mp/mp_stat.c @@ -0,0 +1,388 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_stat.c,v 11.21 2001/01/09 16:59:30 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#include +#include +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "db_am.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static void __memp_dumpcache + __P((DB_MPOOL *, REGINFO *, size_t *, FILE *, u_int32_t)); +static void __memp_pbh __P((DB_MPOOL *, BH *, size_t *, FILE *)); + +/* + * memp_stat -- + * Display MPOOL statistics. + */ +int +memp_stat(dbenv, gspp, fspp, db_malloc) + DB_ENV *dbenv; + DB_MPOOL_STAT **gspp; + DB_MPOOL_FSTAT ***fspp; + void *(*db_malloc) __P((size_t)); +{ + DB_MPOOL *dbmp; + DB_MPOOL_FSTAT **tfsp, *tstruct; + DB_MPOOL_STAT *sp; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + char *tname; + size_t len, nlen; + u_int32_t i; + int ret; + char *name; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_stat(dbenv, gspp, fspp, db_malloc)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + sp = NULL; + + /* Global statistics. */ + mp = dbmp->reginfo[0].primary; + if (gspp != NULL) { + *gspp = NULL; + + if ((ret = __os_calloc(dbenv, 1, sizeof(**gspp), gspp)) != 0) + return (ret); + sp = *gspp; + + /* + * Initialization and information that is not maintained on + * a per-cache basis. 
+ */ + sp->st_hash_longest = 0; + sp->st_region_wait = dbmp->reginfo[0].rp->mutex.mutex_set_wait; + sp->st_region_nowait = + dbmp->reginfo[0].rp->mutex.mutex_set_nowait; + sp->st_gbytes = dbenv->mp_gbytes; + sp->st_bytes = dbenv->mp_bytes; + sp->st_ncache = dbmp->nreg; + sp->st_regsize = dbmp->reginfo[0].rp->size; + + R_LOCK(dbenv, dbmp->reginfo); + + /* Walk the cache list and accumulate the global information. */ + for (i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + sp->st_cache_hit += c_mp->stat.st_cache_hit; + sp->st_cache_miss += c_mp->stat.st_cache_miss; + sp->st_map += c_mp->stat.st_map; + sp->st_page_create += c_mp->stat.st_page_create; + sp->st_page_in += c_mp->stat.st_page_in; + sp->st_page_out += c_mp->stat.st_page_out; + sp->st_ro_evict += c_mp->stat.st_ro_evict; + sp->st_rw_evict += c_mp->stat.st_rw_evict; + sp->st_hash_buckets += c_mp->stat.st_hash_buckets; + sp->st_hash_searches += c_mp->stat.st_hash_searches; + if (c_mp->stat.st_hash_longest > sp->st_hash_longest) + sp->st_hash_longest = + c_mp->stat.st_hash_longest; + sp->st_hash_examined += c_mp->stat.st_hash_examined; + sp->st_page_clean += c_mp->stat.st_page_clean; + sp->st_page_dirty += c_mp->stat.st_page_dirty; + sp->st_page_trickle += c_mp->stat.st_page_trickle; + sp->st_region_wait += c_mp->stat.st_region_wait; + sp->st_region_nowait += c_mp->stat.st_region_nowait; + } + + /* + * We have duplicate statistics fields in the cache and + * per-file structures. The counters are only incremented + * in the per-file structures, though. The intent is that + * if we ever flush files from the pool we can save their + * last known totals in the cache structure. + */ + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + sp->st_cache_hit += mfp->stat.st_cache_hit; + sp->st_cache_miss += mfp->stat.st_cache_miss; + sp->st_map += mfp->stat.st_map; + sp->st_page_create += mfp->stat.st_page_create; + sp->st_page_in += mfp->stat.st_page_in; + sp->st_page_out += mfp->stat.st_page_out; + } + + R_UNLOCK(dbenv, dbmp->reginfo); + } + + /* Per-file statistics. */ + if (fspp != NULL) { + *fspp = NULL; + + R_LOCK(dbenv, dbmp->reginfo); + + /* Count the MPOOLFILE structures. */ + for (i = 0, len = 0, + mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; + ++i, mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) + len += sizeof(DB_MPOOL_FSTAT *) + + sizeof(DB_MPOOL_FSTAT) + + strlen(__memp_fns(dbmp, mfp)) + 1; + len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */ + + R_UNLOCK(dbenv, dbmp->reginfo); + + if (len == 0) + return (0); + + /* Allocate space */ + if ((ret = __os_malloc(dbenv, len, db_malloc, fspp)) != 0) + return (ret); + + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Build each individual entry. We assume that an array of + * pointers are aligned correctly to be followed by an array + * of structures, which should be safe (in this particular + * case, the first element of the structure is a pointer, so + * we're doubly safe). The array is followed by space for + * the text file names. + * + * Add 1 to i because we need to skip over the NULL. 
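+ *
+ * The single allocation is laid out as (illustration):
+ *
+ *	[ptr 0]...[ptr i-1][NULL]
+ *	[DB_MPOOL_FSTAT 0]...[DB_MPOOL_FSTAT i-1]
+ *	[name 0\0]...[name i-1\0]
+ *
+ * tfsp walks the pointer array, tstruct the structures, and
+ * tname the string area.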
+ */ + tfsp = *fspp; + tstruct = (DB_MPOOL_FSTAT *)(tfsp + i + 1); + tname = (char *)(tstruct + i); + + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; + ++tfsp, ++tstruct, tname += nlen, + mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + name = __memp_fns(dbmp, mfp); + nlen = strlen(name) + 1; + *tfsp = tstruct; + *tstruct = mfp->stat; + tstruct->file_name = tname; + memcpy(tname, name, nlen); + } + *tfsp = NULL; + + R_UNLOCK(dbenv, dbmp->reginfo); + } + return (0); +} + +#define FMAP_ENTRIES 200 /* Files we map. */ + +#define MPOOL_DUMP_HASH 0x01 /* Debug hash chains. */ +#define MPOOL_DUMP_LRU 0x02 /* Debug LRU chains. */ +#define MPOOL_DUMP_MEM 0x04 /* Debug region memory. */ +#define MPOOL_DUMP_ALL 0x07 /* Debug all. */ + +/* + * __memp_dump_region -- + * Display MPOOL structures. + * + * PUBLIC: void __memp_dump_region __P((DB_ENV *, char *, FILE *)); + */ +void +__memp_dump_region(dbenv, area, fp) + DB_ENV *dbenv; + char *area; + FILE *fp; +{ + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + MPOOL *mp; + MPOOLFILE *mfp; + size_t fmap[FMAP_ENTRIES + 1]; + u_int32_t i, flags; + int cnt; + u_int8_t *p; + + dbmp = dbenv->mp_handle; + + /* Make it easy to call from the debugger. */ + if (fp == NULL) + fp = stderr; + + for (flags = 0; *area != '\0'; ++area) + switch (*area) { + case 'A': + LF_SET(MPOOL_DUMP_ALL); + break; + case 'h': + LF_SET(MPOOL_DUMP_HASH); + break; + case 'l': + LF_SET(MPOOL_DUMP_LRU); + break; + case 'm': + LF_SET(MPOOL_DUMP_MEM); + break; + } + + R_LOCK(dbenv, dbmp->reginfo); + + mp = dbmp->reginfo[0].primary; + + /* Display MPOOL structures. */ + (void)fprintf(fp, "%s\nPool (region addr 0x%lx)\n", + DB_LINE, (u_long)dbmp->reginfo[0].addr); + + /* Display the MPOOLFILE structures. */ + cnt = 0; + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile), ++cnt) { + (void)fprintf(fp, "File #%d: %s: type %ld, %s\n\t [UID: ", + cnt + 1, __memp_fns(dbmp, mfp), (long)mfp->ftype, + F_ISSET(mfp, MP_CAN_MMAP) ? "mmap" : "read/write"); + p = R_ADDR(dbmp->reginfo, mfp->fileid_off); + for (i = 0; i < DB_FILE_ID_LEN; ++i) { + (void)fprintf(fp, "%x", *p++); + if (i < DB_FILE_ID_LEN - 1) + (void)fprintf(fp, " "); + } + (void)fprintf(fp, "]\n"); + if (cnt < FMAP_ENTRIES) + fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); + } + + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) { + (void)fprintf(fp, "File #%d: %s: per-process, %s\n", + cnt + 1, __memp_fn(dbmfp), + F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write"); + if (cnt < FMAP_ENTRIES) + fmap[cnt] = R_OFFSET(dbmp->reginfo, mfp); + } + if (cnt < FMAP_ENTRIES) + fmap[cnt] = INVALID_ROFF; + else + fmap[FMAP_ENTRIES] = INVALID_ROFF; + + /* Dump the memory pools. */ + for (i = 0; i < mp->nreg; ++i) { + (void)fprintf(fp, "%s\nCache #%d:\n", DB_LINE, i + 1); + __memp_dumpcache(dbmp, &dbmp->reginfo[i], fmap, fp, flags); + } + + R_UNLOCK(dbenv, dbmp->reginfo); + + /* Flush in case we're debugging. */ + (void)fflush(fp); +} + +/* + * __memp_dumpcache -- + * Display statistics for a cache. + */ +static void +__memp_dumpcache(dbmp, reginfo, fmap, fp, flags) + DB_MPOOL *dbmp; + REGINFO *reginfo; + size_t *fmap; + FILE *fp; + u_int32_t flags; +{ + BH *bhp; + DB_HASHTAB *dbht; + MPOOL *c_mp; + int bucket; + + c_mp = reginfo->primary; + + /* Display the hash table list of BH's. 
*/ + if (LF_ISSET(MPOOL_DUMP_HASH)) { + (void)fprintf(fp, + "%s\nBH hash table (%lu hash slots)\npageno, file, ref, address\n", + DB_LINE, (u_long)c_mp->htab_buckets); + for (dbht = R_ADDR(reginfo, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++dbht, ++bucket) { + if (SH_TAILQ_FIRST(dbht, __bh) != NULL) + (void)fprintf(fp, "%lu:\n", (u_long)bucket); + for (bhp = SH_TAILQ_FIRST(dbht, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + __memp_pbh(dbmp, bhp, fmap, fp); + } + } + + /* Display the LRU list of BH's. */ + if (LF_ISSET(MPOOL_DUMP_LRU)) { + (void)fprintf(fp, "%s\nBH LRU list\n", DB_LINE); + (void)fprintf(fp, "pageno, file, ref, address\n"); + for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) + __memp_pbh(dbmp, bhp, fmap, fp); + } + + /* Dump the memory pool. */ + if (LF_ISSET(MPOOL_DUMP_MEM)) + __db_shalloc_dump(reginfo->addr, fp); +} + +/* + * __memp_pbh -- + * Display a BH structure. + */ +static void +__memp_pbh(dbmp, bhp, fmap, fp) + DB_MPOOL *dbmp; + BH *bhp; + size_t *fmap; + FILE *fp; +{ + static const FN fn[] = { + { BH_CALLPGIN, "callpgin" }, + { BH_DIRTY, "dirty" }, + { BH_DISCARD, "discard" }, + { BH_LOCKED, "locked" }, + { BH_SYNC, "sync" }, + { BH_SYNC_LOGFLSH, "sync:logflush" }, + { BH_TRASH, "trash" }, + { 0, NULL } + }; + int i; + + for (i = 0; i < FMAP_ENTRIES; ++i) + if (fmap[i] == INVALID_ROFF || fmap[i] == bhp->mf_offset) + break; + + if (fmap[i] == INVALID_ROFF) + (void)fprintf(fp, " %4lu, %lu, %2lu, %lu", + (u_long)bhp->pgno, (u_long)bhp->mf_offset, + (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp)); + else + (void)fprintf(fp, " %4lu, #%d, %2lu, %lu", + (u_long)bhp->pgno, i + 1, + (u_long)bhp->ref, (u_long)R_OFFSET(dbmp->reginfo, bhp)); + + __db_prflags(bhp->flags, fn, fp); + + (void)fprintf(fp, "\n"); +} diff --git a/db/mp/mp_sync.c b/db/mp/mp_sync.c new file mode 100644 index 000000000..1b0751db7 --- /dev/null +++ b/db/mp/mp_sync.c @@ -0,0 +1,658 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static int __bhcmp __P((const void *, const void *)); +static int __memp_fsync __P((DB_MPOOLFILE *)); +static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *)); + +/* + * memp_sync -- + * Mpool sync function. + */ +int +memp_sync(dbenv, lsnp) + DB_ENV *dbenv; + DB_LSN *lsnp; +{ + BH *bhp, **bharray; + DB_MPOOL *dbmp; + DB_LSN tlsn; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + u_int32_t ar_cnt, i, ndirty; + int ret, retry_done, retry_need, wrote; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_sync(dbenv, lsnp)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* + * If no LSN is provided, flush the entire cache. + * + * !!! + * Our current behavior is to flush the entire cache, so there's + * nothing special we have to do here other than deal with NULL + * pointers. 
+ */ + if (lsnp == NULL) { + ZERO_LSN(tlsn); + lsnp = &tlsn; + F_SET(mp, MP_LSN_RETRY); + } else if (!LOGGING_ON(dbenv)) { + __db_err(dbenv, "memp_sync: requires logging"); + return (EINVAL); + } + + /* + * Sync calls are single-threaded so that we don't have multiple + * threads, with different checkpoint LSNs, walking the caches + * and updating the checkpoint LSNs and how many buffers remain + * to be written for the checkpoint. This shouldn't be a problem, + * any application that has multiple checkpoint threads isn't what + * I'd call trustworthy. + */ + MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp); + + /* + * If the application is asking about a previous call to memp_sync(), + * and we haven't found any buffers that the application holding the + * pin couldn't write, return yes or no based on the current count. + * Note, if the application is asking about a LSN *smaller* than one + * we've already handled or are currently handling, then we return a + * result based on the count for the larger LSN. + */ + R_LOCK(dbenv, dbmp->reginfo); + if (!IS_ZERO_LSN(*lsnp) && + !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { + if (mp->lsn_cnt == 0) { + *lsnp = mp->lsn; + ret = 0; + } else + ret = DB_INCOMPLETE; + + R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_UNLOCK(dbenv, &mp->sync_mutex); + return (ret); + } + + /* + * Allocate room for a list of buffers, and decide how many buffers + * we can pin down. + * + * !!! + * Note: __memp_sballoc has released the region lock if we're not + * continuing forward. + */ + if ((ret = + __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) { + MUTEX_UNLOCK(dbenv, &mp->sync_mutex); + return (ret); + } + + retry_done = 0; +retry: retry_need = 0; + /* + * Start a new checkpoint. + * + * Save the LSN. We know that it's a new LSN, a retry, or larger than + * the one for which we were already doing a checkpoint. (BTW, I don't + * expect to see multiple LSN's from the same or multiple processes, + * but You Just Never Know. Responding as if they all called with the + * largest of the LSNs specified makes everything work.) + * + * We don't currently use the LSN we save. We could potentially save + * the last-written LSN in each buffer header and use it to determine + * what buffers need to be written. The problem with this is that it's + * sizeof(LSN) more bytes of buffer header. We currently write all the + * dirty buffers instead, but with a sufficiently large cache that's + * going to be a problem. + */ + mp->lsn = *lsnp; + + /* + * Clear the global count of buffers waiting to be written, walk the + * list of files clearing the count of buffers waiting to be written. + * + * Clear the retry flag. + */ + mp->lsn_cnt = 0; + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) + mfp->lsn_cnt = 0; + F_CLR(mp, MP_LSN_RETRY); + + /* + * Walk each cache's list of buffers and mark all dirty buffers to be + * written and all pinned buffers to be potentially written (we can't + * know if they'll need to be written until the holder returns them to + * the cache). We do this in one pass while holding the region locked + * so that processes can't make new buffers dirty, causing us to never + * finish. Since the application may have restarted the sync using a + * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that + * appear leftover from previous calls. 
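+ *
+ * For example (describing the behavior, not new code): a clean
+ * buffer that is pinned when we pass by is still marked BH_SYNC and
+ * counted, because its holder may dirty it before returning it; if
+ * it comes back clean, memp_fput clears BH_SYNC and decrements the
+ * counts instead of writing it.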
+ * + * Keep a count of the total number of buffers we need to write in + * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count. + */ + for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { + F_SET(bhp, BH_SYNC); + + ++mp->lsn_cnt; + + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + ++mfp->lsn_cnt; + + /* + * If the buffer isn't being used, we can write + * it immediately, so increment its reference + * count to lock it down, and save a reference + * to it. + * + * If we've run out space to store buffer refs, + * we're screwed. We don't want to realloc the + * array while holding a region lock, so we set + * a flag and deal with it later. + */ + if (bhp->ref == 0) { + ++bhp->ref; + bharray[ar_cnt] = bhp; + + if (++ar_cnt >= ndirty) { + retry_need = 1; + break; + } + } + } else + if (F_ISSET(bhp, BH_SYNC)) + F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); + } + if (ar_cnt >= ndirty) + break; + } + + /* If there no buffers we can write immediately, we're done. */ + if (ar_cnt == 0) { + ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; + goto done; + } + + R_UNLOCK(dbenv, dbmp->reginfo); + + /* + * Sort the buffers we're going to write immediately. + * + * We try and write the buffers in file/page order: it should reduce + * seeks by the underlying filesystem and possibly reduce the actual + * number of writes. + */ + if (ar_cnt > 1) + qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); + + /* + * Flush the log. We have to ensure the log records reflecting the + * changes on the database pages we're writing have already made it + * to disk. We usually do that as we write each page, but if we + * are going to write a large number of pages, repeatedly acquiring + * the log region lock is going to be expensive. Flush the entire + * log now, so that sync doesn't require any more log flushes. + */ + if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0) + goto done; + + R_LOCK(dbenv, dbmp->reginfo); + + /* Walk the array, writing buffers. */ + for (i = 0; i < ar_cnt; ++i) { + /* + * It's possible for a thread to have gotten the buffer since + * we listed it for writing. If the reference count is still + * 1, we're the only ones using the buffer, go ahead and write. + * If it's >1, then skip the buffer and assume that it will be + * written when it's returned to the cache. + */ + if (bharray[i]->ref > 1) { + --bharray[i]->ref; + continue; + } + + /* Write the buffer. */ + mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset); + ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote); + + /* Release the buffer. */ + --bharray[i]->ref; + + if (ret == 0 && wrote) + continue; + + /* + * Any process syncing the shared memory buffer pool had best + * be able to write to any underlying file. Be understanding, + * but firm, on this point. + */ + if (ret == 0) { + __db_err(dbenv, "%s: unable to flush page: %lu", + __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno); + ret = EPERM; + } + + /* + * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no + * future checkpoint return can depend on this failure. Clear + * the buffer's BH_SYNC flag, because it's used to determine + * if lsn_cnt values are incremented/decremented. Don't bother + * to reset/clear: + * + * MPOOL->lsn_cnt + * MPOOLFILE->lsn_cnt + * + * they don't make any difference. + */ + ZERO_LSN(mp->lsn); + F_SET(mp, MP_LSN_RETRY); + + /* Release any buffers we're still pinning down. 
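+ * (Illustration: if the write of bharray[3] of 10 fails,
+ * entries 4..9 still hold the extra reference acquired above;
+ * the loop below releases them and clears their sync flags so
+ * the next checkpoint starts from a consistent state.)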
*/ + while (++i < ar_cnt) { + bhp = bharray[i]; + --bhp->ref; + F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); + } + + goto done; + } + + ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0; + + /* + * If there were too many buffers and we're not returning an error, we + * re-try the checkpoint once -- since we allocated 80% of the total + * buffer count, once should be enough. If it still doesn't work, some + * other thread of control is dirtying buffers as fast as we're writing + * them, and we might as well give up for now. In the latter case, set + * the global retry flag, we'll have to start from scratch on the next + * checkpoint. + */ + if (retry_need) { + if (retry_done) { + ret = DB_INCOMPLETE; + F_SET(mp, MP_LSN_RETRY); + } else { + retry_done = 1; + goto retry; + } + } + +done: R_UNLOCK(dbenv, dbmp->reginfo); + MUTEX_UNLOCK(dbenv, &mp->sync_mutex); + + __os_free(bharray, ndirty * sizeof(BH *)); + + return (ret); +} + +/* + * memp_fsync -- + * Mpool file sync function. + */ +int +memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + int is_tmp; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_fsync(dbmfp)); +#endif + + PANIC_CHECK(dbenv); + + /* + * If this handle doesn't have a file descriptor that's open for + * writing, or if the file is a temporary, there's no reason to + * proceed further. + */ + if (F_ISSET(dbmfp, MP_READONLY)) + return (0); + + R_LOCK(dbenv, dbmp->reginfo); + is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP); + R_UNLOCK(dbenv, dbmp->reginfo); + if (is_tmp) + return (0); + + return (__memp_fsync(dbmfp)); +} + +/* + * __mp_xxx_fh -- + * Return a file descriptor for DB 1.85 compatibility locking. + * + * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **)); + */ +int +__mp_xxx_fh(dbmfp, fhp) + DB_MPOOLFILE *dbmfp; + DB_FH **fhp; +{ + /* + * This is a truly spectacular layering violation, intended ONLY to + * support compatibility for the DB 1.85 DB->fd call. + * + * Sync the database file to disk, creating the file as necessary. + * + * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3). + * The MP_READONLY test isn't interesting because we will either + * already have a file descriptor (we opened the database file for + * reading) or we aren't readonly (we created the database which + * requires write privileges). The MP_TEMP test isn't interesting + * because we want to write to the backing file regardless so that + * we get a file descriptor to return. + */ + *fhp = &dbmfp->fh; + return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp)); +} + +/* + * __memp_fsync -- + * Mpool file internal sync function. + */ +static int +__memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + BH *bhp, **bharray; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + size_t mf_offset; + u_int32_t ar_cnt, i, ndirty; + int incomplete, ret, retry_done, retry_need, wrote; + + dbmp = dbmfp->dbmp; + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + + R_LOCK(dbenv, dbmp->reginfo); + + /* + * Allocate room for a list of buffers, and decide how many buffers + * we can pin down. + * + * !!! + * Note: __memp_sballoc has released our region lock if we're not + * continuing forward. 
+ */ + if ((ret = + __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) + return (ret); + + retry_done = 0; +retry: retry_need = 0; + /* + * Walk each cache's list of buffers and mark all dirty buffers to be + * written and all pinned buffers to be potentially written (we can't + * know if they'll need to be written until the holder returns them to + * the cache). We do this in one pass while holding the region locked + * so that processes can't make new buffers dirty, causing us to never + * finish. + */ + mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); + for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + if (!F_ISSET(bhp, BH_DIRTY) || + bhp->mf_offset != mf_offset) + continue; + if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { + incomplete = 1; + continue; + } + + /* + * If the buffer isn't being used, we can write + * it immediately, so increment its reference + * count to lock it down, and save a reference + * to it. + * + * If we've run out space to store buffer refs, + * we're screwed. We don't want to realloc the + * array while holding a region lock, so we set + * a flag and deal with it later. + */ + ++bhp->ref; + bharray[ar_cnt] = bhp; + if (++ar_cnt >= ndirty) { + retry_need = 1; + break; + } + } + if (ar_cnt >= ndirty) + break; + } + + /* If there no buffers we can write immediately, we're done. */ + if (ar_cnt == 0) { + ret = 0; + goto done; + } + + R_UNLOCK(dbenv, dbmp->reginfo); + + /* Sort the buffers we're going to write. */ + if (ar_cnt > 1) + qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); + + R_LOCK(dbenv, dbmp->reginfo); + + /* Walk the array, writing buffers. */ + for (i = 0; i < ar_cnt;) { + /* + * It's possible for a thread to have gotten the buffer since + * we listed it for writing. If the reference count is still + * 1, we're the only ones using the buffer, go ahead and write. + * If it's >1, then skip the buffer and assume that it will be + * written when it's returned to the cache. + */ + if (bharray[i]->ref > 1) { + incomplete = 1; + --bharray[i++]->ref; + continue; + } + + /* Write the buffer. */ + ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote); + + /* Release the buffer. */ + --bharray[i++]->ref; + + if (ret == 0) { + if (!wrote) + incomplete = 1; + continue; + } + + /* + * On error: + * + * Release any buffers we're still pinning down. + */ + while (i < ar_cnt) + --bharray[i++]->ref; + break; + } + + /* + * If there were too many buffers and we're not returning an error, we + * re-try the flush once -- since we allocated 80% of the total + * buffer count, once should be enough. If it still doesn't work, some + * other thread of control is dirtying buffers as fast as we're writing + * them, and we might as well give up. + */ + if (retry_need) { + if (retry_done) + incomplete = 1; + else { + retry_done = 1; + goto retry; + } + } + +done: R_UNLOCK(dbenv, dbmp->reginfo); + + __os_free(bharray, ndirty * sizeof(BH *)); + + /* + * Sync the underlying file as the last thing we do, so that the OS + * has a maximal opportunity to flush buffers before we request it. + * + * !!!: + * Don't lock the region around the sync, fsync(2) has no atomicity + * issues. + */ + if (ret == 0) + ret = incomplete ? + DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh); + + return (ret); +} + +/* + * __memp_sballoc -- + * Allocate room for a list of buffers. 
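+ *
+ * Worked illustration (invented numbers): with 100 dirty and 100
+ * clean buffers, maxpin is (200 * 8) / 10 == 160, and the padded
+ * request 100 + 100/2 + 10 == 160 fits exactly, so one sync can pin
+ * at most 80% of the cache.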
+ */ +static int +__memp_sballoc(dbenv, bharrayp, ndirtyp) + DB_ENV *dbenv; + BH ***bharrayp; + u_int32_t *ndirtyp; +{ + DB_MPOOL *dbmp; + MPOOL *c_mp, *mp; + u_int32_t i, nclean, ndirty, maxpin; + int ret; + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* + * We don't want to hold the region lock while we write the buffers, + * so only lock it while we create a list. + * + * Walk through the list of caches, figuring out how many buffers + * we're going to need. + * + * Make a point of not holding the region lock across the library + * allocation call. + */ + for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + ndirty += c_mp->stat.st_page_dirty; + nclean += c_mp->stat.st_page_clean; + } + R_UNLOCK(dbenv, dbmp->reginfo); + if (ndirty == 0) { + *ndirtyp = 0; + return (0); + } + + /* + * We don't want to pin down the entire buffer cache, otherwise we'll + * starve threads needing new pages. Don't pin down more than 80% of + * the cache, making sure that we don't screw up just because only a + * few pages have been created. + */ + maxpin = ((ndirty + nclean) * 8) / 10; + if (maxpin < 10) + maxpin = 10; + + /* + * Get a good-sized block of memory to hold buffer pointers, we don't + * want to run out, but correct if we want to allocate more than we + * would be allowed to store, regardless. + */ + ndirty += ndirty / 2 + 10; + if (ndirty > maxpin) + ndirty = maxpin; + if ((ret = + __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0) + return (ret); + + *ndirtyp = ndirty; + + R_LOCK(dbenv, dbmp->reginfo); + + return (0); +} + +static int +__bhcmp(p1, p2) + const void *p1, *p2; +{ + BH *bhp1, *bhp2; + + bhp1 = *(BH * const *)p1; + bhp2 = *(BH * const *)p2; + + /* Sort by file (shared memory pool offset). */ + if (bhp1->mf_offset < bhp2->mf_offset) + return (-1); + if (bhp1->mf_offset > bhp2->mf_offset) + return (1); + + /* + * !!! + * Defend against badly written quicksort code calling the comparison + * function with two identical pointers (e.g., WATCOM C++ (Power++)). + */ + if (bhp1->pgno < bhp2->pgno) + return (-1); + if (bhp1->pgno > bhp2->pgno) + return (1); + return (0); +} diff --git a/db/mp/mp_trickle.c b/db/mp/mp_trickle.c new file mode 100644 index 000000000..f937805cf --- /dev/null +++ b/db/mp/mp_trickle.c @@ -0,0 +1,149 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: mp_trickle.c,v 11.12 2000/11/30 00:58:41 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include + +#include +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "mp.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static int __memp_trick __P((DB_ENV *, int, int, int *)); + +/* + * memp_trickle -- + * Keep a specified percentage of the buffers clean. 
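+ *
+ * Worked illustration (invented numbers): with pct == 20 and a cache
+ * of 1000 pages of which 150 are clean, __memp_trick writes dirty
+ * pages one at a time until at least 200 pages are clean.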
+ */ +int +memp_trickle(dbenv, pct, nwrotep) + DB_ENV *dbenv; + int pct, *nwrotep; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + u_int32_t i; + int ret; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) + return (__dbcl_memp_trickle(dbenv, pct, nwrotep)); +#endif + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); + + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + + if (nwrotep != NULL) + *nwrotep = 0; + + if (pct < 1 || pct > 100) + return (EINVAL); + + R_LOCK(dbenv, dbmp->reginfo); + + /* Loop through the caches... */ + for (ret = 0, i = 0; i < mp->nreg; ++i) + if ((ret = __memp_trick(dbenv, i, pct, nwrotep)) != 0) + break; + + R_UNLOCK(dbenv, dbmp->reginfo); + return (ret); +} + +/* + * __memp_trick -- + * Trickle a single cache. + */ +static int +__memp_trick(dbenv, ncache, pct, nwrotep) + DB_ENV *dbenv; + int ncache, pct, *nwrotep; +{ + BH *bhp; + DB_MPOOL *dbmp; + MPOOL *c_mp; + MPOOLFILE *mfp; + db_pgno_t pgno; + u_long total; + int ret, wrote; + + dbmp = dbenv->mp_handle; + c_mp = dbmp->reginfo[ncache].primary; + + /* + * If there are sufficient clean buffers, or no buffers or no dirty + * buffers, we're done. + * + * XXX + * Using st_page_clean and st_page_dirty is our only choice at the + * moment, but it's not as correct as we might like in the presence + * of pools with more than one buffer size, as a free 512-byte buffer + * isn't the same as a free 8K buffer. + */ +loop: total = c_mp->stat.st_page_clean + c_mp->stat.st_page_dirty; + if (total == 0 || c_mp->stat.st_page_dirty == 0 || + (c_mp->stat.st_page_clean * 100) / total >= (u_long)pct) + return (0); + + /* Loop until we write a buffer. */ + for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { + if (bhp->ref != 0 || + !F_ISSET(bhp, BH_DIRTY) || F_ISSET(bhp, BH_LOCKED)) + continue; + + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* + * We can't write to temporary files -- see the comment in + * mp_bh.c:__memp_bhwrite(). + */ + if (F_ISSET(mfp, MP_TEMP)) + continue; + + pgno = bhp->pgno; + if ((ret = __memp_bhwrite(dbmp, mfp, bhp, NULL, &wrote)) != 0) + return (ret); + + /* + * Any process syncing the shared memory buffer pool had better + * be able to write to any underlying file. Be understanding, + * but firm, on this point. + */ + if (!wrote) { + __db_err(dbenv, "%s: unable to flush page: %lu", + __memp_fns(dbmp, mfp), (u_long)pgno); + return (EPERM); + } + + ++c_mp->stat.st_page_trickle; + if (nwrotep != NULL) + ++*nwrotep; + goto loop; + } + + return (0); +} -- cgit v1.2.3
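
A closing, illustrative sketch of how the public entry points introduced by
this patchset fit together, written against the DB 3.x global-function API.
It is an editor's sketch, not code from the patch: the environment home
"env_home", the file name "my.db", the 0600 mode, and the 1KB page size are
all invented for the example, and error handling is cut to the minimum.

#include <string.h>
#include <db.h>

int
main()
{
	DB_ENV *dbenv;
	DB_MPOOLFILE *dbmfp;
	db_pgno_t pgno;
	void *pgaddr;
	int ret;

	if ((ret = db_env_create(&dbenv, 0)) != 0)
		return (1);
	if ((ret = dbenv->open(dbenv,
	    "env_home", DB_CREATE | DB_INIT_MPOOL, 0)) != 0)
		goto err;

	/* Open a file in the pool with 1KB pages. */
	if ((ret = memp_fopen(dbenv,
	    "my.db", DB_CREATE, 0600, 1024, NULL, &dbmfp)) != 0)
		goto err;

	/* Pin page 0, creating it if it doesn't exist. */
	pgno = 0;
	if ((ret = memp_fget(dbmfp, &pgno, DB_MPOOL_CREATE, &pgaddr)) != 0)
		goto err_close;

	memset(pgaddr, 0, 1024);	/* Modify the pinned page. */

	/* Return it dirty; every memp_fget needs a matching memp_fput. */
	if ((ret = memp_fput(dbmfp, pgaddr, DB_MPOOL_DIRTY)) != 0)
		goto err_close;

	ret = memp_fsync(dbmfp);	/* Flush the file's dirty pages. */

err_close:
	(void)memp_fclose(dbmfp);
err:
	(void)dbenv->close(dbenv, 0);
	return (ret == 0 ? 0 : 1);
}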