Diffstat (limited to 'db/mp')
-rw-r--r--  db/mp/Design        |  52
-rw-r--r--  db/mp/mp_alloc.c    | 118
-rw-r--r--  db/mp/mp_bh.c       |  47
-rw-r--r--  db/mp/mp_fget.c     |  84
-rw-r--r--  db/mp/mp_fmethod.c  |  22
-rw-r--r--  db/mp/mp_fopen.c    | 291
-rw-r--r--  db/mp/mp_fput.c     | 120
-rw-r--r--  db/mp/mp_fset.c     | 105
-rw-r--r--  db/mp/mp_method.c   | 243
-rw-r--r--  db/mp/mp_mvcc.c     |  63
-rw-r--r--  db/mp/mp_region.c   | 308
-rw-r--r--  db/mp/mp_register.c |   5
-rw-r--r--  db/mp/mp_resize.c   | 559
-rw-r--r--  db/mp/mp_stat.c     |  35
-rw-r--r--  db/mp/mp_sync.c     | 327
-rw-r--r--  db/mp/mp_trickle.c  |  19
16 files changed, 1592 insertions, 806 deletions
diff --git a/db/mp/Design b/db/mp/Design
deleted file mode 100644
index 1b26aae6c..000000000
--- a/db/mp/Design
+++ /dev/null
@@ -1,52 +0,0 @@
-$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
-
-There are three ways we do locking in the mpool code:
-
-Locking a handle mutex to provide concurrency for DB_THREAD operations.
-Locking the region mutex to provide mutual exclusion while reading and
- writing structures in the shared region.
-Locking buffer header mutexes during I/O.
-
-The first will not be further described here. We use the shared mpool
-region lock to provide mutual exclusion while reading/modifying all of
-the data structures, including the buffer headers. We use a per-buffer
-header lock to wait on buffer I/O. The order of locking is as follows:
-
-Searching for a buffer:
- Acquire the region lock.
- Find the buffer header.
- Increment the reference count (guarantee the buffer stays).
- While the BH_LOCKED flag is set (I/O is going on) {
- Release the region lock.
- Explicitly yield the processor if it's not the first pass
- through this loop, otherwise, we can simply spin because
- we'll be simply switching between the two locks.
- Request the buffer lock.
- The I/O will complete...
- Acquire the buffer lock.
- Release the buffer lock.
- Acquire the region lock.
- }
- Return the buffer.
-
-Reading/writing a buffer:
- Acquire the region lock.
- Find/create the buffer header.
- If reading, increment the reference count (guarantee the buffer stays).
- Set the BH_LOCKED flag.
- Acquire the buffer lock (guaranteed not to block).
- Release the region lock.
- Do the I/O and/or initialize the buffer contents.
- Release the buffer lock.
- At this point, the buffer lock is available, but the logical
- operation (flagged by BH_LOCKED) is not yet completed. For
- this reason, among others, threads checking the BH_LOCKED flag
- must loop around their test.
- Acquire the region lock.
- Clear the BH_LOCKED flag.
- Release the region lock.
- Return/discard the buffer.
-
-Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
-not reacquired when a region lock is reacquired because they couldn't
-have been closed/discarded and because they never move in memory.
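
Note: the Design file above is deleted by this change.  For readers of the
diff, its buffer-search protocol can be restated as a C-like sketch.  This
is a paraphrase of the deleted text only; find_header(), yield_processor()
and the LOCK_*/UNLOCK_* calls are hypothetical stand-ins for the real
region and buffer-header mutex primitives.

    /* Sketch of the deleted Design text's "Searching for a buffer". */
    BH *
    search_buffer(DB_MPOOL_HASH *hp, db_pgno_t pgno)
    {
            BH *bhp;
            int first = 1;

            LOCK_REGION();
            bhp = find_header(hp, pgno);    /* hypothetical lookup helper */
            ++bhp->ref;                     /* guarantee the buffer stays */
            while (F_ISSET(bhp, BH_LOCKED)) {       /* I/O in progress */
                    UNLOCK_REGION();
                    if (!first)                     /* avoid ping-ponging */
                            yield_processor();      /* between the locks */
                    first = 0;
                    LOCK_BUFFER(bhp);       /* blocks until I/O completes */
                    UNLOCK_BUFFER(bhp);
                    LOCK_REGION();          /* loop: recheck BH_LOCKED */
            }
            UNLOCK_REGION();
            return (bhp);
    }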
diff --git a/db/mp/mp_alloc.c b/db/mp/mp_alloc.c
index 0619d5ccf..c18e62dff 100644
--- a/db/mp/mp_alloc.c
+++ b/db/mp/mp_alloc.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_alloc.c,v 12.20 2006/09/07 15:11:26 mjc Exp $
+ * $Id: mp_alloc.c,v 12.33 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -38,7 +37,6 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
- db_mutex_t mutex;
u_int32_t buckets, buffers, high_priority, priority;
u_int32_t put_counter, total_buckets;
int aggressive, alloc_freeze, giveup, got_oldest, ret;
@@ -54,7 +52,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
aggressive = alloc_freeze = giveup = got_oldest = 0;
hp_tmp = NULL;
- c_mp->stat.st_alloc++;
+ STAT(c_mp->stat.st_alloc++);
/*
* If we're allocating a buffer, and the one we're discarding is the
@@ -86,7 +84,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
* we need in the hopes it will coalesce into a contiguous chunk of the
* right size. In the latter case we branch back here and try again.
*/
-alloc: if ((ret = __db_shalloc(infop, len, 0, &p)) == 0) {
+alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) {
if (mfp != NULL)
c_mp->stat.st_pages++;
MPOOL_REGION_UNLOCK(dbenv, infop);
@@ -106,6 +104,7 @@ found: if (offsetp != NULL)
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
+#ifdef HAVE_STATISTICS
total_buckets += buckets;
if (total_buckets != 0) {
if (total_buckets > c_mp->stat.st_alloc_max_buckets)
@@ -117,6 +116,7 @@ found: if (offsetp != NULL)
c_mp->stat.st_alloc_max_pages = buffers;
c_mp->stat.st_alloc_pages += buffers;
}
+#endif
return (0);
} else if (giveup || c_mp->stat.st_pages == 0) {
MPOOL_REGION_UNLOCK(dbenv, infop);
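
Note: this and the following hunks wrap statistics updates in STAT() or
#ifdef HAVE_STATISTICS so they compile away in stripped builds.  The STAT
macro's definition is not part of this diff; the shape assumed here is:

    #ifdef HAVE_STATISTICS
    #define STAT(x) x       /* statistics enabled: perform the update */
    #else
    #define STAT(x)         /* statistics compiled out: expands to nothing */
    #endif

so STAT(c_mp->stat.st_alloc++); above costs nothing when HAVE_STATISTICS
is not defined.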
@@ -153,24 +153,14 @@ found: if (offsetp != NULL)
}
/*
- * Skip empty buckets.
- *
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero.
- */
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
- continue;
-
- /*
* The failure mode is when there are too many buffers we can't
- * write or there's not enough memory in the system. We don't
- * have a way to know that allocation has no way to succeed.
- * We fail if there were no pages returned to the cache after
- * we've been trying for a relatively long time.
+ * write or there's not enough memory in the system to support
+ * the number of pinned buffers.
*
- * Get aggressive if we've tried to flush the number of hash
- * buckets as are in the system and have not found any more
- * space. Aggressive means:
+ * Get aggressive if we've reviewed the entire cache without
+ * freeing 3 times the needed space. (The code resets the
+ * counter when we free 3 times the needed space.) Aggressive
+ * means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
@@ -187,11 +177,15 @@ found: if (offsetp != NULL)
* Always try to allocate memory too, in case some other thread
* returns its memory to the region.
*
+ * We have no way to know whether an allocation can ever
+ * succeed.  Fail if no pages are returned to the cache after
+ * we've been trying for a relatively long time.
+ *
* !!!
* This test ignores pathological cases like no buffers in the
- * system -- that shouldn't be possible.
+ * system -- we check for that early on, so it isn't possible.
*/
- if ((++buckets % c_mp->htab_buckets) == 0) {
+ if (buckets++ == c_mp->htab_buckets) {
if (freed_space > 0)
goto alloc;
MPOOL_REGION_UNLOCK(dbenv, infop);
@@ -207,7 +201,7 @@ found: if (offsetp != NULL)
case 5:
case 6:
(void)__memp_sync_int(
- dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);
__os_sleep(dbenv, 1, 0);
break;
@@ -222,11 +216,35 @@ found: if (offsetp != NULL)
goto alloc;
}
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+ /*
+ * Skip buckets that only have pinned pages.
+ *
+ * Again we are doing this without locking. If we misread
+ * the number we might improperly skip a bucket but this is
+ * not fatal.
+ */
+ if (hp->hash_priority == UINT32_MAX)
+ continue;
+
if (!aggressive) {
- /* Skip high priority buckets. */
- if (hp->hash_priority > high_priority)
+ /* Adjust if the bucket has not been reset. */
+ priority = hp->hash_priority;
+ if (c_mp->lru_reset != 0 &&
+ c_mp->lru_reset <= hp - dbht)
+ priority -= MPOOL_BASE_DECREMENT;
+ /*
+ * Skip high priority buckets.
+ */
+ if (priority > high_priority)
continue;
-
/*
* Find two buckets and select the one with the lowest
* priority. Performance testing shows that looking
@@ -237,18 +255,22 @@ found: if (offsetp != NULL)
hp_tmp = hp;
continue;
}
- if (hp->hash_priority > hp_tmp->hash_priority)
+ if (c_mp->lru_reset &&
+ c_mp->lru_reset <= hp_tmp - dbht) {
+ if (priority > hp_tmp->hash_priority -
+ MPOOL_BASE_DECREMENT)
+ hp = hp_tmp;
+ } else if (priority > hp_tmp->hash_priority)
hp = hp_tmp;
hp_tmp = NULL;
}
- /* Remember the priority of the buffer we're looking for. */
- priority = hp->hash_priority;
-
/* Unlock the region and lock the hash bucket. */
MPOOL_REGION_UNLOCK(dbenv, infop);
- mutex = hp->mtx_hash;
- MUTEX_LOCK(dbenv, mutex);
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+
+ /* Remember the priority of the buffer we're looking for. */
+ priority = hp->hash_priority;
#ifdef DIAGNOSTIC
__memp_check_order(dbenv, hp);
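
Note: the lru_reset tests in the two hunks above compensate for an LRU
reset walk that may still be in flight (see the mp_fput.c changes below).
lru_reset is assumed to hold the index of the next hash bucket the walk
will visit, so buckets at or beyond that index still carry pre-reset,
inflated priorities.  Schematically, the effective priority read by the
allocator is:

    /* Effective priority of bucket hp during an in-flight LRU reset. */
    priority = hp->hash_priority;
    if (c_mp->lru_reset != 0 && c_mp->lru_reset <= hp - dbht)
            priority -= MPOOL_BASE_DECREMENT;       /* not yet reset */

and the same discount is applied to hp_tmp before the two candidate
buckets are compared.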
@@ -311,10 +333,15 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
++bhp->ref;
ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
--bhp->ref;
+#ifdef HAVE_STATISTICS
if (ret == 0)
++c_mp->stat.st_rw_evict;
- } else
+#endif
+ }
+#ifdef HAVE_STATISTICS
+ else
++c_mp->stat.st_ro_evict;
+#endif
/*
* Freeze this buffer, if necessary. That is, if the buffer
@@ -373,13 +400,13 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
++bhp->ref;
if ((ret = __memp_bh_thaw(dbmp, infop, hp,
bhp, NULL)) != 0) {
- MUTEX_UNLOCK(dbenv, mutex);
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (ret);
}
alloc_freeze = 0;
goto this_hb;
} else if (alloc_freeze) {
- if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0)
+ if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
return (ret);
MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize,
PROT_READ | PROT_WRITE | PROT_EXEC);
@@ -399,13 +426,13 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
continue;
} else if (mfp != NULL &&
mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
- if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0)
+ if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
return (ret);
p = bhp;
goto found;
} else {
- freed_space += __db_shalloc_sizeof(bhp);
- if ((ret = __memp_bhfree(dbmp,
+ freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize;
+ if ((ret = __memp_bhfree(dbmp, infop,
hp, bhp, BH_FREE_FREEMEM)) != 0)
return (ret);
}
@@ -419,7 +446,7 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
* hash bucket lock has already been discarded.
*/
if (0) {
-next_hb: MUTEX_UNLOCK(dbenv, mutex);
+next_hb: MUTEX_UNLOCK(dbenv, hp->mtx_hash);
}
MPOOL_REGION_LOCK(dbenv, infop);
@@ -449,7 +476,7 @@ __memp_free(infop, mfp, buf)
{
MVCC_BHUNALIGN(mfp, buf);
COMPQUIET(mfp, NULL);
- __db_shalloc_free(infop, buf);
+ __env_alloc_free(infop, buf);
}
/*
@@ -516,7 +543,9 @@ __memp_check_order(dbenv, hp)
DB_MPOOL_HASH *hp;
{
BH *bhp, *first_bhp, *tbhp;
- u_int32_t priority, last_priority;
+ u_int32_t dirty, priority, last_priority;
+
+ dirty = 0;
/*
* Assumes the hash bucket is locked.
@@ -526,6 +555,8 @@ __memp_check_order(dbenv, hp)
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc));
+ if (F_ISSET(bhp, BH_DIRTY))
+ dirty++;
priority = BH_PRIORITY(bhp);
DB_ASSERT(dbenv, (bhp == first_bhp) ?
priority == last_priority : priority >= last_priority);
@@ -547,5 +578,6 @@ __memp_check_order(dbenv, hp)
DB_ASSERT(dbenv, bhp->pgno != tbhp->pgno ||
bhp->mf_offset != tbhp->mf_offset);
}
+ DB_ASSERT(dbenv, dirty == hp->hash_page_dirty);
}
#endif
diff --git a/db/mp/mp_bh.c b/db/mp/mp_bh.c
index ef4d1d4be..85cc30cc7 100644
--- a/db/mp/mp_bh.c
+++ b/db/mp/mp_bh.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_bh.c,v 12.31 2006/09/07 19:11:46 bostic Exp $
+ * $Id: mp_bh.c,v 12.38 2007/05/17 15:15:45 bostic Exp $
*/
#include "db_config.h"
@@ -256,9 +255,13 @@ __memp_pgread(dbmfp, hp, bhp, can_create)
if (len < pagesize)
memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
#endif
+#ifdef HAVE_STATISTICS
++mfp->stat.st_page_create;
} else
++mfp->stat.st_page_in;
+#else
+ }
+#endif
/* Call any pgin function. */
ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
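
Note: the #ifdef placement in this hunk is easy to misread because it
splits an if/else across preprocessor branches.  Expanded both ways it
leaves valid code (schematic only; created_the_page stands in for the
function's real condition and the bodies are elided):

    /* With HAVE_STATISTICS defined: */
    if (created_the_page) {
            /* ... clear-byte handling ... */
            ++mfp->stat.st_page_create;
    } else
            ++mfp->stat.st_page_in;

    /* Without HAVE_STATISTICS, only the closing brace survives: */
    if (created_the_page) {
            /* ... clear-byte handling ... */
    }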
@@ -304,18 +307,16 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp)
mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
callpgin = ret = 0;
- /*
- * We should never be called with a clean or trash buffer.
- * The sync code does call us with already locked buffers.
- */
+ /* We should never be called with a clean or trash buffer. */
DB_ASSERT(dbenv, F_ISSET(bhp, BH_DIRTY));
DB_ASSERT(dbenv, !F_ISSET(bhp, BH_TRASH));
- /* If not already done, lock the buffer and unlock the hash bucket. */
- if (!F_ISSET(bhp, BH_LOCKED)) {
- F_SET(bhp, BH_LOCKED);
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
- }
+ /*
+ * The sync code has already locked the buffer, but the allocation
+ * code has not. Lock the buffer and release the hash bucket mutex.
+ */
+ F_SET(bhp, BH_LOCKED);
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
/*
* It's possible that the underlying file doesn't exist, either
@@ -333,7 +334,7 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp)
* If the page is in a file for which we have LSN information, we have
* to ensure the appropriate log records are on disk.
*/
- if (LOGGING_ON(dbenv) && mfp->lsn_off != -1 &&
+ if (LOGGING_ON(dbenv) && mfp->lsn_off != DB_LSN_OFF_NOTSET &&
!IS_CLIENT_PGRECOVER(dbenv)) {
memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
if (!IS_NOT_LOGGED_LSN(lsn) &&
@@ -402,7 +403,7 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp)
__memp_fn(dbmfp), (u_long)bhp->pgno);
goto err;
}
- ++mfp->stat.st_page_out;
+ STAT(++mfp->stat.st_page_out);
if (bhp->pgno > mfp->last_flushed_pgno) {
MUTEX_LOCK(dbenv, mfp->mutex);
if (bhp->pgno > mfp->last_flushed_pgno)
@@ -517,20 +518,20 @@ err: __db_errx(dbenv, "%s: %s failed for page %lu",
* Free a bucket header and its referenced data.
*
* PUBLIC: int __memp_bhfree
- * PUBLIC: __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, u_int32_t));
+ * PUBLIC: __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, u_int32_t));
*/
int
-__memp_bhfree(dbmp, hp, bhp, flags)
+__memp_bhfree(dbmp, infop, hp, bhp, flags)
DB_MPOOL *dbmp;
+ REGINFO *infop;
DB_MPOOL_HASH *hp;
BH *bhp;
u_int32_t flags;
{
DB_ENV *dbenv;
- MPOOL *c_mp, *mp;
+ MPOOL *c_mp;
MPOOLFILE *mfp;
BH *next_bhp, *prev_bhp;
- u_int32_t n_cache;
int reorder, ret, t_ret;
#ifdef DIAG_MVCC
size_t pagesize;
@@ -542,8 +543,6 @@ __memp_bhfree(dbmp, hp, bhp, flags)
* Assumes the hash bucket is locked and the MPOOL is not.
*/
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
- n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
#ifdef DIAG_MVCC
pagesize = mfp->stat.st_pagesize;
@@ -623,13 +622,13 @@ __memp_bhfree(dbmp, hp, bhp, flags)
* real.
*/
if (LF_ISSET(BH_FREE_FREEMEM)) {
- MPOOL_REGION_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MPOOL_REGION_LOCK(dbenv, infop);
- __memp_free(&dbmp->reginfo[n_cache], mfp, bhp);
- c_mp = dbmp->reginfo[n_cache].primary;
+ __memp_free(infop, mfp, bhp);
+ c_mp = infop->primary;
c_mp->stat.st_pages--;
- MPOOL_REGION_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MPOOL_REGION_UNLOCK(dbenv, infop);
}
/*
diff --git a/db/mp/mp_fget.c b/db/mp/mp_fget.c
index 5f7eb6802..bb73a0a08 100644
--- a/db/mp/mp_fget.c
+++ b/db/mp/mp_fget.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fget.c,v 12.33 2006/09/13 14:53:42 mjc Exp $
+ * $Id: mp_fget.c,v 12.43 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -108,36 +107,34 @@ __memp_fget(dbmfp, pgnoaddr, txn, flags, addrp)
enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
BH *alloc_bhp, *bhp, *current_bhp, *frozen_bhp, *oldest_bhp;
DB_ENV *dbenv;
+ DB_LSN *read_lsnp;
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
- MPOOL *c_mp, *mp;
+ MPOOL *c_mp;
MPOOLFILE *mfp;
- REGINFO *infop;
+ REGINFO *infop, *t_infop;
TXN_DETAIL *td;
- DB_LSN *read_lsnp;
roff_t mf_offset;
- u_int32_t n_cache, st_hsearch;
+ u_int32_t st_hsearch;
int b_incr, b_locked, dirty, edit, extending, first;
int makecopy, mvcc, need_free, reorder, ret;
*(void **)addrp = NULL;
+ COMPQUIET(c_mp, NULL);
+ COMPQUIET(infop, NULL);
COMPQUIET(oldest_bhp, NULL);
dbenv = dbmfp->dbenv;
dbmp = dbenv->mp_handle;
- c_mp = NULL;
- mp = dbmp->reginfo[0].primary;
mfp = dbmfp->mfp;
mvcc = mfp->multiversion;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
alloc_bhp = bhp = frozen_bhp = NULL;
read_lsnp = NULL;
+ td = NULL;
hp = NULL;
b_incr = b_locked = extending = makecopy = ret = 0;
- n_cache = 0;
- infop = NULL;
- td = NULL;
if (LF_ISSET(DB_MPOOL_DIRTY)) {
if (F_ISSET(dbmfp, MP_READONLY)) {
@@ -224,25 +221,22 @@ __memp_fget(dbmfp, pgnoaddr, txn, flags, addrp)
F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
*(void **)addrp = (u_int8_t *)dbmfp->addr +
(*pgnoaddr * mfp->stat.st_pagesize);
- ++mfp->stat.st_map;
+ STAT(++mfp->stat.st_map);
return (0);
}
-hb_search:
- /*
+retry: /*
* Determine the cache and hash bucket where this page lives and get
* local pointers to them. Reset on each pass through this code, the
* page number can change.
*/
- n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
- infop = &dbmp->reginfo[n_cache];
+ MP_GET_BUCKET(dbmfp, *pgnoaddr, &infop, hp, ret);
+ if (ret != 0)
+ return (ret);
c_mp = infop->primary;
- hp = R_ADDR(infop, c_mp->htab);
- hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
/* Search the hash chain for the page. */
-retry: st_hsearch = 0;
- MUTEX_LOCK(dbenv, hp->mtx_hash);
+ st_hsearch = 0;
b_locked = 1;
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
++st_hsearch;
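
Note: MP_GET_BUCKET replaces the open-coded NCACHE()/NBUCKET() sequence
visible in the removed lines.  Its definition is in a header outside this
diff; the shape below is a reconstruction from the code it replaces, not
the real macro.  Since callers no longer lock the hash mutex themselves,
the macro is assumed to return with hp->mtx_hash held on success:

    /* Assumed shape of MP_GET_BUCKET (reconstruction). */
    #define MP_GET_BUCKET(dbmfp, pgno, infopp, hp, ret) do {            \
            MPOOL *__c_mp;                                              \
            roff_t __mf_off;                                            \
            MP_GET_REGION(dbmfp, pgno, infopp, ret);   /* pick region */\
            if ((ret) == 0) {                                           \
                    __c_mp = (*(infopp))->primary;                      \
                    __mf_off = R_OFFSET(                                \
                        (dbmfp)->dbenv->mp_handle->reginfo,             \
                        (dbmfp)->mfp);                                  \
                    (hp) = R_ADDR(*(infopp), __c_mp->htab);             \
                    (hp) = &(hp)[NBUCKET(__c_mp, __mf_off, pgno)];      \
                    MUTEX_LOCK((dbmfp)->dbenv, (hp)->mtx_hash);         \
            }                                                           \
    } while (0)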
@@ -326,7 +320,7 @@ retry: st_hsearch = 0;
F_SET(hp, IO_WAITER);
MUTEX_LOCK(dbenv, hp->mtx_io);
}
- ++hp->hash_io_wait;
+ STAT(++hp->hash_io_wait);
/* Release the hash bucket lock. */
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
@@ -362,10 +356,13 @@ thawed: need_free = (--frozen_bhp->ref == 0);
goto retry;
}
+#ifdef HAVE_STATISTICS
++mfp->stat.st_cache_hit;
+#endif
break;
}
+#ifdef HAVE_STATISTICS
/*
* Update the hash bucket search statistics -- do now because our next
* search may be for a different bucket.
@@ -374,6 +371,7 @@ thawed: need_free = (--frozen_bhp->ref == 0);
if (st_hsearch > c_mp->stat.st_hash_longest)
c_mp->stat.st_hash_longest = st_hsearch;
c_mp->stat.st_hash_examined += st_hsearch;
+#endif
/*
* There are 4 possible paths to this location:
@@ -411,6 +409,10 @@ thawed: need_free = (--frozen_bhp->ref == 0);
*/
if (flags == DB_MPOOL_FREE) {
if (--bhp->ref == 0) {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ --hp->hash_page_dirty;
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
/*
* In a multiversion database, this page could
* be requested again so we have to leave it in
@@ -424,17 +426,12 @@ thawed: need_free = (--frozen_bhp->ref == 0);
if (mvcc && (!SH_CHAIN_SINGLETON(bhp, vc) ||
bhp->td_off == INVALID_ROFF ||
!IS_MAX_LSN(*VISIBLE_LSN(dbenv, bhp)))) {
- if (F_ISSET(bhp, BH_DIRTY)) {
- --hp->hash_page_dirty;
- F_CLR(bhp,
- BH_DIRTY | BH_DIRTY_CREATE);
- }
F_SET(bhp, BH_FREED);
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (0);
}
return (__memp_bhfree(
- dbmp, hp, bhp, BH_FREE_FREEMEM));
+ dbmp, infop, hp, bhp, BH_FREE_FREEMEM));
}
__db_errx(dbenv,
"File %s: freeing pinned buffer for page %lu",
@@ -447,12 +444,10 @@ thawed: need_free = (--frozen_bhp->ref == 0);
if (flags == DB_MPOOL_CREATE &&
F_ISSET(bhp, BH_FREED)) {
extending = makecopy = 1;
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
MUTEX_LOCK(dbenv, mfp->mutex);
if (*pgnoaddr > mfp->last_pgno)
mfp->last_pgno = *pgnoaddr;
MUTEX_UNLOCK(dbenv, mfp->mutex);
- MUTEX_LOCK(dbenv, hp->mtx_hash);
}
/*
@@ -478,8 +473,9 @@ thawed: need_free = (--frozen_bhp->ref == 0);
((ret = __txn_oldest_reader(dbenv,
&hp->old_reader)) == 0 &&
BH_OBSOLETE(oldest_bhp, hp->old_reader)))) {
- if ((ret = __memp_bhfree(dbmp, hp,
- oldest_bhp, BH_FREE_REUSE)) != 0)
+ if ((ret = __memp_bhfree(dbmp,
+ infop, hp, oldest_bhp,
+ BH_FREE_REUSE)) != 0)
goto err;
alloc_bhp = oldest_bhp;
} else if (ret != 0)
@@ -547,17 +543,17 @@ alloc: /*
/*
* !!!
- * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+ * In the DB_MPOOL_NEW code path, infop and c_mp have
* not yet been initialized.
*/
- mf_offset = R_OFFSET(dbmp->reginfo, mfp);
- n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
- infop = &dbmp->reginfo[n_cache];
+ MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret);
+ if (ret != 0)
+ goto err;
c_mp = infop->primary;
/* Allocate a new buffer header and data space. */
if ((ret =
- __memp_alloc(dbmp,infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
goto err;
#ifdef DIAGNOSTIC
if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
@@ -601,7 +597,10 @@ alloc: /*
*/
if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
*pgnoaddr = mfp->last_pgno + 1;
- if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+ MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
+ if (ret != 0)
+ goto err;
+ if (t_infop != infop) {
/*
* flags == DB_MPOOL_NEW, so extending is set
* and we're holding the mfp locked.
@@ -641,7 +640,7 @@ alloc: /*
b_locked = 1;
break;
}
- goto hb_search;
+ goto retry;
case SECOND_FOUND:
/*
* We allocated buffer space for the requested page, but then
@@ -764,10 +763,10 @@ alloc: /*
if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
F_SET(bhp, BH_CALLPGIN);
- ++mfp->stat.st_page_create;
+ STAT(++mfp->stat.st_page_create);
} else {
F_SET(bhp, BH_TRASH);
- ++mfp->stat.st_cache_miss;
+ STAT(++mfp->stat.st_cache_miss);
}
/* Increment buffer count referenced by MPOOLFILE. */
@@ -961,7 +960,8 @@ err: /*
if (frozen_bhp != NULL)
--frozen_bhp->ref;
if (b_incr && --bhp->ref == 0) {
- (void)__memp_bhfree(dbmp, hp, bhp, BH_FREE_FREEMEM);
+ (void)__memp_bhfree(dbmp,
+ infop, hp, bhp, BH_FREE_FREEMEM);
b_locked = 0;
}
}
diff --git a/db/mp/mp_fmethod.c b/db/mp/mp_fmethod.c
index 76d160ee5..38cd11d34 100644
--- a/db/mp/mp_fmethod.c
+++ b/db/mp/mp_fmethod.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fmethod.c,v 12.13 2006/08/24 14:46:14 bostic Exp $
+ * $Id: mp_fmethod.c,v 12.19 2007/06/01 16:30:30 bostic Exp $
*/
#include "db_config.h"
@@ -67,7 +66,7 @@ __memp_fcreate(dbenv, retp)
return (ret);
dbmfp->ref = 1;
- dbmfp->lsn_offset = -1;
+ dbmfp->lsn_offset = DB_LSN_OFF_NOTSET;
dbmfp->dbenv = dbenv;
dbmfp->mfp = INVALID_ROFF;
@@ -77,13 +76,13 @@ __memp_fcreate(dbenv, retp)
dbmfp->get_fileid = __memp_get_fileid;
dbmfp->get_flags = __memp_get_flags;
dbmfp->get_ftype = __memp_get_ftype;
+ dbmfp->get_last_pgno = __memp_get_last_pgno;
dbmfp->get_lsn_offset = __memp_get_lsn_offset;
dbmfp->get_maxsize = __memp_get_maxsize;
dbmfp->get_pgcookie = __memp_get_pgcookie;
dbmfp->get_priority = __memp_get_priority;
dbmfp->open = __memp_fopen_pp;
dbmfp->put = __memp_fput_pp;
- dbmfp->set = __memp_fset_pp;
dbmfp->set_clear_len = __memp_set_clear_len;
dbmfp->set_fileid = __memp_set_fileid;
dbmfp->set_flags = __memp_set_flags;
@@ -489,16 +488,17 @@ __memp_set_priority(dbmfp, priority)
}
/*
- * __memp_last_pgno --
+ * __memp_get_last_pgno --
* Return the page number of the last page in the file.
*
* !!!
- * Undocumented interface: DB private.
+ * The method is undocumented, but the handle is exported and users
+ * occasionally ask for it.
*
- * PUBLIC: int __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+ * PUBLIC: int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
*/
int
-__memp_last_pgno(dbmfp, pgnoaddr)
+__memp_get_last_pgno(dbmfp, pgnoaddr)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
{
@@ -540,8 +540,8 @@ __memp_fns(dbmp, mfp)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
{
- if (mfp->path_off == 0)
- return ((char *)"temporary");
+ if (mfp == NULL || mfp->path_off == 0)
+ return ((char *)"unknown");
return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off));
}
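
Note: with get_last_pgno now exported on the handle, a minimal caller
sketch (mpf is assumed to be an open DB_MPOOLFILE; error handling
abbreviated):

    db_pgno_t last_pgno;
    int ret;

    if ((ret = mpf->get_last_pgno(mpf, &last_pgno)) != 0)
            return (ret);
    /* last_pgno is now the page number of the last page in the file. */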
diff --git a/db/mp/mp_fopen.c b/db/mp/mp_fopen.c
index f13876e75..b41565304 100644
--- a/db/mp/mp_fopen.c
+++ b/db/mp/mp_fopen.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fopen.c,v 12.34 2006/09/09 13:55:52 bostic Exp $
+ * $Id: mp_fopen.c,v 12.44 2007/05/17 17:18:01 bostic Exp $
*/
#include "db_config.h"
@@ -15,8 +14,10 @@
#include "dbinc/db_page.h"
#include "dbinc/hash.h"
-static int __memp_mfp_alloc __P((DB_MPOOL *,
+static int __memp_mpf_alloc __P((DB_MPOOL *,
DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
+static int __memp_mpf_find __P((DB_ENV *,
+ DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
/*
* __memp_fopen_pp --
@@ -140,14 +141,51 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
bucket = 0;
hp = R_ADDR(dbmp->reginfo, mp->ftab);
- if (path == NULL && mfp == NULL)
- goto alloc;
+ if (mfp == NULL) {
+ if (path == NULL)
+ goto alloc;
- /*
- * Our caller may be able to tell us which underlying MPOOLFILE we
- * need a handle for.
- */
- if (mfp != NULL) {
+ /*
+ * Hash to the proper file table entry and walk it.
+ *
+ * The fileID is a filesystem unique number (e.g., a
+ * UNIX dev/inode pair) plus a timestamp. If files are
+ * removed and created in less than a second, the fileID
+ * can be repeated. The problem with repetition happens
+ * when the file that previously had the fileID value still
+ * has pages in the pool, since we don't want to use them
+ * to satisfy requests for the new file. Because the
+ * DB_TRUNCATE flag reuses the dev/inode pair, repeated
+ * opens with that flag set guarantee matching fileIDs
+ * when the machine can open a file and then re-open
+ * with truncate within a second. For this reason, we
+ * pass that flag down, and, if we find a matching entry,
+ * we ensure that it's never found again, and we create
+ * a new entry for the current request.
+ */
+
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ bucket = FNBUCKET(path, strlen(path));
+ else
+ bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
+ hp += bucket;
+
+ /*
+ * If we are passed a FILEID find the MPOOLFILE and inc
+ * its ref count. That way it cannot go away while we
+ * open it.
+ */
+ if (F_ISSET(dbmfp, MP_FILEID_SET)) {
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+ ret =
+ __memp_mpf_find(dbenv, dbmfp, hp, path, flags, &mfp);
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ if (ret != 0)
+ goto err;
+ if (mfp != NULL)
+ refinc = 1;
+ }
+ } else {
/*
* Deadfile can only be set if mpf_cnt goes to zero (or if we
* failed creating the file DB_AM_DISCARD). Increment the ref
@@ -213,7 +251,7 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
}
if ((ret = __db_appname(dbenv,
DB_APP_DATA, path, 0, NULL, &rpath)) == 0)
- ret = __os_open_extend(dbenv, rpath,
+ ret = __os_open(dbenv, rpath,
(u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
if (mfp != NULL)
MPOOL_SYSTEM_UNLOCK(dbenv);
@@ -289,83 +327,21 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
goto have_mfp;
/*
- * Hash to the proper file table entry and walk it.
- *
- * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
- * pair) plus a timestamp. If files are removed and created in less
- * than a second, the fileID can be repeated. The problem with
- * repetition happens when the file that previously had the fileID
- * value still has pages in the pool, since we don't want to use them
- * to satisfy requests for the new file.
- *
- * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
- * opens with that flag set guarantees matching fileIDs when the
- * machine can open a file and then re-open with truncate within a
- * second. For this reason, we pass that flag down, and, if we find
- * a matching entry, we ensure that it's never found again, and we
- * create a new entry for the current request.
- */
- if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
- DB_ASSERT(dbenv, path != NULL);
- bucket = FNBUCKET(path, strlen(path));
- } else
- bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
- hp += bucket;
-
- /*
* We can race with another process opening the same file when
* we allocate the mpoolfile structure. We will come back
* here and check the hash table again to see if it has appeared.
* For most files this is not a problem, since the name is locked
* at a higher layer but QUEUE extent files are not locked.
*/
-
check: MUTEX_LOCK(dbenv, hp->mtx_hash);
- SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
- /* Skip dead files and temporary files. */
- if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
- continue;
-
- /*
- * Any remaining DB_MPOOL_NOFILE databases are in-memory
- * named databases and need only match other in-memory
- * databases with the same name.
- */
- if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
- if (!mfp->no_backing_file)
- continue;
-
- DB_ASSERT(dbenv, path != NULL);
- if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
- continue;
-
- /*
- * We matched an in-memory file; grab the fileid if
- * it is set in the region, but not in the dbmfp.
- */
- if (!F_ISSET(dbmfp, MP_FILEID_SET))
- (void)__memp_set_fileid(dbmfp,
- R_ADDR(dbmp->reginfo, mfp->fileid_off));
- } else
- if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
- mfp->fileid_off), DB_FILE_ID_LEN) != 0)
- continue;
-
- /*
- * If the file is being truncated, remove it from the system
- * and create a new entry.
- *
- * !!!
- * We should be able to set mfp to NULL and break out of the
- * loop, but I like the idea of checking all the entries.
- */
- if (LF_ISSET(DB_TRUNCATE)) {
- MUTEX_LOCK(dbenv, mfp->mutex);
- mfp->deadfile = 1;
- MUTEX_UNLOCK(dbenv, mfp->mutex);
- continue;
- }
+ if ((ret = __memp_mpf_find(dbenv, dbmfp, hp, path, flags, &mfp)) != 0)
+ goto err;
+ if (alloc_mfp != NULL && mfp == NULL) {
+ mfp = alloc_mfp;
+ alloc_mfp = NULL;
+ SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
+ } else if (mfp != NULL) {
/*
* Some things about a file cannot be changed: the clear length,
* page size, or LSN location. However, if this is an attempt
@@ -385,7 +361,7 @@ check: MUTEX_LOCK(dbenv, hp->mtx_hash);
mfp->clear_len != DB_CLEARLEN_NOTSET &&
dbmfp->clear_len != mfp->clear_len) ||
(pagesize != 0 && pagesize != mfp->stat.st_pagesize) ||
- (dbmfp->lsn_offset != -1 &&
+ (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
mfp->lsn_off != DB_LSN_OFF_NOTSET &&
dbmfp->lsn_offset != mfp->lsn_off)) {
__db_errx(dbenv,
@@ -395,42 +371,6 @@ check: MUTEX_LOCK(dbenv, hp->mtx_hash);
ret = EINVAL;
goto err;
}
-
- /*
- * Check to see if this file has died while we waited.
- *
- * We normally don't lock the deadfile field when we read it as
- * we only care if the field is zero or non-zero. We do lock
- * on read when searching for a matching MPOOLFILE so that two
- * threads of control don't race between setting the deadfile
- * bit and incrementing the reference count, that is, a thread
- * of control decrementing the reference count and then setting
- * deadfile because the reference count is 0 blocks us finding
- * the file without knowing it's about to be marked dead.
- */
- MUTEX_LOCK(dbenv, mfp->mutex);
- if (mfp->deadfile) {
- MUTEX_UNLOCK(dbenv, mfp->mutex);
- continue;
- }
- ++mfp->mpf_cnt;
- refinc = 1;
- MUTEX_UNLOCK(dbenv, mfp->mutex);
-
- /* Initialize any fields that are not yet set. */
- if (dbmfp->ftype != 0)
- mfp->ftype = dbmfp->ftype;
- if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
- mfp->clear_len = dbmfp->clear_len;
- if (dbmfp->lsn_offset != -1)
- mfp->lsn_off = dbmfp->lsn_offset;
-
- break;
- }
- if (alloc_mfp != NULL && mfp == NULL) {
- mfp = alloc_mfp;
- alloc_mfp = NULL;
- SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
}
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
@@ -462,7 +402,7 @@ alloc: /*
__os_fileid(dbenv, rpath, 0, dbmfp->fileid)) != 0)
goto err;
- if ((ret = __memp_mfp_alloc(dbmp,
+ if ((ret = __memp_mpf_alloc(dbmp,
dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
goto err;
@@ -625,8 +565,105 @@ err: if (refinc) {
return (ret);
}
+/*
+ * __memp_mpf_find --
+ * Search a hash bucket for a MPOOLFILE.
+ */
+static int
+__memp_mpf_find(dbenv, dbmfp, hp, path, flags, mfpp)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ const char *path;
+ u_int32_t flags;
+ MPOOLFILE **mfpp;
+{
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+
+ dbmp = dbenv->mp_handle;
+
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Skip dead files and temporary files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * Any remaining DB_MPOOL_NOFILE databases are in-memory
+ * named databases and need only match other in-memory
+ * databases with the same name.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ if (!mfp->no_backing_file)
+ continue;
+
+ if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
+ continue;
+
+ /*
+ * We matched an in-memory file; grab the fileid if
+ * it is set in the region, but not in the dbmfp.
+ */
+ if (!F_ISSET(dbmfp, MP_FILEID_SET))
+ (void)__memp_set_fileid(dbmfp,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ } else
+ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
+ mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /*
+ * If the file is being truncated, remove it from the system
+ * and create a new entry.
+ *
+ * !!!
+ * We should be able to set mfp to NULL and break out of the
+ * loop, but I like the idea of checking all the entries.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ MUTEX_LOCK(dbenv, mfp->mutex);
+ mfp->deadfile = 1;
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+ continue;
+ }
+
+ /*
+ * Check to see if this file has died while we waited.
+ *
+ * We normally don't lock the deadfile field when we read it as
+ * we only care if the field is zero or non-zero. We do lock
+ * on read when searching for a matching MPOOLFILE so that two
+ * threads of control don't race between setting the deadfile
+ * bit and incrementing the reference count, that is, a thread
+ * of control decrementing the reference count and then setting
+ * deadfile because the reference count is 0 blocks us finding
+ * the file without knowing it's about to be marked dead.
+ */
+ MUTEX_LOCK(dbenv, mfp->mutex);
+ if (mfp->deadfile) {
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+ continue;
+ }
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+
+ /* Initialize any fields that are not yet set. */
+ if (dbmfp->ftype != 0)
+ mfp->ftype = dbmfp->ftype;
+ if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
+ mfp->clear_len = dbmfp->clear_len;
+ if (dbmfp->lsn_offset != -1)
+ mfp->lsn_off = dbmfp->lsn_offset;
+
+ break;
+ }
+
+ *mfpp = mfp;
+ return (0);
+}
+
static int
-__memp_mfp_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
+__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
const char *path;
@@ -742,14 +779,12 @@ __memp_fclose_pp(dbmfp, flags)
/*
* Validate arguments, but as a handle destructor, we can't fail.
- *
- * !!!
- * DB_MPOOL_DISCARD: Undocumented flag: DB private.
*/
- (void)__db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD);
+ if (flags != 0)
+ (void)__db_ferr(dbenv, "DB_MPOOLFILE->close", 0);
ENV_ENTER(dbenv, ip);
- REPLICATION_WRAP(dbenv, (__memp_fclose(dbmfp, flags)), ret);
+ REPLICATION_WRAP(dbenv, (__memp_fclose(dbmfp, 0)), ret);
ENV_LEAVE(dbenv, ip);
return (ret);
}
@@ -906,7 +941,9 @@ __memp_mf_discard(dbmp, mfp)
{
DB_ENV *dbenv;
DB_MPOOL_HASH *hp;
+#ifdef HAVE_STATISTICS
DB_MPOOL_STAT *sp;
+#endif
MPOOL *mp;
int need_sync, ret, t_ret;
@@ -948,9 +985,10 @@ __memp_mf_discard(dbmp, mfp)
/* Lock the region and collect stats and free the space. */
MPOOL_SYSTEM_LOCK(dbenv);
if (need_sync &&
- (t_ret = __memp_mf_sync(dbmp, mfp, 1)) != 0 && ret == 0)
+ (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
ret = t_ret;
+#ifdef HAVE_STATISTICS
/* Copy the statistics into the region. */
sp = &mp->stat;
sp->st_cache_hit += mfp->stat.st_cache_hit;
@@ -959,6 +997,7 @@ __memp_mf_discard(dbmp, mfp)
sp->st_page_create += mfp->stat.st_page_create;
sp->st_page_in += mfp->stat.st_page_in;
sp->st_page_out += mfp->stat.st_page_out;
+#endif
/* Free the space. */
if (mfp->path_off != 0)
diff --git a/db/mp/mp_fput.c b/db/mp/mp_fput.c
index 124d2e1da..53afe8a82 100644
--- a/db/mp/mp_fput.c
+++ b/db/mp/mp_fput.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fput.c,v 12.22 2006/09/07 20:05:33 bostic Exp $
+ * $Id: mp_fput.c,v 12.36 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -19,12 +18,14 @@ static int __memp_reset_lru __P((DB_ENV *, REGINFO *));
* __memp_fput_pp --
* DB_MPOOLFILE->put pre/post processing.
*
- * PUBLIC: int __memp_fput_pp __P((DB_MPOOLFILE *, void *, u_int32_t));
+ * PUBLIC: int __memp_fput_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
*/
int
-__memp_fput_pp(dbmfp, pgaddr, flags)
+__memp_fput_pp(dbmfp, pgaddr, priority, flags)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
+ DB_CACHE_PRIORITY priority;
u_int32_t flags;
{
DB_ENV *dbenv;
@@ -33,10 +34,14 @@ __memp_fput_pp(dbmfp, pgaddr, flags)
dbenv = dbmfp->dbenv;
PANIC_CHECK(dbenv);
+ if (flags != 0)
+ return (__db_ferr(dbenv, "DB_MPOOLFILE->put", 0));
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");
ENV_ENTER(dbenv, ip);
- ret = __memp_fput(dbmfp, pgaddr, flags);
+ ret = __memp_fput(dbmfp, pgaddr, priority);
if (IS_ENV_REPLICATED(dbenv) &&
(t_ret = __op_rep_exit(dbenv)) != 0 && ret == 0)
ret = t_ret;
@@ -49,47 +54,30 @@ __memp_fput_pp(dbmfp, pgaddr, flags)
* __memp_fput --
* DB_MPOOLFILE->put.
*
- * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t));
+ * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY));
*/
int
-__memp_fput(dbmfp, pgaddr, flags)
+__memp_fput(dbmfp, pgaddr, priority)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
- u_int32_t flags;
+ DB_CACHE_PRIORITY priority;
{
+ BH *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
MPOOLFILE *mfp;
- BH *bhp;
- u_int32_t n_cache;
- int adjust, ret, t_ret;
+ REGINFO *infop;
+ int adjust, pfactor, ret, t_ret;
dbenv = dbmfp->dbenv;
- MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");
dbmp = dbenv->mp_handle;
mfp = dbmfp->mfp;
bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
ret = 0;
/*
- * Check arguments, but don't fail because we want to unpin the page
- * regardless. The problem is when running with replication. There
- * is a reference count we incremented when __memp_fget was called,
- * and we need to unpin the page and decrement that reference count.
- * If we see flag problems, mark the page dirty.
- */
- if (flags) {
- if (__db_fchk(dbenv, "memp_fput", flags,
- DB_MPOOL_DISCARD) != 0) {
- flags = 0;
- ret = EINVAL;
- DB_ASSERT(dbenv, 0);
- }
- }
-
- /*
* If we're mapping the file, there's nothing to do. Because we can
* stop mapping the file at any time, we have to check on each buffer
* to see if the address we gave the application was part of the map
@@ -116,15 +104,10 @@ __memp_fput(dbmfp, pgaddr, flags)
#endif
/* Convert a page address to a buffer header and hash bucket. */
- n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
- c_mp = dbmp->reginfo[n_cache].primary;
- hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
- hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
-
- MUTEX_LOCK(dbenv, hp->mtx_hash);
-
- if (LF_ISSET(DB_MPOOL_DISCARD))
- F_SET(bhp, BH_DISCARD);
+ MP_GET_BUCKET(dbmfp, bhp->pgno, &infop, hp, ret);
+ if (ret != 0)
+ return (ret);
+ c_mp = infop->primary;
/*
* Check for a reference count going to zero. This can happen if the
@@ -163,7 +146,8 @@ __memp_fput(dbmfp, pgaddr, flags)
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
/* Update priority values. */
- if (F_ISSET(bhp, BH_DISCARD) || mfp->priority == MPOOL_PRI_VERY_LOW)
+ if (priority == DB_PRIORITY_VERY_LOW ||
+ mfp->priority == MPOOL_PRI_VERY_LOW)
bhp->priority = 0;
else {
/*
@@ -173,9 +157,31 @@ __memp_fput(dbmfp, pgaddr, flags)
*/
bhp->priority = c_mp->lru_count;
+ switch (priority) {
+ default:
+ case DB_PRIORITY_UNCHANGED:
+ pfactor = mfp->priority;
+ break;
+ case DB_PRIORITY_VERY_LOW:
+ pfactor = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ pfactor = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ pfactor = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ pfactor = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ pfactor = MPOOL_PRI_VERY_HIGH;
+ break;
+ }
+
adjust = 0;
- if (mfp->priority != 0)
- adjust = (int)c_mp->stat.st_pages / mfp->priority;
+ if (pfactor != 0)
+ adjust = (int)c_mp->stat.st_pages / pfactor;
if (F_ISSET(bhp, BH_DIRTY))
adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY;
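
Note: a worked example of the new priority computation, with made-up
numbers: st_pages = 1000, lru_count = 50000, the page returned with
DB_PRIORITY_LOW, and the buffer clean.  Assuming MPOOL_PRI_LOW is a
negative divisor (low-priority files age faster in this scheme):

    /* adjust = (int)st_pages / pfactor = 1000 / -2 = -500        */
    /* bhp->priority = lru_count + adjust = 50000 - 500 = 49500   */

i.e. the buffer is treated as if it were last used 500 LRU ticks ago and
becomes an earlier eviction candidate.  DB_PRIORITY_UNCHANGED falls back
to the per-file priority, preserving the old behavior for callers that
don't care.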
@@ -234,10 +240,9 @@ __memp_reset_lru(dbenv, infop)
BH *bhp, *tbhp;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
- u_int32_t bucket;
+ u_int32_t bucket, priority;
c_mp = infop->primary;
-
/*
* Update the counter so all future allocations will start at the
* bottom.
@@ -253,19 +258,42 @@ __memp_reset_lru(dbenv, infop)
* We can check for empty buckets before locking as we
* only care if the pointer is zero or non-zero.
*/
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) {
+ c_mp->lru_reset++;
continue;
+ }
MUTEX_LOCK(dbenv, hp->mtx_hash);
- SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
+ c_mp->lru_reset++;
+ /*
+ * We need to take a little care that the bucket does
+ * not become unsorted. This is highly unlikely but
+ * possible.
+ */
+ priority = 0;
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
for (tbhp = bhp; tbhp != NULL;
tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
if (tbhp->priority != UINT32_MAX &&
- tbhp->priority > MPOOL_BASE_DECREMENT)
+ tbhp->priority > MPOOL_BASE_DECREMENT) {
tbhp->priority -= MPOOL_BASE_DECREMENT;
+ if (tbhp->priority < priority)
+ tbhp->priority = priority;
+ }
}
+ priority = bhp->priority;
+ }
+ /*
+ * Reset the hash bucket's priority. The chain is never empty
+ * in this case, so tbhp will never be NULL.
+ */
+ if ((tbhp =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ hp->hash_priority = tbhp->priority;
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
}
+ c_mp->lru_reset = 0;
+ COMPQUIET(dbenv, NULL);
return (0);
}
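
Note: the put() signature change is caller-visible: the DB_MPOOL_DISCARD
flag is gone and a DB_CACHE_PRIORITY argument takes its place (passing
DB_PRIORITY_VERY_LOW now has the effect DB_MPOOL_DISCARD used to, per the
priority code above).  A sketch of an updated caller, assuming pgaddr came
from a successful get():

    /* Old style:  mpf->put(mpf, pgaddr, DB_MPOOL_DISCARD);  */
    /* New style:  pass a priority; flags must now be 0.     */
    if ((ret = mpf->put(mpf, pgaddr, DB_PRIORITY_VERY_LOW, 0)) != 0)
            goto err;

    /* Most callers keep the priority unchanged: */
    if ((ret = mpf->put(mpf, pgaddr, DB_PRIORITY_UNCHANGED, 0)) != 0)
            goto err;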
diff --git a/db/mp/mp_fset.c b/db/mp/mp_fset.c
index e3fd2f4df..46950f4e1 100644
--- a/db/mp/mp_fset.c
+++ b/db/mp/mp_fset.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fset.c,v 12.16 2006/09/13 14:53:42 mjc Exp $
+ * $Id: mp_fset.c,v 12.23 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -15,108 +14,33 @@
#include "dbinc/txn.h"
/*
- * __memp_fset_pp --
- * DB_MPOOLFILE->set pre/post processing.
- *
- * PUBLIC: int __memp_fset_pp __P((DB_MPOOLFILE *, void *, u_int32_t));
- */
-int
-__memp_fset_pp(dbmfp, pgaddr, flags)
- DB_MPOOLFILE *dbmfp;
- void *pgaddr;
- u_int32_t flags;
-{
- DB_ENV *dbenv;
- DB_THREAD_INFO *ip;
- int ret;
-
- dbenv = dbmfp->dbenv;
-
- PANIC_CHECK(dbenv);
- MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->set");
-
- /* Validate arguments. */
- if (flags == 0)
- return (__db_ferr(dbenv, "memp_fset", 1));
-
- if ((ret = __db_fchk(dbenv, "memp_fset", flags, DB_MPOOL_DISCARD)) != 0)
- return (ret);
-
- ENV_ENTER(dbenv, ip);
- REPLICATION_WRAP(dbenv, (__memp_fset(dbmfp, pgaddr, flags)), ret);
- ENV_LEAVE(dbenv, ip);
- return (ret);
-}
-
-/*
- * __memp_fset --
- * DB_MPOOLFILE->set.
- *
- * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t));
- */
-int
-__memp_fset(dbmfp, pgaddr, flags)
- DB_MPOOLFILE *dbmfp;
- void *pgaddr;
- u_int32_t flags;
-{
- BH *bhp;
- DB_ENV *dbenv;
- DB_MPOOL *dbmp;
- DB_MPOOL_HASH *hp;
- MPOOL *c_mp;
- u_int32_t n_cache;
-
- dbenv = dbmfp->dbenv;
- dbmp = dbenv->mp_handle;
-
- DB_ASSERT(dbenv, !LF_ISSET(DB_MPOOL_DIRTY));
-
- /* Convert the page address to a buffer header and hash bucket. */
- bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
- n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
- c_mp = dbmp->reginfo[n_cache].primary;
- hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
- hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
-
- MUTEX_LOCK(dbenv, hp->mtx_hash);
-
- if (LF_ISSET(DB_MPOOL_DISCARD))
- F_SET(bhp, BH_DISCARD);
-
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
- return (0);
-}
-
-/*
* __memp_dirty --
* Upgrade a page from a read-only to a writeable pointer.
*
- * PUBLIC: int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_TXN *, u_int32_t));
+ * PUBLIC: int __memp_dirty __P((
+ * PUBLIC: DB_MPOOLFILE *, void *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
*/
int
-__memp_dirty(dbmfp, addrp, txn, flags)
+__memp_dirty(dbmfp, addrp, txn, priority, flags)
DB_MPOOLFILE *dbmfp;
void *addrp;
DB_TXN *txn;
+ DB_CACHE_PRIORITY priority;
u_int32_t flags;
{
BH *bhp;
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
DB_TXN *ancestor;
#ifdef DIAG_MVCC
MPOOLFILE *mfp;
#endif
- MPOOL *c_mp;
- u_int32_t n_cache;
+ REGINFO *infop;
int ret;
db_pgno_t pgno;
void *pgaddr;
dbenv = dbmfp->dbenv;
- dbmp = dbenv->mp_handle;
pgaddr = *(void **)addrp;
/* Convert the page address to a buffer header. */
@@ -154,11 +78,11 @@ __memp_dirty(dbmfp, addrp, txn, flags)
(flags == DB_MPOOL_EDIT && *(void **)addrp == pgaddr) ||
(flags != DB_MPOOL_EDIT && *(void **)addrp != pgaddr));
- if ((ret = __memp_fput(dbmfp, pgaddr, 0)) != 0) {
+ if ((ret = __memp_fput(dbmfp, pgaddr, priority)) != 0) {
__db_errx(dbenv,
"%s: error releasing a read-only page",
__memp_fn(dbmfp));
- (void)__memp_fput(dbmfp, *(void **)addrp, 0);
+ (void)__memp_fput(dbmfp, *(void **)addrp, priority);
*(void **)addrp = NULL;
return (ret);
}
@@ -168,13 +92,10 @@ __memp_dirty(dbmfp, addrp, txn, flags)
return (0);
}
- n_cache = NCACHE(dbmp->reginfo[0].primary,
- bhp->mf_offset, bhp->pgno);
- c_mp = dbmp->reginfo[n_cache].primary;
- hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
- hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
+ MP_GET_BUCKET(dbmfp, pgno, &infop, hp, ret);
+ if (ret != 0)
+ return (ret);
- MUTEX_LOCK(dbenv, hp->mtx_hash);
/* Set/clear the page bits. */
if (!F_ISSET(bhp, BH_DIRTY)) {
++hp->hash_page_dirty;
@@ -183,7 +104,7 @@ __memp_dirty(dbmfp, addrp, txn, flags)
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
#ifdef DIAG_MVCC
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ mfp = R_ADDR(dbenv->mp_handle->reginfo, bhp->mf_offset);
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | PROT_WRITE);
#endif
return (0);
diff --git a/db/mp/mp_method.c b/db/mp/mp_method.c
index 14c144974..e9096827c 100644
--- a/db/mp/mp_method.c
+++ b/db/mp/mp_method.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_method.c,v 12.36 2006/09/15 18:54:13 margo Exp $
+ * $Id: mp_method.c,v 12.50 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -15,13 +14,13 @@
#include "dbinc/hash.h"
/*
- * __memp_dbenv_create --
+ * __memp_env_create --
* Mpool specific creation of the DB_ENV structure.
*
- * PUBLIC: int __memp_dbenv_create __P((DB_ENV *));
+ * PUBLIC: int __memp_env_create __P((DB_ENV *));
*/
int
-__memp_dbenv_create(dbenv)
+__memp_env_create(dbenv)
DB_ENV *dbenv;
{
/*
@@ -37,7 +36,7 @@ __memp_dbenv_create(dbenv)
* Solaris needs 24 and 52 bytes for the same structures. The minimum
* number of hash buckets is 37. These contain a mutex also.
*/
- dbenv->mp_bytes =
+ dbenv->mp_bytes = dbenv->mp_max_bytes =
32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
dbenv->mp_ncache = 1;
@@ -45,13 +44,13 @@ __memp_dbenv_create(dbenv)
}
/*
- * __memp_dbenv_destroy --
+ * __memp_env_destroy --
* Mpool specific destruction of the DB_ENV structure.
*
- * PUBLIC: void __memp_dbenv_destroy __P((DB_ENV *));
+ * PUBLIC: void __memp_env_destroy __P((DB_ENV *));
*/
void
-__memp_dbenv_destroy(dbenv)
+__memp_env_destroy(dbenv)
DB_ENV *dbenv;
{
COMPQUIET(dbenv, NULL);
@@ -109,8 +108,6 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
{
u_int ncache;
- ENV_ILLEGAL_AFTER_OPEN(dbenv, "DB_ENV->set_cachesize");
-
/* Normalize the cache count. */
ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache;
@@ -133,18 +130,18 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
* wrapping in the calculation of the number of hash buckets. See
* __memp_open for details.
*/
- if (sizeof(roff_t) <= 4) {
- if (gbytes / ncache >= 4) {
+ if (!F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+ if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) {
__db_errx(dbenv,
"individual cache size too large: maximum is 4GB");
return (EINVAL);
}
- } else
if (gbytes / ncache > 10000) {
__db_errx(dbenv,
"individual cache size too large: maximum is 10TB");
return (EINVAL);
}
+ }
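
Note: the reworked guard only applies before the environment is opened;
afterwards the request is routed to __memp_resize() (next hunk).  A worked
check of the 32-bit limit, with hypothetical numbers: if sizeof(roff_t)
== 4, gbytes = 8 and ncache = 1, then 8 / 1 >= 4 and the call fails with
EINVAL, because a single cache region of 4GB or more cannot be addressed
through a 32-bit region offset; with ncache = 4 each region is 2GB and
the same request is accepted.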
/*
* If the application requested less than 500Mb, increase the cachesize
@@ -164,6 +161,9 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
bytes = ncache * DB_CACHESIZE_MIN;
}
+ if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED))
+ return (__memp_resize(dbenv->mp_handle, gbytes, bytes));
+
dbenv->mp_gbytes = gbytes;
dbenv->mp_bytes = bytes;
dbenv->mp_ncache = ncache;
@@ -172,6 +172,76 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
}
/*
+ * __memp_set_config --
+ * Set the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__memp_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+
+ ENV_NOT_CONFIGURED(dbenv,
+ dbenv->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(dbenv)) {
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ if (on)
+ FLD_SET(mp->config_flags, which);
+ else
+ FLD_CLR(mp->config_flags, which);
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __memp_get_config --
+ * Return the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__memp_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(dbenv)) {
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0;
+ } else
+ *onp = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
* PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
*/
int
@@ -224,12 +294,13 @@ __memp_set_mp_max_openfd(dbenv, maxopenfd)
}
/*
- * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, int *));
+ * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
*/
int
__memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
DB_ENV *dbenv;
- int *maxwritep, *maxwrite_sleepp;
+ int *maxwritep;
+ db_timeout_t *maxwrite_sleepp;
{
DB_MPOOL *dbmp;
MPOOL *mp;
@@ -255,12 +326,13 @@ __memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
* __memp_set_mp_max_write --
* Set the maximum continuous I/O count.
*
- * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, int));
+ * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
*/
int
__memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
DB_ENV *dbenv;
- int maxwrite, maxwrite_sleep;
+ int maxwrite;
+ db_timeout_t maxwrite_sleep;
{
DB_MPOOL *dbmp;
MPOOL *mp;
@@ -366,9 +438,13 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
#define op_is_remove (newname == NULL)
COMPQUIET(bucket, 0);
+ COMPQUIET(hp, NULL);
+ COMPQUIET(newname_off, 0);
+ COMPQUIET(nlen, 0);
dbmp = NULL;
mfp = NULL;
+ nhp = NULL;
p = NULL;
locked = ret = 0;
@@ -378,63 +454,61 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
hp = R_ADDR(dbmp->reginfo, mp->ftab);
- nhp = NULL;
- /*
- * Remove or rename a file that the mpool might know about. We assume
- * that the fop layer has the file locked for exclusive access, so we
- * don't worry about locking except for the mpool mutexes. Checkpoint
- * can happen at any time, independent of file locking, so we have to
- * do the actual unlink or rename system call to avoid any race.
- *
- * If this is a rename, allocate first, because we can't recursively
- * grab the region lock. If this is a memory file
- * then on a rename, we need to make sure that the new name does
- * not exist.
- */
- hp = R_ADDR(dbmp->reginfo, mp->ftab);
- if (op_is_remove) {
- COMPQUIET(newname_off, INVALID_ROFF);
- } else {
+ if (!op_is_remove) {
nlen = strlen(newname);
if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
NULL, nlen + 1, &newname_off, &p)) != 0)
return (ret);
memcpy(p, newname, nlen + 1);
- MPOOL_SYSTEM_LOCK(dbenv);
- locked = 1;
- if (inmem) {
- bucket = FNBUCKET(newname, nlen);
- nhp = hp + bucket;
- MUTEX_LOCK(dbenv, nhp->mtx_hash);
- SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
- if (!mfp->deadfile &&
- mfp->no_backing_file && strcmp(newname,
- R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
- break;
- MUTEX_UNLOCK(dbenv, nhp->mtx_hash);
- if (mfp != NULL) {
- ret = EEXIST;
- goto err;
- }
- }
}
- if (locked == 0)
- MPOOL_SYSTEM_LOCK(dbenv);
- locked = 1;
-
+ /*
+ * Remove or rename a file that the mpool might know about. We assume
+ * that the fop layer has the file locked for exclusive access, so we
+ * don't worry about locking except for the mpool mutexes. Checkpoint
+ * can happen at any time, independent of file locking, so we have to
+ * do the actual unlink or rename system call while holding
+ * all affected buckets locked.
+ *
+ * If this is a rename and this is a memory file then we need
+ * to make sure that the new name does not exist. Since we
+ * are locking two buckets lock them in ascending order.
+ */
if (inmem) {
DB_ASSERT(dbenv, fullold != NULL);
hp += FNBUCKET(fullold, strlen(fullold));
+ if (!op_is_remove) {
+ bucket = FNBUCKET(newname, nlen);
+ nhp = R_ADDR(dbmp->reginfo, mp->ftab);
+ nhp += bucket;
+ }
} else
hp += FNBUCKET(fileid, DB_FILE_ID_LEN);
+ if (nhp != NULL && nhp < hp)
+ MUTEX_LOCK(dbenv, nhp->mtx_hash);
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+ if (nhp != NULL && nhp > hp)
+ MUTEX_LOCK(dbenv, nhp->mtx_hash);
+ locked = 1;
+
+ if (!op_is_remove && inmem) {
+ SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
+ if (!mfp->deadfile &&
+ mfp->no_backing_file && strcmp(newname,
+ R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
+ break;
+ if (mfp != NULL) {
+ ret = EEXIST;
+ goto err;
+ }
+ }
+
/*
* Find the file -- if mpool doesn't know about this file, that may
- * not be an error -- if the file is not a memory-only file and it
+ * not be an error.
*/
- MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
/* Ignore non-active files. */
if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
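
Note: the ascending-address lock acquisition above is standard deadlock
avoidance: two threads renaming between the same pair of buckets in
opposite directions would otherwise each hold one bucket mutex while
waiting for the other.  Isolated as a pattern (unlock order need not
match):

    /* Acquire two hash-bucket mutexes in address order. */
    if (nhp != NULL && nhp < hp)
            MUTEX_LOCK(dbenv, nhp->mtx_hash);
    MUTEX_LOCK(dbenv, hp->mtx_hash);
    if (nhp != NULL && nhp > hp)
            MUTEX_LOCK(dbenv, nhp->mtx_hash);

    /* ... rename/remove work ... */

    MUTEX_UNLOCK(dbenv, hp->mtx_hash);
    if (nhp != NULL && nhp != hp)
            MUTEX_UNLOCK(dbenv, nhp->mtx_hash);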
@@ -447,17 +521,21 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
break;
}
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
- if (mfp == NULL)
+
+ if (mfp == NULL) {
+ if (inmem) {
+ ret = ENOENT;
+ goto err;
+ }
goto fsop;
+ }
if (op_is_remove) {
MUTEX_LOCK(dbenv, mfp->mutex);
/*
- * In-memory dbs have an artificially incremented
- * ref count so that they do not ever get reclaimed
- * as long as they exist. Since we are now deleting
- * the database, we need to dec that count.
+ * In-memory dbs have an artificially incremented ref count so
+ * they do not get reclaimed as long as they exist. Since we
+ * are now deleting the database, we need to dec that count.
*/
if (mfp->no_backing_file)
mfp->mpf_cnt--;
@@ -465,31 +543,22 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
MUTEX_UNLOCK(dbenv, mfp->mutex);
} else {
/*
- * Else, it's a rename. We've allocated memory
- * for the new name. Swap it with the old one.
+ * Else, it's a rename. We've allocated memory for the new
+ * name. Swap it with the old one. If it's in memory we
+ * need to move it to the right bucket.
*/
p = R_ADDR(dbmp->reginfo, mfp->path_off);
mfp->path_off = newname_off;
- /* If its in memory we need to move it the right bucket. */
- if (inmem) {
+ if (inmem && hp != nhp) {
DB_ASSERT(dbenv, nhp != NULL);
- MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
mfp->bucket = bucket;
- MUTEX_LOCK(dbenv, nhp->mtx_hash);
SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q);
- MUTEX_UNLOCK(dbenv, nhp->mtx_hash);
}
}
-fsop: if (mfp == NULL && inmem) {
- ret = ENOENT;
- goto err;
- }
-
- /*
+fsop: /*
* If this is a real file, then mfp could be NULL, because
* mpool isn't turned on, and we still need to do the file ops.
*/
@@ -504,12 +573,14 @@ fsop: if (mfp == NULL && inmem) {
ret = 0;
} else {
/*
- * Defensive only, fullname should never be
+ * Defensive only, fullnew should never be
* NULL.
*/
DB_ASSERT(dbenv, fullnew != NULL);
- if (fullnew == NULL)
- return (EINVAL);
+ if (fullnew == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
ret = __os_rename(dbenv, fullold, fullnew, 1);
}
}
@@ -518,8 +589,12 @@ fsop: if (mfp == NULL && inmem) {
err: if (p != NULL)
__memp_free(&dbmp->reginfo[0], NULL, p);
- if (locked == 1)
- MPOOL_SYSTEM_UNLOCK(dbenv);
+ /* If we have buckets locked, unlock them when done moving files. */
+ if (locked == 1) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ if (nhp != NULL && nhp != hp)
+ MUTEX_UNLOCK(dbenv, nhp->mtx_hash);
+ }
return (ret);
}
diff --git a/db/mp/mp_mvcc.c b/db/mp/mp_mvcc.c
index 4a763e1de..e797df904 100644
--- a/db/mp/mp_mvcc.c
+++ b/db/mp/mp_mvcc.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 2006,2007 Oracle. All rights reserved.
*
- * $Id: mp_mvcc.c,v 12.24 2006/09/18 13:11:50 mjc Exp $
+ * $Id: mp_mvcc.c,v 12.34 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -92,9 +91,12 @@ __memp_bucket_reorder(dbenv, hp, bhp)
next, bhp, hq, __bh);
}
-done: /* Reset the hash bucket's priority. */
- hp->hash_priority =
- BH_PRIORITY(SH_TAILQ_FIRST(&hp->hash_bucket, __bh));
+done: /*
+ * Reset the hash bucket's priority -- the chain is never empty in
+ * this case, so bhp will never be NULL.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ hp->hash_priority = BH_PRIORITY(bhp);
}
/*
@@ -103,7 +105,8 @@ done: /* Reset the hash bucket's priority. */
*
* PUBLIC: int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *mfp, BH *, void *));
*/
-int __memp_bh_settxn(dbmp, mfp, bhp, vtd)
+int
+__memp_bh_settxn(dbmp, mfp, bhp, vtd)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
BH *bhp;
@@ -149,16 +152,13 @@ __memp_skip_curadj(dbc, pgno)
DB_MPOOL_HASH *hp;
DB_MPOOLFILE *dbmfp;
DB_TXN *txn;
- MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
REGINFO *infop;
roff_t mf_offset;
- u_int32_t n_cache;
- int skip;
+ int ret, skip;
dbenv = dbc->dbp->dbenv;
dbmp = dbenv->mp_handle;
- mp = dbmp->reginfo[0].primary;
dbmfp = dbc->dbp->mpf;
mfp = dbmfp->mfp;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
@@ -172,13 +172,13 @@ __memp_skip_curadj(dbc, pgno)
* local pointers to them. Reset on each pass through this code, the
* page number can change.
*/
- n_cache = NCACHE(mp, mf_offset, pgno);
- infop = &dbmp->reginfo[n_cache];
- c_mp = infop->primary;
- hp = R_ADDR(infop, c_mp->htab);
- hp = &hp[NBUCKET(c_mp, mf_offset, pgno)];
+ MP_GET_BUCKET(dbmfp, pgno, &infop, hp, ret);
+ if (ret != 0) {
+ /* Panic: there is no way to return the error. */
+ (void)__db_panic(dbenv, ret);
+ return (0);
+ }
- MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
if (bhp->pgno != pgno || bhp->mf_offset != mf_offset)
continue;
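
__memp_skip_curadj returns only a yes/no answer, so an error from MP_GET_BUCKET has nowhere to go except a panic of the whole environment. A minimal sketch of that last-resort pattern, with hypothetical helper names:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical helpers, for illustration only. */
    static int
    lookup(int key, int *valp)
    {
        (void)key;
        (void)valp;
        return (-1);            /* may legitimately fail */
    }

    static void
    env_panic(const char *msg)
    {
        fprintf(stderr, "panic: %s\n", msg);
        abort();
    }

    /* The interface can only answer yes or no, as above. */
    static int
    should_skip(int key)
    {
        int val = 0;

        if (lookup(key, &val) != 0)
            env_panic("no way to return the error");
        return (val != 0);
    }
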
@@ -251,12 +251,12 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
*need_frozenp = 1;
/* There might be a small amount of unallocated space. */
- if (__db_shalloc(infop,
- sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), 0,
+ if (__env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
&frozen_alloc) == 0) {
frozen_bhp = (BH *)(frozen_alloc + 1);
- SH_TAILQ_INSERT_HEAD(&c_mp->alloc_frozen, frozen_alloc,
- links, __bh_frozen_a);
+ SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
+ frozen_alloc, links);
}
}
MPOOL_REGION_UNLOCK(dbenv, infop);
@@ -285,7 +285,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
if ((ret = __db_appname(dbenv, DB_APP_NONE, filename,
0, NULL, &real_name)) != 0)
goto err;
- if ((ret = __os_open_extend(dbenv, real_name, pagesize,
+ if ((ret = __os_open(dbenv, real_name, pagesize,
DB_OSO_CREATE | DB_OSO_EXCL, dbenv->db_mode, &fhp)) == 0) {
/* We're creating the file -- initialize the metadata page. */
magic = DB_FREEZER_MAGIC;
@@ -299,8 +299,8 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
(ret = __os_seek(dbenv, fhp, 0, 0, 0)) != 0)
goto err;
} else if (ret == EEXIST)
- ret = __os_open_extend(dbenv, real_name, pagesize, 0,
- dbenv->db_mode, &fhp);
+ ret = __os_open(
+ dbenv, real_name, pagesize, 0, dbenv->db_mode, &fhp);
if (ret != 0)
goto err;
if ((ret = __os_read(dbenv, fhp, &magic, sizeof(u_int32_t),
@@ -372,8 +372,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
* Increment the file's block count -- freeing the original buffer will
* decrement it.
*/
+ MUTEX_LOCK(dbenv, bh_mfp->mutex);
++bh_mfp->block_cnt;
- ++hp->hash_frozen;
+ MUTEX_UNLOCK(dbenv, bh_mfp->mutex);
+
+ STAT(++hp->hash_frozen);
if (0) {
err: if (ret == 0)
@@ -492,8 +495,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
&real_name)) != 0)
goto err;
- if ((ret = __os_open_extend(dbenv, real_name, pagesize, 0,
- dbenv->db_mode, &fhp)) != 0)
+ if ((ret = __os_open(
+ dbenv, real_name, pagesize, 0, dbenv->db_mode, &fhp)) != 0)
goto err;
/*
@@ -625,8 +628,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
if (reorder) {
if (next_bhp != NULL)
__memp_bucket_reorder(dbenv, hp, next_bhp);
- else
- hp->hash_priority = BH_PRIORITY(SH_TAILQ_FIRST(
+ else if (!SH_TAILQ_EMPTY(&hp->hash_bucket))
+ hp->hash_priority = BH_PRIORITY(SH_TAILQ_FIRSTP(
&hp->hash_bucket, __bh));
}
@@ -651,10 +654,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
F_CLR(frozen_bhp, BH_FROZEN | BH_LOCKED);
}
+#ifdef HAVE_STATISTICS
if (alloc_bhp != NULL)
++hp->hash_thawed;
else
++hp->hash_frozen_freed;
+#endif
if (0) {
err: if (ret == 0)
diff --git a/db/mp/mp_region.c b/db/mp/mp_region.c
index a02683f21..34a1ced15 100644
--- a/db/mp/mp_region.c
+++ b/db/mp/mp_region.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_region.c,v 12.21 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_region.c,v 12.33 2007/05/17 17:18:01 bostic Exp $
*/
#include "db_config.h"
@@ -12,7 +11,6 @@
#include "db_int.h"
#include "dbinc/mp.h"
-static int __memp_init __P((DB_ENV *, DB_MPOOL *, u_int, u_int32_t));
static int __memp_init_config __P((DB_ENV *, MPOOL *));
static void __memp_region_size __P((DB_ENV *, roff_t *, u_int32_t *));
@@ -20,17 +18,18 @@ static void __memp_region_size __P((DB_ENV *, roff_t *, u_int32_t *));
* __memp_open --
* Internal version of memp_open: only called from DB_ENV->open.
*
- * PUBLIC: int __memp_open __P((DB_ENV *));
+ * PUBLIC: int __memp_open __P((DB_ENV *, int));
*/
int
-__memp_open(dbenv)
+__memp_open(dbenv, create_ok)
DB_ENV *dbenv;
+ int create_ok;
{
DB_MPOOL *dbmp;
MPOOL *mp;
REGINFO reginfo;
roff_t reg_size;
- u_int i;
+ u_int i, max_nreg;
u_int32_t htab_buckets, *regids;
int ret;
@@ -50,9 +49,9 @@ __memp_open(dbenv)
reginfo.type = REGION_TYPE_MPOOL;
reginfo.id = INVALID_REGION_ID;
reginfo.flags = REGION_JOIN_OK;
- if (F_ISSET(dbenv, DB_ENV_CREATE))
+ if (create_ok)
F_SET(&reginfo, REGION_CREATE_OK);
- if ((ret = __db_r_attach(dbenv, &reginfo, reg_size)) != 0)
+ if ((ret = __env_region_attach(dbenv, &reginfo, reg_size)) != 0)
goto err;
/*
@@ -65,17 +64,18 @@ __memp_open(dbenv)
* the REGINFO structures and create them. Make sure we don't
* clear the wrong entries on error.
*/
- dbmp->nreg = dbenv->mp_ncache;
+ max_nreg = __memp_max_regions(dbenv);
if ((ret = __os_calloc(dbenv,
- dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
goto err;
/* Make sure we don't clear the wrong entries on error. */
- for (i = 0; i < dbmp->nreg; ++i)
- dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[0] = reginfo;
+ for (i = 1; i < max_nreg; ++i)
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
/* Initialize the first region. */
- if ((ret = __memp_init(dbenv, dbmp, 0, htab_buckets)) != 0)
+ if ((ret = __memp_init(dbenv, dbmp,
+ 0, htab_buckets, max_nreg)) != 0)
goto err;
/*
@@ -84,16 +84,17 @@ __memp_open(dbenv)
*/
mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
regids = R_ADDR(dbmp->reginfo, mp->regids);
- for (i = 1; i < dbmp->nreg; ++i) {
+ regids[0] = dbmp->reginfo[0].id;
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
dbmp->reginfo[i].dbenv = dbenv;
dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[i].flags = REGION_CREATE_OK;
- if ((ret = __db_r_attach(
+ if ((ret = __env_region_attach(
dbenv, &dbmp->reginfo[i], reg_size)) != 0)
goto err;
- if ((ret =
- __memp_init(dbenv, dbmp, i, htab_buckets)) != 0)
+ if ((ret = __memp_init(dbenv, dbmp,
+ i, htab_buckets, max_nreg)) != 0)
goto err;
regids[i] = dbmp->reginfo[i].id;
@@ -105,30 +106,30 @@ __memp_open(dbenv)
* information.
*/
mp = R_ADDR(&reginfo, reginfo.rp->primary);
- dbmp->nreg = mp->nreg;
+ dbenv->mp_ncache = mp->nreg;
if ((ret = __os_calloc(dbenv,
- dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
goto err;
/* Make sure we don't clear the wrong entries on error. */
- for (i = 0; i < dbmp->nreg; ++i)
+ for (i = 0; i < dbenv->mp_ncache; ++i)
dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[0] = reginfo;
/* Join remaining regions. */
regids = R_ADDR(dbmp->reginfo, mp->regids);
- for (i = 1; i < dbmp->nreg; ++i) {
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
dbmp->reginfo[i].dbenv = dbenv;
dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
dbmp->reginfo[i].id = regids[i];
dbmp->reginfo[i].flags = REGION_JOIN_OK;
- if ((ret = __db_r_attach(
+ if ((ret = __env_region_attach(
dbenv, &dbmp->reginfo[i], 0)) != 0)
goto err;
}
}
/* Set the local addresses for the regions. */
- for (i = 0; i < dbmp->nreg; ++i)
+ for (i = 0; i < dbenv->mp_ncache; ++i)
dbmp->reginfo[i].primary =
R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
@@ -147,9 +148,9 @@ __memp_open(dbenv)
err: dbenv->mp_handle = NULL;
if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
- for (i = 0; i < dbmp->nreg; ++i)
+ for (i = 0; i < dbenv->mp_ncache; ++i)
if (dbmp->reginfo[i].id != INVALID_REGION_ID)
- (void)__db_r_detach(
+ (void)__env_region_detach(
dbenv, &dbmp->reginfo[i], 0);
__os_free(dbenv, dbmp->reginfo);
}
@@ -162,27 +163,32 @@ err: dbenv->mp_handle = NULL;
/*
* __memp_init --
* Initialize a MPOOL structure in shared memory.
+ *
+ * PUBLIC: int __memp_init
+ * PUBLIC: __P((DB_ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
*/
-static int
-__memp_init(dbenv, dbmp, reginfo_off, htab_buckets)
+int
+__memp_init(dbenv, dbmp, reginfo_off, htab_buckets, max_nreg)
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- u_int reginfo_off;
+ u_int reginfo_off, max_nreg;
u_int32_t htab_buckets;
{
+ BH_FROZEN_ALLOC *frozen;
+ BH *frozen_bhp;
DB_MPOOL_HASH *htab, *hp;
- MPOOL *mp;
- REGINFO *reginfo;
+ MPOOL *mp, *main_mp;
+ REGINFO *infop;
+ db_mutex_t mtx_base, mtx_discard, mtx_prev;
u_int32_t i;
int ret;
void *p;
- reginfo = &dbmp->reginfo[reginfo_off];
- if ((ret = __db_shalloc(
- reginfo, sizeof(MPOOL), 0, &reginfo->primary)) != 0)
+ infop = &dbmp->reginfo[reginfo_off];
+ if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0)
goto mem_err;
- reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary);
- mp = reginfo->primary;
+ infop->rp->primary = R_OFFSET(infop, infop->primary);
+ mp = infop->primary;
memset(mp, 0, sizeof(*mp));
if ((ret =
@@ -192,17 +198,19 @@ __memp_init(dbenv, dbmp, reginfo_off, htab_buckets)
if (reginfo_off == 0) {
ZERO_LSN(mp->lsn);
- mp->nreg = dbmp->nreg;
- if ((ret = __db_shalloc(&dbmp->reginfo[0],
- dbmp->nreg * sizeof(u_int32_t), 0, &p)) != 0)
+ mp->nreg = dbenv->mp_ncache;
+ mp->max_nreg = max_nreg;
+ if ((ret = __env_alloc(&dbmp->reginfo[0],
+ max_nreg * sizeof(u_int32_t), &p)) != 0)
goto mem_err;
mp->regids = R_OFFSET(dbmp->reginfo, p);
+ mp->nbuckets = dbenv->mp_ncache * htab_buckets;
/* Allocate file table space and initialize it. */
- if ((ret = __db_shalloc(reginfo,
- MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
+ if ((ret = __env_alloc(infop,
+ MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0)
goto mem_err;
- mp->ftab = R_OFFSET(reginfo, htab);
+ mp->ftab = R_OFFSET(infop, htab);
for (i = 0; i < MPOOL_FILE_BUCKETS; i++) {
if ((ret = __mutex_alloc(dbenv,
MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0)
@@ -211,32 +219,80 @@ __memp_init(dbenv, dbmp, reginfo_off, htab_buckets)
htab[i].hash_page_dirty = htab[i].hash_priority = 0;
}
+ /*
+ * Allocate all of the hash bucket mutexes up front. We do
+ * this so that we don't need to free and reallocate mutexes as
+ * the cache is resized.
+ */
+ mtx_base = mtx_prev = MUTEX_INVALID;
+ for (i = 0; i < mp->max_nreg * htab_buckets; i++) {
+ if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_HASH_BUCKET,
+ 0, &mtx_discard)) != 0)
+ return (ret);
+ if (i == 0) {
+ mtx_base = mtx_discard;
+ mtx_prev = mtx_discard - 1;
+ }
+ DB_ASSERT(dbenv, mtx_discard == mtx_prev + 1 ||
+ mtx_base == MUTEX_INVALID);
+ mtx_prev = mtx_discard;
+ if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_IO,
+ DB_MUTEX_SELF_BLOCK, &mtx_discard)) != 0)
+ return (ret);
+ DB_ASSERT(dbenv, mtx_discard == mtx_prev + 1 ||
+ mtx_base == MUTEX_INVALID);
+ mtx_prev = mtx_discard;
+ }
+ } else {
+ main_mp = dbmp->reginfo[0].primary;
+ htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab);
+ mtx_base = htab[0].mtx_hash;
}
+ if (mtx_base != MUTEX_INVALID)
+ mtx_base += reginfo_off * htab_buckets;
+
/* Allocate hash table space and initialize it. */
- if ((ret = __db_shalloc(reginfo,
- htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
+ if ((ret = __env_alloc(infop,
+ htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0)
goto mem_err;
- mp->htab = R_OFFSET(reginfo, htab);
+ mp->htab = R_OFFSET(infop, htab);
for (i = 0; i < htab_buckets; i++) {
hp = &htab[i];
- if ((ret = __mutex_alloc(dbenv,
- MTX_MPOOL_HASH_BUCKET, 0, &hp->mtx_hash)) != 0)
- return (ret);
- if ((ret = __mutex_alloc(dbenv,
- MTX_MPOOL_IO, DB_MUTEX_SELF_BLOCK, &hp->mtx_io)) != 0)
- return (ret);
+ hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID :
+ mtx_base + i * 2;
+ hp->mtx_io = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID :
+ mtx_base + i * 2 + 1;
SH_TAILQ_INIT(&hp->hash_bucket);
- hp->hash_page_dirty = hp->hash_priority = hp->hash_io_wait = 0;
+ hp->hash_page_dirty = hp->hash_priority = 0;
+#ifdef HAVE_STATISTICS
+ hp->hash_io_wait = 0;
+ hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0;
+#endif
hp->flags = 0;
ZERO_LSN(hp->old_reader);
}
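
Preallocating every hash-bucket mutex in one contiguous block is what allows a bucket's two mutexes to be computed from a base ID rather than stored, freed, and reallocated as the cache is resized. A simplified sketch of the idea, using an idealized layout with POSIX mutexes rather than the exact ID offsets used above:

    #include <pthread.h>
    #include <stddef.h>

    /*
     * Sketch: one contiguous array holds two mutexes per bucket for
     * all max_nreg possible regions; (region, bucket) maps to a pair
     * index, so no per-bucket mutex bookkeeping is needed when
     * regions come and go.
     */
    struct bucket_mutexes {
        pthread_mutex_t *pairs;   /* 2 * max_nreg * htab_buckets */
        unsigned htab_buckets;
    };

    static pthread_mutex_t *
    hash_mutex(struct bucket_mutexes *bm, unsigned region, unsigned bucket)
    {
        size_t pair = (size_t)region * bm->htab_buckets + bucket;

        return (&bm->pairs[2 * pair]);  /* I/O mutex: 2 * pair + 1 */
    }
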
- mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets;
+ mp->htab_buckets = htab_buckets;
+#ifdef HAVE_STATISTICS
+ mp->stat.st_hash_buckets = htab_buckets;
+#endif
SH_TAILQ_INIT(&mp->free_frozen);
SH_TAILQ_INIT(&mp->alloc_frozen);
/*
+ * Pre-allocate one frozen buffer header. This avoids situations where
+ * the cache becomes full of pages and we don't even have the 28 bytes
+ * (or so) available to allocate a frozen buffer header.
+ */
+ if ((ret = __env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0)
+ goto mem_err;
+ frozen_bhp = (BH *)(frozen + 1);
+ SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links);
+ SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq);
+
+ /*
* Only the environment creator knows the total cache size, fill in
* those statistics now.
*/
@@ -249,6 +305,25 @@ mem_err:__db_errx(dbenv, "Unable to allocate memory for mpool region");
}
/*
+ * PUBLIC: u_int32_t __memp_max_regions __P((DB_ENV *));
+ */
+u_int32_t
+__memp_max_regions(dbenv)
+ DB_ENV *dbenv;
+{
+ roff_t reg_size, max_size;
+ u_int32_t max_nreg;
+
+ __memp_region_size(dbenv, &reg_size, NULL);
+ max_size = (roff_t)dbenv->mp_max_gbytes * GIGABYTE +
+ dbenv->mp_max_bytes;
+ max_nreg = (max_size + reg_size / 2) / reg_size;
+ if (max_nreg <= dbenv->mp_ncache)
+ max_nreg = dbenv->mp_ncache;
+ return (max_nreg);
+}
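
The expression (max_size + reg_size / 2) / reg_size rounds to the nearest whole region instead of truncating. A worked example with assumed sizes:

    #include <stdint.h>
    #include <stdio.h>

    #define GIGABYTE ((uint64_t)1024 * 1024 * 1024)

    /* Assumed configuration: 256MB regions, 1.1GB maximum cache. */
    int
    main(void)
    {
        uint64_t reg_size = 256 * 1024 * 1024;
        uint64_t max_size = GIGABYTE + 100 * 1024 * 1024;

        /* 1124MB / 256MB = 4.39 regions; nearest whole region is 4. */
        printf("max_nreg = %u\n",
            (unsigned)((max_size + reg_size / 2) / reg_size));
        return (0);
    }
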
+
+/*
* __memp_region_size --
* Size the region and figure out how many hash buckets we'll have.
*/
@@ -258,15 +333,16 @@ __memp_region_size(dbenv, reg_sizep, htab_bucketsp)
roff_t *reg_sizep;
u_int32_t *htab_bucketsp;
{
- roff_t reg_size;
+ roff_t reg_size, cache_size;
/*
* Figure out how big each cache region is. Cast an operand to roff_t
* so we do 64-bit arithmetic as appropriate.
*/
- reg_size = ((roff_t)GIGABYTE / dbenv->mp_ncache) * dbenv->mp_gbytes;
- reg_size += dbenv->mp_bytes / dbenv->mp_ncache;
- *reg_sizep = reg_size;
+ cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes;
+ reg_size = cache_size / dbenv->mp_ncache;
+ if (reg_sizep != NULL)
+ *reg_sizep = reg_size;
/*
* Figure out how many hash buckets each region will have. Assume we
@@ -281,7 +357,9 @@ __memp_region_size(dbenv, reg_sizep, htab_bucketsp)
* something we need to worry about right now, but is checked when the
* cache size is set.
*/
- *htab_bucketsp = __db_tablesize((u_int32_t)(reg_size / (10 * 1024)));
+ if (htab_bucketsp != NULL)
+ *htab_bucketsp =
+ __db_tablesize((u_int32_t)(reg_size / (10 * 1024)));
}
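
A worked example of the sizing above, under an assumed configuration of a 2GB cache split into 4 regions:

    #include <stdint.h>
    #include <stdio.h>

    #define GIGABYTE ((uint64_t)1024 * 1024 * 1024)

    int
    main(void)
    {
        uint64_t cache_size = 2 * GIGABYTE;   /* assumed: 2GB, 4 caches */
        uint32_t ncache = 4;
        uint64_t reg_size = cache_size / ncache;

        /* 512MB regions; ~52,428 buckets before __db_tablesize
         * rounds the estimate to a suitable table size. */
        printf("reg_size = %lluMB, buckets ~= %llu\n",
            (unsigned long long)(reg_size >> 20),
            (unsigned long long)(reg_size / (10 * 1024)));
        return (0);
    }
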
/*
@@ -294,10 +372,9 @@ u_int32_t
__memp_region_mutex_count(dbenv)
DB_ENV *dbenv;
{
- roff_t reg_size;
u_int32_t htab_buckets;
- __memp_region_size(dbenv, &reg_size, &htab_buckets);
+ __memp_region_size(dbenv, NULL, &htab_buckets);
/*
* We need a couple of mutexes for the region itself, one for each
@@ -334,13 +411,13 @@ __memp_init_config(dbenv, mp)
}
/*
- * __memp_dbenv_refresh --
+ * __memp_env_refresh --
* Clean up after the mpool system on a close or failed open.
*
- * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *));
+ * PUBLIC: int __memp_env_refresh __P((DB_ENV *));
*/
int
-__memp_dbenv_refresh(dbenv)
+__memp_env_refresh(dbenv)
DB_ENV *dbenv;
{
BH *bhp;
@@ -349,53 +426,72 @@ __memp_dbenv_refresh(dbenv)
DB_MPOOLFILE *dbmfp;
DB_MPOOL_HASH *hp;
DB_MPREG *mpreg;
- MPOOL *mp;
- REGINFO *reginfo;
- u_int32_t bucket, i;
+ MPOOL *mp, *c_mp;
+ REGINFO *infop;
+ db_mutex_t mtx_base, mtx;
+ u_int32_t bucket, htab_buckets, i, max_nreg, nreg;
int ret, t_ret;
ret = 0;
dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ htab_buckets = mp->htab_buckets;
+ nreg = mp->nreg;
+ max_nreg = mp->max_nreg;
+ hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
+ mtx_base = hp->mtx_hash;
/*
* If a private region, return the memory to the heap. Not needed for
* filesystem-backed or system shared memory regions, that memory isn't
* owned by any particular process.
- *
- * Discard buffers.
*/
- if (F_ISSET(dbenv, DB_ENV_PRIVATE))
- for (i = 0; i < dbmp->nreg; ++i) {
- reginfo = &dbmp->reginfo[i];
- mp = reginfo->primary;
- for (hp = R_ADDR(reginfo, mp->htab), bucket = 0;
- bucket < mp->htab_buckets; ++hp, ++bucket) {
- while ((bhp = SH_TAILQ_FIRST(
- &hp->hash_bucket, __bh)) != NULL)
- if (F_ISSET(bhp, BH_FROZEN))
- SH_TAILQ_REMOVE(
- &hp->hash_bucket, bhp,
- hq, __bh);
- else if ((t_ret = __memp_bhfree(
- dbmp, hp, bhp,
+ if (!F_ISSET(dbenv, DB_ENV_PRIVATE))
+ goto not_priv;
+
+ /* Discard buffers. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
+ bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ while ((bhp = SH_TAILQ_FIRST(
+ &hp->hash_bucket, __bh)) != NULL)
+ if (F_ISSET(bhp, BH_FROZEN))
+ SH_TAILQ_REMOVE(
+ &hp->hash_bucket, bhp,
+ hq, __bh);
+ else {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ --hp->hash_page_dirty;
+ F_CLR(bhp,
+ BH_DIRTY | BH_DIRTY_CREATE);
+ }
+ if ((t_ret = __memp_bhfree(
+ dbmp, infop, hp, bhp,
BH_FREE_FREEMEM |
BH_FREE_UNLOCKED)) != 0 && ret == 0)
ret = t_ret;
- if ((t_ret = __mutex_free(
- dbenv, &hp->mtx_hash)) != 0 && ret == 0)
- ret = t_ret;
- if ((t_ret = __mutex_free(
- dbenv, &hp->mtx_io)) != 0 && ret == 0)
- ret = t_ret;
- }
- while ((frozen_alloc = SH_TAILQ_FIRST(
- &mp->alloc_frozen, __bh_frozen_a)) != NULL) {
- SH_TAILQ_REMOVE(&mp->alloc_frozen, frozen_alloc,
- links, __bh_frozen_a);
- __db_shalloc_free(reginfo, frozen_alloc);
- }
+ }
+ }
+ while ((frozen_alloc = SH_TAILQ_FIRST(
+ &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
+ links, __bh_frozen_a);
+ __env_alloc_free(infop, frozen_alloc);
+ }
+ }
+
+ /* Discard hash bucket mutexes. */
+ if (mtx_base != MUTEX_INVALID)
+ for (i = 0; i < 2 * max_nreg * htab_buckets; ++i) {
+ mtx = mtx_base + i;
+ if ((t_ret = __mutex_free(dbenv, &mtx)) != 0 &&
+ ret == 0)
+ ret = t_ret;
}
+not_priv:
/* Discard DB_MPOOLFILEs. */
while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
@@ -415,25 +511,25 @@ __memp_dbenv_refresh(dbenv)
if (F_ISSET(dbenv, DB_ENV_PRIVATE)) {
/* Discard REGION IDs. */
- reginfo = &dbmp->reginfo[0];
- mp = dbmp->reginfo[0].primary;
- __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->regids));
+ infop = &dbmp->reginfo[0];
+ __memp_free(infop, NULL, R_ADDR(infop, mp->regids));
/* Discard the File table. */
- __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->ftab));
+ __memp_free(infop, NULL, R_ADDR(infop, mp->ftab));
/* Discard Hash tables. */
- for (i = 0; i < dbmp->nreg; ++i) {
- reginfo = &dbmp->reginfo[i];
- mp = reginfo->primary;
- __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->htab));
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ __memp_free(infop, NULL, R_ADDR(infop, c_mp->htab));
}
}
/* Detach from the region. */
- for (i = 0; i < dbmp->nreg; ++i) {
- reginfo = &dbmp->reginfo[i];
- if ((t_ret = __db_r_detach(dbenv, reginfo, 0)) != 0 && ret == 0)
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ if ((t_ret =
+ __env_region_detach(dbenv, infop, 0)) != 0 && ret == 0)
ret = t_ret;
}
diff --git a/db/mp/mp_register.c b/db/mp/mp_register.c
index 1ca5f8311..ef5269d42 100644
--- a/db/mp/mp_register.c
+++ b/db/mp/mp_register.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_register.c,v 12.11 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_register.c,v 12.13 2007/05/17 15:15:45 bostic Exp $
*/
#include "db_config.h"
diff --git a/db/mp/mp_resize.c b/db/mp/mp_resize.c
new file mode 100644
index 000000000..241f37e4b
--- /dev/null
+++ b/db/mp/mp_resize.c
@@ -0,0 +1,559 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006,2007 Oracle. All rights reserved.
+ *
+ * $Id: mp_resize.c,v 12.5 2007/06/05 11:55:28 mjc Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __memp_add_bucket __P((DB_MPOOL *));
+static int __memp_add_region __P((DB_MPOOL *));
+static int __memp_map_regions __P((DB_MPOOL *));
+static int __memp_merge_buckets
+ __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t));
+static int __memp_remove_bucket __P((DB_MPOOL *));
+static int __memp_remove_region __P((DB_MPOOL *));
+
+/*
+ * PUBLIC: int __memp_get_bucket
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **));
+ */
+int
+__memp_get_bucket(dbmfp, pgno, infopp, hpp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ REGINFO **infopp;
+ DB_MPOOL_HASH **hpp;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp, *mp;
+ REGINFO *infop;
+ roff_t mf_offset;
+ u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region;
+ u_int32_t *regids;
+ int ret;
+
+ dbenv = dbmfp->dbenv;
+ dbmp = dbenv->mp_handle;
+ mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ for (;;) {
+ nbuckets = mp->nbuckets;
+ MP_BUCKET(mf_offset, pgno, nbuckets, bucket);
+
+ /*
+ * Once we work out which region we are looking in, we have to
+ * check that we have that region mapped, and that the version
+ * we have matches the ID in the main mpool region. Otherwise
+ * we have to go and map in any regions that don't match and
+ * retry.
+ */
+ region = NREGION(mp, bucket);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+
+ for (;;) {
+ infop = *infopp = &dbmp->reginfo[region];
+ c_mp = infop->primary;
+
+ /* If we have the correct region mapped, we're done. */
+ if (c_mp != NULL && regids[region] == infop->id)
+ break;
+ if ((ret = __memp_map_regions(dbmp)) != 0)
+ return (ret);
+ }
+
+ /* If our caller wants the hash bucket, lock it here. */
+ if (hpp != NULL) {
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bucket - region * mp->htab_buckets];
+
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+
+ /*
+ * Check that we still have the correct region mapped.
+ */
+ if (regids[region] != infop->id) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ continue;
+ }
+
+ /*
+ * Now that the bucket is locked, we need to check that
+ * the cache has not been resized while we waited.
+ */
+ new_nbuckets = mp->nbuckets;
+ if (nbuckets != new_nbuckets) {
+ MP_BUCKET(mf_offset, pgno, new_nbuckets,
+ new_bucket);
+
+ if (new_bucket != bucket) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ continue;
+ }
+ }
+
+ *hpp = hp;
+ }
+
+ break;
+ }
+
+ return (ret);
+}
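
Because a resize can change mp->nbuckets while a thread is blocked on a bucket mutex, the function locks first and revalidates afterwards. The same lock-and-revalidate loop in miniature, with a hypothetical table type and POSIX mutexes:

    #include <pthread.h>
    #include <stdint.h>

    struct table {
        pthread_mutex_t *locks;   /* one mutex per bucket */
        uint32_t nbuckets;        /* changed only by a resizer */
    };

    /*
     * Sketch: compute the bucket from the shared count, lock it,
     * then re-check -- if a resize moved the page's bucket while we
     * waited, drop the lock and retry.  Returns with the bucket
     * mutex held.
     */
    static uint32_t
    get_bucket(struct table *t, uint32_t hash)
    {
        uint32_t bucket;

        for (;;) {
            bucket = hash % t->nbuckets;
            pthread_mutex_lock(&t->locks[bucket]);
            if (hash % t->nbuckets == bucket)
                return (bucket);
            pthread_mutex_unlock(&t->locks[bucket]);
        }
    }
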
+
+static int
+__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
+ DB_MPOOL *dbmp;
+ u_int32_t new_nbuckets, old_bucket, new_bucket;
+{
+ BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *new_hp, *old_hp;
+ MPOOL *mp, *new_mp, *old_mp;
+ MPOOLFILE *mfp;
+ REGINFO *new_infop, *old_infop;
+ u_int32_t bucket, high_mask, new_region, old_region;
+ int ret;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ new_bhp = NULL;
+ ret = 0;
+
+ MP_MASK(new_nbuckets, high_mask);
+
+ old_region = NREGION(mp, old_bucket);
+ old_infop = &dbmp->reginfo[old_region];
+ old_mp = old_infop->primary;
+ old_hp = R_ADDR(old_infop, old_mp->htab);
+ old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets];
+
+ new_region = NREGION(mp, new_bucket);
+ new_infop = &dbmp->reginfo[new_region];
+ new_mp = new_infop->primary;
+ new_hp = R_ADDR(new_infop, new_mp->htab);
+ new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets];
+
+ /*
+ * Before merging, we need to check that there are no old buffers left
+ * in the target hash bucket after a previous split.
+ */
+free_old:
+ MUTEX_LOCK(dbenv, new_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
+ MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+
+ if (bucket != new_bucket) {
+ /*
+ * There is no way that an old buffer can be locked
+ * after a split, since everyone will look for it in
+ * the new hash bucket.
+ */
+ DB_ASSERT(dbenv, !F_ISSET(bhp, BH_LOCKED | BH_DIRTY) &&
+ bhp->ref == 0);
+ if ((ret = __memp_bhfree(dbmp,
+ new_infop, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
+ MUTEX_UNLOCK(dbenv, new_hp->mtx_hash);
+ return (ret);
+ }
+
+ /*
+ * The free has modified the list of buffers and
+ * dropped the mutex. We need to start again.
+ */
+ goto free_old;
+ }
+ }
+ MUTEX_UNLOCK(dbenv, new_hp->mtx_hash);
+
+ /*
+ * Before we begin, make sure that all of the buffers we care about are
+ * not in use and not frozen. We do this because we can't drop the old
+ * hash bucket mutex once we start moving buffers around.
+ */
+retry: MUTEX_LOCK(dbenv, old_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+
+ if (bucket == new_bucket &&
+ (F_ISSET(bhp, BH_LOCKED) || bhp->ref != 0)) {
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ __os_yield(dbenv);
+ goto retry;
+ } else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) {
+ if (BH_OBSOLETE(bhp, old_hp->old_reader))
+ alloc_bhp = NULL;
+ else {
+ ++bhp->ref;
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ if ((ret = __memp_alloc(dbmp,
+ old_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ return (ret);
+ MUTEX_LOCK(dbenv, old_hp->mtx_hash);
+ }
+ if ((ret = __memp_bh_thaw(dbmp,
+ old_infop, old_hp, bhp, alloc_bhp)) != 0) {
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ return (ret);
+ }
+
+ /*
+ * We've dropped the mutex in order to thaw, so we need
+ * to go back to the beginning and check that all of
+ * the buffers we care about are still unlocked and
+ * unreferenced.
+ */
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ goto retry;
+ }
+ }
+
+ /*
+ * We now know that all of the buffers we care about are unlocked and
+ * unreferenced. Go ahead and copy them.
+ */
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * We ignore buffers that don't hash to the new bucket. We
+ * could also ignore clean buffers which are not part of a
+ * multiversion chain as long as they have a backing file.
+ */
+ if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) &&
+ SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file))
+ continue;
+
+ for (current_bhp = bhp, next_bhp = NULL;
+ current_bhp != NULL;
+ current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh),
+ next_bhp = alloc_bhp) {
+ if ((ret = __memp_alloc(dbmp,
+ new_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ break;
+
+ alloc_bhp->ref = current_bhp->ref;
+ alloc_bhp->ref_sync = current_bhp->ref_sync;
+ alloc_bhp->priority = current_bhp->priority;
+ alloc_bhp->pgno = current_bhp->pgno;
+ alloc_bhp->mf_offset = current_bhp->mf_offset;
+ alloc_bhp->flags = current_bhp->flags;
+ alloc_bhp->td_off = current_bhp->td_off;
+
+ /*
+ * We've duplicated the buffer, so now we need to
+ * update reference counts, including the counts in the
+ * per-MPOOLFILE and the transaction detail (for MVCC
+ * buffers).
+ */
+ MUTEX_LOCK(dbenv, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+
+ if (alloc_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_add_buffer(dbenv,
+ R_ADDR(&dbenv->tx_handle->reginfo,
+ alloc_bhp->td_off))) != 0)
+ break;
+
+ memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);
+
+ /*
+ * We build up the MVCC chain first, then insert the
+ * head (stored in new_bhp) once.
+ */
+ if (next_bhp == NULL) {
+ SH_CHAIN_INIT(alloc_bhp, vc);
+ new_bhp = alloc_bhp;
+ } else
+ SH_CHAIN_INSERT_BEFORE(
+ next_bhp, alloc_bhp, vc, __bh);
+ }
+
+ MUTEX_LOCK(dbenv, new_hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
+ if (F_ISSET(new_bhp, BH_DIRTY))
+ ++new_hp->hash_page_dirty;
+
+ /*
+ * We're doing an insertion sort, so it is O(N**2), but since
+ * buckets should be small, that should not matter. When
+ * splitting a bucket, we traverse in priority order and append
+ * to the new bucket, and __memp_bucket_reorder is O(1) in that
+ * case.
+ */
+ __memp_bucket_reorder(dbenv, new_hp, new_bhp);
+ MUTEX_UNLOCK(dbenv, new_hp->mtx_hash);
+
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY);
+ --old_hp->hash_page_dirty;
+ }
+ }
+
+ if (ret == 0)
+ mp->nbuckets = new_nbuckets;
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+
+ return (ret);
+}
+
+static int
+__memp_add_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+
+ new_bucket = mp->nbuckets;
+ /* We should always be adding buckets to the last region. */
+ DB_ASSERT(dbenv, NREGION(mp, new_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets, high_mask);
+ old_bucket = new_bucket & (high_mask >> 1);
+
+ /*
+ * With fixed-sized regions, the new region is always smaller than the
+ * existing total cache size, so buffers always need to be copied. If
+ * we implement variable region sizes, it's possible that we will be
+ * splitting a hash bucket in the new region. Catch that here.
+ */
+ DB_ASSERT(dbenv, NREGION(mp, old_bucket) != NREGION(mp, new_bucket));
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets + 1,
+ old_bucket, new_bucket));
+}
+
+static int
+__memp_add_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ roff_t reg_size;
+ u_int i;
+ u_int32_t *regids;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ /* All cache regions are the same size. */
+ reg_size = dbmp->reginfo[0].rp->size;
+ ret = 0;
+
+ infop = &dbmp->reginfo[mp->nreg];
+ infop->dbenv = dbenv;
+ infop->type = REGION_TYPE_MPOOL;
+ infop->id = INVALID_REGION_ID;
+ infop->flags = REGION_CREATE_OK;
+ if ((ret = __env_region_attach(dbenv, infop, reg_size)) != 0)
+ return (ret);
+ if ((ret = __memp_init(dbenv,
+ dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
+ return (ret);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ regids[mp->nreg++] = infop->id;
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_add_bucket(dbmp)) != 0)
+ break;
+
+ return (ret);
+}
+
+static int
+__memp_remove_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+
+ old_bucket = mp->nbuckets - 1;
+
+ /* We should always be removing buckets from the last region. */
+ DB_ASSERT(dbenv, NREGION(mp, old_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets - 1, high_mask);
+ new_bucket = old_bucket & (high_mask >> 1);
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets - 1,
+ old_bucket, new_bucket));
+}
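
Adding and removing buckets are the two halves of a linear-hashing scheme: growing from N to N+1 buckets splits bucket N & (high_mask >> 1) into itself and bucket N, and shrinking merges the same pair back. A small demonstration, assuming MP_MASK yields the smallest all-ones mask covering the bucket count:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed MP_MASK behavior: smallest all-ones mask covering n. */
    static uint32_t
    mask_for(uint32_t n)
    {
        uint32_t m;

        for (m = 1; m < n; m = (m << 1) | 1)
            ;
        return (m);
    }

    int
    main(void)
    {
        uint32_t nbuckets = 8;
        uint32_t new_bucket = nbuckets;   /* bucket 8 */
        uint32_t old_bucket = new_bucket & (mask_for(nbuckets) >> 1);

        /* Growing 8 -> 9 buckets splits bucket 0 into 0 and 8;
         * shrinking 9 -> 8 merges bucket 8 back into bucket 0. */
        printf("split %u -> %u\n", old_bucket, new_bucket);
        return (0);
    }
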
+
+static int
+__memp_remove_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ u_int i;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ if (mp->nreg == 1) {
+ __db_errx(dbenv, "cannot remove the last cache");
+ return (EINVAL);
+ }
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_remove_bucket(dbmp)) != 0)
+ return (ret);
+
+ /* Detach from the region then destroy it. */
+ infop = &dbmp->reginfo[--mp->nreg];
+ return (__env_region_detach(dbenv, infop, 1));
+}
+
+static int
+__memp_map_regions(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ int ret;
+ u_int i;
+ u_int32_t *regids;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ ret = 0;
+
+ for (i = 1; i < mp->nreg; ++i) {
+ if (dbmp->reginfo[i].primary != NULL &&
+ dbmp->reginfo[i].id == regids[i])
+ continue;
+
+ if (dbmp->reginfo[i].primary != NULL)
+ ret = __env_region_detach(dbenv, &dbmp->reginfo[i], 0);
+
+ dbmp->reginfo[i].dbenv = dbenv;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = regids[i];
+ dbmp->reginfo[i].flags = REGION_JOIN_OK;
+ if ((ret =
+ __env_region_attach(dbenv, &dbmp->reginfo[i], 0)) != 0)
+ return (ret);
+ dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i],
+ dbmp->reginfo[i].rp->primary);
+ }
+
+ for (; i < mp->max_nreg; i++)
+ if (dbmp->reginfo[i].primary != NULL &&
+ (ret = __env_region_detach(dbenv,
+ &dbmp->reginfo[i], 0)) != 0)
+ break;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
+ */
+int
+__memp_resize(dbmp, gbytes, bytes)
+ DB_MPOOL *dbmp;
+ u_int32_t gbytes, bytes;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ int ret;
+ u_int32_t ncache;
+ roff_t reg_size, total_size;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ total_size = (roff_t)gbytes * GIGABYTE + bytes;
+ ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);
+
+ if (ncache < 1)
+ ncache = 1;
+ else if (ncache > mp->max_nreg) {
+ __db_errx(dbenv,
+ "cannot resize to %lu cache regions: maximum is %lu",
+ (u_long)ncache, (u_long)mp->max_nreg);
+ return (EINVAL);
+ }
+
+ ret = 0;
+ MUTEX_LOCK(dbenv, mp->mtx_resize);
+ while (mp->nreg != ncache)
+ if ((ret = (mp->nreg < ncache ?
+ __memp_add_region(dbmp) :
+ __memp_remove_region(dbmp))) != 0)
+ break;
+ MUTEX_UNLOCK(dbenv, mp->mtx_resize);
+
+ return (ret);
+}
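
A hedged sketch of how an application might exercise this resizing, assuming the public DB_ENV->set_cache_max and DB_ENV->set_cachesize methods route to __memp_set_cache_max and (once the environment is open) to __memp_resize; error handling is omitted:

    #include <db.h>

    void
    grow_cache(DB_ENV *dbenv)
    {
        /* Before DB_ENV->open: 32MB initial cache, growable to 1GB. */
        (void)dbenv->set_cachesize(dbenv, 0, 32 * 1024 * 1024, 1);
        (void)dbenv->set_cache_max(dbenv, 1, 0);

        /* ... after DB_ENV->open: resize the live cache; mpool adds
         * or removes one region at a time under mtx_resize. */
        (void)dbenv->set_cachesize(dbenv, 0, 512 * 1024 * 1024, 0);
    }
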
+
+/*
+ * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *max_gbytesp, *max_bytesp;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+ roff_t reg_size, max_size;
+
+ ENV_NOT_CONFIGURED(dbenv,
+ dbenv->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(dbenv)) {
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ max_size = mp->max_nreg * reg_size;
+ *max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
+ *max_bytesp = (u_int32_t)(max_size % GIGABYTE);
+ } else {
+ *max_gbytesp = dbenv->mp_max_gbytes;
+ *max_bytesp = dbenv->mp_max_bytes;
+ }
+
+ return (0);
+}
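
The division and modulus split one 64-bit byte total back into the (gbytes, bytes) pair the API uses. For example, under assumed sizes:

    #include <stdint.h>
    #include <stdio.h>

    #define GIGABYTE ((uint64_t)1024 * 1024 * 1024)

    /* Assumed: 5 regions of 256MB = 1,342,177,280 bytes, reported
     * as 1GB plus 268,435,456 bytes. */
    int
    main(void)
    {
        uint64_t max_size = (uint64_t)5 * 256 * 1024 * 1024;

        printf("gbytes=%u bytes=%u\n",
            (unsigned)(max_size / GIGABYTE),
            (unsigned)(max_size % GIGABYTE));
        return (0);
    }
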
+
+/*
+ * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__memp_set_cache_max(dbenv, max_gbytes, max_bytes)
+ DB_ENV *dbenv;
+ u_int32_t max_gbytes, max_bytes;
+{
+ ENV_ILLEGAL_AFTER_OPEN(dbenv, "DB_ENV->set_cache_max");
+ dbenv->mp_max_gbytes = max_gbytes;
+ dbenv->mp_max_bytes = max_bytes;
+
+ return (0);
+}
diff --git a/db/mp/mp_stat.c b/db/mp/mp_stat.c
index b4d4544b5..0e7b6c237 100644
--- a/db/mp/mp_stat.c
+++ b/db/mp/mp_stat.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_stat.c,v 12.28 2006/09/11 14:53:42 bostic Exp $
+ * $Id: mp_stat.c,v 12.36 2007/06/22 17:41:29 bostic Exp $
*/
#include "db_config.h"
@@ -104,10 +103,10 @@ __memp_stat(dbenv, gspp, fspp, flags)
* a per-cache basis. Note that configuration information
* may be modified at any time, and so we have to lock.
*/
- c_mp = dbmp->reginfo[0].primary;
- sp->st_gbytes = c_mp->stat.st_gbytes;
- sp->st_bytes = c_mp->stat.st_bytes;
- sp->st_ncache = dbmp->nreg;
+ sp->st_gbytes = mp->stat.st_gbytes;
+ sp->st_bytes = mp->stat.st_bytes;
+ sp->st_ncache = mp->nreg;
+ sp->st_max_ncache = mp->max_nreg;
sp->st_regsize = dbmp->reginfo[0].rp->size;
MPOOL_SYSTEM_LOCK(dbenv);
@@ -165,7 +164,8 @@ __memp_stat(dbenv, gspp, fspp, flags)
c_mp->stat.st_alloc_max_pages;
if (LF_ISSET(DB_STAT_CLEAR)) {
- __mutex_clear(dbenv, c_mp->mtx_region);
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ __mutex_clear(dbenv, c_mp->mtx_region);
MPOOL_SYSTEM_LOCK(dbenv);
st_bytes = c_mp->stat.st_bytes;
@@ -388,9 +388,10 @@ __memp_stat_print(dbenv, flags)
int ret;
orig_flags = flags;
- LF_CLR(DB_STAT_CLEAR);
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
- ret = __memp_print_stats(dbenv, orig_flags);
+ ret = __memp_print_stats(dbenv,
+ LF_ISSET(DB_STAT_ALL) ? flags : orig_flags);
if (flags == 0 || ret != 0)
return (ret);
}
@@ -423,6 +424,7 @@ __memp_print_stats(dbenv, flags)
__db_dlbytes(dbenv, "Total cache size",
(u_long)gsp->st_gbytes, (u_long)0, (u_long)gsp->st_bytes);
__db_dl(dbenv, "Number of caches", (u_long)gsp->st_ncache);
+ __db_dl(dbenv, "Maximum number of caches", (u_long)gsp->st_max_ncache);
__db_dlbytes(dbenv, "Pool individual cache size",
(u_long)0, (u_long)0, (u_long)gsp->st_regsize);
__db_dlbytes(dbenv, "Maximum memory-mapped file size",
@@ -551,7 +553,7 @@ __memp_print_all(dbenv, flags)
MPOOL_SYSTEM_LOCK(dbenv);
- __db_print_reginfo(dbenv, dbmp->reginfo, "Mpool");
+ __db_print_reginfo(dbenv, dbmp->reginfo, "Mpool", flags);
__db_msg(dbenv, "%s", DB_GLOBAL(db_line));
__db_msg(dbenv, "MPOOL structure:");
@@ -567,7 +569,7 @@ __memp_print_all(dbenv, flags)
__db_msg(dbenv, "DB_MPOOL handle information:");
__mutex_print_debug_single(
dbenv, "DB_MPOOL handle mutex", dbmp->mutex, flags);
- STAT_ULONG("Underlying cache regions", dbmp->nreg);
+ STAT_ULONG("Underlying cache regions", mp->nreg);
__db_msg(dbenv, "%s", DB_GLOBAL(db_line));
__db_msg(dbenv, "DB_MPOOLFILE structures:");
@@ -709,9 +711,11 @@ __memp_print_hash(dbenv, dbmp, reginfo, fmap, flags)
bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
MUTEX_LOCK(dbenv, hp->mtx_hash);
if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) {
- __db_msgadd(dbenv, &mb, "bucket %lu: %lu, %lu ",
+ __db_msgadd(dbenv, &mb,
+ "bucket %lu: %lu, %lu (%lu dirty)",
(u_long)bucket, (u_long)hp->hash_io_wait,
- (u_long)hp->hash_priority);
+ (u_long)hp->hash_priority,
+ (u_long)hp->hash_page_dirty);
if (hp->hash_frozen != 0)
__db_msgadd(dbenv, &mb, "(MVCC %lu/%lu/%lu) ",
(u_long)hp->hash_frozen,
@@ -822,7 +826,8 @@ __memp_stat_wait(dbenv, reginfo, mp, mstat, flags)
mstat->st_hash_max_wait = tmp_wait;
mstat->st_hash_max_nowait = tmp_nowait;
}
- if (LF_ISSET(DB_STAT_CLEAR))
+ if (LF_ISSET(DB_STAT_CLEAR |
+ DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR)
__mutex_clear(dbenv, hp->mtx_hash);
mstat->st_io_wait += hp->hash_io_wait;
diff --git a/db/mp/mp_sync.c b/db/mp/mp_sync.c
index 898ae5b6d..5db83fc7b 100644
--- a/db/mp/mp_sync.c
+++ b/db/mp/mp_sync.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_sync.c,v 12.24 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_sync.c,v 12.52 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -12,6 +11,8 @@
#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
typedef struct {
DB_MPOOL_HASH *track_hp; /* Hash bucket. */
@@ -21,8 +22,8 @@ typedef struct {
} BH_TRACK;
static int __bhcmp __P((const void *, const void *));
-static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *, int));
-static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *));
+static int __memp_close_flush_files __P((DB_ENV *, int));
+static int __memp_sync_files __P((DB_ENV *));
static int __memp_sync_file __P((DB_ENV *,
MPOOLFILE *, void *, u_int32_t *, u_int32_t));
@@ -93,7 +94,7 @@ __memp_sync_pp(dbenv, lsnp)
dbenv->lg_handle, "memp_sync", DB_INIT_LOG);
ENV_ENTER(dbenv, ip);
- REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, lsnp)), ret);
+ REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, DB_SYNC_CACHE, lsnp)), ret);
ENV_LEAVE(dbenv, ip);
return (ret);
}
@@ -102,16 +103,17 @@ __memp_sync_pp(dbenv, lsnp)
* __memp_sync --
* DB_ENV->memp_sync.
*
- * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *));
+ * PUBLIC: int __memp_sync __P((DB_ENV *, u_int32_t, DB_LSN *));
*/
int
-__memp_sync(dbenv, lsnp)
+__memp_sync(dbenv, flags, lsnp)
DB_ENV *dbenv;
+ u_int32_t flags;
DB_LSN *lsnp;
{
DB_MPOOL *dbmp;
MPOOL *mp;
- int ret;
+ int interrupted, ret;
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
@@ -128,10 +130,11 @@ __memp_sync(dbenv, lsnp)
MPOOL_SYSTEM_UNLOCK(dbenv);
}
- if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0)
+ if ((ret =
+ __memp_sync_int(dbenv, NULL, 0, flags, NULL, &interrupted)) != 0)
return (ret);
- if (lsnp != NULL) {
+ if (!interrupted && lsnp != NULL) {
MPOOL_SYSTEM_LOCK(dbenv);
if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
mp->lsn = *lsnp;
@@ -195,7 +198,8 @@ __memp_fsync(dbmfp)
if (mfp->file_written == 0)
return (0);
- return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
+ return (__memp_sync_int(
+ dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
}
/*
@@ -209,6 +213,8 @@ __mp_xxx_fh(dbmfp, fhp)
DB_MPOOLFILE *dbmfp;
DB_FH **fhp;
{
+ int ret;
+
/*
* This is a truly spectacular layering violation, intended ONLY to
* support compatibility for the DB 1.85 DB->fd call.
@@ -226,7 +232,10 @@ __mp_xxx_fh(dbmfp, fhp)
if ((*fhp = dbmfp->fhp) != NULL)
return (0);
- return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
+ if ((ret = __memp_sync_int(
+ dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
+ *fhp = dbmfp->fhp;
+ return (ret);
}
/*
@@ -234,14 +243,14 @@ __mp_xxx_fh(dbmfp, fhp)
* Mpool sync internal function.
*
* PUBLIC: int __memp_sync_int __P((DB_ENV *,
- * PUBLIC: DB_MPOOLFILE *, u_int32_t, db_sync_op, u_int32_t *));
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
*/
int
-__memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
+__memp_sync_int(dbenv, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
- u_int32_t trickle_max, *wrotep;
- db_sync_op op;
+ u_int32_t trickle_max, flags, *wrote_totalp;
+ int *interruptedp;
{
BH *bhp;
BH_TRACK *bharray;
@@ -251,20 +260,32 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
MPOOLFILE *mfp;
db_mutex_t mutex;
roff_t last_mf_offset;
- u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote;
- int filecnt, hb_lock, maxopenfd, maxwrite, maxwrite_sleep;
- int pass, ret, t_ret, wait_cnt, write_cnt;
+ u_int32_t ar_cnt, ar_max, dirty, i, n_cache, remaining, wrote_total;
+ int filecnt, maxopenfd, pass, required_write, ret, t_ret;
+ int wait_cnt, wrote_cnt;
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
last_mf_offset = INVALID_ROFF;
- filecnt = pass = wrote = 0;
+ filecnt = pass = wrote_total = 0;
+
+ if (wrote_totalp != NULL)
+ *wrote_totalp = 0;
+ if (interruptedp != NULL)
+ *interruptedp = 0;
+
+ /*
+	 * If we're flushing the cache, doing a checkpoint or flushing a
+ * specific file, we really have to write the blocks and we have to
+ * confirm they made it to disk. Otherwise, we can skip a block if
+ * it's hard to get.
+ */
+ required_write = LF_ISSET(DB_SYNC_CACHE |
+ DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);
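
The four modes named in the comment all promise that pages reach disk, so they collapse into a single required_write test. A sketch with made-up flag values (the real DB_SYNC_* constants are internal to DB):

    #include <stdint.h>

    #define DB_SYNC_CACHE        0x0001   /* hypothetical values */
    #define DB_SYNC_CHECKPOINT   0x0002
    #define DB_SYNC_FILE         0x0004
    #define DB_SYNC_QUEUE_EXTENT 0x0008
    #define DB_SYNC_TRICKLE      0x0010

    /* Writes must reach disk for every mode except trickle. */
    static int
    is_required_write(uint32_t flags)
    {
        return ((flags & (DB_SYNC_CACHE | DB_SYNC_CHECKPOINT |
            DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT)) != 0);
    }
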
/* Get shared configuration information. */
MPOOL_SYSTEM_LOCK(dbenv);
maxopenfd = mp->mp_maxopenfd;
- maxwrite = mp->mp_maxwrite;
- maxwrite_sleep = mp->mp_maxwrite_sleep;
MPOOL_SYSTEM_UNLOCK(dbenv);
/* Assume one dirty page per bucket. */
@@ -284,43 +305,60 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
/*
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero. We
- * can ignore empty buckets because we only need write
- * buffers that were dirty before we started.
+ * We can check for empty buckets before locking as
+ * we only care if the pointer is zero or non-zero.
+ * We can ignore empty or clean buckets because we
+ * only need write buffers that were dirty before
+ * we started.
*/
+#ifdef DIAGNOSTIC
if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+#else
+ if (hp->hash_page_dirty == 0)
+#endif
continue;
+ dirty = 0;
MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
/* Always ignore clean pages. */
if (!F_ISSET(bhp, BH_DIRTY))
continue;
+ dirty++;
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
/*
- * Ignore in-memory files, even if they are
- * temp files to whom a backing file has been
- * allocated.
+ * Ignore in-memory files, unless the file is
+ * specifically being flushed.
*/
- if (mfp->no_backing_file ||
+ if (mfp->no_backing_file)
+ continue;
+ if (!LF_ISSET(DB_SYNC_FILE) &&
F_ISSET(mfp, MP_TEMP))
continue;
/*
- * If we're flushing a specific file, see if
- * this page is from that file.
+ * Ignore files that aren't involved in DB's
+ * transactional operations during checkpoints.
*/
- if (dbmfp != NULL && mfp != dbmfp->mfp)
+ if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
+ mfp->lsn_off == DB_LSN_OFF_NOTSET)
continue;
/*
- * Ignore files that aren't involved in DB's
- * transactional operations during checkpoints.
+ * Ignore files that aren't Queue extent files
+ * if we're flushing a Queue file with extents.
*/
- if (dbmfp == NULL && mfp->lsn_off == -1)
+ if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
+ !F_ISSET(mfp, MP_EXTENT))
+ continue;
+
+ /*
+ * If we're flushing a specific file, see if
+ * this page is from that file.
+ */
+ if (dbmfp != NULL && mfp != dbmfp->mfp)
continue;
/* Track the buffer, we want it. */
@@ -343,10 +381,25 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
ar_max *= 2;
}
}
+ DB_ASSERT(dbenv, dirty == hp->hash_page_dirty);
+ if (dirty != hp->hash_page_dirty) {
+ __db_errx(dbenv,
+ "memp_sync: correcting dirty count %lu %lu",
+ (u_long)hp->hash_page_dirty, (u_long)dirty);
+ hp->hash_page_dirty = dirty;
+ }
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
if (ret != 0)
goto err;
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
+ mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
}
}
@@ -366,7 +419,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* If we're trickling buffers, only write enough to reach the correct
* percentage.
*/
- if (op == DB_SYNC_TRICKLE && ar_cnt > trickle_max)
+ if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
ar_cnt = trickle_max;
/*
@@ -385,7 +438,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* out its hash bucket pointer so we don't process a slot more than
* once.
*/
- for (i = pass = write_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
+ for (i = pass = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
if (i >= ar_cnt) {
i = 0;
++pass;
@@ -429,44 +482,40 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
*/
if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
MUTEX_UNLOCK(dbenv, mutex);
- if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) {
+ if (!required_write) {
--remaining;
bharray[i].track_hp = NULL;
}
continue;
}
- /*
- * The buffer is dirty and may also be pinned.
- *
- * Set the sync wait-for count, used to count down outstanding
- * references to this buffer as they are returned to the cache.
- */
- bhp->ref_sync = bhp->ref;
-
/* Pin the buffer into memory and lock it. */
++bhp->ref;
F_SET(bhp, BH_LOCKED);
/*
- * Unlock the hash bucket and wait for the wait-for count to
- * go to 0. No new thread can acquire the buffer because we
- * have it locked.
+ * If the buffer is referenced by another thread, set the sync
+ * wait-for count (used to count down outstanding references to
+ * this buffer as they are returned to the cache), then unlock
+ * the hash bucket and wait for the count to go to 0. No other
+ * thread can acquire the buffer because we have it locked.
*
* If a thread attempts to re-pin a page, the wait-for count
- * will never go to 0 (the thread spins on our buffer lock,
+ * will never go to 0 (that thread spins on our buffer lock,
* while we spin on the thread's ref count). Give up if we
- * don't get the buffer in 3 seconds, we can try again later.
+ * don't get the buffer in 3 seconds, we'll try again later.
*
* If, when the wait-for count goes to 0, the buffer is found
* to be dirty, write it.
*/
- MUTEX_UNLOCK(dbenv, mutex);
- for (wait_cnt = 1;
- bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
- __os_sleep(dbenv, 1, 0);
- MUTEX_LOCK(dbenv, mutex);
- hb_lock = 1;
+ bhp->ref_sync = bhp->ref - 1;
+ if (bhp->ref_sync != 0) {
+ MUTEX_UNLOCK(dbenv, mutex);
+ for (wait_cnt = 1;
+ bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
+ __os_sleep(dbenv, 1, 0);
+ MUTEX_LOCK(dbenv, mutex);
+ }
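
The rewritten wait only drops the bucket mutex when another thread actually holds a reference, and it gives up after roughly three seconds rather than blocking the sync forever. The bounded wait in isolation, with a hypothetical buffer type:

    #include <unistd.h>

    struct buf {
        volatile int ref_sync;    /* outstanding references to drain */
    };

    /* Returns nonzero if the references drained within ~3 seconds. */
    static int
    wait_for_refs(struct buf *bhp)
    {
        int wait_cnt;

        for (wait_cnt = 1; bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
            sleep(1);
        return (bhp->ref_sync == 0);
    }
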
/*
* If we've switched files, check to see if we're configured
@@ -476,7 +525,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
if (++filecnt >= maxopenfd) {
filecnt = 0;
if ((t_ret = __memp_close_flush_files(
- dbenv, dbmp, 1)) != 0 && ret == 0)
+ dbenv, 1)) != 0 && ret == 0)
ret = t_ret;
}
last_mf_offset = bhp->mf_offset;
@@ -496,28 +545,18 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* dirty, we write it. We only try to write the buffer once.
*/
if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
- MUTEX_UNLOCK(dbenv, mutex);
- hb_lock = 0;
-
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
if ((t_ret =
- __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
- ++wrote;
- else {
+ __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
+ ++wrote_cnt;
+ ++wrote_total;
+ } else {
if (ret == 0)
ret = t_ret;
__db_errx
(dbenv, "%s: unable to flush page: %lu",
__memp_fns(dbmp, mfp), (u_long)bhp->pgno);
- }
- /*
- * Avoid saturating the disk, sleep once we've done
- * some number of writes.
- */
- if (maxwrite != 0 && ++write_cnt >= maxwrite) {
- write_cnt = 0;
- __os_sleep(dbenv, 0, (u_long)maxwrite_sleep);
}
}
@@ -525,18 +564,9 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* If ref_sync count never went to 0, the buffer was written
* by another thread, or the write failed, we still have the
* buffer locked.
- *
- * We may or may not currently hold the hash bucket mutex. If
- * the __memp_bhwrite -> __memp_pgwrite call was successful,
- * __memp_pgwrite will have acquired the hash bucket lock; all
- * other call paths will leave us without the hash bucket lock.
*/
- if (F_ISSET(bhp, BH_LOCKED)) {
- if (!hb_lock)
- MUTEX_LOCK(dbenv, mutex);
-
+ if (F_ISSET(bhp, BH_LOCKED))
F_CLR(bhp, BH_LOCKED);
- }
/*
* Reset the ref_sync count regardless of our success, we're
@@ -548,7 +578,8 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
--bhp->ref;
/*
- * If a thread of control is waiting on this buffer, wake it up.
+ * If a thread of control is waiting in this hash bucket, wake
+ * it up.
*/
if (F_ISSET(hp, IO_WAITER)) {
F_CLR(hp, IO_WAITER);
@@ -557,29 +588,51 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
/* Release the hash bucket mutex. */
MUTEX_UNLOCK(dbenv, mutex);
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
+ FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
+
+ /*
+ * Sleep after some number of writes to avoid disk saturation.
+		 * Don't cache the max writes value; an application shutting
+ * down might reset the value in order to do a fast flush or
+ * checkpoint.
+ */
+ if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
+ !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
+ mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
+ wrote_cnt = 0;
+ __os_sleep(
+ dbenv, 0, (u_long)mp->mp_maxwrite_sleep);
+ }
}
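
Re-reading mp->mp_maxwrite on every pass, instead of caching it as the deleted code did, lets an application zero the throttle mid-sync for a fast flush. The throttle itself reduces to a counter and a sleep; a sketch with hypothetical parameters:

    #include <unistd.h>

    /* Sleep after every max_writes successful writes; 0 disables. */
    static void
    throttle_writes(int *wrote_cntp, int max_writes, unsigned sleep_usec)
    {
        if (max_writes != 0 && *wrote_cntp >= max_writes) {
            *wrote_cntp = 0;
            usleep(sleep_usec);
        }
    }
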
done: /*
- * If doing a checkpoint or flushing a file for the application, we
- * have to force the pages to disk. We don't do this as we go along
- * because we want to give the OS as much time as possible to lazily
- * flush, and because we have to flush files that might not even have
- * had dirty buffers in the cache, so we have to walk the files list.
+ * If a write is required, we have to force the pages to disk. We
+ * don't do this as we go along because we want to give the OS as
+ * much time as possible to lazily flush, and because we have to flush
+ * files that might not even have had dirty buffers in the cache, so
+ * we have to walk the files list.
*/
- if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
+ if (ret == 0 && required_write) {
if (dbmfp == NULL)
- ret = __memp_sync_files(dbenv, dbmp);
+ ret = __memp_sync_files(dbenv);
else
ret = __os_fsync(dbenv, dbmfp->fhp);
}
/* If we've opened files to flush pages, close them. */
- if ((t_ret = __memp_close_flush_files(dbenv, dbmp, 0)) != 0 && ret == 0)
+ if ((t_ret = __memp_close_flush_files(dbenv, 0)) != 0 && ret == 0)
ret = t_ret;
err: __os_free(dbenv, bharray);
- if (wrotep != NULL)
- *wrotep = wrote;
+ if (wrote_totalp != NULL)
+ *wrote_totalp = wrote_total;
return (ret);
}
@@ -651,28 +704,23 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags)
/* If we don't find a handle we can use, open one. */
if (dbmfp == NULL) {
- if ((ret = __memp_mf_sync(dbmp, mfp, 0)) != 0) {
+ if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
__db_err(dbenv, ret,
"%s: unable to flush", (char *)
R_ADDR(dbmp->reginfo, mfp->path_off));
}
- } else {
+ } else
ret = __os_fsync(dbenv, dbmfp->fhp);
- if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
- ret = t_ret;
- }
-
/*
* Re-acquire the MPOOLFILE mutex, we need it to modify the
* reference count.
*/
MUTEX_LOCK(dbenv, mfp->mutex);
- --mfp->mpf_cnt;
/*
- * If we wrote the file and there are no open handles (or there
- * is a single open handle, and it's the one we opened to write
+ * If we wrote the file and there are no other references (or there
+ * is a single reference, and it's the one we opened to write
* buffers during checkpoint), clear the file_written flag. We
* do this so that applications opening thousands of files don't
* loop here opening and flushing those files during checkpoint.
@@ -684,7 +732,7 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags)
* the region lock, no possibility of another thread of control
* racing with us to open a MPOOLFILE.
*/
- if (mfp->mpf_cnt == 0 || (mfp->mpf_cnt == 1 &&
+ if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
mfp->file_written = 0;
@@ -696,31 +744,44 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags)
* I mean, what are the chances that there aren't any
* buffers in the pool? Regardless, it might happen.)
*/
- if (mfp->mpf_cnt == 0 && mfp->block_cnt == 0)
+ if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
*(int *)argp = 1;
}
- /* Unlock the MPOOLFILE, and move to the next entry. */
+ /*
+ * If we found the file we must close it in case we are the last
+ * reference to the dbmfp. NOTE: since we have incremented
+ * mfp->mpf_cnt this cannot be the last reference to the mfp.
+ * This is important since we are called with the hash bucket
+ * locked. The mfp will get freed via the cleanup pass.
+ */
+ if (dbmfp != NULL && (t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ --mfp->mpf_cnt;
+
+ /* Unlock the MPOOLFILE. */
MUTEX_UNLOCK(dbenv, mfp->mutex);
- return (0);
+ return (ret);
}
/*
* __memp_sync_files --
* Sync all the files in the environment, open or not.
*/
-static
-int __memp_sync_files(dbenv, dbmp)
+static int
+__memp_sync_files(dbenv)
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
{
+ DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
MPOOL *mp;
MPOOLFILE *mfp, *next_mfp;
int i, need_discard_pass, ret;
- need_discard_pass = ret = 0;
+ dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
+ need_discard_pass = ret = 0;
ret = __memp_walk_files(dbenv,
mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_NOERROR);
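
This hunk also reorders the teardown: __memp_fclose() now runs while the walker still holds its MPOOLFILE reference, so closing the last per-process handle cannot free the shared structure while the hash bucket is locked; the later cleanup pass discards it instead. A sketch of that pin-then-close ordering, with hypothetical types:

    struct shared { unsigned refcnt; /* ... */ };
    struct handle;                  /* per-process view, opaque here */

    extern int close_handle(struct handle *);  /* may drop one refcnt */

    static int
    release(struct shared *sp, struct handle *hp)
    {
        int ret = 0;

        /*
         * Our own reference, taken before the walk, is still counted,
         * so even if close_handle() releases the handle's reference,
         * sp cannot reach zero and be freed here.
         */
        if (hp != NULL)
            ret = close_handle(hp);

        --sp->refcnt;       /* drop our pin last, under sp's mutex */
        return (ret);
    }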
@@ -734,7 +795,7 @@ int __memp_sync_files(dbenv, dbmp)
hp = R_ADDR(dbmp->reginfo, mp->ftab);
for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
- MUTEX_LOCK(dbenv, hp->mtx_hash);
+retry: MUTEX_LOCK(dbenv, hp->mtx_hash);
for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
__mpoolfile); mfp != NULL; mfp = next_mfp) {
next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
@@ -743,13 +804,17 @@ int __memp_sync_files(dbenv, dbmp)
* without a mutex on the MPOOLFILE. If likely to
* succeed, lock the MPOOLFILE down and look for real.
*/
- if (mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
+ if (mfp->deadfile ||
+ mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
continue;
MUTEX_LOCK(dbenv, mfp->mutex);
- if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
+ if (!mfp->deadfile &&
+ mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
(void)__memp_mf_discard(dbmp, mfp);
- else
+ goto retry;
+ } else
MUTEX_UNLOCK(dbenv, mfp->mutex);
}
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
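
The discard pass above is the classic check, lock, re-check pattern: the unlocked test is only a hint, so it is repeated under the MPOOLFILE mutex, and because __memp_mf_discard() must run without the hash-bucket lock, the loop drops that lock and restarts the bucket scan, since releasing it invalidates the list iterator. A generic sketch, hypothetical names throughout:

    struct entry { struct entry *next; int dead; unsigned blocks, refs; };
    struct bucket { struct entry *head; };

    extern void lock_bucket(struct bucket *), unlock_bucket(struct bucket *);
    extern void lock_entry(struct entry *), unlock_entry(struct entry *);
    extern void discard(struct entry *);    /* frees e, drops its mutex */

    static void
    discard_idle(struct bucket *bp)
    {
        struct entry *e, *next;

    retry:  lock_bucket(bp);
        for (e = bp->head; e != NULL; e = next) {
            next = e->next;
            if (e->dead || e->blocks != 0 || e->refs != 0)
                continue;               /* unlocked test: a hint */
            lock_entry(e);
            if (!e->dead && e->blocks == 0 && e->refs == 0) {
                unlock_bucket(bp);      /* discard needs it unlocked */
                discard(e);
                goto retry;             /* iterator no longer valid */
            }
            unlock_entry(e);
        }
        unlock_bucket(bp);
    }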
@@ -764,28 +829,36 @@ int __memp_sync_files(dbenv, dbmp)
* PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
*/
int
-__memp_mf_sync(dbmp, mfp, region_locked)
+__memp_mf_sync(dbmp, mfp, locked)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
- int region_locked;
+ int locked;
{
DB_ENV *dbenv;
DB_FH *fhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *mp;
int ret, t_ret;
char *rpath;
+ COMPQUIET(hp, NULL);
dbenv = dbmp->dbenv;
/*
- * We need to be holding the region lock: we're using the path name
+ * We need to be holding the hash lock: we're using the path name
	 * and __memp_nameop might try to rename the file.
*/
- if (!region_locked)
- MPOOL_SYSTEM_LOCK(dbenv);
+ if (!locked) {
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ hp += FNBUCKET(
+ R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+ }
if ((ret = __db_appname(dbenv, DB_APP_DATA,
R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
- if ((ret = __os_open(dbenv, rpath, 0, 0, &fhp)) == 0) {
+ if ((ret = __os_open(dbenv, rpath, 0, 0, 0, &fhp)) == 0) {
ret = __os_fsync(dbenv, fhp);
if ((t_ret =
__os_closehandle(dbenv, fhp)) != 0 && ret == 0)
@@ -794,8 +867,8 @@ __memp_mf_sync(dbmp, mfp, region_locked)
__os_free(dbenv, rpath);
}
- if (!region_locked)
- MPOOL_SYSTEM_UNLOCK(dbenv);
+ if (!locked)
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (ret);
}
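
With this change __memp_mf_sync() serializes on the single file-name hash bucket (located with FNBUCKET from the file id) instead of the region-wide lock, since only the path lookup needs protection against a concurrent __memp_nameop() rename. A sketch of that narrowing, with hypothetical helpers:

    #include <stddef.h>

    #define NBUCKETS 17     /* stand-in for MPOOL_FILE_BUCKETS */

    extern unsigned hash_bytes(const void *, size_t);
    extern void lock_bucket(unsigned), unlock_bucket(unsigned);
    extern void fsync_by_path(const char *);

    static void
    sync_by_name(const void *fileid, size_t idlen, const char *path,
        int locked)
    {
        unsigned b = 0;

        /* Hash the file id to the one bucket guarding its name. */
        if (!locked) {
            b = hash_bytes(fileid, idlen) % NBUCKETS;
            lock_bucket(b);
        }
        fsync_by_path(path);    /* path can't be renamed while held */
        if (!locked)
            unlock_bucket(b);
    }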
@@ -805,15 +878,17 @@ __memp_mf_sync(dbmp, mfp, region_locked)
* Close files opened only to flush buffers.
*/
static int
-__memp_close_flush_files(dbenv, dbmp, dosync)
+__memp_close_flush_files(dbenv, dosync)
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
int dosync;
{
+ DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
int ret;
+ dbmp = dbenv->mp_handle;
+
/*
* The routine exists because we must close files opened by sync to
* flush buffers. There are two cases: first, extent files have to
diff --git a/db/mp/mp_trickle.c b/db/mp/mp_trickle.c
index d1d3853aa..cbe7af4f2 100644
--- a/db/mp/mp_trickle.c
+++ b/db/mp/mp_trickle.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_trickle.c,v 12.9 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_trickle.c,v 12.16 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -89,15 +88,21 @@ __memp_trickle(dbenv, pct, nwrotep)
if (total == 0 || dirty == 0)
return (0);
- clean = total - dirty;
+ /*
+ * The total number of pages is an exact number, but the dirty page
+ * count can change while we're walking the hash buckets, and it's
+ * even possible the dirty page count ends up larger than the total
+ * number of pages.
+ */
+ clean = total > dirty ? total - dirty : 0;
need_clean = (total * (u_int)pct) / 100;
if (clean >= need_clean)
return (0);
need_clean -= clean;
- ret = __memp_sync_int(
- dbenv, NULL, need_clean, DB_SYNC_TRICKLE, &wrote);
- mp->stat.st_page_trickle += wrote;
+ ret = __memp_sync_int(dbenv, NULL,
+ need_clean, DB_SYNC_TRICKLE | DB_SYNC_INTERRUPT_OK, &wrote, NULL);
+ STAT((mp->stat.st_page_trickle += wrote));
if (nwrotep != NULL)
*nwrotep = (int)wrote;
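
To make the clamped arithmetic concrete: with total = 1000, pct = 20 and a racily-read dirty = 950, clean is 50 against a target of 200, so the trickle writes 150 pages; if the race leaves dirty above total, clean clamps to 0 instead of wrapping. A self-contained check of that computation (a hypothetical wrapper, not the BDB function):

    #include <assert.h>

    static unsigned
    pages_to_write(unsigned total, unsigned dirty, unsigned pct)
    {
        unsigned clean, need_clean;

        /* dirty is read without a lock and may exceed total. */
        clean = total > dirty ? total - dirty : 0;
        need_clean = (total * pct) / 100;
        return (clean >= need_clean ? 0 : need_clean - clean);
    }

    int
    main(void)
    {
        assert(pages_to_write(1000, 950, 20) == 150);
        assert(pages_to_write(1000, 1100, 20) == 200);  /* clamped */
        return (0);
    }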