Diffstat (limited to 'db')
-rw-r--r--   db/env/db_salloc.c |  34
-rw-r--r--   db/mp/mp_sync.c    | 909
2 files changed, 445 insertions, 498 deletions
diff --git a/db/env/db_salloc.c b/db/env/db_salloc.c index 4780107c5..881c63bbd 100644 --- a/db/env/db_salloc.c +++ b/db/env/db_salloc.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: db_salloc.c,v 11.10 2000/12/06 19:55:44 ubell Exp $"; +static const char revid[] = "Id: db_salloc.c,v 11.15 2002/02/22 01:55:53 mjc Exp "; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -59,8 +59,8 @@ __db_shalloc_init(area, size) } /* - * __db_shalloc -- - * Allocate some space from the shared region. + * __db_shalloc_size -- + * Return size of the shared region, including alignment. * * PUBLIC: int __db_shalloc_size __P((size_t, size_t)); */ @@ -81,7 +81,7 @@ __db_shalloc_size(len, align) if (align <= sizeof(db_align_t)) align = sizeof(db_align_t); - return (ALIGN(len, align) + sizeof (struct __data)); + return ((int)(ALIGN(len, align) + sizeof (struct __data))); } /* @@ -284,28 +284,6 @@ __db_shalloc_free(regionp, ptr) } /* - * __db_shalloc_count -- - * Return the amount of memory on the free list. - * - * PUBLIC: size_t __db_shalloc_count __P((void *)); - */ -size_t -__db_shalloc_count(addr) - void *addr; -{ - struct __data *elp; - size_t count; - - count = 0; - for (elp = SH_LIST_FIRST((struct __head *)addr, __data); - elp != NULL; - elp = SH_LIST_NEXT(elp, links, __data)) - count += elp->len; - - return (count); -} - -/* * __db_shsizeof -- * Return the size of a shalloc'd piece of memory. * @@ -355,6 +333,6 @@ __db_shalloc_dump(addr, fp) for (elp = SH_LIST_FIRST((struct __head *)addr, __data); elp != NULL; elp = SH_LIST_NEXT(elp, links, __data)) - fprintf(fp, "%#lx: %lu\t", (u_long)elp, (u_long)elp->len); + fprintf(fp, "%#lx: %lu\t", P_TO_ULONG(elp), (u_long)elp->len); fprintf(fp, "\n"); } diff --git a/db/mp/mp_sync.c b/db/mp/mp_sync.c index 1b0751db7..8c176a385 100644 --- a/db/mp/mp_sync.c +++ b/db/mp/mp_sync.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $"; +static const char revid[] = "Id: mp_sync.c,v 11.63 2002/08/13 18:30:07 bostic Exp "; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,339 +16,92 @@ static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic E #include <stdlib.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_shash.h" -#include "mp.h" +#include "dbinc/db_shash.h" +#include "dbinc/mp.h" -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +typedef struct { + DB_MPOOL_HASH *track_hp; /* Hash bucket. */ + + roff_t track_off; /* Page file offset. */ + db_pgno_t track_pgno; /* Page number. */ +} BH_TRACK; static int __bhcmp __P((const void *, const void *)); -static int __memp_fsync __P((DB_MPOOLFILE *)); -static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *)); +static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *)); +static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *)); /* - * memp_sync -- + * __memp_sync -- * Mpool sync function. 
+ * + * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *)); */ int -memp_sync(dbenv, lsnp) +__memp_sync(dbenv, lsnp) DB_ENV *dbenv; DB_LSN *lsnp; { - BH *bhp, **bharray; DB_MPOOL *dbmp; - DB_LSN tlsn; - MPOOL *c_mp, *mp; - MPOOLFILE *mfp; - u_int32_t ar_cnt, i, ndirty; - int ret, retry_done, retry_need, wrote; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_sync(dbenv, lsnp)); -#endif + MPOOL *mp; + int ret; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL); - - dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL); /* - * If no LSN is provided, flush the entire cache. - * - * !!! - * Our current behavior is to flush the entire cache, so there's - * nothing special we have to do here other than deal with NULL - * pointers. + * If no LSN is provided, flush the entire cache (reasonable usage + * even if there's no log subsystem configured). */ - if (lsnp == NULL) { - ZERO_LSN(tlsn); - lsnp = &tlsn; - F_SET(mp, MP_LSN_RETRY); - } else if (!LOGGING_ON(dbenv)) { - __db_err(dbenv, "memp_sync: requires logging"); - return (EINVAL); - } + if (lsnp != NULL) + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "memp_sync", DB_INIT_LOG); - /* - * Sync calls are single-threaded so that we don't have multiple - * threads, with different checkpoint LSNs, walking the caches - * and updating the checkpoint LSNs and how many buffers remain - * to be written for the checkpoint. This shouldn't be a problem, - * any application that has multiple checkpoint threads isn't what - * I'd call trustworthy. - */ - MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp); + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; - /* - * If the application is asking about a previous call to memp_sync(), - * and we haven't found any buffers that the application holding the - * pin couldn't write, return yes or no based on the current count. - * Note, if the application is asking about a LSN *smaller* than one - * we've already handled or are currently handling, then we return a - * result based on the count for the larger LSN. - */ - R_LOCK(dbenv, dbmp->reginfo); - if (!IS_ZERO_LSN(*lsnp) && - !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) { - if (mp->lsn_cnt == 0) { + /* If we've flushed to the requested LSN, return that information. */ + if (lsnp != NULL) { + R_LOCK(dbenv, dbmp->reginfo); + if (log_compare(lsnp, &mp->lsn) <= 0) { *lsnp = mp->lsn; - ret = 0; - } else - ret = DB_INCOMPLETE; + R_UNLOCK(dbenv, dbmp->reginfo); + return (0); + } R_UNLOCK(dbenv, dbmp->reginfo); - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); - return (ret); } - /* - * Allocate room for a list of buffers, and decide how many buffers - * we can pin down. - * - * !!! - * Note: __memp_sballoc has released the region lock if we're not - * continuing forward. - */ - if ((ret = - __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) { - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); + if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0) return (ret); - } - retry_done = 0; -retry: retry_need = 0; - /* - * Start a new checkpoint. - * - * Save the LSN. We know that it's a new LSN, a retry, or larger than - * the one for which we were already doing a checkpoint. (BTW, I don't - * expect to see multiple LSN's from the same or multiple processes, - * but You Just Never Know. Responding as if they all called with the - * largest of the LSNs specified makes everything work.) 
- * - * We don't currently use the LSN we save. We could potentially save - * the last-written LSN in each buffer header and use it to determine - * what buffers need to be written. The problem with this is that it's - * sizeof(LSN) more bytes of buffer header. We currently write all the - * dirty buffers instead, but with a sufficiently large cache that's - * going to be a problem. - */ - mp->lsn = *lsnp; - - /* - * Clear the global count of buffers waiting to be written, walk the - * list of files clearing the count of buffers waiting to be written. - * - * Clear the retry flag. - */ - mp->lsn_cnt = 0; - for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); - mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) - mfp->lsn_cnt = 0; - F_CLR(mp, MP_LSN_RETRY); - - /* - * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written (we can't - * know if they'll need to be written until the holder returns them to - * the cache). We do this in one pass while holding the region locked - * so that processes can't make new buffers dirty, causing us to never - * finish. Since the application may have restarted the sync using a - * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that - * appear leftover from previous calls. - * - * Keep a count of the total number of buffers we need to write in - * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count. - */ - for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { - if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) { - F_SET(bhp, BH_SYNC); - - ++mp->lsn_cnt; - - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); - ++mfp->lsn_cnt; - - /* - * If the buffer isn't being used, we can write - * it immediately, so increment its reference - * count to lock it down, and save a reference - * to it. - * - * If we've run out space to store buffer refs, - * we're screwed. We don't want to realloc the - * array while holding a region lock, so we set - * a flag and deal with it later. - */ - if (bhp->ref == 0) { - ++bhp->ref; - bharray[ar_cnt] = bhp; - - if (++ar_cnt >= ndirty) { - retry_need = 1; - break; - } - } - } else - if (F_ISSET(bhp, BH_SYNC)) - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - } - if (ar_cnt >= ndirty) - break; - } - - /* If there no buffers we can write immediately, we're done. */ - if (ar_cnt == 0) { - ret = mp->lsn_cnt ? DB_INCOMPLETE : 0; - goto done; - } - - R_UNLOCK(dbenv, dbmp->reginfo); - - /* - * Sort the buffers we're going to write immediately. - * - * We try and write the buffers in file/page order: it should reduce - * seeks by the underlying filesystem and possibly reduce the actual - * number of writes. - */ - if (ar_cnt > 1) - qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); - - /* - * Flush the log. We have to ensure the log records reflecting the - * changes on the database pages we're writing have already made it - * to disk. We usually do that as we write each page, but if we - * are going to write a large number of pages, repeatedly acquiring - * the log region lock is going to be expensive. Flush the entire - * log now, so that sync doesn't require any more log flushes. - */ - if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0) - goto done; - - R_LOCK(dbenv, dbmp->reginfo); - - /* Walk the array, writing buffers. 
*/ - for (i = 0; i < ar_cnt; ++i) { - /* - * It's possible for a thread to have gotten the buffer since - * we listed it for writing. If the reference count is still - * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. - */ - if (bharray[i]->ref > 1) { - --bharray[i]->ref; - continue; - } - - /* Write the buffer. */ - mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset); - ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote); - - /* Release the buffer. */ - --bharray[i]->ref; - - if (ret == 0 && wrote) - continue; - - /* - * Any process syncing the shared memory buffer pool had best - * be able to write to any underlying file. Be understanding, - * but firm, on this point. - */ - if (ret == 0) { - __db_err(dbenv, "%s: unable to flush page: %lu", - __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno); - ret = EPERM; - } - - /* - * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no - * future checkpoint return can depend on this failure. Clear - * the buffer's BH_SYNC flag, because it's used to determine - * if lsn_cnt values are incremented/decremented. Don't bother - * to reset/clear: - * - * MPOOL->lsn_cnt - * MPOOLFILE->lsn_cnt - * - * they don't make any difference. - */ - ZERO_LSN(mp->lsn); - F_SET(mp, MP_LSN_RETRY); - - /* Release any buffers we're still pinning down. */ - while (++i < ar_cnt) { - bhp = bharray[i]; - --bhp->ref; - F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH); - } - - goto done; - } - - ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0; - - /* - * If there were too many buffers and we're not returning an error, we - * re-try the checkpoint once -- since we allocated 80% of the total - * buffer count, once should be enough. If it still doesn't work, some - * other thread of control is dirtying buffers as fast as we're writing - * them, and we might as well give up for now. In the latter case, set - * the global retry flag, we'll have to start from scratch on the next - * checkpoint. - */ - if (retry_need) { - if (retry_done) { - ret = DB_INCOMPLETE; - F_SET(mp, MP_LSN_RETRY); - } else { - retry_done = 1; - goto retry; - } + if (lsnp != NULL) { + R_LOCK(dbenv, dbmp->reginfo); + if (log_compare(lsnp, &mp->lsn) > 0) + mp->lsn = *lsnp; + R_UNLOCK(dbenv, dbmp->reginfo); } -done: R_UNLOCK(dbenv, dbmp->reginfo); - MUTEX_UNLOCK(dbenv, &mp->sync_mutex); - - __os_free(bharray, ndirty * sizeof(BH *)); - - return (ret); + return (0); } /* - * memp_fsync -- + * __memp_fsync -- * Mpool file sync function. + * + * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *)); */ int -memp_fsync(dbmfp) +__memp_fsync(dbmfp) DB_MPOOLFILE *dbmfp; { DB_ENV *dbenv; DB_MPOOL *dbmp; - int is_tmp; dbmp = dbmfp->dbmp; dbenv = dbmp->dbenv; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_memp_fsync(dbmfp)); -#endif - PANIC_CHECK(dbenv); /* @@ -359,13 +112,10 @@ memp_fsync(dbmfp) if (F_ISSET(dbmfp, MP_READONLY)) return (0); - R_LOCK(dbenv, dbmp->reginfo); - is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP); - R_UNLOCK(dbenv, dbmp->reginfo); - if (is_tmp) + if (F_ISSET(dbmfp->mfp, MP_TEMP)) return (0); - return (__memp_fsync(dbmfp)); + return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); } /* @@ -379,6 +129,7 @@ __mp_xxx_fh(dbmfp, fhp) DB_MPOOLFILE *dbmfp; DB_FH **fhp; { + DB_ENV *dbenv; /* * This is a truly spectacular layering violation, intended ONLY to * support compatibility for the DB 1.85 DB->fd call. 
@@ -393,239 +144,457 @@ __mp_xxx_fh(dbmfp, fhp) * because we want to write to the backing file regardless so that * we get a file descriptor to return. */ - *fhp = &dbmfp->fh; - return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp)); + *fhp = dbmfp->fhp; + if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) + return (0); + dbenv = dbmfp->dbmp->dbenv; + + return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); } /* - * __memp_fsync -- - * Mpool file internal sync function. + * __memp_sync_int -- + * Mpool sync internal function. + * + * PUBLIC: int __memp_sync_int + * PUBLIC: __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *)); */ -static int -__memp_fsync(dbmfp) +int +__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep) + DB_ENV *dbenv; DB_MPOOLFILE *dbmfp; + int ar_max, *wrotep; + db_sync_op op; { - BH *bhp, **bharray; - DB_ENV *dbenv; + BH *bhp; + BH_TRACK *bharray; DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + DB_MUTEX *mutexp; MPOOL *c_mp, *mp; - size_t mf_offset; - u_int32_t ar_cnt, i, ndirty; - int incomplete, ret, retry_done, retry_need, wrote; + MPOOLFILE *mfp; + u_int32_t n_cache; + int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote; - dbmp = dbmfp->dbmp; - dbenv = dbmp->dbenv; + dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; - - R_LOCK(dbenv, dbmp->reginfo); + pass = wrote = 0; /* - * Allocate room for a list of buffers, and decide how many buffers - * we can pin down. - * - * !!! - * Note: __memp_sballoc has released our region lock if we're not - * continuing forward. + * If the caller does not specify how many pages assume one + * per bucket. */ + if (ar_max == 0) + ar_max = mp->nreg * mp->htab_buckets; + if ((ret = - __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) + __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0) return (ret); - retry_done = 0; -retry: retry_need = 0; /* * Walk each cache's list of buffers and mark all dirty buffers to be - * written and all pinned buffers to be potentially written (we can't - * know if they'll need to be written until the holder returns them to - * the cache). We do this in one pass while holding the region locked - * so that processes can't make new buffers dirty, causing us to never - * finish. + * written and all pinned buffers to be potentially written, depending + * on our flags. */ - mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); - for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) { - if (!F_ISSET(bhp, BH_DIRTY) || - bhp->mf_offset != mf_offset) - continue; - if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) { - incomplete = 1; - continue; - } + for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) { + c_mp = dbmp->reginfo[n_cache].primary; + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + for (i = 0; i < c_mp->htab_buckets; i++, hp++) { /* - * If the buffer isn't being used, we can write - * it immediately, so increment its reference - * count to lock it down, and save a reference - * to it. - * - * If we've run out space to store buffer refs, - * we're screwed. We don't want to realloc the - * array while holding a region lock, so we set - * a flag and deal with it later. + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. We + * can ignore empty buckets because we only need write + * buffers that were dirty before we started. 
*/ - ++bhp->ref; - bharray[ar_cnt] = bhp; - if (++ar_cnt >= ndirty) { - retry_need = 1; - break; + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { + /* Always ignore unreferenced, clean pages. */ + if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY)) + continue; + + /* + * Checkpoints have to wait on all pinned pages, + * as pages may be marked dirty when returned to + * the cache. + * + * File syncs only wait on pages both pinned and + * dirty. (We don't care if pages are marked + * dirty when returned to the cache, that means + * there's another writing thread and flushing + * the cache for this handle is meaningless.) + */ + if (op == DB_SYNC_FILE && + !F_ISSET(bhp, BH_DIRTY)) + continue; + + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* + * Ignore temporary files -- this means you + * can't even flush temporary files by handle. + * (Checkpoint doesn't require temporary files + * be flushed and the underlying buffer write + * write routine may not be able to write it + * anyway.) + */ + if (F_ISSET(mfp, MP_TEMP)) + continue; + + /* + * If we're flushing a specific file, see if + * this page is from that file. + */ + if (dbmfp != NULL && mfp != dbmfp->mfp) + continue; + + /* + * Ignore files that aren't involved in DB's + * transactional operations during checkpoints. + */ + if (dbmfp == NULL && mfp->lsn_off == -1) + continue; + + /* Track the buffer, we want it. */ + bharray[ar_cnt].track_hp = hp; + bharray[ar_cnt].track_pgno = bhp->pgno; + bharray[ar_cnt].track_off = bhp->mf_offset; + ar_cnt++; + + if (ar_cnt >= ar_max) { + if ((ret = __os_realloc(dbenv, + (ar_max * 2) * sizeof(BH_TRACK), + &bharray)) != 0) + break; + ar_max *= 2; + } } + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + + if (ret != 0) + goto err; } - if (ar_cnt >= ndirty) - break; } - /* If there no buffers we can write immediately, we're done. */ - if (ar_cnt == 0) { - ret = 0; + /* If there no buffers to write, we're done. */ + if (ar_cnt == 0) goto done; - } - R_UNLOCK(dbenv, dbmp->reginfo); - - /* Sort the buffers we're going to write. */ + /* + * Write the buffers in file/page order, trying to reduce seeks by the + * filesystem and, when pages are smaller than filesystem block sizes, + * reduce the actual number of writes. + */ if (ar_cnt > 1) - qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp); + qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp); - R_LOCK(dbenv, dbmp->reginfo); + /* + * If we're trickling buffers, only write enough to reach the correct + * percentage for this region. We may not write enough if the dirty + * buffers have an unbalanced distribution among the regions, but that + * seems unlikely. + */ + if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg) + ar_cnt = ar_max / (int)mp->nreg; + + /* + * Flush the log. We have to ensure the log records reflecting the + * changes on the database pages we're writing have already made it + * to disk. We still have to check the log each time we write a page + * (because pages we are about to write may be modified after we have + * flushed the log), but in general this will at least avoid any I/O + * on the log's part. + */ + if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0) + goto err; + + /* + * Walk the array, writing buffers. When we write a buffer, we NULL + * out its hash bucket pointer so we don't process a slot more than + * once. 
+ */ + for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) { + if (i >= ar_cnt) { + i = 0; + ++pass; + __os_sleep(dbenv, 1, 0); + } + if ((hp = bharray[i].track_hp) == NULL) + continue; + + /* Lock the hash bucket and find the buffer. */ + mutexp = &hp->hash_mutex; + MUTEX_LOCK(dbenv, mutexp); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + if (bhp->pgno == bharray[i].track_pgno && + bhp->mf_offset == bharray[i].track_off) + break; - /* Walk the array, writing buffers. */ - for (i = 0; i < ar_cnt;) { /* - * It's possible for a thread to have gotten the buffer since - * we listed it for writing. If the reference count is still - * 1, we're the only ones using the buffer, go ahead and write. - * If it's >1, then skip the buffer and assume that it will be - * written when it's returned to the cache. + * If we can't find the buffer we're done, somebody else had + * to have written it. + * + * If the buffer isn't pinned or dirty, we're done, there's + * no work needed. */ - if (bharray[i]->ref > 1) { - incomplete = 1; - --bharray[i++]->ref; + if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) { + MUTEX_UNLOCK(dbenv, mutexp); + --remaining; + bharray[i].track_hp = NULL; continue; } - /* Write the buffer. */ - ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote); + /* + * If the buffer is locked by another thread, ignore it, we'll + * come back to it. + * + * If the buffer is pinned and it's only the first or second + * time we have looked at it, ignore it, we'll come back to + * it. + * + * In either case, skip the buffer if we're not required to + * write it. + */ + if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) { + MUTEX_UNLOCK(dbenv, mutexp); + if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) { + --remaining; + bharray[i].track_hp = NULL; + } + continue; + } + + /* + * The buffer is either pinned or dirty. + * + * Set the sync wait-for count, used to count down outstanding + * references to this buffer as they are returned to the cache. + */ + bhp->ref_sync = bhp->ref; - /* Release the buffer. */ - --bharray[i++]->ref; + /* Pin the buffer into memory and lock it. */ + ++bhp->ref; + F_SET(bhp, BH_LOCKED); + MUTEX_LOCK(dbenv, &bhp->mutex); - if (ret == 0) { - if (!wrote) - incomplete = 1; - continue; + /* + * Unlock the hash bucket and wait for the wait-for count to + * go to 0. No new thread can acquire the buffer because we + * have it locked. + * + * If a thread attempts to re-pin a page, the wait-for count + * will never go to 0 (the thread spins on our buffer lock, + * while we spin on the thread's ref count). Give up if we + * don't get the buffer in 3 seconds, we can try again later. + * + * If, when the wait-for count goes to 0, the buffer is found + * to be dirty, write it. + */ + MUTEX_UNLOCK(dbenv, mutexp); + for (wait_cnt = 1; + bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt) + __os_sleep(dbenv, 1, 0); + MUTEX_LOCK(dbenv, mutexp); + hb_lock = 1; + + /* + * If the ref_sync count has gone to 0, we're going to be done + * with this buffer no matter what happens. + */ + if (bhp->ref_sync == 0) { + --remaining; + bharray[i].track_hp = NULL; } /* - * On error: + * If the ref_sync count has gone to 0 and the buffer is still + * dirty, we write it. We only try to write the buffer once. + * Any process checkpointing or trickle-flushing the pool + * must be able to write any underlying file -- if the write + * fails, error out. 
It would be very strange if file sync + * failed to write, but we don't care if it happens. + */ + if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) { + hb_lock = 0; + MUTEX_UNLOCK(dbenv, mutexp); + + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) + ++wrote; + else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE) + __db_err(dbenv, "%s: unable to flush page: %lu", + __memp_fns(dbmp, mfp), (u_long)bhp->pgno); + else + ret = 0; + } + + /* + * If ref_sync count never went to 0, the buffer was written + * by another thread, or the write failed, we still have the + * buffer locked. + * + * We may or may not currently hold the hash bucket mutex. If + * the __memp_bhwrite -> __memp_pgwrite call was successful, + * then __memp_pgwrite will have swapped the buffer lock for + * the hash lock. All other call paths will leave us without + * the hash bucket lock. * - * Release any buffers we're still pinning down. + * The order of mutexes above was to acquire the buffer lock + * while holding the hash bucket lock. Don't deadlock here, + * release the buffer lock and then acquire the hash bucket + * lock. */ - while (i < ar_cnt) - --bharray[i++]->ref; - break; - } + if (F_ISSET(bhp, BH_LOCKED)) { + F_CLR(bhp, BH_LOCKED); + MUTEX_UNLOCK(dbenv, &bhp->mutex); - /* - * If there were too many buffers and we're not returning an error, we - * re-try the flush once -- since we allocated 80% of the total - * buffer count, once should be enough. If it still doesn't work, some - * other thread of control is dirtying buffers as fast as we're writing - * them, and we might as well give up. - */ - if (retry_need) { - if (retry_done) - incomplete = 1; - else { - retry_done = 1; - goto retry; + if (!hb_lock) + MUTEX_LOCK(dbenv, mutexp); } - } -done: R_UNLOCK(dbenv, dbmp->reginfo); + /* + * Reset the ref_sync count regardless of our success, we're + * done with this buffer for now. + */ + bhp->ref_sync = 0; + + /* Discard our reference and unlock the bucket. */ + --bhp->ref; + MUTEX_UNLOCK(dbenv, mutexp); - __os_free(bharray, ndirty * sizeof(BH *)); + if (ret != 0) + break; + } + +done: /* If we've opened files to flush pages, close them. */ + if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0) + ret = t_ret; /* - * Sync the underlying file as the last thing we do, so that the OS - * has a maximal opportunity to flush buffers before we request it. - * - * !!!: - * Don't lock the region around the sync, fsync(2) has no atomicity - * issues. + * If doing a checkpoint or flushing a file for the application, we + * have to force the pages to disk. We don't do this as we go along + * because we want to give the OS as much time as possible to lazily + * flush, and because we have to flush files that might not even have + * had dirty buffers in the cache, so we have to walk the files list. */ - if (ret == 0) - ret = incomplete ? - DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh); + if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) { + if (dbmfp == NULL) + ret = __memp_sync_files(dbenv, dbmp); + else + ret = __os_fsync(dbenv, dbmfp->fhp); + } + +err: __os_free(dbenv, bharray); + if (wrotep != NULL) + *wrotep = wrote; return (ret); } /* - * __memp_sballoc -- - * Allocate room for a list of buffers. + * __memp_sync_files -- + * Sync all the files in the environment, open or not. 
*/ -static int -__memp_sballoc(dbenv, bharrayp, ndirtyp) +static +int __memp_sync_files(dbenv, dbmp) DB_ENV *dbenv; - BH ***bharrayp; - u_int32_t *ndirtyp; -{ DB_MPOOL *dbmp; - MPOOL *c_mp, *mp; - u_int32_t i, nclean, ndirty, maxpin; - int ret; +{ + DB_MPOOLFILE *dbmfp; + MPOOL *mp; + MPOOLFILE *mfp; + int ret, t_ret; - dbmp = dbenv->mp_handle; + ret = 0; mp = dbmp->reginfo[0].primary; - /* - * We don't want to hold the region lock while we write the buffers, - * so only lock it while we create a list. - * - * Walk through the list of caches, figuring out how many buffers - * we're going to need. - * - * Make a point of not holding the region lock across the library - * allocation call. - */ - for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) { - c_mp = dbmp->reginfo[i].primary; - ndirty += c_mp->stat.st_page_dirty; - nclean += c_mp->stat.st_page_clean; + R_LOCK(dbenv, dbmp->reginfo); + for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile); + mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) { + if (mfp->stat.st_page_out == 0 || + F_ISSET(mfp, MP_DEADFILE | MP_TEMP)) + continue; + + /* Look for an already open handle. */ + ret = 0; + MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) + if (dbmfp->mfp == mfp) { + ret = __os_fsync(dbenv, dbmfp->fhp); + break; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if (ret != 0) + goto err; + + /* If we don't find one, open one. */ + if (dbmfp == NULL) { + if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0) + goto err; + ret = __memp_fopen_int( + dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off), + 0, 0, mfp->stat.st_pagesize); + if (ret == 0) + ret = __os_fsync(dbenv, dbmfp->fhp); + if ((t_ret = + __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; + } } - R_UNLOCK(dbenv, dbmp->reginfo); - if (ndirty == 0) { - *ndirtyp = 0; - return (0); + + if (0) { +err: __db_err(dbenv, "%s: cannot sync: %s", + R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret)); } + R_UNLOCK(dbenv, dbmp->reginfo); - /* - * We don't want to pin down the entire buffer cache, otherwise we'll - * starve threads needing new pages. Don't pin down more than 80% of - * the cache, making sure that we don't screw up just because only a - * few pages have been created. - */ - maxpin = ((ndirty + nclean) * 8) / 10; - if (maxpin < 10) - maxpin = 10; + return (ret); +} + +/* + * __memp_close_flush_files -- + * Close files opened only to flush buffers. + */ +static int +__memp_close_flush_files(dbenv, dbmp) + DB_ENV *dbenv; + DB_MPOOL *dbmp; +{ + DB_MPOOLFILE *dbmfp; + int ret; /* - * Get a good-sized block of memory to hold buffer pointers, we don't - * want to run out, but correct if we want to allocate more than we - * would be allowed to store, regardless. + * The routine exists because we must close files opened by sync to + * flush buffers. There are two cases: first, extent files have to + * be closed so they may be removed when empty. Second, regular + * files have to be closed so we don't run out of descriptors (for + * example, and application partitioning its data into databases + * based on timestamps, so there's a continually increasing set of + * files). + * + * We mark files opened in the __memp_bhwrite() function with the + * MP_FLUSH flag. Here we walk through our file descriptor list, + * and, if a file was opened by __memp_bhwrite(), we close it. 
*/ - ndirty += ndirty / 2 + 10; - if (ndirty > maxpin) - ndirty = maxpin; - if ((ret = - __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0) - return (ret); - - *ndirtyp = ndirty; - - R_LOCK(dbenv, dbmp->reginfo); +retry: MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp); + for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q)) + if (F_ISSET(dbmfp, MP_FLUSH)) { + F_CLR(dbmfp, MP_FLUSH); + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); + if ((ret = __memp_fclose_int(dbmfp, 0)) != 0) + return (ret); + goto retry; + } + MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp); return (0); } @@ -634,15 +603,15 @@ static int __bhcmp(p1, p2) const void *p1, *p2; { - BH *bhp1, *bhp2; + BH_TRACK *bhp1, *bhp2; - bhp1 = *(BH * const *)p1; - bhp2 = *(BH * const *)p2; + bhp1 = (BH_TRACK *)p1; + bhp2 = (BH_TRACK *)p2; /* Sort by file (shared memory pool offset). */ - if (bhp1->mf_offset < bhp2->mf_offset) + if (bhp1->track_off < bhp2->track_off) return (-1); - if (bhp1->mf_offset > bhp2->mf_offset) + if (bhp1->track_off > bhp2->track_off) return (1); /* @@ -650,9 +619,9 @@ __bhcmp(p1, p2) * Defend against badly written quicksort code calling the comparison * function with two identical pointers (e.g., WATCOM C++ (Power++)). */ - if (bhp1->pgno < bhp2->pgno) + if (bhp1->track_pgno < bhp2->track_pgno) return (-1); - if (bhp1->pgno > bhp2->pgno) + if (bhp1->track_pgno > bhp2->track_pgno) return (1); return (0); } |
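
The db_salloc.c hunks also correct the comment block for __db_shalloc_size, which reports how many bytes a shared-region allocation will consume: the requested length rounded up to its alignment plus the allocator's per-chunk header (sizeof(struct __data)). The standalone sketch below shows the same arithmetic; align_up, HDR_SIZE, and the sample values are hypothetical stand-ins, not the library's ALIGN macro or header layout.

/*
 * Sketch of the size computation performed by __db_shalloc_size: round the
 * request up to the alignment boundary (assumed to be a power of two) and
 * add room for a per-chunk header.  Not the Berkeley DB implementation.
 */
#include <stdio.h>
#include <stddef.h>

#define HDR_SIZE	(2 * sizeof(size_t))	/* assumed header overhead */

/* Round len up to the next multiple of align (align: power of two). */
static size_t
align_up(size_t len, size_t align)
{
	return ((len + align - 1) & ~(align - 1));
}

static size_t
shalloc_size(size_t len, size_t align)
{
	/* Never align to less than the platform's natural word size. */
	if (align < sizeof(unsigned long))
		align = sizeof(unsigned long);
	return (align_up(len, align) + HDR_SIZE);
}

int
main(void)
{
	printf("%zu\n", shalloc_size(13, 8));	/* 16 plus the header */
	printf("%zu\n", shalloc_size(64, 32));	/* 64 plus the header */
	return (0);
}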
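
Both the old memp_sync/memp_fsync code and the new __memp_sync_int write dirty buffers in file/page order so the filesystem sees mostly sequential I/O; the rewrite sorts an array of small BH_TRACK records (hash bucket, file offset, page number) instead of pinned buffer headers, and __bhcmp now compares track_off and track_pgno. A minimal sketch of that ordering, using hypothetical type and field names rather than the mpool structures:

/*
 * Sort a snapshot of (file, page) coordinates before writing, the way
 * __memp_sync_int sorts its BH_TRACK array with __bhcmp.  Illustrative
 * only; the types below are stand-ins for roff_t and db_pgno_t.
 */
#include <stdio.h>
#include <stdlib.h>

typedef struct {
	unsigned long off;	/* which file (region offset of its MPOOLFILE) */
	unsigned long pgno;	/* which page within that file */
} track_t;

/*
 * Compare by file first, then by page number.  Returning 0 for equal keys
 * also tolerates qsort implementations that compare an element to itself.
 */
static int
track_cmp(const void *p1, const void *p2)
{
	const track_t *a = p1, *b = p2;

	if (a->off != b->off)
		return (a->off < b->off ? -1 : 1);
	if (a->pgno != b->pgno)
		return (a->pgno < b->pgno ? -1 : 1);
	return (0);
}

int
main(void)
{
	track_t ar[] = { { 200, 17 }, { 100, 42 }, { 200, 3 }, { 100, 7 } };
	size_t i, n = sizeof(ar) / sizeof(ar[0]);

	qsort(ar, n, sizeof(track_t), track_cmp);

	/* Writes now proceed file by file, in ascending page order. */
	for (i = 0; i < n; i++)
		printf("file@%lu page %lu\n", ar[i].off, ar[i].pgno);
	return (0);
}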
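
When __memp_sync_int reaches a buffer that is still referenced, it records the outstanding reference count in ref_sync, takes its own reference, locks the buffer so no new reference can be acquired, and then waits through up to three one-second sleeps for the count to drain; if the count never reaches zero it releases the buffer and revisits it on a later pass. A simplified, single-threaded sketch of that bounded wait follows; the structure and names are assumptions, not the mpool buffer header.

/*
 * Bounded wait for a pinned buffer to be released, modeled on the
 * wait_cnt/ref_sync loop in __memp_sync_int.  Illustrative only.
 */
#include <unistd.h>

struct fake_buf {
	volatile int ref_sync;	/* outstanding references to drain */
};

/* Return 1 if the buffer became writable within the wait budget. */
static int
wait_for_unpin(struct fake_buf *bhp)
{
	int wait_cnt;

	for (wait_cnt = 1; bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
		sleep(1);	/* __os_sleep(dbenv, 1, 0) in the real code */

	return (bhp->ref_sync == 0);
}

int
main(void)
{
	struct fake_buf bh = { 0 };	/* nothing outstanding: no wait */

	return (wait_for_unpin(&bh) ? 0 : 1);
}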
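
__memp_close_flush_files exists because the sync code may open file handles solely to write buffers (tagged MP_FLUSH by __memp_bhwrite); extent files must be closed so they can be removed when empty, and ordinary files must be closed so the environment does not accumulate descriptors. Since closing a handle drops the list mutex and unlinks the handle, the walk restarts from the head after every close. The toy program below shows that restart-the-scan pattern; the list, names, and the commented-out mutex calls are hypothetical, not the DB_MPOOL structures.

/*
 * Restart-the-scan list walk: whenever an element is removed (and, in the
 * real code, the protecting mutex is released), begin again from the head.
 * Illustrative only.
 */
#include <stdio.h>

struct handle {
	struct handle *next;
	int flush;		/* stand-in for the MP_FLUSH flag */
	const char *name;
};

static struct handle *list_head;

/* Pretend close: unlink the handle from the list. */
static int
handle_close(struct handle *h)
{
	struct handle **hp;

	for (hp = &list_head; *hp != NULL; hp = &(*hp)->next)
		if (*hp == h) {
			*hp = h->next;
			break;
		}
	printf("closed %s\n", h->name);
	return (0);
}

static int
close_flush_handles(void)
{
	struct handle *h;
	int ret;

retry:	/* MUTEX_THREAD_LOCK(...) in the real code */
	for (h = list_head; h != NULL; h = h->next)
		if (h->flush) {
			h->flush = 0;
			/* MUTEX_THREAD_UNLOCK(...) before closing */
			if ((ret = handle_close(h)) != 0)
				return (ret);
			goto retry;
		}
	/* MUTEX_THREAD_UNLOCK(...) */
	return (0);
}

int
main(void)
{
	static struct handle a = { NULL, 1, "a.db" };
	static struct handle b = { &a, 0, "b.db" };
	static struct handle c = { &b, 1, "c.db" };

	list_head = &c;
	return (close_flush_handles());	/* closes c.db, then a.db */
}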