summaryrefslogtreecommitdiff
path: root/db
diff options
context:
space:
mode:
authorKim Kibum <kb0929.kim@samsung.com>2012-05-21 17:40:46 +0900
committerKim Kibum <kb0929.kim@samsung.com>2012-05-21 17:40:46 +0900
commit2e082c838d2ca750f5daac6dcdabecc22dfd4e46 (patch)
tree01c1dd87d4cc0b62a655c0d768ff695d2d244728 /db
parenta86e3ca152fb414b376e64c449c201d762e414dd (diff)
downloaddb4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.gz
db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.bz2
db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.zip
Upload Tizen:Base source
Diffstat (limited to 'db')
-rw-r--r--db/crdel.src72
-rw-r--r--db/crdel_auto.c945
-rw-r--r--db/crdel_autop.c227
-rw-r--r--db/crdel_rec.c298
-rw-r--r--db/db.c1539
-rw-r--r--db/db.src328
-rw-r--r--db/db_am.c1015
-rw-r--r--db/db_auto.c3267
-rw-r--r--db/db_autop.c802
-rw-r--r--db/db_cam.c3460
-rw-r--r--db/db_cds.c177
-rw-r--r--db/db_conv.c733
-rw-r--r--db/db_dispatch.c953
-rw-r--r--db/db_dup.c203
-rw-r--r--db/db_iface.c2817
-rw-r--r--db/db_join.c940
-rw-r--r--db/db_meta.c1299
-rw-r--r--db/db_method.c1052
-rw-r--r--db/db_open.c628
-rw-r--r--db/db_overflow.c706
-rw-r--r--db/db_ovfl_vrfy.c409
-rw-r--r--db/db_pr.c1659
-rw-r--r--db/db_rec.c1859
-rw-r--r--db/db_reclaim.c246
-rw-r--r--db/db_remove.c492
-rw-r--r--db/db_rename.c372
-rw-r--r--db/db_ret.c156
-rw-r--r--db/db_setid.c213
-rw-r--r--db/db_setlsn.c137
-rw-r--r--db/db_sort_multiple.c287
-rw-r--r--db/db_stati.c494
-rw-r--r--db/db_truncate.c225
-rw-r--r--db/db_upg.c510
-rw-r--r--db/db_upg_opd.c343
-rw-r--r--db/db_vrfy.c2894
-rw-r--r--db/db_vrfy_stub.c117
-rw-r--r--db/db_vrfyutil.c916
-rw-r--r--db/partition.c2048
38 files changed, 34838 insertions, 0 deletions
diff --git a/db/crdel.src b/db/crdel.src
new file mode 100644
index 0000000..cd0b02f
--- /dev/null
+++ b/db/crdel.src
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __crdel
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * Metasub: log the creation of a subdatabase meta data page.
+ *
+ * fileid: identifies the file being acted upon.
+ * pgno: page number on which to write this meta-data page
+ * page: the actual meta-data page
+ * lsn: lsn of the page.
+ */
+BEGIN metasub 42 142
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT page DBT s
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * Inmem_create: Log the creation of an in-memory database.
+ *
+ * name: Name of the database
+ * fid: File id of the database
+ */
+BEGIN inmem_create 44 138
+ARG fileid int32_t ld
+DBT name DBT s
+DBT fid DBT s
+ARG pgsize u_int32_t lu
+END
+
+/*
+ * Inmem_rename: Log the renaming of an in-memory only database.
+ *
+ * oldname: database's starting name
+ * newname: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_rename 44 139
+DBT oldname DBT s
+DBT newname DBT s
+DBT fid DBT s
+END
+
+/*
+ * Inmem_remove: Log the removal of an in-memory only database.
+ *
+ * name: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_remove 44 140
+DBT name DBT s
+DBT fid DBT s
+END
+
diff --git a/db/crdel_auto.c b/db/crdel_auto.c
new file mode 100644
index 0000000..801a0a5
--- /dev/null
+++ b/db/crdel_auto.c
@@ -0,0 +1,945 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __crdel_metasub_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __crdel_metasub_args **));
+ */
+int
+__crdel_metasub_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __crdel_metasub_args **argpp;
+{
+ __crdel_metasub_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_metasub_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->page, 0, sizeof(argp->page));
+ LOGCOPY_32(env,&argp->page.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->page.data = bp;
+ bp += argp->page.size;
+ if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+ int t_ret;
+ if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->page.data,
+ (size_t)argp->page.size, NULL, 1)) != 0)
+ return (t_ret);
+ }
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_metasub_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, const DBT *, DB_LSN *));
+ */
+int
+__crdel_metasub_log(dbp, txnp, ret_lsnp, flags, pgno, page, lsn)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ const DBT *page;
+ DB_LSN * lsn;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_metasub;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (page == NULL ? 0 : page->size)
+ + sizeof(*lsn);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (page == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &page->size);
+ bp += sizeof(page->size);
+ memcpy(bp, page->data, page->size);
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)page->size, (DBT *)NULL, 0)) != 0)
+ return (ret);
+ bp += page->size;
+ }
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_metasub_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_read __P((ENV *, void *,
+ * PUBLIC: __crdel_inmem_create_args **));
+ */
+int
+__crdel_inmem_create_read(env, recbuf, argpp)
+ ENV *env;
+ void *recbuf;
+ __crdel_inmem_create_args **argpp;
+{
+ __crdel_inmem_create_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_inmem_create_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->name, 0, sizeof(argp->name));
+ LOGCOPY_32(env,&argp->name.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->name.data = bp;
+ bp += argp->name.size;
+
+ memset(&argp->fid, 0, sizeof(argp->fid));
+ LOGCOPY_32(env,&argp->fid.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->fid.data = bp;
+ bp += argp->fid.size;
+
+ LOGCOPY_32(env, &argp->pgsize, bp);
+ bp += sizeof(argp->pgsize);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_log __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_LSN *, u_int32_t, int32_t, const DBT *, const DBT *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__crdel_inmem_create_log(env, txnp, ret_lsnp, flags,
+ fileid, name, fid, pgsize)
+ ENV *env;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ const DBT *name;
+ const DBT *fid;
+ u_int32_t pgsize;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_inmem_create;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (name == NULL ? 0 : name->size)
+ + sizeof(u_int32_t) + (fid == NULL ? 0 : fid->size)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)fileid;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (name == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &name->size);
+ bp += sizeof(name->size);
+ memcpy(bp, name->data, name->size);
+ bp += name->size;
+ }
+
+ if (fid == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &fid->size);
+ bp += sizeof(fid->size);
+ memcpy(bp, fid->data, fid->size);
+ bp += fid->size;
+ }
+
+ LOGCOPY_32(env, bp, &pgsize);
+ bp += sizeof(pgsize);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_inmem_create_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_read __P((ENV *, void *,
+ * PUBLIC: __crdel_inmem_rename_args **));
+ */
+int
+__crdel_inmem_rename_read(env, recbuf, argpp)
+ ENV *env;
+ void *recbuf;
+ __crdel_inmem_rename_args **argpp;
+{
+ __crdel_inmem_rename_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_inmem_rename_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ memset(&argp->oldname, 0, sizeof(argp->oldname));
+ LOGCOPY_32(env,&argp->oldname.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->oldname.data = bp;
+ bp += argp->oldname.size;
+
+ memset(&argp->newname, 0, sizeof(argp->newname));
+ LOGCOPY_32(env,&argp->newname.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->newname.data = bp;
+ bp += argp->newname.size;
+
+ memset(&argp->fid, 0, sizeof(argp->fid));
+ LOGCOPY_32(env,&argp->fid.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->fid.data = bp;
+ bp += argp->fid.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_log __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_LSN *, u_int32_t, const DBT *, const DBT *, const DBT *));
+ */
+int
+__crdel_inmem_rename_log(env, txnp, ret_lsnp, flags,
+ oldname, newname, fid)
+ ENV *env;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ const DBT *oldname;
+ const DBT *newname;
+ const DBT *fid;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ u_int32_t zero, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_inmem_rename;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t) + (oldname == NULL ? 0 : oldname->size)
+ + sizeof(u_int32_t) + (newname == NULL ? 0 : newname->size)
+ + sizeof(u_int32_t) + (fid == NULL ? 0 : fid->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ if (oldname == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &oldname->size);
+ bp += sizeof(oldname->size);
+ memcpy(bp, oldname->data, oldname->size);
+ bp += oldname->size;
+ }
+
+ if (newname == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &newname->size);
+ bp += sizeof(newname->size);
+ memcpy(bp, newname->data, newname->size);
+ bp += newname->size;
+ }
+
+ if (fid == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &fid->size);
+ bp += sizeof(fid->size);
+ memcpy(bp, fid->data, fid->size);
+ bp += fid->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_inmem_rename_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_read __P((ENV *, void *,
+ * PUBLIC: __crdel_inmem_remove_args **));
+ */
+int
+__crdel_inmem_remove_read(env, recbuf, argpp)
+ ENV *env;
+ void *recbuf;
+ __crdel_inmem_remove_args **argpp;
+{
+ __crdel_inmem_remove_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_inmem_remove_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ memset(&argp->name, 0, sizeof(argp->name));
+ LOGCOPY_32(env,&argp->name.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->name.data = bp;
+ bp += argp->name.size;
+
+ memset(&argp->fid, 0, sizeof(argp->fid));
+ LOGCOPY_32(env,&argp->fid.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->fid.data = bp;
+ bp += argp->fid.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_log __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_LSN *, u_int32_t, const DBT *, const DBT *));
+ */
+int
+__crdel_inmem_remove_log(env, txnp, ret_lsnp, flags,
+ name, fid)
+ ENV *env;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ const DBT *name;
+ const DBT *fid;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ u_int32_t zero, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_inmem_remove;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t) + (name == NULL ? 0 : name->size)
+ + sizeof(u_int32_t) + (fid == NULL ? 0 : fid->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ if (name == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &name->size);
+ bp += sizeof(name->size);
+ memcpy(bp, name->data, name->size);
+ bp += name->size;
+ }
+
+ if (fid == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &fid->size);
+ bp += sizeof(fid->size);
+ memcpy(bp, fid->data, fid->size);
+ bp += fid->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_inmem_remove_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_recover, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_recover, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_recover, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_recover, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/crdel_autop.c b/db/crdel_autop.c
new file mode 100644
index 0000000..6bf4bb6
--- /dev/null
+++ b/db/crdel_autop.c
@@ -0,0 +1,227 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__crdel_metasub_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_metasub_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __crdel_metasub_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_metasub%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tpage: ");
+ for (i = 0; i < argp->page.size; i++) {
+ ch = ((u_int8_t *)argp->page.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_inmem_create_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_inmem_create%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tname: ");
+ for (i = 0; i < argp->name.size; i++) {
+ ch = ((u_int8_t *)argp->name.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfid: ");
+ for (i = 0; i < argp->fid.size; i++) {
+ ch = ((u_int8_t *)argp->fid.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpgsize: %lu\n", (u_long)argp->pgsize);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_inmem_rename_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_inmem_rename%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\toldname: ");
+ for (i = 0; i < argp->oldname.size; i++) {
+ ch = ((u_int8_t *)argp->oldname.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnewname: ");
+ for (i = 0; i < argp->newname.size; i++) {
+ ch = ((u_int8_t *)argp->newname.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfid: ");
+ for (i = 0; i < argp->fid.size; i++) {
+ ch = ((u_int8_t *)argp->fid.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_inmem_remove_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_inmem_remove%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tname: ");
+ for (i = 0; i < argp->name.size; i++) {
+ ch = ((u_int8_t *)argp->name.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfid: ");
+ for (i = 0; i < argp->fid.size; i++) {
+ ch = ((u_int8_t *)argp->fid.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_print, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_print, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_print, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_print, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/crdel_rec.c b/db/crdel_rec.c
new file mode 100644
index 0000000..285b965
--- /dev/null
+++ b/db/crdel_rec.c
@@ -0,0 +1,298 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __crdel_metasub_recover --
+ *	Recovery function for metasub.
+ *
+ *	Redo re-applies the full page image of a subdatabase meta-data
+ *	page saved in the log record; undo only restores the page's LSN,
+ *	since __db_pg_alloc's own recovery frees the page itself.
+ *
+ * PUBLIC: int __crdel_metasub_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_metasub_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_metasub_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_p, ret, t_ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__crdel_metasub_print);
+	/* REC_INTRO decodes argp and opens file_dbp, mpf and dbc. */
+	REC_INTRO(__crdel_metasub_read, ip, 0);
+
+	/*
+	 * If we are undoing this operation, but the DB that we got back
+	 * was never really opened, then this open was an in-memory open
+	 * that did not finish.  We can let the file creation take care
+	 * of any necessary undo/cleanup.
+	 */
+	if (DB_UNDO(op) && !F_ISSET(file_dbp, DB_AM_OPEN_CALLED))
+		goto done;
+
+	if ((ret = __memp_fget(mpf, &argp->pgno,
+	    ip, NULL, 0, &pagep)) != 0) {
+		/* If this is an in-memory file, this might be OK. */
+		if (F_ISSET(file_dbp, DB_AM_INMEM) &&
+		    (ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+		    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) == 0) {
+			LSN_NOT_LOGGED(LSN(pagep));
+		} else {
+			/*
+			 * The page is gone: nothing to apply, just move
+			 * recovery past this record and report success.
+			 */
+			*lsnp = argp->prev_lsn;
+			ret = 0;
+			goto out;
+		}
+	}
+
+	/* Compare the on-page LSN against the one saved in the record. */
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Re-apply the logged full page image. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->page.data, argp->page.size);
+		LSN(pagep) = *lsnp;
+
+		/*
+		 * If this was an in-memory database and we are re-creating
+		 * and this is the meta-data page, then we need to set up a
+		 * bunch of fields in the dbo as well.
+		 */
+		if (F_ISSET(file_dbp, DB_AM_INMEM) &&
+		    argp->pgno == PGNO_BASE_MD &&
+		    (ret = __db_meta_setup(file_dbp->env, file_dbp,
+		    file_dbp->dname, (DBMETA *)pagep, 0, DB_CHK_META)) != 0)
+			goto out;
+	} else if (DB_UNDO(op)) {
+		/*
+		 * We want to undo this page creation.  The page creation
+		 * happened in two parts.  First, we called __db_pg_alloc which
+		 * was logged separately.  Then we wrote the meta-data onto
+		 * the page.  So long as we restore the LSN, then the recovery
+		 * for __db_pg_alloc will do everything else.
+		 *
+		 * Don't bother checking the lsn on the page.  If we are
+		 * rolling back the next thing is that this page will get
+		 * freed.  Opening the subdb will have reinitialized the
+		 * page, but not the lsn.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		LSN(pagep) = argp->lsn;
+	}
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL && (t_ret = __memp_fput(mpf,
+	    ip, pagep, file_dbp->priority)) != 0 &&
+	    ret == 0)
+		ret = t_ret;
+
+	/* REC_CLOSE releases dbc and returns ret. */
+	REC_CLOSE;
+}
+
+/*
+ * __crdel_inmem_create_recover --
+ *	Recovery function for inmem_create.
+ *
+ *	Redo re-creates the named in-memory database in the memory pool
+ *	(constructing a fresh handle for a temp file that has no dbreg
+ *	id); undo removes the file from the pool.
+ *
+ * PUBLIC: int __crdel_inmem_create_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_inmem_create_args *argp;
+	DB *dbp;
+	int do_close, ret, t_ret;
+
+	COMPQUIET(info, NULL);
+
+	dbp = NULL;
+	do_close = 0;
+	REC_PRINT(__crdel_inmem_create_print);
+	REC_NOOP_INTRO(__crdel_inmem_create_read);
+
+	/* First, see if the DB handle already exists. */
+	if (argp->fileid == DB_LOGFILEID_INVALID) {
+		if (DB_REDO(op))
+			ret = ENOENT;
+		else
+			ret = 0;
+	} else
+		ret = __dbreg_id_to_db(env, argp->txnp, &dbp, argp->fileid, 0);
+
+	if (DB_REDO(op)) {
+		/*
+		 * If the dbreg failed, that means that we're creating a
+		 * tmp file.
+		 */
+		if (ret != 0) {
+			if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+				goto out;
+
+			F_SET(dbp, DB_AM_RECOVER | DB_AM_INMEM);
+			memcpy(dbp->fileid, argp->fid.data, DB_FILE_ID_LEN);
+			if (((ret = __os_strdup(env,
+			    argp->name.data, &dbp->dname)) != 0))
+				goto out;
+
+			/*
+			 * This DBP is never going to be entered into the
+			 * dbentry table, so if we leave it open here,
+			 * then we're going to lose it.
+			 */
+			do_close = 1;
+		}
+
+		/* Now, set the fileid. */
+		memcpy(dbp->fileid, argp->fid.data, argp->fid.size);
+		if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+			goto out;
+		dbp->preserve_fid = 1;
+		MAKE_INMEM(dbp);
+		if ((ret = __env_setup(dbp,
+		    NULL, NULL, argp->name.data, TXN_INVALID, 0)) != 0)
+			goto out;
+		ret = __env_mpool(dbp, argp->name.data, 0);
+
+		/* ENOENT means the file must be (re)created in the pool. */
+		if (ret == ENOENT) {
+			dbp->pgsize = argp->pgsize;
+			if ((ret = __env_mpool(dbp,
+			    argp->name.data, DB_CREATE)) != 0)
+				goto out;
+		} else if (ret != 0)
+			goto out;
+	}
+
+	if (DB_UNDO(op)) {
+		/* Remove the in-memory file from the pool, if present. */
+		if (ret == 0)
+			ret = __memp_nameop(env, argp->fid.data, NULL,
+			    (const char *)argp->name.data, NULL, 1);
+
+		/*
+		 * A missing or already-deleted file is fine on undo.
+		 * NOTE(review): when the nameop succeeds (returns 0) the
+		 * else branch jumps to out without setting *lsnp --
+		 * presumably __memp_nameop reports ENOENT/DB_DELETED in
+		 * the common undo path; confirm this is intended.
+		 */
+		if (ret == ENOENT || ret == DB_DELETED)
+			ret = 0;
+		else
+			goto out;
+	}
+
+	*lsnp = argp->prev_lsn;
+
+out:	if (dbp != NULL) {
+		t_ret = 0;
+
+		/* Close the handle if it is temporary or on any error. */
+		if (do_close || ret != 0)
+			t_ret = __db_close(dbp, NULL, DB_NOSYNC);
+		if (t_ret != 0 && ret == 0)
+			ret = t_ret;
+	}
+	/* REC_NOOP_CLOSE frees argp and returns ret. */
+	REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_rename_recover --
+ *	Recovery function for inmem_rename: redo re-applies the rename,
+ *	undo renames the file back to its original name.
+ *
+ * PUBLIC: int __crdel_inmem_rename_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_inmem_rename_args *argp;
+	const char *oldname, *newname;
+	u_int8_t *fileid;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__crdel_inmem_rename_print);
+	REC_NOOP_INTRO(__crdel_inmem_rename_read);
+	fileid = argp->fid.data;
+	oldname = argp->oldname.data;
+	newname = argp->newname.data;
+
+	/*
+	 * Ignore __memp_nameop's return: the files may or may not still
+	 * exist, and either state is acceptable during recovery.
+	 */
+	if (DB_REDO(op))
+		(void)__memp_nameop(env,
+		    fileid, newname, oldname, newname, 1);
+
+	if (DB_UNDO(op))
+		(void)__memp_nameop(env,
+		    fileid, oldname, newname, oldname, 1);
+
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_remove_recover --
+ *	Recovery function for inmem_remove.
+ *
+ * PUBLIC: int __crdel_inmem_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_inmem_remove_args *argp;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__crdel_inmem_remove_print);
+	REC_NOOP_INTRO(__crdel_inmem_remove_read);
+
+	/*
+	 * Removes are delayed, so there is no undo for a remove, only
+	 * redo.  The remove itself may fail, which is OK; ignore it.
+	 */
+	if (DB_REDO(op))
+		(void)__memp_nameop(env,
+		    argp->fid.data, NULL, argp->name.data, NULL, 1);
+
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	REC_NOOP_CLOSE;
+}
diff --git a/db/db.c b/db/db.c
new file mode 100644
index 0000000..9caa1aa
--- /dev/null
+++ b/db/db.c
@@ -0,0 +1,1539 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_disassociate __P((DB *));
+static int __db_disassociate_foreign __P ((DB *));
+
+#ifdef CONFIG_TEST
+static int __db_makecopy __P((ENV *, const char *, const char *));
+static int __qam_testdocopy __P((DB *, const char *));
+#endif
+
+/*
+ * DB.C --
+ * This file contains the utility functions for the DBP layer.
+ */
+
+/*
+ * __db_master_open --
+ *	Open up a handle on a master database.
+ *
+ * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **));
+ */
+int
+__db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
+	DB *subdbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name;
+	u_int32_t flags;
+	int mode;
+	DB **dbpp;
+{
+	DB *dbp;
+	int ret;
+
+	*dbpp = NULL;
+
+	/* Create the handle for the master (outer) database. */
+	if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
+		return (ret);
+
+	/*
+	 * The master database is always a btree, runs in the caller's
+	 * transaction and inherits the caller's page size (in case we
+	 * create the file).  Flag it as containing subdatabases and
+	 * propagate the handle flags that must agree between the two.
+	 */
+	dbp->pgsize = subdbp->pgsize;
+	F_SET(dbp, DB_AM_SUBDB);
+	F_SET(dbp, F_ISSET(subdbp,
+	    DB_AM_RECOVER | DB_AM_SWAP |
+	    DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
+
+	/*
+	 * DB_EXCL applies to the subdatabase only, not the physical
+	 * file: we only got here because a subdatabase was specified.
+	 */
+	LF_CLR(DB_EXCL);
+	LF_SET(DB_RDWRMASTER);
+	if ((ret = __db_open(dbp, ip,
+	    txn, name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0) {
+		/* Destroy the handle unless the open marked it discarded. */
+		if (!F_ISSET(dbp, DB_AM_DISCARD))
+			(void)__db_close(dbp, txn, 0);
+		return (ret);
+	}
+
+	/*
+	 * Checksum and encryption settings were validated when the
+	 * meta-page was read; the only adjustment needed here is to
+	 * turn checksumming on in the subdatabase handle if the
+	 * meta-page enabled it.
+	 */
+	if (F_ISSET(dbp, DB_AM_CHKSUM))
+		F_SET(subdbp, DB_AM_CHKSUM);
+
+	/* An existing file's page size overrides the user's request. */
+	subdbp->pgsize = dbp->pgsize;
+	*dbpp = dbp;
+
+	return (0);
+}
+
+/*
+ * __db_master_update --
+ *	Add/Open/Remove a subdatabase from a master database.
+ *
+ *	action selects the operation: MU_REMOVE deletes the entry and
+ *	frees the subdatabase's meta page (and btree root, if any);
+ *	MU_RENAME moves the entry to newname; MU_OPEN looks up -- or,
+ *	with DB_CREATE, allocates -- the subdatabase's meta page,
+ *	returned in sdbp->meta_pgno.
+ *
+ * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, DBTYPE, mu_action, const char *, u_int32_t));
+ */
+/*
+ * NOTE(review): the old-style declarations below list txn before ip;
+ * the parameter order is fixed by the identifier list, so this is
+ * legal, merely inconsistent with the prototype above.
+ */
+int
+__db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags)
+	DB *mdbp, *sdbp;
+	DB_TXN *txn;
+	DB_THREAD_INFO *ip;
+	const char *subdb;
+	DBTYPE type;
+	mu_action action;
+	const char *newname;
+	u_int32_t flags;
+{
+	DBC *dbc, *ndbc;
+	DBT key, data, ndata;
+	ENV *env;
+	PAGE *p, *r;
+	db_pgno_t t_pgno;
+	int modify, ret, t_ret;
+
+	env = mdbp->env;
+	dbc = ndbc = NULL;
+	p = NULL;
+
+	/*
+	 * Open up a cursor.  If this is CDB and we're creating the database,
+	 * make it an update cursor.
+	 *
+	 * Might we modify the master database?  If so, we'll need to lock.
+	 */
+	modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0;
+
+	if ((ret = __db_cursor(mdbp, ip, txn, &dbc,
+	    (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0)
+		return (ret);
+
+	/*
+	 * Point the cursor at the record.
+	 *
+	 * If we're removing or potentially creating an entry, lock the page
+	 * with DB_RMW.
+	 *
+	 * We do multiple cursor operations with the cursor in some cases and
+	 * subsequently access the data DBT information.  Set DB_DBT_MALLOC so
+	 * we don't risk modification of the data between our uses of it.
+	 *
+	 * !!!
+	 * We don't include the name's nul termination in the database.
+	 */
+	DB_INIT_DBT(key, subdb, strlen(subdb));
+	memset(&data, 0, sizeof(data));
+	F_SET(&data, DB_DBT_MALLOC);
+
+	ret = __dbc_get(dbc, &key, &data,
+	    DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
+
+	/*
+	 * What we do next--whether or not we found a record for the
+	 * specified subdatabase--depends on what the specified action is.
+	 * Handle ret appropriately as the first statement of each case.
+	 */
+	switch (action) {
+	case MU_REMOVE:
+		/*
+		 * We should have found something if we're removing it.  Note
+		 * that in the common case where the DB we're asking to remove
+		 * doesn't exist, we won't get this far; __db_subdb_remove
+		 * will already have returned an error from __db_open.
+		 */
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * Delete the subdatabase entry first; if this fails,
+		 * we don't want to touch the actual subdb pages.
+		 */
+		if ((ret = __dbc_del(dbc, 0)) != 0)
+			goto err;
+
+		/*
+		 * We're handling actual data, not on-page meta-data,
+		 * so it hasn't been converted to/from opposite
+		 * endian architectures.  Do it explicitly, now.
+		 */
+		memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+		DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+		if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno,
+		    ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0)
+			goto err;
+
+		/* Free the root on the master db if it was created. */
+		if (TYPE(p) == P_BTREEMETA &&
+		    ((BTMETA *)p)->root != PGNO_INVALID) {
+			if ((ret = __memp_fget(mdbp->mpf,
+			    &((BTMETA *)p)->root, ip, dbc->txn,
+			    DB_MPOOL_DIRTY, &r)) != 0)
+				goto err;
+
+			/* Free and put the page. */
+			if ((ret = __db_free(dbc, r)) != 0) {
+				/* __db_free discarded r even on failure. */
+				r = NULL;
+				goto err;
+			}
+		}
+		/* Free and put the page. */
+		if ((ret = __db_free(dbc, p)) != 0) {
+			p = NULL;
+			goto err;
+		}
+		p = NULL;
+		break;
+	case MU_RENAME:
+		/* We should have found something if we're renaming it. */
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * Before we rename, we need to make sure we're not
+		 * overwriting another subdatabase, or else this operation
+		 * won't be undoable.  Open a second cursor and check
+		 * for the existence of newname; it shouldn't appear under
+		 * us since we hold the metadata lock.
+		 */
+		if ((ret = __db_cursor(mdbp, ip, txn, &ndbc,
+		    CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+			goto err;
+		DB_SET_DBT(key, newname, strlen(newname));
+
+		/*
+		 * We don't actually care what the meta page of the potentially-
+		 * overwritten DB is; we just care about existence.
+		 */
+		memset(&ndata, 0, sizeof(ndata));
+		F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+		if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) {
+			/* A subdb called newname exists.  Bail. */
+			ret = EEXIST;
+			__db_errx(env, "rename: database %s exists", newname);
+			goto err;
+		} else if (ret != DB_NOTFOUND)
+			goto err;
+
+		/*
+		 * Now do the put first; we don't want to lose our only
+		 * reference to the subdb.  Use the second cursor so the
+		 * first one continues to point to the old record.
+		 */
+		if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0)
+			goto err;
+		if ((ret = __dbc_del(dbc, 0)) != 0) {
+			/*
+			 * If the delete fails, try to delete the record
+			 * we just put, in case we're not txn-protected.
+			 */
+			(void)__dbc_del(ndbc, 0);
+			goto err;
+		}
+
+		break;
+	case MU_OPEN:
+		/*
+		 * Get the subdatabase information.  If it already exists,
+		 * copy out the page number and we're done.
+		 */
+		switch (ret) {
+		case 0:
+			if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+				ret = EEXIST;
+				goto err;
+			}
+			memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+			DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+			goto done;
+		case DB_NOTFOUND:
+			if (LF_ISSET(DB_CREATE))
+				break;
+			/*
+			 * No db_err, it is reasonable to remove a
+			 * nonexistent db.
+			 */
+			ret = ENOENT;
+			goto err;
+		default:
+			goto err;
+		}
+
+		/* Create a subdatabase. */
+		if ((ret = __db_new(dbc,
+		    type == DB_HASH ? P_HASHMETA : P_BTREEMETA, NULL, &p)) != 0)
+			goto err;
+		sdbp->meta_pgno = PGNO(p);
+
+		/*
+		 * XXX
+		 * We're handling actual data, not on-page meta-data, so it
+		 * hasn't been converted to/from opposite endian architectures.
+		 * Do it explicitly, now.
+		 */
+		t_pgno = PGNO(p);
+		DB_HTONL_SWAP(env, &t_pgno);
+		memset(&ndata, 0, sizeof(ndata));
+		ndata.data = &t_pgno;
+		ndata.size = sizeof(db_pgno_t);
+		if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
+			goto err;
+		F_SET(sdbp, DB_AM_CREATED);
+		break;
+	}
+
+err:
+done:	/*
+	 * If we allocated a page: if we're successful, mark the page dirty
+	 * and return it to the cache, otherwise, discard/free it.
+	 */
+	if (p != NULL && (t_ret = __memp_fput(mdbp->mpf,
+	    dbc->thread_info, p, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the cursor(s) and data. */
+	if (data.data != NULL)
+		__os_ufree(env, data.data);
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __env_setup --
+ *	Set up the underlying environment during a db_open: create a
+ *	private environment on demand, join the cache, allocate the
+ *	handle mutex, register with the log region, and insert the
+ *	handle into the environment's DB list.
+ *
+ * PUBLIC: int __env_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
+ */
+int
+__env_setup(dbp, txn, fname, dname, id, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	u_int32_t id, flags;
+{
+	DB *ldbp;
+	DB_ENV *dbenv;
+	ENV *env;
+	u_int32_t maxid;
+	int ret;
+
+	env = dbp->env;
+	dbenv = env->dbenv;
+
+	/* If we don't yet have an environment, it's time to create it. */
+	if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+		/* Make sure we have at least DB_MINCACHE pages in our cache. */
+		if (dbenv->mp_gbytes == 0 &&
+		    dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE &&
+		    (ret = __memp_set_cachesize(
+		    dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0)
+			return (ret);
+
+		/* Create a private, mpool-only environment on the fly. */
+		if ((ret = __env_open(dbenv, NULL, DB_CREATE |
+		    DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0)
+			return (ret);
+	}
+
+	/* Join the underlying cache. */
+	if ((!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) &&
+	    (ret = __env_mpool(dbp, fname, flags)) != 0)
+		return (ret);
+
+	/* We may need a per-thread mutex. */
+	if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc(
+	    env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0)
+		return (ret);
+
+	/*
+	 * Set up a bookkeeping entry for this database in the log region,
+	 * if such a region exists.  Note that even if we're in recovery
+	 * or a replication client, where we won't log registries, we'll
+	 * still need an FNAME struct, so LOGGING_ON is the correct macro.
+	 *
+	 * The #if clauses below skip the registration for read-only or
+	 * non-transactional handles only in production builds; debug
+	 * builds register everything.
+	 */
+	if (LOGGING_ON(env) && dbp->log_filename == NULL
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+	    && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER))
+#endif
+#if !defined(DEBUG_ROP)
+	    && !F_ISSET(dbp, DB_AM_RDONLY)
+#endif
+	    ) {
+		if ((ret = __dbreg_setup(dbp,
+		    F_ISSET(dbp, DB_AM_INMEM) ? dname : fname,
+		    F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0)
+			return (ret);
+
+		/*
+		 * If we're actively logging and our caller isn't a
+		 * recovery function that already did so, then assign
+		 * this dbp a log fileid.
+		 */
+		if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+		    (ret = __dbreg_new_id(dbp, txn)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * Insert ourselves into the ENV's dblist.  We allocate a
+	 * unique ID to each {fileid, meta page number} pair, and to
+	 * each temporary file (since they all have a zero fileid).
+	 * This ID gives us something to use to tell which DB handles
+	 * go with which databases in all the cursor adjustment
+	 * routines, where we don't want to do a lot of ugly and
+	 * expensive memcmps.
+	 */
+	MUTEX_LOCK(env, env->mtx_dblist);
+	maxid = 0;
+	TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) {
+		/*
+		 * There are three cases: on-disk database (first clause),
+		 * named in-memory database (second clause), temporary database
+		 * (never matches; no clause).
+		 */
+		if (!F_ISSET(dbp, DB_AM_INMEM)) {
+			if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN)
+			    == 0 && ldbp->meta_pgno == dbp->meta_pgno)
+				break;
+		} else if (dname != NULL) {
+			if (F_ISSET(ldbp, DB_AM_INMEM) &&
+			    ldbp->dname != NULL &&
+			    strcmp(ldbp->dname, dname) == 0)
+				break;
+		}
+		if (ldbp->adj_fileid > maxid)
+			maxid = ldbp->adj_fileid;
+	}
+
+	/*
+	 * If ldbp is NULL, we didn't find a match.  Assign the dbp an
+	 * adj_fileid one higher than the largest we found, and
+	 * insert it at the head of the master dbp list.
+	 *
+	 * If ldbp is not NULL, it is a match for our dbp.  Give dbp
+	 * the same ID that ldbp has, and add it after ldbp so they're
+	 * together in the list.
+	 */
+	if (ldbp == NULL) {
+		dbp->adj_fileid = maxid + 1;
+		TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks);
+	} else {
+		dbp->adj_fileid = ldbp->adj_fileid;
+		TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks);
+	}
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+
+	return (0);
+}
+
+/*
+ * __env_mpool --
+ *	Set up the underlying environment cache during a db_open:
+ *	configure the DB_MPOOLFILE (ftype, clear length, LSN offset,
+ *	page cookie) from the access-method type and open it.
+ *
+ * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t));
+ */
+int
+__env_mpool(dbp, fname, flags)
+	DB *dbp;
+	const char *fname;
+	u_int32_t flags;
+{
+	DBT pgcookie;
+	DB_MPOOLFILE *mpf;
+	DB_PGINFO pginfo;
+	ENV *env;
+	int fidset, ftype, ret;
+	int32_t lsn_off;
+	u_int8_t nullfid[DB_FILE_ID_LEN];
+	u_int32_t clear_len;
+
+	env = dbp->env;
+
+	/* The LSN is the first entry on a DB page, byte offset 0. */
+	lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0;
+
+	/* It's possible that this database is already open. */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		return (0);
+
+	/*
+	 * If we need to pre- or post-process a file's pages on I/O, set the
+	 * file type.  If it's a hash file, always call the pgin and pgout
+	 * routines.  This means that hash files can never be mapped into
+	 * process memory.  If it's a btree file and requires swapping, we
+	 * need to page the file in and out.  This has to be right -- we can't
+	 * mmap files that are being paged in and out.
+	 */
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
+		    ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
+		clear_len = CRYPTO_ON(env) ?
+		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+		    DB_PAGE_DB_LEN;
+		break;
+	case DB_HASH:
+		ftype = DB_FTYPE_SET;
+		clear_len = CRYPTO_ON(env) ?
+		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+		    DB_PAGE_DB_LEN;
+		break;
+	case DB_QUEUE:
+		ftype = F_ISSET(dbp,
+		    DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ?
+		    DB_FTYPE_SET : DB_FTYPE_NOTSET;
+
+		/*
+		 * If we came in here without a pagesize set, then we need
+		 * to mark the in-memory handle as having clear_len not
+		 * set, because we don't really know the clear length or
+		 * the page size yet (since the file doesn't yet exist).
+		 */
+		clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET;
+		break;
+	case DB_UNKNOWN:
+		/*
+		 * If we're running in the verifier, our database might
+		 * be corrupt and we might not know its type--but we may
+		 * still want to be able to verify and salvage.
+		 *
+		 * If we can't identify the type, it's not going to be safe
+		 * to call __db_pgin--we pretty much have to give up all
+		 * hope of salvaging cross-endianness.  Proceed anyway;
+		 * at worst, the database will just appear more corrupt
+		 * than it actually is, but at best, we may be able
+		 * to salvage some data even with no metadata page.
+		 */
+		if (F_ISSET(dbp, DB_AM_VERIFYING)) {
+			ftype = DB_FTYPE_NOTSET;
+			clear_len = DB_PAGE_DB_LEN;
+			break;
+		}
+
+		/*
+		 * This might be an in-memory file and we won't know its
+		 * file type until after we open it and read the meta-data
+		 * page.
+		 */
+		if (F_ISSET(dbp, DB_AM_INMEM)) {
+			clear_len = DB_CLEARLEN_NOTSET;
+			ftype = DB_FTYPE_NOTSET;
+			lsn_off = DB_LSN_OFF_NOTSET;
+			break;
+		}
+		/* FALLTHROUGH */
+	default:
+		return (__db_unknown_type(env, "DB->open", dbp->type));
+	}
+
+	mpf = dbp->mpf;
+
+	/* Push a pre-set file id (if any) down into the mpool file. */
+	memset(nullfid, 0, DB_FILE_ID_LEN);
+	fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN);
+	if (fidset)
+		(void)__memp_set_fileid(mpf, dbp->fileid);
+
+	(void)__memp_set_clear_len(mpf, clear_len);
+	(void)__memp_set_ftype(mpf, ftype);
+	(void)__memp_set_lsn_offset(mpf, lsn_off);
+
+	/* The page cookie tells the pgin/pgout routines how to convert. */
+	pginfo.db_pagesize = dbp->pgsize;
+	pginfo.flags =
+	    F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+	pginfo.type = dbp->type;
+	pgcookie.data = &pginfo;
+	pgcookie.size = sizeof(DB_PGINFO);
+	(void)__memp_set_pgcookie(mpf, &pgcookie);
+
+	/*
+	 * Normally the DB_ENV_MULTIVERSION flag gates MVCC; when
+	 * DIAG_MVCC is defined the gate is compiled out, so every
+	 * transactional non-queue database is opened multi-version.
+	 */
+#ifndef DIAG_MVCC
+	if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+#endif
+		if (F_ISSET(dbp, DB_AM_TXN) &&
+		    dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
+			LF_SET(DB_MULTIVERSION);
+
+	if ((ret = __memp_fopen(mpf, NULL, fname, &dbp->dirname,
+	    LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION |
+	    DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) |
+	    (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) |
+	    (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0),
+	    0, dbp->pgsize)) != 0) {
+		/*
+		 * The open didn't work; we need to reset the mpf,
+		 * retaining the in-memory semantics (if any).
+		 */
+		(void)__memp_fclose(dbp->mpf, 0);
+		(void)__memp_fcreate(env, &dbp->mpf);
+		if (F_ISSET(dbp, DB_AM_INMEM))
+			MAKE_INMEM(dbp);
+		return (ret);
+	}
+
+	/*
+	 * Set the open flag.  We use it to mean that the dbp has gone
+	 * through mpf setup, including dbreg_register.  Also, below,
+	 * the underlying access method open functions may want to do
+	 * things like acquire cursors, so the open flag has to be set
+	 * before calling them.
+	 */
+	F_SET(dbp, DB_AM_OPEN_CALLED);
+	if (!fidset && fname != NULL) {
+		(void)__memp_get_fileid(dbp->mpf, dbp->fileid);
+		dbp->preserve_fid = 1;
+	}
+
+	return (0);
+}
+
+/*
+ * __db_close --
+ *	DB->close method: release the handle's resources, drop the
+ *	environment reference, and destroy the handle itself.
+ *
+ * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__db_close(dbp, txn, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	u_int32_t flags;
+{
+	ENV *env;
+	int db_ref, deferred, ret, t_ret;
+
+	env = dbp->env;
+	deferred = ret = 0;
+
+	/*
+	 * As a handle destructor we cannot fail on validation, so check
+	 * for consistent transaction usage but discard any error.  Only
+	 * internal callers supply transactions; a message here would
+	 * indicate a serious internal problem.
+	 */
+	if (txn != NULL)
+		(void)__db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0);
+
+	/* Release everything the handle holds open. */
+	ret = __db_refresh(dbp, txn, flags, &deferred, 0);
+
+	/*
+	 * If logging the close failed, the close was deferred: report
+	 * the failure right away and keep the handle alive.
+	 */
+	if (deferred)
+		return (ret);
+
+	/* !!!
+	 * There is an apparent race between reading/decrementing
+	 * env->db_ref and testing it for zero.  However, with a DBLOCAL
+	 * environment the user holds no reference to the env handle;
+	 * multiple dbps can only share a local env via internal opens
+	 * (e.g. subdatabases).  Anyone closing the original dbp while
+	 * that is in flight is already badly broken, so we don't
+	 * engineer around the possibility.
+	 */
+	MUTEX_LOCK(env, env->mtx_dblist);
+	db_ref = --env->db_ref;
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+	if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 &&
+	    (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Scribble on the handle to catch stale uses, then free it. */
+	memset(dbp, CLEAR_BYTE, sizeof(*dbp));
+	__os_free(env, dbp);
+
+	return (ret);
+}
+
+/*
+ * __db_refresh --
+ * Refresh the DB structure, releasing any allocated resources.
+ * This does most of the work of closing files now because refresh
+ * is what is used during abort processing (since we can't destroy
+ * the actual handle) and during abort processing, we may have a
+ * fully opened handle.
+ *
+ * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
+ */
+int
+__db_refresh(dbp, txn, flags, deferred_closep, reuse)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+ int *deferred_closep, reuse;
+{
+ DB *sdbp;
+ DBC *dbc;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ u_int32_t save_flags;
+ int resync, ret, t_ret;
+
+ ret = 0;
+
+ env = dbp->env;
+ infop = env->reginfo;
+ if (infop != NULL)
+ renv = infop->primary;
+ else
+ renv = NULL;
+
+ /*
+ * If this dbp is not completely open, avoid trapping by trying to
+ * sync without an mpool file.
+ */
+ if (dbp->mpf == NULL)
+ LF_SET(DB_NOSYNC);
+
+ /* If never opened, or not currently open, it's easy. */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ goto never_opened;
+
+ /*
+ * If we have any secondary indices, disassociate them from us.
+ * We don't bother with the mutex here; it only protects some
+ * of the ops that will make us core-dump mid-close anyway, and
+ * if you're trying to do something with a secondary *while* you're
+ * closing the primary, you deserve what you get. The disassociation
+ * is mostly done just so we can close primaries and secondaries in
+ * any order--but within one thread of control.
+ */
+ LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) {
+ LIST_REMOVE(sdbp, s_links);
+ if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /*
+ * Disassociate ourself from any databases using us as a foreign key
+ * database by clearing the referring db's pointer. Reclaim memory.
+ */
+ f_info = LIST_FIRST(&dbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ LIST_REMOVE(f_info, f_links);
+ f_info->dbp->s_foreign = NULL;
+ __os_free(env, f_info);
+ f_info = tmp;
+ }
+
+ if (dbp->s_foreign != NULL &&
+ (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Sync the underlying access method. Do before closing the cursors
+ * because DB->sync allocates cursors in order to write Recno backing
+ * source text files.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (!LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __db_sync(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Go through the active cursors and call the cursor recycle routine,
+ * which resolves pending operations and moves the cursors onto the
+ * free list. Then, walk the free list and call the cursor destroy
+ * routine. Note that any failure on a close is considered "really
+ * bad" and we just break out of the loop and force forward.
+ */
+ resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1;
+ while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Close any outstanding join cursors. Join cursors destroy themselves
+ * on close and have no separate destroy routine. We don't have to set
+ * the resync flag here, because join cursors aren't write cursors.
+ */
+ while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
+ if ((t_ret = __db_join_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Sync the memory pool, even though we've already called DB->sync,
+ * because closing cursors can dirty pages by deleting items they
+ * referenced.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (resync && !LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+never_opened:
+ /*
+ * At this point, we haven't done anything to render the DB handle
+ * unusable, at least by a transaction abort. Take the opportunity
+ * now to log the file close if we have initialized the logging
+ * information. If this log fails and we're in a transaction,
+ * we have to bail out of the attempted close; we'll need a dbp in
+ * order to successfully abort the transaction, and we can't conjure
+ * a new one up because we haven't gotten out the dbreg_register
+ * record that represents the close. In this case, we put off
+ * actually closing the dbp until we've performed the abort.
+ */
+ if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) {
+ /*
+ * Discard the log file id, if any. We want to log the close
+ * if and only if this is not a recovery dbp or a client dbp,
+ * or a dead dbp handle.
+ */
+ DB_ASSERT(env, renv != NULL);
+ if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) ||
+ dbp->timestamp != renv->rep_timestamp) {
+ if ((t_ret = __dbreg_revoke_id(dbp,
+ 0, DB_LOGFILEID_INVALID)) == 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if ((t_ret = __dbreg_close_id(dbp,
+ txn, DBREG_CLOSE)) != 0 && txn != NULL) {
+ /*
+ * We're in a txn and the attempt to log the
+ * close failed; let the txn subsystem know
+ * that we need to destroy this dbp once we're
+ * done with the abort, then bail from the
+ * close.
+ *
+ * Note that if the attempt to put off the
+ * close -also- fails--which it won't unless
+ * we're out of heap memory--we're really
+ * screwed. Panic.
+ */
+ if ((ret =
+ __txn_closeevent(env, txn, dbp)) != 0)
+ return (__env_panic(env, ret));
+ if (deferred_closep != NULL)
+ *deferred_closep = 1;
+ return (t_ret);
+ }
+ /*
+ * If dbreg_close_id failed and we were not in a
+ * transaction, then we need to finish this close
+ * because the caller can't do anything with the
+ * handle after we return an error. We rely on
+ * dbreg_close_id to mark the entry in some manner
+ * so that we do not do a clean shutdown of this
+ * environment. If shutdown isn't clean, then the
+ * application *must* run recovery and that will
+ * generate the RCLOSE record.
+ */
+ }
+
+ }
+
+ /* Close any handle we've been holding since the open. */
+ if (dbp->saved_open_fhp != NULL &&
+ (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /*
+ * Remove this DB handle from the ENV's dblist, if it's been added.
+ *
+ * Close our reference to the underlying cache while locked, we don't
+ * want to race with a thread searching for our underlying cache link
+ * while opening a DB handle.
+ *
+ * The DB handle may not yet have been added to the ENV list, don't
+ * blindly call the underlying TAILQ_REMOVE macro. Explicitly reset
+ * the field values to NULL so that we can't call TAILQ_REMOVE twice.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ if (!reuse &&
+ (dbp->dblistlinks.tqe_next != NULL ||
+ dbp->dblistlinks.tqe_prev != NULL)) {
+ TAILQ_REMOVE(&env->dblist, dbp, dblistlinks);
+ dbp->dblistlinks.tqe_next = NULL;
+ dbp->dblistlinks.tqe_prev = NULL;
+ }
+
+ /* Close the memory pool file handle. */
+ if (dbp->mpf != NULL) {
+ if ((t_ret = __memp_fclose(dbp->mpf,
+ F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ dbp->mpf = NULL;
+ if (reuse &&
+ (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Call the access specific close function.
+ *
+ * We do this here rather than in __db_close as we need to do this when
+ * aborting an open so that file descriptors are closed and abort of
+ * renames can succeed on platforms that lock open files (such as
+ * Windows). In particular, we need to ensure that all the extents
+ * associated with a queue are closed so that queue renames can be
+ * aborted.
+ *
+ * It is also important that we do this before releasing the handle
+ * lock, because dbremove and dbrename assume that once they have the
+ * handle lock, it is safe to modify the underlying file(s).
+ *
+ * !!!
+ * Because of where these functions are called in the DB handle close
+ * process, these routines can't do anything that would dirty pages or
+ * otherwise affect closing down the database. Specifically, we can't
+ * abort and recover any of the information they control.
+ */
+#ifdef HAVE_PARTITION
+ if (dbp->p_internal != NULL &&
+ (t_ret = __partition_close(dbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * !!!
+ * At this point, the access-method specific information has been
+ * freed. From now on, we can use the dbp, but not touch any
+ * access-method specific data.
+ */
+
+ if (!reuse && dbp->locker != NULL) {
+ /* We may have pending trade operations on this dbp. */
+ if (txn == NULL)
+ txn = dbp->cur_txn;
+ if (IS_REAL_TXN(txn))
+ __txn_remlock(env,
+ txn, &dbp->handle_lock, dbp->locker);
+
+ /* We may be holding the handle lock; release it. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ if ((t_ret = __lock_vec(env,
+ dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret =
+ __lock_id_free(env, dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->locker = NULL;
+ LOCK_INIT(dbp->handle_lock);
+ }
+
+ /*
+ * If this is a temporary file (un-named in-memory file), then
+ * discard the locker ID allocated as the fileid.
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid &&
+ *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) {
+ if ((t_ret = __lock_getlocker(env->lk_handle,
+ *(u_int32_t *)dbp->fileid, 0, &locker)) == 0)
+ t_ret = __lock_id_free(env, locker);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if (reuse) {
+ /*
+ * If we are reusing this dbp, then we're done now. Re-init
+ * the handle, preserving important flags, and then return.
+ * This code is borrowed from __db_init, which does more
+ * than we can do here.
+ */
+ save_flags = F_ISSET(dbp, DB_AM_INMEM | DB_AM_TXN);
+
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+ /* Restore flags */
+ dbp->flags = dbp->orig_flags | save_flags;
+
+ if (FLD_ISSET(save_flags, DB_AM_INMEM)) {
+ /*
+ * If this is inmem, then it may have a fileid
+ * even if it was never opened, and we need to
+ * clear out that fileid.
+ */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ MAKE_INMEM(dbp);
+ }
+ return (ret);
+ }
+
+ dbp->type = DB_UNKNOWN;
+
+ /*
+ * The thread mutex may have been invalidated in __dbreg_close_id if the
+ * fname refcount did not go to 0. If not, discard the thread mutex.
+ */
+ if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard any memory allocated for the file and database names. */
+ if (dbp->fname != NULL) {
+ __os_free(dbp->env, dbp->fname);
+ dbp->fname = NULL;
+ }
+ if (dbp->dname != NULL) {
+ __os_free(dbp->env, dbp->dname);
+ dbp->dname = NULL;
+ }
+
+ /* Discard any memory used to store returned data. */
+ if (dbp->my_rskey.data != NULL)
+ __os_free(dbp->env, dbp->my_rskey.data);
+ if (dbp->my_rkey.data != NULL)
+ __os_free(dbp->env, dbp->my_rkey.data);
+ if (dbp->my_rdata.data != NULL)
+ __os_free(dbp->env, dbp->my_rdata.data);
+
+ /* For safety's sake; we may refresh twice. */
+ memset(&dbp->my_rskey, 0, sizeof(DBT));
+ memset(&dbp->my_rkey, 0, sizeof(DBT));
+ memset(&dbp->my_rdata, 0, sizeof(DBT));
+
+ /* Clear out fields that normally get set during open. */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ dbp->adj_fileid = 0;
+ dbp->meta_pgno = 0;
+ dbp->cur_locker = NULL;
+ dbp->cur_txn = NULL;
+ dbp->associate_locker = NULL;
+ dbp->cl_id = 0;
+ dbp->open_flags = 0;
+
+ /*
+ * If we are being refreshed with a txn specified, then we need
+ * to make sure that we clear out the lock handle field, because
+ * releasing all the locks for this transaction will release this
+ * lock and we don't want close to stumble upon this handle and
+ * try to close it.
+ */
+ if (txn != NULL)
+ LOCK_INIT(dbp->handle_lock);
+
+ /* Reset flags to whatever the user configured. */
+ dbp->flags = dbp->orig_flags;
+
+ return (ret);
+}
+
+/*
+ * __db_disassociate --
+ * Destroy the association between a given secondary and its primary.
+ */
+static int
+__db_disassociate(sdbp)
+ DB *sdbp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ ret = 0;
+
+ sdbp->s_callback = NULL;
+ sdbp->s_primary = NULL;
+ sdbp->get = sdbp->stored_get;
+ sdbp->close = sdbp->stored_close;
+
+ /*
+ * Complain, but proceed, if we have any active cursors. (We're in
+ * the middle of a close, so there's really no turning back.)
+ */
+ if (sdbp->s_refcnt != 1 ||
+ TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_errx(sdbp->env,
+ "Closing a primary DB while a secondary DB has active cursors is unsafe");
+ ret = EINVAL;
+ }
+ sdbp->s_refcnt = 0;
+
+ while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(sdbp, DB_AM_SECONDARY);
+ return (ret);
+}
+
+/*
+ * __db_disassociate_foreign --
+ * Destroy the association between a given secondary and its foreign.
+ */
+static int
+__db_disassociate_foreign(sdbp)
+ DB *sdbp;
+{
+ DB *fdbp;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ int ret;
+
+ if (sdbp->s_foreign == NULL)
+ return (0);
+ if ((ret = __os_malloc(sdbp->env, sizeof(DB_FOREIGN_INFO), &tmp)) != 0)
+ return (ret);
+
+ fdbp = sdbp->s_foreign;
+ ret = 0;
+ f_info = LIST_FIRST(&fdbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ if (f_info ->dbp == sdbp) {
+ LIST_REMOVE(f_info, f_links);
+ __os_free(sdbp->env, f_info);
+ }
+ f_info = tmp;
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_log_page
+ * Log a meta-data or root page during a subdatabase create operation.
+ *
+ * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
+ */
+int
+__db_log_page(dbp, txn, lsn, pgno, page)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_LSN *lsn;
+ db_pgno_t pgno;
+ PAGE *page;
+{
+ DBT page_dbt;
+ DB_LSN new_lsn;
+ int ret;
+
+ if (!LOGGING_ON(dbp->env) || txn == NULL)
+ return (0);
+
+ memset(&page_dbt, 0, sizeof(page_dbt));
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = page;
+
+ ret = __crdel_metasub_log(dbp, txn, &new_lsn, 0, pgno, &page_dbt, lsn);
+
+ if (ret == 0)
+ page->lsn = new_lsn;
+ return (ret);
+}
+
+/*
+ * __db_backup_name
+ * Create the backup file name for a given file.
+ *
+ * PUBLIC: int __db_backup_name __P((ENV *,
+ * PUBLIC: const char *, DB_TXN *, char **));
+ */
+#undef BACKUP_PREFIX
+#define BACKUP_PREFIX "__db."
+
+#undef MAX_INT_TO_HEX
+#define MAX_INT_TO_HEX 8
+
+int
+__db_backup_name(env, name, txn, backup)
+ ENV *env;
+ const char *name;
+ DB_TXN *txn;
+ char **backup;
+{
+ u_int32_t id;
+ size_t len;
+ int ret;
+ char *p, *retp;
+
+ *backup = NULL;
+
+ /*
+ * Part of the name may be a full path, so we need to make sure that
+ * we allocate enough space for it, even in the case where we don't
+ * use the entire filename for the backup name.
+ */
+ len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1;
+ if ((ret = __os_malloc(env, len, &retp)) != 0)
+ return (ret);
+
+ /*
+ * Create the name. Backup file names are in one of 2 forms: in a
+ * transactional env "__db.TXNID.ID", where ID is a random number,
+ * and in any other env "__db.FILENAME".
+ *
+ * In addition, the name passed may contain an env-relative path.
+ * In that case, put the "__db." in the right place (in the last
+ * component of the pathname).
+ *
+ * There are four cases here:
+ * 1. simple path w/out transaction
+ * 2. simple path + transaction
+ * 3. multi-component path w/out transaction
+ * 4. multi-component path + transaction
+ */
+ p = __db_rpath(name);
+ if (IS_REAL_TXN(txn)) {
+ __os_unique_id(env, &id);
+ if (p == NULL) /* Case 2. */
+ snprintf(retp, len, "%s%x.%x",
+ BACKUP_PREFIX, txn->txnid, id);
+ else /* Case 4. */
+ snprintf(retp, len, "%.*s%x.%x",
+ (int)(p - name) + 1, name, txn->txnid, id);
+ } else {
+ if (p == NULL) /* Case 1. */
+ snprintf(retp, len, "%s%s", BACKUP_PREFIX, name);
+ else /* Case 3. */
+ snprintf(retp, len, "%.*s%s%s",
+ (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1);
+ }
+
+ *backup = retp;
+ return (0);
+}
+
+#ifdef CONFIG_TEST
+/*
+ * __db_testcopy
+ * Create a copy of all backup files and our "main" DB.
+ *
+ * PUBLIC: #ifdef CONFIG_TEST
+ * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *));
+ * PUBLIC: #endif
+ */
+int
+__db_testcopy(env, dbp, name)
+ ENV *env;
+ DB *dbp;
+ const char *name;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *mpf;
+
+ DB_ASSERT(env, dbp != NULL || name != NULL);
+
+ if (name == NULL) {
+ dbmp = env->mp_handle;
+ mpf = dbp->mpf;
+ name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off);
+ }
+
+ if (dbp != NULL && dbp->type == DB_QUEUE)
+ return (__qam_testdocopy(dbp, name));
+ else
+#ifdef HAVE_PARTITION
+ if (dbp != NULL && DB_IS_PARTITIONED(dbp))
+ return (__part_testdocopy(dbp, name));
+ else
+#endif
+ return (__db_testdocopy(env, name));
+}
+
+static int
+__qam_testdocopy(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST *filelist, *fp;
+ int ret;
+ char buf[DB_MAXPATHLEN], *dir;
+
+ filelist = NULL;
+ if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+ return (ret);
+
+ /* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ if (dbp->mpf != NULL &&
+ (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ goto done;
+
+ if (filelist == NULL)
+ return (0);
+ dir = ((QUEUE *)dbp->q_internal)->dir;
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ snprintf(buf, sizeof(buf),
+ QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
+ if ((ret = __db_testdocopy(dbp->env, buf)) != 0)
+ return (ret);
+ }
+
+done: __os_free(dbp->env, filelist);
+ return (0);
+}
+
/*
 * __db_testdocopy
 *	Create a copy of all backup files and our "main" DB.
 *
 *	Copies the named file to "<real_name>.afterop", then scans the
 *	file's directory for transactional backup files (names of the form
 *	BACKUP_PREFIX.TXNID.ID, both components hex) and copies each of
 *	those to "<backup>.afterop" as well.
 * PUBLIC: int __db_testdocopy __P((ENV *, const char *));
 */
int
__db_testdocopy(env, name)
	ENV *env;
	const char *name;
{
	size_t len;
	int dircnt, i, ret;
	char *copy, **namesp, *p, *real_name;

	/* NULL/0 so the err: cleanup below is safe from any goto. */
	dircnt = 0;
	copy = NULL;
	namesp = NULL;

	/* Create the real backing file name. */
	if ((ret = __db_appname(env,
	    DB_APP_DATA, name, NULL, &real_name)) != 0)
		return (ret);

	/*
	 * !!!
	 * There are tests that attempt to copy non-existent files. I'd guess
	 * it's a testing bug, but I don't have time to figure it out. Block
	 * the case here.
	 */
	if (__os_exists(env, real_name, NULL) != 0) {
		__os_free(env, real_name);
		return (0);
	}

	/*
	 * Copy the file itself.
	 *
	 * Allocate space for the file name, including adding an ".afterop" and
	 * trailing nul byte.  (sizeof of the string literal covers both the
	 * suffix and the terminator.)
	 */
	len = strlen(real_name) + sizeof(".afterop");
	if ((ret = __os_malloc(env, len, &copy)) != 0)
		goto err;
	snprintf(copy, len, "%s.afterop", real_name);
	if ((ret = __db_makecopy(env, real_name, copy)) != 0)
		goto err;

	/*
	 * Get the directory path to call __os_dirlist().
	 * Truncating real_name at its last path separator leaves just the
	 * directory portion; real_name is not used as a file name after this.
	 */
	if ((p = __db_rpath(real_name)) != NULL)
		*p = '\0';
	if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0)
		goto err;

	/*
	 * Walk the directory looking for backup files. Backup file names in
	 * transactional environments are of the form:
	 *
	 * BACKUP_PREFIX.TXNID.ID
	 */
	for (i = 0; i < dircnt; i++) {
		/* Check for a related backup file name. */
		if (strncmp(
		    namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0)
			continue;
		/*
		 * NOTE(review): sizeof(BACKUP_PREFIX) includes the trailing
		 * NUL, so p starts one character PAST the prefix, i.e. the
		 * first TXNID hex digit is skipped.  The strspn() below
		 * absorbs the remaining digits either way, so matching is
		 * mostly unaffected -- confirm whether "- 1" was intended.
		 */
		p = namesp[i] + sizeof(BACKUP_PREFIX);
		p += strspn(p, "0123456789ABCDEFabcdef");
		if (*p != '.')
			continue;
		++p;
		p += strspn(p, "0123456789ABCDEFabcdef");
		/* Accept only names ending right after the second hex run. */
		if (*p != '\0')
			continue;

		/*
		 * Copy the backup file.
		 *
		 * Allocate space for the file name, including adding a
		 * ".afterop" and trailing nul byte.
		 * real_name and copy are freed and re-allocated on each
		 * iteration; the NULL resets keep the err: cleanup safe.
		 */
		if (real_name != NULL) {
			__os_free(env, real_name);
			real_name = NULL;
		}
		if ((ret = __db_appname(env,
		    DB_APP_DATA, namesp[i], NULL, &real_name)) != 0)
			goto err;
		if (copy != NULL) {
			__os_free(env, copy);
			copy = NULL;
		}
		len = strlen(real_name) + sizeof(".afterop");
		if ((ret = __os_malloc(env, len, &copy)) != 0)
			goto err;
		snprintf(copy, len, "%s.afterop", real_name);
		if ((ret = __db_makecopy(env, real_name, copy)) != 0)
			goto err;
	}

	/* Success falls through: ret is 0 here. */
err:	if (namesp != NULL)
		__os_dirfree(env, namesp, dircnt);
	if (copy != NULL)
		__os_free(env, copy);
	if (real_name != NULL)
		__os_free(env, real_name);
	return (ret);
}
+
+static int
+__db_makecopy(env, src, dest)
+ ENV *env;
+ const char *src, *dest;
+{
+ DB_FH *rfhp, *wfhp;
+ size_t rcnt, wcnt;
+ int ret;
+ char *buf;
+
+ rfhp = wfhp = NULL;
+
+ if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0)
+ goto err;
+
+ if ((ret = __os_open(env, src, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0)
+ goto err;
+ if ((ret = __os_open(env, dest, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0)
+ goto err;
+
+ for (;;) {
+ if ((ret =
+ __os_read(env, rfhp, buf, sizeof(buf), &rcnt)) != 0)
+ goto err;
+ if (rcnt == 0)
+ break;
+ if ((ret =
+ __os_write(env, wfhp, buf, sizeof(buf), &wcnt)) != 0)
+ goto err;
+ }
+
+ if (0) {
+err: __db_err(env, ret, "__db_makecopy: %s -> %s", src, dest);
+ }
+
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (rfhp != NULL)
+ (void)__os_closehandle(env, rfhp);
+ if (wfhp != NULL)
+ (void)__os_closehandle(env, wfhp);
+ return (ret);
+}
+#endif
diff --git a/db/db.src b/db/db.src
new file mode 100644
index 0000000..2136b79
--- /dev/null
+++ b/db/db.src
@@ -0,0 +1,328 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __db
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * addrem -- Add or remove an entry from a duplicate page.
+ *
+ * opcode: identifies if this is an add or delete.
+ * fileid: file identifier of the file being modified.
+ * pgno: duplicate page number.
+ * indx: location at which to insert or delete.
+ * nbytes: number of bytes added/removed to/from the page.
+ * hdr: header for the data item.
+ * dbt: data that is deleted or is to be added.
+ * pagelsn: former lsn of the page.
+ *
+ * If the hdr was NULL then, the dbt is a regular B_KEYDATA.
+ * If the dbt was NULL then the hdr is a complete item to be
+ * pasted on the page.
+ */
+BEGIN addrem 42 41
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+DBT hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * big -- Handles addition and deletion of big key/data items.
+ *
+ * opcode: identifies get/put.
+ * fileid: file identifier of the file being modified.
+ * pgno: page onto which data is being added/removed.
+ * prev_pgno: the page before the one we are logging.
+ * next_pgno: the page after the one we are logging.
+ * dbt: data being written onto the page.
+ * pagelsn: former lsn of the orig_page.
+ * prevlsn: former lsn of the prev_pgno.
+ * nextlsn: former lsn of the next_pgno. This is not currently used, but
+ * may be used later if we actually do overwrites of big key/
+ * data items in place.
+ */
+BEGIN big 42 43
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+ARG next_pgno db_pgno_t lu
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+POINTER prevlsn DB_LSN * lu
+POINTER nextlsn DB_LSN * lu
+END
+
+/*
+ * ovref -- Handles increment/decrement of overflow page reference count.
+ *
+ * fileid: identifies the file being modified.
+ * pgno: page number whose ref count is being incremented/decremented.
+ * adjust: the adjustment being made.
+ * lsn: the page's original lsn.
+ */
+BEGIN ovref 42 44
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG adjust int32_t ld
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * relink -- Handles relinking around a page.
+ *
+ * opcode: indicates if this is an addpage or delete page
+ * pgno: the page being changed.
+ * lsn: the page's original lsn.
+ * prev: the previous page.
+ * lsn_prev: the previous page's original lsn.
+ * next: the next page.
+ * lsn_next: the next page's original lsn.
+ */
+BEGIN_COMPAT relink 42 45
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG prev db_pgno_t lu
+POINTER lsn_prev DB_LSN * lu
+ARG next db_pgno_t lu
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * Debug -- log an operation upon entering an access method.
+ * op: Operation (cursor, c_close, c_get, c_put, c_del,
+ * get, put, delete).
+ * fileid: identifies the file being acted upon.
+ * key: key parameter
+ * data: data parameter
+ * flags: flags parameter
+ */
+BEGIN debug 42 47
+DBT op DBT s
+ARG fileid int32_t ld
+DBT key DBT s
+DBT data DBT s
+ARG arg_flags u_int32_t lu
+END
+
+/*
+ * noop -- do nothing, but get an LSN.
+ */
+BEGIN noop 42 48
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER prevlsn DB_LSN * lu
+END
+
+/*
+ * pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the original lsn of the page reference by meta_pgno.
+ * meta_pgno the page pointing at the allocated page in the free list.
+ * If the list is unsorted this is the metadata page.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * next: the next page on the free list.
+ * last_pgno: the last page in the file after this op (4.3+).
+ */
+BEGIN_COMPAT pg_alloc 42 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_alloc 43 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * pg_free: used to record freeing a page.
+ * If we are maintaining a sorted free list (during compact) meta_pgno
+ * will be non-zero and refer to the page that precedes the one we are freeing
+ * in the free list. Meta_lsn will then be the lsn of that page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_free 42 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_free 43 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * cksum --
+ * This log record is written when we're unable to checksum a page,
+ * before returning DB_RUNRECOVERY. This log record causes normal
+ * recovery to itself return DB_RUNRECOVERY, as only catastrophic
+ * recovery can fix things.
+ */
+BEGIN cksum 42 51
+END
+
+/*
+ * pg_freedata: used to record freeing a page with data on it.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header and index entries from the free'd page.
+ * data: the data from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_freedata 42 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+PGDDBT data DBT s
+END
+
+BEGIN pg_freedata 43 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+PGDDBT data DBT s
+END
+
+/*
+ * pg_prepare: used to record an aborted page in a prepared transaction.
+ *
+ * pgno: the page being freed.
+ */
+X BEGIN pg_prepare 42 53
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X END
+
+/*
+ * pg_new: used to record a new page put on the free list.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ */
+X BEGIN pg_new 42 54
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X POINTER meta_lsn DB_LSN * lu
+X ARG meta_pgno db_pgno_t lu
+X PGDBT header DBT s
+X ARG next db_pgno_t lu
+X END
+
+/*
+ * pg_init: used to reinitialize a page during truncate.
+ *
+ * pgno: the page being initialized.
+ * header: the header from the page.
+ * data: data that used to be on the page.
+ */
+BEGIN pg_init 43 60
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT header DBT s
+PGDDBT data DBT s
+END
+
+/*
+ * pg_sort: sort the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns to sort.
+ */
+BEGIN_COMPAT pg_sort 44 61
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG last_pgno db_pgno_t lu
+DBT list DBT s
+END
+
+
+/*
+ * pg_trunc: truncate the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns on free list.
+ */
+BEGIN pg_trunc 49 66
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG next_free db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+DBT list DBT s
+END
+
diff --git a/db/db_am.c b/db/db_am.c
new file mode 100644
index 0000000..c453ea9
--- /dev/null
+++ b/db/db_am.c
@@ -0,0 +1,1015 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __dbc_set_priority __P((DBC *, DB_CACHE_PRIORITY));
+static int __dbc_get_priority __P((DBC *, DB_CACHE_PRIORITY* ));
+
+/*
+ * __db_cursor_int --
+ * Internal routine to create a cursor.
+ *
+ * PUBLIC: int __db_cursor_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
+ */
+int
+__db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBTYPE dbtype;
+	db_pgno_t root;
+	int flags;
+	DB_LOCKER *locker;
+	DBC **dbcp;
+{
+	DBC *dbc;
+	DBC_INTERNAL *cp;
+	ENV *env;
+	db_threadid_t tid;
+	int allocated, ret;
+	pid_t pid;
+
+	env = dbp->env;
+	allocated = 0;
+
+	/*
+	 * If dbcp is non-NULL it is assumed to point to an area to initialize
+	 * as a cursor.
+	 *
+	 * Take one from the free list if it's available.  Take only the
+	 * right type.  With off page dups we may have different kinds
+	 * of cursors on the queue for a single database.
+	 *
+	 * NOTE(review): the "flags" parameter mixes DB_CURSOR_* open flags
+	 * with DBC_* cursor-state flags (callers pass DBC_OPD etc.); the
+	 * LF_ISSET tests below rely on the two namespaces not colliding.
+	 */
+	MUTEX_LOCK(env, dbp->mutex);
+
+#ifndef HAVE_NO_DB_REFCOUNT
+	/*
+	 * If this DBP is being logged then refcount the log filename
+	 * relative to this transaction.  We do this here because we have
+	 * the dbp->mutex which protects the refcount.  We want to avoid
+	 * calling the function if we are duplicating a cursor.  This includes
+	 * the case of creating an off page duplicate cursor.  If we know this
+	 * cursor will not be used in an update, we could avoid this,
+	 * but we don't have that information.
+	 */
+	if (txn != NULL && !LF_ISSET(DBC_OPD | DBC_DUPLICATE)
+	    && !F_ISSET(dbp, DB_AM_RECOVER) &&
+	    dbp->log_filename != NULL && !IS_REP_CLIENT(env) &&
+	    (ret = __txn_record_fname(env, txn, dbp->log_filename)) != 0) {
+		MUTEX_UNLOCK(env, dbp->mutex);
+		return (ret);
+	}
+
+#endif
+
+	/* Reuse a parked cursor of the matching access-method type. */
+	TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+		if (dbtype == dbc->dbtype) {
+			TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+			/* Preserve only locker ownership across reuse. */
+			F_CLR(dbc, ~DBC_OWN_LID);
+			break;
+		}
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	/* No reusable cursor: allocate and fully initialize a new one. */
+	if (dbc == NULL) {
+		if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+			return (ret);
+		allocated = 1;
+		dbc->flags = 0;
+
+		dbc->dbp = dbp;
+		dbc->dbenv = dbp->dbenv;
+		dbc->env = dbp->env;
+
+		/* Set up locking information. */
+		if (LOCKING_ON(env)) {
+			/*
+			 * If we are not threaded, we share a locker ID among
+			 * all cursors opened in the environment handle,
+			 * allocating one if this is the first cursor.
+			 *
+			 * This relies on the fact that non-threaded DB handles
+			 * always have non-threaded environment handles, since
+			 * we set DB_THREAD on DB handles created with threaded
+			 * environment handles.
+			 */
+			if (!DB_IS_THREADED(dbp)) {
+				if (env->env_lref == NULL && (ret =
+				    __lock_id(env, NULL, &env->env_lref)) != 0)
+					goto err;
+				dbc->lref = env->env_lref;
+			} else {
+				if ((ret =
+				    __lock_id(env, NULL, &dbc->lref)) != 0)
+					goto err;
+				F_SET(dbc, DBC_OWN_LID);
+			}
+
+			/*
+			 * In CDB, secondary indices should share a lock file
+			 * ID with the primary; otherwise we're susceptible
+			 * to deadlocks.  We also use __db_cursor_int rather
+			 * than __db_cursor to create secondary update cursors
+			 * in c_put and c_del; these won't acquire a new lock.
+			 *
+			 * !!!
+			 * Since this is in the one-time cursor allocation
+			 * code, we need to be sure to destroy, not just
+			 * close, all cursors in the secondary when we
+			 * associate.
+			 */
+			if (CDB_LOCKING(env) &&
+			    F_ISSET(dbp, DB_AM_SECONDARY))
+				memcpy(dbc->lock.fileid,
+				    dbp->s_primary->fileid, DB_FILE_ID_LEN);
+			else
+				memcpy(dbc->lock.fileid,
+				    dbp->fileid, DB_FILE_ID_LEN);
+
+			if (CDB_LOCKING(env)) {
+				if (F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+					/*
+					 * If we are doing a single lock per
+					 * environment, set up the global
+					 * lock object just like we do to
+					 * single thread creates.
+					 */
+					DB_ASSERT(env, sizeof(db_pgno_t) ==
+					    sizeof(u_int32_t));
+					dbc->lock_dbt.size = sizeof(u_int32_t);
+					dbc->lock_dbt.data = &dbc->lock.pgno;
+					dbc->lock.pgno = 0;
+				} else {
+					dbc->lock_dbt.size = DB_FILE_ID_LEN;
+					dbc->lock_dbt.data = dbc->lock.fileid;
+				}
+			} else {
+				dbc->lock.type = DB_PAGE_LOCK;
+				dbc->lock_dbt.size = sizeof(dbc->lock);
+				dbc->lock_dbt.data = &dbc->lock;
+			}
+		}
+		/* Init the DBC internal structure. */
+#ifdef HAVE_PARTITION
+		if (DB_IS_PARTITIONED(dbp)) {
+			if ((ret = __partc_init(dbc)) != 0)
+				goto err;
+		} else
+#endif
+		switch (dbtype) {
+		case DB_BTREE:
+		case DB_RECNO:
+			if ((ret = __bamc_init(dbc, dbtype)) != 0)
+				goto err;
+			break;
+		case DB_HASH:
+			if ((ret = __hamc_init(dbc)) != 0)
+				goto err;
+			break;
+		case DB_QUEUE:
+			if ((ret = __qamc_init(dbc)) != 0)
+				goto err;
+			break;
+		case DB_UNKNOWN:
+		default:
+			ret = __db_unknown_type(env, "DB->cursor", dbtype);
+			goto err;
+		}
+
+		/* Redundant with the refresh below, but harmless. */
+		cp = dbc->internal;
+	}
+
+	/* Refresh the DBC structure. */
+	dbc->dbtype = dbtype;
+	RESET_RET_MEM(dbc);
+	dbc->set_priority = __dbc_set_priority;
+	dbc->get_priority = __dbc_get_priority;
+	dbc->priority = dbp->priority;
+
+	if ((dbc->txn = txn) != NULL)
+		dbc->locker = txn->locker;
+	else if (LOCKING_ON(env)) {
+		/*
+		 * There are certain cases in which we want to create a
+		 * new cursor with a particular locker ID that is known
+		 * to be the same as (and thus not conflict with) an
+		 * open cursor.
+		 *
+		 * The most obvious case is cursor duplication; when we
+		 * call DBC->dup or __dbc_idup, we want to use the original
+		 * cursor's locker ID.
+		 *
+		 * Another case is when updating secondary indices.  Standard
+		 * CDB locking would mean that we might block ourself: we need
+		 * to open an update cursor in the secondary while an update
+		 * cursor in the primary is open, and when the secondary and
+		 * primary are subdatabases or we're using env-wide locking,
+		 * this is disastrous.
+		 *
+		 * In these cases, our caller will pass a nonzero locker
+		 * ID into this function.  Use this locker ID instead of
+		 * the default as the locker ID for our new cursor.
+		 */
+		if (locker != NULL)
+			dbc->locker = locker;
+		else {
+			/*
+			 * If we are threaded then we need to set the
+			 * proper thread id into the locker.
+			 */
+			if (DB_IS_THREADED(dbp)) {
+				env->dbenv->thread_id(env->dbenv, &pid, &tid);
+				__lock_set_thread_id(dbc->lref, pid, tid);
+			}
+			dbc->locker = dbc->lref;
+		}
+	}
+
+	/*
+	 * These fields change when we are used as a secondary index, so
+	 * if the DB is a secondary, make sure they're set properly just
+	 * in case we opened some cursors before we were associated.
+	 *
+	 * __dbc_get is used by all access methods, so this should be safe.
+	 */
+	if (F_ISSET(dbp, DB_AM_SECONDARY))
+		dbc->get = dbc->c_get = __dbc_secondary_get_pp;
+
+	if (LF_ISSET(DB_CURSOR_BULK) && dbtype == DB_BTREE)
+		F_SET(dbc, DBC_BULK);
+	if (LF_ISSET(DB_CURSOR_TRANSIENT))
+		F_SET(dbc, DBC_TRANSIENT);
+	if (LF_ISSET(DBC_OPD))
+		F_SET(dbc, DBC_OPD);
+	if (F_ISSET(dbp, DB_AM_RECOVER))
+		F_SET(dbc, DBC_RECOVER);
+	if (F_ISSET(dbp, DB_AM_COMPENSATE))
+		F_SET(dbc, DBC_DONTLOCK);
+#ifdef HAVE_REPLICATION
+	/*
+	 * If we are replicating from a down rev version then we must
+	 * use old locking protocols.
+	 */
+	if (LOGGING_ON(env) &&
+	    ((LOG *)env->lg_handle->
+	    reginfo.primary)->persist.version < DB_LOGVERSION_LATCHING)
+		F_SET(dbc, DBC_DOWNREV);
+#endif
+
+	/* Refresh the DBC internal structure. */
+	cp = dbc->internal;
+	cp->opd = NULL;
+	cp->pdbc = NULL;
+
+	cp->indx = 0;
+	cp->page = NULL;
+	cp->pgno = PGNO_INVALID;
+	cp->root = root;
+	cp->stream_start_pgno = cp->stream_curr_pgno = PGNO_INVALID;
+	cp->stream_off = 0;
+
+	if (DB_IS_PARTITIONED(dbp)) {
+		DBC_PART_REFRESH(dbc);
+	} else switch (dbtype) {
+	case DB_BTREE:
+	case DB_RECNO:
+		if ((ret = __bamc_refresh(dbc)) != 0)
+			goto err;
+		break;
+	case DB_HASH:
+	case DB_QUEUE:
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_type(env, "DB->cursor", dbp->type);
+		goto err;
+	}
+
+	/*
+	 * The transaction keeps track of how many cursors were opened within
+	 * it to catch application errors where the cursor isn't closed when
+	 * the transaction is resolved.
+	 */
+	if (txn != NULL)
+		++txn->cursors;
+	if (ip != NULL)
+		dbc->thread_info = ip;
+	else if (txn != NULL)
+		dbc->thread_info = txn->thread_info;
+	else
+		ENV_GET_THREAD_INFO(env, dbc->thread_info);
+
+	/* Publish the cursor on the handle's active queue. */
+	MUTEX_LOCK(env, dbp->mutex);
+	TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
+	F_SET(dbc, DBC_ACTIVE);
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	*dbcp = dbc;
+	return (0);
+
+	/* Creation failed: discard only a cursor we allocated here. */
+err:	if (allocated)
+		__os_free(env, dbc);
+	return (ret);
+}
+
+/*
+ * __db_put --
+ * Store a key/data pair.
+ *
+ * PUBLIC: int __db_put __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put(dbp, ip, txn, key, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT tdata, tkey;
+	ENV *env;
+	void *bulk_kptr, *bulk_ptr;
+	db_recno_t recno;
+	u_int32_t cursor_flags;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	/*
+	 * See the comment in __db_get() regarding DB_CURSOR_TRANSIENT.
+	 *
+	 * Note that the get in the DB_NOOVERWRITE case is safe to do with this
+	 * flag set; if it errors in any way other than DB_NOTFOUND, we're
+	 * going to close the cursor without doing anything else, and if it
+	 * returns DB_NOTFOUND then it's safe to do a c_put(DB_KEYLAST) even if
+	 * an access method moved the cursor, since that's not
+	 * position-dependent.
+	 */
+	cursor_flags = DB_WRITELOCK;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+		cursor_flags |= DB_CURSOR_BULK;
+	else
+		cursor_flags |= DB_CURSOR_TRANSIENT;
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, txn, "DB->put", key, data, flags);
+
+	SET_RET_MEM(dbc, dbp);
+
+	if (flags == DB_APPEND && !DB_IS_PRIMARY(dbp)) {
+		/*
+		 * If there is an append callback, the value stored in
+		 * data->data may be replaced and then freed.  To avoid
+		 * passing a freed pointer back to the user, just operate
+		 * on a copy of the data DBT.
+		 */
+		tdata = *data;
+
+		/*
+		 * Append isn't a normal put operation; call the appropriate
+		 * access method's append function.
+		 */
+		switch (dbp->type) {
+		case DB_QUEUE:
+			if ((ret = __qam_append(dbc, key, &tdata)) != 0)
+				goto err;
+			break;
+		case DB_RECNO:
+			if ((ret = __ram_append(dbc, key, &tdata)) != 0)
+				goto err;
+			break;
+		case DB_BTREE:
+		case DB_HASH:
+		case DB_UNKNOWN:
+		default:
+			/* The interface should prevent this. */
+			DB_ASSERT(env,
+			    dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+			ret = __db_ferr(env, "DB->put", 0);
+			goto err;
+		}
+
+		/*
+		 * The append callback, if one exists, may have allocated
+		 * a new tdata.data buffer.  If so, free it.
+		 */
+		FREE_IF_NEEDED(env, &tdata);
+
+		/* No need for a cursor put; we're done. */
+#ifdef HAVE_COMPRESSION
+	} else if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+	    !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+		/* Compressed btree with no secondaries: direct cursor put. */
+		ret = __dbc_put(dbc, key, data, flags);
+#endif
+	} else if (LF_ISSET(DB_MULTIPLE)) {
+		/* Bulk put: separate key and data streams. */
+		ret = 0;
+		memset(&tkey, 0, sizeof(tkey));
+		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+			tkey.data = &recno;
+			tkey.size = sizeof(recno);
+		}
+		memset(&tdata, 0, sizeof(tdata));
+		DB_MULTIPLE_INIT(bulk_kptr, key);
+		DB_MULTIPLE_INIT(bulk_ptr, data);
+		/* key->doff reports how many pairs were successfully put. */
+		key->doff = 0;
+		while (ret == 0) {
+			if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+				DB_MULTIPLE_RECNO_NEXT(bulk_kptr, key,
+				    recno, tdata.data, tdata.size);
+			else
+				DB_MULTIPLE_NEXT(bulk_kptr, key,
+				    tkey.data, tkey.size);
+			DB_MULTIPLE_NEXT(bulk_ptr, data,
+			    tdata.data, tdata.size);
+			if (bulk_kptr == NULL || bulk_ptr == NULL)
+				break;
+			ret = __dbc_put(dbc, &tkey, &tdata,
+			    LF_ISSET(DB_OPFLAGS_MASK));
+			if (ret == 0)
+				++key->doff;
+		}
+	} else if (LF_ISSET(DB_MULTIPLE_KEY)) {
+		/* Bulk put: single stream of interleaved key/data pairs. */
+		ret = 0;
+		memset(&tkey, 0, sizeof(tkey));
+		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+			tkey.data = &recno;
+			tkey.size = sizeof(recno);
+		}
+		memset(&tdata, 0, sizeof(tdata));
+		DB_MULTIPLE_INIT(bulk_ptr, key);
+		while (ret == 0) {
+			if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+				DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, recno,
+				    tdata.data, tdata.size);
+			else
+				DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, tkey.data,
+				    tkey.size, tdata.data, tdata.size);
+			if (bulk_ptr == NULL)
+				break;
+			ret = __dbc_put(dbc, &tkey, &tdata,
+			    LF_ISSET(DB_OPFLAGS_MASK));
+			if (ret == 0)
+				++key->doff;
+		}
+	} else
+		ret = __dbc_put(dbc, key, data, flags);
+
+err:	/* Close the cursor. */
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_del --
+ * Delete the items referenced by a key.
+ *
+ * PUBLIC: int __db_del __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_del(dbp, ip, txn, key, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *key;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT data, tkey;
+	void *bulk_ptr;
+	db_recno_t recno;
+	u_int32_t cursor_flags, f_init, f_next;
+	int ret, t_ret;
+
+	COMPQUIET(bulk_ptr, NULL);
+	/*
+	 * Initialize dbc so the error path can tell whether a cursor was
+	 * ever allocated: previously a failure in __db_cursor jumped to
+	 * "err" and passed an uninitialized pointer to __dbc_close.
+	 */
+	dbc = NULL;
+	/* Allocate a cursor. */
+	cursor_flags = DB_WRITELOCK;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+		cursor_flags |= DB_CURSOR_BULK;
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, txn, "DB->del", key, NULL, flags);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+	    !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+		F_SET(dbc, DBC_TRANSIENT);
+		ret = __dbc_bulk_del(dbc, key, flags);
+		goto err;
+	}
+#endif
+
+	/*
+	 * Walk a cursor through the key/data pairs, deleting as we go.  Set
+	 * the DB_DBT_USERMEM flag, as this might be a threaded application
+	 * and the flags checking will catch us.  We don't actually want the
+	 * keys or data, set DB_DBT_ISSET.  We rely on __dbc_get to clear
+	 * this.
+	 */
+	memset(&data, 0, sizeof(data));
+	F_SET(&data, DB_DBT_USERMEM);
+	tkey = *key;
+
+	f_init = LF_ISSET(DB_MULTIPLE_KEY) ? DB_GET_BOTH : DB_SET;
+	f_next = DB_NEXT_DUP;
+
+	/*
+	 * If locking (and we haven't already acquired CDB locks), set the
+	 * read-modify-write flag.
+	 */
+	if (STD_LOCKING(dbc)) {
+		f_init |= DB_RMW;
+		f_next |= DB_RMW;
+	}
+
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+			memset(&tkey, 0, sizeof(tkey));
+			tkey.data = &recno;
+			tkey.size = sizeof(recno);
+		}
+		DB_MULTIPLE_INIT(bulk_ptr, key);
+		/* We return the number of keys deleted in doff. */
+		key->doff = 0;
+bulk_next:	if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+			DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key,
+			    recno, data.data, data.size);
+		else if (LF_ISSET(DB_MULTIPLE))
+			DB_MULTIPLE_NEXT(bulk_ptr, key, tkey.data, tkey.size);
+		else
+			DB_MULTIPLE_KEY_NEXT(bulk_ptr, key,
+			    tkey.data, tkey.size, data.data, data.size);
+		if (bulk_ptr == NULL)
+			goto err;
+	}
+
+	/* We're not interested in the data -- do not return it. */
+	F_SET(&tkey, DB_DBT_ISSET);
+	F_SET(&data, DB_DBT_ISSET);
+
+	/*
+	 * Optimize the simple cases.  For all AMs if we don't have secondaries
+	 * and are not a secondary and we aren't a foreign database and there
+	 * are no dups then we can avoid a bunch of overhead.  For queue we
+	 * don't need to fetch the record since we delete by direct calculation
+	 * from the record number.
+	 *
+	 * Hash permits an optimization in DB->del: since on-page duplicates are
+	 * stored in a single HKEYDATA structure, it's possible to delete an
+	 * entire set of them at once, and as the HKEYDATA has to be rebuilt
+	 * and re-put each time it changes, this is much faster than deleting
+	 * the duplicates one by one.  Thus, if not pointing at an off-page
+	 * duplicate set, and we're not using secondary indices (in which case
+	 * we'd have to examine the items one by one anyway), let hash do this
+	 * "quick delete".
+	 *
+	 * !!!
+	 * Note that this is the only application-executed delete call in
+	 * Berkeley DB that does not go through the __dbc_del function.
+	 * If anything other than the delete itself (like a secondary index
+	 * update) has to happen there in a particular situation, the
+	 * conditions here should be modified not to use these optimizations.
+	 * The ordinary AM-independent alternative will work just fine;
+	 * it'll just be slower.
+	 */
+	if (!F_ISSET(dbp, DB_AM_SECONDARY) && !DB_IS_PRIMARY(dbp) &&
+	    LIST_FIRST(&dbp->f_primaries) == NULL) {
+#ifdef HAVE_QUEUE
+		if (dbp->type == DB_QUEUE) {
+			ret = __qam_delete(dbc, &tkey, flags);
+			goto next;
+		}
+#endif
+
+		/* Fetch the first record. */
+		if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+			goto err;
+
+#ifdef HAVE_HASH
+		/*
+		 * Hash "quick delete" removes all on-page duplicates.  We
+		 * can't do that if deleting specific key/data pairs.
+		 */
+		if (dbp->type == DB_HASH && !LF_ISSET(DB_MULTIPLE_KEY)) {
+			DBC *sdbc;
+			sdbc = dbc;
+#ifdef HAVE_PARTITION
+			if (F_ISSET(dbc, DBC_PARTITIONED))
+				sdbc =
+				    ((PART_CURSOR*)dbc->internal)->sub_cursor;
+#endif
+			if (sdbc->internal->opd == NULL) {
+				ret = __ham_quick_delete(sdbc);
+				goto next;
+			}
+		}
+#endif
+
+		if (!F_ISSET(dbp, DB_AM_DUP)) {
+			ret = dbc->am_del(dbc, 0);
+			goto next;
+		}
+	} else if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+		goto err;
+
+	/* Walk through the set of key/data pairs, deleting as we go. */
+	for (;;) {
+		if ((ret = __dbc_del(dbc, flags)) != 0)
+			break;
+		/*
+		 * With DB_MULTIPLE_KEY, the application has specified the
+		 * exact records they want deleted.  We don't need to walk
+		 * through a set of duplicates.
+		 */
+		if (LF_ISSET(DB_MULTIPLE_KEY))
+			break;
+
+		F_SET(&tkey, DB_DBT_ISSET);
+		F_SET(&data, DB_DBT_ISSET);
+		if ((ret = __dbc_get(dbc, &tkey, &data, f_next)) != 0) {
+			if (ret == DB_NOTFOUND)
+				ret = 0;
+			break;
+		}
+	}
+
+next:	if (ret == 0 && LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		++key->doff;
+		goto bulk_next;
+	}
+err:	/* Discard the cursor, if one was acquired. */
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_sync --
+ * Flush the database cache.
+ *
+ * PUBLIC: int __db_sync __P((DB *));
+ */
+int
+__db_sync(dbp)
+	DB *dbp;
+{
+	int ret, t_ret;
+
+	ret = 0;
+
+	/* If the database was read-only, we're done. */
+	if (F_ISSET(dbp, DB_AM_RDONLY))
+		return (0);
+
+	/* If it's a Recno tree, write the backing source text file. */
+	if (dbp->type == DB_RECNO)
+		ret = __ram_writeback(dbp);
+
+	/* If the database was never backed by a database file, we're done. */
+	if (F_ISSET(dbp, DB_AM_INMEM))
+		return (ret);
+	/* Partition and queue AMs flush each of their backing files. */
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __partition_sync(dbp);
+	else
+#endif
+	if (dbp->type == DB_QUEUE)
+		ret = __qam_sync(dbp);
+	else
+		/* Flush any dirty pages from the cache to the backing file. */
+		if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+			ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_associate --
+ * Associate another database as a secondary index to this one.
+ *
+ * PUBLIC: int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate(dbp, ip, txn, sdbp, callback, flags)
+	DB *dbp, *sdbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+	u_int32_t flags;
+{
+	DBC *pdbc, *sdbc;
+	DBT key, data, skey, *tskeyp;
+	ENV *env;
+	int build, ret, t_ret;
+	u_int32_t nskey;
+
+	env = dbp->env;
+	pdbc = sdbc = NULL;
+	ret = 0;
+
+	memset(&skey, 0, sizeof(DBT));
+	/* nskey/tskeyp track unconsumed callback keys for err cleanup. */
+	nskey = 0;
+	tskeyp = NULL;
+
+	/*
+	 * Check to see if the secondary is empty -- and thus if we should
+	 * build it -- before we link it in and risk making it show up in other
+	 * threads.  Do this first so that the databases remain unassociated on
+	 * error.
+	 */
+	build = 0;
+	if (LF_ISSET(DB_CREATE)) {
+		if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, 0)) != 0)
+			goto err;
+
+		/*
+		 * We don't care about key or data; we're just doing
+		 * an existence check.
+		 */
+		memset(&key, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+		F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+		if ((ret = __dbc_get(sdbc, &key, &data,
+		    (STD_LOCKING(sdbc) ? DB_RMW : 0) |
+		    DB_FIRST)) == DB_NOTFOUND) {
+			build = 1;
+			ret = 0;
+		}
+
+		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+
+		/* Reset for later error check. */
+		sdbc = NULL;
+
+		if (ret != 0)
+			goto err;
+	}
+
+	/*
+	 * Set up the database handle as a secondary.
+	 */
+	sdbp->s_callback = callback;
+	sdbp->s_primary = dbp;
+
+	/* Redirect get/close through the secondary-aware wrappers. */
+	sdbp->stored_get = sdbp->get;
+	sdbp->get = __db_secondary_get;
+
+	sdbp->stored_close = sdbp->close;
+	sdbp->close = __db_secondary_close_pp;
+
+	F_SET(sdbp, DB_AM_SECONDARY);
+
+	if (LF_ISSET(DB_IMMUTABLE_KEY))
+		FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY);
+
+	/*
+	 * Add the secondary to the list on the primary.  Do it here
+	 * so that we see any updates that occur while we're walking
+	 * the primary.
+	 */
+	MUTEX_LOCK(env, dbp->mutex);
+
+	/* See __db_s_next for an explanation of secondary refcounting. */
+	DB_ASSERT(env, sdbp->s_refcnt == 0);
+	sdbp->s_refcnt = 1;
+	LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	if (build) {
+		/*
+		 * We loop through the primary, putting each item we
+		 * find into the new secondary.
+		 *
+		 * If we're using CDB, opening these two cursors puts us
+		 * in a bit of a locking tangle: CDB locks are done on the
+		 * primary, so that we stay deadlock-free, but that means
+		 * that updating the secondary while we have a read cursor
+		 * open on the primary will self-block.  To get around this,
+		 * we force the primary cursor to use the same locker ID
+		 * as the secondary, so they won't conflict.  This should
+		 * be harmless even if we're not using CDB.
+		 */
+		if ((ret = __db_cursor(sdbp, ip, txn, &sdbc,
+		    CDB_LOCKING(sdbp->env) ? DB_WRITECURSOR : 0)) != 0)
+			goto err;
+		if ((ret = __db_cursor_int(dbp, ip,
+		    txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+			goto err;
+
+		/* Lock out other threads, now that we have a locker. */
+		dbp->associate_locker = sdbc->locker;
+
+		memset(&key, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		while ((ret = __dbc_get(pdbc, &key, &data, DB_NEXT)) == 0) {
+			if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
+				if (ret == DB_DONOTINDEX)
+					continue;
+				goto err;
+			}
+			/* The callback may return one key or a set of them. */
+			if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+				__db_check_skeyset(sdbp, &skey);
+#endif
+				nskey = skey.size;
+				tskeyp = (DBT *)skey.data;
+			} else {
+				nskey = 1;
+				tskeyp = &skey;
+			}
+			SWAP_IF_NEEDED(sdbp, &key);
+			for (; nskey > 0; nskey--, tskeyp++) {
+				if ((ret = __dbc_put(sdbc,
+				    tskeyp, &key, DB_UPDATE_SECONDARY)) != 0)
+					goto err;
+				FREE_IF_NEEDED(env, tskeyp);
+			}
+			SWAP_IF_NEEDED(sdbp, &key);
+			FREE_IF_NEEDED(env, &skey);
+		}
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+	}
+
+err:	if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	dbp->associate_locker = NULL;
+
+	/* On error, free any secondary keys the callback left allocated. */
+	for (; nskey > 0; nskey--, tskeyp++)
+		FREE_IF_NEEDED(env, tskeyp);
+	FREE_IF_NEEDED(env, &skey);
+
+	return (ret);
+}
+
+/*
+ * __db_secondary_get --
+ * This wrapper function for DB->pget() is the DB->get() function
+ * on a database which has been made into a secondary index.
+ */
+static int
+__db_secondary_get(sdbp, txn, skey, data, flags)
+	DB *sdbp;
+	DB_TXN *txn;
+	DBT *skey, *data;
+	u_int32_t flags;
+{
+	DB_ASSERT(sdbp->env, F_ISSET(sdbp, DB_AM_SECONDARY));
+	/* A secondary's get() is a pget() that discards the primary key. */
+	return (__db_pget_pp(sdbp, txn, skey, NULL, data, flags));
+}
+
+/*
+ * __db_secondary_close --
+ * Wrapper function for DB->close() which we use on secondaries to
+ * manage refcounting and make sure we don't close them underneath
+ * a primary that is updating.
+ *
+ * PUBLIC: int __db_secondary_close __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close(sdbp, flags)
+	DB *sdbp;
+	u_int32_t flags;
+{
+	DB *primary;
+	ENV *env;
+	int doclose;
+
+	doclose = 0;
+	primary = sdbp->s_primary;
+	env = primary->env;
+
+	MUTEX_LOCK(env, primary->mutex);
+	/*
+	 * Check the refcount--if it was at 1 when we were called, no
+	 * thread is currently updating this secondary through the primary,
+	 * so it's safe to close it for real.
+	 *
+	 * If it's not safe to do the close now, we do nothing; the
+	 * database will actually be closed when the refcount is decremented,
+	 * which can happen in either __db_s_next or __db_s_done.
+	 */
+	DB_ASSERT(env, sdbp->s_refcnt != 0);
+	if (--sdbp->s_refcnt == 0) {
+		LIST_REMOVE(sdbp, s_links);
+		/* We don't want to call close while the mutex is held. */
+		doclose = 1;
+	}
+	MUTEX_UNLOCK(env, primary->mutex);
+
+	/*
+	 * sdbp->close is this function; call the real one explicitly if
+	 * need be.
+	 */
+	return (doclose ? __db_close(sdbp, NULL, flags) : 0);
+}
+
+/*
+ * __db_associate_foreign --
+ * Associate this database (fdbp) as a foreign constraint to another
+ * database (pdbp). That is, dbp's keys appear as foreign key values in
+ * pdbp.
+ *
+ * PUBLIC: int __db_associate_foreign __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_associate_foreign(fdbp, pdbp, callback, flags)
+	DB *fdbp, *pdbp;
+	int (*callback)(DB *, const DBT *, DBT *, const DBT *, int *);
+	u_int32_t flags;
+{
+	DB_FOREIGN_INFO *f_info;
+	ENV *env;
+	int ret;
+
+	env = fdbp->env;
+	ret = 0;
+
+	/*
+	 * Associate fdbp as pdbp's foreign db, for referential integrity
+	 * checks.  We don't allow the foreign db to be changed, because we
+	 * currently have no way of removing pdbp from the old foreign db's
+	 * list of primaries.
+	 *
+	 * Check this before allocating or linking anything so that failure
+	 * leaves both handles untouched: the previous code returned EINVAL
+	 * only after inserting f_info into fdbp->f_primaries, leaking the
+	 * allocation and leaving a stale entry on the list.
+	 */
+	if (pdbp->s_foreign != NULL)
+		return (EINVAL);
+
+	if ((ret = __os_malloc(env, sizeof(DB_FOREIGN_INFO), &f_info)) != 0)
+		return (ret);
+	memset(f_info, 0, sizeof(DB_FOREIGN_INFO));
+
+	f_info->dbp = pdbp;
+	f_info->callback = callback;
+
+	/*
+	 * It might be wise to filter this, but for now the flags only
+	 * set the delete action type.
+	 */
+	FLD_SET(f_info->flags, flags);
+
+	/*
+	 * Add f_info to the foreign database's list of primaries.  That is to
+	 * say, fdbp->f_primaries lists all databases for which fdbp is a
+	 * foreign constraint.
+	 */
+	MUTEX_LOCK(env, fdbp->mutex);
+	LIST_INSERT_HEAD(&fdbp->f_primaries, f_info, f_links);
+	MUTEX_UNLOCK(env, fdbp->mutex);
+
+	pdbp->s_foreign = fdbp;
+
+	return (ret);
+}
+
+/*
+ * __dbc_set_priority --
+ *	DBC->set_priority method: record the cache priority to use for
+ *	pages accessed through this cursor.
+ */
+static int
+__dbc_set_priority(dbc, priority)
+	DBC *dbc;
+	DB_CACHE_PRIORITY priority;
+{
+	dbc->priority = priority;
+	return (0);
+}
+
+/*
+ * __dbc_get_priority --
+ *	DBC->get_priority method: return the cursor's current cache
+ *	priority through *priority.
+ */
+static int
+__dbc_get_priority(dbc, priority)
+	DBC *dbc;
+	DB_CACHE_PRIORITY *priority;
+{
+	*priority = dbc->priority;
+	return (0);
+}
diff --git a/db/db_auto.c b/db/db_auto.c
new file mode 100644
index 0000000..2ce4199
--- /dev/null
+++ b/db/db_auto.c
@@ -0,0 +1,3267 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __db_addrem_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_addrem_args **));
+ */
+/*
+ * __db_addrem_read --
+ *	Unmarshal a __db_addrem log record from the flat buffer recbuf into
+ *	a newly allocated __db_addrem_args; a zeroed DB_TXN is carved out
+ *	of the same allocation and hung off argp->txnp.  The DBT fields
+ *	(hdr, dbt) point directly into recbuf rather than copying it, so
+ *	recbuf must stay valid for as long as *argpp is used.  If dbpp is
+ *	non-NULL, the record's file id is also resolved to a DB handle.
+ */
+int
+__db_addrem_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_addrem_args **argpp;
+{
+	__db_addrem_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation holds both the args struct and its dummy DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_addrem_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &argp->opcode, bp);
+	bp += sizeof(argp->opcode);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &argp->indx, bp);
+	bp += sizeof(argp->indx);
+
+	LOGCOPY_32(env, &argp->nbytes, bp);
+	bp += sizeof(argp->nbytes);
+
+	memset(&argp->hdr, 0, sizeof(argp->hdr));
+	LOGCOPY_32(env,&argp->hdr.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->hdr.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->hdr.size;
+
+	memset(&argp->dbt, 0, sizeof(argp->dbt));
+	LOGCOPY_32(env,&argp->dbt.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->dbt.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->dbt.size;
+
+	LOGCOPY_TOLSN(env, &argp->pagelsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_addrem_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, u_int32_t, u_int32_t,
+ * PUBLIC: const DBT *, const DBT *, DB_LSN *));
+ */
+/*
+ * __db_addrem_log --
+ *	Marshal a __db_addrem log record (an item add/remove on a db page)
+ *	and write it.  Durable records (or a NULL txnp) go through
+ *	__log_put; non-durable transactional records are instead queued on
+ *	txnp->logs and *ret_lsnp is marked as not logged.  Returns 0 on
+ *	success or a non-zero error code.
+ */
+int
+__db_addrem_log(dbp, txnp, ret_lsnp, flags,
+    opcode, pgno, indx, nbytes, hdr,
+    dbt, pagelsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t opcode;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	u_int32_t nbytes;
+	const DBT *hdr;
+	const DBT *dbt;
+	DB_LSN * pagelsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_addrem;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header + each field, DBTs prefixed by size. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size)
+	    + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size)
+	    + sizeof(*pagelsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, bp, &opcode);
+	bp += sizeof(opcode);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, bp, &indx);
+	bp += sizeof(indx);
+
+	LOGCOPY_32(env, bp, &nbytes);
+	bp += sizeof(nbytes);
+
+	if (hdr == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &hdr->size);
+		bp += sizeof(hdr->size);
+		memcpy(bp, hdr->data, hdr->size);
+		bp += hdr->size;
+	}
+
+	if (dbt == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &dbt->size);
+		bp += sizeof(dbt->size);
+		memcpy(bp, dbt->data, dbt->size);
+		bp += dbt->size;
+	}
+
+	if (pagelsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(pagelsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, pagelsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, pagelsn);
+	} else
+		memset(bp, 0, sizeof(*pagelsn));
+	bp += sizeof(*pagelsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_addrem_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_big_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_big_args **));
+ */
+/*
+ * __db_big_read --
+ *	Unmarshal a __db_big log record from recbuf into a newly allocated
+ *	__db_big_args; a zeroed DB_TXN is carved out of the same
+ *	allocation.  The dbt field points into recbuf rather than copying
+ *	it, so recbuf must outlive *argpp.  If dbpp is non-NULL, the
+ *	record's file id is also resolved to a DB handle.
+ */
+int
+__db_big_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_big_args **argpp;
+{
+	__db_big_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_big_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &argp->opcode, bp);
+	bp += sizeof(argp->opcode);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->prev_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	memset(&argp->dbt, 0, sizeof(argp->dbt));
+	LOGCOPY_32(env,&argp->dbt.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->dbt.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->dbt.size;
+
+	LOGCOPY_TOLSN(env, &argp->pagelsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_TOLSN(env, &argp->prevlsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_TOLSN(env, &argp->nextlsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_big_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, db_pgno_t, db_pgno_t,
+ * PUBLIC: const DBT *, DB_LSN *, DB_LSN *, DB_LSN *));
+ */
+/*
+ * __db_big_log --
+ *	Marshal a __db_big log record (big-item / overflow page operation)
+ *	and write it.  Durable records (or a NULL txnp) go through
+ *	__log_put; non-durable transactional records are queued on
+ *	txnp->logs instead and *ret_lsnp is marked as not logged.
+ */
+int
+__db_big_log(dbp, txnp, ret_lsnp, flags,
+    opcode, pgno, prev_pgno, next_pgno, dbt,
+    pagelsn, prevlsn, nextlsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t opcode;
+	db_pgno_t pgno;
+	db_pgno_t prev_pgno;
+	db_pgno_t next_pgno;
+	const DBT *dbt;
+	DB_LSN * pagelsn;
+	DB_LSN * prevlsn;
+	DB_LSN * nextlsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_big;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header + each field, DBT prefixed by size. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size)
+	    + sizeof(*pagelsn)
+	    + sizeof(*prevlsn)
+	    + sizeof(*nextlsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, bp, &opcode);
+	bp += sizeof(opcode);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)prev_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)next_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (dbt == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &dbt->size);
+		bp += sizeof(dbt->size);
+		memcpy(bp, dbt->data, dbt->size);
+		bp += dbt->size;
+	}
+
+	if (pagelsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(pagelsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, pagelsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, pagelsn);
+	} else
+		memset(bp, 0, sizeof(*pagelsn));
+	bp += sizeof(*pagelsn);
+
+	if (prevlsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(prevlsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, prevlsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, prevlsn);
+	} else
+		memset(bp, 0, sizeof(*prevlsn));
+	bp += sizeof(*prevlsn);
+
+	if (nextlsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(nextlsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, nextlsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, nextlsn);
+	} else
+		memset(bp, 0, sizeof(*nextlsn));
+	bp += sizeof(*nextlsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_big_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_ovref_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_ovref_args **));
+ */
+/*
+ * __db_ovref_read --
+ *	Unmarshal a __db_ovref log record from recbuf into a newly
+ *	allocated __db_ovref_args; a zeroed DB_TXN is carved out of the
+ *	same allocation.  If dbpp is non-NULL, the record's file id is
+ *	also resolved to a DB handle.
+ */
+int
+__db_ovref_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_ovref_args **argpp;
+{
+	__db_ovref_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_ovref_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->adjust = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_ovref_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, int32_t, DB_LSN *));
+ */
+/*
+ * __db_ovref_log --
+ *	Marshal a __db_ovref log record (overflow-page reference count
+ *	adjustment) and write it.  Durable records (or a NULL txnp) go
+ *	through __log_put; non-durable transactional records are queued
+ *	on txnp->logs instead and *ret_lsnp is marked as not logged.
+ */
+int
+__db_ovref_log(dbp, txnp, ret_lsnp, flags, pgno, adjust, lsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	int32_t adjust;
+	DB_LSN * lsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_ovref;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*lsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)adjust;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, lsn);
+	} else
+		memset(bp, 0, sizeof(*lsn));
+	bp += sizeof(*lsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_ovref_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_relink_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_relink_42_args **));
+ */
+/*
+ * __db_relink_42_read --
+ *	Unmarshal a release-4.2-format __db_relink log record (kept for
+ *	recovery of old logs) from recbuf into a newly allocated
+ *	__db_relink_42_args; a zeroed DB_TXN is carved out of the same
+ *	allocation.  If dbpp is non-NULL, the record's file id is also
+ *	resolved to a DB handle.
+ */
+int
+__db_relink_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_relink_42_args **argpp;
+{
+	__db_relink_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_relink_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &argp->opcode, bp);
+	bp += sizeof(argp->opcode);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->prev = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn_prev, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn_next, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_debug_read __P((ENV *, void *, __db_debug_args **));
+ */
+/*
+ * __db_debug_read --
+ *	Unmarshal a __db_debug log record from recbuf into a newly
+ *	allocated __db_debug_args; a zeroed DB_TXN is carved out of the
+ *	same allocation.  The DBT fields (op, key, data) point into recbuf
+ *	rather than copying it, so recbuf must outlive *argpp.  Unlike the
+ *	other _read routines, this record carries no td and no DB handle
+ *	lookup.
+ */
+int
+__db_debug_read(env, recbuf, argpp)
+	ENV *env;
+	void *recbuf;
+	__db_debug_args **argpp;
+{
+	__db_debug_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_debug_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	memset(&argp->op, 0, sizeof(argp->op));
+	LOGCOPY_32(env,&argp->op.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->op.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->op.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	memset(&argp->key, 0, sizeof(argp->key));
+	LOGCOPY_32(env,&argp->key.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->key.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->key.size;
+
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->data.size;
+
+	LOGCOPY_32(env, &argp->arg_flags, bp);
+	bp += sizeof(argp->arg_flags);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_debug_log __P((ENV *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, const DBT *, int32_t, const DBT *, const DBT *,
+ * PUBLIC: u_int32_t));
+ */
+/*
+ * __db_debug_log --
+ *	Marshal a __db_debug log record and write it.  This record is
+ *	environment-wide (takes an ENV, not a DB), so there is no
+ *	log-file-id registration or active-kids check.  Durable records
+ *	(or a NULL txnp) go through __log_put; non-durable transactional
+ *	records are queued on txnp->logs instead and *ret_lsnp is marked
+ *	as not logged.
+ */
+int
+__db_debug_log(env, txnp, ret_lsnp, flags,
+    op, fileid, key, data, arg_flags)
+	ENV *env;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	const DBT *op;
+	int32_t fileid;
+	const DBT *key;
+	const DBT *data;
+	u_int32_t arg_flags;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	rlsnp = ret_lsnp;
+	rectype = DB___db_debug;
+	npad = 0;
+	ret = 0;
+
+	if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* Record size: fixed header + each field, DBTs prefixed by size. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t) + (op == NULL ? 0 : op->size)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (key == NULL ? 0 : key->size)
+	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size)
+	    + sizeof(u_int32_t);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	if (op == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &op->size);
+		bp += sizeof(op->size);
+		memcpy(bp, op->data, op->size);
+		bp += op->size;
+	}
+
+	uinttmp = (u_int32_t)fileid;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (key == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &key->size);
+		bp += sizeof(key->size);
+		memcpy(bp, key->data, key->size);
+		bp += key->size;
+	}
+
+	if (data == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &data->size);
+		bp += sizeof(data->size);
+		memcpy(bp, data->data, data->size);
+		bp += data->size;
+	}
+
+	LOGCOPY_32(env, bp, &arg_flags);
+	bp += sizeof(arg_flags);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_debug_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_noop_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_noop_args **));
+ */
+/*
+ * __db_noop_read --
+ *	Unmarshal a __db_noop log record from recbuf into a newly
+ *	allocated __db_noop_args; a zeroed DB_TXN is carved out of the
+ *	same allocation.  If dbpp is non-NULL, the record's file id is
+ *	also resolved to a DB handle.
+ */
+int
+__db_noop_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_noop_args **argpp;
+{
+	__db_noop_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_noop_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->prevlsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_noop_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *));
+ */
+/*
+ * __db_noop_log --
+ *	Marshal a __db_noop log record (a page-touch record with no
+ *	semantic change) and write it.  Durable records (or a NULL txnp)
+ *	go through __log_put; non-durable transactional records are
+ *	queued on txnp->logs instead and *ret_lsnp is marked as not
+ *	logged.
+ */
+int
+__db_noop_log(dbp, txnp, ret_lsnp, flags, pgno, prevlsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	DB_LSN * prevlsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_noop;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*prevlsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (prevlsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(prevlsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, prevlsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, prevlsn);
+	} else
+		memset(bp, 0, sizeof(*prevlsn));
+	bp += sizeof(*prevlsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_noop_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_alloc_42_args **));
+ */
+/*
+ * __db_pg_alloc_42_read --
+ *	Unmarshal a release-4.2-format __db_pg_alloc log record (kept for
+ *	recovery of old logs) from recbuf into a newly allocated
+ *	__db_pg_alloc_42_args; a zeroed DB_TXN is carved out of the same
+ *	allocation.  If dbpp is non-NULL, the record's file id is also
+ *	resolved to a DB handle.
+ */
+int
+__db_pg_alloc_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_alloc_42_args **argpp;
+{
+	__db_pg_alloc_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_alloc_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->page_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &argp->ptype, bp);
+	bp += sizeof(argp->ptype);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_alloc_args **));
+ */
+/*
+ * __db_pg_alloc_read --
+ *	Decode a current-format __db_pg_alloc log record from recbuf into
+ *	a freshly allocated __db_pg_alloc_args; the caller owns *argpp and
+ *	releases it with __os_free().  Differs from the _42 variant only
+ *	by the trailing last_pgno field.  Looks machine-generated
+ *	(gen_rec.awk style) -- regenerate rather than hand-edit; confirm.
+ */
+int
+__db_pg_alloc_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_alloc_args **argpp;
+{
+	__db_pg_alloc_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_alloc_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->page_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &argp->ptype, bp);
+	bp += sizeof(argp->ptype);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->last_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, DB_LSN *, db_pgno_t, DB_LSN *, db_pgno_t, u_int32_t,
+ * PUBLIC: db_pgno_t, db_pgno_t));
+ */
+/*
+ * __db_pg_alloc_log --
+ *	Marshal and write a __db_pg_alloc log record (page allocation).
+ *	Durable records (or records outside any transaction) go straight
+ *	to __log_put(); non-durable transactional records are queued on
+ *	txnp->logs instead and *ret_lsnp is marked not-logged.
+ */
+int
+__db_pg_alloc_log(dbp, txnp, ret_lsnp, flags, meta_lsn, meta_pgno, page_lsn, pgno, ptype,
+    next, last_pgno)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	DB_LSN * meta_lsn;
+	db_pgno_t meta_pgno;
+	DB_LSN * page_lsn;
+	db_pgno_t pgno;
+	u_int32_t ptype;
+	db_pgno_t next;
+	db_pgno_t last_pgno;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_pg_alloc;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header plus one 32-bit/LSN slot per field. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(*meta_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(*page_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (meta_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			/*
+			 * NOTE(review): this early return leaks the buffer
+			 * allocated above (logrec.data or lr); generated
+			 * code -- fix belongs in the generator.
+			 */
+			if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, meta_lsn);
+	} else
+		memset(bp, 0, sizeof(*meta_lsn));
+	bp += sizeof(*meta_lsn);
+
+	uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (page_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(page_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, page_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, page_lsn);
+	} else
+		memset(bp, 0, sizeof(*page_lsn));
+	bp += sizeof(*page_lsn);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, bp, &ptype);
+	bp += sizeof(ptype);
+
+	uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)last_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_pg_alloc_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_free_42_args **));
+ */
+/*
+ * __db_pg_free_42_read --
+ *	Decode a __db_pg_free record in the release-4.2 log layout from
+ *	recbuf into a freshly allocated __db_pg_free_42_args; the caller
+ *	owns *argpp and releases it with __os_free().  The header DBT
+ *	aliases recbuf (no copy), so recbuf must outlive *argpp.
+ */
+int
+__db_pg_free_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_free_42_args **argpp;
+{
+	__db_pg_free_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_free_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_free_args **));
+ */
+/*
+ * __db_pg_free_read --
+ *	Decode a current-format __db_pg_free log record from recbuf into
+ *	a freshly allocated __db_pg_free_args; the caller owns *argpp and
+ *	releases it with __os_free().  The header DBT aliases recbuf (no
+ *	copy); on byte-swapped logs the embedded page image is swapped in
+ *	place, so recbuf is modified as well as read.
+ */
+int
+__db_pg_free_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_free_args **argpp;
+{
+	__db_pg_free_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_free_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		/* NOTE(review): this early return leaks argp (generated). */
+		if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->header.data,
+		    (size_t)argp->header.size, NULL, 1)) != 0)
+			return (t_ret);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->last_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, const DBT *,
+ * PUBLIC: db_pgno_t, db_pgno_t));
+ */
+/*
+ * __db_pg_free_log --
+ *	Marshal and write a __db_pg_free log record (page free, header
+ *	image included).  Durable records (or records outside any
+ *	transaction) go straight to __log_put(); non-durable
+ *	transactional records are queued on txnp->logs instead and
+ *	*ret_lsnp is marked not-logged.
+ */
+int
+__db_pg_free_log(dbp, txnp, ret_lsnp, flags, pgno, meta_lsn, meta_pgno, header, next,
+    last_pgno)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	DB_LSN * meta_lsn;
+	db_pgno_t meta_pgno;
+	const DBT *header;
+	db_pgno_t next;
+	db_pgno_t last_pgno;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_pg_free;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header, fixed fields, variable header DBT. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*meta_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (meta_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			/*
+			 * NOTE(review): this early return leaks the buffer
+			 * allocated above (logrec.data or lr); generated
+			 * code -- fix belongs in the generator.
+			 */
+			if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, meta_lsn);
+	} else
+		memset(bp, 0, sizeof(*meta_lsn));
+	bp += sizeof(*meta_lsn);
+
+	uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	/* Header DBT: size word then bytes; zero size when absent. */
+	if (header == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &header->size);
+		bp += sizeof(header->size);
+		memcpy(bp, header->data, header->size);
+		/* On byte-swapped logs, swap the copied page image. */
+		if (LOG_SWAPPED(env))
+			if ((ret = __db_pageswap(dbp,
+			    (PAGE *)bp, (size_t)header->size, (DBT *)NULL, 0)) != 0)
+				return (ret);
+		bp += header->size;
+	}
+
+	uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)last_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_pg_free_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_cksum_read __P((ENV *, void *, __db_cksum_args **));
+ */
+/*
+ * __db_cksum_read --
+ *	Decode a __db_cksum log record (header-only: type, txnid and
+ *	previous LSN; no payload fields) from recbuf into a freshly
+ *	allocated __db_cksum_args; the caller owns *argpp and releases
+ *	it with __os_free().
+ */
+int
+__db_cksum_read(env, recbuf, argpp)
+	ENV *env;
+	void *recbuf;
+	__db_cksum_args **argpp;
+{
+	__db_cksum_args *argp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_cksum_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_cksum_log __P((ENV *, DB_TXN *, DB_LSN *, u_int32_t));
+ */
+/*
+ * __db_cksum_log --
+ *	Marshal and write a __db_cksum log record.  The record carries
+ *	only the standard header (type, txnid, previous LSN).  Unlike the
+ *	per-DB loggers this takes the ENV directly, so durability is
+ *	controlled solely by the DB_LOG_NOT_DURABLE flag.
+ */
+int
+__db_cksum_log(env, txnp, ret_lsnp, flags)
+	ENV *env;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	u_int32_t rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	rlsnp = ret_lsnp;
+	rectype = DB___db_cksum;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* Header-only record: no payload fields follow. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_cksum_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_freedata_42_args **));
+ */
+/*
+ * __db_pg_freedata_42_read --
+ *	Decode a __db_pg_freedata record in the release-4.2 log layout
+ *	from recbuf into a freshly allocated __db_pg_freedata_42_args;
+ *	the caller owns *argpp and releases it with __os_free().  The
+ *	header and data DBTs alias recbuf (no copy), so recbuf must
+ *	outlive *argpp.
+ */
+int
+__db_pg_freedata_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_freedata_42_args **argpp;
+{
+	__db_pg_freedata_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_freedata_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page data: size word, then raw bytes in place. */
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_freedata_args **));
+ */
+/*
+ * __db_pg_freedata_read --
+ *	Decode a current-format __db_pg_freedata log record from recbuf
+ *	into a freshly allocated __db_pg_freedata_args; the caller owns
+ *	*argpp and releases it with __os_free().  The header and data
+ *	DBTs alias recbuf (no copy); on byte-swapped logs the embedded
+ *	page image is swapped in place, so recbuf is modified as well.
+ */
+int
+__db_pg_freedata_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_freedata_args **argpp;
+{
+	__db_pg_freedata_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_freedata_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->last_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page data: size word, then raw bytes in place. */
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		/* NOTE(review): this early return leaks argp (generated). */
+		if ((t_ret = __db_pageswap(*dbpp,
+		    (PAGE *)argp->header.data, (size_t)argp->header.size,
+		    &argp->data, 1)) != 0)
+			return (t_ret);
+	}
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, const DBT *,
+ * PUBLIC: db_pgno_t, db_pgno_t, const DBT *));
+ */
+/*
+ * __db_pg_freedata_log --
+ *	Marshal and write a __db_pg_freedata log record (page free with
+ *	both header and data images).  Durable records (or records
+ *	outside any transaction) go straight to __log_put(); non-durable
+ *	transactional records are queued on txnp->logs instead and
+ *	*ret_lsnp is marked not-logged.
+ */
+int
+__db_pg_freedata_log(dbp, txnp, ret_lsnp, flags, pgno, meta_lsn, meta_pgno, header, next,
+    last_pgno, data)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	DB_LSN * meta_lsn;
+	db_pgno_t meta_pgno;
+	const DBT *header;
+	db_pgno_t next;
+	db_pgno_t last_pgno;
+	const DBT *data;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_pg_freedata;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed fields plus the two variable-length DBTs. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*meta_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (meta_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			/*
+			 * NOTE(review): this early return leaks the buffer
+			 * allocated above (logrec.data or lr); generated
+			 * code -- fix belongs in the generator.
+			 */
+			if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, meta_lsn);
+	} else
+		memset(bp, 0, sizeof(*meta_lsn));
+	bp += sizeof(*meta_lsn);
+
+	uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	/* Header DBT: size word then bytes; zero size when absent. */
+	if (header == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &header->size);
+		bp += sizeof(header->size);
+		memcpy(bp, header->data, header->size);
+		/* Swapping the header also swaps the companion data DBT. */
+		if (LOG_SWAPPED(env))
+			if ((ret = __db_pageswap(dbp,
+			    (PAGE *)bp, (size_t)header->size, (DBT *)data, 0)) != 0)
+				return (ret);
+		bp += header->size;
+	}
+
+	uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)last_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	/* Data DBT: size word then bytes; zero size when absent. */
+	if (data == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &data->size);
+		bp += sizeof(data->size);
+		memcpy(bp, data->data, data->size);
+		/* Free the swap-allocated copy made by __db_pageswap above. */
+		if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC))
+			__os_free(env, data->data);
+		bp += data->size;
+	}
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_pg_freedata_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_init_args **));
+ */
+/*
+ * __db_pg_init_read --
+ *	Decode a __db_pg_init log record from recbuf into a freshly
+ *	allocated __db_pg_init_args; the caller owns *argpp and releases
+ *	it with __os_free().  The header and data DBTs alias recbuf (no
+ *	copy); on byte-swapped logs the embedded page image is swapped
+ *	in place, so recbuf is modified as well as read.
+ */
+int
+__db_pg_init_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_init_args **argpp;
+{
+	__db_pg_init_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_init_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	/* Variable-length page data: size word, then raw bytes in place. */
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		/* NOTE(review): this early return leaks argp (generated). */
+		if ((t_ret = __db_pageswap(*dbpp,
+		    (PAGE *)argp->header.data, (size_t)argp->header.size,
+		    &argp->data, 1)) != 0)
+			return (t_ret);
+	}
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, const DBT *, const DBT *));
+ */
+/*
+ * __db_pg_init_log --
+ *	Marshal and write a __db_pg_init log record (page pgno reinitialized,
+ *	with optional page "header" and "data" DBTs).  Auto-generated by
+ *	gen_rec.awk -- do not hand-edit the logic.  Returns 0 or a BDB error.
+ */
+int
+__db_pg_init_log(dbp, txnp, ret_lsnp, flags, pgno, header, data)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ const DBT *header;
+ const DBT *data;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___db_pg_init;
+ npad = 0;
+ ret = 0;
+
+ /* Non-durable records outside a transaction are simply dropped. */
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ /* Fixed-size prefix plus length-prefixed header and data payloads. */
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+ + sizeof(u_int32_t) + (data == NULL ? 0 : data->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (header == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &header->size);
+ bp += sizeof(header->size);
+ memcpy(bp, header->data, header->size);
+ /*
+ * NOTE(review): if __db_pageswap fails here, logrec.data
+ * (and lr in the non-durable path) is leaked.  Upstream
+ * gen_rec.awk output behaves identically; confirm before
+ * changing generated code.
+ */
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)header->size, (DBT *)data, 0)) != 0)
+ return (ret);
+ bp += header->size;
+ }
+
+ if (data == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &data->size);
+ bp += sizeof(data->size);
+ memcpy(bp, data->data, data->size);
+ if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC))
+ __os_free(env, data->data);
+ bp += data->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__db_pg_init_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_sort_44_args **));
+ */
+/*
+ * __db_pg_sort_44_read --
+ *	Unmarshal a (release 4.4 format) __db_pg_sort log record from recbuf
+ *	into a freshly allocated __db_pg_sort_44_args; caller frees *argpp.
+ *	Auto-generated by gen_rec.awk -- do not hand-edit.
+ */
+int
+__db_pg_sort_44_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __db_pg_sort_44_args **argpp;
+{
+ __db_pg_sort_44_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ /* One allocation covers both the args struct and its DB_TXN shim. */
+ if ((ret = __os_malloc(env,
+ sizeof(__db_pg_sort_44_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->meta = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_free = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->last_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ /* list.data aliases recbuf; it is not a separate allocation. */
+ memset(&argp->list, 0, sizeof(argp->list));
+ LOGCOPY_32(env,&argp->list.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->list.data = bp;
+ bp += argp->list.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_trunc_args **));
+ */
+/*
+ * __db_pg_trunc_read --
+ *	Unmarshal a __db_pg_trunc log record from recbuf into a freshly
+ *	allocated __db_pg_trunc_args; caller frees *argpp.  Auto-generated
+ *	by gen_rec.awk -- do not hand-edit.
+ */
+int
+__db_pg_trunc_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __db_pg_trunc_args **argpp;
+{
+ __db_pg_trunc_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ /* One allocation covers both the args struct and its DB_TXN shim. */
+ if ((ret = __os_malloc(env,
+ sizeof(__db_pg_trunc_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->meta = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_free = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->last_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->next_free = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ /* list.data aliases recbuf; it is not a separate allocation. */
+ memset(&argp->list, 0, sizeof(argp->list));
+ LOGCOPY_32(env,&argp->list.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->list.data = bp;
+ bp += argp->list.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, db_pgno_t,
+ * PUBLIC: db_pgno_t, const DBT *));
+ */
+/*
+ * __db_pg_trunc_log --
+ *	Marshal and write a __db_pg_trunc log record describing a file
+ *	truncation (meta page, last free page, free list).  Auto-generated
+ *	by gen_rec.awk -- do not hand-edit the logic.
+ */
+int
+__db_pg_trunc_log(dbp, txnp, ret_lsnp, flags, meta, meta_lsn, last_free, last_lsn, next_free,
+ last_pgno, list)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t meta;
+ DB_LSN * meta_lsn;
+ db_pgno_t last_free;
+ DB_LSN * last_lsn;
+ db_pgno_t next_free;
+ db_pgno_t last_pgno;
+ const DBT *list;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___db_pg_trunc;
+ npad = 0;
+ ret = 0;
+
+ /* Non-durable records outside a transaction are simply dropped. */
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ /* Fixed-size prefix plus length-prefixed free-page list payload. */
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*meta_lsn)
+ + sizeof(u_int32_t)
+ + sizeof(*last_lsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (list == NULL ? 0 : list->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)meta;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ /*
+ * NOTE(review): the early returns after __log_check_page_lsn below
+ * leak logrec.data (and lr in the non-durable path).  Upstream
+ * gen_rec.awk output behaves identically; confirm before changing
+ * generated code.
+ */
+ if (meta_lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, meta_lsn);
+ } else
+ memset(bp, 0, sizeof(*meta_lsn));
+ bp += sizeof(*meta_lsn);
+
+ uinttmp = (u_int32_t)last_free;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (last_lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(last_lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, last_lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, last_lsn);
+ } else
+ memset(bp, 0, sizeof(*last_lsn));
+ bp += sizeof(*last_lsn);
+
+ uinttmp = (u_int32_t)next_free;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)last_pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (list == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &list->size);
+ bp += sizeof(list->size);
+ memcpy(bp, list->data, list->size);
+ bp += list->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__db_pg_trunc_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_init_recover __P((ENV *, DB_DISTAB *));
+ */
+/*
+ * __db_init_recover --
+ *	Register the recovery function for every current-format __db log
+ *	record type in the dispatch table.  Auto-generated by gen_rec.awk.
+ *	Returns 0 or the first registration error.
+ */
+int
+__db_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_recover, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_recover, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_recover, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_recover, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_recover, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_recover, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_recover, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_recover, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_recover, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_recover, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_recover, DB___db_pg_trunc)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/db_autop.c b/db/db_autop.c
new file mode 100644
index 0000000..f3b0635
--- /dev/null
+++ b/db/db_autop.c
@@ -0,0 +1,802 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __db_addrem_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_addrem_print --
+ *	Print a __db_addrem log record in human-readable form (db_printlog).
+ *	Auto-generated by gen_rec.awk; notused2/notused3 satisfy the common
+ *	recovery-function signature.
+ */
+int
+__db_addrem_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_addrem_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_addrem%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\topcode: %lu\n", (u_long)argp->opcode);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tnbytes: %lu\n", (u_long)argp->nbytes);
+ (void)printf("\thdr: ");
+ /* Dump binary DBTs byte-by-byte: printable chars as-is, rest hex. */
+ for (i = 0; i < argp->hdr.size; i++) {
+ ch = ((u_int8_t *)argp->hdr.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdbt: ");
+ for (i = 0; i < argp->dbt.size; i++) {
+ ch = ((u_int8_t *)argp->dbt.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpagelsn: [%lu][%lu]\n",
+ (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+/*
+ * __db_big_print --
+ *	Print a __db_big (overflow page) log record in human-readable form.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_big_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_big_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_big_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_big%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\topcode: %lu\n", (u_long)argp->opcode);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tprev_pgno: %lu\n", (u_long)argp->prev_pgno);
+ (void)printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno);
+ (void)printf("\tdbt: ");
+ for (i = 0; i < argp->dbt.size; i++) {
+ ch = ((u_int8_t *)argp->dbt.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpagelsn: [%lu][%lu]\n",
+ (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
+ (void)printf("\tprevlsn: [%lu][%lu]\n",
+ (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset);
+ (void)printf("\tnextlsn: [%lu][%lu]\n",
+ (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_ovref_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_ovref_print --
+ *	Print a __db_ovref (overflow reference-count adjust) log record.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_ovref_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_ovref_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_ovref_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_ovref%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tadjust: %ld\n", (long)argp->adjust);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_relink_42_print --
+ *	Print a release 4.2 format __db_relink log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_relink_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_relink_42_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_relink_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_relink_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\topcode: %lu\n", (u_long)argp->opcode);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tprev: %lu\n", (u_long)argp->prev);
+ (void)printf("\tlsn_prev: [%lu][%lu]\n",
+ (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlsn_next: [%lu][%lu]\n",
+ (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_debug_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_debug_print --
+ *	Print a __db_debug log record.  Note __db_debug_read takes only
+ *	(env, recbuf, argpp) -- debug records carry no fileid lookup.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_debug_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_debug_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_debug%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\top: ");
+ for (i = 0; i < argp->op.size; i++) {
+ ch = ((u_int8_t *)argp->op.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tkey: ");
+ for (i = 0; i < argp->key.size; i++) {
+ ch = ((u_int8_t *)argp->key.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\targ_flags: %lu\n", (u_long)argp->arg_flags);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_noop_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_noop_print --
+ *	Print a __db_noop log record.  Auto-generated by gen_rec.awk.
+ */
+int
+__db_noop_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_noop_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_noop_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_noop%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tprevlsn: [%lu][%lu]\n",
+ (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_alloc_42_print --
+ *	Print a release 4.2 format __db_pg_alloc log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_alloc_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_alloc_42_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_alloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_alloc_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\tpage_lsn: [%lu][%lu]\n",
+ (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tptype: %lu\n", (u_long)argp->ptype);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_alloc_print --
+ *	Print a current-format __db_pg_alloc log record (adds last_pgno
+ *	relative to the 4.2 layout).  Auto-generated by gen_rec.awk.
+ */
+int
+__db_pg_alloc_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_alloc_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_alloc%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\tpage_lsn: [%lu][%lu]\n",
+ (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tptype: %lu\n", (u_long)argp->ptype);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_free_42_print --
+ *	Print a release 4.2 format __db_pg_free log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_free_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_free_42_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_free_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_free_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_free_print --
+ *	Print a current-format __db_pg_free log record.  Auto-generated by
+ *	gen_rec.awk.
+ */
+int
+__db_pg_free_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_free_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_free_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_free%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_cksum_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_cksum_print --
+ *	Print a __db_cksum log record (header fields only; the record has
+ *	no payload).  Auto-generated by gen_rec.awk.
+ */
+int
+__db_cksum_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_cksum_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_cksum%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_freedata_42_print --
+ *	Print a release 4.2 format __db_pg_freedata log record.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_pg_freedata_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_freedata_42_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_freedata_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_freedata_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_freedata_print --
+ *	Print a current-format __db_pg_freedata log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_freedata_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_freedata_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_freedata_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_freedata%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_init_print --
+ *	Print a __db_pg_init log record.  Auto-generated by gen_rec.awk.
+ */
+int
+__db_pg_init_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_init_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_init_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_init%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_sort_44_print --
+ *	Print a release 4.4 format __db_pg_sort log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_sort_44_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_sort_44_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_sort_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_sort_44%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta: %lu\n", (u_long)argp->meta);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tlast_free: %lu\n", (u_long)argp->last_free);
+ (void)printf("\tlast_lsn: [%lu][%lu]\n",
+ (u_long)argp->last_lsn.file, (u_long)argp->last_lsn.offset);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\tlist: ");
+ for (i = 0; i < argp->list.size; i++) {
+ ch = ((u_int8_t *)argp->list.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * __db_pg_trunc_print --
+ * Pretty-print a __db_pg_trunc (page truncation) log record for the
+ * log-print utility.
+ *
+ * PUBLIC: int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_trunc_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_trunc_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ /* The recops/arg parameters are unused; assign to quiet compilers. */
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_trunc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_trunc%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta: %lu\n", (u_long)argp->meta);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tlast_free: %lu\n", (u_long)argp->last_free);
+ (void)printf("\tlast_lsn: [%lu][%lu]\n",
+ (u_long)argp->last_lsn.file, (u_long)argp->last_lsn.offset);
+ (void)printf("\tnext_free: %lu\n", (u_long)argp->next_free);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\tlist: ");
+ /* Dump the DBT byte-by-byte, escaping unprintable characters. */
+ for (i = 0; i < argp->list.size; i++) {
+ ch = ((u_int8_t *)argp->list.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * __db_init_print --
+ * Register all of the db subsystem's log-record print routines in the
+ * dispatch table, one entry per record type.
+ *
+ * NOTE(review): __db_pg_sort_44_print is defined above but not registered
+ * here; presumably it is only dispatched when replaying old-format (4.4)
+ * logs -- confirm against the recovery dispatch setup.
+ *
+ * PUBLIC: int __db_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_print, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_print, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_print, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_print, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_print, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_print, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_print, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_print, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_print, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_print, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_print, DB___db_pg_trunc)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/db_cam.c b/db/db_cam.c
new file mode 100644
index 0000000..4c1322d
--- /dev/null
+++ b/db/db_cam.c
@@ -0,0 +1,3460 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_s_count __P((DB *));
+static int __db_wrlock_err __P((ENV *));
+static int __dbc_del_foreign __P((DBC *));
+static int __dbc_del_oldskey __P((DB *, DBC *, DBT *, DBT *, DBT *));
+static int __dbc_del_secondary __P((DBC *));
+static int __dbc_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_append __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_fixed_len __P((DBC *, DBT *, DBT *));
+static inline int __dbc_put_partial __P((DBC *,
+ DBT *, DBT *, DBT *, DBT *, u_int32_t *, u_int32_t));
+static int __dbc_put_primary __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_resolve_key __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_secondaries __P((DBC *,
+ DBT *, DBT *, DBT *, int, DBT *, u_int32_t *));
+
+/*
+ * CDB_LOCKING_INIT / CDB_LOCKING_DONE --
+ * Acquire/release the CDB write lock around a cursor update.
+ *
+ * NOTE: CDB_LOCKING_INIT is deliberately NOT wrapped in do {} while (0):
+ * it expands to an if-statement that can "return" from the enclosing
+ * function, and it assigns to a local variable named "ret" which every
+ * caller must declare.
+ */
+#define CDB_LOCKING_INIT(env, dbc) \
+ /* \
+ * If we are running CDB, this had better be either a write \
+ * cursor or an immediate writer. If it's a regular writer, \
+ * that means we have an IWRITE lock and we need to upgrade \
+ * it to a write lock. \
+ */ \
+ if (CDB_LOCKING(env)) { \
+ if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER)) \
+ return (__db_wrlock_err(env)); \
+ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR) && \
+ (ret = __lock_get(env, \
+ (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \
+ DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \
+ return (ret); \
+ }
+#define CDB_LOCKING_DONE(env, dbc) \
+ /* Release the upgraded lock. */ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) \
+ (void)__lock_downgrade( \
+ env, &(dbc)->mylock, DB_LOCK_IWRITE, 0);
+
+/*
+ * SET_READ_LOCKING_FLAGS --
+ * Map the caller's DB_READ_COMMITTED/DB_READ_UNCOMMITTED request flags
+ * into temporary DBC_* cursor flags (in "var"), then clear them from the
+ * local "flags" variable via LF_CLR. Existing cursor isolation settings
+ * take precedence over per-call flags.
+ */
+#define SET_READ_LOCKING_FLAGS(dbc, var) do { \
+ var = 0; \
+ if (!F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED)) { \
+ if (LF_ISSET(DB_READ_COMMITTED)) \
+ var = DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED; \
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) \
+ var = DBC_READ_UNCOMMITTED; \
+ } \
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); \
+} while (0)
+
+/*
+ * __dbc_close --
+ * DBC->close.
+ *
+ * Removes the cursor (and any off-page duplicate cursor hanging off it)
+ * from the database's active queue, runs the access-method close routine,
+ * releases CDB locks, and parks the cursor structures on the free queue
+ * for reuse.
+ *
+ * PUBLIC: int __dbc_close __P((DBC *));
+ */
+int
+__dbc_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBC *opd;
+ DBC_INTERNAL *cp;
+ DB_TXN *txn;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = dbc->internal;
+ opd = cp->opd;
+ ret = 0;
+
+ /*
+ * Remove the cursor(s) from the active queue. We may be closing two
+ * cursors at once here, a top-level one and a lower-level, off-page
+ * duplicate one. The access-method specific cursor close routine must
+ * close both of them in a single call.
+ *
+ * !!!
+ * Cursors must be removed from the active queue before calling the
+ * access specific cursor close routine, btree depends on having that
+ * order of operations.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+ if (opd != NULL) {
+ DB_ASSERT(env, F_ISSET(opd, DBC_ACTIVE));
+ F_CLR(opd, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, opd, links);
+ }
+ DB_ASSERT(env, F_ISSET(dbc, DBC_ACTIVE));
+ F_CLR(dbc, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, dbc, links);
+
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Call the access specific cursor close routine. */
+ if ((t_ret =
+ dbc->am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Release the lock after calling the access method specific close
+ * routine, a Btree cursor may have had pending deletes.
+ */
+ if (CDB_LOCKING(env)) {
+ /*
+ * Also, be sure not to free anything if mylock.off is
+ * INVALID; in some cases, such as idup'ed read cursors
+ * and secondary update cursors, a cursor in a CDB
+ * environment may not have a lock at all.
+ */
+ if ((t_ret = __LPUT(dbc, dbc->mylock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* For safety's sake, since this is going on the free queue. */
+ memset(&dbc->mylock, 0, sizeof(dbc->mylock));
+ if (opd != NULL)
+ memset(&opd->mylock, 0, sizeof(opd->mylock));
+ }
+
+ /* The top-level cursor counts against its transaction's cursor count. */
+ if ((txn = dbc->txn) != NULL)
+ txn->cursors--;
+
+ /* Move the cursor(s) to the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ if (opd != NULL) {
+ /* The off-page duplicate cursor was counted separately. */
+ if (txn != NULL)
+ txn->cursors--;
+ TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links);
+ opd = NULL;
+ }
+ TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /*
+ * A private (internally created) transaction is committed as soon as
+ * its last cursor is closed.
+ */
+ if (txn != NULL && F_ISSET(txn, TXN_PRIVATE) && txn->cursors == 0 &&
+ (t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __dbc_destroy --
+ * Destroy the cursor, called after DBC->close.
+ *
+ * Frees the cursor's cached return-memory buffers, runs the access-method
+ * destroy routine, releases the cursor's locker id (if it owns one), and
+ * frees the cursor structure itself.
+ *
+ * PUBLIC: int __dbc_destroy __P((DBC *));
+ */
+int
+__dbc_destroy(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Remove the cursor from the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Free up allocated memory. */
+ if (dbc->my_rskey.data != NULL)
+ __os_free(env, dbc->my_rskey.data);
+ if (dbc->my_rkey.data != NULL)
+ __os_free(env, dbc->my_rkey.data);
+ if (dbc->my_rdata.data != NULL)
+ __os_free(env, dbc->my_rdata.data);
+
+ /* Call the access specific cursor destroy routine. */
+ ret = dbc->am_destroy == NULL ? 0 : dbc->am_destroy(dbc);
+
+ /*
+ * Release the lock id for this cursor, but only if this cursor
+ * allocated its own locker id (DBC_OWN_LID).
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbc, DBC_OWN_LID) &&
+ (t_ret = __lock_id_free(env, dbc->lref)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __dbc_cmp --
+ * Compare the position of two cursors. Return whether two cursors are
+ * pointing to the same key/data pair.
+ *
+ * result == 0 if both cursors refer to the same item.
+ * result == 1 otherwise
+ *
+ * PUBLIC: int __dbc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__dbc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DBC *curr_dbc, *curr_odbc;
+ DBC_INTERNAL *dbc_int, *odbc_int;
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+ ret = 0;
+
+#ifdef HAVE_PARTITION
+ /* For partitioned databases compare the underlying sub-cursors. */
+ if (DB_IS_PARTITIONED(dbc->dbp)) {
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+ other_dbc = ((PART_CURSOR *)other_dbc->internal)->sub_cursor;
+ }
+ /* Both cursors must still be valid. */
+ if (dbc == NULL || other_dbc == NULL) {
+ __db_errx(env,
+"Both cursors must be initialized before calling DBC->cmp.");
+ return (EINVAL);
+ }
+
+ /* Cursors on different partitions cannot reference the same item. */
+ if (dbc->dbp != other_dbc->dbp) {
+ *result = 1;
+ return (0);
+ }
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_cmp(dbc, other_dbc, result));
+#endif
+
+ curr_dbc = dbc;
+ curr_odbc = other_dbc;
+ dbc_int = dbc->internal;
+ odbc_int = other_dbc->internal;
+
+ /* Both cursors must be on valid positions. */
+ if (dbc_int->pgno == PGNO_INVALID || odbc_int->pgno == PGNO_INVALID) {
+ __db_errx(env,
+"Both cursors must be initialized before calling DBC->cmp.");
+ return (EINVAL);
+ }
+
+ /*
+ * Use a loop since cursors can be nested. Off page duplicate
+ * sets can only be nested one level deep, so it is safe to use a
+ * while (true) loop.
+ */
+ while (1) {
+ if (dbc_int->pgno == odbc_int->pgno &&
+ dbc_int->indx == odbc_int->indx) {
+ /*
+ * If one cursor is sitting on an off page duplicate
+ * set, the other will be pointing to the same set. Be
+ * careful, and check anyway.
+ */
+ if (dbc_int->opd != NULL && odbc_int->opd != NULL) {
+ /* Descend into the off-page duplicate tree. */
+ curr_dbc = dbc_int->opd;
+ curr_odbc = odbc_int->opd;
+ dbc_int = dbc_int->opd->internal;
+ odbc_int= odbc_int->opd->internal;
+ continue;
+ } else if (dbc_int->opd == NULL &&
+ odbc_int->opd == NULL)
+ *result = 0;
+ else {
+ __db_errx(env,
+ "DBCursor->cmp mismatched off page duplicate cursor pointers.");
+ return (EINVAL);
+ }
+
+ /* Let the access method refine the match. */
+ switch (curr_dbc->dbtype) {
+ case DB_HASH:
+ /*
+ * Make sure that on-page duplicate data
+ * indexes match, and that the deleted
+ * flags are consistent.
+ */
+ ret = __hamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ /*
+ * Check for consistent deleted flags on btree
+ * specific cursors.
+ */
+ ret = __bamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ default:
+ /* NO-OP break out. */
+ break;
+ }
+ } else
+ *result = 1;
+ return (ret);
+ }
+ /* NOTREACHED. */
+ return (ret);
+}
+
+/*
+ * __dbc_count --
+ * Return a count of duplicate data items.
+ *
+ * Queue and Recno databases never have duplicates, so the count is
+ * always 1; Hash on-page duplicates are counted by the hash code, and
+ * everything else (including Hash off-page duplicates) goes through
+ * the btree counter.
+ *
+ * PUBLIC: int __dbc_count __P((DBC *, db_recno_t *));
+ */
+int
+__dbc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbc->dbp))
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+#endif
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+ switch (dbc->dbtype) {
+ case DB_QUEUE:
+ case DB_RECNO:
+ *recnop = 1;
+ break;
+ case DB_HASH:
+ if (dbc->internal->opd == NULL) {
+ if ((ret = __hamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_BTREE:
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_count(dbc, recnop));
+#endif
+ if ((ret = __bamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ case DB_UNKNOWN:
+ default:
+ return (__db_unknown_type(env, "__dbc_count", dbc->dbtype));
+ }
+ return (0);
+}
+
+/*
+ * __dbc_del --
+ * DBC->del.
+ *
+ * Handles secondary-index redirection, foreign-key constraint checks and
+ * secondary-key maintenance before delegating the actual delete to
+ * __dbc_idel (or the compressed-btree equivalent).
+ *
+ * PUBLIC: int __dbc_del __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* May return early; uses the local "ret" on the CDB upgrade path. */
+ CDB_LOCKING_INIT(env, dbc);
+
+ /*
+ * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set
+ * (which it only is if we're being called from a primary update),
+ * then we need to call through to the primary and delete the item.
+ *
+ * Note that this will delete the current item; we don't need to
+ * delete it ourselves as well, so we can just goto done.
+ */
+ if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) {
+ ret = __dbc_del_secondary(dbc);
+ goto done;
+ }
+
+ /*
+ * If we are a foreign db, go through and check any foreign key
+ * constraints first, which will make rolling back changes on an abort
+ * simpler.
+ */
+ if (LIST_FIRST(&dbp->f_primaries) != NULL &&
+ (ret = __dbc_del_foreign(dbc)) != 0)
+ goto done;
+
+ /*
+ * If we are a primary and have secondary indices, go through
+ * and delete any secondary keys that point at the current record.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ (ret = __dbc_del_primary(dbc)) != 0)
+ goto done;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ ret = __bamc_compress_del(dbc, flags);
+ else
+#endif
+ ret = __dbc_idel(dbc, flags);
+
+done: CDB_LOCKING_DONE(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __dbc_idel --
+ * Implementation of DBC->del.
+ *
+ * PUBLIC: int __dbc_idel __P((DBC *, u_int32_t));
+ */
+int
+__dbc_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *opd;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+
+ /*
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the del operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ opd = dbc->internal->opd;
+ if (opd == NULL)
+ ret = dbc->am_del(dbc, flags);
+ else if ((ret = dbc->am_writelock(dbc)) == 0)
+ ret = opd->am_del(opd, flags);
+
+ /*
+ * If this was an update that is supporting dirty reads
+ * then we may have just swapped our read for a write lock
+ * which is held by the surviving cursor. We need
+ * to explicitly downgrade this lock. The closed cursor
+ * may only have had a read lock.
+ */
+ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ dbc->internal->lock_mode == DB_LOCK_WRITE) {
+ if ((t_ret =
+ __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ /* Only record the downgrade if the lock release succeeded. */
+ if (t_ret == 0)
+ dbc->internal->lock_mode = DB_LOCK_WWRITE;
+ if (dbc->internal->page != NULL && (t_ret =
+ __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+#ifdef HAVE_COMPRESSION
+/*
+ * __dbc_bulk_del --
+ * Bulk del for a cursor.
+ *
+ * Only implemented for compressed BTrees. In this file in order to
+ * use the CDB_LOCKING_* macros.
+ *
+ * PUBLIC: #ifdef HAVE_COMPRESSION
+ * PUBLIC: int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__dbc_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ DB_ASSERT(env, DB_IS_COMPRESSED(dbc->dbp));
+
+ /* May return early; uses the local "ret" on the CDB upgrade path. */
+ CDB_LOCKING_INIT(env, dbc);
+
+ ret = __bamc_compress_bulk_del(dbc, key, flags);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ return (ret);
+}
+#endif
+
+/*
+ * __dbc_dup --
+ * Duplicate a cursor
+ *
+ * Duplicates the top-level cursor and, if it has an off-page duplicate
+ * cursor, that cursor as well, linking the pair together. On any error
+ * both new cursors are closed and no cursor is returned.
+ *
+ * PUBLIC: int __dbc_dup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *dbc_nopd;
+ int ret;
+
+ dbc_n = dbc_nopd = NULL;
+
+ /* Allocate a new cursor and initialize it. */
+ if ((ret = __dbc_idup(dbc_orig, &dbc_n, flags)) != 0)
+ goto err;
+ *dbcp = dbc_n;
+
+ /*
+ * If the cursor references an off-page duplicate tree, allocate a
+ * new cursor for that tree and initialize it.
+ */
+ if (dbc_orig->internal->opd != NULL) {
+ if ((ret =
+ __dbc_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0)
+ goto err;
+ dbc_n->internal->opd = dbc_nopd;
+ dbc_nopd->internal->pdbc = dbc_n;
+ }
+ return (0);
+
+err: if (dbc_n != NULL)
+ (void)__dbc_close(dbc_n);
+ if (dbc_nopd != NULL)
+ (void)__dbc_close(dbc_nopd);
+
+ return (ret);
+}
+
+/*
+ * __dbc_idup --
+ * Internal version of __dbc_dup.
+ *
+ * Allocates a new cursor sharing the original's locker, optionally
+ * copies its position (DB_POSITION), and carries over its locking
+ * related flags. Does not duplicate an off-page duplicate cursor;
+ * that is __dbc_dup's job.
+ *
+ * PUBLIC: int __dbc_idup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_idup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig, **dbcp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *dbc_n;
+ DBC_INTERNAL *int_n, *int_orig;
+ ENV *env;
+ int ret;
+
+ dbp = dbc_orig->dbp;
+ dbc_n = *dbcp;
+ env = dbp->env;
+
+ if ((ret = __db_cursor_int(dbp, dbc_orig->thread_info,
+ dbc_orig->txn, dbc_orig->dbtype, dbc_orig->internal->root,
+ F_ISSET(dbc_orig, DBC_OPD) | DBC_DUPLICATE,
+ dbc_orig->locker, &dbc_n)) != 0)
+ return (ret);
+
+ /* Position the cursor if requested, acquiring the necessary locks. */
+ if (LF_ISSET(DB_POSITION)) {
+ int_n = dbc_n->internal;
+ int_orig = dbc_orig->internal;
+
+ /* The new cursor never owns the locker id; mask that flag. */
+ dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID;
+
+ int_n->indx = int_orig->indx;
+ int_n->pgno = int_orig->pgno;
+ int_n->root = int_orig->root;
+ int_n->lock_mode = int_orig->lock_mode;
+
+ /* Carry over blob/overflow streaming position as well. */
+ int_n->stream_start_pgno = int_orig->stream_start_pgno;
+ int_n->stream_off = int_orig->stream_off;
+ int_n->stream_curr_pgno = int_orig->stream_curr_pgno;
+
+ switch (dbc_orig->dbtype) {
+ case DB_QUEUE:
+ if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_dup(dbc_orig, dbc_n, flags)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __hamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env,
+ "__dbc_idup", dbc_orig->dbtype);
+ goto err;
+ }
+ } else if (F_ISSET(dbc_orig, DBC_BULK)) {
+ /*
+ * For bulk cursors, remember what page we're on, even if we
+ * don't know that the next operation will be nearby.
+ */
+ dbc_n->internal->pgno = dbc_orig->internal->pgno;
+ }
+
+ /* Copy the locking flags to the new cursor. */
+ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_BULK |
+ DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED | DBC_WRITECURSOR));
+
+ /*
+ * If we're in CDB and this isn't an offpage dup cursor, then
+ * we need to get a lock for the duplicated cursor.
+ */
+ if (CDB_LOCKING(env) && !F_ISSET(dbc_n, DBC_OPD) &&
+ (ret = __lock_get(env, dbc_n->locker, 0,
+ &dbc_n->lock_dbt, F_ISSET(dbc_orig, DBC_WRITECURSOR) ?
+ DB_LOCK_IWRITE : DB_LOCK_READ, &dbc_n->mylock)) != 0)
+ goto err;
+
+ dbc_n->priority = dbc_orig->priority;
+ dbc_n->internal->pdbc = dbc_orig->internal->pdbc;
+ *dbcp = dbc_n;
+ return (0);
+
+err: (void)__dbc_close(dbc_n);
+ return (ret);
+}
+
+/*
+ * __dbc_newopd --
+ * Create a new off-page duplicate cursor.
+ *
+ * Off-page duplicate sets are Recno trees unless the application
+ * supplied a duplicate comparison function, in which case they are
+ * Btrees.
+ *
+ * PUBLIC: int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
+ */
+int
+__dbc_newopd(dbc_parent, root, oldopd, dbcp)
+ DBC *dbc_parent;
+ db_pgno_t root;
+ DBC *oldopd;
+ DBC **dbcp;
+{
+ DB *dbp;
+ DBC *opd;
+ DBTYPE dbtype;
+ int ret;
+
+ dbp = dbc_parent->dbp;
+ dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE;
+
+ /*
+ * On failure, we want to default to returning the old off-page dup
+ * cursor, if any; our caller can't be left with a dangling pointer
+ * to a freed cursor. On error the only allowable behavior is to
+ * close the cursor (and the old OPD cursor it in turn points to), so
+ * this should be safe.
+ */
+ *dbcp = oldopd;
+
+ if ((ret = __db_cursor_int(dbp, dbc_parent->thread_info,
+ dbc_parent->txn,
+ dbtype, root, DBC_OPD, dbc_parent->locker, &opd)) != 0)
+ return (ret);
+
+ opd->priority = dbc_parent->priority;
+ opd->internal->pdbc = dbc_parent;
+ *dbcp = opd;
+
+ /*
+ * Check to see if we already have an off-page dup cursor that we've
+ * passed in. If we do, close it. It'd be nice to use it again
+ * if it's a cursor belonging to the right tree, but if we're doing
+ * a cursor-relative operation this might not be safe, so for now
+ * we'll take the easy way out and always close and reopen.
+ *
+ * Note that under no circumstances do we want to close the old
+ * cursor without returning a valid new one; we don't want to
+ * leave the main cursor in our caller with a non-NULL pointer
+ * to a freed off-page dup cursor.
+ */
+ if (oldopd != NULL && (ret = __dbc_close(oldopd)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __dbc_get --
+ * Get using a cursor.
+ *
+ * Thin dispatcher: partitioned databases and compressed btrees have
+ * their own get paths; everything else goes to __dbc_iget.
+ *
+ * PUBLIC: int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ return (__partc_get(dbc, key, data, flags));
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_get(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iget(dbc, key, data, flags));
+}
+
+/*
+ * __dbc_iget --
+ * Implementation of get using a cursor.
+ *
+ * Works on a duplicate of the caller's cursor so the caller's position
+ * is untouched on error; handles off-page duplicate (OPD) sub-cursors,
+ * DB_MULTIPLE bulk retrieval, CDB locking for DB_CONSUME, and the
+ * DB_BUFFER_SMALL size-reporting protocol.
+ *
+ * PUBLIC: int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *ddbc, *dbc_n, *opd;
+ DBC_INTERNAL *cp, *cp_n;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t pgno;
+ db_indx_t indx_off;
+ u_int32_t multi, orig_ulen, tmp_flags, tmp_read_locking, tmp_rmw;
+ u_int8_t type;
+ int key_small, ret, t_ret;
+
+ COMPQUIET(orig_ulen, 0);
+
+ key_small = 0;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ dbc_n = NULL;
+ opd = NULL;
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ tmp_rmw = LF_ISSET(DB_RMW);
+ LF_CLR(DB_RMW);
+
+ SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ /*
+ * Return a cursor's record number. It has nothing to do with the
+ * cursor get code except that it was put into the interface.
+ */
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc, DBC_RMW);
+ F_SET(dbc, tmp_read_locking);
+ ret = __bamc_rget(dbc, data);
+ if (tmp_rmw)
+ F_CLR(dbc, DBC_RMW);
+ /* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+ F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+ return (ret);
+ }
+
+ /* DB_CONSUME is a write operation, so CDB needs the write lock. */
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_INIT(env, dbc);
+
+ /* Don't return the key or data if it was passed to us. */
+ if (!DB_RETURNS_A_KEY(dbp, flags))
+ F_SET(key, DB_DBT_ISSET);
+ if (flags == DB_GET_BOTH &&
+ (dbp->dup_compare == NULL || dbp->dup_compare == __bam_defcmp))
+ F_SET(data, DB_DBT_ISSET);
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the DB_RMW flag was specified and the get
+ * operation is done in an off-page duplicate tree, call the primary
+ * cursor's upgrade routine first.
+ */
+ cp = dbc->internal;
+ if (cp->opd != NULL &&
+ (flags == DB_CURRENT || flags == DB_GET_BOTHC ||
+ flags == DB_NEXT || flags == DB_NEXT_DUP ||
+ flags == DB_PREV || flags == DB_PREV_DUP)) {
+ if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0)
+ goto err;
+ /* Transient cursors can be used directly, no dup needed. */
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ opd = cp->opd;
+ else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0)
+ goto err;
+
+ if ((ret = opd->am_get(opd, key, data, flags, NULL)) == 0)
+ goto done;
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND &&
+ (flags == DB_PREV || flags == DB_NEXT)) {
+ ret = __dbc_close(opd);
+ opd = NULL;
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ cp->opd = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ } else if (cp->opd != NULL && F_ISSET(dbc, DBC_TRANSIENT)) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto err;
+ cp->opd = NULL;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * upgrade the lock as required, and call the underlying function.
+ */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ /* Relative moves need the duplicate positioned at the same item. */
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ tmp_flags = 0;
+ break;
+ }
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ ret = __dbc_idup(dbc, &dbc_n, tmp_flags);
+
+ if (ret != 0)
+ goto err;
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if (tmp_rmw)
+ F_SET(dbc_n, DBC_RMW);
+ F_SET(dbc_n, tmp_read_locking);
+
+ switch (multi) {
+ case DB_MULTIPLE:
+ F_SET(dbc_n, DBC_MULTIPLE);
+ break;
+ case DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE_KEY);
+ break;
+ case DB_MULTIPLE | DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ break;
+ case 0:
+ default:
+ break;
+ }
+
+retry: pgno = PGNO_INVALID;
+ ret = dbc_n->am_get(dbc_n, key, data, flags, &pgno);
+ if (tmp_rmw)
+ F_CLR(dbc_n, DBC_RMW);
+ /*
+ * Clear the temporary locking flags in the new cursor. The user's
+ * (old) cursor needs to have the WAS_READ_COMMITTED flag because this
+ * is used on the next call on that cursor.
+ */
+ F_CLR(dbc_n, tmp_read_locking);
+ F_SET(dbc, tmp_read_locking & DBC_WAS_READ_COMMITTED);
+ F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ if (ret != 0)
+ goto err;
+
+ cp_n = dbc_n->internal;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ if ((ret = __dbc_newopd(dbc,
+ pgno, cp_n->opd, &cp_n->opd)) != 0)
+ goto err;
+
+ /* Map the top-level operation onto the OPD sub-tree. */
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET:
+ case DB_SET_RECNO:
+ case DB_SET_RANGE:
+ tmp_flags = DB_FIRST;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_LAST;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ tmp_flags = flags;
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__dbc_get", flags);
+ goto err;
+ }
+ ret = cp_n->opd->am_get(cp_n->opd, key, data, tmp_flags, NULL);
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND) {
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ flags = DB_NEXT;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ flags = DB_PREV;
+ break;
+ default:
+ goto err;
+ }
+
+ ret = __dbc_close(cp_n->opd);
+ cp_n->opd = NULL;
+ if (ret == 0)
+ goto retry;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+done: /*
+ * Return a key/data item. The only exception is that we don't return
+ * a key if the user already gave us one, that is, if the DB_SET flag
+ * was set. The DB_SET flag is necessary. In a Btree, the user's key
+ * doesn't have to be the same as the key stored in the tree, depending
+ * on the magic performed by the comparison function. As we may not
+ * have done any key-oriented operation here, the page reference may
+ * not be valid. Fill it in as necessary. We don't have to worry
+ * about any locks, the cursor must already be holding appropriate
+ * locks.
+ *
+ * XXX
+ * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key
+ * either, should we?
+ */
+ cp_n = dbc_n == NULL ? dbc->internal : dbc_n->internal;
+ if (!F_ISSET(key, DB_DBT_ISSET)) {
+ if (cp_n->page == NULL && (ret = __memp_fget(mpf, &cp_n->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp_n->page)) != 0)
+ goto err;
+
+ if ((ret = __db_ret(dbc, cp_n->page, cp_n->indx, key,
+ &dbc->rkey->data, &dbc->rkey->ulen)) != 0) {
+ /*
+ * If the key DBT is too small, we still want to return
+ * the size of the data. Otherwise applications are
+ * forced to check each one with a separate call. We
+ * don't want to copy the data, so we set the ulen to
+ * zero before calling __db_ret.
+ */
+ if (ret == DB_BUFFER_SMALL &&
+ F_ISSET(data, DB_DBT_USERMEM)) {
+ key_small = 1;
+ orig_ulen = data->ulen;
+ data->ulen = 0;
+ } else
+ goto err;
+ }
+ }
+ if (multi != 0 && dbc->am_bulk != NULL) {
+ /*
+ * Even if fetching from the OPD cursor we need a duplicate
+ * primary cursor if we are going after multiple keys.
+ */
+ if (dbc_n == NULL) {
+ /*
+ * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor,
+ * so it's safe to just use dbc, unless the cursor
+ * has an open off-page duplicate cursor whose state
+ * might need to be preserved.
+ */
+ if ((!(multi & DB_MULTIPLE_KEY) &&
+ dbc->internal->opd == NULL) ||
+ F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_idup(dbc,
+ &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ if ((ret = dbc_n->am_get(dbc_n,
+ key, data, DB_CURRENT, &pgno)) != 0)
+ goto err;
+ }
+ cp_n = dbc_n->internal;
+ }
+
+ /*
+ * If opd is set then we dupped the opd that we came in with.
+ * When we return we may have a new opd if we went to another
+ * key.
+ */
+ if (opd != NULL) {
+ DB_ASSERT(env, cp_n->opd == NULL);
+ cp_n->opd = opd;
+ opd = NULL;
+ }
+
+ /*
+ * Bulk get doesn't use __db_retcopy, so data.size won't
+ * get set up unless there is an error. Assume success
+ * here. This is the only call to am_bulk, and it avoids
+ * setting it exactly the same everywhere. If we have an
+ * DB_BUFFER_SMALL error, it'll get overwritten with the
+ * needed value.
+ */
+ data->size = data->ulen;
+ ret = dbc_n->am_bulk(dbc_n, data, flags | multi);
+ } else if (!F_ISSET(data, DB_DBT_ISSET)) {
+ /* Fetch the data from whichever cursor holds the item. */
+ ddbc = opd != NULL ? opd :
+ cp_n->opd != NULL ? cp_n->opd : dbc_n;
+ cp = ddbc->internal;
+ if (cp->page == NULL &&
+ (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, ddbc->txn, 0, &cp->page)) != 0)
+ goto err;
+
+ type = TYPE(cp->page);
+ indx_off = ((type == P_LBTREE ||
+ type == P_HASH || type == P_HASH_UNSORTED) ? O_INDX : 0);
+ ret = __db_ret(ddbc, cp->page, cp->indx + indx_off,
+ data, &dbc->rdata->data, &dbc->rdata->ulen);
+ }
+
+err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ /* Cleanup and cursor resolution. */
+ if (opd != NULL) {
+ /*
+ * To support dirty reads we must reget the write lock
+ * if we have just stepped off a deleted record.
+ * Since the OPD cursor does not know anything
+ * about the referencing page or cursor we need
+ * to peek at the OPD cursor and get the lock here.
+ */
+ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ if ((t_ret =
+ dbc->am_writelock(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_cleanup(
+ dbc->internal->opd, opd, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (key_small) {
+ data->ulen = orig_ulen;
+ if (ret == 0)
+ ret = DB_BUFFER_SMALL;
+ }
+
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_DONE(env, dbc);
+ return (ret);
+}
+
+/* Internal flags shared by the dbc_put functions. */
+#define DBC_PUT_RMW 0x001 /* Use DB_RMW on internal gets. */
+#define DBC_PUT_NODEL 0x002 /* No old record exists; skip deletes. */
+#define DBC_PUT_HAVEREC 0x004 /* Old primary record has been fetched. */
+
+/*
+ * __dbc_put_resolve_key --
+ * Get the current key and data so that we can correctly update the
+ * secondary and foreign databases.
+ *
+ * dbc: primary cursor, already positioned on the record to overwrite.
+ * oldkey/olddata: filled in with the record currently under the cursor.
+ * put_statep: in/out DBC_PUT_* state; DBC_PUT_HAVEREC is set on success.
+ * flags: must be DB_CURRENT (asserted below).
+ *
+ * Returns 0 on success; DB_KEYEMPTY from the underlying get is mapped
+ * to DB_NOTFOUND because the cursor sits on a deleted item.
+ */
+static inline int
+__dbc_put_resolve_key(dbc, oldkey, olddata, put_statep, flags)
+ DBC *dbc;
+ DBT *oldkey, *olddata;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ /* Ask for a write lock on the get when the caller set DBC_PUT_RMW. */
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ DB_ASSERT(env, flags == DB_CURRENT);
+ COMPQUIET(flags, 0);
+
+ /*
+ * This is safe to do on the cursor we already have;
+ * error or no, it won't move.
+ *
+ * We use DB_RMW for all of these gets because we'll be
+ * writing soon enough in the "normal" put code. In
+ * transactional databases we'll hold those write locks
+ * even if we close the cursor we're reading with.
+ *
+ * The DB_KEYEMPTY return needs special handling -- if the
+ * cursor is on a deleted key, we return DB_NOTFOUND.
+ */
+ memset(oldkey, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, oldkey, olddata, rmw | DB_CURRENT)) != 0)
+ return (ret == DB_KEYEMPTY ? DB_NOTFOUND : ret);
+
+ /* Record that we've looked for the old record. */
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ return (0);
+}
+
+/*
+ * __dbc_put_append --
+ * Handle an append to a primary.
+ *
+ * key: output; receives the key generated by the append.
+ * data: the datum to append (operated on via a local copy).
+ * put_statep: in/out DBC_PUT_* state; DBC_PUT_NODEL is set on success
+ * because an append can never be overwriting an existing item.
+ * flags: must be DB_APPEND (asserted below).
+ */
+static inline int
+__dbc_put_append(dbc, key, data, put_statep, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ DBC *dbc_n;
+ DBT tdata;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+ dbc_n = NULL;
+
+ DB_ASSERT(env, flags == DB_APPEND);
+ COMPQUIET(flags, 0);
+
+ /*
+ * With DB_APPEND, we need to do the insert to populate the key value.
+ * So we swap the 'normal' order of updating secondary / verifying
+ * foreign databases and inserting.
+ *
+ * If there is an append callback, the value stored in data->data may
+ * be replaced and then freed. To avoid passing a freed pointer back
+ * to the user, just operate on a copy of the data DBT.
+ */
+ tdata = *data;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * Append isn't a normal put operation; call the appropriate access
+ * method's append function. Only Queue and Recno support append.
+ */
+ switch (dbp->type) {
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(env,
+ dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+ ret = __db_ferr(env, "DBC->put", 0);
+ goto err;
+ }
+
+ /*
+ * The append callback, if one exists, may have allocated a new
+ * tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(env, &tdata);
+
+ /*
+ * The key value may have been generated by the above operation, but
+ * not set in the data buffer. Make sure it is there so that secondary
+ * updates can complete.
+ */
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+
+ /* An append cannot be replacing an existing item. */
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+
+err: if (dbc_n != NULL &&
+ (t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_put_partial --
+ * Ensure that the data item we are using is complete and correct.
+ * Otherwise we could break the secondary constraints.
+ *
+ * pkey: the primary key being put.
+ * data: the caller's partial data DBT (DB_DBT_PARTIAL is set).
+ * orig_data: filled in with the current record, if one exists.
+ * out_data: on success, the complete record built from orig_data
+ * overlaid with the partial data (zero-padded if needed).
+ * put_statep: in/out DBC_PUT_* state; DBC_PUT_HAVEREC and/or
+ * DBC_PUT_NODEL may be set here.
+ */
+static inline int
+__dbc_put_partial(dbc, pkey, data, orig_data, out_data, put_statep, flags)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *out_data;
+ u_int32_t *put_statep, flags;
+{
+ DB *dbp;
+ DBC *pdbc;
+ ENV *env;
+ int ret, rmw, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ if (!FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL)) {
+ /*
+ * We're going to have to search the tree for the
+ * specified key. Dup a cursor (so we have the same
+ * locking info) and do a c_get.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * When doing a put with DB_CURRENT, partial data items have
+ * already been resolved.
+ */
+ DB_ASSERT(env, flags != DB_CURRENT);
+
+ F_SET(pkey, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, pkey, orig_data, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ /* No existing record: nothing to delete later. */
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ /*
+ * Don't let a failure closing the duplicate cursor clobber
+ * a more meaningful earlier error; this matches the error
+ * accumulation convention used elsewhere in this file.
+ */
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ }
+
+ COMPQUIET(flags, 0);
+
+ /*
+ * Now build the new datum from orig_data and the partial data
+ * we were given. It's okay to do this if no record was
+ * returned above: a partial put on an empty record is allowed,
+ * if a little strange. The data is zero-padded.
+ */
+ return (__db_buildpartial(dbp, orig_data, data, out_data));
+}
+
+/*
+ * __dbc_put_fixed_len --
+ * Handle padding for fixed-length records.
+ *
+ * data: the caller's data DBT.
+ * out_data: the record to be stored; its buffer may be (re)allocated
+ * here so the record is exactly re_len bytes, padded with re_pad.
+ *
+ * Returns 0, an allocation error, or the __db_rec_toobig error if the
+ * record exceeds the fixed record length.
+ */
+static inline int
+__dbc_put_fixed_len(dbc, data, out_data)
+ DBC *dbc;
+ DBT *data, *out_data;
+{
+ DB *dbp;
+ ENV *env;
+ int re_pad, ret;
+ u_int32_t re_len, size;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Handle fixed-length records. If the primary database has
+ * fixed-length records, we need to pad out the datum before
+ * we pass it into the callback function; we always index the
+ * "real" record.
+ */
+ if (dbp->type == DB_QUEUE) {
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ re_pad = ((QUEUE *)dbp->q_internal)->re_pad;
+ } else {
+ re_len = ((BTREE *)dbp->bt_internal)->re_len;
+ re_pad = ((BTREE *)dbp->bt_internal)->re_pad;
+ }
+
+ size = data->size;
+ if (size > re_len) {
+ ret = __db_rec_toobig(env, size, re_len);
+ return (ret);
+ } else if (size < re_len) {
+ /*
+ * If we're not doing a partial put, copy data->data into
+ * out_data->data, then pad out out_data->data. This overrides
+ * the assignment made above, which is used in the more common
+ * case when padding is not needed.
+ *
+ * If we're doing a partial put, the data we want are already
+ * in out_data.data; we just need to pad.
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __os_realloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ /*
+ * In the partial case, we have built the item into
+ * out_data already using __db_buildpartial. Just need
+ * to pad from the end of out_data, not from data->size.
+ */
+ size = out_data->size;
+ /*
+ * The record built by the partial put may itself be
+ * longer than the fixed record length (a partial put
+ * can extend the record). Catch that here: otherwise
+ * re_len - size underflows below and the memset
+ * scribbles far past the end of the buffer.
+ */
+ if (size > re_len)
+ return (__db_rec_toobig(env, size, re_len));
+ } else {
+ if ((ret = __os_malloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ memcpy(out_data->data, data->data, size);
+ }
+ memset((u_int8_t *)out_data->data + size, re_pad,
+ re_len - size);
+ out_data->size = re_len;
+ }
+
+ return (ret);
+}
+
+/*
+ * __dbc_put_secondaries --
+ * Insert the secondary keys, and validate the foreign key constraints.
+ *
+ * pkey/data: the key/data pair about to go into the primary.
+ * orig_data: the old primary record; valid iff DBC_PUT_HAVEREC is set.
+ * s_count: number of DBT slots in s_keys_buf.
+ * s_keys_buf: caller-allocated array, one DBT per secondary; filled in
+ * here with the computed secondary key(s) so the caller can later
+ * delete any stale old keys.
+ * put_statep: in/out DBC_PUT_* state flags.
+ */
+static inline int
+__dbc_put_secondaries(dbc,
+ pkey, data, orig_data, s_count, s_keys_buf, put_statep)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *s_keys_buf;
+ int s_count;
+ u_int32_t *put_statep;
+{
+ DB *dbp, *sdbp;
+ DBC *fdbc, *sdbc;
+ DBT fdata, oldpkey, *skeyp, temppkey, tempskey, *tskeyp;
+ ENV *env;
+ int cmp, ret, rmw, t_ret;
+ u_int32_t nskey;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ fdbc = sdbc = NULL;
+ sdbp = NULL;
+ ret = t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ /*
+ * Loop through the secondaries. (Step 3.)
+ *
+ * Note that __db_s_first and __db_s_next will take care of
+ * thread-locking and refcounting issues.
+ */
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = s_keys_buf;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), ++skeyp) {
+ DB_ASSERT(env, skeyp - s_keys_buf < s_count);
+ /*
+ * Don't process this secondary if the key is immutable and we
+ * know that the old record exists. This optimization can't be
+ * used if we have not checked for the old record yet.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL) &&
+ FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ /*
+ * Call the callback for this secondary, to get the
+ * appropriate secondary key.
+ */
+ if ((ret = sdbp->s_callback(sdbp,
+ pkey, data, skeyp)) != 0) {
+ /* Not indexing is equivalent to an empty key set. */
+ if (ret == DB_DONOTINDEX) {
+ F_SET(skeyp, DB_DBT_MULTIPLE);
+ skeyp->size = 0;
+ ret = 0;
+ } else
+ goto err;
+ }
+
+ /* Open a cursor on the foreign database, if there is one. */
+ if (sdbp->s_foreign != NULL &&
+ (ret = __db_cursor_int(sdbp->s_foreign,
+ dbc->thread_info, dbc->txn, sdbp->s_foreign->type,
+ PGNO_INVALID, 0, dbc->locker, &fdbc)) != 0)
+ goto err;
+
+ /*
+ * Mark the secondary key DBT(s) as set -- that is, the
+ * callback returned at least one secondary key.
+ *
+ * Also, if this secondary index is associated with a foreign
+ * database, check that the foreign db contains the key(s) to
+ * maintain referential integrity. Set flags in fdata to avoid
+ * mem copying, we just need to know existence. We need to do
+ * this check before setting DB_DBT_ISSET, otherwise __dbc_get
+ * will overwrite the flag values.
+ */
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, skeyp);
+#endif
+ for (tskeyp = (DBT *)skeyp->data, nskey = skeyp->size;
+ nskey > 0; nskey--, tskeyp++) {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata,
+ DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(
+ fdbc, tskeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY) {
+ ret = DB_FOREIGN_CONFLICT;
+ break;
+ }
+ }
+ F_SET(tskeyp, DB_DBT_ISSET);
+ }
+ tskeyp = (DBT *)skeyp->data;
+ nskey = skeyp->size;
+ } else {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(fdbc, skeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY)
+ ret = DB_FOREIGN_CONFLICT;
+ }
+ F_SET(skeyp, DB_DBT_ISSET);
+ tskeyp = skeyp;
+ nskey = 1;
+ }
+ /* Close the foreign cursor, preserving the first error. */
+ if (fdbc != NULL && (t_ret = __dbc_close(fdbc)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ fdbc = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we have the old record, we can generate and remove any
+ * old secondary key(s) now. We can also skip the secondary
+ * put if there is no change.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC)) {
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, pkey, orig_data)) == DB_KEYEXIST)
+ continue;
+ else if (ret != 0)
+ goto err;
+ }
+ if (nskey == 0)
+ continue;
+
+ /*
+ * Open a cursor in this secondary.
+ *
+ * Use the same locker ID as our primary cursor, so that
+ * we're guaranteed that the locks don't conflict (e.g. in CDB
+ * or if we're subdatabases that share and want to lock a
+ * metadata page).
+ */
+ if ((ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+ sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+
+ /*
+ * If we're in CDB, updates will fail since the new cursor
+ * isn't a writer. However, we hold the WRITE lock in the
+ * primary and will for as long as our new cursor lasts,
+ * and the primary and secondary share a lock file ID,
+ * so it's safe to consider this a WRITER. The close
+ * routine won't try to put anything because we don't
+ * really have a lock.
+ */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * Swap the primary key to the byte order of this secondary, if
+ * necessary. By doing this now, we can compare directly
+ * against the data already in the secondary without having to
+ * swap it after reading.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ for (; nskey > 0 && ret == 0; nskey--, tskeyp++) {
+ /* Skip this key if it is already in the database. */
+ if (!F_ISSET(tskeyp, DB_DBT_ISSET))
+ continue;
+
+ /*
+ * There are three cases here--
+ * 1) The secondary supports sorted duplicates.
+ * If we attempt to put a secondary/primary pair
+ * that already exists, that's a duplicate
+ * duplicate, and c_put will return DB_KEYEXIST
+ * (see __db_duperr). This will leave us with
+ * exactly one copy of the secondary/primary pair,
+ * and this is just right--we'll avoid deleting it
+ * later, as the old and new secondaries will
+ * match (since the old secondary is the dup dup
+ * that's already there).
+ * 2) The secondary supports duplicates, but they're not
+ * sorted. We need to avoid putting a duplicate
+ * duplicate, because the matching old and new
+ * secondaries will prevent us from deleting
+ * anything and we'll wind up with two secondary
+ * records that point to the same primary key. Do
+ * a c_get(DB_GET_BOTH); only do the put if the
+ * secondary doesn't exist.
+ * 3) The secondary doesn't support duplicates at all.
+ * In this case, secondary keys must be unique;
+ * if another primary key already exists for this
+ * secondary key, we have to either overwrite it
+ * or not put this one, and in either case we've
+ * corrupted the secondary index. Do a
+ * c_get(DB_SET). If the secondary/primary pair
+ * already exists, do nothing; if the secondary
+ * exists with a different primary, return an
+ * error; and if the secondary does not exist,
+ * put it.
+ */
+ if (!F_ISSET(sdbp, DB_AM_DUP)) {
+ /* Case 3. */
+ memset(&oldpkey, 0, sizeof(DBT));
+ F_SET(&oldpkey, DB_DBT_MALLOC);
+ ret = __dbc_get(sdbc,
+ tskeyp, &oldpkey, rmw | DB_SET);
+ if (ret == 0) {
+ cmp = __bam_defcmp(sdbp,
+ &oldpkey, pkey);
+ __os_ufree(env, oldpkey.data);
+ /*
+ * If the secondary key is unchanged,
+ * skip the put and go on to the next
+ * one.
+ */
+ if (cmp == 0)
+ continue;
+
+ __db_errx(env, "%s%s",
+ "Put results in a non-unique secondary key in an ",
+ "index not configured to support duplicates");
+ ret = EINVAL;
+ }
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) {
+ /* Case 2. */
+ DB_INIT_DBT(tempskey,
+ tskeyp->data, tskeyp->size);
+ DB_INIT_DBT(temppkey,
+ pkey->data, pkey->size);
+ ret = __dbc_get(sdbc, &tempskey, &temppkey,
+ rmw | DB_GET_BOTH);
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ }
+
+ ret = __dbc_put(sdbc, tskeyp, pkey,
+ DB_UPDATE_SECONDARY);
+
+ /*
+ * We don't know yet whether this was a put-overwrite
+ * that in fact changed nothing. If it was, we may get
+ * DB_KEYEXIST. This is not an error.
+ */
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ }
+
+ /* Make sure the primary key is back in native byte-order. */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Mark that we have a key for this secondary so we can check
+ * it later before deleting the old one. We can't set it
+ * earlier or it would be cleared in the calls above.
+ */
+ F_SET(skeyp, DB_DBT_ISSET);
+ }
+err: if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+ COMPQUIET(s_count, 0);
+ return (ret);
+}
+
+/*
+ * __dbc_put_primary --
+ * Do the secondary-index and foreign-constraint work for a put into a
+ * primary database, following the five-step protocol described below.
+ * For DB_CURRENT the old record is resolved here; for DB_APPEND the
+ * primary insert itself is performed here as well.
+ */
+static int
+__dbc_put_primary(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp, *sdbp;
+ DBC *dbc_n, *pdbc;
+ DBT oldkey, olddata, newdata;
+ DBT *all_skeys, *skeyp, *tskeyp;
+ ENV *env;
+ int ret, t_ret, s_count;
+ u_int32_t nskey, put_state, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = s_count = 0;
+ put_state = 0;
+ sdbp = NULL;
+ pdbc = dbc_n = NULL;
+ all_skeys = NULL;
+ memset(&newdata, 0, sizeof(DBT));
+ memset(&olddata, 0, sizeof(DBT));
+
+ /*
+ * We do multiple cursor operations in some cases and subsequently
+ * access the data DBT information. Set DB_DBT_MALLOC so we don't risk
+ * modification of the data between our uses of it.
+ */
+ F_SET(&olddata, DB_DBT_MALLOC);
+
+ /*
+ * We have at least one secondary which we may need to update.
+ *
+ * There is a rather vile locking issue here. Secondary gets
+ * will always involve acquiring a read lock in the secondary,
+ * then acquiring a read lock in the primary. Ideally, we
+ * would likewise perform puts by updating all the secondaries
+ * first, then doing the actual put in the primary, to avoid
+ * deadlock (since having multiple threads doing secondary
+ * gets and puts simultaneously is probably a common case).
+ *
+ * However, if this put is a put-overwrite--and we have no way to
+ * tell in advance whether it will be--we may need to delete
+ * an outdated secondary key. In order to find that old
+ * secondary key, we need to get the record we're overwriting,
+ * before we overwrite it.
+ *
+ * (XXX: It would be nice to avoid this extra get, and have the
+ * underlying put routines somehow pass us the old record
+ * since they need to traverse the tree anyway. I'm saving
+ * this optimization for later, as it's a lot of work, and it
+ * would be hard to fit into this locking paradigm anyway.)
+ *
+ * The simple thing to do would be to go get the old record before
+ * we do anything else. Unfortunately, though, doing so would
+ * violate our "secondary, then primary" lock acquisition
+ * ordering--even in the common case where no old primary record
+ * exists, we'll still acquire and keep a lock on the page where
+ * we're about to do the primary insert.
+ *
+ * To get around this, we do the following gyrations, which
+ * hopefully solve this problem in the common case:
+ *
+ * 1) If this is a c_put(DB_CURRENT), go ahead and get the
+ * old record. We already hold the lock on this page in
+ * the primary, so no harm done, and we'll need the primary
+ * key (which we weren't passed in this case) to do any
+ * secondary puts anyway.
+ * If this is a put(DB_APPEND), then we need to insert the item,
+ * so that we can know the key value. So go ahead and insert. In
+ * the case of a put(DB_APPEND) without secondaries it is
+ * implemented in the __db_put method as an optimization.
+ *
+ * 2) If we're doing a partial put, we need to perform the
+ * get on the primary key right away, since we don't have
+ * the whole datum that the secondary key is based on.
+ * We may also need to pad out the record if the primary
+ * has a fixed record length.
+ *
+ * 3) Loop through the secondary indices, putting into each a
+ * new secondary key that corresponds to the new record.
+ *
+ * 4) If we haven't done so in (1) or (2), get the old primary
+ * key/data pair. If one does not exist--the common case--we're
+ * done with secondary indices, and can go straight on to the
+ * primary put.
+ *
+ * 5) If we do have an old primary key/data pair, however, we need
+ * to loop through all the secondaries a second time and delete
+ * the old secondary in each.
+ */
+ s_count = __db_s_count(dbp);
+ if ((ret = __os_calloc(env,
+ (u_int)s_count, sizeof(DBT), &all_skeys)) != 0)
+ goto err;
+
+ /*
+ * Primary indices can't have duplicates, so only DB_APPEND,
+ * DB_CURRENT, DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags
+ * should have been caught by the checking routine, but
+ * add a sprinkling of paranoia.
+ */
+ DB_ASSERT(env, flags == DB_APPEND || flags == DB_CURRENT ||
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP);
+
+ /*
+ * We'll want to use DB_RMW in a few places, but it's only legal
+ * when locking is on.
+ */
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+ if (rmw)
+ FLD_SET(put_state, DBC_PUT_RMW);
+
+ /* Resolve the primary key if required (Step 1). */
+ if (flags == DB_CURRENT) {
+ if ((ret = __dbc_put_resolve_key(dbc,
+ &oldkey, &olddata, &put_state, flags)) != 0)
+ goto err;
+ /* From here on, use the key the cursor was positioned on. */
+ key = &oldkey;
+ } else if (flags == DB_APPEND) {
+ if ((ret = __dbc_put_append(dbc,
+ key, data, &put_state, flags)) != 0)
+ goto err;
+ }
+
+ /*
+ * PUT_NOOVERWRITE with secondaries is a troublesome case. We need
+ * to check that the insert will work prior to making any changes
+ * to secondaries. Try to work within the locking constraints outlined
+ * above.
+ *
+ * This is DB->put (DB_NOOVERWRITE). DBC->put(DB_NODUPDATA) is not
+ * relevant since it is only valid on DBs that support duplicates,
+ * which primaries with secondaries can't have.
+ */
+ if (flags == DB_NOOVERWRITE) {
+ /* Don't bother retrieving the data. */
+ F_SET(key, DB_DBT_ISSET);
+ olddata.dlen = 0;
+ olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+ if (__dbc_get(dbc, key, &olddata, DB_SET) != DB_NOTFOUND) {
+ ret = DB_KEYEXIST;
+ goto done;
+ }
+ }
+
+ /*
+ * Check for partial puts using DB_DBT_PARTIAL (Step 2).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __dbc_put_partial(dbc,
+ key, data, &olddata, &newdata, &put_state, flags)) != 0)
+ goto err;
+ } else {
+ /* Not a partial put: use the caller's datum as-is. */
+ newdata = *data;
+ }
+
+ /*
+ * Check for partial puts, with fixed length record databases (Step 2).
+ */
+ if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) ||
+ (dbp->type == DB_QUEUE)) {
+ if ((ret = __dbc_put_fixed_len(dbc, data, &newdata)) != 0)
+ goto err;
+ }
+
+ /* Validate any foreign databases, and update secondaries. (Step 3). */
+ if ((ret = __dbc_put_secondaries(dbc, key, &newdata,
+ &olddata, s_count, all_skeys, &put_state))
+ != 0)
+ goto err;
+ /*
+ * If we've already got the old primary key/data pair, the secondary
+ * updates are already done.
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_HAVEREC))
+ goto done;
+
+ /*
+ * If still necessary, go get the old primary key/data. (Step 4.)
+ *
+ * See the comments in step 2. This is real familiar.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(env, flags != DB_CURRENT);
+ F_SET(key, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, key, &olddata, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ FLD_SET(put_state, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Check whether we do in fact have an old record we may need to
+ * delete. (Step 5).
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_NODEL))
+ goto done;
+
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), skeyp++) {
+ DB_ASSERT(env, skeyp - all_skeys < s_count);
+ /*
+ * Don't process this secondary if the key is immutable. We
+ * know that the old record exists, so this optimization can
+ * always be used.
+ */
+ if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, key, &olddata)) != 0 && ret != DB_KEYEXIST)
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+done:
+err:
+ /* Cleanup and cursor resolution (ret may be DB_KEYEXIST here). */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* If newdata or olddata were used, free their buffers. */
+ if (newdata.data != NULL && newdata.data != data->data)
+ __os_free(env, newdata.data);
+ if (olddata.data != NULL)
+ __os_ufree(env, olddata.data);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free all callback-allocated secondary keys. */
+ for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) {
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+ for (nskey = skeyp->size, tskeyp = (DBT *)skeyp->data;
+ nskey > 0;
+ nskey--, tskeyp++)
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+ FREE_IF_NEEDED(env, skeyp);
+ }
+ if (all_skeys != NULL)
+ __os_free(env, all_skeys);
+ return (ret);
+}
+
+/*
+ * __dbc_put --
+ * Put using a cursor.
+ *
+ * Normalizes the flags, handles CDB locking, routes primaries with
+ * secondary indices through __dbc_put_primary, then performs the
+ * actual put via __dbc_iput (or the compression path).
+ *
+ * PUBLIC: int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+ ret = 0;
+
+ /*
+ * Putting to secondary indices is forbidden; when we need to
+ * internally update one, we're called with a private flag,
+ * DB_UPDATE_SECONDARY, which does the right thing but won't return an
+ * error during flag checking.
+ *
+ * As a convenience, many places that want the default DB_KEYLAST
+ * behavior call DBC->put with flags == 0. Protect lower-level code
+ * here by translating that.
+ *
+ * Lastly, the DB_OVERWRITE_DUP flag is equivalent to DB_KEYLAST unless
+ * there are sorted duplicates. Limit the number of places that need
+ * to test for it explicitly.
+ */
+ if (flags == DB_UPDATE_SECONDARY || flags == 0 ||
+ (flags == DB_OVERWRITE_DUP && !F_ISSET(dbp, DB_AM_DUPSORT)))
+ flags = DB_KEYLAST;
+
+ CDB_LOCKING_INIT(dbc->env, dbc);
+
+ /*
+ * Check to see if we are a primary and have secondary indices.
+ * If we are not, we save ourselves a good bit of trouble and
+ * just skip to the "normal" put. On success (ret == 0) we fall
+ * through to do the primary put itself below.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ ((ret = __dbc_put_primary(dbc, key, data, flags)) != 0))
+ return (ret);
+
+ /*
+ * If this is an append operation, the insert was done prior to the
+ * secondary updates, so we are finished.
+ */
+ if (flags == DB_APPEND)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ return (__bamc_compress_put(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iput(dbc, key, data, flags));
+}
+
+/*
+ * __dbc_iput --
+ * Implementation of put using a cursor.
+ *
+ * Duplicates the cursor, dispatches to the access method's am_put,
+ * and handles creation of / descent into off-page duplicate trees
+ * (the access method signals a new one via the pgno out-parameter).
+ *
+ * PUBLIC: int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *oldopd, *opd;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbc_n = NULL;
+ ret = t_ret = 0;
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the put operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ if (dbc->internal->opd != NULL &&
+ (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) {
+ /*
+ * A special case for hash off-page duplicates. Hash doesn't
+ * support (and is documented not to support) put operations
+ * relative to a cursor which references an already deleted
+ * item. For consistency, apply the same criteria to off-page
+ * duplicates as well.
+ */
+ if (dbc->dbtype == DB_HASH && F_ISSET(
+ ((BTREE_CURSOR *)(dbc->internal->opd->internal)),
+ C_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Upgrade the primary's lock before touching the dup tree. */
+ if ((ret = dbc->am_writelock(dbc)) != 0 ||
+ (ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ opd = dbc_n->internal->opd;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * and call the underlying function.
+ */
+ if (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)
+ tmp_flags = DB_POSITION;
+ else
+ tmp_flags = 0;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ /* am_put sets pgno when the put created an off-page dup tree. */
+ pgno = PGNO_INVALID;
+ if ((ret = dbc_n->am_put(dbc_n, key, data, flags, &pgno)) != 0)
+ goto err;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ oldopd = dbc_n->internal->opd;
+ if ((ret = __dbc_newopd(dbc, pgno, oldopd, &opd)) != 0) {
+ dbc_n->internal->opd = opd;
+ goto err;
+ }
+
+ dbc_n->internal->opd = opd;
+ opd->internal->pdbc = dbc_n;
+
+ /* The key already exists; the dup put is a plain insert. */
+ if (flags == DB_NOOVERWRITE)
+ flags = DB_KEYLAST;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ }
+
+done:
+err: /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_del_oldskey --
+ * Delete an old secondary key, if necessary.
+ * Returns DB_KEYEXIST if the new and old keys match..
+ *
+ * sdbp: the secondary database being maintained.
+ * dbc: the primary cursor (supplies txn/locker for the internal cursor).
+ * skey: the new secondary key(s) computed for the record being put;
+ * DB_DBT_ISSET is cleared on any new key that matches an old one.
+ * pkey/olddata: the primary key and the old primary record.
+ */
+static int
+__dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata)
+ DB *sdbp;
+ DBC *dbc;
+ DBT *skey, *pkey, *olddata;
+{
+ DB *dbp;
+ DBC *sdbc;
+ DBT *toldskeyp, *tskeyp;
+ DBT oldskey, temppkey, tempskey;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t i, noldskey, nsame, nskey, rmw;
+
+ sdbc = NULL;
+ dbp = sdbp->s_primary;
+ env = dbp->env;
+ nsame = 0;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * Get the old secondary key.
+ */
+ memset(&oldskey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp, pkey, olddata, &oldskey)) != 0) {
+ if (ret == DB_DONOTINDEX ||
+ (F_ISSET(&oldskey, DB_DBT_MULTIPLE) && oldskey.size == 0))
+ /* There's no old key to delete. */
+ ret = 0;
+ return (ret);
+ }
+
+ /* Normalize single-key and multi-key callback results. */
+ if (F_ISSET(&oldskey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, &oldskey);
+#endif
+ toldskeyp = (DBT *)oldskey.data;
+ noldskey = oldskey.size;
+ } else {
+ toldskeyp = &oldskey;
+ noldskey = 1;
+ }
+
+ if (F_ISSET(skey, DB_DBT_MULTIPLE)) {
+ nskey = skey->size;
+ skey = (DBT *)skey->data;
+ } else
+ nskey = F_ISSET(skey, DB_DBT_ISSET) ? 1 : 0;
+
+ for (; noldskey > 0 && ret == 0; noldskey--, toldskeyp++) {
+ /*
+ * Check whether this old secondary key is also a new key
+ * before we delete it. Note that bt_compare is (and must be)
+ * set no matter what access method we're in.
+ */
+ for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++)
+ if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ toldskeyp, tskeyp) == 0) {
+ nsame++;
+ F_CLR(tskeyp, DB_DBT_ISSET);
+ break;
+ }
+
+ /* Old key survives as a new key: keep it, free our copy. */
+ if (i < nskey) {
+ FREE_IF_NEEDED(env, toldskeyp);
+ continue;
+ }
+
+ /* Lazily open the secondary cursor on first real delete. */
+ if (sdbc == NULL) {
+ if ((ret = __db_cursor_int(sdbp,
+ dbc->thread_info, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env,
+ sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+ }
+
+ /*
+ * Don't let c_get(DB_GET_BOTH) stomp on our data. Use
+ * temporary DBTs instead.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+ DB_INIT_DBT(temppkey, pkey->data, pkey->size);
+ DB_INIT_DBT(tempskey, toldskeyp->data, toldskeyp->size);
+ if ((ret = __dbc_get(sdbc,
+ &tempskey, &temppkey, rmw | DB_GET_BOTH)) == 0)
+ ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(dbp);
+ SWAP_IF_NEEDED(sdbp, pkey);
+ FREE_IF_NEEDED(env, toldskeyp);
+ }
+
+err: for (; noldskey > 0; noldskey--, toldskeyp++)
+ FREE_IF_NEEDED(env, toldskeyp);
+ FREE_IF_NEEDED(env, &oldskey);
+ if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ /* All new keys already present: tell the caller to skip the put. */
+ if (ret == 0 && nsame == nskey)
+ return (DB_KEYEXIST);
+ return (ret);
+}
+
+/*
+ * __db_duperr()
+ *	Error message: we don't currently support sorted duplicate duplicates.
+ *
+ * PUBLIC: int __db_duperr __P((DB *, u_int32_t));
+ */
+int
+__db_duperr(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	/*
+	 * Stay quiet while updating a secondary index: there is no clean
+	 * way to pass DB_NODUPDATA down along with DB_UPDATE_SECONDARY, and
+	 * hitting a duplicate duplicate there is a normal, non-error event.
+	 *
+	 * !!!
+	 * If duplicate duplicates are ever permitted in sorted-dup
+	 * databases, either teach the secondary index code to check for
+	 * dup dups, or preserve the implicit "DB_NODUPDATA" behavior for
+	 * databases with DB_AM_SECONDARY set.
+	 */
+	if (!F_ISSET(dbp, DB_AM_SECONDARY) && flags != DB_NODUPDATA)
+		__db_errx(dbp->env,
+		    "Duplicate data items are not supported with sorted data");
+	return (DB_KEYEXIST);
+}
+
+/*
+ * __dbc_cleanup --
+ *	Clean up duplicate cursors.
+ *
+ *	dbc is the caller's original cursor, dbc_n the duplicate the
+ *	operation actually ran on (possibly NULL, or the same handle).
+ *	If "failed" is zero and no error occurs here, the two cursors
+ *	swap internals so dbc adopts the new position; dbc_n is closed
+ *	either way.
+ *
+ * PUBLIC: int __dbc_cleanup __P((DBC *, DBC *, int));
+ */
+int
+__dbc_cleanup(dbc, dbc_n, failed)
+	DBC *dbc, *dbc_n;
+	int failed;
+{
+	DB *dbp;
+	DBC *opd;
+	DBC_INTERNAL *internal;
+	DB_MPOOLFILE *mpf;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	internal = dbc->internal;
+	ret = 0;
+
+	/* Discard any pages we're holding. */
+	if (internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		internal->page = NULL;
+	}
+	/* Ditto for the off-page duplicate cursor, if any. */
+	opd = internal->opd;
+	if (opd != NULL && opd->internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    opd->internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		opd->internal->page = NULL;
+	}
+
+	/*
+	 * If dbc_n is NULL, there's no internal cursor swapping to be done
+	 * and no dbc_n to close--we probably did the entire operation on an
+	 * offpage duplicate cursor.  Just return.
+	 *
+	 * If dbc and dbc_n are the same, we're either inside a DB->{put/get}
+	 * operation, and as an optimization we performed the operation on
+	 * the main cursor rather than on a duplicated one, or we're in a
+	 * bulk get that can't have moved the cursor (DB_MULTIPLE with the
+	 * initial c_get operation on an off-page dup cursor).  Just
+	 * return--either we know we didn't move the cursor, or we're going
+	 * to close it before we return to application code, so we're sure
+	 * not to visibly violate the "cursor stays put on error" rule.
+	 */
+	if (dbc_n == NULL || dbc == dbc_n)
+		return (ret);
+
+	/* Release the duplicate cursor's pages as well. */
+	if (dbc_n->internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    dbc_n->internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		dbc_n->internal->page = NULL;
+	}
+	opd = dbc_n->internal->opd;
+	if (opd != NULL && opd->internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    opd->internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		opd->internal->page = NULL;
+	}
+
+	/*
+	 * If we didn't fail before entering this routine or just now when
+	 * freeing pages, swap the interesting contents of the old and new
+	 * cursors.
+	 */
+	if (!failed && ret == 0) {
+		if (opd != NULL)
+			opd->internal->pdbc = dbc;
+		if (internal->opd != NULL)
+			internal->opd->internal->pdbc = dbc_n;
+		dbc->internal = dbc_n->internal;
+		dbc_n->internal = internal;
+	}
+
+	/*
+	 * Close the cursor we don't care about anymore.  The close can fail,
+	 * but we only expect DB_LOCK_DEADLOCK failures.  This violates our
+	 * "the cursor is unchanged on error" semantics, but since all you can
+	 * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe
+	 * that's OK.
+	 *
+	 * XXX
+	 * There's no way to recover from failure to close the old cursor.
+	 * All we can do is move to the new position and return an error.
+	 *
+	 * XXX
+	 * We might want to consider adding a flag to the cursor, so that any
+	 * subsequent operations other than close just return an error?
+	 */
+	if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * If this was an update that is supporting dirty reads
+	 * then we may have just swapped our read for a write lock
+	 * which is held by the surviving cursor.  We need
+	 * to explicitly downgrade this lock.  The closed cursor
+	 * may only have had a read lock.
+	 */
+	if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+	    dbc->internal->lock_mode == DB_LOCK_WRITE) {
+		if ((t_ret =
+		    __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0)
+			ret = t_ret;
+		if (t_ret == 0)
+			dbc->internal->lock_mode = DB_LOCK_WWRITE;
+		if (dbc->internal->page != NULL && (t_ret =
+		    __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
+		    ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
+ * __dbc_secondary_get_pp --
+ *	The DBC->get() entry point for a secondary index cursor: a thin
+ * wrapper that forwards to DBC->pget() with no primary-key DBT.
+ *
+ * PUBLIC: int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_secondary_get_pp(dbc, skey, data, flags)
+	DBC *dbc;
+	DBT *skey, *data;
+	u_int32_t flags;
+{
+	DB *sdbp;
+
+	/* Only cursors on secondary indices may come through here. */
+	sdbp = dbc->dbp;
+	DB_ASSERT(dbc->env, F_ISSET(sdbp, DB_AM_SECONDARY));
+	return (__dbc_pget_pp(dbc, skey, NULL, data, flags));
+}
+
+/*
+ * __dbc_pget --
+ *	Get a primary key/data pair through a secondary index.
+ *
+ *	On success, skey/pkey/data hold the secondary key, the primary key
+ *	and the primary datum, respectively (pkey may be NULL when wrapped
+ *	by the 2-DBT secondary get path).
+ *
+ * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget(dbc, skey, pkey, data, flags)
+	DBC *dbc;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DB *pdbp, *sdbp;
+	DBC *dbc_n, *pdbc;
+	DBT nullpkey;
+	u_int32_t save_pkey_flags, tmp_flags, tmp_read_locking, tmp_rmw;
+	int pkeymalloc, ret, t_ret;
+
+	sdbp = dbc->dbp;
+	pdbp = sdbp->s_primary;
+	dbc_n = NULL;
+	pkeymalloc = t_ret = 0;
+
+	/*
+	 * The challenging part of this function is getting the behavior
+	 * right for all the various permutations of DBT flags.  The
+	 * next several blocks handle the various cases we need to
+	 * deal with specially.
+	 */
+
+	/*
+	 * We may be called with a NULL pkey argument, if we've been
+	 * wrapped by a 2-DBT get call.  If so, we need to use our
+	 * own DBT.
+	 */
+	if (pkey == NULL) {
+		memset(&nullpkey, 0, sizeof(DBT));
+		pkey = &nullpkey;
+	}
+
+	/* Clear OR'd in additional bits so we can check for flag equality. */
+	tmp_rmw = LF_ISSET(DB_RMW);
+	LF_CLR(DB_RMW);
+
+	SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+	/*
+	 * DB_GET_RECNO is a special case, because we're interested not in
+	 * the primary key/data pair, but rather in the primary's record
+	 * number.
+	 */
+	if (flags == DB_GET_RECNO) {
+		if (tmp_rmw)
+			F_SET(dbc, DBC_RMW);
+		F_SET(dbc, tmp_read_locking);
+		ret = __dbc_pget_recno(dbc, pkey, data, flags);
+		if (tmp_rmw)
+			F_CLR(dbc, DBC_RMW);
+		/* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+		F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+		return (ret);
+	}
+
+	/*
+	 * If the DBTs we've been passed don't have any of the
+	 * user-specified memory management flags set, we want to make sure
+	 * we return values using the DBTs dbc->rskey, dbc->rkey, and
+	 * dbc->rdata, respectively.
+	 *
+	 * There are two tricky aspects to this: first, we need to pass
+	 * skey and pkey *in* to the initial c_get on the secondary key,
+	 * since either or both may be looked at by it (depending on the
+	 * get flag).  Second, we must not use a normal DB->get call
+	 * on the secondary, even though that's what we want to accomplish,
+	 * because the DB handle may be free-threaded.  Instead,
+	 * we open a cursor, then take steps to ensure that we actually use
+	 * the rkey/rdata from the *secondary* cursor.
+	 *
+	 * We accomplish all this by passing in the DBTs we started out
+	 * with to the c_get, but swapping the contents of rskey and rkey,
+	 * respectively, into rkey and rdata; __db_ret will treat them like
+	 * the normal key/data pair in a c_get call, and will realloc them as
+	 * need be (this is "step 1").  Then, for "step 2", we swap back
+	 * rskey/rkey/rdata to normal, and do a get on the primary with the
+	 * secondary dbc appointed as the owner of the returned-data memory.
+	 *
+	 * Note that in step 2, we copy the flags field in case we need to
+	 * pass down a DB_DBT_PARTIAL or other flag that is compatible with
+	 * letting DB do the memory management.
+	 */
+
+	/*
+	 * It is correct, though slightly sick, to attempt a partial get of a
+	 * primary key.  However, if we do so here, we'll never find the
+	 * primary record; clear the DB_DBT_PARTIAL field of pkey just for the
+	 * duration of the next call.
+	 */
+	save_pkey_flags = pkey->flags;
+	F_CLR(pkey, DB_DBT_PARTIAL);
+
+	/*
+	 * Now we can go ahead with the meat of this call.  First, get the
+	 * primary key from the secondary index.  (What exactly we get depends
+	 * on the flags, but the underlying cursor get will take care of the
+	 * dirty work.)  Duplicate the cursor, in case the later get on the
+	 * primary fails.
+	 */
+	switch (flags) {
+	case DB_CURRENT:
+	case DB_GET_BOTHC:
+	case DB_NEXT:
+	case DB_NEXT_DUP:
+	case DB_NEXT_NODUP:
+	case DB_PREV:
+	case DB_PREV_DUP:
+	case DB_PREV_NODUP:
+		tmp_flags = DB_POSITION;
+		break;
+	default:
+		tmp_flags = 0;
+		break;
+	}
+
+	if (F_ISSET(dbc, DBC_PARTITIONED | DBC_TRANSIENT))
+		dbc_n = dbc;
+	else if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+		return (ret);
+
+	F_SET(dbc_n, DBC_TRANSIENT);
+
+	if (tmp_rmw)
+		F_SET(dbc_n, DBC_RMW);
+	F_SET(dbc_n, tmp_read_locking);
+
+	/*
+	 * If we've been handed a primary key, it will be in native byte order,
+	 * so we need to swap it before reading from the secondary.
+	 */
+	if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+	    flags == DB_GET_BOTH_RANGE)
+		SWAP_IF_NEEDED(sdbp, pkey);
+
+retry:	/* Step 1. */
+	dbc_n->rdata = dbc->rkey;
+	dbc_n->rkey = dbc->rskey;
+	ret = __dbc_get(dbc_n, skey, pkey, flags);
+	/* Restore pkey's flags in case we stomped the PARTIAL flag. */
+	pkey->flags = save_pkey_flags;
+
+	/*
+	 * We need to swap the primary key to native byte order if we read it
+	 * successfully, or if we swapped it on entry above.  We can't return
+	 * with the application's data modified.
+	 */
+	if (ret == 0 || flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+	    flags == DB_GET_BOTH_RANGE)
+		SWAP_IF_NEEDED(sdbp, pkey);
+
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * Now we're ready for "step 2".  If either or both of pkey and data do
+	 * not have memory management flags set--that is, if DB is managing
+	 * their memory--we need to swap around the rkey/rdata structures so
+	 * that we don't wind up trying to use memory managed by the primary
+	 * database cursor, which we'll close before we return.
+	 *
+	 * !!!
+	 * If you're carefully following the bouncing ball, you'll note that in
+	 * the DB-managed case, the buffer hanging off of pkey is the same as
+	 * dbc->rkey->data.  This is just fine; we may well realloc and stomp
+	 * on it when we return, if we're doing a DB_GET_BOTH and need to
+	 * return a different partial or key (depending on the comparison
+	 * function), but this is safe.
+	 *
+	 * !!!
+	 * We need to use __db_cursor_int here rather than simply calling
+	 * pdbp->cursor, because otherwise, if we're in CDB, we'll allocate a
+	 * new locker ID and leave ourselves open to deadlocks.  (Even though
+	 * we're only acquiring read locks, we'll still block if there are any
+	 * waiters.)
+	 */
+	if ((ret = __db_cursor_int(pdbp, dbc->thread_info,
+	    dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+		goto err;
+
+	F_SET(pdbc, tmp_read_locking |
+	    F_ISSET(dbc, DBC_READ_UNCOMMITTED | DBC_READ_COMMITTED | DBC_RMW));
+
+	/*
+	 * We're about to use pkey a second time.  If DB_DBT_MALLOC is set on
+	 * it, we'll leak the memory we allocated the first time.  Thus, set
+	 * DB_DBT_REALLOC instead so that we reuse that memory instead of
+	 * leaking it.
+	 *
+	 * Alternatively, if the application is handling copying for pkey, we
+	 * need to take a copy now.  The copy will be freed on exit from
+	 * __dbc_pget_pp (and we must be coming through there if DB_DBT_USERCOPY
+	 * is set).  In the case of DB_GET_BOTH_RANGE, the pkey supplied by
+	 * the application has already been copied in but the value may have
+	 * changed in the search.  In that case, free the original copy and get
+	 * a new one.
+	 *
+	 * !!!
+	 * This assumes that the user must always specify a compatible realloc
+	 * function if a malloc function is specified.  I think this is a
+	 * reasonable requirement.
+	 */
+	if (F_ISSET(pkey, DB_DBT_MALLOC)) {
+		F_CLR(pkey, DB_DBT_MALLOC);
+		F_SET(pkey, DB_DBT_REALLOC);
+		pkeymalloc = 1;
+	} else if (F_ISSET(pkey, DB_DBT_USERCOPY)) {
+		if (flags == DB_GET_BOTH_RANGE)
+			__dbt_userfree(sdbp->env, NULL, pkey, NULL);
+		if ((ret = __dbt_usercopy(sdbp->env, pkey)) != 0)
+			goto err;
+	}
+
+	/*
+	 * Do the actual get.  Set DBC_TRANSIENT since we don't care about
+	 * preserving the position on error, and it's faster.  SET_RET_MEM so
+	 * that the secondary DBC owns any returned-data memory.
+	 */
+	F_SET(pdbc, DBC_TRANSIENT);
+	SET_RET_MEM(pdbc, dbc);
+	ret = __dbc_get(pdbc, pkey, data, DB_SET);
+
+	/*
+	 * If the item wasn't found in the primary, this is a bug; our
+	 * secondary has somehow gotten corrupted, and contains elements that
+	 * don't correspond to anything in the primary.  Complain.
+	 */
+
+	/* Now close the primary cursor. */
+	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* NOTE(review): this "else" pairs with the close-failure "if" above. */
+	else if (ret == DB_NOTFOUND) {
+		if (!F_ISSET(pdbc, DBC_READ_UNCOMMITTED))
+			ret = __db_secondary_corrupt(pdbp);
+		else switch (flags) {
+		case DB_GET_BOTHC:
+		case DB_NEXT:
+		case DB_NEXT_DUP:
+		case DB_NEXT_NODUP:
+		case DB_PREV:
+		case DB_PREV_DUP:
+		case DB_PREV_NODUP:
+			/* Dirty read raced an update; move on and retry. */
+			goto retry;
+		default:
+			break;
+		}
+	}
+
+err:	/* Cleanup and cursor resolution. */
+	if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pkeymalloc) {
+		/*
+		 * If pkey had a MALLOC flag, we need to restore it; otherwise,
+		 * if the user frees the buffer but reuses the DBT without
+		 * NULL'ing its data field or changing the flags, we may drop
+		 * core.
+		 */
+		F_CLR(pkey, DB_DBT_REALLOC);
+		F_SET(pkey, DB_DBT_MALLOC);
+	}
+
+	return (ret);
+}
+
+/*
+ * __dbc_pget_recno --
+ *	Perform a DB_GET_RECNO c_pget on a secondary index.  Returns
+ *	the secondary's record number in the pkey field and the primary's
+ *	in the data field.
+ *
+ *	When a handle is not a record-number btree, RECNO_OOB is returned
+ *	in the corresponding field instead.
+ */
+static int
+__dbc_pget_recno(sdbc, pkey, data, flags)
+	DBC *sdbc;
+	DBT *pkey, *data;
+	u_int32_t flags;
+{
+	DB *pdbp, *sdbp;
+	DBC *pdbc;
+	DBT discardme, primary_key;
+	ENV *env;
+	db_recno_t oob;
+	u_int32_t rmw;
+	int ret, t_ret;
+
+	sdbp = sdbc->dbp;
+	pdbp = sdbp->s_primary;
+	env = sdbp->env;
+	pdbc = NULL;
+	ret = t_ret = 0;
+
+	rmw = LF_ISSET(DB_RMW);
+
+	/* A throwaway DBT for returns whose contents we don't want. */
+	memset(&discardme, 0, sizeof(DBT));
+	F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+	oob = RECNO_OOB;
+
+	/*
+	 * If the primary is an rbtree, we want its record number, whether
+	 * or not the secondary is one too.  Fetch the recno into "data".
+	 *
+	 * If it's not an rbtree, return RECNO_OOB in "data".
+	 */
+	if (F_ISSET(pdbp, DB_AM_RECNUM)) {
+		/*
+		 * Get the primary key, so we can find the record number
+		 * in the primary. (We're uninterested in the secondary key.)
+		 */
+		memset(&primary_key, 0, sizeof(DBT));
+		F_SET(&primary_key, DB_DBT_MALLOC);
+		if ((ret = __dbc_get(sdbc,
+		    &discardme, &primary_key, rmw | DB_CURRENT)) != 0)
+			return (ret);
+
+		/*
+		 * Open a cursor on the primary, set it to the right record,
+		 * and fetch its recno into "data".
+		 *
+		 * (See __dbc_pget for comments on the use of __db_cursor_int.)
+		 *
+		 * SET_RET_MEM so that the secondary DBC owns any returned-data
+		 * memory.
+		 */
+		if ((ret = __db_cursor_int(pdbp, sdbc->thread_info, sdbc->txn,
+		    pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+			goto perr;
+		SET_RET_MEM(pdbc, sdbc);
+		if ((ret = __dbc_get(pdbc,
+		    &primary_key, &discardme, rmw | DB_SET)) != 0)
+			goto perr;
+
+		ret = __dbc_get(pdbc, &discardme, data, rmw | DB_GET_RECNO);
+
+		/* primary_key was malloc'd by the DB_CURRENT get above. */
+perr:		__os_ufree(env, primary_key.data);
+		if (pdbc != NULL &&
+		    (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			return (ret);
+	} else if ((ret = __db_retcopy(env, data, &oob,
+	    sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0)
+		return (ret);
+
+	/*
+	 * If the secondary is an rbtree, we want its record number, whether
+	 * or not the primary is one too.  Fetch the recno into "pkey".
+	 *
+	 * If it's not an rbtree, return RECNO_OOB in "pkey".
+	 */
+	if (F_ISSET(sdbp, DB_AM_RECNUM))
+		return (__dbc_get(sdbc, &discardme, pkey, flags));
+	else
+		return (__db_retcopy(env, pkey, &oob,
+		    sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen));
+}
+
+/*
+ * __db_wrlock_err -- do not have a write lock.
+ *	Report an attempted write through a read-only cursor; always
+ *	returns EPERM.
+ */
+static int
+__db_wrlock_err(env)
+	ENV *env;
+{
+	__db_errx(env, "Write attempted on read-only cursor");
+	return (EPERM);
+}
+
+/*
+ * __dbc_del_secondary --
+ *	Perform a delete operation on a secondary index: call through
+ *	to the primary and delete the primary record that this record
+ *	points to.
+ *
+ *	Note that deleting the primary record will call c_del on all
+ *	the secondaries, including this one; thus, it is not necessary
+ *	to execute both this function and an actual delete.
+ */
+static int
+__dbc_del_secondary(dbc)
+	DBC *dbc;
+{
+	DB *pdbp;
+	DBC *pdbc;
+	DBT skey, pkey;
+	ENV *env;
+	int ret, t_ret;
+	u_int32_t rmw;
+
+	pdbp = dbc->dbp->s_primary;
+	env = pdbp->env;
+	rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+	/*
+	 * Get the current item that we're pointing at.
+	 * We don't actually care about the secondary key, just
+	 * the primary.
+	 */
+	memset(&skey, 0, sizeof(DBT));
+	memset(&pkey, 0, sizeof(DBT));
+	F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+	if ((ret = __dbc_get(dbc, &skey, &pkey, DB_CURRENT)) != 0)
+		return (ret);
+
+	/* The stored primary key may be byte-swapped; fix it up. */
+	SWAP_IF_NEEDED(dbc->dbp, &pkey);
+
+	/*
+	 * Create a cursor on the primary with our locker ID,
+	 * so that when it calls back, we don't conflict.
+	 *
+	 * We create a cursor explicitly because there's no
+	 * way to specify the same locker ID if we're using
+	 * locking but not transactions if we use the DB->del
+	 * interface.  This shouldn't be any less efficient
+	 * anyway.
+	 */
+	if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn,
+	    pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+		return (ret);
+
+	/*
+	 * See comment in __dbc_put--if we're in CDB,
+	 * we already hold the locks we need, and we need to flag
+	 * the cursor as a WRITER so we don't run into errors
+	 * when we try to delete.
+	 */
+	if (CDB_LOCKING(env)) {
+		DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID);
+		F_SET(pdbc, DBC_WRITER);
+	}
+
+	/*
+	 * Set the new cursor to the correct primary key.  Then
+	 * delete it.  We don't really care about the datum;
+	 * just reuse our skey DBT.
+	 *
+	 * If the primary get returns DB_NOTFOUND, something is amiss--
+	 * every record in the secondary should correspond to some record
+	 * in the primary.
+	 */
+	if ((ret = __dbc_get(pdbc, &pkey, &skey, DB_SET | rmw)) == 0)
+		ret = __dbc_del(pdbc, 0);
+	else if (ret == DB_NOTFOUND)
+		ret = __db_secondary_corrupt(pdbp);
+
+	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __dbc_del_primary --
+ *	Perform a delete operation on a primary index.  Loop through
+ *	all the secondary indices which correspond to this primary
+ *	database, and delete any secondary keys that point at the current
+ *	record.
+ *
+ * PUBLIC: int __dbc_del_primary __P((DBC *));
+ */
+int
+__dbc_del_primary(dbc)
+	DBC *dbc;
+{
+	DB *dbp, *sdbp;
+	DBC *sdbc;
+	DBT *tskeyp;
+	DBT data, pkey, skey, temppkey, tempskey;
+	ENV *env;
+	u_int32_t nskey, rmw;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	sdbp = NULL;
+	rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+	/*
+	 * If we're called at all, we have at least one secondary.
+	 * (Unfortunately, we can't assert this without grabbing the mutex.)
+	 * Get the current record so that we can construct appropriate
+	 * secondary keys as needed.
+	 */
+	memset(&pkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = __dbc_get(dbc, &pkey, &data, DB_CURRENT)) != 0)
+		return (ret);
+
+	/*
+	 * Walk the secondary list; __db_s_first/__db_s_next handle
+	 * refcounting so a handle can't be closed out from under us.
+	 */
+	memset(&skey, 0, sizeof(DBT));
+	for (ret = __db_s_first(dbp, &sdbp);
+	    sdbp != NULL && ret == 0;
+	    ret = __db_s_next(&sdbp, dbc->txn)) {
+		/*
+		 * Get the secondary key for this secondary and the current
+		 * item.
+		 */
+		if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) {
+			/* Not indexing is equivalent to an empty key set. */
+			if (ret == DB_DONOTINDEX) {
+				F_SET(&skey, DB_DBT_MULTIPLE);
+				skey.size = 0;
+			} else /* We had a substantive error.  Bail. */
+				goto err;
+		}
+
+#ifdef DIAGNOSTIC
+		if (F_ISSET(&skey, DB_DBT_MULTIPLE))
+			__db_check_skeyset(sdbp, &skey);
+#endif
+
+		/* Normalize: treat a single key as a one-element key set. */
+		if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+			tskeyp = (DBT *)skey.data;
+			nskey = skey.size;
+			if (nskey == 0)
+				continue;
+		} else {
+			tskeyp = &skey;
+			nskey = 1;
+		}
+
+		/* Open a secondary cursor. */
+		if ((ret = __db_cursor_int(sdbp,
+		    dbc->thread_info, dbc->txn, sdbp->type,
+		    PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+			goto err;
+		/* See comment above and in __dbc_put. */
+		if (CDB_LOCKING(env)) {
+			DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+			F_SET(sdbc, DBC_WRITER);
+		}
+
+		for (; nskey > 0; nskey--, tskeyp++) {
+			/*
+			 * Set the secondary cursor to the appropriate item.
+			 * Delete it.
+			 *
+			 * We want to use DB_RMW if locking is on; it's only
+			 * legal then, though.
+			 *
+			 * !!!
+			 * Don't stomp on any callback-allocated buffer in skey
+			 * when we do a c_get(DB_GET_BOTH); use a temp DBT
+			 * instead.  Similarly, don't allow pkey to be
+			 * invalidated when the cursor is closed.
+			 */
+			DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size);
+			SWAP_IF_NEEDED(sdbp, &pkey);
+			DB_INIT_DBT(temppkey, pkey.data, pkey.size);
+			if ((ret = __dbc_get(sdbc, &tempskey, &temppkey,
+			    DB_GET_BOTH | rmw)) == 0)
+				ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+			else if (ret == DB_NOTFOUND)
+				ret = __db_secondary_corrupt(dbp);
+			SWAP_IF_NEEDED(sdbp, &pkey);
+			FREE_IF_NEEDED(env, tskeyp);
+		}
+
+		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * In the common case where there is a single secondary key, we
+		 * will have freed any application-allocated data in skey
+		 * already.  In the multiple key case, we need to free it here.
+		 * It is safe to do this twice as the macro resets the data
+		 * field.
+		 */
+		FREE_IF_NEEDED(env, &skey);
+	}
+
+err:	if (sdbp != NULL &&
+	    (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+		ret = t_ret;
+	FREE_IF_NEEDED(env, &skey);
+	return (ret);
+}
+
+/*
+ * __dbc_del_foreign --
+ *	Apply the foreign database constraints for a particular foreign
+ *	database when an item is being deleted (dbc points at item being deleted
+ *	in the foreign database.)
+ *
+ *	Delete happens in dbp, check for occurrences of key in pdpb.
+ *	Terminology:
+ *		Foreign db = Where delete occurs (dbp).
+ *		Secondary db = Where references to dbp occur (sdbp, a secondary)
+ *		Primary db = sdbp's primary database, references to dbp are
+ *		secondary keys here
+ *		Foreign Key = Key being deleted in dbp (fkey)
+ *		Primary Key = Key of the corresponding entry in sdbp's
+ *		primary (pkey).
+ */
+static int
+__dbc_del_foreign(dbc)
+	DBC *dbc;
+{
+	DB_FOREIGN_INFO *f_info;
+	DB *dbp, *pdbp, *sdbp;
+	DBC *pdbc, *sdbc;
+	DBT data, fkey, pkey;
+	ENV *env;
+	u_int32_t flags, rmw;
+	int changed, ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Read the foreign key being deleted. */
+	memset(&fkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = __dbc_get(dbc, &fkey, &data, DB_CURRENT)) != 0)
+		return (ret);
+
+	LIST_FOREACH(f_info, &(dbp->f_primaries), f_links) {
+		sdbp = f_info->dbp;
+		pdbp = sdbp->s_primary;
+		flags = f_info->flags;
+
+		rmw = (STD_LOCKING(dbc) &&
+		    !LF_ISSET(DB_FOREIGN_ABORT)) ? DB_RMW : 0;
+
+		/*
+		 * Handle CDB locking.  Some of this is copied from
+		 * __dbc_del_primary, but a bit more acrobatics are required.
+		 * If we're not going to abort, then we need to get a write
+		 * cursor.  If CDB_ALLDB is set, then only one write cursor is
+		 * allowed and we hold it, so we fudge things and promote the
+		 * cursor on the other DBs manually, it won't cause a problem.
+		 * If CDB_ALLDB is not set, then we go through the usual route
+		 * to make sure we block as necessary.  If there are any open
+		 * read cursors on sdbp, the delete or put call later will
+		 * block.
+		 *
+		 * If NULLIFY is set, we'll need a cursor on the primary to
+		 * update it with the nullified data.  Because primary and
+		 * secondary dbs share a lock file ID in CDB, we open a cursor
+		 * on the secondary and then get another writeable cursor on the
+		 * primary via __db_cursor_int to avoid deadlocking.
+		 */
+		sdbc = pdbc = NULL;
+		if (!LF_ISSET(DB_FOREIGN_ABORT) && CDB_LOCKING(env) &&
+		    !F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+			ret = __db_cursor(sdbp,
+			    dbc->thread_info, dbc->txn, &sdbc, DB_WRITECURSOR);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) {
+				ret = __db_cursor_int(pdbp,
+				    dbc->thread_info, dbc->txn, pdbp->type,
+				    PGNO_INVALID, 0, dbc->locker, &pdbc);
+				/*
+				 * BUGFIX: only flag the primary cursor as a
+				 * writer if it was actually created; pdbc is
+				 * still NULL when the open fails, and the
+				 * unconditional F_SET dereferenced it.
+				 */
+				if (ret == 0)
+					F_SET(pdbc, DBC_WRITER);
+			}
+		} else {
+			ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+			    sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0)
+				ret = __db_cursor_int(pdbp, dbc->thread_info,
+				    dbc->txn, pdbp->type, PGNO_INVALID, 0,
+				    dbc->locker, &pdbc);
+		}
+		if (ret != 0) {
+			if (sdbc != NULL)
+				(void)__dbc_close(sdbc);
+			return (ret);
+		}
+		if (CDB_LOCKING(env) && F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+			DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+			F_SET(sdbc, DBC_WRITER);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) && pdbc != NULL) {
+				DB_ASSERT(env,
+				    pdbc->mylock.off == LOCK_INVALID);
+				F_SET(pdbc, DBC_WRITER);
+			}
+		}
+
+		/*
+		 * There are three actions possible when a foreign database has
+		 * items corresponding to a deleted item:
+		 * DB_FOREIGN_ABORT - The delete operation should be aborted.
+		 * DB_FOREIGN_CASCADE - All corresponding foreign items should
+		 * be deleted.
+		 * DB_FOREIGN_NULLIFY - A callback needs to be made, allowing
+		 * the application to modify the data DBT from the
+		 * associated database.  If the callback makes a
+		 * modification, the updated item needs to replace the
+		 * original item in the foreign db
+		 */
+		memset(&pkey, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_SET|rmw);
+
+		if (ret == DB_NOTFOUND) {
+			/* No entry means no constraint */
+			ret = __dbc_close(sdbc);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+			    (t_ret = __dbc_close(pdbc)) != 0)
+				ret = t_ret;
+			if (ret != 0)
+				return (ret);
+			continue;
+		} else if (ret != 0) {
+			/* Just return the error code from the pget */
+			(void)__dbc_close(sdbc);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY))
+				(void)__dbc_close(pdbc);
+			return (ret);
+		} else if (LF_ISSET(DB_FOREIGN_ABORT)) {
+			/* If the record exists and ABORT is set, we're done */
+			if ((ret = __dbc_close(sdbc)) != 0)
+				return (ret);
+			return (DB_FOREIGN_CONFLICT);
+		}
+
+		/*
+		 * There were matching items in the primary DB, and the action
+		 * is either DB_FOREIGN_CASCADE or DB_FOREIGN_NULLIFY.
+		 */
+		while (ret == 0) {
+			if (LF_ISSET(DB_FOREIGN_CASCADE)) {
+				/*
+				 * Don't use the DB_UPDATE_SECONDARY flag,
+				 * since we want the delete to cascade into the
+				 * secondary's primary.
+				 */
+				if ((ret = __dbc_del(sdbc, 0)) != 0) {
+					__db_err(env, ret,
+		    "Attempt to execute cascading delete in a foreign index failed");
+					break;
+				}
+			} else if (LF_ISSET(DB_FOREIGN_NULLIFY)) {
+				changed = 0;
+				if ((ret = f_info->callback(sdbp,
+				    &pkey, &data, &fkey, &changed)) != 0) {
+					__db_err(env, ret,
+					    "Foreign database application callback");
+					break;
+				}
+
+				/*
+				 * If the user callback modified the DBT and
+				 * a put on the primary failed.
+				 */
+				if (changed && (ret = __dbc_put(pdbc,
+				    &pkey, &data, DB_KEYFIRST)) != 0) {
+					__db_err(env, ret,
+		    "Attempt to overwrite item in foreign database with nullified value failed");
+					break;
+				}
+			}
+			/* retrieve the next matching item from the prim. db */
+			memset(&pkey, 0, sizeof(DBT));
+			memset(&data, 0, sizeof(DBT));
+			ret = __dbc_pget(sdbc,
+			    &fkey, &pkey, &data, DB_NEXT_DUP|rmw);
+		}
+
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+		    (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			return (ret);
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_s_first --
+ *	Get the first secondary, if any are present, from the primary.
+ *
+ * PUBLIC: int __db_s_first __P((DB *, DB **));
+ */
+int
+__db_s_first(pdbp, sdbpp)
+	DB *pdbp, **sdbpp;
+{
+	DB *first;
+
+	/*
+	 * The secondary list may only be touched with the primary's mutex
+	 * held; take a reference under that mutex so the handle cannot be
+	 * closed out from under us (see __db_s_next).
+	 */
+	MUTEX_LOCK(pdbp->env, pdbp->mutex);
+	first = LIST_FIRST(&pdbp->s_secondaries);
+	if (first != NULL)
+		first->s_refcnt++;
+	MUTEX_UNLOCK(pdbp->env, pdbp->mutex);
+
+	*sdbpp = first;
+
+	return (0);
+}
+
+/*
+ * __db_s_next --
+ *	Get the next secondary in the list.  Drops the reference taken on
+ *	*sdbpp (closing it if we held the last reference) and returns the
+ *	following handle, referenced, in *sdbpp.
+ *
+ * PUBLIC: int __db_s_next __P((DB **, DB_TXN *));
+ */
+int
+__db_s_next(sdbpp, txn)
+	DB **sdbpp;
+	DB_TXN *txn;
+{
+	DB *sdbp, *pdbp, *closeme;
+	ENV *env;
+	int ret;
+
+	/*
+	 * Secondary indices are kept in a linked list, s_secondaries,
+	 * off each primary DB handle.  If a primary is free-threaded,
+	 * this list may only be traversed or modified while the primary's
+	 * thread mutex is held.
+	 *
+	 * The tricky part is that we don't want to hold the thread mutex
+	 * across the full set of secondary puts necessary for each primary
+	 * put, or we'll wind up essentially single-threading all the puts
+	 * to the handle; the secondary puts will each take about as
+	 * long as the primary does, and may require I/O.  So we instead
+	 * hold the thread mutex only long enough to follow one link to the
+	 * next secondary, and then we release it before performing the
+	 * actual secondary put.
+	 *
+	 * The only danger here is that we might legitimately close a
+	 * secondary index in one thread while another thread is performing
+	 * a put and trying to update that same secondary index.  To
+	 * prevent this from happening, we refcount the secondary handles.
+	 * If close is called on a secondary index handle while we're putting
+	 * to it, it won't really be closed--the refcount will simply drop,
+	 * and we'll be responsible for closing it here.
+	 */
+	sdbp = *sdbpp;
+	pdbp = sdbp->s_primary;
+	env = pdbp->env;
+	closeme = NULL;
+
+	/* Release our reference and take one on the next handle. */
+	MUTEX_LOCK(env, pdbp->mutex);
+	DB_ASSERT(env, sdbp->s_refcnt != 0);
+	if (--sdbp->s_refcnt == 0) {
+		LIST_REMOVE(sdbp, s_links);
+		closeme = sdbp;
+	}
+	sdbp = LIST_NEXT(sdbp, s_links);
+	if (sdbp != NULL)
+		sdbp->s_refcnt++;
+	MUTEX_UNLOCK(env, pdbp->mutex);
+
+	*sdbpp = sdbp;
+
+	/*
+	 * closeme->close() is a wrapper; call __db_close explicitly.
+	 */
+	if (closeme == NULL)
+		ret = 0;
+	else
+		ret = __db_close(closeme, txn, 0);
+
+	return (ret);
+}
+
+/*
+ * __db_s_done --
+ *	Properly decrement the refcount on a secondary database handle we're
+ *	using, without calling __db_s_next.
+ *
+ * PUBLIC: int __db_s_done __P((DB *, DB_TXN *));
+ */
+int
+__db_s_done(sdbp, txn)
+	DB *sdbp;
+	DB_TXN *txn;
+{
+	DB *pdbp;
+	ENV *env;
+	int last;
+
+	pdbp = sdbp->s_primary;
+	env = pdbp->env;
+
+	/*
+	 * Drop our reference under the primary's mutex.  If we held the
+	 * last one, it falls to us to unlink and close the handle (see
+	 * __db_s_next for the refcounting scheme).
+	 */
+	MUTEX_LOCK(env, pdbp->mutex);
+	DB_ASSERT(env, sdbp->s_refcnt != 0);
+	if ((last = (--sdbp->s_refcnt == 0)) != 0)
+		LIST_REMOVE(sdbp, s_links);
+	MUTEX_UNLOCK(env, pdbp->mutex);
+
+	return (last ? __db_close(sdbp, txn, 0) : 0);
+}
+
+/*
+ * __db_s_count --
+ *	Count the number of secondaries associated with a given primary.
+ */
+static int
+__db_s_count(pdbp)
+	DB *pdbp;
+{
+	DB *sdbp;
+	ENV *env;
+	int n;
+
+	env = pdbp->env;
+
+	/* Walk the list under the primary's mutex; see __db_s_next. */
+	n = 0;
+	MUTEX_LOCK(env, pdbp->mutex);
+	LIST_FOREACH(sdbp, &pdbp->s_secondaries, s_links)
+		++n;
+	MUTEX_UNLOCK(env, pdbp->mutex);
+
+	return (n);
+}
+
+/*
+ * __db_buildpartial --
+ *	Build the record that will result after a partial put is applied to
+ *	an existing record.
+ *
+ *	This should probably be merged with __bam_build, but that requires
+ *	a little trickery if we plan to keep the overflow-record optimization
+ *	in that function.
+ *
+ * PUBLIC: int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
+ */
+int
+__db_buildpartial(dbp, oldrec, partial, newrec)
+	DB *dbp;
+	DBT *oldrec, *partial, *newrec;
+{
+	ENV *env;
+	u_int32_t end, total;
+	u_int8_t *p;
+	int ret;
+
+	env = dbp->env;
+
+	DB_ASSERT(env, F_ISSET(partial, DB_DBT_PARTIAL));
+
+	/* Size the result and allocate a buffer to hold it. */
+	memset(newrec, 0, sizeof(DBT));
+	total = __db_partsize(oldrec->size, partial);
+	newrec->size = total;
+
+	if ((ret = __os_malloc(env, total, &p)) != 0)
+		return (ret);
+	newrec->data = p;
+
+	/*
+	 * Fill the whole buffer with the pad byte (fixed-length records)
+	 * or nul, so any gap between the old data and partial->doff is
+	 * well-defined.
+	 */
+	memset(p, F_ISSET(dbp, DB_AM_FIXEDLEN) ?
+	    ((BTREE *)dbp->bt_internal)->re_pad : 0, total);
+
+	/* Leading bytes of the original record, up to the partial offset. */
+	memcpy(p, oldrec->data,
+	    partial->doff > oldrec->size ? oldrec->size : partial->doff);
+
+	/* The replacement bytes themselves. */
+	memcpy(p + partial->doff, partial->data, partial->size);
+
+	/* Any original bytes surviving past the replaced region. */
+	end = partial->doff + partial->dlen;
+	if (oldrec->size > end)
+		memcpy(p + partial->doff + partial->size,
+		    (u_int8_t *)oldrec->data + end, oldrec->size - end);
+
+	return (0);
+}
+
+/*
+ * __db_partsize --
+ *	Given the number of bytes in an existing record and a DBT that
+ *	is about to be partial-put, calculate the size of the record
+ *	after the put.
+ *
+ *	This code is called from __bam_partsize.
+ *
+ * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *));
+ */
+u_int32_t
+__db_partsize(nbytes, data)
+	u_int32_t nbytes;
+	DBT *data;
+{
+	/*
+	 * If every byte being replaced ([doff, doff + dlen)) lies within
+	 * the existing record, the new size is the old size, minus the
+	 * replaced bytes, plus the new bytes.
+	 *
+	 * Otherwise some of the "replaced" bytes are past the end of the
+	 * record, so how many of them exist is irrelevant: the record
+	 * ends wherever the new bytes end, at doff + size.
+	 */
+	if (nbytes >= data->doff + data->dlen)
+		return (nbytes + data->size - data->dlen);
+
+	return (data->doff + data->size);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_check_skeyset --
+ *	Diagnostic check that the application's callback returns a set of
+ *	secondary keys without repeats.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __db_check_skeyset __P((DB *, DBT *));
+ * PUBLIC: #endif
+ */
+void
+__db_check_skeyset(sdbp, skeyp)
+	DB *sdbp;
+	DBT *skeyp;
+{
+	DBT *firstkey, *lastkey, *key1, *key2;
+	ENV *env;
+
+	env = sdbp->env;
+
+	/*
+	 * skeyp is a DB_DBT_MULTIPLE set: data points at an array of
+	 * skeyp->size DBTs.  Assert all pairs compare unequal.
+	 */
+	firstkey = (DBT *)skeyp->data;
+	lastkey = firstkey + skeyp->size;
+	for (key1 = firstkey; key1 < lastkey; key1++)
+		for (key2 = key1 + 1; key2 < lastkey; key2++)
+			DB_ASSERT(env,
+			    ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+			    key1, key2) != 0);
+}
+#endif
diff --git a/db/db_cds.c b/db/db_cds.c
new file mode 100644
index 0000000..5efda31
--- /dev/null
+++ b/db/db_cds.c
@@ -0,0 +1,177 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+static int __cdsgroup_abort __P((DB_TXN *txn));
+static int __cdsgroup_commit __P((DB_TXN *txn, u_int32_t flags));
+static int __cdsgroup_discard __P((DB_TXN *txn, u_int32_t flags));
+static u_int32_t __cdsgroup_id __P((DB_TXN *txn));
+static int __cdsgroup_notsup __P((ENV *env, const char *meth));
+static int __cdsgroup_prepare __P((DB_TXN *txn, u_int8_t *gid));
+static int __cdsgroup_set_name __P((DB_TXN *txn, const char *name));
+static int __cdsgroup_set_timeout
+ __P((DB_TXN *txn, db_timeout_t timeout, u_int32_t flags));
+
/*
 * __cdsgroup_notsup --
 *	Error when CDS groups don't support a method.
 *
 *	Reports the unsupported method name through the environment's
 *	error channel and returns DB_OPNOTSUP to the caller.
 */
static int
__cdsgroup_notsup(env, meth)
	ENV *env;
	const char *meth;
{
	__db_errx(env, "CDS groups do not support %s", meth);
	return (DB_OPNOTSUP);
}
+
+static int
+__cdsgroup_abort(txn)
+ DB_TXN *txn;
+{
+ return (__cdsgroup_notsup(txn->mgrp->env, "abort"));
+}
+
+static int
+__cdsgroup_commit(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ env = txn->mgrp->env;
+
+ /* Check for live cursors. */
+ if (txn->cursors != 0) {
+ __db_errx(env, "CDS group has active cursors");
+ return (EINVAL);
+ }
+
+ /* We may be holding handle locks; release them. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ ret = __lock_vec(env, txn->locker, 0, &lreq, 1, NULL);
+
+ env = txn->mgrp->env;
+ locker = txn->locker;
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int __cdsgroup_discard(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "discard"));
+}
+
+static u_int32_t __cdsgroup_id(txn)
+ DB_TXN *txn;
+{
+ return (txn->txnid);
+}
+
+static int __cdsgroup_prepare(txn, gid)
+ DB_TXN *txn;
+ u_int8_t *gid;
+{
+ COMPQUIET(gid, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "prepare"));
+}
+
+static int __cdsgroup_set_name(txn, name)
+ DB_TXN *txn;
+ const char *name;
+{
+ COMPQUIET(name, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_name"));
+}
+
+static int __cdsgroup_set_timeout(txn, timeout, flags)
+ DB_TXN *txn;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_timeout"));
+}
+
+/*
+ * __cds_txn_begin --
+ * ENV->cdsgroup_begin
+ *
+ * PUBLIC: int __cdsgroup_begin __P((DB_ENV *, DB_TXN **));
+ */
+int
+__cdsgroup_begin(dbenv, txnpp)
+ DB_ENV *dbenv;
+ DB_TXN **txnpp;
+{
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "cdsgroup_begin");
+ if (!CDB_LOCKING(env))
+ return (__env_not_config(env, "cdsgroup_begin", DB_INIT_CDB));
+
+ ENV_ENTER(env, ip);
+ *txnpp = txn = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0)
+ goto err;
+ /*
+ * We need a dummy DB_TXNMGR -- it's the only way to get from a
+ * transaction handle to the environment handle.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &txn->mgrp)) != 0)
+ goto err;
+ txn->mgrp->env = env;
+
+ if ((ret = __lock_id(env, &txn->txnid, &txn->locker)) != 0)
+ goto err;
+
+ txn->flags = TXN_CDSGROUP;
+ txn->abort = __cdsgroup_abort;
+ txn->commit = __cdsgroup_commit;
+ txn->discard = __cdsgroup_discard;
+ txn->id = __cdsgroup_id;
+ txn->prepare = __cdsgroup_prepare;
+ txn->set_name = __cdsgroup_set_name;
+ txn->set_timeout = __cdsgroup_set_timeout;
+
+ *txnpp = txn;
+
+ if (0) {
+err: if (txn != NULL) {
+ if (txn->mgrp != NULL)
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ }
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
diff --git a/db/db_conv.c b/db/db_conv.c
new file mode 100644
index 0000000..4572683
--- /dev/null
+++ b/db/db_conv.c
@@ -0,0 +1,733 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+
/*
 * __db_pgin --
 *	Primary page-swap routine.
 *
 *	Called by the memory pool as a page is read from disk: verify the
 *	page checksum, decrypt the page if encryption is configured, then
 *	dispatch to the access method's page-in routine for byte-swapping.
 *
 * PUBLIC: int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
 */
int
__db_pgin(dbenv, pg, pp, cookie)
	DB_ENV *dbenv;
	db_pgno_t pg;
	void *pp;
	DBT *cookie;
{
	DB dummydb, *dbp;
	DB_CIPHER *db_cipher;
	DB_LSN not_used;
	DB_PGINFO *pginfo;
	ENV *env;
	PAGE *pagep;
	size_t sum_len;
	int is_hmac, ret;
	u_int8_t *chksum;

	pginfo = (DB_PGINFO *)cookie->data;
	env = dbenv->env;
	pagep = (PAGE *)pp;

	ret = is_hmac = 0;
	chksum = NULL;
	/*
	 * Build a throwaway stack DB handle carrying only the flags and
	 * page size the swap/crypto macros below need.
	 */
	memset(&dummydb, 0, sizeof(DB));
	dbp = &dummydb;
	dbp->dbenv = dbenv;
	dbp->env = env;
	dbp->flags = pginfo->flags;
	dbp->pgsize = pginfo->db_pagesize;
	db_cipher = env->crypto_handle;
	/* Locate the checksum and decide whether it is an HMAC. */
	switch (pagep->type) {
	case P_HASHMETA:
	case P_BTREEMETA:
	case P_QAMMETA:
		/*
		 * If checksumming is set on the meta-page, we must set
		 * it in the dbp.
		 */
		if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM))
			F_SET(dbp, DB_AM_CHKSUM);
		else
			F_CLR(dbp, DB_AM_CHKSUM);
		if (((DBMETA *)pp)->encrypt_alg != 0 ||
		    F_ISSET(dbp, DB_AM_ENCRYPT))
			is_hmac = 1;
		/*
		 * !!!
		 * For all meta pages it is required that the chksum
		 * be at the same location. Use BTMETA to get to it
		 * for any meta type.
		 */
		chksum = ((BTMETA *)pp)->chksum;
		sum_len = DBMETASIZE;
		break;
	case P_INVALID:
		/*
		 * We assume that we've read a file hole if we have
		 * a zero LSN, zero page number and P_INVALID. Otherwise
		 * we have an invalid page that might contain real data.
		 */
		if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) {
			sum_len = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		chksum = P_CHKSUM(dbp, pagep);
		sum_len = pginfo->db_pagesize;
		/*
		 * If we are reading in a non-meta page, then if we have
		 * a db_cipher then we are using hmac.
		 */
		is_hmac = CRYPTO_ON(env) ? 1 : 0;
		break;
	}

	/*
	 * We expect a checksum error if there was a configuration problem.
	 * If there is no configuration problem and we don't get a match,
	 * it's fatal: panic the system.
	 */
	if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0) {
		/* Plain (non-HMAC) checksums were stored in native order. */
		if (F_ISSET(dbp, DB_AM_SWAP) && is_hmac == 0)
			P_32_SWAP(chksum);
		switch (ret = __db_check_chksum(
		    env, NULL, db_cipher, chksum, pp, sum_len, is_hmac)) {
		case 0:
			break;
		case -1:
			if (DBENV_LOGGING(env))
				(void)__db_cksum_log(
				    env, NULL, &not_used, DB_FLUSH);
			__db_errx(env,
	    "checksum error: page %lu: catastrophic recovery required",
			    (u_long)pg);
			return (__env_panic(env, DB_RUNRECOVERY));
		default:
			return (ret);
		}
	}
	if ((ret = __db_decrypt_pg(env, dbp, pagep)) != 0)
		return (ret);
	/* Dispatch to the access method's page-in routine. */
	switch (pagep->type) {
	case P_INVALID:
		if (pginfo->type == DB_QUEUE)
			return (__qam_pgin_out(env, pg, pp, cookie));
		else
			return (__ham_pgin(dbp, pg, pp, cookie));
	case P_HASH_UNSORTED:
	case P_HASH:
	case P_HASHMETA:
		return (__ham_pgin(dbp, pg, pp, cookie));
	case P_BTREEMETA:
	case P_IBTREE:
	case P_IRECNO:
	case P_LBTREE:
	case P_LDUP:
	case P_LRECNO:
	case P_OVERFLOW:
		return (__bam_pgin(dbp, pg, pp, cookie));
	case P_QAMMETA:
	case P_QAMDATA:
		return (__qam_pgin_out(env, pg, pp, cookie));
	default:
		break;
	}
	return (__db_pgfmt(env, pg));
}
+
+/*
+ * __db_pgout --
+ * Primary page-swap routine.
+ *
+ * PUBLIC: int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__db_pgout(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB dummydb, *dbp;
+ DB_PGINFO *pginfo;
+ ENV *env;
+ PAGE *pagep;
+ int ret;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ env = dbenv->env;
+ pagep = (PAGE *)pp;
+
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dbp->dbenv = dbenv;
+ dbp->env = env;
+ dbp->flags = pginfo->flags;
+ dbp->pgsize = pginfo->db_pagesize;
+ ret = 0;
+ switch (pagep->type) {
+ case P_INVALID:
+ if (pginfo->type == DB_QUEUE)
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ else
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ /*
+ * Support pgout of unsorted hash pages - since online
+ * replication upgrade can cause pages of this type to be
+ * written out.
+ *
+ * FALLTHROUGH
+ */
+ case P_HASHMETA:
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_BTREEMETA:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ ret = __bam_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_QAMMETA:
+ case P_QAMDATA:
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+ if (ret)
+ return (ret);
+
+ return (__db_encrypt_and_checksum_pg(env, dbp, pagep));
+}
+
/*
 * __db_decrypt_pg --
 *	Utility function to decrypt a db page.
 *
 *	A no-op unless the handle has DB_AM_ENCRYPT set.  The page header
 *	(P_OVERHEAD bytes) is never encrypted; only the body from pg_off
 *	to pg_len is decrypted in place, using the IV stored on the page.
 *
 * PUBLIC: int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
 */
int
__db_decrypt_pg (env, dbp, pagep)
	ENV *env;
	DB *dbp;
	PAGE *pagep;
{
	DB_CIPHER *db_cipher;
	size_t pg_len, pg_off;
	u_int8_t *iv;
	int ret;

	db_cipher = env->crypto_handle;
	ret = 0;
	iv = NULL;
	if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
		DB_ASSERT(env, db_cipher != NULL);
		/* Encryption always implies checksumming. */
		DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));

		pg_off = P_OVERHEAD(dbp);
		DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);

		/* Locate the IV and the length of the encrypted region. */
		switch (pagep->type) {
		case P_HASHMETA:
		case P_BTREEMETA:
		case P_QAMMETA:
			/*
			 * !!!
			 * For all meta pages it is required that the iv
			 * be at the same location. Use BTMETA to get to it
			 * for any meta type.
			 */
			iv = ((BTMETA *)pagep)->iv;
			pg_len = DBMETASIZE;
			break;
		case P_INVALID:
			/* A file hole (zero LSN/pgno) has nothing to decrypt. */
			if (IS_ZERO_LSN(LSN(pagep)) &&
			    pagep->pgno == PGNO_INVALID) {
				pg_len = 0;
				break;
			}
			/* FALLTHROUGH */
		default:
			iv = P_IV(dbp, pagep);
			pg_len = dbp->pgsize;
			break;
		}
		if (pg_len != 0)
			ret = db_cipher->decrypt(env, db_cipher->data,
			    iv, ((u_int8_t *)pagep) + pg_off,
			    pg_len - pg_off);
	}
	return (ret);
}
+
/*
 * __db_encrypt_and_checksum_pg --
 *	Utility function to encrypt and checksum a db page.
 *
 *	Encryption (if DB_AM_ENCRYPT) happens first, then the checksum
 *	(if DB_AM_CHKSUM) is computed over the encrypted image -- the
 *	inverse of the order used by __db_pgin on the way in.
 *
 * PUBLIC: int __db_encrypt_and_checksum_pg
 * PUBLIC: __P((ENV *, DB *, PAGE *));
 */
int
__db_encrypt_and_checksum_pg (env, dbp, pagep)
	ENV *env;
	DB *dbp;
	PAGE *pagep;
{
	DB_CIPHER *db_cipher;
	int ret;
	size_t pg_off, pg_len, sum_len;
	u_int8_t *chksum, *iv, *key;

	/* key stays NULL for checksum-only (non-encrypted) databases. */
	chksum = iv = key = NULL;
	db_cipher = env->crypto_handle;

	if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
		DB_ASSERT(env, db_cipher != NULL);
		/* Encryption always implies checksumming. */
		DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));

		pg_off = P_OVERHEAD(dbp);
		DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);

		key = db_cipher->mac_key;

		/* Locate the IV and the length of the region to encrypt. */
		switch (pagep->type) {
		case P_HASHMETA:
		case P_BTREEMETA:
		case P_QAMMETA:
			/*
			 * !!!
			 * For all meta pages it is required that the iv
			 * be at the same location. Use BTMETA to get to it
			 * for any meta type.
			 */
			iv = ((BTMETA *)pagep)->iv;
			pg_len = DBMETASIZE;
			break;
		default:
			iv = P_IV(dbp, pagep);
			pg_len = dbp->pgsize;
			break;
		}
		if ((ret = db_cipher->encrypt(env, db_cipher->data,
		    iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0)
			return (ret);
	}
	if (F_ISSET(dbp, DB_AM_CHKSUM)) {
		/* Locate the checksum field and the region it covers. */
		switch (pagep->type) {
		case P_HASHMETA:
		case P_BTREEMETA:
		case P_QAMMETA:
			/*
			 * !!!
			 * For all meta pages it is required that the chksum
			 * be at the same location. Use BTMETA to get to it
			 * for any meta type.
			 */
			chksum = ((BTMETA *)pagep)->chksum;
			sum_len = DBMETASIZE;
			break;
		default:
			chksum = P_CHKSUM(dbp, pagep);
			sum_len = dbp->pgsize;
			break;
		}
		__db_chksum(NULL, (u_int8_t *)pagep, sum_len, key, chksum);
		/* Non-HMAC checksums are stored byte-swapped on swapped DBs. */
		if (F_ISSET(dbp, DB_AM_SWAP) && !F_ISSET(dbp, DB_AM_ENCRYPT))
			P_32_SWAP(chksum);
	}
	return (0);
}
+
/*
 * __db_metaswap --
 *	Byteswap the common part of the meta-data page.
 *
 *	Walks the fixed layout of DBMETA field by field, swapping each
 *	32-bit field in place.  The field order here must match the
 *	DBMETA structure exactly.
 */
void
__db_metaswap(pg)
	PAGE *pg;
{
	u_int8_t *p;

	p = (u_int8_t *)pg;

	/* Swap the meta-data information. */
	SWAP32(p);		/* lsn.file */
	SWAP32(p);		/* lsn.offset */
	SWAP32(p);		/* pgno */
	SWAP32(p);		/* magic */
	SWAP32(p);		/* version */
	SWAP32(p);		/* pagesize */
	p += 4;			/* unused, page type, unused, unused */
	SWAP32(p);		/* free */
	SWAP32(p);		/* alloc_lsn part 1 */
	SWAP32(p);		/* alloc_lsn part 2 */
	SWAP32(p);		/* cached key count */
	SWAP32(p);		/* cached record count */
	SWAP32(p);		/* flags */
}
+
/*
 * __db_byteswap --
 *	Byteswap an ordinary database page.
 *
 *	pgin is non-zero when converting a page read from disk to host
 *	order, zero when converting to on-disk order.  The direction
 *	matters: on the way in, the inp index array must be swapped
 *	before it can be used to find entries; on the way out, the
 *	header and index array must be swapped last, after the entries
 *	they locate have been processed.
 *
 * PUBLIC: int __db_byteswap
 * PUBLIC: __P((DB *, db_pgno_t, PAGE *, size_t, int));
 */
int
__db_byteswap(dbp, pg, h, pagesize, pgin)
	DB *dbp;
	db_pgno_t pg;
	PAGE *h;
	size_t pagesize;
	int pgin;
{
	ENV *env;
	BINTERNAL *bi;
	BKEYDATA *bk;
	BOVERFLOW *bo;
	RINTERNAL *ri;
	db_indx_t i, *inp, len, tmp;
	u_int8_t *end, *p, *pgend;

	if (pagesize == 0)
		return (0);

	env = dbp->env;

	/* Incoming pages: make the header usable before touching entries. */
	if (pgin) {
		M_32_SWAP(h->lsn.file);
		M_32_SWAP(h->lsn.offset);
		M_32_SWAP(h->pgno);
		M_32_SWAP(h->prev_pgno);
		M_32_SWAP(h->next_pgno);
		M_16_SWAP(h->entries);
		M_16_SWAP(h->hf_offset);
	}

	pgend = (u_int8_t *)h + pagesize;

	inp = P_INP(dbp, h);
	/* Out-of-bounds index array: nothing more we can safely swap. */
	if ((u_int8_t *)inp >= pgend)
		goto out;

	switch (TYPE(h)) {
	case P_HASH_UNSORTED:
	case P_HASH:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			/* Skip entries whose offsets point off the page. */
			if (P_ENTRY(dbp, h, i) >= pgend)
				continue;

			switch (HPAGE_TYPE(dbp, h, i)) {
			case H_KEYDATA:
				break;
			case H_DUPLICATE:
				/*
				 * On-page duplicates are a sequence of
				 * [len][data][len] triples; swap each
				 * leading and trailing length.
				 */
				len = LEN_HKEYDATA(dbp, h, pagesize, i);
				p = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
				for (end = p + len; p < end;) {
					if (pgin) {
						P_16_SWAP(p);
						memcpy(&tmp,
						    p, sizeof(db_indx_t));
						p += sizeof(db_indx_t);
					} else {
						memcpy(&tmp,
						    p, sizeof(db_indx_t));
						SWAP16(p);
					}
					p += tmp;
					SWAP16(p);
				}
				break;
			case H_OFFDUP:
				p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
				SWAP32(p);			/* pgno */
				break;
			case H_OFFPAGE:
				p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
				SWAP32(p);			/* pgno */
				SWAP32(p);			/* tlen */
				break;
			default:
				return (__db_pgfmt(env, pg));
			}

		}

		/*
		 * The offsets in the inp array are used to determine
		 * the size of entries on a page; therefore they
		 * cannot be converted until we've done all the
		 * entries.
		 */
		if (!pgin)
			for (i = 0; i < NUM_ENT(h); i++)
				M_16_SWAP(inp[i]);
		break;
	case P_LBTREE:
	case P_LDUP:
	case P_LRECNO:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			/*
			 * In the case of on-page duplicates, key information
			 * should only be swapped once.
			 */
			if (h->type == P_LBTREE && i > 1) {
				if (pgin) {
					if (inp[i] == inp[i - 2])
						continue;
				} else {
					/*
					 * Outgoing: offsets are still in host
					 * order, so swap temporarily to
					 * compare, then restore.
					 */
					M_16_SWAP(inp[i]);
					if (inp[i] == inp[i - 2])
						continue;
					M_16_SWAP(inp[i]);
				}
			}

			bk = GET_BKEYDATA(dbp, h, i);
			if ((u_int8_t *)bk >= pgend)
				continue;
			switch (B_TYPE(bk->type)) {
			case B_KEYDATA:
				M_16_SWAP(bk->len);
				break;
			case B_DUPLICATE:
			case B_OVERFLOW:
				bo = (BOVERFLOW *)bk;
				M_32_SWAP(bo->pgno);
				M_32_SWAP(bo->tlen);
				break;
			default:
				return (__db_pgfmt(env, pg));
			}

			if (!pgin)
				M_16_SWAP(inp[i]);
		}
		break;
	case P_IBTREE:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			bi = GET_BINTERNAL(dbp, h, i);
			if ((u_int8_t *)bi >= pgend)
				continue;

			M_16_SWAP(bi->len);
			M_32_SWAP(bi->pgno);
			M_32_SWAP(bi->nrecs);

			switch (B_TYPE(bi->type)) {
			case B_KEYDATA:
				break;
			case B_DUPLICATE:
			case B_OVERFLOW:
				bo = (BOVERFLOW *)bi->data;
				M_32_SWAP(bo->pgno);
				M_32_SWAP(bo->tlen);
				break;
			default:
				return (__db_pgfmt(env, pg));
			}

			if (!pgin)
				M_16_SWAP(inp[i]);
		}
		break;
	case P_IRECNO:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			ri = GET_RINTERNAL(dbp, h, i);
			if ((u_int8_t *)ri >= pgend)
				continue;

			M_32_SWAP(ri->pgno);
			M_32_SWAP(ri->nrecs);

			if (!pgin)
				M_16_SWAP(inp[i]);
		}
		break;
	case P_INVALID:
	case P_OVERFLOW:
	case P_QAMDATA:
		/* Nothing to do. */
		break;
	default:
		return (__db_pgfmt(env, pg));
	}

	/* Outgoing pages: swap the header only after entries are done. */
out:	if (!pgin) {
		/* Swap the header information. */
		M_32_SWAP(h->lsn.file);
		M_32_SWAP(h->lsn.offset);
		M_32_SWAP(h->pgno);
		M_32_SWAP(h->prev_pgno);
		M_32_SWAP(h->next_pgno);
		M_16_SWAP(h->entries);
		M_16_SWAP(h->hf_offset);
	}
	return (0);
}
+
/*
 * __db_pageswap --
 *	Byteswap any database page.  Normally, the page to be swapped will be
 *	referenced by the "pp" argument and the pdata argument will be NULL.
 *	This function is also called by automatically generated log functions,
 *	where the page may be split into separate header and data parts.  In
 *	that case, pdata is not NULL and we reconstitute a full page image
 *	from the two pieces, swap it, and then split it apart again.
 *
 * PUBLIC: int __db_pageswap
 * PUBLIC: __P((DB *, void *, size_t, DBT *, int));
 */
int
__db_pageswap(dbp, pp, len, pdata, pgin)
	DB *dbp;
	void *pp;
	size_t len;
	DBT *pdata;
	int pgin;
{
	ENV *env;
	db_pgno_t pg;
	size_t pgsize;
	void *pgcopy;
	int ret;
	u_int16_t hoffset;

	env = dbp->env;

	switch (TYPE(pp)) {
	case P_BTREEMETA:
		return (__bam_mswap(env, pp));

	case P_HASHMETA:
		return (__ham_mswap(env, pp));

	case P_QAMMETA:
		return (__qam_mswap(env, pp));

	case P_INVALID:
	case P_OVERFLOW:
	case P_QAMDATA:
		/*
		 * We may have been passed an invalid page, or a queue data
		 * page, or an overflow page where fields like hoffset have a
		 * special meaning.  In that case, no swapping of the page data
		 * is required, just the fields in the page header.
		 */
		pdata = NULL;
		break;

	default:
		break;
	}

	/*
	 * We need pgno and hoffset in host order; on the way in they are
	 * still in on-disk order and must be copy-swapped first.
	 */
	if (pgin) {
		P_32_COPYSWAP(&PGNO(pp), &pg);
		P_16_COPYSWAP(&HOFFSET(pp), &hoffset);
	} else {
		pg = PGNO(pp);
		hoffset = HOFFSET(pp);
	}

	if (pdata == NULL)
		ret = __db_byteswap(dbp, pg, (PAGE *)pp, len, pgin);
	else {
		/* Rebuild a full page image: header at 0, data at hoffset. */
		pgsize = hoffset + pdata->size;
		if ((ret = __os_malloc(env, pgsize, &pgcopy)) != 0)
			return (ret);
		memset(pgcopy, 0, pgsize);
		memcpy(pgcopy, pp, len);
		memcpy((u_int8_t *)pgcopy + hoffset, pdata->data, pdata->size);

		ret = __db_byteswap(dbp, pg, (PAGE *)pgcopy, pgsize, pgin);
		memcpy(pp, pgcopy, len);

		/*
		 * If we are swapping data to be written to the log, we can't
		 * overwrite the buffer that was passed in: it may be a pointer
		 * into a page in cache.  We set DB_DBT_APPMALLOC here so that
		 * the calling code can free the memory we allocate here.
		 */
		if (!pgin) {
			if ((ret =
			    __os_malloc(env, pdata->size, &pdata->data)) != 0) {
				__os_free(env, pgcopy);
				return (ret);
			}
			F_SET(pdata, DB_DBT_APPMALLOC);
		}
		memcpy(pdata->data, (u_int8_t *)pgcopy + hoffset, pdata->size);
		__os_free(env, pgcopy);
	}

	return (ret);
}
diff --git a/db/db_dispatch.c b/db/db_dispatch.c
new file mode 100644
index 0000000..65dc260
--- /dev/null
+++ b/db/db_dispatch.c
@@ -0,0 +1,953 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
+ db_txnlist_type, u_int32_t, DB_TXNLIST **,
+ int, u_int32_t *));
+
/*
 * __db_dispatch --
 *
 * This is the transaction dispatch function used by the db access methods.
 * It is designed to handle the record format used by all the access
 * methods (the one automatically generated by the db_{h,log,read}.sh
 * scripts in the tools directory). An application using a different
 * recovery paradigm will supply a different dispatch function to txn_open.
 *
 * Decides, per recovery pass (redo), whether this log record should be
 * passed to its recovery function at all, then routes it to either the
 * internal dispatch table, the application's app_dispatch callback, or
 * the external (application-registered) dispatch table.
 *
 * PUBLIC: int __db_dispatch __P((ENV *,
 * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *));
 */
int
__db_dispatch(env, dtab, db, lsnp, redo, info)
	ENV *env;		/* The environment. */
	DB_DISTAB *dtab;
	DBT *db;		/* The log record upon which to dispatch. */
	DB_LSN *lsnp;		/* The lsn of the record being dispatched. */
	db_recops redo;		/* Redo this op (or undo it). */
	DB_TXNHEAD *info;	/* Transaction list. */
{
	DB_ENV *dbenv;
	DB_LSN prev_lsn;
	u_int32_t rectype, status, txnid, urectype;
	int make_call, ret;

	dbenv = env->dbenv;
	/* Every log record starts with its type followed by its txnid. */
	LOGCOPY_32(env, &rectype, db->data);
	LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));

	make_call = ret = 0;

	/* If we don't have a dispatch table, it's hard to dispatch. */
	DB_ASSERT(env, dtab != NULL);

	/*
	 * If we find a record that is in the user's number space and they
	 * have specified a recovery routine, let them handle it. If they
	 * didn't specify a recovery routine, then we expect that they've
	 * followed all our rules and registered new recovery functions.
	 */
	switch (redo) {
	case DB_TXN_ABORT:
	case DB_TXN_APPLY:
	case DB_TXN_PRINT:
		make_call = 1;
		break;
	case DB_TXN_OPENFILES:
		/*
		 * We collect all the transactions that have
		 * "begin" records, those with no previous LSN,
		 * so that we do not abort partial transactions.
		 * These are known to be undone, otherwise the
		 * log would not have been freeable.
		 */
		LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
		    sizeof(rectype) + sizeof(txnid));
		if (txnid != 0 && prev_lsn.file == 0 && (ret =
		    __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0)
			return (ret);

		/* FALLTHROUGH */
	case DB_TXN_POPENFILES:
		if (rectype == DB___dbreg_register ||
		    rectype == DB___txn_child ||
		    rectype == DB___txn_ckp || rectype == DB___txn_recycle)
			return ((dtab->int_dispatch[rectype])(env,
			    db, lsnp, redo, info));
		break;
	case DB_TXN_BACKWARD_ROLL:
		/*
		 * Running full recovery in the backward pass. In general,
		 * we only process records during this pass that belong
		 * to aborted transactions. Unfortunately, there are several
		 * exceptions:
		 * 1. If this is a meta-record, one not associated with
		 *    a transaction, then we must always process it.
		 * 2. If this is a transaction commit/abort, we must
		 *    always process it, so that we know the status of
		 *    every transaction.
		 * 3. If this is a child commit, we need to process it
		 *    because the outcome of the child transaction depends
		 *    on the outcome of the parent.
		 * 4. If this is a dbreg_register record, we must always
		 *    process is because they contain non-transactional
		 *    closes that must be properly handled.
		 * 5. If this is a noop, we must always undo it so that we
		 *    properly handle any aborts before a file was closed.
		 * 6. If this a file remove, we need to process it to
		 *    determine if the on-disk file is the same as the
		 *    one being described.
		 */
		switch (rectype) {
		/*
		 * These either do not belong to a transaction or (regop)
		 * must be processed regardless of the status of the
		 * transaction.
		 */
		case DB___txn_regop:
		case DB___txn_recycle:
		case DB___txn_ckp:
			make_call = 1;
			break;
		/*
		 * These belong to a transaction whose status must be
		 * checked.
		 */
		case DB___txn_child:
		case DB___db_noop:
		case DB___fop_file_remove:
		case DB___dbreg_register:
			make_call = 1;

			/* FALLTHROUGH */
		default:
			if (txnid == 0)
				break;

			ret = __db_txnlist_find(env, info, txnid, &status);

			/* If not found, this is an incomplete abort. */
			if (ret == DB_NOTFOUND)
				return (__db_txnlist_add(env,
				    info, txnid, TXN_IGNORE, lsnp));
			if (ret != 0)
				return (ret);

			/*
			 * If we ignore the transaction, ignore the operation
			 * UNLESS this is a child commit in which case we need
			 * to make sure that the child also gets marked as
			 * ignore.
			 */
			if (status == TXN_IGNORE && rectype != DB___txn_child) {
				make_call = 0;
				break;
			}
			if (status == TXN_COMMIT)
				break;

			/* Set make_call in case we came through default */
			make_call = 1;
			if (status == TXN_OK &&
			    (ret = __db_txnlist_update(env,
			    info, txnid, rectype == DB___txn_prepare ?
			    TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
				return (ret);
		}
		break;
	case DB_TXN_FORWARD_ROLL:
		/*
		 * In the forward pass, if we haven't seen the transaction,
		 * do nothing, else recover it.
		 *
		 * We need to always redo DB___db_noop records, so that we
		 * properly handle any commits after the file was closed.
		 */
		switch (rectype) {
		case DB___txn_recycle:
		case DB___txn_ckp:
		case DB___db_noop:
		case DB___dbreg_register:
			make_call = 1;
			break;

		default:
			if (txnid == 0)
				status = 0;
			else {
				ret = __db_txnlist_find(env,
				    info, txnid, &status);

				if (ret == DB_NOTFOUND)
					/* Break out of the if clause. */
					;
				else if (ret != 0)
					return (ret);
				else if (status == TXN_COMMIT) {
					make_call = 1;
					break;
				}
			}

		}
		break;
	default:
		return (__db_unknown_flag(
		    env, "__db_dispatch", (u_int32_t)redo));
	}

	if (make_call) {
		/*
		 * If the debug flag is set then we are logging
		 * records for a non-durable update so that they
		 * may be examined for diagnostic purposes.
		 * So only make the call if we are printing,
		 * otherwise we need to extract the previous
		 * lsn so undo will work properly.
		 */
		if (rectype & DB_debug_FLAG) {
			if (redo == DB_TXN_PRINT)
				rectype &= ~DB_debug_FLAG;
			else {
				LOGCOPY_TOLSN(env, lsnp,
				    (u_int8_t *)db->data +
				    sizeof(rectype) +
				    sizeof(txnid));
				return (0);
			}
		}
		/* Route to the application or internal dispatch table. */
		if (rectype >= DB_user_BEGIN) {
			if (dbenv->app_dispatch != NULL)
				return (dbenv->app_dispatch(dbenv,
				    db, lsnp, redo));

			/* No application-specific dispatch */
			urectype = rectype - DB_user_BEGIN;
			if (urectype > dtab->ext_size ||
			    dtab->ext_dispatch[urectype] == NULL) {
				__db_errx(env,
		    "Illegal application-specific record type %lu in log",
				    (u_long)rectype);
				return (EINVAL);
			}
			return ((dtab->ext_dispatch[urectype])(dbenv,
			    db, lsnp, redo));
		} else {
			if (rectype > dtab->int_size ||
			    dtab->int_dispatch[rectype] == NULL) {
				__db_errx(env,
				    "Illegal record type %lu in log",
				    (u_long)rectype);
				return (EINVAL);
			}
			return ((dtab->int_dispatch[rectype])(env,
			    db, lsnp, redo, info));
		}
	}

	return (0);
}
+
+/*
+ * __db_add_recovery -- Add recovery functions to the dispatch table.
+ *
+ * We have two versions of this, an external one and an internal one,
+ * because application-specific functions take different arguments
+ * for dispatch (ENV versus DB_ENV).
+ *
+ * This is the external version.
+ *
+ * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
+ */
+int
+__db_add_recovery(dbenv, dtab, func, ndx)
+ DB_ENV *dbenv;
+ DB_DISTAB *dtab;
+ int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ /* Make sure this is an application-specific record. */
+ if (ndx < DB_user_BEGIN) {
+ __db_errx(dbenv->env,
+ "Attempting to add application-specific record with invalid type %lu",
+ (u_long)ndx);
+ return (EINVAL);
+ }
+ ndx -= DB_user_BEGIN;
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->ext_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(dbenv->env, nsize *
+ sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch))
+ != 0)
+ return (ret);
+ for (i = dtab->ext_size; i < nsize; ++i)
+ (dtab->ext_dispatch)[i] = NULL;
+ dtab->ext_size = nsize;
+ }
+
+ (dtab->ext_dispatch)[ndx] = func;
+ return (0);
+}
+
+/*
+ * __db_add_recovery_int --
+ *
+ * Internal version of dispatch addition function.
+ *
+ *
+ * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+ */
+int
+__db_add_recovery_int(env, dtab, func, ndx)
+ ENV *env;
+ DB_DISTAB *dtab;
+ int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ if (ndx >= DB_user_BEGIN) {
+ __db_errx(env,
+ "Attempting to add internal record with invalid type %lu",
+ (u_long)ndx);
+ return (EINVAL);
+ }
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->int_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]),
+ &dtab->int_dispatch)) != 0)
+ return (ret);
+ for (i = dtab->int_size; i < nsize; ++i)
+ (dtab->int_dispatch)[i] = NULL;
+ dtab->int_size = nsize;
+ }
+
+ (dtab->int_dispatch)[ndx] = func;
+ return (0);
+}
+
+/*
+ * __db_txnlist_init --
+ * Initialize transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
+ */
int
__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
	ENV *env;
	DB_THREAD_INFO *ip;
	u_int32_t low_txn, hi_txn;
	DB_LSN *trunc_lsn;
	DB_TXNHEAD **retp;
{
	DB_TXNHEAD *headp;
	u_int32_t size, tmp;
	int ret;

	/*
	 * Size a hash table.
	 * If low is zero then we are being called during rollback
	 * and we need only one slot.
	 * Hi may be lower than low if we have recycled txnid's.
	 * The numbers here are guesses about txn density, we can afford
	 * to look at a few entries in each slot.
	 */
	if (low_txn == 0)
		size = 1;
	else {
		/* Normalize the range so low_txn <= hi_txn. */
		if (hi_txn < low_txn) {
			tmp = hi_txn;
			hi_txn = low_txn;
			low_txn = tmp;
		}
		tmp = hi_txn - low_txn;
		/* See if we wrapped around. */
		if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
			tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
		/* One slot per ~5 ids, with a floor of 100 slots. */
		size = tmp / 5;
		if (size < 100)
			size = 100;
	}
	/* The bucket array is allocated inline, after the header struct. */
	if ((ret = __os_malloc(env,
	    sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
		return (ret);

	memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
	headp->maxid = hi_txn;
	headp->generation = 0;
	headp->nslots = size;
	headp->gen_alloc = 8;	/* Initial generation-array capacity. */
	headp->thread_info = ip;
	if ((ret = __os_malloc(env, headp->gen_alloc *
	    sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
		/* Don't leak the header on a partial failure. */
		__os_free(env, headp);
		return (ret);
	}
	/* Generation 0 covers the whole txnid space until ids are recycled. */
	headp->gen_array[0].generation = 0;
	headp->gen_array[0].txn_min = TXN_MINIMUM;
	headp->gen_array[0].txn_max = TXN_MAXIMUM;
	if (trunc_lsn != NULL) {
		headp->trunc_lsn = *trunc_lsn;
		headp->maxlsn = *trunc_lsn;
	} else {
		ZERO_LSN(headp->trunc_lsn);
		ZERO_LSN(headp->maxlsn);
	}
	ZERO_LSN(headp->ckplsn);

	*retp = headp;
	return (0);
}
+
/*
 * FIND_GENERATION --
 *	Scan the generation array for the first range containing txnid and
 *	store that range's generation number in gen.  Entry 0 is the most
 *	recently pushed range (see __db_txnlist_gen), so an id appearing in
 *	several recycled ranges resolves to the newest one.
 *	NOTE(review): the DB_ASSERT references a variable named "env" that
 *	must exist in the caller's scope -- macro capture, not a parameter.
 */
#define	FIND_GENERATION(hp, txnid, gen) do {				\
	u_int32_t __i;							\
	for (__i = 0; __i <= (hp)->generation; __i++)			\
		/* The range may wrap around the end. */		\
		if ((hp)->gen_array[__i].txn_min <			\
		    (hp)->gen_array[__i].txn_max ?			\
		    ((txnid) >= (hp)->gen_array[__i].txn_min &&		\
		    (txnid) <= (hp)->gen_array[__i].txn_max) :		\
		    ((txnid) >= (hp)->gen_array[__i].txn_min ||		\
		    (txnid) <= (hp)->gen_array[__i].txn_max))		\
			break;						\
	DB_ASSERT(env, __i <= (hp)->generation);			\
	gen = (hp)->gen_array[__i].generation;				\
} while (0)
+
+/*
+ * __db_txnlist_add --
+ * Add an element to our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_add __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
+ */
int
__db_txnlist_add(env, hp, txnid, status, lsn)
	ENV *env;
	DB_TXNHEAD *hp;
	u_int32_t txnid, status;
	DB_LSN *lsn;
{
	DB_TXNLIST *elp;
	int ret;

	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
		return (ret);

	/* Hash on the txnid to pick a bucket; newest entries go first. */
	LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);

	/* Find the most recent generation containing this ID */
	FIND_GENERATION(hp, txnid, elp->u.t.generation);
	elp->type = TXNLIST_TXNID;
	elp->u.t.txnid = txnid;
	elp->u.t.status = status;
	if (txnid > hp->maxid)
		hp->maxid = txnid;
	/*
	 * Only the first commit LSN seen is recorded as the maximum --
	 * presumably commits arrive newest-first during the backward
	 * recovery scan, so the first one is the largest; TODO confirm.
	 */
	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
		hp->maxlsn = *lsn;

	DB_ASSERT(env, lsn == NULL ||
	    status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);

	return (0);
}
+
+/*
+ * __db_txnlist_remove --
+ * Remove an element from our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
+ */
+int
+__db_txnlist_remove(env, hp, txnid)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid;
+{
+ DB_TXNLIST *entry;
+ u_int32_t status;
+
+ return (__db_txnlist_find_internal(env,
+ hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
+}
+
+/*
+ * __db_txnlist_ckp --
+ * Used to record the maximum checkpoint that will be retained
+ * after recovery. Typically this is simply the max checkpoint, but
+ * if we are doing client replication recovery or timestamp-based
+ * recovery, we are going to virtually truncate the log and we need
+ * to retain the last checkpoint before the truncation point.
+ *
+ * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+void
+__db_txnlist_ckp(env, hp, ckp_lsn)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *ckp_lsn;
+{
+
+ COMPQUIET(env, NULL);
+
+ if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
+ LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
+ hp->ckplsn = *ckp_lsn;
+}
+
+/*
+ * __db_txnlist_end --
+ * Discard transaction linked list.
+ *
+ * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
+ */
void
__db_txnlist_end(env, hp)
	ENV *env;
	DB_TXNHEAD *hp;
{
	u_int32_t i;
	DB_TXNLIST *p;

	if (hp == NULL)
		return;

	/* Drain every hash bucket, freeing per-entry resources first. */
	for (i = 0; i < hp->nslots; i++)
		while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
			switch (p->type) {
			case TXNLIST_LSN:
				/* LSN entries own a separately allocated stack. */
				__os_free(env, p->u.l.lsn_stack);
				break;
			case TXNLIST_DELETE:
			case TXNLIST_TXNID:
			default:
				/*
				 * Possibly an incomplete DB_TXNLIST; just
				 * free it.
				 */
				break;
			}
			LIST_REMOVE(p, links);
			__os_free(env, p);
		}

	/* The generation array may be NULL if init failed part-way. */
	if (hp->gen_array != NULL)
		__os_free(env, hp->gen_array);
	__os_free(env, hp);
}
+
+/*
+ * __db_txnlist_find --
+ * Checks to see if a txnid with the current generation is in the
+ * txnid list. This returns DB_NOTFOUND if the item isn't in the
+ * list otherwise it returns (like __db_txnlist_find_internal)
+ * the status of the transaction. A txnid of 0 means the record
+ * was generated while not in a transaction.
+ *
+ * PUBLIC: int __db_txnlist_find __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *));
+ */
+int
+__db_txnlist_find(env, hp, txnid, statusp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, *statusp;
+{
+ DB_TXNLIST *entry;
+
+ if (txnid == 0)
+ return (DB_NOTFOUND);
+
+ return (__db_txnlist_find_internal(env, hp,
+ TXNLIST_TXNID, txnid, &entry, 0, statusp));
+}
+
+/*
+ * __db_txnlist_update --
+ * Change the status of an existing transaction entry.
+ * Returns DB_NOTFOUND if no such entry exists.
+ *
+ * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
+ */
int
__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
	ENV *env;
	DB_TXNHEAD *hp;
	u_int32_t txnid, status;
	DB_LSN *lsn;
	u_int32_t *ret_status;
	int add_ok;
{
	DB_TXNLIST *elp;
	int ret;

	/* Txnid 0 marks a non-transactional record; never on the list. */
	if (txnid == 0)
		return (DB_NOTFOUND);

	ret = __db_txnlist_find_internal(env,
	    hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);

	/* Optionally create the entry when it's missing. */
	if (ret == DB_NOTFOUND && add_ok) {
		*ret_status = status;
		return (__db_txnlist_add(env, hp, txnid, status, lsn));
	}
	if (ret != 0)
		return (ret);

	/* Ignored transactions keep their status unchanged. */
	if (*ret_status == TXN_IGNORE)
		return (0);

	elp->u.t.status = status;

	/* Record the first commit LSN seen as the maximum LSN. */
	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
		hp->maxlsn = *lsn;

	return (ret);
}
+
+/*
+ * __db_txnlist_find_internal --
+ * Find an entry on the transaction list. If the entry is not there or
+ * the list pointer is not initialized we return DB_NOTFOUND. If the
+ * item is found, we return the status. Currently we always call this
+ * with an initialized list pointer but checking for NULL keeps it general.
+ */
static int
__db_txnlist_find_internal(env,
    hp, type, txnid, txnlistp, delete, statusp)
	ENV *env;
	DB_TXNHEAD *hp;
	db_txnlist_type type;
	u_int32_t txnid;
	DB_TXNLIST **txnlistp;
	int delete;
	u_int32_t *statusp;
{
	struct __db_headlink *head;
	DB_TXNLIST *p;
	u_int32_t generation, hash;
	int ret;

	ret = 0;

	if (hp == NULL)
		return (DB_NOTFOUND);

	/* Only txnid lookups are supported; anything else is a caller bug. */
	switch (type) {
	case TXNLIST_TXNID:
		hash = txnid;
		/* Resolve the txnid to its most recent generation. */
		FIND_GENERATION(hp, txnid, generation);
		break;
	case TXNLIST_DELETE:
	case TXNLIST_LSN:
	default:
		return (__env_panic(env, EINVAL));
	}

	head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
	LIST_FOREACH(p, head, links) {
		if (p->type != type)
			continue;
		switch (type) {
		case TXNLIST_TXNID:
			/* A match requires both the id and its generation. */
			if (p->u.t.txnid != txnid ||
			    generation != p->u.t.generation)
				continue;
			*statusp = p->u.t.status;
			break;

		case TXNLIST_DELETE:
		case TXNLIST_LSN:
		default:
			return (__env_panic(env, EINVAL));
		}
		if (delete == 1) {
			/* Caller asked us to remove the entry. */
			LIST_REMOVE(p, links);
			__os_free(env, p);
			*txnlistp = NULL;
		} else if (p != LIST_FIRST(head)) {
			/* Move it to head of list. */
			LIST_REMOVE(p, links);
			LIST_INSERT_HEAD(head, p, links);
			*txnlistp = p;
		} else
			*txnlistp = p;
		return (ret);
	}

	return (DB_NOTFOUND);
}
+
+/*
+ * __db_txnlist_gen --
+ * Change the current generation number.
+ *
+ * PUBLIC: int __db_txnlist_gen __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t));
+ */
int
__db_txnlist_gen(env, hp, incr, min, max)
	ENV *env;
	DB_TXNHEAD *hp;
	int incr;
	u_int32_t min, max;
{
	int ret;

	/*
	 * During recovery generation numbers keep track of "restart"
	 * checkpoints and recycle records.  Restart checkpoints occur
	 * whenever we take a checkpoint and there are no outstanding
	 * transactions.  When that happens, we can reset transaction IDs
	 * back to TXNID_MINIMUM.  Currently we only do the reset
	 * at the end of recovery.  Recycle records occur when txnids
	 * are exhausted during runtime.  A free range of ids is identified
	 * and logged.  This code maintains a stack of ranges.  A txnid
	 * is given the generation number of the first range it falls into
	 * in the stack.
	 */
	if (incr < 0) {
		/* Pop the newest range off the stack (entry 0). */
		--hp->generation;
		memmove(hp->gen_array, &hp->gen_array[1],
		    (hp->generation + 1) * sizeof(hp->gen_array[0]));
	} else {
		/* Push a new range, doubling the array when it is full. */
		++hp->generation;
		if (hp->generation >= hp->gen_alloc) {
			hp->gen_alloc *= 2;
			if ((ret = __os_realloc(env, hp->gen_alloc *
			    sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
				return (ret);
		}
		memmove(&hp->gen_array[1], &hp->gen_array[0],
		    hp->generation * sizeof(hp->gen_array[0]));
		hp->gen_array[0].generation = hp->generation;
		hp->gen_array[0].txn_min = min;
		hp->gen_array[0].txn_max = max;
	}
	return (0);
}
+
+/*
+ * __db_txnlist_lsnadd --
+ * Save the prev_lsn from a txn_child record.
+ *
+ * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
int
__db_txnlist_lsnadd(env, hp, lsnp)
	ENV *env;
	DB_TXNHEAD *hp;
	DB_LSN *lsnp;
{
	DB_TXNLIST *elp;
	int ret;

	/* A zero LSN carries no information; nothing to save. */
	if (IS_ZERO_LSN(*lsnp))
		return (0);

	/* The LSN-stack entry, if one exists, lives in bucket 0. */
	LIST_FOREACH(elp, &hp->head[0], links)
		if (elp->type == TXNLIST_LSN)
			break;

	if (elp == NULL) {
		/* First LSN: create the stack entry and tell the caller. */
		if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
			return (ret);
		return (DB_SURPRISE_KID);
	}

	/* Grow the stack by doubling when it is full. */
	if (elp->u.l.stack_indx == elp->u.l.stack_size) {
		elp->u.l.stack_size <<= 1;
		if ((ret = __os_realloc(env, sizeof(DB_LSN) *
		    elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
			/* On failure the whole list is torn down. */
			__db_txnlist_end(env, hp);
			return (ret);
		}
	}
	elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;

	return (0);
}
+
+/*
+ * __db_txnlist_lsnget --
+ *
+ * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t));
+ * Get the lsn saved from a txn_child record.
+ */
+int
+__db_txnlist_lsnget(env, hp, lsnp, flags)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_TXNLIST *elp;
+
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+
+ LIST_FOREACH(elp, &hp->head[0], links)
+ if (elp->type == TXNLIST_LSN)
+ break;
+
+ if (elp == NULL || elp->u.l.stack_indx == 0) {
+ ZERO_LSN(*lsnp);
+ return (0);
+ }
+
+ *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_lsninit --
+ * Initialize a transaction list with an lsn array entry.
+ *
+ * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
int
__db_txnlist_lsninit(env, hp, lsnp)
	ENV *env;
	DB_TXNHEAD *hp;
	DB_LSN *lsnp;
{
	DB_TXNLIST *elp;
	int ret;

	elp = NULL;

	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
		goto err;
	/* The LSN-stack entry always lives in bucket 0. */
	LIST_INSERT_HEAD(&hp->head[0], elp, links);

	elp->type = TXNLIST_LSN;
	if ((ret = __os_malloc(env,
	    sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
		goto err;
	/* Seed the stack with the caller's LSN. */
	elp->u.l.stack_indx = 1;
	elp->u.l.stack_size = DB_LSN_STACK_SIZE;
	elp->u.l.lsn_stack[0] = *lsnp;

	return (0);

	/* On any failure, the entire transaction list is torn down. */
err:	__db_txnlist_end(env, hp);
	return (ret);
}
+
#ifdef DEBUG
/*
 * __db_txnlist_print --
 *	Print out the transaction list.
 *
 * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
 */
void
__db_txnlist_print(hp)
	DB_TXNHEAD *hp;
{
	DB_TXNLIST *elp;
	u_int32_t slot;
	const char *label;

	printf("Maxid: %lu Generation: %lu\n",
	    (u_long)hp->maxid, (u_long)hp->generation);
	/* Walk every hash bucket and dump each txnid entry. */
	for (slot = 0; slot < hp->nslots; slot++)
		LIST_FOREACH(elp, &hp->head[slot], links) {
			if (elp->type != TXNLIST_TXNID) {
				printf("Unrecognized type: %d\n", elp->type);
				continue;
			}
			/* Map the status code to a human-readable label. */
			if (elp->u.t.status == TXN_OK)
				label = "OK";
			else if (elp->u.t.status == TXN_COMMIT)
				label = "commit";
			else if (elp->u.t.status == TXN_PREPARE)
				label = "prepare";
			else if (elp->u.t.status == TXN_ABORT)
				label = "abort";
			else if (elp->u.t.status == TXN_IGNORE)
				label = "ignore";
			else if (elp->u.t.status == TXN_EXPECTED)
				label = "expected";
			else if (elp->u.t.status == TXN_UNEXPECTED)
				label = "unexpected";
			else
				label = "UNKNOWN";
			printf("TXNID: %lx(%lu): %s\n",
			    (u_long)elp->u.t.txnid,
			    (u_long)elp->u.t.generation, label);
		}
}
#endif
diff --git a/db/db_dup.c b/db/db_dup.c
new file mode 100644
index 0000000..b789e03
--- /dev/null
+++ b/db/db_dup.c
@@ -0,0 +1,203 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_am.h"
+
+/*
+ * __db_ditem_nolog --
+ * Remove an item from a page without affecting its recoverability.
+ *
+ * PUBLIC: int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
int
__db_ditem_nolog(dbc, pagep, indx, nbytes)
	DBC *dbc;
	PAGE *pagep;
	u_int32_t indx, nbytes;
{
	DB *dbp;
	db_indx_t cnt, *inp, offset;
	u_int8_t *from;

	dbp = dbc->dbp;
	DB_ASSERT(dbp->env, IS_DIRTY(pagep));
	DB_ASSERT(dbp->env, indx < NUM_ENT(pagep));

	/*
	 * If there's only a single item on the page, we don't have to
	 * work hard.
	 */
	if (NUM_ENT(pagep) == 1) {
		NUM_ENT(pagep) = 0;
		HOFFSET(pagep) = dbp->pgsize;
		return (0);
	}

	inp = P_INP(dbp, pagep);
	/*
	 * Pack the remaining key/data items at the end of the page.  Use
	 * memmove(3), the regions may overlap.
	 */
	from = (u_int8_t *)pagep + HOFFSET(pagep);
	DB_ASSERT(dbp->env, inp[indx] >= HOFFSET(pagep));
	memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep));
	HOFFSET(pagep) += nbytes;

	/*
	 * Adjust the indices' offsets: every entry stored below the removed
	 * item (smaller offset) just moved up by nbytes.
	 */
	offset = inp[indx];
	for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt)
		if (inp[cnt] < offset)
			inp[cnt] += nbytes;

	/* Shift the indices down, closing the hole in the index table. */
	--NUM_ENT(pagep);
	if (indx != NUM_ENT(pagep))
		memmove(&inp[indx], &inp[indx + 1],
		    sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));

	return (0);
}
+
+/*
+ * __db_ditem --
+ * Remove an item from a page, logging it if enabled.
+ *
+ * PUBLIC: int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__db_ditem(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ DBT ldbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ if (DBC_LOGGING(dbc)) {
+ ldbt.data = P_ENTRY(dbp, pagep, indx);
+ ldbt.size = nbytes;
+ if ((ret = __db_addrem_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_REM_DUP, PGNO(pagep),
+ (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_ditem_nolog(dbc, pagep, indx, nbytes));
+}
+
+/*
+ * __db_pitem_nolog --
+ * Put an item on a page without logging.
+ *
+ * PUBLIC: int __db_pitem_nolog
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
int
__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data)
	DBC *dbc;
	PAGE *pagep;
	u_int32_t indx;
	u_int32_t nbytes;
	DBT *hdr, *data;
{
	BKEYDATA bk;
	DB *dbp;
	DBT thdr;
	db_indx_t *inp;
	u_int8_t *p;

	dbp = dbc->dbp;

	DB_ASSERT(dbp->env, IS_DIRTY(pagep));

	/*
	 * The caller is responsible for ensuring the item fits; assert in
	 * diagnostic builds, fail with EINVAL otherwise.
	 */
	if (nbytes > P_FREESPACE(dbp, pagep)) {
		DB_ASSERT(dbp->env, nbytes <= P_FREESPACE(dbp, pagep));
		return (EINVAL);
	}

	/* No header supplied: synthesize a B_KEYDATA header for the data. */
	if (hdr == NULL) {
		B_TSET(bk.type, B_KEYDATA);
		bk.len = data == NULL ? 0 : data->size;

		thdr.data = &bk;
		thdr.size = SSZA(BKEYDATA, data);
		hdr = &thdr;
	}
	inp = P_INP(dbp, pagep);

	/* Adjust the index table, then put the item on the page. */
	if (indx != NUM_ENT(pagep))
		memmove(&inp[indx + 1], &inp[indx],
		    sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
	HOFFSET(pagep) -= nbytes;
	inp[indx] = HOFFSET(pagep);
	++NUM_ENT(pagep);

	/* Copy the header, then any data, into the reserved space. */
	p = P_ENTRY(dbp, pagep, indx);
	memcpy(p, hdr->data, hdr->size);
	if (data != NULL)
		memcpy(p + hdr->size, data->data, data->size);

	return (0);
}
+
+/*
+ * __db_pitem --
+ * Put an item on a page.
+ *
+ * PUBLIC: int __db_pitem
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__db_pitem(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+ /*
+ * Put a single item onto a page. The logic figuring out where to
+ * insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling. We cheat a little bit in that
+ * we don't want to copy the dbt on a normal put twice. If hdr is
+ * NULL, we create a BKEYDATA structure on the page, otherwise, just
+ * copy the caller's information onto the page.
+ *
+ * This routine is also used to put entries onto the page where the
+ * entry is pre-built, e.g., during recovery. In this case, the hdr
+ * will point to the entry, and the data argument will be NULL.
+ *
+ * !!!
+ * There's a tremendous potential for off-by-one errors here, since
+ * the passed in header sizes must be adjusted for the structure's
+ * placeholder for the trailing variable-length data field.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_addrem_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_ADD_DUP, PGNO(pagep),
+ (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data));
+}
diff --git a/db/db_iface.c b/db/db_iface.c
new file mode 100644
index 0000000..55f3e2a
--- /dev/null
+++ b/db/db_iface.c
@@ -0,0 +1,2817 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#ifndef HAVE_QUEUE
+#include "dbinc/qam.h" /* For __db_no_queue_am(). */
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_associate_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+static int __dbc_del_arg __P((DBC *, u_int32_t));
+static int __dbc_pget_arg __P((DBC *, DBT *, u_int32_t));
+static int __dbc_put_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_curinval __P((const ENV *));
+static int __db_cursor_arg __P((DB *, u_int32_t));
+static int __db_del_arg __P((DB *, DBT *, u_int32_t));
+static int __db_get_arg __P((const DB *, DBT *, DBT *, u_int32_t));
+static int __db_join_arg __P((DB *, DBC **, u_int32_t));
+static int __db_open_arg __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t));
+static int __db_pget_arg __P((DB *, DBT *, u_int32_t));
+static int __db_put_arg __P((DB *, DBT *, DBT *, u_int32_t));
+static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
+static int __db_associate_foreign_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ u_int32_t));
+
+/*
+ * These functions implement the Berkeley DB API. They are organized in a
+ * layered fashion. The interface functions (XXX_pp) perform all generic
+ * error checks (for example, PANIC'd region, replication state change
+ * in progress, inconsistent transaction usage), call function-specific
+ * check routines (_arg) to check for proper flag usage, etc., do pre-amble
+ * processing (incrementing handle counts, handling local transactions),
+ * call the function and then do post-amble processing (local transactions,
+ * decrement handle counts).
+ *
+ * The basic structure is:
+ * Check for simple/generic errors (PANIC'd region)
+ * Check if replication is changing state (increment handle count).
+ * Call function-specific argument checking routine
+ * Create internal transaction if necessary
+ * Call underlying worker function
+ * Commit/abort internal transaction if necessary
+ * Decrement handle count
+ */
+
+/*
+ * __db_associate_pp --
+ * DB->associate pre/post processing.
+ *
+ * PUBLIC: int __db_associate_pp __P((DB *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
int
__db_associate_pp(dbp, txn, sdbp, callback, flags)
	DB *dbp, *sdbp;
	DB_TXN *txn;
	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
	u_int32_t flags;
{
	DBC *sdbc;
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret, txn_local;

	env = dbp->env;
	txn_local = 0;

	STRIP_AUTO_COMMIT(flags);

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check &&
	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
		/* Don't try to release a block we failed to acquire. */
		handle_check = 0;
		goto err;
	}

	/*
	 * Secondary cursors may have the primary's lock file ID, so we need
	 * to make sure that no older cursors are lying around when we make
	 * the transition.
	 */
	if (TAILQ_FIRST(&sdbp->active_queue) != NULL ||
	    TAILQ_FIRST(&sdbp->join_queue) != NULL) {
		__db_errx(env,
    "Databases may not become secondary indices while cursors are open");
		ret = EINVAL;
		goto err;
	}

	if ((ret = __db_associate_arg(dbp, sdbp, callback, flags)) != 0)
		goto err;

	/*
	 * Create a local transaction as necessary, check for consistent
	 * transaction usage, and, if we have no transaction but do have
	 * locking on, acquire a locker id for the handle lock acquisition.
	 */
	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
			goto err;
		txn_local = 1;
	}

	/* Check for consistent transaction usage. */
	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
		goto err;

	/* Destroy any cached free cursors on the secondary handle. */
	while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
		if ((ret = __dbc_destroy(sdbc)) != 0)
			goto err;

	ret = __db_associate(dbp, ip, txn, sdbp, callback, flags);

	/* Resolve any local transaction we opened above. */
err:	if (txn_local &&
	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
		ret = t_ret;

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;
	ENV_LEAVE(env, ip);
	return (ret);
}
+
+/*
+ * __db_associate_arg --
+ * Check DB->associate arguments.
+ */
+static int
+__db_associate_arg(dbp, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (F_ISSET(sdbp, DB_AM_SECONDARY)) {
+ __db_errx(env,
+ "Secondary index handles may not be re-associated");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env,
+ "Secondary indices may not be used as primary databases");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env,
+ "Primary databases may not be configured with duplicates");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env,
+ "Renumbering recno databases may not be used as primary databases");
+ return (EINVAL);
+ }
+
+ /*
+ * It's OK for the primary and secondary to not share an environment IFF
+ * the environments are local to the DB handle. (Specifically, cursor
+ * adjustment will work correctly in this case.) The environment being
+ * local implies the environment is not configured for either locking or
+ * transactions, as neither of those could work correctly.
+ */
+ if (dbp->env != sdbp->env &&
+ (!F_ISSET(dbp->env, ENV_DBLOCAL) ||
+ !F_ISSET(sdbp->env, ENV_DBLOCAL))) {
+ __db_errx(env,
+ "The primary and secondary must be opened in the same environment");
+ return (EINVAL);
+ }
+ if ((DB_IS_THREADED(dbp) && !DB_IS_THREADED(sdbp)) ||
+ (!DB_IS_THREADED(dbp) && DB_IS_THREADED(sdbp))) {
+ __db_errx(env,
+ "The DB_THREAD setting must be the same for primary and secondary");
+ return (EINVAL);
+ }
+ if (callback == NULL &&
+ (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) {
+ __db_errx(env,
+ "Callback function may be NULL only when database handles are read-only");
+ return (EINVAL);
+ }
+
+ if ((ret = __db_fchk(env, "DB->associate", flags, DB_CREATE |
+ DB_IMMUTABLE_KEY)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __db_close_pp --
+ * DB->close pre/post processing.
+ *
+ * PUBLIC: int __db_close_pp __P((DB *, u_int32_t));
+ */
int
__db_close_pp(dbp, flags)
	DB *dbp;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret;

	env = dbp->env;
	ret = 0;

	/*
	 * Close a DB handle -- as a handle destructor, we can't fail.
	 *
	 * !!!
	 * The actual argument checking is simple, do it inline, outside of
	 * the replication block.
	 */
	if (flags != 0 && flags != DB_NOSYNC)
		ret = __db_ferr(env, "DB->close", 0);

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
		/* Remember the error but destroy the handle regardless. */
		handle_check = 0;
		if (ret == 0)
			ret = t_ret;
	}

	/* The handle is closed even if an earlier step failed. */
	if ((t_ret = __db_close(dbp, NULL, flags)) != 0 && ret == 0)
		ret = t_ret;

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}
+
+/*
+ * __db_cursor_pp --
+ * DB->cursor pre/post processing.
+ *
+ * PUBLIC: int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t));
+ */
int
__db_cursor_pp(dbp, txn, dbcp, flags)
	DB *dbp;
	DB_TXN *txn;
	DBC **dbcp;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	REGENV *renv;
	int rep_blocked, ret;

	env = dbp->env;

	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	rep_blocked = 0;
	if (txn == NULL && IS_ENV_REPLICATED(env)) {
		if ((ret = __op_rep_enter(env)) != 0)
			goto err;
		rep_blocked = 1;
		/*
		 * A timestamp mismatch means replication recovery rolled
		 * back transactions since this handle was opened, so the
		 * handle is dead.
		 */
		renv = env->reginfo->primary;
		if (dbp->timestamp != renv->rep_timestamp) {
			__db_errx(env, "%s %s",
	    "replication recovery unrolled committed transactions;",
	    "open DB and DBcursor handles must be closed");
			ret = DB_REP_HANDLE_DEAD;
			goto err;
		}
	}
	if ((ret = __db_cursor_arg(dbp, flags)) != 0)
		goto err;

	/*
	 * Check for consistent transaction usage.  For now, assume this
	 * cursor might be used for read operations only (in which case
	 * it may not require a txn).  We'll check more stringently in
	 * c_del and c_put.  (Note this means the read-op txn tests have
	 * to be a subset of the write-op ones.)
	 */
	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
		goto err;

	ret = __db_cursor(dbp, ip, txn, dbcp, flags);

err:	/* Release replication block on error. */
	if (ret != 0 && rep_blocked)
		(void)__op_rep_exit(env);

	ENV_LEAVE(env, ip);
	return (ret);
}
+
+/*
+ * __db_cursor --
+ * DB->cursor.
+ *
+ * PUBLIC: int __db_cursor __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t));
+ */
int
__db_cursor(dbp, ip, txn, dbcp, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DBC **dbcp;
	u_int32_t flags;
{
	DBC *dbc;
	ENV *env;
	db_lockmode_t mode;
	int ret;

	env = dbp->env;

	/*
	 * MVCC database with no caller transaction: open a private snapshot
	 * transaction so the cursor reads a consistent view.
	 * NOTE(review): if __db_cursor_int fails below, this private
	 * transaction is not resolved here -- presumably handled by the
	 * cursor/txn teardown path; confirm.
	 */
	if (MULTIVERSION(dbp) && txn == NULL && (LF_ISSET(DB_TXN_SNAPSHOT) ||
	    F_ISSET(env->dbenv, DB_ENV_TXN_SNAPSHOT))) {
		if ((ret =
		    __txn_begin(env, ip, NULL, &txn, DB_TXN_SNAPSHOT)) != 0)
			return (ret);
		F_SET(txn, TXN_PRIVATE);
	}

	if ((ret = __db_cursor_int(dbp, ip, txn, dbp->type, PGNO_INVALID,
	    LF_ISSET(DB_CURSOR_BULK | DB_CURSOR_TRANSIENT), NULL, &dbc)) != 0)
		return (ret);

	/*
	 * If this is CDB, do all the locking in the interface, which is
	 * right here.
	 */
	if (CDB_LOCKING(env)) {
		/*
		 * DB_WRITELOCK cursors take a write lock up front,
		 * DB_WRITECURSOR (or transactional) cursors take an
		 * intent-to-write lock, everything else reads.
		 */
		mode = (LF_ISSET(DB_WRITELOCK)) ? DB_LOCK_WRITE :
		    ((LF_ISSET(DB_WRITECURSOR) || txn != NULL) ?
		    DB_LOCK_IWRITE : DB_LOCK_READ);
		if ((ret = __lock_get(env, dbc->locker, 0,
		    &dbc->lock_dbt, mode, &dbc->mylock)) != 0)
			goto err;
		if (LF_ISSET(DB_WRITECURSOR))
			F_SET(dbc, DBC_WRITECURSOR);
		if (LF_ISSET(DB_WRITELOCK))
			F_SET(dbc, DBC_WRITER);
	}

	/* Inherit the isolation level from the flags or the transaction. */
	if (LF_ISSET(DB_READ_UNCOMMITTED) ||
	    (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
		F_SET(dbc, DBC_READ_UNCOMMITTED);

	if (LF_ISSET(DB_READ_COMMITTED) ||
	    (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
		F_SET(dbc, DBC_READ_COMMITTED);

	*dbcp = dbc;
	return (0);

	/* Discard the partially constructed cursor on lock failure. */
err:	(void)__dbc_close(dbc);
	return (ret);
}
+
+/*
+ * __db_cursor_arg --
+ * Check DB->cursor arguments.
+ */
+static int
+__db_cursor_arg(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+
+ /*
+ * DB_READ_COMMITTED and DB_READ_UNCOMMITTED require locking.
+ */
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DB->cursor"));
+ }
+
+ LF_CLR(DB_CURSOR_BULK |
+ DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT);
+
+ /* Check for invalid function flags. */
+ if (LF_ISSET(DB_WRITECURSOR)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ if (!CDB_LOCKING(env))
+ return (__db_ferr(env, "DB->cursor", 0));
+ LF_CLR(DB_WRITECURSOR);
+ } else if (LF_ISSET(DB_WRITELOCK)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ LF_CLR(DB_WRITELOCK);
+ }
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB->cursor", 0));
+
+ return (0);
+}
+
+/*
+ * __db_del_pp --
+ * DB->del pre/post processing.
+ *
+ * PUBLIC: int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
int
__db_del_pp(dbp, txn, key, flags)
	DB *dbp;
	DB_TXN *txn;
	DBT *key;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret, txn_local;

	env = dbp->env;
	txn_local = 0;

	STRIP_AUTO_COMMIT(flags);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");

#ifdef CONFIG_TEST
	if (IS_REP_MASTER(env))
		DB_TEST_WAIT(env, env->test_check);
#endif
	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check &&
	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
		/* Don't try to release a block we failed to acquire. */
		handle_check = 0;
		goto err;
	}

	if ((ret = __db_del_arg(dbp, key, flags)) != 0)
		goto err;

	/* Create local transaction as necessary. */
	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
			goto err;
		txn_local = 1;
	}

	/* Check for consistent transaction usage. */
	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
		goto err;

	ret = __db_del(dbp, ip, txn, key, flags);

	/* Resolve any local transaction we opened above. */
err:	if (txn_local &&
	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
		ret = t_ret;

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;
	ENV_LEAVE(env, ip);
	/* Free any memory __dbt_usercopy allocated for the key. */
	__dbt_userfree(env, key, NULL, NULL);
	return (ret);
}
+
+/*
+ * __db_del_arg --
+ *	Check DB->delete arguments.
+ *
+ *	Validates the flag combination, rejects deletes on a read-only
+ *	handle, and copies the user's key DBT in for the flag cases
+ *	that carry a key.
+ */
+static int
+__db_del_arg(dbp, key, flags)
+	DB *dbp;
+	DBT *key;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DB->del"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_CONSUME:
+		/* DB_CONSUME is only legal on queue databases. */
+		if (dbp->type != DB_QUEUE)
+			return (__db_ferr(env, "DB->del", 0));
+		goto copy;
+	case DB_MULTIPLE:
+	case DB_MULTIPLE_KEY:
+		if (!F_ISSET(key, DB_DBT_BULK)) {
+			__db_errx(env,
+	    "DB->del with DB_MULTIPLE(_KEY) requires multiple key records");
+			return (EINVAL);
+		}
+		/* FALL THROUGH */
+	case 0:
+copy:		if ((ret = __dbt_usercopy(env, key)) != 0)
+			return (ret);
+		break;
+	default:
+		return (__db_ferr(env, "DB->del", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_exists --
+ *	DB->exists implementation.
+ *
+ *	Implemented as a DB->get that returns no data bytes; the return
+ *	value is whatever the underlying get returns (0 when the key is
+ *	present, an error such as DB_NOTFOUND otherwise).
+ *
+ * PUBLIC: int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_exists(dbp, txn, key, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	u_int32_t flags;
+{
+	DBT data;
+	int ret;
+
+	/*
+	 * Most flag checking is done in the DB->get call, we only check for
+	 * specific incompatibilities here.  This saves making __get_arg
+	 * aware of the exist method's API constraints.
+	 */
+	STRIP_AUTO_COMMIT(flags);
+	if ((ret = __db_fchk(dbp->env, "DB->exists", flags,
+	    DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) != 0)
+		return (ret);
+
+	/*
+	 * Configure a data DBT that returns no bytes so there's no copy
+	 * of the data.
+	 */
+	memset(&data, 0, sizeof(data));
+	data.dlen = 0;
+	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+	return (dbp->get(dbp, txn, key, &data, flags));
+}
+
+/*
+ * db_fd_pp --
+ *	DB->fd pre/post processing.
+ *
+ *	Returns the file descriptor of the database's underlying file
+ *	handle through *fdp, or ENOENT (with *fdp set to -1) when the
+ *	handle has no valid file descriptor.
+ *
+ * PUBLIC: int __db_fd_pp __P((DB *, int *));
+ */
+int
+__db_fd_pp(dbp, fdp)
+	DB *dbp;
+	int *fdp;
+{
+	DB_FH *fhp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->fd");
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  Note the err label is placed after
+	 * the rep-exit call below: if entry fails we never call the exit.
+	 */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0)
+		goto err;
+
+	/*
+	 * !!!
+	 * There's no argument checking to be done.
+	 *
+	 * !!!
+	 * The actual method call is simple, do it inline.
+	 *
+	 * XXX
+	 * Truly spectacular layering violation.
+	 */
+	if ((ret = __mp_xxx_fh(dbp->mpf, &fhp)) == 0) {
+		if (fhp == NULL) {
+			*fdp = -1;
+			__db_errx(env,
+			    "Database does not have a valid file handle");
+			ret = ENOENT;
+		} else
+			*fdp = fhp->fd;
+	}
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_get_pp --
+ *	DB->get pre/post processing.
+ *
+ * PUBLIC: int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get_pp(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	u_int32_t mode;
+	int handle_check, ignore_lease, ret, t_ret, txn_local;
+
+	env = dbp->env;
+	mode = 0;
+	txn_local = 0;
+
+	STRIP_AUTO_COMMIT(flags);
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get");
+
+	/* Strip DB_IGNORE_LEASE before argument checking; see lease check. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+
+	if ((ret = __db_get_arg(dbp, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		/* Entry failed: skip the rep-exit call at err. */
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * DB_CONSUME(_WAIT) removes the record, so it needs a write lock
+	 * and, for auto-commit handles, a local transaction.
+	 */
+	if (LF_ISSET(DB_READ_UNCOMMITTED))
+		mode = DB_READ_UNCOMMITTED;
+	else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+	    (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT) {
+		mode = DB_WRITELOCK;
+		if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+			if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+				goto err;
+			txn_local = 1;
+		}
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID,
+	    mode == DB_WRITELOCK || LF_ISSET(DB_RMW) ? 0 : 1)) != 0)
+		goto err;
+
+	ret = __db_get(dbp, ip, txn, key, data, flags);
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+err:	if (txn_local &&
+	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __db_get --
+ *	DB->get.
+ *
+ *	Performs the get through a single-use ("transient") cursor;
+ *	argument checking has already been done by __db_get_pp.
+ *
+ * PUBLIC: int __db_get __P((DB *,
+ * PUBLIC:     DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get(dbp, ip, txn, key, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	u_int32_t mode;
+	int ret, t_ret;
+
+	/*
+	 * The DB_CURSOR_TRANSIENT flag indicates that we're just doing a single
+	 * operation with this cursor, and that in case of error we don't need
+	 * to restore it to its old position.  Thus, we can perform the get
+	 * without duplicating the cursor, saving some cycles in this common
+	 * case.
+	 */
+	mode = DB_CURSOR_TRANSIENT;
+	if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+		mode |= DB_READ_UNCOMMITTED;
+		LF_CLR(DB_READ_UNCOMMITTED);
+	} else if (LF_ISSET(DB_READ_COMMITTED)) {
+		mode |= DB_READ_COMMITTED;
+		LF_CLR(DB_READ_COMMITTED);
+	} else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+	    (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT)
+		mode |= DB_WRITELOCK;
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+		return (ret);
+
+	DEBUG_LREAD(dbc, txn, "DB->get", key, NULL, flags);
+
+	/*
+	 * The semantics of bulk gets are different for DB->get vs DBC->get.
+	 * Mark the cursor so the low-level bulk get routines know which
+	 * behavior we want.
+	 */
+	F_SET(dbc, DBC_FROM_DB_GET);
+
+	/*
+	 * SET_RET_MEM indicates that if key and/or data have no DBT
+	 * flags set and DB manages the returned-data memory, that memory
+	 * will belong to this handle, not to the underlying cursor.
+	 */
+	SET_RET_MEM(dbc, dbp);
+
+	/* With no operation flag specified, default to a DB_SET lookup. */
+	if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0)
+		LF_SET(DB_SET);
+
+#ifdef HAVE_PARTITION
+	if (F_ISSET(dbc, DBC_PARTITIONED))
+		ret = __partc_get(dbc, key, data, flags);
+	else
+#endif
+		ret = __dbc_get(dbc, key, data, flags);
+
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_get_arg --
+ *	DB->get argument checking, used by both DB->get and DB->pget.
+ *
+ *	Also copies the user's key (and, for DB_GET_BOTH, data) DBTs in
+ *	via __dbt_usercopy; the caller is responsible for freeing them.
+ */
+static int
+__db_get_arg(dbp, key, data, flags)
+	const DB *dbp;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	ENV *env;
+	int dirty, multi, ret;
+
+	env = dbp->env;
+
+	/*
+	 * Check for read-modify-write validity.  DB_RMW doesn't make sense
+	 * with CDB cursors since if you're going to write the cursor, you
+	 * had to create it with DB_WRITECURSOR.  Regardless, we check for
+	 * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+	 * If this changes, confirm that DB does not itself set the DB_RMW
+	 * flag in a path where CDB may have been configured.
+	 */
+	dirty = 0;
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DB->get"));
+		if ((ret = __db_fcchk(env, "DB->get",
+		    flags, DB_READ_UNCOMMITTED, DB_READ_COMMITTED)) != 0)
+			return (ret);
+		if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))
+			dirty = 1;
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	/* DB_MULTIPLE_KEY is cursor-only; only DB_MULTIPLE is legal here. */
+	multi = 0;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		if (LF_ISSET(DB_MULTIPLE_KEY))
+			goto multi_err;
+		multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0;
+		LF_CLR(DB_MULTIPLE);
+	}
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_GET_BOTH:
+		if ((ret = __dbt_usercopy(env, data)) != 0)
+			return (ret);
+		/* FALLTHROUGH */
+	case 0:
+		if ((ret = __dbt_usercopy(env, key)) != 0) {
+			__dbt_userfree(env, key, NULL, data);
+			return (ret);
+		}
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_AM_RECNUM))
+			goto err;
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			return (ret);
+		break;
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		if (dirty) {
+			__db_errx(env,
+    "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT",
+			    LF_ISSET(DB_READ_UNCOMMITTED) ?
+			    "DB_READ_UNCOMMITTED" : "DB_READ_COMMITTED");
+			return (EINVAL);
+		}
+		if (multi)
+multi_err:		return (__db_ferr(env, "DB->get", 1));
+		if (dbp->type == DB_QUEUE)
+			break;
+		/* FALLTHROUGH */
+	default:
+err:		return (__db_ferr(env, "DB->get", 0));
+	}
+
+	/*
+	 * Check for invalid key/data flags.
+	 */
+	if ((ret =
+	    __dbt_ferr(dbp, "key", key, DB_RETURNS_A_KEY(dbp, flags))) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
+		return (ret);
+
+	if (multi) {
+		if (!F_ISSET(data, DB_DBT_USERMEM)) {
+			__db_errx(env,
+			    "DB_MULTIPLE requires DB_DBT_USERMEM be set");
+			return (EINVAL);
+		}
+		if (F_ISSET(key, DB_DBT_PARTIAL) ||
+		    F_ISSET(data, DB_DBT_PARTIAL)) {
+			__db_errx(env,
+			    "DB_MULTIPLE does not support DB_DBT_PARTIAL");
+			return (EINVAL);
+		}
+		/* Bulk buffers must be >= 1KB, >= pagesize, multiple of 1KB. */
+		if (data->ulen < 1024 ||
+		    data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+			__db_errx(env, "%s%s",
+			    "DB_MULTIPLE buffers must be ",
+			    "aligned, at least page size and multiples of 1KB");
+			return (EINVAL);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * __db_join_pp --
+ *	DB->join pre/post processing.
+ *
+ * PUBLIC: int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join_pp(primary, curslist, dbcp, flags)
+	DB *primary;
+	DBC **curslist, **dbcp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = primary->env;
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  The transactional flag is taken
+	 * from the first secondary cursor in the list.
+	 */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret =
+	    __db_rep_enter(primary, 1, 0, curslist[0]->txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	if ((ret = __db_join_arg(primary, curslist, flags)) == 0)
+		ret = __db_join(primary, curslist, dbcp, flags);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_join_arg --
+ *	Check DB->join arguments.
+ *
+ *	Requires at least one secondary cursor and that all cursors in
+ *	the NULL-terminated list share the same transaction.
+ */
+static int
+__db_join_arg(primary, curslist, flags)
+	DB *primary;
+	DBC **curslist;
+	u_int32_t flags;
+{
+	DB_TXN *txn;
+	ENV *env;
+	int i;
+
+	env = primary->env;
+
+	switch (flags) {
+	case 0:
+	case DB_JOIN_NOSORT:
+		break;
+	default:
+		return (__db_ferr(env, "DB->join", 0));
+	}
+
+	if (curslist == NULL || curslist[0] == NULL) {
+		__db_errx(env,
+	    "At least one secondary cursor must be specified to DB->join");
+		return (EINVAL);
+	}
+
+	txn = curslist[0]->txn;
+	for (i = 1; curslist[i] != NULL; i++)
+		if (curslist[i]->txn != txn) {
+			__db_errx(env,
+		    "All secondary cursors must share the same transaction");
+			return (EINVAL);
+		}
+
+	return (0);
+}
+
+/*
+ * __db_key_range_pp --
+ *	DB->key_range pre/post processing.
+ *
+ *	Only supported for btree databases; other access methods fail
+ *	the __dbh_am_chk check below.
+ *
+ * PUBLIC: int __db_key_range_pp
+ * PUBLIC:     __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__db_key_range_pp(dbp, txn, key, kr, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	DB_KEY_RANGE *kr;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0)
+		return (__db_ferr(env, "DB->key_range", 0));
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+		goto err;
+
+	/*
+	 * !!!
+	 * The actual method call is simple, do it inline.
+	 */
+	switch (dbp->type) {
+	case DB_BTREE:
+#ifndef HAVE_BREW
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			goto err;
+
+		/* Acquire a cursor. */
+		if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+			break;
+
+		DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+		if (DB_IS_PARTITIONED(dbp))
+			ret = __part_key_range(dbc, key, kr, flags);
+		else
+#endif
+			ret = __bam_key_range(dbc, key, kr, flags);
+
+		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+		__dbt_userfree(env, key, NULL, NULL);
+		break;
+#else
+		/* On BREW, btree key_range is unsupported; fall through. */
+		COMPQUIET(dbc, NULL);
+		COMPQUIET(key, NULL);
+		COMPQUIET(kr, NULL);
+		/* FALLTHROUGH */
+#endif
+	case DB_HASH:
+	case DB_QUEUE:
+	case DB_RECNO:
+		ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_type(env, "DB->key_range", dbp->type);
+		break;
+	}
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_open_pp --
+ *	DB->open pre/post processing.
+ *
+ * PUBLIC: int __db_open_pp __P((DB *, DB_TXN *,
+ * PUBLIC:     const char *, const char *, DBTYPE, u_int32_t, int));
+ */
+int
+__db_open_pp(dbp, txn, fname, dname, type, flags, mode)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	DBTYPE type;
+	u_int32_t flags;
+	int mode;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, nosync, remove_me, ret, t_ret, txn_local;
+
+	env = dbp->env;
+	nosync = 1;
+	handle_check = remove_me = txn_local = 0;
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Save the file and database names and flags.  We do this here
+	 * because we don't pass all of the flags down into the actual
+	 * DB->open method call, we strip DB_AUTO_COMMIT at this layer.
+	 */
+	if ((fname != NULL &&
+	    (ret = __os_strdup(env, fname, &dbp->fname)) != 0))
+		goto err;
+	if ((dname != NULL &&
+	    (ret = __os_strdup(env, dname, &dbp->dname)) != 0))
+		goto err;
+	dbp->open_flags = flags;
+
+	/* Save the current DB handle flags for refresh. */
+	dbp->orig_flags = dbp->flags;
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else if (txn != NULL && !TXN_ON(env) &&
+	    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
+		ret = __db_not_txn_env(env);
+		goto err;
+	}
+	LF_CLR(DB_AUTO_COMMIT);
+
+	/*
+	 * We check arguments after possibly creating a local transaction,
+	 * which is unusual -- the reason is some flags are illegal if any
+	 * kind of transaction is in effect.
+	 */
+	if ((ret = __db_open_arg(dbp, txn, fname, dname, type, flags)) == 0)
+		if ((ret = __db_open(dbp, ip, txn, fname, dname, type,
+		    flags, mode, PGNO_BASE_MD)) != 0)
+			goto txnerr;
+
+	/*
+	 * You can open the database that describes the subdatabases in the
+	 * rest of the file read-only.  The content of each key's data is
+	 * unspecified and applications should never be adding new records
+	 * or updating existing records.  However, during recovery, we need
+	 * to open these databases R/W so we can redo/undo changes in them.
+	 * Likewise, we need to open master databases read/write during
+	 * rename and remove so we can be sure they're fully sync'ed, so
+	 * we provide an override flag for the purpose.
+	 */
+	if (dname == NULL && !IS_RECOVERING(env) && !LF_ISSET(DB_RDONLY) &&
+	    !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) {
+		__db_errx(env,
+    "files containing multiple databases may only be opened read-only");
+		ret = EINVAL;
+		goto txnerr;
+	}
+
+	/*
+	 * Success: file creations have to be synchronous, otherwise we don't
+	 * care.
+	 */
+	if (F_ISSET(dbp, DB_AM_CREATED | DB_AM_CREATED_MSTR))
+		nosync = 0;
+
+	/* Success: don't discard the file on close. */
+	F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR);
+
+	/*
+	 * If not transactional, remove the databases/subdatabases if it is
+	 * persistent.  If we're transactional, the child transaction abort
+	 * cleans up.
+	 */
+txnerr:	if (ret != 0 && !IS_REAL_TXN(txn)) {
+		remove_me = (F_ISSET(dbp, DB_AM_CREATED) &&
+		    (fname != NULL || dname != NULL)) ? 1 : 0;
+		if (F_ISSET(dbp, DB_AM_CREATED_MSTR) ||
+		    (dname == NULL && remove_me))
+			/* Remove file. */
+			(void)__db_remove_int(dbp,
+			    ip, txn, fname, NULL, DB_FORCE);
+		else if (remove_me)
+			/* Remove subdatabase. */
+			(void)__db_remove_int(dbp,
+			    ip, txn, fname, dname, DB_FORCE);
+	}
+
+	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, nosync, ret)) && ret == 0)
+		ret = t_ret;
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_open_arg --
+ *	Check DB->open arguments.
+ *
+ *	Validates the flag set against the requested access-method type,
+ *	the environment's configuration, and the (possibly NULL)
+ *	transaction handle.
+ */
+static int
+__db_open_arg(dbp, txn, fname, dname, type, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	DBTYPE type;
+	u_int32_t flags;
+{
+	ENV *env;
+	u_int32_t ok_flags;
+	int ret;
+
+	env = dbp->env;
+
+	/* Validate arguments. */
+#undef	OKFLAGS
+#define	OKFLAGS								\
+	(DB_AUTO_COMMIT | DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING |	\
+	DB_MULTIVERSION | DB_NOMMAP | DB_NO_AUTO_COMMIT | DB_RDONLY |	\
+	DB_RDWRMASTER | DB_READ_UNCOMMITTED | DB_THREAD | DB_TRUNCATE)
+	if ((ret = __db_fchk(env, "DB->open", flags, OKFLAGS)) != 0)
+		return (ret);
+	if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE))
+		return (__db_ferr(env, "DB->open", 1));
+	if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE))
+		return (__db_ferr(env, "DB->open", 1));
+
+#ifdef	HAVE_VXWORKS
+	if (LF_ISSET(DB_TRUNCATE)) {
+		__db_errx(env, "DB_TRUNCATE not supported on VxWorks");
+		return (DB_OPNOTSUP);
+	}
+#endif
+	switch (type) {
+	case DB_UNKNOWN:
+		if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) {
+			__db_errx(env,
+	    "DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE");
+			return (EINVAL);
+		}
+		ok_flags = 0;
+		break;
+	case DB_BTREE:
+		ok_flags = DB_OK_BTREE;
+		break;
+	case DB_HASH:
+#ifndef HAVE_HASH
+		return (__db_no_hash_am(env));
+#endif
+		ok_flags = DB_OK_HASH;
+		break;
+	case DB_QUEUE:
+#ifndef HAVE_QUEUE
+		return (__db_no_queue_am(env));
+#endif
+		ok_flags = DB_OK_QUEUE;
+		break;
+	case DB_RECNO:
+		ok_flags = DB_OK_RECNO;
+		break;
+	default:
+		__db_errx(env, "unknown type: %lu", (u_long)type);
+		return (EINVAL);
+	}
+	if (ok_flags)
+		DB_ILLEGAL_METHOD(dbp, ok_flags);
+
+	/* The environment may have been created, but never opened. */
+	if (!F_ISSET(env, ENV_DBLOCAL | ENV_OPEN_CALLED)) {
+		__db_errx(env, "database environment not yet opened");
+		return (EINVAL);
+	}
+
+	/*
+	 * Historically, you could pass in an environment that didn't have a
+	 * mpool, and DB would create a private one behind the scenes.  This
+	 * no longer works.
+	 */
+	if (!F_ISSET(env, ENV_DBLOCAL) && !MPOOL_ON(env)) {
+		__db_errx(env, "environment did not include a memory pool");
+		return (EINVAL);
+	}
+
+	/*
+	 * You can't specify threads during DB->open if subsystems in the
+	 * environment weren't configured with them.
+	 */
+	if (LF_ISSET(DB_THREAD) && !F_ISSET(env, ENV_DBLOCAL | ENV_THREAD)) {
+		__db_errx(env, "environment not created using DB_THREAD");
+		return (EINVAL);
+	}
+
+	/* DB_MULTIVERSION requires a database configured for transactions. */
+	if (LF_ISSET(DB_MULTIVERSION) && !IS_REAL_TXN(txn)) {
+		__db_errx(env,
+	    "DB_MULTIVERSION illegal without a transaction specified");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIVERSION) && type == DB_QUEUE) {
+		__db_errx(env,
+	    "DB_MULTIVERSION illegal with queue databases");
+		return (EINVAL);
+	}
+
+	/* DB_TRUNCATE is neither transaction recoverable nor lockable. */
+	if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) {
+		__db_errx(env,
+	    "DB_TRUNCATE illegal with %s specified",
+		    LOCKING_ON(env) ? "locking" : "transactions");
+		return (EINVAL);
+	}
+
+	/* Subdatabase checks. */
+	if (dname != NULL) {
+		/* QAM can only be done on in-memory subdatabases. */
+		if (type == DB_QUEUE && fname != NULL) {
+			__db_errx(
+			    env, "Queue databases must be one-per-file");
+			return (EINVAL);
+		}
+
+		/*
+		 * Named in-memory databases can't support certain flags,
+		 * so check here.
+		 */
+		if (fname == NULL)
+			F_CLR(dbp, DB_AM_CHKSUM | DB_AM_ENCRYPT);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_pget_pp --
+ *	DB->pget pre/post processing.
+ *
+ * PUBLIC: int __db_pget_pp
+ * PUBLIC:     __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget_pp(dbp, txn, skey, pkey, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ignore_lease, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget");
+
+	/* Strip DB_IGNORE_LEASE before argument checking; see lease check. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+
+	/* pget-specific checks first, then the shared DB->get checks. */
+	if ((ret = __db_pget_arg(dbp, pkey, flags)) != 0 ||
+	    (ret = __db_get_arg(dbp, skey, data, flags)) != 0) {
+		__dbt_userfree(env, skey, pkey, data);
+		return (ret);
+	}
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_pget(dbp, ip, txn, skey, pkey, data, flags);
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, skey, pkey, data);
+	return (ret);
+}
+
+/*
+ * __db_pget --
+ *	DB->pget.
+ *
+ *	Looks up through a secondary index, returning the primary key
+ *	and data; implemented on a transient secondary cursor.
+ *
+ * PUBLIC: int __db_pget __P((DB *,
+ * PUBLIC:     DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget(dbp, ip, txn, skey, pkey, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	u_int32_t mode;
+	int ret, t_ret;
+
+	mode = DB_CURSOR_TRANSIENT;
+	if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+		mode |= DB_READ_UNCOMMITTED;
+		LF_CLR(DB_READ_UNCOMMITTED);
+	} else if (LF_ISSET(DB_READ_COMMITTED)) {
+		mode |= DB_READ_COMMITTED;
+		LF_CLR(DB_READ_COMMITTED);
+	}
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+		return (ret);
+
+	SET_RET_MEM(dbc, dbp);
+
+	DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags);
+
+	/*
+	 * !!!
+	 * The actual method call is simple, do it inline.
+	 *
+	 * The underlying cursor pget will fill in a default DBT for null
+	 * pkeys, and use the cursor's returned-key memory internally to
+	 * store any intermediate primary keys.  However, we've just set
+	 * the returned-key memory to the DB handle's key memory, which
+	 * is unsafe to use if the DB handle is threaded.  If the pkey
+	 * argument is NULL, use the DBC-owned returned-key memory
+	 * instead; it'll go away when we close the cursor before we
+	 * return, but in this case that's just fine, as we're not
+	 * returning the primary key.
+	 */
+	if (pkey == NULL)
+		dbc->rkey = &dbc->my_rkey;
+
+	/*
+	 * The cursor is just a perfectly ordinary secondary database cursor.
+	 * Call its c_pget() method to do the dirty work.
+	 */
+	if (flags == 0 || flags == DB_RMW)
+		flags |= DB_SET;
+
+	ret = __dbc_pget(dbc, skey, pkey, data, flags);
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_pget_arg --
+ *	Check DB->pget arguments.
+ *
+ *	Only the pget-specific constraints are checked here; the caller
+ *	also runs __db_get_arg on the secondary key and data DBTs.
+ */
+static int
+__db_pget_arg(dbp, pkey, flags)
+	DB *dbp;
+	DBT *pkey;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "DB->pget may only be used on secondary indices");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		__db_errx(env,
+	"DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices");
+		return (EINVAL);
+	}
+
+	/* DB_CONSUME makes no sense on a secondary index. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	switch (flags) {
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		return (__db_ferr(env, "DB->pget", 0));
+	default:
+		/* __db_get_arg will catch the rest. */
+		break;
+	}
+
+	/*
+	 * We allow the pkey field to be NULL, so that we can make the
+	 * two-DBT get calls into wrappers for the three-DBT ones.
+	 */
+	if (pkey != NULL &&
+	    (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0)
+		return (ret);
+
+	if (flags == DB_GET_BOTH) {
+		/* The pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+		if (pkey == NULL) {
+			__db_errx(env,
+		    "DB_GET_BOTH on a secondary index requires a primary key");
+			return (EINVAL);
+		}
+		if ((ret = __dbt_usercopy(env, pkey)) != 0)
+			return (ret);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_put_pp --
+ *	DB->put pre/post processing.
+ *
+ * PUBLIC: int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put_pp(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, txn_local, t_ret;
+
+	env = dbp->env;
+	txn_local = 0;
+
+	STRIP_AUTO_COMMIT(flags);
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");
+
+	if ((ret = __db_put_arg(dbp, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		/* Entry failed: skip the rep-exit call at err. */
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Create local transaction as necessary. */
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+			goto err;
+		txn_local = 1;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	ret = __db_put(dbp, ip, txn, key, data, flags);
+
+err:	if (txn_local &&
+	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __db_put_arg --
+ *	Check DB->put arguments.
+ *
+ *	Validates flag combinations and key/data DBT flags, then copies
+ *	the user DBTs in; the caller frees them.
+ */
+static int
+__db_put_arg(dbp, key, data, flags)
+	DB *dbp;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret, returnkey;
+
+	env = dbp->env;
+	returnkey = 0;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DB->put"));
+
+	/* Check for puts on a secondary. */
+	if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env, "DB->put forbidden on secondary indices");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIPLE_KEY | DB_MULTIPLE)) {
+		/* The two bulk flags are mutually exclusive. */
+		if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+			goto err;
+
+		switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+		case 0:
+		case DB_OVERWRITE_DUP:
+			break;
+		default:
+			__db_errx(env,
+	"DB->put: DB_MULTIPLE(_KEY) can only be combined with DB_OVERWRITE_DUP");
+			return (EINVAL);
+		}
+
+		if (!F_ISSET(key, DB_DBT_BULK)) {
+			__db_errx(env,
+		    "DB->put with DB_MULTIPLE(_KEY) requires a bulk key buffer");
+			return (EINVAL);
+		}
+	}
+	if (LF_ISSET(DB_MULTIPLE)) {
+		if (!F_ISSET(data, DB_DBT_BULK)) {
+			__db_errx(env,
+		    "DB->put with DB_MULTIPLE requires a bulk data buffer");
+			return (EINVAL);
+		}
+	}
+
+	/* Check for invalid function flags. */
+	switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+	case 0:
+	case DB_NOOVERWRITE:
+	case DB_OVERWRITE_DUP:
+		break;
+	case DB_APPEND:
+		/* DB_APPEND is record-number access methods only. */
+		if (dbp->type != DB_RECNO && dbp->type != DB_QUEUE)
+			goto err;
+		returnkey = 1;
+		break;
+	case DB_NODUPDATA:
+		if (F_ISSET(dbp, DB_AM_DUPSORT))
+			break;
+		/* FALLTHROUGH */
+	default:
+err:		return (__db_ferr(env, "DB->put", 0));
+	}
+
+	/*
+	 * Check for invalid key/data flags.  The key may reasonably be NULL
+	 * if DB_APPEND is set and the application doesn't care about the
+	 * returned key.
+	 */
+	if (((returnkey && key != NULL) || !returnkey) &&
+	    (ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0)
+		return (ret);
+	if (!LF_ISSET(DB_MULTIPLE_KEY) &&
+	    (ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/*
+	 * The key parameter should not be NULL or have the "partial" flag set
+	 * in a put call unless the user doesn't care about a key value we'd
+	 * return.  The user tells us they don't care about the returned key by
+	 * setting the key parameter to NULL or configuring the key DBT to not
+	 * return any information.  (Returned keys from a put are always record
+	 * numbers, and returning part of a record number doesn't make sense:
+	 * only accept a partial return if the length returned is 0.)
+	 */
+	if ((returnkey &&
+	    key != NULL && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0) ||
+	    (!returnkey && F_ISSET(key, DB_DBT_PARTIAL)))
+		return (__db_ferr(env, "key DBT", 0));
+
+	/* Check for partial puts in the presence of duplicates. */
+	if (data != NULL && F_ISSET(data, DB_DBT_PARTIAL) &&
+	    (F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) {
+		__db_errx(env,
+"a partial put in the presence of duplicates requires a cursor operation");
+		return (EINVAL);
+	}
+
+	if ((flags != DB_APPEND && (ret = __dbt_usercopy(env, key)) != 0) ||
+	    (!LF_ISSET(DB_MULTIPLE_KEY) &&
+	    (ret = __dbt_usercopy(env, data)) != 0))
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __db_compact_pp --
+ *	DB->compact pre/post processing.
+ *
+ *	A NULL c_data is replaced with a zeroed local DB_COMPACT, so the
+ *	compaction routines always receive valid statistics storage.
+ *
+ * PUBLIC: int __db_compact_pp __P((DB *, DB_TXN *,
+ * PUBLIC:       DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__db_compact_pp(dbp, txn, start, stop, c_data, flags, end)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *start, *stop;
+	DB_COMPACT *c_data;
+	u_int32_t flags;
+	DBT *end;
+{
+	DB_COMPACT *dp, l_data;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->compact");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if ((ret = __db_fchk(
+	    env, "DB->compact", flags, DB_FREELIST_ONLY | DB_FREE_SPACE)) != 0)
+		return (ret);
+
+	/* Check for changes to a read-only database. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DB->compact"));
+
+	if (start != NULL && (ret = __dbt_usercopy(env, start)) != 0)
+		return (ret);
+	if (stop != NULL && (ret = __dbt_usercopy(env, stop)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+	    txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	if (c_data == NULL) {
+		dp = &l_data;
+		memset(dp, 0, sizeof(*dp));
+	} else
+		dp = c_data;
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __part_compact(dbp, ip, txn, start, stop, dp, flags, end);
+	else
+#endif
+	switch (dbp->type) {
+	case DB_HASH:
+		/* Hash only supports freeing the free list. */
+		if (!LF_ISSET(DB_FREELIST_ONLY))
+			goto err;
+		/* FALLTHROUGH */
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_compact(dbp, ip, txn, start, stop, dp, flags, end);
+		break;
+
+	default:
+err:		ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+		break;
+	}
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, start, stop, NULL);
+	return (ret);
+}
+
+/*
+ * __db_associate_foreign_pp --
+ *	DB->associate_foreign pre/post processing.
+ *
+ * PUBLIC: int __db_associate_foreign_pp __P((DB *, DB *,
+ * PUBLIC:	int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC:	u_int32_t));
+ */
+int
+__db_associate_foreign_pp(fdbp, dbp, callback, flags)
+	DB *dbp, *fdbp;
+	int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+	u_int32_t flags;
+{
+	/* Most of this is based on the implementation of associate */
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	PANIC_CHECK(env);
+	STRIP_AUTO_COMMIT(flags);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+		/* Entry failed: skip the rep-exit call at err. */
+		handle_check = 0;
+		goto err;
+	}
+
+	if ((ret = __db_associate_foreign_arg(fdbp, dbp, callback, flags)) != 0)
+		goto err;
+
+	ret = __db_associate_foreign(fdbp, dbp, callback, flags);
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_associate_foreign_arg --
+ *	DB->associate_foreign argument checking.
+ *
+ *	The foreign database must not be a secondary, must not allow
+ *	duplicates or renumbering; the associating database must be a
+ *	secondary; a nullify callback is required exactly when
+ *	DB_FOREIGN_NULLIFY is specified.
+ */
+static int
+__db_associate_foreign_arg(fdbp, dbp, callback, flags)
+	DB *dbp, *fdbp;
+	int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+	u_int32_t flags;
+{
+	ENV *env;
+
+	env = fdbp->env;
+
+	if (F_ISSET(fdbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "Secondary indices may not be used as foreign databases");
+		return (EINVAL);
+	}
+	if (F_ISSET(fdbp, DB_AM_DUP)) {
+		__db_errx(env,
+		    "Foreign databases may not be configured with duplicates");
+		return (EINVAL);
+	}
+	if (F_ISSET(fdbp, DB_AM_RENUMBER)) {
+		__db_errx(env,
+	    "Renumbering recno databases may not be used as foreign databases");
+		return (EINVAL);
+	}
+	if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "The associating database must be a secondary index.");
+		return (EINVAL);
+	}
+	if (LF_ISSET(DB_FOREIGN_NULLIFY) && callback == NULL) {
+		__db_errx(env,
+		    "When specifying a delete action of nullify, a callback%s",
+		    " function needs to be configured");
+		return (EINVAL);
+	} else if (!LF_ISSET(DB_FOREIGN_NULLIFY) && callback != NULL) {
+		__db_errx(env,
+		    "When not specifying a delete action of nullify, a%s",
+		    " callback function cannot be configured");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_sync_pp --
+ *	DB->sync pre/post processing.
+ *
+ * PUBLIC: int __db_sync_pp __P((DB *, u_int32_t));
+ */
+int
+__db_sync_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->sync");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0)
+		return (__db_ferr(env, "DB->sync", 0));
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+		/* Failed enter: skip the matching rep-exit below. */
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_sync(dbp);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* The err label sits after the rep-exit: the failed-enter path
+	 * must not call __env_db_rep_exit (handle_check is 0 anyway). */
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_close_pp --
+ *	DBC->close pre/post processing.
+ *
+ * PUBLIC: int __dbc_close_pp __P((DBC *));
+ */
+int
+__dbc_close_pp(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/*
+	 * If the cursor is already closed we have a serious problem, and we
+	 * assume that the cursor isn't on the active queue.  Don't do any of
+	 * the remaining cursor close processing.
+	 */
+	if (!F_ISSET(dbc, DBC_ACTIVE)) {
+		__db_errx(env, "Closing already-closed cursor");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  Only cursors without a transaction
+	 * hold their own replication reference; transactional cursors are
+	 * covered by the transaction's reference.
+	 */
+	handle_check = dbc->txn == NULL && IS_ENV_REPLICATED(env);
+	ret = __dbc_close(dbc);
+
+	/* Release replication block. */
+	if (handle_check &&
+	    (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_cmp_pp --
+ *	DBC->cmp pre/post processing.
+ *
+ * Validates the arguments (zero flags, non-NULL other cursor, both
+ * cursors on the same database) and then calls __dbc_cmp inside an
+ * ENV_ENTER/ENV_LEAVE pair.  Returns 0 on success, EINVAL or the
+ * __db_ferr/__dbc_cmp result on failure.
+ *
+ * PUBLIC: int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t));
+ */
+int
+__dbc_cmp_pp(dbc, other_cursor, result, flags)
+	DBC *dbc, *other_cursor;
+	int *result;
+	u_int32_t flags;
+{
+	DB *dbp, *odbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	if (flags != 0)
+		return (__db_ferr(env, "DBcursor->cmp", 0));
+
+	/*
+	 * Check for NULL before touching other_cursor: the original code
+	 * dereferenced other_cursor->dbp first, which made this check dead
+	 * code and crashed on a NULL argument instead of returning EINVAL.
+	 */
+	if (other_cursor == NULL) {
+		__db_errx(env, "DBcursor->cmp dbc pointer must not be null");
+		return (EINVAL);
+	}
+
+	odbp = other_cursor->dbp;
+
+	/* Comparing positions only makes sense within one database. */
+	if (dbp != odbp) {
+		__db_errx(env,
+"DBcursor->cmp both cursors must refer to the same database.");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+	ret = __dbc_cmp(dbc, other_cursor, result);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_count_pp --
+ *	DBC->count pre/post processing.
+ *
+ * PUBLIC: int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t));
+ */
+int
+__dbc_count_pp(dbc, recnop, flags)
+	DBC *dbc;
+	db_recno_t *recnop;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int rc;
+
+	env = dbc->dbp->env;
+
+	/*
+	 * !!!
+	 * Argument checking is trivial, so perform it here rather than in
+	 * the worker routine.  No flags are accepted, and the cursor must
+	 * already be positioned: an uninitialized cursor is an EINVAL.
+	 */
+	if (flags != 0)
+		return (__db_ferr(env, "DBcursor->count", 0));
+
+	if (!IS_INITIALIZED(dbc))
+		return (__db_curinval(env));
+
+	ENV_ENTER(env, ip);
+	rc = __dbc_count(dbc, recnop);
+	ENV_LEAVE(env, ip);
+	return (rc);
+}
+
+/*
+ * __dbc_del_pp --
+ *	DBC->del pre/post processing.
+ *
+ * PUBLIC: int __dbc_del_pp __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del_pp(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Full argument checking lives in __dbc_del_arg. */
+	if ((ret = __dbc_del_arg(dbc, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->del", NULL, NULL, flags);
+	ret = __dbc_del(dbc, flags);
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_del_arg --
+ *	Check DBC->del arguments.
+ *
+ * Rejects deletes on read-only databases, validates the flag value
+ * (DB_CONSUME is queue-only; DB_UPDATE_SECONDARY is internal-only),
+ * and requires an initialized cursor.  Returns 0 or an error.
+ */
+static int
+__dbc_del_arg(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DBcursor->del"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	case DB_CONSUME:
+		if (dbp->type != DB_QUEUE)
+			return (__db_ferr(env, "DBC->del", 0));
+		break;
+	case DB_UPDATE_SECONDARY:
+		/* Internal flag: only valid on secondary-index handles. */
+		DB_ASSERT(env, F_ISSET(dbp, DB_AM_SECONDARY));
+		break;
+	default:
+		return (__db_ferr(env, "DBcursor->del", 0));
+	}
+
+	/*
+	 * The cursor must be initialized, return EINVAL for an invalid cursor,
+	 * otherwise 0.
+	 */
+	if (!IS_INITIALIZED(dbc))
+		return (__db_curinval(env));
+
+	return (0);
+}
+
+/*
+ * __dbc_dup_pp --
+ *	DBC->dup pre/post processing.
+ *
+ * PUBLIC: int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup_pp(dbc, dbcp, flags)
+	DBC *dbc, **dbcp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int rc;
+
+	env = dbc->dbp->env;
+
+	/*
+	 * !!!
+	 * Argument checking is trivial, so perform it here rather than in
+	 * the worker routine.  The only legal flag values are 0 and
+	 * DB_POSITION.
+	 */
+	switch (flags) {
+	case 0:
+	case DB_POSITION:
+		break;
+	default:
+		return (__db_ferr(env, "DBcursor->dup", 0));
+	}
+
+	ENV_ENTER(env, ip);
+	rc = __dbc_dup(dbc, dbcp, flags);
+	ENV_LEAVE(env, ip);
+	return (rc);
+}
+
+/*
+ * __dbc_get_pp --
+ *	DBC->get pre/post processing.
+ *
+ * PUBLIC: int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_pp(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ignore_lease, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Strip DB_IGNORE_LEASE before argument checking; remember it. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+	if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+	ret = __dbc_get(dbc, key, data, flags);
+
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+	ENV_LEAVE(env, ip);
+	/* Release any DB_DBT_USERCOPY buffers copied in __dbc_get_arg. */
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __dbc_get_arg --
+ *	Common DBC->get argument checking, used by both DBC->get and DBC->pget.
+ * PUBLIC: int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_arg(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+	int dirty, multi, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/*
+	 * Typically in checking routines that modify the flags, we have
+	 * to save them and restore them, because the checking routine
+	 * calls the work routine.  However, this is a pure-checking
+	 * routine which returns to a function that calls the work routine,
+	 * so it's OK that we do not save and restore the flags, even though
+	 * we modify them.
+	 *
+	 * Check for read-modify-write validity.  DB_RMW doesn't make sense
+	 * with CDB cursors since if you're going to write the cursor, you
+	 * had to create it with DB_WRITECURSOR.  Regardless, we check for
+	 * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+	 * If this changes, confirm that DB does not itself set the DB_RMW
+	 * flag in a path where CDB may have been configured.
+	 */
+	dirty = 0;
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DBcursor->get"));
+		if (LF_ISSET(DB_READ_UNCOMMITTED))
+			dirty = 1;
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	/* DB_MULTIPLE and DB_MULTIPLE_KEY are mutually exclusive. */
+	multi = 0;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		multi = 1;
+		if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+			goto multi_err;
+		LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY);
+	}
+
+	/*
+	 * Check for invalid function flags.  Note that the err label in the
+	 * default case also releases any DB_DBT_USERCOPY buffers already
+	 * copied by __dbt_usercopy in the cases above.
+	 */
+	switch (flags) {
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		if (dirty) {
+			__db_errx(env,
+    "DB_READ_UNCOMMITTED is not supported with DB_CONSUME or DB_CONSUME_WAIT");
+			return (EINVAL);
+		}
+		if (dbp->type != DB_QUEUE)
+			goto err;
+		break;
+	case DB_CURRENT:
+	case DB_FIRST:
+	case DB_NEXT:
+	case DB_NEXT_DUP:
+	case DB_NEXT_NODUP:
+		break;
+	case DB_LAST:
+	case DB_PREV:
+	case DB_PREV_DUP:
+	case DB_PREV_NODUP:
+		/* Bulk retrieval only works in the forward direction. */
+		if (multi)
+multi_err:		return (__db_ferr(env, "DBcursor->get", 1));
+		break;
+	case DB_GET_BOTHC:
+		if (dbp->type == DB_QUEUE)
+			goto err;
+		/* FALLTHROUGH */
+	case DB_GET_BOTH:
+	case DB_GET_BOTH_RANGE:
+		if ((ret = __dbt_usercopy(env, data)) != 0)
+			goto err;
+		/* FALLTHROUGH */
+	case DB_SET:
+	case DB_SET_RANGE:
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			goto err;
+		break;
+	case DB_GET_RECNO:
+		/*
+		 * The one situation in which this might be legal with a
+		 * non-RECNUM dbp is if dbp is a secondary and its primary is
+		 * DB_AM_RECNUM.
+		 */
+		if (!F_ISSET(dbp, DB_AM_RECNUM) &&
+		    (!F_ISSET(dbp, DB_AM_SECONDARY) ||
+		    !F_ISSET(dbp->s_primary, DB_AM_RECNUM)))
+			goto err;
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_AM_RECNUM))
+			goto err;
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			goto err;
+		break;
+	default:
+err:		__dbt_userfree(env, key, NULL, data);
+		return (__db_ferr(env, "DBcursor->get", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/*
+	 * Bulk buffers must be user-allocated, non-partial, 1KB-aligned
+	 * and at least a page in size.
+	 */
+	if (multi) {
+		if (!F_ISSET(data, DB_DBT_USERMEM)) {
+			__db_errx(env,
+	    "DB_MULTIPLE/DB_MULTIPLE_KEY require DB_DBT_USERMEM be set");
+			return (EINVAL);
+		}
+		if (F_ISSET(key, DB_DBT_PARTIAL) ||
+		    F_ISSET(data, DB_DBT_PARTIAL)) {
+			__db_errx(env,
+	    "DB_MULTIPLE/DB_MULTIPLE_KEY do not support DB_DBT_PARTIAL");
+			return (EINVAL);
+		}
+		if (data->ulen < 1024 ||
+		    data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+			__db_errx(env, "%s%s",
+			    "DB_MULTIPLE/DB_MULTIPLE_KEY buffers must be ",
+			    "aligned, at least page size and multiples of 1KB");
+			return (EINVAL);
+		}
+	}
+
+	/*
+	 * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO,
+	 * DB_PREV_DUP and DB_NEXT_DUP.  Return EINVAL for an invalid
+	 * cursor, otherwise 0.
+	 */
+	if (!IS_INITIALIZED(dbc) && (flags == DB_CURRENT ||
+	    flags == DB_GET_RECNO ||
+	    flags == DB_NEXT_DUP || flags == DB_PREV_DUP))
+		return (__db_curinval(env));
+
+	/* Check for consistent transaction usage. */
+	if (LF_ISSET(DB_RMW) &&
+	    (ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __db_secondary_close_pp --
+ *	DB->close for secondaries
+ *
+ * PUBLIC: int __db_secondary_close_pp __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+	ret = 0;
+
+	/*
+	 * As a DB handle destructor, we can't fail.
+	 *
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0 && flags != DB_NOSYNC)
+		ret = __db_ferr(env, "DB->close", 0);
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  Because we must close the handle
+	 * even on error, a failed enter records the error (if it is the
+	 * first) but processing continues.
+	 */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
+		handle_check = 0;
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	/* Close regardless of earlier errors; keep the first error code. */
+	if ((t_ret = __db_secondary_close(dbp, flags)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_pget_pp --
+ *	DBC->pget pre/post processing.
+ *
+ * PUBLIC: int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget_pp(dbc, skey, pkey, data, flags)
+	DBC *dbc;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ignore_lease, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Strip DB_IGNORE_LEASE before argument checking; remember it. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+	/* pget-specific checks first, then the shared get checks. */
+	if ((ret = __dbc_pget_arg(dbc, pkey, flags)) != 0 ||
+	    (ret = __dbc_get_arg(dbc, skey, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+	ret = __dbc_pget(dbc, skey, pkey, data, flags);
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+	ENV_LEAVE(env, ip);
+
+	/* Release any DB_DBT_USERCOPY buffers copied during arg checking. */
+	__dbt_userfree(env, skey, pkey, data);
+	return (ret);
+}
+
+/*
+ * __dbc_pget_arg --
+ *	Check DBC->pget arguments.
+ *
+ * Performs only the pget-specific checks (the cursor's database must be
+ * a secondary, no bulk flags, DB_GET_BOTH* needs a primary key); the
+ * caller runs __dbc_get_arg afterwards for the shared checks.
+ */
+static int
+__dbc_pget_arg(dbc, pkey, flags)
+	DBC *dbc;
+	DBT *pkey;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "DBcursor->pget may only be used on secondary indices");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		__db_errx(env,
+	"DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices");
+		return (EINVAL);
+	}
+
+	switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		/* These flags make no sense on a secondary index. */
+		return (__db_ferr(env, "DBcursor->pget", 0));
+	case DB_GET_BOTH:
+	case DB_GET_BOTH_RANGE:
+		/* BOTH is "get both the primary and the secondary". */
+		if (pkey == NULL) {
+			__db_errx(env,
+			    "%s requires both a secondary and a primary key",
+			    LF_ISSET(DB_GET_BOTH) ?
+			    "DB_GET_BOTH" : "DB_GET_BOTH_RANGE");
+			return (EINVAL);
+		}
+		if ((ret = __dbt_usercopy(env, pkey)) != 0)
+			return (ret);
+		break;
+	default:
+		/* __dbc_get_arg will catch the rest. */
+		break;
+	}
+
+	/*
+	 * We allow the pkey field to be NULL, so that we can make the
+	 * two-DBT get calls into wrappers for the three-DBT ones.
+	 */
+	if (pkey != NULL &&
+	    (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0)
+		return (ret);
+
+	/* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+	if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) {
+		__db_errx(env,
+		    "DB_GET_BOTH on a secondary index requires a primary key");
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __dbc_put_pp --
+ *	DBC->put pre/post processing.
+ *
+ * PUBLIC: int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put_pp(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Full argument checking lives in __dbc_put_arg. */
+	if ((ret = __dbc_put_arg(dbc, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->put",
+	    flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+	    flags == DB_NODUPDATA || flags == DB_UPDATE_SECONDARY ?
+	    key : NULL, data, flags);
+	ret = __dbc_put(dbc, key, data, flags);
+
+	/* Release any DB_DBT_USERCOPY buffers copied in __dbc_put_arg. */
+err:	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __dbc_put_arg --
+ *	Check DBC->put arguments.
+ *
+ * Validates writability, the flag value against the access method, and
+ * the key/data DBT flags.  Also copies DB_DBT_USERCOPY buffers into
+ * process memory (the caller releases them with __dbt_userfree).
+ */
+static int
+__dbc_put_arg(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+	int key_flags, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	/* Set when the key DBT participates in this operation. */
+	key_flags = 0;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DBcursor->put"));
+
+	/* Check for puts on a secondary. */
+	if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+		/* Internal updates of a secondary are allowed. */
+		if (flags == DB_UPDATE_SECONDARY)
+			flags = 0;
+		else {
+			__db_errx(env,
+			    "DBcursor->put forbidden on secondary indices");
+			return (EINVAL);
+		}
+	}
+
+	if ((ret = __dbt_usercopy(env, data)) != 0)
+		return (ret);
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_AFTER:
+	case DB_BEFORE:
+		switch (dbp->type) {
+		case DB_BTREE:
+		case DB_HASH:		/* Only with unsorted duplicates. */
+			if (!F_ISSET(dbp, DB_AM_DUP))
+				goto err;
+			if (dbp->dup_compare != NULL)
+				goto err;
+			break;
+		case DB_QUEUE:		/* Not permitted. */
+			goto err;
+		case DB_RECNO:		/* Only with mutable record numbers. */
+			if (!F_ISSET(dbp, DB_AM_RENUMBER))
+				goto err;
+			key_flags = key == NULL ? 0 : 1;
+			break;
+		case DB_UNKNOWN:
+		default:
+			goto err;
+		}
+		break;
+	case DB_CURRENT:
+		/*
+		 * If there is a comparison function, doing a DB_CURRENT
+		 * must not change the part of the data item that is used
+		 * for the comparison.
+		 */
+		break;
+	case DB_NODUPDATA:
+		if (!F_ISSET(dbp, DB_AM_DUPSORT))
+			goto err;
+		/* FALLTHROUGH */
+	case DB_KEYFIRST:
+	case DB_KEYLAST:
+	case DB_OVERWRITE_DUP:
+		key_flags = 1;
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			return (ret);
+		break;
+	default:
+err:		return (__db_ferr(env, "DBcursor->put", 0));
+	}
+
+	/*
+	 * Check for invalid key/data flags.  The key may reasonably be NULL
+	 * if DB_AFTER or DB_BEFORE is set and the application doesn't care
+	 * about the returned key, or if the DB_CURRENT flag is set.
+	 */
+	if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/*
+	 * The key parameter should not be NULL or have the "partial" flag set
+	 * in a put call unless the user doesn't care about a key value we'd
+	 * return.  The user tells us they don't care about the returned key by
+	 * setting the key parameter to NULL or configuring the key DBT to not
+	 * return any information.  (Returned keys from a put are always record
+	 * numbers, and returning part of a record number doesn't make sense:
+	 * only accept a partial return if the length returned is 0.)
+	 */
+	if (key_flags && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0)
+		return (__db_ferr(env, "key DBT", 0));
+
+	/*
+	 * The cursor must be initialized for anything other than DB_KEYFIRST,
+	 * DB_KEYLAST or zero: return EINVAL for an invalid cursor, otherwise 0.
+	 */
+	if (!IS_INITIALIZED(dbc) && flags != 0 && flags != DB_KEYFIRST &&
+	    flags != DB_KEYLAST && flags != DB_NODUPDATA &&
+	    flags != DB_OVERWRITE_DUP)
+		return (__db_curinval(env));
+
+	return (0);
+}
+
+/*
+ * __dbt_ferr --
+ *	Check a DBT for flag errors.
+ *
+ * Verifies that only known DBT flags are set, that at most one memory
+ * management flag is chosen, that bulk and partial are not combined,
+ * and (when check_thread is set) that threaded handles specify a memory
+ * allocation flag.  Returns 0 or an error.
+ */
+static int
+__dbt_ferr(dbp, name, dbt, check_thread)
+	const DB *dbp;
+	const char *name;
+	const DBT *dbt;
+	int check_thread;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	/*
+	 * Check for invalid DBT flags.  We allow any of the flags to be
+	 * specified to any DB or DBcursor call so that applications can
+	 * set DB_DBT_MALLOC when retrieving a data item from a secondary
+	 * database and then specify that same DBT as a key to a primary
+	 * database, without having to clear flags.
+	 */
+	if ((ret = __db_fchk(env, name, dbt->flags, DB_DBT_APPMALLOC |
+	    DB_DBT_BULK | DB_DBT_DUPOK | DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0)
+		return (ret);
+	/* At most one of the memory-management flags may be set. */
+	switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+	case 0:
+	case DB_DBT_MALLOC:
+	case DB_DBT_REALLOC:
+	case DB_DBT_USERCOPY:
+	case DB_DBT_USERMEM:
+		break;
+	default:
+		return (__db_ferr(env, name, 1));
+	}
+
+	if (F_ISSET(dbt, DB_DBT_BULK) && F_ISSET(dbt, DB_DBT_PARTIAL)) {
+		__db_errx(env,
+	    "Bulk and partial operations cannot be combined on %s DBT", name);
+		return (EINVAL);
+	}
+
+	if (check_thread && DB_IS_THREADED(dbp) &&
+	    !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+		__db_errx(env,
+		    "DB_THREAD mandates memory allocation flag on %s DBT",
+		    name);
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __db_curinval
+ *	Report that a cursor is in an invalid state.
+ *
+ * Shared error helper: the caller attempted an operation (DB_CURRENT,
+ * DB_NEXT_DUP, ...) that requires an initialized cursor position.
+ * Always returns EINVAL.
+ */
+static int
+__db_curinval(env)
+	const ENV *env;
+{
+	__db_errx(env,
+	    "Cursor position must be set before performing this operation");
+	return (EINVAL);
+}
+
+/*
+ * __db_txn_auto_init --
+ *	Handle DB_AUTO_COMMIT initialization.
+ *
+ * PUBLIC: int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **));
+ */
+int
+__db_txn_auto_init(env, ip, txnidp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	DB_TXN **txnidp;
+{
+	/*
+	 * Method calls where applications explicitly specify DB_AUTO_COMMIT
+	 * require additional validation: the DB_AUTO_COMMIT flag cannot be
+	 * specified if a transaction cookie is also specified, nor can the
+	 * flag be specified in a non-transactional environment.
+	 */
+	if (*txnidp != NULL) {
+		__db_errx(env,
+    "DB_AUTO_COMMIT may not be specified along with a transaction handle");
+		return (EINVAL);
+	}
+
+	if (!TXN_ON(env)) {
+		__db_errx(env,
+    "DB_AUTO_COMMIT may not be specified in non-transactional environment");
+		return (EINVAL);
+	}
+
+	/*
+	 * Our caller checked to see if replication is making a state change.
+	 * Don't call the user-level API (which would repeat that check).
+	 * On success, *txnidp holds the new implicit transaction.
+	 */
+	return (__txn_begin(env, ip, NULL, txnidp, 0));
+}
+
+/*
+ * __db_txn_auto_resolve --
+ *	Resolve local transactions.
+ *
+ * PUBLIC: int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int));
+ */
+int
+__db_txn_auto_resolve(env, txn, nosync, ret)
+	ENV *env;
+	DB_TXN *txn;
+	int nosync, ret;
+{
+	int t_ret;
+
+	/*
+	 * We're resolving a transaction for the user, and must decrement the
+	 * replication handle count.  Call the user-level API.
+	 *
+	 * The operation succeeded: commit the implicit transaction.
+	 */
+	if (ret == 0)
+		return (__txn_commit(txn, nosync ? DB_TXN_NOSYNC : 0));
+
+	/* The operation failed: abort.  A failed abort is unrecoverable. */
+	t_ret = __txn_abort(txn);
+	if (t_ret != 0)
+		return (__env_panic(env, t_ret));
+
+	/* Report the original operation's error, not the abort's result. */
+	return (ret);
+}
diff --git a/db/db_join.c b/db/db_join.c
new file mode 100644
index 0000000..05c11a4
--- /dev/null
+++ b/db/db_join.c
@@ -0,0 +1,940 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_join.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+static int __db_join_close_pp __P((DBC *));
+static int __db_join_cmp __P((const void *, const void *));
+static int __db_join_del __P((DBC *, u_int32_t));
+static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
+static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
+
+/*
+ * Check to see if the Nth secondary cursor of join cursor jc is pointing
+ * to a sorted duplicate set.
+ */
+#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
+
+/*
+ * This is the duplicate-assisted join functionality. Right now we're
+ * going to write it such that we return one item at a time, although
+ * I think we may need to optimize it to return them all at once.
+ * It should be easier to get it working this way, and I believe that
+ * changing it should be fairly straightforward.
+ *
+ * We optimize the join by sorting cursors from smallest to largest
+ * cardinality. In most cases, this is indeed optimal. However, if
+ * a cursor with large cardinality has very few data in common with the
+ * first cursor, it is possible that the join will be made faster by
+ * putting it earlier in the cursor list. Since we have no way to detect
+ * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
+ * the sort order specified by the caller, who may know more about the
+ * structure of the data.
+ *
+ * The first cursor moves sequentially through the duplicate set while
+ * the others search explicitly for the duplicate in question.
+ *
+ */
+
+/*
+ * __db_join --
+ *	This is the interface to the duplicate-assisted join functionality.
+ * In the same way that cursors mark a position in a database, a cursor
+ * can mark a position in a join.  While most cursors are created by the
+ * cursor method of a DB, join cursors are created through an explicit
+ * call to DB->join.
+ *
+ *	The curslist is an array of existing, initialized cursors and primary
+ * is the DB of the primary file.  The data item that joins all the
+ * cursors in the curslist is used as the key into the primary and that
+ * key and data are returned.  When no more items are left in the join
+ * set, the c_next operation off the join cursor will return DB_NOTFOUND.
+ *
+ * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join(primary, curslist, dbcp, flags)
+	DB *primary;
+	DBC **curslist, **dbcp;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	size_t ncurs, nslots;
+	u_int32_t i;
+	int ret;
+
+	env = primary->env;
+	dbc = NULL;
+	jc = NULL;
+
+	if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+		goto err;
+
+	if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
+		goto err;
+
+	/* Working key buffer; grown on demand by the get routines. */
+	if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
+		goto err;
+	jc->j_key.ulen = 256;
+	F_SET(&jc->j_key, DB_DBT_USERMEM);
+
+	F_SET(&jc->j_rdata, DB_DBT_REALLOC);
+
+	/* Walk to the NULL terminator to count the caller's cursors. */
+	for (jc->j_curslist = curslist;
+	    *jc->j_curslist != NULL; jc->j_curslist++)
+		;
+
+	/*
+	 * The number of cursor slots we allocate is one greater than
+	 * the number of cursors involved in the join, because the
+	 * list is NULL-terminated.
+	 */
+	ncurs = (size_t)(jc->j_curslist - curslist);
+	nslots = ncurs + 1;
+
+	/*
+	 * !!! -- A note on the various lists hanging off jc.
+	 *
+	 * j_curslist is the initial NULL-terminated list of cursors passed
+	 * into __db_join.  The original cursors are not modified; pristine
+	 * copies are required because, in databases with unsorted dups, we
+	 * must reset all of the secondary cursors after the first each
+	 * time the first one is incremented, or else we will lose data
+	 * which happen to be sorted differently in two different cursors.
+	 *
+	 * j_workcurs is where we put those copies that we're planning to
+	 * work with.  They're lazily c_dup'ed from j_curslist as we need
+	 * them, and closed when the join cursor is closed or when we need
+	 * to reset them to their original values (in which case we just
+	 * c_dup afresh).
+	 *
+	 * j_fdupcurs is an array of cursors which point to the first
+	 * duplicate in the duplicate set that contains the data value
+	 * we're currently interested in.  We need this to make
+	 * __db_join_get correctly return duplicate duplicates; i.e., if a
+	 * given data value occurs twice in the set belonging to cursor #2,
+	 * and thrice in the set belonging to cursor #3, and once in all
+	 * the other cursors, successive calls to __db_join_get need to
+	 * return that data item six times.  To make this happen, each time
+	 * cursor N is allowed to advance to a new datum, all cursors M
+	 * such that M > N have to be reset to the first duplicate with
+	 * that datum, so __db_join_get will return all the dup-dups again.
+	 * We could just reset them to the original cursor from j_curslist,
+	 * but that would be a bit slower in the unsorted case and a LOT
+	 * slower in the sorted one.
+	 *
+	 * j_exhausted is a list of boolean values which represent
+	 * whether or not their corresponding cursors are "exhausted",
+	 * i.e. whether the datum under the corresponding cursor has
+	 * been found not to exist in any unreturned combinations of
+	 * later secondary cursors, in which case they are ready to be
+	 * incremented.
+	 */
+
+	/* We don't want to free regions whose callocs have failed. */
+	jc->j_curslist = NULL;
+	jc->j_workcurs = NULL;
+	jc->j_fdupcurs = NULL;
+	jc->j_exhausted = NULL;
+
+	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+	    &jc->j_curslist)) != 0)
+		goto err;
+	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+	    &jc->j_workcurs)) != 0)
+		goto err;
+	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+	    &jc->j_fdupcurs)) != 0)
+		goto err;
+	if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
+	    &jc->j_exhausted)) != 0)
+		goto err;
+	for (i = 0; curslist[i] != NULL; i++) {
+		jc->j_curslist[i] = curslist[i];
+		jc->j_workcurs[i] = NULL;
+		jc->j_fdupcurs[i] = NULL;
+		jc->j_exhausted[i] = 0;
+	}
+	jc->j_ncurs = (u_int32_t)ncurs;
+
+	/*
+	 * If DB_JOIN_NOSORT is not set, optimize secondary cursors by
+	 * sorting in order of increasing cardinality.
+	 */
+	if (!LF_ISSET(DB_JOIN_NOSORT))
+		qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
+
+	/*
+	 * We never need to reset the 0th cursor, so there's no
+	 * solid reason to use workcurs[0] rather than curslist[0] in
+	 * join_get.  Nonetheless, it feels cleaner to do it for symmetry,
+	 * and this is the most logical place to copy it.
+	 *
+	 * !!!
+	 * There's no need to close the new cursor if we goto err only
+	 * because this is the last thing that can fail.  Modifier of this
+	 * function beware!
+	 */
+	if ((ret =
+	    __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
+		goto err;
+
+	dbc->close = dbc->c_close = __db_join_close_pp;
+	dbc->del = dbc->c_del = __db_join_del;
+	dbc->get = dbc->c_get = __db_join_get_pp;
+	dbc->put = dbc->c_put = __db_join_put;
+	dbc->internal = (DBC_INTERNAL *)jc;
+	dbc->dbp = primary;
+	jc->j_primary = primary;
+
+	/* Stash the first cursor's transaction here for easy access. */
+	dbc->txn = curslist[0]->txn;
+
+	*dbcp = dbc;
+
+	MUTEX_LOCK(env, primary->mutex);
+	TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
+	MUTEX_UNLOCK(env, primary->mutex);
+
+	return (0);
+
+	/*
+	 * Error path: release everything allocated so far.  jc was
+	 * zero-filled by __os_calloc, so any field that was never
+	 * allocated is NULL here.  j_key.data is allocated before the
+	 * cursor-list arrays and must be freed too -- omitting it leaked
+	 * 256 bytes whenever a later allocation or the __dbc_dup failed.
+	 */
+err:	if (jc != NULL) {
+		if (jc->j_key.data != NULL)
+			__os_free(env, jc->j_key.data);
+		if (jc->j_curslist != NULL)
+			__os_free(env, jc->j_curslist);
+		if (jc->j_workcurs != NULL) {
+			if (jc->j_workcurs[0] != NULL)
+				(void)__dbc_close(jc->j_workcurs[0]);
+			__os_free(env, jc->j_workcurs);
+		}
+		if (jc->j_fdupcurs != NULL)
+			__os_free(env, jc->j_fdupcurs);
+		if (jc->j_exhausted != NULL)
+			__os_free(env, jc->j_exhausted);
+		__os_free(env, jc);
+	}
+	if (dbc != NULL)
+		__os_free(env, dbc);
+	return (ret);
+}
+
+/*
+ * __db_join_close_pp --
+ *	DBC->close pre/post processing for join cursors.
+ *
+ * Wraps __db_join_close in ENV_ENTER/ENV_LEAVE and the replication
+ * enter/exit pair.
+ */
+static int
+__db_join_close_pp(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, dbc->txn != NULL)) != 0) {
+		/* Failed enter: skip the matching rep-exit below. */
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_join_close(dbc);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+static int
+__db_join_put(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	/*
+	 * Join cursors are read-only; a put through one is always an
+	 * error.  Quiet the compiler about the unused parameters.
+	 */
+	COMPQUIET(flags, 0);
+	COMPQUIET(data, NULL);
+	COMPQUIET(key, NULL);
+	COMPQUIET(dbc, NULL);
+	return (EINVAL);
+}
+
+static int
+__db_join_del(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	/*
+	 * Join cursors are read-only; a delete through one is always an
+	 * error.  Quiet the compiler about the unused parameters.
+	 */
+	COMPQUIET(flags, 0);
+	COMPQUIET(dbc, NULL);
+	return (EINVAL);
+}
+
+/*
+ * __db_join_get_pp --
+ *	DBjoin->get pre/post processing.
+ *
+ * Validates the flags (only 0 and DB_JOIN_ITEM, plus the isolation/RMW
+ * modifiers, are legal) and rejects partial keys, then calls
+ * __db_join_get under the replication block.
+ */
+static int
+__db_join_get_pp(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	u_int32_t handle_check, save_flags;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/*
+	 * Save the original flags value: the modifiers are stripped only
+	 * for checking here and restored before the real call.
+	 */
+	save_flags = flags;
+
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DBC->get"));
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	switch (flags) {
+	case 0:
+	case DB_JOIN_ITEM:
+		break;
+	default:
+		return (__db_ferr(env, "DBC->get", 0));
+	}
+
+	/*
+	 * A partial get of the key of a join cursor don't make much sense;
+	 * the entire key is necessary to query the primary database
+	 * and find the datum, and so regardless of the size of the key
+	 * it would not be a performance improvement.  Since it would require
+	 * special handling, we simply disallow it.
+	 *
+	 * A partial get of the data, however, potentially makes sense (if
+	 * all possible data are a predictable large structure, for instance)
+	 * and causes us no headaches, so we permit it.
+	 */
+	if (F_ISSET(key, DB_DBT_PARTIAL)) {
+		__db_errx(env,
+		    "DB_DBT_PARTIAL may not be set on key during join_get");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, dbc->txn != NULL)) != 0) {
+		/* Failed enter: skip the matching rep-exit below. */
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Restore the original flags value. */
+	flags = save_flags;
+
+	ret = __db_join_get(dbc, key, data, flags);
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, NULL);
+	return (ret);
+}
+
+/*
+ * __db_join_get --
+ *	The guts of the join: return the next key that appears in all of
+ *	the joined secondary cursors and, unless DB_JOIN_ITEM was given,
+ *	look up and return that key's datum from the primary.
+ *
+ * Fix: the "partial reset" path below failed to capture the return of
+ * __dbc_dup into ret, so a failed cursor dup was reported as the stale
+ * DB_NOTFOUND (i.e. as normal join exhaustion) instead of an error.
+ */
+static int
+__db_join_get(dbc, key_arg, data_arg, flags)
+	DBC *dbc;
+	DBT *key_arg, *data_arg;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DBC *cp;
+	DBT *key_n, key_n_mem;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	int db_manage_data, ret;
+	u_int32_t i, j, operation, opmods;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	jc = (JOIN_CURSOR *)dbc->internal;
+
+	operation = LF_ISSET(DB_OPFLAGS_MASK);
+
+	/* !!!
+	 * If the set of flags here changes, check that __db_join_primget
+	 * is updated to handle them properly.
+	 */
+	opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+
+	/*
+	 * Since we are fetching the key as a datum in the secondary indices,
+	 * we must be careful of caller-specified DB_DBT_* memory
+	 * management flags.  If necessary, use a stack-allocated DBT;
+	 * we'll appropriately copy and/or allocate the data later.
+	 */
+	if (F_ISSET(key_arg,
+	    DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+		/* We just use the default buffer; no need to go malloc. */
+		key_n = &key_n_mem;
+		memset(key_n, 0, sizeof(DBT));
+	} else {
+		/*
+		 * Either DB_DBT_REALLOC or the default buffer will work
+		 * fine if we have to reuse it, as we do.
+		 */
+		key_n = key_arg;
+	}
+	if (F_ISSET(key_arg, DB_DBT_USERCOPY))
+		key_arg->data = NULL;
+
+	/*
+	 * If our last attempt to do a get on the primary key failed,
+	 * short-circuit the join and try again with the same key.
+	 */
+	if (F_ISSET(jc, JOIN_RETRY))
+		goto samekey;
+	F_CLR(jc, JOIN_RETRY);
+
+retry:	ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
+	    opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
+
+	if (ret == DB_BUFFER_SMALL) {
+		jc->j_key.ulen <<= 1;
+		if ((ret = __os_realloc(env,
+		    jc->j_key.ulen, &jc->j_key.data)) != 0)
+			goto mem_err;
+		goto retry;
+	}
+
+	/*
+	 * If ret == DB_NOTFOUND, we're out of elements of the first
+	 * secondary cursor.  This is how we finally finish the join
+	 * if all goes well.
+	 */
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
+	 * and we're going to want to advance all the cursors that point to
+	 * the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
+	 * Close all the cursors in j_fdupcurs; we'll reopen them the
+	 * first time through the upcoming loop.
+	 */
+	for (i = 1; i < jc->j_ncurs; i++) {
+		if (jc->j_fdupcurs[i] != NULL &&
+		    (ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+			goto err;
+		jc->j_fdupcurs[i] = NULL;
+	}
+
+	/*
+	 * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
+	 * Thus, we can safely increment that one cursor on each call
+	 * to __db_join_get, and we signal this by setting jc->j_exhausted[0]
+	 * right away.
+	 *
+	 * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
+	 * increment it until we know we're ready to.
+	 */
+	if (jc->j_curslist[1] == NULL)
+		jc->j_exhausted[0] = 1;
+	else
+		jc->j_exhausted[0] = 0;
+
+	/* We have the first element; now look for it in the other cursors. */
+	for (i = 1; i < jc->j_ncurs; i++) {
+		DB_ASSERT(env, jc->j_curslist[i] != NULL);
+		if (jc->j_workcurs[i] == NULL)
+			/* If this is NULL, we need to dup curslist into it. */
+			if ((ret = __dbc_dup(jc->j_curslist[i],
+			    &jc->j_workcurs[i], DB_POSITION)) != 0)
+				goto err;
+
+retry2:		cp = jc->j_workcurs[i];
+
+		if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
+		    jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
+			/*
+			 * jc->j_workcurs[i] has no more of the datum we're
+			 * interested in.  Go back one cursor and get
+			 * a new dup.  We can't just move to a new
+			 * element of the outer relation, because that way
+			 * we might miss duplicate duplicates in cursor i-1.
+			 *
+			 * If this takes us back to the first cursor,
+			 * -then- we can move to a new element of the outer
+			 * relation.
+			 */
+			--i;
+			jc->j_exhausted[i] = 1;
+
+			if (i == 0) {
+				for (j = 1; jc->j_workcurs[j] != NULL; j++) {
+					/*
+					 * We're moving to a new element of
+					 * the first secondary cursor.  If
+					 * that cursor is sorted, then any
+					 * other sorted cursors can be safely
+					 * reset to the first duplicate
+					 * duplicate in the current set if we
+					 * have a pointer to it (we can't just
+					 * leave them be, or we'll miss
+					 * duplicate duplicates in the outer
+					 * relation).
+					 *
+					 * If the first cursor is unsorted, or
+					 * if cursor j is unsorted, we can
+					 * make no assumptions about what
+					 * we're looking for next or where it
+					 * will be, so we reset to the very
+					 * beginning (setting workcurs NULL
+					 * will achieve this next go-round).
+					 *
+					 * XXX: This is likely to break
+					 * horribly if any two cursors are
+					 * both sorted, but have different
+					 * specified sort functions.  For,
+					 * now, we dismiss this as pathology
+					 * and let strange things happen--we
+					 * can't make rope childproof.
+					 */
+					if ((ret = __dbc_close(
+					    jc->j_workcurs[j])) != 0)
+						goto err;
+					if (!SORTED_SET(jc, 0) ||
+					    !SORTED_SET(jc, j) ||
+					    jc->j_fdupcurs[j] == NULL)
+						/*
+						 * Unsafe conditions;
+						 * reset fully.
+						 */
+						jc->j_workcurs[j] = NULL;
+					else
+						/*
+						 * Partial reset suffices.
+						 * Capture the return value so
+						 * a failed dup is reported as
+						 * an error, not as the stale
+						 * DB_NOTFOUND from above.
+						 */
+						if ((ret = __dbc_dup(
+						    jc->j_fdupcurs[j],
+						    &jc->j_workcurs[j],
+						    DB_POSITION)) != 0)
+							goto err;
+					jc->j_exhausted[j] = 0;
+				}
+				goto retry;
+				/* NOTREACHED */
+			}
+
+			/*
+			 * We're about to advance the cursor and need to
+			 * reset all of the workcurs[j] where j>i, so that
+			 * we don't miss any duplicate duplicates.
+			 */
+			for (j = i + 1;
+			    jc->j_workcurs[j] != NULL;
+			    j++) {
+				if ((ret =
+				    __dbc_close(jc->j_workcurs[j])) != 0)
+					goto err;
+				jc->j_exhausted[j] = 0;
+				if (jc->j_fdupcurs[j] == NULL)
+					jc->j_workcurs[j] = NULL;
+				else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
+				    &jc->j_workcurs[j], DB_POSITION)) != 0)
+					goto err;
+			}
+			goto retry2;
+			/* NOTREACHED */
+		}
+
+		if (ret == DB_BUFFER_SMALL) {
+			jc->j_key.ulen <<= 1;
+			if ((ret = __os_realloc(env, jc->j_key.ulen,
+			    &jc->j_key.data)) != 0) {
+mem_err:			__db_errx(env,
+				    "Allocation failed for join key, len = %lu",
+				    (u_long)jc->j_key.ulen);
+				goto err;
+			}
+			goto retry2;
+		}
+
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * If we made it this far, we've found a matching
+		 * datum in cursor i.  Mark the current cursor
+		 * unexhausted, so we don't miss any duplicate
+		 * duplicates the next go-round--unless this is the
+		 * very last cursor, in which case there are none to
+		 * miss, and we'll need that exhausted flag to finally
+		 * get a DB_NOTFOUND and move on to the next datum in
+		 * the outermost cursor.
+		 */
+		if (i + 1 != jc->j_ncurs)
+			jc->j_exhausted[i] = 0;
+		else
+			jc->j_exhausted[i] = 1;
+
+		/*
+		 * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
+		 * sorted, then we're here for the first time since advancing
+		 * cursor 0, and we have a new datum of interest.
+		 * jc->j_workcurs[i] points to the beginning of a set of
+		 * duplicate duplicates;  store this into jc->j_fdupcurs[i].
+		 */
+		if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
+		    __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
+			goto err;
+	}
+
+err:	if (ret != 0)
+		return (ret);
+
+	if (0) {
+samekey:	/*
+		 * Get the key we tried and failed to return last time;
+		 * it should be the current datum of all the secondary cursors.
+		 */
+		if ((ret = __dbc_get(jc->j_workcurs[0],
+		    &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
+			return (ret);
+		F_CLR(jc, JOIN_RETRY);
+	}
+
+	/*
+	 * ret == 0;  we have a key to return.
+	 *
+	 * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
+	 * back into the dbt we were given for the key; call __db_retcopy.
+	 * Otherwise, assert that we do not need to copy anything and proceed.
+	 */
+	DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+	    DB_DBT_USERCOPY) || key_n == key_arg);
+
+	if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+	    DB_DBT_USERCOPY)) &&
+	    (ret = __db_retcopy(env,
+	    key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
+		/*
+		 * The retcopy failed, most commonly because we have a user
+		 * buffer for the key which is too small. Set things up to
+		 * retry next time, and return.
+		 */
+		F_SET(jc, JOIN_RETRY);
+		return (ret);
+	}
+
+	/*
+	 * If DB_JOIN_ITEM is set, we return it;  otherwise we do the lookup
+	 * in the primary and then return.
+	 */
+	if (operation == DB_JOIN_ITEM)
+		return (0);
+
+	/*
+	 * If data_arg->flags == 0--that is, if DB is managing the
+	 * data DBT's memory--it's not safe to just pass the DBT
+	 * through to the primary get call, since we don't want that
+	 * memory to belong to the primary DB handle (and if the primary
+	 * is free-threaded, it can't anyway).
+	 *
+	 * Instead, use memory that is managed by the join cursor, in
+	 * jc->j_rdata.
+	 */
+	if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERMEM | DB_DBT_USERCOPY))
+		db_manage_data = 1;
+	else
+		db_manage_data = 0;
+	if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
+	    jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
+	    db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
+		if (ret == DB_NOTFOUND) {
+			if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+			    (jc->j_curslist[0]->txn != NULL && F_ISSET(
+			    jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
+				goto retry;
+			/*
+			 * If ret == DB_NOTFOUND, the primary and secondary
+			 * are out of sync;  every item in each secondary
+			 * should correspond to something in the primary,
+			 * or we shouldn't have done the join this way.
+			 * Wail.
+			 */
+			ret = __db_secondary_corrupt(jc->j_primary);
+		} else
+			/*
+			 * The get on the primary failed for some other
+			 * reason, most commonly because we're using a user
+			 * buffer that's not big enough.  Flag our failure
+			 * so we can return the same key next time.
+			 */
+			F_SET(jc, JOIN_RETRY);
+	}
+	if (db_manage_data && ret == 0) {
+		data_arg->data = jc->j_rdata.data;
+		data_arg->size = jc->j_rdata.size;
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_join_close --
+ *	DBC->close for join cursors.
+ *
+ * Unlinks the cursor from the primary's join queue, closes all scratch
+ * cursors, and frees every allocation made when the join was created.
+ *
+ * PUBLIC: int __db_join_close __P((DBC *));
+ */
+int
+__db_join_close(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	int ret, t_ret;
+	u_int32_t i;
+
+	jc = (JOIN_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	env = dbp->env;
+	ret = t_ret = 0;
+
+	/*
+	 * Remove from active list of join cursors.  Note that this
+	 * must happen before any action that can fail and return, or else
+	 * __db_close may loop indefinitely.
+	 */
+	MUTEX_LOCK(env, dbp->mutex);
+	TAILQ_REMOVE(&dbp->join_queue, dbc, links);
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	ENV_ENTER(env, ip);
+	/*
+	 * Close any open scratch cursors.  In each case, there may
+	 * not be as many outstanding as there are cursors in
+	 * curslist, but we want to close whatever's there.
+	 *
+	 * If any close fails, there's no reason not to close everything else;
+	 * we'll just return the error code of the last one to fail.  There's
+	 * not much the caller can do anyway, since these cursors only exist
+	 * hanging off a db-internal data structure that they shouldn't be
+	 * mucking with.
+	 */
+	for (i = 0; i < jc->j_ncurs; i++) {
+		if (jc->j_workcurs[i] != NULL &&
+		    (t_ret = __dbc_close(jc->j_workcurs[i])) != 0)
+			ret = t_ret;
+		if (jc->j_fdupcurs[i] != NULL &&
+		    (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+			ret = t_ret;
+	}
+	ENV_LEAVE(env, ip);
+
+	/* Release everything the join allocated, then the cursor itself. */
+	__os_free(env, jc->j_exhausted);
+	__os_free(env, jc->j_curslist);
+	__os_free(env, jc->j_workcurs);
+	__os_free(env, jc->j_fdupcurs);
+	__os_free(env, jc->j_key.data);
+	/* j_rdata was allocated by __db_retcopy on the user's behalf. */
+	if (jc->j_rdata.data != NULL)
+		__os_ufree(env, jc->j_rdata.data);
+	__os_free(env, jc);
+	__os_free(env, dbc);
+
+	return (ret);
+}
+
+/*
+ * __db_join_getnext --
+ *	This function replaces the DBC_CONTINUE and DBC_KEYSET
+ *	functionality inside the various cursor get routines.
+ *
+ *	If exhausted == 0, we're not done with the current datum;
+ *	return it if it matches "matching", otherwise search
+ *	using DB_GET_BOTHC (which is faster than iteratively doing
+ *	DB_NEXT_DUP) forward until we find one that does.
+ *
+ *	If exhausted == 1, we are done with the current datum, so just
+ *	leap forward to searching NEXT_DUPs.
+ *
+ *	If no matching datum exists, returns DB_NOTFOUND, else 0.
+ *
+ * Fix: the malloc'ed ldata buffer is now freed even when __db_retcopy
+ * fails; the original code leaked it on that path.
+ */
+static int
+__db_join_getnext(dbc, key, data, exhausted, opmods)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t exhausted, opmods;
+{
+	int ret, cmp;
+	DB *dbp;
+	DBT ldata;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+
+	dbp = dbc->dbp;
+	func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+
+	switch (exhausted) {
+	case 0:
+		/*
+		 * We don't want to step on data->data;  use a new
+		 * DBT and malloc so we don't step on dbc's rdata memory.
+		 */
+		memset(&ldata, 0, sizeof(DBT));
+		F_SET(&ldata, DB_DBT_MALLOC);
+		if ((ret = __dbc_get(dbc,
+		    key, &ldata, opmods | DB_CURRENT)) != 0)
+			break;
+		cmp = func(dbp, data, &ldata);
+		if (cmp == 0) {
+			/*
+			 * We have to return the real data value.  Copy
+			 * it into data, then free the buffer we malloc'ed
+			 * above.  Free it on the error path too, or it
+			 * would leak when __db_retcopy fails.
+			 */
+			ret = __db_retcopy(dbp->env, data, ldata.data,
+			    ldata.size, &data->data, &data->size);
+			__os_ufree(dbp->env, ldata.data);
+			return (ret);
+		}
+
+		/*
+		 * Didn't match--we want to fall through and search future
+		 * dups.  We just forget about ldata and free
+		 * its buffer--data contains the value we're searching for.
+		 */
+		__os_ufree(dbp->env, ldata.data);
+		/* FALLTHROUGH */
+	case 1:
+		ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
+		break;
+	default:
+		ret = EINVAL;
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_join_cmp --
+ *	Comparison function for sorting DBCs in cardinality order.
+ *
+ * Returns a negative, zero or positive value as cursor a's duplicate
+ * count is less than, equal to or greater than cursor b's.  If either
+ * count cannot be fetched we return "equal" and leave the order alone.
+ *
+ * Fix: compare the counts explicitly rather than returning
+ * (long)counta - (long)countb, which could overflow/truncate when long
+ * is 32 bits and the db_recno_t counts are large.
+ */
+static int
+__db_join_cmp(a, b)
+	const void *a, *b;
+{
+	DBC *dbca, *dbcb;
+	db_recno_t counta, countb;
+
+	dbca = *((DBC * const *)a);
+	dbcb = *((DBC * const *)b);
+
+	if (__dbc_count(dbca, &counta) != 0 ||
+	    __dbc_count(dbcb, &countb) != 0)
+		return (0);
+
+	if (counta < countb)
+		return (-1);
+	if (counta > countb)
+		return (1);
+	return (0);
+}
+
+/*
+ * __db_join_primget --
+ *	Perform a DB->get in the primary, being careful not to use a new
+ * locker ID if we're doing CDB locking.
+ *
+ * Opens a transient cursor under the caller's locker, propagates the
+ * isolation flags onto it, does a DB_SET get, and closes the cursor.
+ */
+static int
+__db_join_primget(dbp, ip, txn, locker, key, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DB_LOCKER *locker;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	u_int32_t rmw;
+	int ret, t_ret;
+
+	/* Reuse the join's locker so CDB locking doesn't self-deadlock. */
+	if ((ret = __db_cursor_int(dbp, ip,
+	    txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0)
+		return (ret);
+
+	/*
+	 * The only allowable flags here are the two flags copied into "opmods"
+	 * in __db_join_get, DB_RMW and DB_READ_UNCOMMITTED.  The former is an
+	 * op on the c_get call, the latter on the cursor call.  It's a DB bug
+	 * if we allow any other flags down in here.
+	 */
+	rmw = LF_ISSET(DB_RMW);
+	if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+	    (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
+		F_SET(dbc, DBC_READ_UNCOMMITTED);
+
+	if (LF_ISSET(DB_READ_COMMITTED) ||
+	    (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
+		F_SET(dbc, DBC_READ_COMMITTED);
+
+	/* After clearing the known modifiers nothing may remain. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	DB_ASSERT(dbp->env, flags == 0);
+
+	F_SET(dbc, DBC_TRANSIENT);
+
+	/*
+	 * This shouldn't be necessary, thanks to the fact that join cursors
+	 * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
+	 * sake, we mirror what __db_get does.
+	 */
+	SET_RET_MEM(dbc, dbp);
+
+	ret = __dbc_get(dbc, key, data, DB_SET | rmw);
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_secondary_corrupt --
+ *	Report primary/secondary inconsistencies.
+ *
+ * Logs a DB_SECONDARY_BAD message naming the database (file and, when
+ * present, subdatabase) and returns DB_SECONDARY_BAD to the caller.
+ *
+ * PUBLIC: int __db_secondary_corrupt __P((DB *));
+ */
+int
+__db_secondary_corrupt(dbp)
+	DB *dbp;
+{
+	const char *dname, *fname, *sep;
+
+	fname = dbp->fname == NULL ? "unnamed" : dbp->fname;
+	if (dbp->dname == NULL) {
+		sep = "";
+		dname = "";
+	} else {
+		sep = "/";
+		dname = dbp->dname;
+	}
+	__db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s", fname, sep, dname);
+	return (DB_SECONDARY_BAD);
+}
diff --git a/db/db_meta.c b/db/db_meta.c
new file mode 100644
index 0000000..ef42e44
--- /dev/null
+++ b/db/db_meta.c
@@ -0,0 +1,1299 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+
+static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+static int __db_pglistcmp __P((const void *, const void *));
+static int __db_truncate_freelist __P((DBC *, DBMETA *,
+ PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __db_init_meta --
+ *	Helper function for __db_new that initializes the important fields in
+ * a meta-data page (used instead of P_INIT).  We need to make sure that we
+ * retain the page number and LSN of the existing page.
+ */
+static void
+__db_init_meta(dbp, p, pgno, pgtype)
+	DB *dbp;
+	void *p;
+	db_pgno_t pgno;
+	u_int32_t pgtype;
+{
+	DBMETA *m;
+	DB_LSN orig_lsn;
+
+	m = (DBMETA *)p;
+
+	/* Clear the page while preserving the LSN already on it. */
+	orig_lsn = m->lsn;
+	memset(m, 0, sizeof(DBMETA));
+	m->lsn = orig_lsn;
+
+	m->pgno = pgno;
+	m->type = (u_int8_t)pgtype;
+	m->pagesize = dbp->pgsize;
+	if (F_ISSET(dbp, DB_AM_CHKSUM))
+		FLD_SET(m->metaflags, DBMETA_CHKSUM);
+}
+
+/*
+ * __db_new --
+ *	Get a new page, preferably from the freelist.
+ *
+ * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+ */
+int
+__db_new(dbc, type, lockp, pagepp)
+	DBC *dbc;
+	u_int32_t type;
+	DB_LOCK *lockp;
+	PAGE **pagepp;
+{
+	DB *dbp;
+	DBMETA *meta;
+	DB_LOCK metalock;
+	DB_LSN lsn;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	db_pgno_t last, *list, pgno, newnext;
+	int extend, hash, ret, t_ret;
+
+	meta = NULL;
+	dbp = dbc->dbp;
+	env = dbp->env;
+	mpf = dbp->mpf;
+	h = NULL;
+	newnext = PGNO_INVALID;
+	if (lockp != NULL)
+		LOCK_INIT(*lockp);
+
+	hash = 0;
+	ret = 0;
+	LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+	/*
+	 * Hash keeps its own reference to the metadata page; reuse it so
+	 * we don't fetch and lock the base metadata page a second time.
+	 */
+	if (dbp->type == DB_HASH) {
+		if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+			goto err;
+		if (meta != NULL)
+			hash = 1;
+	}
+#endif
+	if (meta == NULL) {
+		pgno = PGNO_BASE_MD;
+		if ((ret = __db_lget(dbc,
+		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+			goto err;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+		    DB_MPOOL_DIRTY, &meta)) != 0)
+			goto err;
+	}
+
+	last = meta->last_pgno;
+	if (meta->free == PGNO_INVALID) {
+		/*
+		 * The freelist is empty: we'd have to grow the file.  The
+		 * caller can forbid that with P_DONTEXTEND.
+		 */
+		if (FLD_ISSET(type, P_DONTEXTEND)) {
+			*pagepp = NULL;
+			goto err;
+		}
+		last = pgno = meta->last_pgno + 1;
+		ZERO_LSN(lsn);
+		extend = 1;
+	} else {
+		pgno = meta->free;
+		/*
+		 * Lock the new page.  Do this here because we must do it
+		 * before getting the page and the caller may need the lock
+		 * to keep readers from seeing the page before the transaction
+		 * commits.  We can do this because no one will hold a free
+		 * page locked.
+		 */
+		if (lockp != NULL && (ret =
+		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+			goto err;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+		    DB_MPOOL_DIRTY, &h)) != 0)
+			goto err;
+
+		/*
+		 * We want to take the first page off the free list and
+		 * then set meta->free to the that page's next_pgno, but
+		 * we need to log the change first.
+		 */
+		newnext = h->next_pgno;
+		lsn = h->lsn;
+		extend = 0;
+		/* Diagnostic builds assert; production builds panic below. */
+		DB_ASSERT(env, TYPE(h) == P_INVALID);
+
+		if (TYPE(h) != P_INVALID) {
+			__db_errx(env,
+			    "%s page %lu is on free list with type %lu",
+				dbp->fname, (u_long)PGNO(h), (u_long)TYPE(h));
+			return (__env_panic(env, EINVAL));
+		}
+
+	}
+
+	FLD_CLR(type, P_DONTEXTEND);
+
+	/*
+	 * Log the allocation before fetching the new page.  If we
+	 * don't have room in the log then we don't want to tell
+	 * mpool to extend the file.
+	 */
+	if (DBC_LOGGING(dbc)) {
+		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
+		    &LSN(meta), PGNO_BASE_MD, &lsn,
+		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
+			goto err;
+	} else
+		LSN_NOT_LOGGED(LSN(meta));
+
+	meta->free = newnext;
+
+	if (extend == 1) {
+		/* Grow the file: lock and create the brand-new last page. */
+		if (lockp != NULL && (ret =
+		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+			goto err;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+		    DB_MPOOL_NEW, &h)) != 0)
+			goto err;
+		DB_ASSERT(env, last == pgno);
+		meta->last_pgno = pgno;
+		ZERO_LSN(h->lsn);
+		h->pgno = pgno;
+	}
+	LSN(h) = LSN(meta);
+
+	/* The hash meta page is owned elsewhere; only put our own fget. */
+	if (hash == 0)
+		ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+	meta = NULL;
+	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret != 0)
+		goto err;
+
+	switch (type) {
+	case P_BTREEMETA:
+	case P_HASHMETA:
+	case P_QAMMETA:
+		__db_init_meta(dbp, h, h->pgno, type);
+		break;
+	default:
+		P_INIT(h, dbp->pgsize,
+		    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+		break;
+	}
+
+	/* Fix up the sorted free list if necessary. */
+#ifdef HAVE_FTRUNCATE
+	if (extend == 0) {
+		u_int32_t nelems = 0;
+
+		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
+			goto err;
+		if (nelems != 0) {
+			/* We allocated the smallest page; drop list[0]. */
+			DB_ASSERT(env, h->pgno == list[0]);
+			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
+			if ((ret = __memp_extend_freelist(
+			    dbp->mpf, nelems - 1, &list)) != 0)
+				goto err;
+		}
+	}
+#else
+	COMPQUIET(list, NULL);
+#endif
+
+	*pagepp = h;
+	return (0);
+
+err:	if (h != NULL)
+		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+	if (meta != NULL && hash == 0)
+		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+	(void)__TLPUT(dbc, metalock);
+	if (lockp != NULL)
+		(void)__LPUT(dbc, *lockp);
+	return (ret);
+}
+
+/*
+ * __db_free --
+ *	Add a page to the head of the freelist.
+ *
+ * With HAVE_FTRUNCATE the in-memory freelist is kept sorted so the file
+ * can be physically truncated when the freed page(s) fall at the end;
+ * otherwise the page is simply linked at the head of the on-disk list.
+ * On return the caller's reference to h has been discarded, even on error.
+ *
+ * PUBLIC: int __db_free __P((DBC *, PAGE *));
+ */
+int
+__db_free(dbc, h)
+	DBC *dbc;
+	PAGE *h;
+{
+	DB *dbp;
+	DBMETA *meta;
+	DBT ddbt, ldbt;
+	DB_LOCK metalock;
+	DB_LSN *lsnp;
+	DB_MPOOLFILE *mpf;
+	PAGE *prev;
+	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
+	u_int32_t lflag;
+	int hash, ret, t_ret;
+#ifdef HAVE_FTRUNCATE
+	db_pgno_t *list, *lp;
+	u_int32_t nelem, position, start;
+	int do_truncate;
+#endif
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	prev_pgno = PGNO_INVALID;
+	meta = NULL;
+	prev = NULL;
+	LOCK_INIT(metalock);
+#ifdef HAVE_FTRUNCATE
+	lp = NULL;
+	nelem = 0;
+	do_truncate = 0;
+#endif
+
+	/*
+	 * Retrieve the metadata page.  If we are not keeping a sorted
+	 * free list put the page at the head of the the free list.
+	 * If we are keeping a sorted free list, for truncation,
+	 * then figure out where this page belongs and either
+	 * link it in or truncate the file as much as possible.
+	 * If either the lock get or page get routines
+	 * fail, then we need to put the page with which we were called
+	 * back because our caller assumes we take care of it.
+	 */
+	hash = 0;
+
+	pgno = PGNO_BASE_MD;
+#ifdef HAVE_HASH
+	/* Hash keeps its own metadata reference; use it when available. */
+	if (dbp->type == DB_HASH) {
+		if ((ret = __ham_return_meta(dbc,
+#ifdef HAVE_FTRUNCATE
+		    0,
+#else
+		    DB_MPOOL_DIRTY,
+#endif
+		    &meta)) != 0)
+			goto err;
+		if (meta != NULL)
+			hash = 1;
+	}
+#endif
+	if (meta == NULL) {
+		if ((ret = __db_lget(dbc,
+		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+			goto err;
+
+		/* If we support truncate, we might not dirty the meta page. */
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+#ifdef HAVE_FTRUNCATE
+		    0,
+#else
+		    DB_MPOOL_DIRTY,
+#endif
+		    &meta)) != 0)
+			goto err1;
+	}
+
+	last_pgno = meta->last_pgno;
+	next_pgno = meta->free;
+	/*
+	 * Assign lsnp here so it always initialized when
+	 * HAVE_FTRUNCATE is not defined.
+	 */
+	lsnp = &LSN(meta);
+
+	DB_ASSERT(dbp->env, h->pgno != next_pgno);
+
+#ifdef HAVE_FTRUNCATE
+	/*
+	 * If we are maintaining a sorted free list see if we either have a
+	 * new truncation point or the page goes somewhere in the middle of
+	 * the list.  If it goes in the middle of the list, we will drop the
+	 * meta page and get the previous page.
+	 */
+	if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+		goto err1;
+	if (list == NULL)
+		goto no_sort;
+
+	if (h->pgno != last_pgno) {
+		/*
+		 * Put the page number in the sorted list.
+		 * Finds its position and the previous page,
+		 * extend the list, make room and insert.
+		 */
+		position = 0;
+		if (nelem != 0) {
+			__db_freelist_pos(h->pgno, list, nelem, &position);
+
+			DB_ASSERT(dbp->env, h->pgno != list[position]);
+
+			/* Get the previous page if this is not the smallest. */
+			if (position != 0 || h->pgno > list[0])
+				prev_pgno = list[position];
+		}
+
+	} else if (nelem != 0) {
+		/* Find the truncation point. */
+		for (lp = &list[nelem - 1]; lp >= list; lp--)
+			if (--last_pgno != *lp)
+				break;
+		if (lp < list || last_pgno < h->pgno - 1)
+			do_truncate = 1;
+		last_pgno = meta->last_pgno;
+	}
+
+no_sort:
+	if (prev_pgno == PGNO_INVALID) {
+		/* Updating the meta page itself; dirty it now. */
+#ifdef HAVE_HASH
+		if (hash) {
+			if ((ret =
+			    __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+				goto err1;
+		} else
+#endif
+		if ((ret = __memp_dirty(mpf,
+		    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			goto err1;
+		lsnp = &LSN(meta);
+	} else {
+		/* Linking after a mid-list page; fetch and log that page. */
+		pgno = prev_pgno;
+		if ((ret = __memp_fget(mpf, &pgno,
+		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
+			goto err1;
+		next_pgno = NEXT_PGNO(prev);
+		lsnp = &LSN(prev);
+	}
+#endif
+
+	/*
+	 * Log the change.
+	 * We are either logging an update to the metapage or to the
+	 * previous page in the sorted list.
+	 */
+	if (DBC_LOGGING(dbc)) {
+		memset(&ldbt, 0, sizeof(ldbt));
+		ldbt.data = h;
+		ldbt.size = P_OVERHEAD(dbp);
+		/*
+		 * If we are truncating the file, we need to make sure
+		 * the logging happens before the truncation.  If we
+		 * are truncating multiple pages we don't need to flush the
+		 * log here as it will be flushed by __db_truncate_freelist.
+		 * If we are zeroing pages rather than truncating we still
+		 * need to flush since they will not have valid LSNs.
+		 */
+		lflag = 0;
+
+		if (h->pgno == last_pgno
+#ifdef HAVE_FTRUNCATE
+		    && do_truncate == 0
+#endif
+		    )
+			lflag = DB_FLUSH;
+		switch (h->type) {
+		case P_HASH:
+		case P_IBTREE:
+		case P_IRECNO:
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			if (h->entries > 0) {
+				ldbt.size += h->entries * sizeof(db_indx_t);
+				ddbt.data = (u_int8_t *)h + HOFFSET(h);
+				ddbt.size = dbp->pgsize - HOFFSET(h);
+				if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
+				     lsnp, lflag,
+				     h->pgno, lsnp, pgno,
+				     &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
+					goto err1;
+				goto logged;
+			}
+			break;
+		case P_HASHMETA:
+			ldbt.size = sizeof(HMETA);
+			break;
+		case P_BTREEMETA:
+			ldbt.size = sizeof(BTMETA);
+			break;
+		case P_OVERFLOW:
+			ldbt.size += OV_LEN(h);
+			break;
+		default:
+			DB_ASSERT(dbp->env, h->type != P_QAMDATA);
+		}
+
+		if ((ret = __db_pg_free_log(dbp,
+		      dbc->txn, lsnp, lflag, h->pgno,
+		      lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
+			goto err1;
+	} else
+		LSN_NOT_LOGGED(*lsnp);
+
+logged:
+#ifdef HAVE_FTRUNCATE
+	if (do_truncate) {
+		/* Give the whole run of trailing free pages back to the OS. */
+		start = (u_int32_t) (lp - list) + 1;
+		meta->last_pgno--;
+		ret = __db_truncate_freelist(
+		    dbc, meta, h, list, start, nelem);
+		h = NULL;
+	} else
+#endif
+	if (h->pgno == last_pgno) {
+		/*
+		 * We are going to throw this page away, but if we are
+		 * using MVCC then this version may stick around and we
+		 * might have to make a copy.
+		 */
+		if (mpf->mfp->multiversion && (ret = __memp_dirty(mpf,
+		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			goto err1;
+		LSN(h) = *lsnp;
+		P_INIT(h, dbp->pgsize,
+		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+		if ((ret = __memp_fput(mpf,
+		    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+			goto err1;
+		h = NULL;
+		/* Give the page back to the OS. */
+		if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+		    last_pgno, 0)) != 0)
+			goto err1;
+		DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
+		meta->last_pgno--;
+		h = NULL;
+	} else {
+#ifdef HAVE_FTRUNCATE
+		if (list != NULL) {
+			/* Put the page number into the list. */
+			if ((ret =
+			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+				goto err1;
+			if (prev_pgno != PGNO_INVALID)
+				lp = &list[position + 1];
+			else
+				lp = list;
+			if (nelem != 0 && position != nelem)
+				memmove(lp + 1, lp, (size_t)
+				    ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
+			*lp = h->pgno;
+		}
+#endif
+		/*
+		 * If we are not truncating the page then we
+		 * reinitialize it and put it at the head of
+		 * the free list.
+		 */
+		if ((ret = __memp_dirty(mpf,
+		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			goto err1;
+		LSN(h) = *lsnp;
+		P_INIT(h, dbp->pgsize,
+		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+#ifdef DIAGNOSTIC
+		memset((u_int8_t *) h + P_OVERHEAD(dbp),
+		    CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
+#endif
+		if (prev_pgno == PGNO_INVALID)
+			meta->free = h->pgno;
+		else
+			NEXT_PGNO(prev) = h->pgno;
+	}
+
+	/* Discard the metadata or previous page. */
+err1:	if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+		ret = t_ret;
+	if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the caller's page reference. */
+err:	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * XXX
+	 * We have to unlock the caller's page in the caller!
+	 */
+	return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_freelist_pos -- find the position of a page in the freelist.
+ * The list is sorted, we do a binary search.
+ *
+ * On return *posp is the index of pgno if it is present in list;
+ * otherwise it is the index of the largest entry smaller than pgno
+ * (0 if pgno sorts before every entry).
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
+ * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *));
+ * PUBLIC: #endif
+ */
+void
+__db_freelist_pos(pgno, list, nelem, posp)
+ db_pgno_t pgno;
+ db_pgno_t *list;
+ u_int32_t nelem;
+ u_int32_t *posp;
+{
+ u_int32_t base, indx, lim;
+
+ indx = 0;
+ /* Classic binary search; an exact hit returns immediately. */
+ for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
+ indx = base + (lim >> 1);
+ if (pgno == list[indx]) {
+ *posp = indx;
+ return;
+ }
+ if (pgno > list[indx]) {
+ base = indx + 1;
+ --lim;
+ }
+ }
+ /* No exact match: step back so we report the predecessor's slot. */
+ if (base != 0)
+ base--;
+ *posp = base;
+ return;
+}
+
+/*
+ * __db_pglistcmp --
+ * qsort(3) comparator: order db_pglist_t entries by ascending
+ * page number.
+ */
+static int
+__db_pglistcmp(a, b)
+ const void *a, *b;
+{
+ db_pglist_t *ap, *bp;
+
+ ap = (db_pglist_t *)a;
+ bp = (db_pglist_t *)b;
+
+ return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
+}
+
+/*
+ * __db_freelist_sort -- sort a list of free pages.
+ * Orders the array by ascending page number (see __db_pglistcmp),
+ * which is the precondition for __db_freelist_pos and
+ * __db_pg_truncate.
+ * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+ */
+void
+__db_freelist_sort(list, nelems)
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
+}
+
+/*
+ * __db_pg_truncate -- find the truncation point in a sorted freelist.
+ *
+ * Given a freelist sorted by page number, determine the run of free
+ * pages that is contiguous with the end of the file, log the (possibly
+ * segmented) list, relink the surviving entries, and truncate the file.
+ * On return *nelemp is the number of entries that remain on the free
+ * list and *last_pgno is the new last page of the file.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
+ * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *,
+ * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_pg_truncate(dbc, txn,
+ list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
+ DBC *dbc;
+ DB_TXN *txn;
+ db_pglist_t *list;
+ DB_COMPACT *c_data;
+ u_int32_t *nelemp;
+ db_pgno_t free_pgno, *last_pgno;
+ DB_LSN *lsnp;
+ int in_recovery;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pglist_t *lp, *slp;
+ db_pgno_t lpgno, pgno;
+ u_int32_t elems, log_size, tpoint;
+ int last, ret;
+
+ ret = 0;
+ h = NULL;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ elems = tpoint = *nelemp;
+
+ /*
+ * Figure out what (if any) pages can be truncated immediately and
+ * record the place from which we can truncate, so we can do the
+ * memp_ftruncate below. We also use this to avoid ever putting
+ * these pages on the freelist, which we are about to relink.
+ */
+ /*
+ * Scan backward from the end of the sorted list: tpoint ends up as
+ * the count of entries that are NOT contiguous with the file end
+ * (i.e. the entries that must stay on the free list).
+ */
+ pgno = *last_pgno;
+ lp = &list[elems - 1];
+ last = 1;
+ while (tpoint != 0) {
+ if (lp->pgno != pgno)
+ break;
+ pgno--;
+ tpoint--;
+ lp--;
+ }
+
+ lp = list;
+ slp = &list[elems];
+ /*
+ * Log the sorted list. We log the whole list so it can be rebuilt.
+ * Don't overflow the log file. A segment is capped at half the log
+ * file size; "last" is clear while more segments remain.
+ */
+again: if (DBC_LOGGING(dbc)) {
+ last = 1;
+ lpgno = *last_pgno;
+ ddbt.size = elems * sizeof(*lp);
+ ddbt.data = lp;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ if (ddbt.size > log_size / 2) {
+ elems = (log_size / 2) / sizeof(*lp);
+ ddbt.size = elems * sizeof(*lp);
+ last = 0;
+ /*
+ * If we stopped after the truncation point
+ * then we need to truncate from here.
+ */
+ if (lp + elems >= &list[tpoint])
+ lpgno = lp[elems - 1].pgno;
+ }
+ /*
+ * If this is not the begining of the list fetch the end
+ * of the previous segment. This page becomes the last_free
+ * page and will link to this segment if it is not truncated.
+ */
+ if (lp != list) {
+ if ((ret = __memp_fget(mpf, &lp[-1].pgno,
+ dbc->thread_info, txn, 0, &h)) != 0)
+ goto err;
+ }
+
+ slp = &lp[elems];
+
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
+ lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
+ h != NULL ? &LSN(h) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (h != NULL) {
+ LSN(h) = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ }
+ h = NULL;
+ } else if (!in_recovery)
+ LSN_NOT_LOGGED(*lsnp);
+
+ /*
+ * Relink the surviving entries of this segment: each page's
+ * next pointer is set to the next list entry (PGNO_INVALID for
+ * the final survivor) and its LSN is updated.
+ */
+ for (; lp < slp && lp < &list[tpoint]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
+ txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
+ /* Page may have been truncated later. */
+ if (in_recovery && ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if (in_recovery) {
+ /* Only redo pages still at the logged LSN. */
+ if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
+ if ((ret = __memp_dirty(mpf, &h,
+ dbc->thread_info,
+ txn, dbp->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority);
+ goto err;
+ }
+ } else
+ goto skip;
+ }
+
+ if (lp == &list[tpoint - 1])
+ NEXT_PGNO(h) = PGNO_INVALID;
+ else
+ NEXT_PGNO(h) = lp[1].pgno;
+ DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);
+
+ LSN(h) = *lsnp;
+skip: if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /*
+ * If we did not log everything try again. We start from slp and
+ * try to go to the end of the list.
+ */
+ if (last == 0) {
+ elems = (u_int32_t)(&list[*nelemp] - slp);
+ lp = slp;
+ goto again;
+ }
+
+ /*
+ * Truncate the file. Its possible that the last page is the
+ * only one that got truncated and that's done in the caller.
+ */
+ if (pgno != *last_pgno) {
+ if (tpoint != *nelemp &&
+ (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
+ goto err;
+ if (c_data)
+ c_data->compact_pages_truncated += *last_pgno - pgno;
+ *last_pgno = pgno;
+ }
+ *nelemp = tpoint;
+
+ if (0) {
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ }
+ return (ret);
+}
+
+/*
+ * __db_free_truncate --
+ * Build a sorted free list and truncate free pages at the end
+ * of the file.
+ *
+ * Walks the on-disk free list from the metadata page, sorts it, calls
+ * __db_pg_truncate to cut the tail, and updates meta->free/last_pgno.
+ * If listp is non-NULL and entries survive, ownership of the sorted
+ * list array is transferred to the caller via *listp/*nelemp (the
+ * caller must free it with __os_free).
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
+ * PUBLIC: db_pgno_t *));
+ * PUBLIC: #endif
+ */
+int
+__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ DB_COMPACT *c_data;
+ db_pglist_t **listp;
+ u_int32_t *nelemp;
+ db_pgno_t *last_pgnop;
+{
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pglist_t *list, *lp;
+ db_pgno_t pgno;
+ u_int32_t nelems;
+ int ret, t_ret;
+ size_t size;
+
+ COMPQUIET(flags, 0);
+ list = NULL;
+ meta = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ nelems = 0;
+ if (listp != NULL) {
+ *listp = NULL;
+ DB_ASSERT(env, nelemp != NULL);
+ *nelemp = 0;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ /* Write-lock and pin the base metadata page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
+ &meta)) != 0)
+ goto err;
+
+ if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+ if ((pgno = meta->free) == PGNO_INVALID)
+ goto done;
+
+ /*
+ * Walk the unsorted free-page chain, recording each page's number,
+ * next pointer and on-page LSN; the array grows by doubling.
+ */
+ size = 128;
+ if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = list;
+
+ do {
+ if (lp == &list[size]) {
+ size *= 2;
+ if ((ret = __os_realloc(env,
+ size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = &list[size / 2];
+ }
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ lp->pgno = pgno;
+ lp->next_pgno = NEXT_PGNO(h);
+ lp->lsn = LSN(h);
+ pgno = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ lp++;
+ } while (pgno != PGNO_INVALID);
+ nelems = (u_int32_t)(lp - list);
+
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Sort the list */
+ __db_freelist_sort(list, nelems);
+
+ if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
+ &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
+ goto err;
+
+ if (nelems == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = list[0].pgno;
+
+done: if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+
+ /*
+ * The truncate point is the number of pages in the free
+ * list back from the last page. The number of pages
+ * in the free list are the number that we can swap in.
+ */
+ if (c_data)
+ c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
+
+ /* Hand the sorted list to the caller, who becomes its owner. */
+ if (nelems != 0 && listp != NULL) {
+ *listp = list;
+ *nelemp = nelems;
+ list = NULL;
+ }
+
+err: if (list != NULL)
+ __os_free(env, list);
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_truncate_freelist --
+ * Truncate the tail of a sorted freelist, starting at list[start].
+ * The caller passes the metadata page (meta), the page h that is being
+ * freed (always consumed: it is discarded at low priority on success),
+ * and the freelist page-number array. Entries [start, nelem) are
+ * logged (in log-size-bounded segments), the file is truncated at
+ * list[start], and the in-memory freelist is shrunk to start entries.
+ */
+static int
+__db_truncate_freelist(dbc, meta, h, list, start, nelem)
+ DBC *dbc;
+ DBMETA *meta;
+ PAGE *h;
+ db_pgno_t *list;
+ u_int32_t start, nelem;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *last_free, *pg;
+ db_pgno_t *lp, free_pgno, lpgno;
+ db_pglist_t *plist, *pp, *spp;
+ u_int32_t elem, log_size;
+ int last, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ plist = NULL;
+ last_free = NULL;
+ pg = NULL;
+
+ /*
+ * last_free is the page that will terminate the surviving list
+ * (the entry just before the truncation point), if any.
+ */
+ if (start != 0 &&
+ (ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ /* Snapshot pgno/lsn/next for every page being truncated. */
+ if ((ret = __os_malloc(dbp->env,
+ (nelem - start) * sizeof(*pp), &plist)) != 0)
+ goto err;
+
+ pp = plist;
+ for (lp = &list[start]; lp < &list[nelem]; lp++) {
+ pp->pgno = *lp;
+ if ((ret = __memp_fget(mpf, lp,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ pp->lsn = LSN(pg);
+ pp->next_pgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ pp++;
+ }
+ ZERO_LSN(null_lsn);
+ pp = plist;
+ elem = nelem - start;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ /* Log in segments of at most half the log file size. */
+again: ddbt.data = spp = pp;
+ free_pgno = pp->pgno;
+ lpgno = meta->last_pgno;
+ ddbt.size = elem * sizeof(*pp);
+ if (ddbt.size > log_size / 2) {
+ elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
+ ddbt.size = elem * sizeof(*pp);
+ pp += elem;
+ elem = (nelem - start) - (u_int32_t)(pp - plist);
+ lpgno = pp[-1].pgno;
+ last = 0;
+ } else
+ last = 1;
+ /*
+ * Get the page which will link to this section if we abort.
+ * If this is the first segment then its last_free.
+ */
+ if (spp == plist)
+ pg = last_free;
+ else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ &LSN(meta), last == 1 ? DB_FLUSH : 0,
+ PGNO(meta), &LSN(meta),
+ pg != NULL ? PGNO(pg) : PGNO_INVALID,
+ pg != NULL ? &LSN(pg) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (pg != NULL) {
+ LSN(pg) = LSN(meta);
+ if (pg != last_free && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ }
+ if (last == 0)
+ goto again;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ /* Discard h cheaply, then give the tail pages back to the OS. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ h = NULL;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ list[start], 0)) != 0)
+ goto err;
+ meta->last_pgno = list[start] - 1;
+
+ /* Terminate (or empty) the surviving free list. */
+ if (start == 0)
+ meta->free = PGNO_INVALID;
+ else {
+ NEXT_PGNO(last_free) = PGNO_INVALID;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority)) != 0)
+ goto err;
+ last_free = NULL;
+ }
+
+ /* Shrink the number of elements in the list. */
+ ret = __memp_extend_freelist(mpf, start, &list);
+
+err: if (plist != NULL)
+ __os_free(dbp->env, plist);
+
+ /* We need to put the page on error. */
+ if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (pg != NULL && pg != last_free)
+ (void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
+ if (last_free != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority);
+
+ return (ret);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * __db_lprint --
+ * Print out the list of locks currently held by a cursor.
+ * Debug-only helper: issues a DB_LOCK_DUMP request for the cursor's
+ * locker; a no-op when locking is not configured. Always returns 0.
+ *
+ * PUBLIC: int __db_lprint __P((DBC *));
+ */
+int
+__db_lprint(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_LOCKREQ req;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (LOCKING_ON(env)) {
+ req.op = DB_LOCK_DUMP;
+ (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __db_lget --
+ * The standard lock get call.
+ *
+ * action is one of the LCK_* codes (e.g. LCK_ALWAYS, LCK_COUPLE,
+ * LCK_COUPLE_ALWAYS, LCK_ROLLBACK); *lockp is both the lock to couple
+ * from (when set) and the out-parameter for the newly acquired lock.
+ * DB_LOCK_NOTGRANTED is mapped to DB_LOCK_DEADLOCK unless the
+ * environment is configured with DB_ENV_TIME_NOTGRANTED.
+ *
+ * PUBLIC: int __db_lget __P((DBC *,
+ * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+ */
+int
+__db_lget(dbc, action, pgno, mode, lkflags, lockp)
+ DBC *dbc;
+ int action;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t lkflags;
+ DB_LOCK *lockp;
+{
+ DB *dbp;
+ DB_LOCKREQ couple[3], *reqp;
+ DB_TXN *txn;
+ ENV *env;
+ int has_timeout, i, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+ /*
+ * We do not always check if we're configured for locking before
+ * calling __db_lget to acquire the lock.
+ */
+ /*
+ * Skip locking entirely for: CDB environments, no lock subsystem,
+ * MVCC snapshot reads, DBC_DONTLOCK cursors, recovery (except
+ * rollback on a master), and off-page-duplicate cursors unless
+ * LCK_ALWAYS is requested.
+ */
+ if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
+ (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
+ dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
+ F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
+ (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
+ (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+
+ dbc->lock.pgno = pgno;
+ if (lkflags & DB_LOCK_RECORD)
+ dbc->lock.type = DB_RECORD_LOCK;
+ else
+ dbc->lock.type = DB_PAGE_LOCK;
+ lkflags &= ~DB_LOCK_RECORD;
+
+ /*
+ * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
+ * pass that along to the lock call.
+ */
+ if (DB_NONBLOCK(dbc))
+ lkflags |= DB_LOCK_NOWAIT;
+
+ if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
+ mode = DB_LOCK_READ_UNCOMMITTED;
+
+ has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
+ (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
+
+ /*
+ * Transactional locking.
+ * Hold on to the previous read lock only if we are in full isolation.
+ * COUPLE_ALWAYS indicates we are holding an interior node which need
+ * not be isolated.
+ * Downgrade write locks if we are supporting dirty readers.
+ */
+ if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
+ !LOCK_ISSET(*lockp))
+ action = 0;
+ else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc->dbp,
+ DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else
+ action = 0;
+
+ i = 0;
+ switch (action) {
+ default:
+ /* Plain get; timeouts force the vectored path below. */
+ if (has_timeout)
+ goto do_couple;
+ ret = __lock_get(env,
+ dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
+ break;
+
+ case LCK_DOWNGRADE:
+ /* First request: re-get the old lock as WWRITE. */
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].lock = *lockp;
+ couple[0].mode = DB_LOCK_WWRITE;
+ UMRW_SET(couple[0].timeout);
+ i++;
+ /* FALLTHROUGH */
+ case LCK_COUPLE:
+do_couple: couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
+ couple[i].obj = &dbc->lock_dbt;
+ couple[i].mode = mode;
+ UMRW_SET(couple[i].timeout);
+ i++;
+ /*
+ * NOTE(review): the timeout is always stored in couple[0];
+ * for the LCK_DOWNGRADE fall-through the timed GET sits at
+ * index 1 -- presumably intentional (downgrades don't time
+ * out), but worth confirming against upstream.
+ */
+ if (has_timeout)
+ couple[0].timeout =
+ F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
+ if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
+ couple[i].op = DB_LOCK_PUT;
+ couple[i].lock = *lockp;
+ i++;
+ }
+
+ ret = __lock_vec(env,
+ dbc->locker, lkflags, couple, i, &reqp);
+ /* On success (or failure past the GET), return the new lock. */
+ if (ret == 0 || reqp == &couple[i - 1])
+ *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
+ break;
+ }
+
+ if (txn != NULL && ret == DB_LOCK_DEADLOCK)
+ F_SET(txn, TXN_DEADLOCK);
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
+
+/*
+ * __db_lput --
+ * The standard lock put call.
+ *
+ * Either releases *lockp (LCK_COUPLE), downgrades a write lock to
+ * WWRITE when the handle supports dirty readers (LCK_DOWNGRADE), or
+ * keeps the lock for full isolation (no-op).
+ *
+ * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
+ */
+int
+__db_lput(dbc, lockp)
+ DBC *dbc;
+ DB_LOCK *lockp;
+{
+ DB_LOCKREQ couple[2], *reqp;
+ ENV *env;
+ int action, ret;
+
+ /*
+ * Transactional locking.
+ * Hold on to the read locks only if we are in full isolation.
+ * Downgrade write locks if we are supporting dirty readers.
+ */
+ if (F_ISSET(dbc->dbp,
+ DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else if (dbc->txn == NULL)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else
+ action = 0;
+
+ env = dbc->env;
+ switch (action) {
+ case LCK_COUPLE:
+ ret = __lock_put(env, lockp);
+ break;
+ case LCK_DOWNGRADE:
+ /* Atomically get a WWRITE lock and drop the WRITE lock. */
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].mode = DB_LOCK_WWRITE;
+ couple[0].lock = *lockp;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT;
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
+ if (ret == 0 || reqp == &couple[1])
+ *lockp = couple[0].lock;
+ break;
+ default:
+ /* Full isolation: hold the lock until commit/abort. */
+ ret = 0;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/db/db_method.c b/db/db_method.c
new file mode 100644
index 0000000..1182f97
--- /dev/null
+++ b/db/db_method.c
@@ -0,0 +1,1052 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_RPC
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#include <rpc/rpc.h>
+#endif
+#include "db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
+#endif
+
+static int __db_get_byteswapped __P((DB *, int *));
+static int __db_get_dbname __P((DB *, const char **, const char **));
+static DB_ENV *__db_get_env __P((DB *));
+static void __db_get_msgcall
+ __P((DB *, void (**)(const DB_ENV *, const char *)));
+static DB_MPOOLFILE *__db_get_mpf __P((DB *));
+static int __db_get_multiple __P((DB *));
+static int __db_get_transactional __P((DB *));
+static int __db_get_type __P((DB *, DBTYPE *dbtype));
+static int __db_init __P((DB *, u_int32_t));
+static int __db_get_alloc __P((DB *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+static int __db_set_alloc __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+static int __db_get_append_recno __P((DB *,
+ int (**)(DB *, DBT *, db_recno_t)));
+static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *));
+static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int));
+static int __db_get_create_dir __P((DB *, const char **));
+static int __db_set_create_dir __P((DB *, const char *));
+static int __db_get_dup_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __db_set_dup_compare
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+static int __db_get_encrypt_flags __P((DB *, u_int32_t *));
+static int __db_set_encrypt __P((DB *, const char *, u_int32_t));
+static int __db_get_feedback __P((DB *, void (**)(DB *, int, int)));
+static int __db_set_feedback __P((DB *, void (*)(DB *, int, int)));
+static void __db_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+static int __db_get_pagesize __P((DB *, u_int32_t *));
+static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int)));
+static int __db_set_priority __P((DB *, DB_CACHE_PRIORITY));
+static int __db_get_priority __P((DB *, DB_CACHE_PRIORITY *));
+static void __db_get_errcall __P((DB *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+static void __db_set_errcall
+ __P((DB *, void (*)(const DB_ENV *, const char *, const char *)));
+static void __db_get_errfile __P((DB *, FILE **));
+static void __db_set_errfile __P((DB *, FILE *));
+static void __db_get_errpfx __P((DB *, const char **));
+static void __db_set_errpfx __P((DB *, const char *));
+static void __db_set_msgcall
+ __P((DB *, void (*)(const DB_ENV *, const char *)));
+static void __db_get_msgfile __P((DB *, FILE **));
+static void __db_set_msgfile __P((DB *, FILE *));
+static void __dbh_err __P((DB *, int, const char *, ...));
+static void __dbh_errx __P((DB *, const char *, ...));
+
+/*
+ * db_create --
+ * DB constructor.
+ * Public API entry point: validates flags (none are currently
+ * accepted), brackets the call with ENV_ENTER/ENV_LEAVE when an
+ * environment was supplied, and delegates to __db_create_internal.
+ *
+ * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t));
+ */
+int
+db_create(dbpp, dbenv, flags)
+ DB **dbpp;
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ ip = NULL;
+ env = dbenv == NULL ? NULL : dbenv->env;
+
+ /* Check for invalid function flags. */
+ if (flags != 0)
+ return (__db_ferr(env, "db_create", 0));
+
+ if (env != NULL)
+ ENV_ENTER(env, ip);
+ ret = __db_create_internal(dbpp, env, flags);
+ if (env != NULL)
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __db_create_internal --
+ * DB constructor internal routine.
+ * Allocates and initializes a DB handle; creates a private local
+ * environment (flagged ENV_DBLOCAL) when env is NULL. On success the
+ * new handle is returned via *dbpp with type DB_UNKNOWN.
+ *
+ * PUBLIC: int __db_create_internal __P((DB **, ENV *, u_int32_t));
+ */
+int
+__db_create_internal(dbpp, env, flags)
+ DB **dbpp;
+ ENV *env;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_ENV *dbenv;
+ DB_REP *db_rep;
+ int ret;
+
+ *dbpp = NULL;
+
+ /* If we don't have an environment yet, allocate a local one. */
+ if (env == NULL) {
+ if ((ret = db_env_create(&dbenv, 0)) != 0)
+ return (ret);
+ env = dbenv->env;
+ F_SET(env, ENV_DBLOCAL);
+ } else
+ dbenv = env->dbenv;
+
+ /* Allocate and initialize the DB handle. */
+ if ((ret = __os_calloc(env, 1, sizeof(*dbp), &dbp)) != 0)
+ goto err;
+
+ dbp->dbenv = env->dbenv;
+ dbp->env = env;
+ if ((ret = __db_init(dbp, flags)) != 0)
+ goto err;
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ ++env->db_ref;
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Set the replication timestamp; it's 0 if we're not in a replicated
+ * environment. Don't acquire a lock to read the value, even though
+ * it's opaque: all we check later is value equality, nothing else.
+ */
+ dbp->timestamp = REP_ON(env) ?
+ ((REGENV *)env->reginfo->primary)->rep_timestamp : 0;
+ /*
+ * Set the replication generation number for fid management; valid
+ * replication generations start at 1. Don't acquire a lock to
+ * read the value. All we check later is value equality.
+ */
+ db_rep = env->rep_handle;
+ dbp->fid_gen = REP_ON(env) ? ((REP *)db_rep->region)->gen : 0;
+
+ /* If not RPC, open a backing DB_MPOOLFILE handle in the memory pool. */
+ if (!RPC_ON(dbenv) && (ret = __memp_fcreate(env, &dbp->mpf)) != 0)
+ goto err;
+
+ dbp->type = DB_UNKNOWN;
+
+ *dbpp = dbp;
+ return (0);
+
+err: if (dbp != NULL) {
+ if (dbp->mpf != NULL)
+ (void)__memp_fclose(dbp->mpf, 0);
+ __os_free(env, dbp);
+ }
+
+ /*
+ * NOTE(review): if __os_calloc failed, dbp is uninitialized here,
+ * and on later failures dbp was just freed above, so reading
+ * dbp->dbenv below touches indeterminate/freed memory. Using
+ * env->dbenv instead would be safe -- confirm against upstream.
+ */
+ if (F_ISSET(env, ENV_DBLOCAL))
+ (void)__env_close(dbp->dbenv, 0);
+
+ return (ret);
+}
+
+/*
+ * __db_init --
+ * Initialize a DB structure.
+ * Sets up the handle's lock, queues, the access-method permission
+ * mask (all methods allowed until restricted by use), the public
+ * method table, and the per-access-method private data. With RPC
+ * enabled, the RPC client wrappers replace the local methods last.
+ */
+static int
+__db_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ int ret;
+
+ dbp->locker = NULL;
+ LOCK_INIT(dbp->handle_lock);
+
+ TAILQ_INIT(&dbp->free_queue);
+ TAILQ_INIT(&dbp->active_queue);
+ TAILQ_INIT(&dbp->join_queue);
+ LIST_INIT(&dbp->s_secondaries);
+
+ /* Start out permitting every access method; see __dbh_am_chk. */
+ FLD_SET(dbp->am_ok,
+ DB_OK_BTREE | DB_OK_HASH | DB_OK_QUEUE | DB_OK_RECNO);
+
+ /* DB PUBLIC HANDLE LIST BEGIN */
+ dbp->associate = __db_associate_pp;
+ dbp->associate_foreign = __db_associate_foreign_pp;
+ dbp->close = __db_close_pp;
+ dbp->compact = __db_compact_pp;
+ dbp->cursor = __db_cursor_pp;
+ dbp->del = __db_del_pp;
+ dbp->dump = __db_dump_pp;
+ dbp->err = __dbh_err;
+ dbp->errx = __dbh_errx;
+ dbp->exists = __db_exists;
+ dbp->fd = __db_fd_pp;
+ dbp->get = __db_get_pp;
+ dbp->get_alloc = __db_get_alloc;
+ dbp->get_append_recno = __db_get_append_recno;
+ dbp->get_byteswapped = __db_get_byteswapped;
+ dbp->get_cachesize = __db_get_cachesize;
+ dbp->get_create_dir = __db_get_create_dir;
+ dbp->get_dbname = __db_get_dbname;
+ dbp->get_dup_compare = __db_get_dup_compare;
+ dbp->get_encrypt_flags = __db_get_encrypt_flags;
+ dbp->get_env = __db_get_env;
+ dbp->get_errcall = __db_get_errcall;
+ dbp->get_errfile = __db_get_errfile;
+ dbp->get_errpfx = __db_get_errpfx;
+ dbp->get_feedback = __db_get_feedback;
+ dbp->get_flags = __db_get_flags;
+ dbp->get_lorder = __db_get_lorder;
+ dbp->get_mpf = __db_get_mpf;
+ dbp->get_msgcall = __db_get_msgcall;
+ dbp->get_msgfile = __db_get_msgfile;
+ dbp->get_multiple = __db_get_multiple;
+ dbp->get_open_flags = __db_get_open_flags;
+ dbp->get_partition_dirs = __partition_get_dirs;
+ dbp->get_partition_callback = __partition_get_callback;
+ dbp->get_partition_keys = __partition_get_keys;
+ dbp->get_pagesize = __db_get_pagesize;
+ dbp->get_priority = __db_get_priority;
+ dbp->get_transactional = __db_get_transactional;
+ dbp->get_type = __db_get_type;
+ dbp->join = __db_join_pp;
+ dbp->key_range = __db_key_range_pp;
+ dbp->open = __db_open_pp;
+ dbp->pget = __db_pget_pp;
+ dbp->put = __db_put_pp;
+ dbp->remove = __db_remove_pp;
+ dbp->rename = __db_rename_pp;
+ dbp->set_alloc = __db_set_alloc;
+ dbp->set_append_recno = __db_set_append_recno;
+ dbp->set_cachesize = __db_set_cachesize;
+ dbp->set_create_dir = __db_set_create_dir;
+ dbp->set_dup_compare = __db_set_dup_compare;
+ dbp->set_encrypt = __db_set_encrypt;
+ dbp->set_errcall = __db_set_errcall;
+ dbp->set_errfile = __db_set_errfile;
+ dbp->set_errpfx = __db_set_errpfx;
+ dbp->set_feedback = __db_set_feedback;
+ dbp->set_flags = __db_set_flags;
+ dbp->set_lorder = __db_set_lorder;
+ dbp->set_msgcall = __db_set_msgcall;
+ dbp->set_msgfile = __db_set_msgfile;
+ dbp->set_pagesize = __db_set_pagesize;
+ dbp->set_paniccall = __db_set_paniccall;
+ dbp->set_partition = __partition_set;
+ dbp->set_partition_dirs = __partition_set_dirs;
+ dbp->set_priority = __db_set_priority;
+ dbp->sort_multiple = __db_sort_multiple;
+ dbp->stat = __db_stat_pp;
+ dbp->stat_print = __db_stat_print_pp;
+ dbp->sync = __db_sync_pp;
+ dbp->truncate = __db_truncate_pp;
+ dbp->upgrade = __db_upgrade_pp;
+ dbp->verify = __db_verify_pp;
+ /* DB PUBLIC HANDLE LIST END */
+
+ /* Access method specific. */
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+#ifdef HAVE_RPC
+ /*
+ * RPC specific: must be last, as we replace methods set by the
+ * access methods.
+ */
+ if (RPC_ON(dbp->dbenv)) {
+ __dbcl_dbp_init(dbp);
+ /*
+ * !!!
+ * We wrap the DB->open method for RPC, and the rpc.src file
+ * can't handle that.
+ */
+ dbp->open = __dbcl_db_open_wrap;
+ if ((ret = __dbcl_db_create(dbp, dbp->dbenv, flags)) != 0)
+ return (ret);
+ }
+#else
+ COMPQUIET(flags, 0);
+#endif
+
+ return (0);
+}
+
+/*
+ * __dbh_am_chk --
+ * Error if an unreasonable method is called.
+ * flags is a DB_OK_* mask of the access methods the caller's method
+ * is compatible with; the handle's am_ok mask is narrowed to that set
+ * on success, so later incompatible calls fail with EINVAL.
+ *
+ * PUBLIC: int __dbh_am_chk __P((DB *, u_int32_t));
+ */
+int
+__dbh_am_chk(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ /*
+ * We start out allowing any access methods to be called, and as the
+ * application calls the methods the options become restricted. The
+ * idea is to quit as soon as an illegal method combination is called.
+ */
+ if ((LF_ISSET(DB_OK_BTREE) && FLD_ISSET(dbp->am_ok, DB_OK_BTREE)) ||
+ (LF_ISSET(DB_OK_HASH) && FLD_ISSET(dbp->am_ok, DB_OK_HASH)) ||
+ (LF_ISSET(DB_OK_QUEUE) && FLD_ISSET(dbp->am_ok, DB_OK_QUEUE)) ||
+ (LF_ISSET(DB_OK_RECNO) && FLD_ISSET(dbp->am_ok, DB_OK_RECNO))) {
+ FLD_CLR(dbp->am_ok, ~flags);
+ return (0);
+ }
+
+ __db_errx(dbp->env,
+ "call implies an access method which is inconsistent with previous calls");
+ return (EINVAL);
+}
+
+/*
+ * __dbh_err --
+ * Db.err method.
+ * Varargs error reporter; forwards to the environment's error
+ * machinery with the system error string appended.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_err(DB *dbp, int error, const char *fmt, ...)
+#else
+__dbh_err(dbp, error, fmt, va_alist)
+ DB *dbp;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message with error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, error, DB_ERROR_SET, 1, fmt);
+}
+
+/*
+ * __dbh_errx --
+ * Db.errx method.
+ * Varargs error reporter; like __dbh_err but without a system
+ * error string.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_errx(DB *dbp, const char *fmt, ...)
+#else
+__dbh_errx(dbp, fmt, va_alist)
+ DB *dbp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message without error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
+}
+
+/*
+ * __db_get_byteswapped --
+ * Return if database requires byte swapping.
+ * Only valid after DB->open; *isswapped is 1 iff DB_AM_SWAP is set.
+ */
+static int
+__db_get_byteswapped(dbp, isswapped)
+ DB *dbp;
+ int *isswapped;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_byteswapped");
+
+ *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __db_get_dbname --
+ * Get the name of the database as passed to DB->open.
+ * Either out-pointer may be NULL if the caller doesn't want it.
+ */
+static int
+__db_get_dbname(dbp, fnamep, dnamep)
+ DB *dbp;
+ const char **fnamep, **dnamep;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_dbname");
+
+ if (fnamep != NULL)
+ *fnamep = dbp->fname;
+ if (dnamep != NULL)
+ *dnamep = dbp->dname;
+ return (0);
+}
+
+/*
+ * __db_get_env --
+ * Get the DB_ENV handle that was passed to db_create.
+ * Trivial accessor; cannot fail.
+ */
+static DB_ENV *
+__db_get_env(dbp)
+ DB *dbp;
+{
+ return (dbp->dbenv);
+}
+
+/*
+ * __db_get_mpf --
+ * Get the underlying DB_MPOOLFILE handle.
+ * Trivial accessor; cannot fail.
+ */
+static DB_MPOOLFILE *
+__db_get_mpf(dbp)
+ DB *dbp;
+{
+ return (dbp->mpf);
+}
+
+/*
+ * get_multiple --
+ * Return whether this DB handle references a physical file with multiple
+ * databases. Only valid after DB->open.
+ */
+static int
+__db_get_multiple(dbp)
+ DB *dbp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_multiple");
+
+ /*
+ * Only return TRUE if the handle is for the master database, not for
+ * any subdatabase in the physical file. If it's a Btree, with the
+ * subdatabases flag set, and the meta-data page has the right value,
+ * return TRUE. (We don't need to check it's a Btree, I suppose, but
+ * it doesn't hurt.)
+ */
+ return (dbp->type == DB_BTREE &&
+ F_ISSET(dbp, DB_AM_SUBDB) &&
+ dbp->meta_pgno == PGNO_BASE_MD ? 1 : 0);
+}
+
+/*
+ * get_transactional --
+ * Return whether this database was created in a transaction.
+ * Returns 1 iff DB_AM_TXN is set on the handle.
+ */
+static int
+__db_get_transactional(dbp)
+ DB *dbp;
+{
+ return (F_ISSET(dbp, DB_AM_TXN) ? 1 : 0);
+}
+
+/*
+ * __db_get_type --
+ * Return type of underlying database.
+ * Only valid after DB->open (before open the type is DB_UNKNOWN).
+ */
+static int
+__db_get_type(dbp, dbtype)
+ DB *dbp;
+ DBTYPE *dbtype;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_type");
+
+ *dbtype = dbp->type;
+ return (0);
+}
+
+/*
+ * __db_get_append_recno --
+ * Get record number append routine.
+ * Queue/Recno only; funcp may be NULL to just validate the call.
+ */
+static int
+__db_get_append_recno(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+ if (funcp)
+ *funcp = dbp->db_append_recno;
+
+ return (0);
+}
+/*
+ * __db_set_append_recno --
+ * Set record number append routine.
+ * Queue/Recno only; must be called before DB->open.
+ */
+static int
+__db_set_append_recno(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_append_recno");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ dbp->db_append_recno = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_cachesize --
+ * Get underlying cache size.
+ * Illegal on handles inside an environment; delegates to the private
+ * (DB-local) environment's memory pool configuration.
+ */
+static int
+__db_get_cachesize(dbp, cache_gbytesp, cache_bytesp, ncachep)
+ DB *dbp;
+ u_int32_t *cache_gbytesp, *cache_bytesp;
+ int *ncachep;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_cachesize");
+
+ return (__memp_get_cachesize(dbp->dbenv,
+ cache_gbytesp, cache_bytesp, ncachep));
+}
+
+/*
+ * __db_set_cachesize --
+ * Set underlying cache size.
+ * Illegal on handles inside an environment and after DB->open.
+ */
+static int
+__db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache)
+ DB *dbp;
+ u_int32_t cache_gbytes, cache_bytes;
+ int ncache;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_cachesize");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_cachesize");
+
+ return (__memp_set_cachesize(
+ dbp->dbenv, cache_gbytes, cache_bytes, ncache));
+}
+
+/*
+ * __db_set_create_dir --
+ * Set the directory the database file is created in / looked up in.
+ * dir must be one of the environment's configured data directories;
+ * the handle stores the environment's copy of the string, so no
+ * allocation is done here.
+ */
+static int
+__db_set_create_dir(dbp, dir)
+ DB *dbp;
+ const char *dir;
+{
+ DB_ENV *dbenv;
+ int i;
+
+ dbenv = dbp->dbenv;
+
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+
+ if (i == dbenv->data_next) {
+ __db_errx(dbp->env,
+ "Directory %s not in environment list.", dir);
+ return (EINVAL);
+ }
+
+ dbp->dirname = dbenv->db_data_dir[i];
+ return (0);
+}
+
+/*
+ * __db_get_create_dir --
+ *	Return the data directory configured via __db_set_create_dir
+ *	(NULL if none was set).
+ */
+static int
+__db_get_create_dir(dbp, dirp)
+	DB *dbp;
+	const char **dirp;
+{
+	*dirp = dbp->dirname;
+	return (0);
+}
+
+/*
+ * __db_get_dup_compare --
+ *	Get duplicate comparison routine.
+ *
+ * Legal only for btree/hash access methods.  For compressed btrees the
+ * user's comparator is stored in the BTREE-private structure (the
+ * public slot holds the compression wrapper), so it is fetched from
+ * there instead.
+ */
+static int
+__db_get_dup_compare(dbp, funcp)
+	DB *dbp;
+	int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+
+	DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+	if (funcp != NULL) {
+#ifdef HAVE_COMPRESSION
+		if (DB_IS_COMPRESSED(dbp)) {
+			*funcp =
+			     ((BTREE *)dbp->bt_internal)->compress_dup_compare;
+		} else
+#endif
+			*funcp = dbp->dup_compare;
+	}
+
+	return (0);
+}
+
+/*
+ * __db_set_dup_compare --
+ *	Set duplicate comparison routine.
+ *
+ * Legal only before DB->open and only for btree/hash.  Setting a
+ * duplicate comparator implies DB_DUPSORT, which is turned on here via
+ * __db_set_flags.  For compressed btrees the public slot gets the
+ * compression-aware wrapper and the user's function is stashed in the
+ * BTREE-private structure (mirroring __db_get_dup_compare).
+ */
+static int
+__db_set_dup_compare(dbp, func)
+	DB *dbp;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+{
+	int ret;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_dup_compare");
+	DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+	if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0)
+		return (ret);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(dbp)) {
+		dbp->dup_compare = __bam_compress_dupcmp;
+		((BTREE *)dbp->bt_internal)->compress_dup_compare = func;
+	} else
+#endif
+		dbp->dup_compare = func;
+
+	return (0);
+}
+
+/*
+ * __db_get_encrypt_flags --
+ *	Get the encryption flags; forwards to the private environment.
+ *	Illegal on a handle inside a shared environment.
+ */
+static int
+__db_get_encrypt_flags(dbp, flagsp)
+	DB *dbp;
+	u_int32_t *flagsp;
+{
+	DB_ILLEGAL_IN_ENV(dbp, "DB->get_encrypt_flags");
+
+	return (__env_get_encrypt_flags(dbp->dbenv, flagsp));
+}
+
+/*
+ * __db_set_encrypt --
+ *	Set database passwd.
+ *
+ * Illegal inside a shared environment and illegal after DB->open.
+ * Configures the password on the handle's private environment,
+ * initializes the cipher if needed, and turns on the DB_ENCRYPT flag.
+ */
+static int
+__db_set_encrypt(dbp, passwd, flags)
+	DB *dbp;
+	const char *passwd;
+	u_int32_t flags;
+{
+	DB_CIPHER *db_cipher;
+	int ret;
+
+	DB_ILLEGAL_IN_ENV(dbp, "DB->set_encrypt");
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_encrypt");
+
+	if ((ret = __env_set_encrypt(dbp->dbenv, passwd, flags)) != 0)
+		return (ret);
+
+	/*
+	 * In a real env, this gets initialized with the region.  In a local
+	 * env, we must do it here.
+	 */
+	db_cipher = dbp->env->crypto_handle;
+	if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+	    (ret = db_cipher->init(dbp->env, db_cipher)) != 0)
+		return (ret);
+
+	/* DB_ENCRYPT also implies checksumming; see __db_map_flags. */
+	return (__db_set_flags(dbp, DB_ENCRYPT));
+}
+
+/*
+ * __db_get_errcall --
+ *	Get the error callback; thin forward to the environment handle.
+ */
+static void
+__db_get_errcall(dbp, errcallp)
+	DB *dbp;
+	void (**errcallp) __P((const DB_ENV *, const char *, const char *));
+{
+	__env_get_errcall(dbp->dbenv, errcallp);
+}
+
+/*
+ * __db_set_errcall --
+ *	Set the error callback; thin forward to the environment handle.
+ */
+static void
+__db_set_errcall(dbp, errcall)
+	DB *dbp;
+	void (*errcall) __P((const DB_ENV *, const char *, const char *));
+{
+	__env_set_errcall(dbp->dbenv, errcall);
+}
+
+/*
+ * __db_get_errfile --
+ *	Get the error-message FILE stream from the environment handle.
+ */
+static void
+__db_get_errfile(dbp, errfilep)
+	DB *dbp;
+	FILE **errfilep;
+{
+	__env_get_errfile(dbp->dbenv, errfilep);
+}
+
+/*
+ * __db_set_errfile --
+ *	Set the error-message FILE stream on the environment handle.
+ */
+static void
+__db_set_errfile(dbp, errfile)
+	DB *dbp;
+	FILE *errfile;
+{
+	__env_set_errfile(dbp->dbenv, errfile);
+}
+
+/*
+ * __db_get_errpfx --
+ *	Get the error-message prefix from the environment handle.
+ */
+static void
+__db_get_errpfx(dbp, errpfxp)
+	DB *dbp;
+	const char **errpfxp;
+{
+	__env_get_errpfx(dbp->dbenv, errpfxp);
+}
+
+/*
+ * __db_set_errpfx --
+ *	Set the error-message prefix on the environment handle.
+ */
+static void
+__db_set_errpfx(dbp, errpfx)
+	DB *dbp;
+	const char *errpfx;
+{
+	__env_set_errpfx(dbp->dbenv, errpfx);
+}
+
+/*
+ * __db_get_feedback --
+ *	Get the progress-feedback callback (NULL if none configured).
+ */
+static int
+__db_get_feedback(dbp, feedbackp)
+	DB *dbp;
+	void (**feedbackp) __P((DB *, int, int));
+{
+	if (feedbackp != NULL)
+		*feedbackp = dbp->db_feedback;
+	return (0);
+}
+
+/*
+ * __db_set_feedback --
+ *	Set the progress-feedback callback on the handle.
+ */
+static int
+__db_set_feedback(dbp, feedback)
+	DB *dbp;
+	void (*feedback) __P((DB *, int, int));
+{
+	dbp->db_feedback = feedback;
+	return (0);
+}
+
+/*
+ * __db_map_flags --
+ *	Maps between public and internal flag values.
+ * This function doesn't check for validity, so it can't fail.
+ *
+ * Recognized public flags are CLEARED from *inflagsp and the matching
+ * internal DB_AM_* bits are SET in *outflagsp; unknown flags are left
+ * in *inflagsp for the caller to reject.  Note DB_ENCRYPT implies
+ * DB_AM_CHKSUM as well.
+ */
+static void
+__db_map_flags(dbp, inflagsp, outflagsp)
+	DB *dbp;
+	u_int32_t *inflagsp, *outflagsp;
+{
+	COMPQUIET(dbp, NULL);
+
+	if (FLD_ISSET(*inflagsp, DB_CHKSUM)) {
+		FLD_SET(*outflagsp, DB_AM_CHKSUM);
+		FLD_CLR(*inflagsp, DB_CHKSUM);
+	}
+	if (FLD_ISSET(*inflagsp, DB_ENCRYPT)) {
+		FLD_SET(*outflagsp, DB_AM_ENCRYPT | DB_AM_CHKSUM);
+		FLD_CLR(*inflagsp, DB_ENCRYPT);
+	}
+	if (FLD_ISSET(*inflagsp, DB_TXN_NOT_DURABLE)) {
+		FLD_SET(*outflagsp, DB_AM_NOT_DURABLE);
+		FLD_CLR(*inflagsp, DB_TXN_NOT_DURABLE);
+	}
+}
+
+/*
+ * __db_get_flags --
+ *	The DB->get_flags method.
+ *
+ * Reconstructs the public flag set from the handle's internal DB_AM_*
+ * bits: each candidate public flag is mapped through every access
+ * method's flag mapper, and is reported only if ALL of its mapped
+ * internal bits are set on the handle.
+ *
+ * PUBLIC: int __db_get_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_flags(dbp, flagsp)
+	DB *dbp;
+	u_int32_t *flagsp;
+{
+	static const u_int32_t db_flags[] = {
+		DB_CHKSUM,
+		DB_DUP,
+		DB_DUPSORT,
+		DB_ENCRYPT,
+#ifdef HAVE_QUEUE
+		DB_INORDER,
+#endif
+		DB_RECNUM,
+		DB_RENUMBER,
+		DB_REVSPLITOFF,
+		DB_SNAPSHOT,
+		DB_TXN_NOT_DURABLE,
+		0
+	};
+	u_int32_t f, flags, mapped_flag;
+	int i;
+
+	flags = 0;
+	for (i = 0; (f = db_flags[i]) != 0; i++) {
+		mapped_flag = 0;
+		__db_map_flags(dbp, &f, &mapped_flag);
+		__bam_map_flags(dbp, &f, &mapped_flag);
+		__ram_map_flags(dbp, &f, &mapped_flag);
+#ifdef HAVE_QUEUE
+		__qam_map_flags(dbp, &f, &mapped_flag);
+#endif
+		/* Every candidate flag must be consumed by some mapper. */
+		DB_ASSERT(dbp->env, f == 0);
+		if (F_ISSET(dbp, mapped_flag) == mapped_flag)
+			LF_SET(db_flags[i]);
+	}
+
+	*flagsp = flags;
+	return (0);
+}
+
+/*
+ * __db_set_flags --
+ *	DB->set_flags.
+ *
+ * Validates environment-dependent flags (encryption, durability), then
+ * lets the generic mapper and each access method's flag handler consume
+ * bits out of `flags'.  Any bits left over are unrecognized and produce
+ * an error via __db_ferr.
+ *
+ * PUBLIC: int __db_set_flags __P((DB *, u_int32_t));
+ */
+int
+__db_set_flags(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	if (LF_ISSET(DB_ENCRYPT) && !CRYPTO_ON(env)) {
+		__db_errx(env,
+		    "Database environment not configured for encryption");
+		return (EINVAL);
+	}
+	if (LF_ISSET(DB_TXN_NOT_DURABLE))
+		ENV_REQUIRES_CONFIG(env,
+		    env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN);
+
+	__db_map_flags(dbp, &flags, &dbp->flags);
+
+	if ((ret = __bam_set_flags(dbp, &flags)) != 0)
+		return (ret);
+	if ((ret = __ram_set_flags(dbp, &flags)) != 0)
+		return (ret);
+#ifdef HAVE_QUEUE
+	if ((ret = __qam_set_flags(dbp, &flags)) != 0)
+		return (ret);
+#endif
+
+	/* Anything not consumed above is an illegal flag. */
+	return (flags == 0 ? 0 : __db_ferr(env, "DB->set_flags", 0));
+}
+
+/*
+ * __db_get_lorder --
+ *	Get whether lorder is swapped or not.
+ *
+ * Probes the host's byte order by asking __db_byteorder about
+ * little-endian (1234): if the host is little-endian, a swapped
+ * database is big-endian (4321), and vice versa.
+ *
+ * PUBLIC: int __db_get_lorder __P((DB *, int *));
+ */
+int
+__db_get_lorder(dbp, db_lorderp)
+	DB *dbp;
+	int *db_lorderp;
+{
+	int ret;
+
+	/* Flag if the specified byte order requires swapping. */
+	switch (ret = __db_byteorder(dbp->env, 1234)) {
+	case 0:
+		*db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 4321 : 1234;
+		break;
+	case DB_SWAPBYTES:
+		*db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 1234 : 4321;
+		break;
+	default:
+		return (ret);
+		/* NOTREACHED */
+	}
+
+	return (0);
+}
+
+/*
+ * __db_set_lorder --
+ *	Set whether lorder is swapped or not.
+ *
+ * Illegal after DB->open.  Stores the request as the single DB_AM_SWAP
+ * bit: set when the requested order differs from the host's, clear when
+ * it matches.  __db_byteorder also rejects invalid lorder values.
+ *
+ * PUBLIC: int __db_set_lorder __P((DB *, int));
+ */
+int
+__db_set_lorder(dbp, db_lorder)
+	DB *dbp;
+	int db_lorder;
+{
+	int ret;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lorder");
+
+	/* Flag if the specified byte order requires swapping. */
+	switch (ret = __db_byteorder(dbp->env, db_lorder)) {
+	case 0:
+		F_CLR(dbp, DB_AM_SWAP);
+		break;
+	case DB_SWAPBYTES:
+		F_SET(dbp, DB_AM_SWAP);
+		break;
+	default:
+		return (ret);
+		/* NOTREACHED */
+	}
+	return (0);
+}
+
+/*
+ * __db_get_alloc --
+ *	Get the user-configured allocation functions; forwards to the
+ *	private environment.  Illegal inside a shared environment.
+ */
+static int
+__db_get_alloc(dbp, mal_funcp, real_funcp, free_funcp)
+	DB *dbp;
+	void *(**mal_funcp) __P((size_t));
+	void *(**real_funcp) __P((void *, size_t));
+	void (**free_funcp) __P((void *));
+{
+	DB_ILLEGAL_IN_ENV(dbp, "DB->get_alloc");
+
+	return (__env_get_alloc(dbp->dbenv, mal_funcp,
+	    real_funcp, free_funcp));
+}
+
+/*
+ * __db_set_alloc --
+ *	Set the allocation functions; illegal inside a shared environment
+ *	and illegal after DB->open.
+ */
+static int
+__db_set_alloc(dbp, mal_func, real_func, free_func)
+	DB *dbp;
+	void *(*mal_func) __P((size_t));
+	void *(*real_func) __P((void *, size_t));
+	void (*free_func) __P((void *));
+{
+	DB_ILLEGAL_IN_ENV(dbp, "DB->set_alloc");
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_alloc");
+
+	return (__env_set_alloc(dbp->dbenv, mal_func, real_func, free_func));
+}
+
+/*
+ * __db_get_msgcall --
+ *	Get the message callback; thin forward to the environment handle.
+ */
+static void
+__db_get_msgcall(dbp, msgcallp)
+	DB *dbp;
+	void (**msgcallp) __P((const DB_ENV *, const char *));
+{
+	__env_get_msgcall(dbp->dbenv, msgcallp);
+}
+
+/*
+ * __db_set_msgcall --
+ *	Set the message callback; thin forward to the environment handle.
+ */
+static void
+__db_set_msgcall(dbp, msgcall)
+	DB *dbp;
+	void (*msgcall) __P((const DB_ENV *, const char *));
+{
+	__env_set_msgcall(dbp->dbenv, msgcall);
+}
+
+/*
+ * __db_get_msgfile --
+ *	Get the message FILE stream from the environment handle.
+ */
+static void
+__db_get_msgfile(dbp, msgfilep)
+	DB *dbp;
+	FILE **msgfilep;
+{
+	__env_get_msgfile(dbp->dbenv, msgfilep);
+}
+
+/*
+ * __db_set_msgfile --
+ *	Set the message FILE stream on the environment handle.
+ */
+static void
+__db_set_msgfile(dbp, msgfile)
+	DB *dbp;
+	FILE *msgfile;
+{
+	__env_set_msgfile(dbp->dbenv, msgfile);
+}
+
+/*
+ * __db_get_pagesize --
+ *	Return the handle's configured page size (0 if none has been set
+ *	and the database has not been opened yet).
+ */
+static int
+__db_get_pagesize(dbp, db_pagesizep)
+	DB *dbp;
+	u_int32_t *db_pagesizep;
+{
+	*db_pagesizep = dbp->pgsize;
+	return (0);
+}
+
+/*
+ * __db_set_pagesize --
+ *	DB->set_pagesize
+ *
+ * Illegal after DB->open.  The page size must lie in
+ * [DB_MIN_PGSIZE, DB_MAX_PGSIZE] and be a power of two; EINVAL
+ * otherwise.
+ *
+ * PUBLIC: int __db_set_pagesize __P((DB *, u_int32_t));
+ */
+int
+__db_set_pagesize(dbp, db_pagesize)
+	DB *dbp;
+	u_int32_t db_pagesize;
+{
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_pagesize");
+
+	if (db_pagesize < DB_MIN_PGSIZE) {
+		__db_errx(dbp->env, "page sizes may not be smaller than %lu",
+		    (u_long)DB_MIN_PGSIZE);
+		return (EINVAL);
+	}
+	if (db_pagesize > DB_MAX_PGSIZE) {
+		__db_errx(dbp->env, "page sizes may not be larger than %lu",
+		    (u_long)DB_MAX_PGSIZE);
+		return (EINVAL);
+	}
+
+	/*
+	 * We don't want anything that's not a power-of-2, as we rely on that
+	 * for alignment of various types on the pages.
+	 */
+	if (!POWER_OF_TWO(db_pagesize)) {
+		__db_errx(dbp->env, "page sizes must be a power-of-2");
+		return (EINVAL);
+	}
+
+	/*
+	 * XXX
+	 * Should we be checking for a page size that's not a multiple of 512,
+	 * so that we never try and write less than a disk sector?
+	 */
+	dbp->pgsize = db_pagesize;
+
+	return (0);
+}
+
+/*
+ * __db_set_paniccall --
+ *	Set the panic callback; thin forward to the environment handle.
+ */
+static int
+__db_set_paniccall(dbp, paniccall)
+	DB *dbp;
+	void (*paniccall) __P((DB_ENV *, int));
+{
+	return (__env_set_paniccall(dbp->dbenv, paniccall));
+}
+
+/*
+ * __db_set_priority --
+ *	Set the cache priority used for this database's pages.
+ */
+static int
+__db_set_priority(dbp, priority)
+	DB *dbp;
+	DB_CACHE_PRIORITY priority;
+{
+	dbp->priority = priority;
+	return (0);
+}
+
+/*
+ * __db_get_priority --
+ *	Get the cache priority used for this database's pages.
+ */
+static int
+__db_get_priority(dbp, priority)
+	DB *dbp;
+	DB_CACHE_PRIORITY *priority;
+{
+	*priority = dbp->priority;
+	return (0);
+}
diff --git a/db/db_open.c b/db/db_open.c
new file mode 100644
index 0000000..5c5db09
--- /dev/null
+++ b/db/db_open.c
@@ -0,0 +1,628 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * __db_open --
+ *	DB->open method.
+ *
+ * This routine gets called in six different ways:
+ *
+ * 1. It can be called to open a file/database.  In this case, subdb will
+ *    be NULL and meta_pgno will be PGNO_BASE_MD.
+ * 2. It can be called to open a subdatabase during normal operation.  In
+ *    this case, name and subname will both be non-NULL and meta_pgno will
+ *    be PGNO_BASE_MD (also PGNO_INVALID).
+ * 3. It can be called to open an in-memory database (name == NULL;
+ *    subname = name).
+ * 4. It can be called during recovery to open a file/database, in which case
+ *    name will be non-NULL, subname will be NULL, and meta-pgno will be
+ *    PGNO_BASE_MD.
+ * 5. It can be called during recovery to open a subdatabase, in which case
+ *    name will be non-NULL, subname may be NULL and meta-pgno will be
+ *    a valid pgno (i.e., not PGNO_BASE_MD).
+ * 6. It can be called during recovery to open an in-memory database.
+ *
+ * PUBLIC: int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC:     const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
+ */
+int
+__db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	DBTYPE type;
+	u_int32_t flags;
+	int mode;
+	db_pgno_t meta_pgno;
+{
+	DB *tdbp;
+	ENV *env;
+	int ret;
+	u_int32_t id;
+
+	env = dbp->env;
+	id = TXN_INVALID;
+
+	/*
+	 * We must flush any existing pages before truncating the file
+	 * since they could age out of mpool and overwrite new pages.
+	 *
+	 * Open a throwaway handle on the existing file (without
+	 * DB_TRUNCATE/DB_CREATE) just to truncate its mpool pages;
+	 * ENOENT/EINVAL from that open mean there is nothing to flush.
+	 */
+	if (LF_ISSET(DB_TRUNCATE)) {
+		if ((ret = __db_create_internal(&tdbp, dbp->env, 0)) != 0)
+			goto err;
+		ret = __db_open(tdbp, ip, txn, fname, dname, DB_UNKNOWN,
+		    DB_NOERROR | (flags & ~(DB_TRUNCATE|DB_CREATE)),
+		    mode, meta_pgno);
+		if (ret == 0)
+			ret = __memp_ftruncate(tdbp->mpf, txn, ip, 0, 0);
+		(void)__db_close(tdbp, txn, DB_NOSYNC);
+		if (ret != 0 && ret != ENOENT && ret != EINVAL)
+			goto err;
+		ret = 0;
+	}
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, fname);
+
+	/*
+	 * If the environment was configured with threads, the DB handle
+	 * must also be free-threaded, so we force the DB_THREAD flag on.
+	 * (See SR #2033 for why this is a requirement--recovery needs
+	 * to be able to grab a dbp using __db_fileid_to_dbp, and it has
+	 * no way of knowing which dbp goes with which thread, so whichever
+	 * one it finds has to be usable in any of them.)
+	 */
+	if (F_ISSET(env, ENV_THREAD))
+		LF_SET(DB_THREAD);
+
+	/* Convert any DB->open flags. */
+	if (LF_ISSET(DB_RDONLY))
+		F_SET(dbp, DB_AM_RDONLY);
+	if (LF_ISSET(DB_READ_UNCOMMITTED))
+		F_SET(dbp, DB_AM_READ_UNCOMMITTED);
+
+	if (IS_REAL_TXN(txn))
+		F_SET(dbp, DB_AM_TXN);
+
+	/* Fill in the type. */
+	dbp->type = type;
+
+	/*
+	 * If both fname and subname are NULL, it's always a create, so make
+	 * sure that we have both DB_CREATE and a type specified.  It would
+	 * be nice if this checking were done in __db_open where most of the
+	 * interface checking is done, but this interface (__db_dbopen) is
+	 * used by the recovery and limbo system, so we need to safeguard
+	 * this interface as well.
+	 */
+	if (fname == NULL) {
+		if (dbp->p_internal != NULL) {
+			__db_errx(env,
+			    "Partitioned databases may not be in memory.");
+			return (ENOENT);
+		}
+		if (dname == NULL) {
+			if (!LF_ISSET(DB_CREATE)) {
+				__db_errx(env,
+				    "DB_CREATE must be specified to create databases.");
+				return (ENOENT);
+			}
+
+			F_SET(dbp, DB_AM_INMEM);
+			F_SET(dbp, DB_AM_CREATED);
+
+			if (dbp->type == DB_UNKNOWN) {
+				__db_errx(env,
+				    "DBTYPE of unknown without existing file");
+				return (EINVAL);
+			}
+
+			if (dbp->pgsize == 0)
+				dbp->pgsize = DB_DEF_IOSIZE;
+
+			/*
+			 * If the file is a temporary file and we're
+			 * doing locking, then we have to create a
+			 * unique file ID.  We can't use our normal
+			 * dev/inode pair (or whatever this OS uses
+			 * in place of dev/inode pairs) because no
+			 * backing file will be created until the
+			 * mpool cache is filled forcing the buffers
+			 * to disk.  Grab a random locker ID to use
+			 * as a file ID.  The created ID must never
+			 * match a potential real file ID -- we know
+			 * it won't because real file IDs contain a
+			 * time stamp after the dev/inode pair, and
+			 * we're simply storing a 4-byte value.
+			 *
+			 * !!!
+			 * Store the locker in the file id structure
+			 * -- we can get it from there as necessary,
+			 * and it saves having two copies.
+			 */
+			if (LOCKING_ON(env) && (ret = __lock_id(env,
+			    (u_int32_t *)dbp->fileid, NULL)) != 0)
+				return (ret);
+		} else
+			MAKE_INMEM(dbp);
+
+		/*
+		 * Normally we would do handle locking here, however, with
+		 * in-memory files, we cannot do any database manipulation
+		 * until the mpool is open, so it happens later.
+		 */
+	} else if (dname == NULL && meta_pgno == PGNO_BASE_MD) {
+		/* Open/create the underlying file.  Acquire locks. */
+		if ((ret = __fop_file_setup(dbp, ip,
+		    txn, fname, mode, flags, &id)) != 0)
+			return (ret);
+	} else {
+		/* Named subdatabase: set up via the master database. */
+		if (dbp->p_internal != NULL) {
+			__db_errx(env,
+	    "Partitioned databases may not be included with multiple databases.");
+			return (ENOENT);
+		}
+		if ((ret = __fop_subdb_setup(dbp, ip,
+		    txn, fname, dname, mode, flags)) != 0)
+			return (ret);
+		meta_pgno = dbp->meta_pgno;
+	}
+
+	/* Set up the underlying environment. */
+	if ((ret = __env_setup(dbp, txn, fname, dname, id, flags)) != 0)
+		return (ret);
+
+	/* For in-memory databases, we now need to open/create the database. */
+	if (F_ISSET(dbp, DB_AM_INMEM)) {
+		if (dname == NULL)
+			ret = __db_new_file(dbp, ip, txn, NULL, NULL);
+		else {
+			id = TXN_INVALID;
+			if ((ret = __fop_file_setup(dbp, ip,
+			    txn, dname, mode, flags, &id)) == 0 &&
+			    DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER)
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+			    && txn != NULL
+#endif
+#if !defined(DEBUG_ROP)
+			    && !F_ISSET(dbp, DB_AM_RDONLY)
+#endif
+			    )
+				ret = __dbreg_log_id(dbp,
+				    txn, dbp->log_filename->id, 1);
+		}
+		if (ret != 0)
+			goto err;
+	}
+
+	/* Dispatch to the access method's open routine. */
+	switch (dbp->type) {
+		case DB_BTREE:
+			ret = __bam_open(dbp, ip, txn, fname, meta_pgno, flags);
+			break;
+		case DB_HASH:
+			ret = __ham_open(dbp, ip, txn, fname, meta_pgno, flags);
+			break;
+		case DB_RECNO:
+			ret = __ram_open(dbp, ip, txn, fname, meta_pgno, flags);
+			break;
+		case DB_QUEUE:
+			ret = __qam_open(
+			    dbp, ip, txn, fname, meta_pgno, mode, flags);
+			break;
+		case DB_UNKNOWN:
+			return (
+			    __db_unknown_type(env, "__db_dbopen", dbp->type));
+	}
+	if (ret != 0)
+		goto err;
+
+#ifdef HAVE_PARTITION
+	if (dbp->p_internal != NULL && (ret =
+	    __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0)
+		goto err;
+#endif
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, fname);
+
+	/*
+	 * Temporary files don't need handle locks, so we only have to check
+	 * for a handle lock downgrade or lockevent in the case of named
+	 * files.
+	 */
+	if (!F_ISSET(dbp, DB_AM_RECOVER) && (fname != NULL || dname != NULL) &&
+	    LOCK_ISSET(dbp->handle_lock)) {
+		if (IS_REAL_TXN(txn))
+			ret = __txn_lockevent(env,
+			    txn, dbp, &dbp->handle_lock, dbp->locker);
+		else if (LOCKING_ON(env))
+			/* Trade write handle lock for read handle lock. */
+			ret = __lock_downgrade(env,
+			    &dbp->handle_lock, DB_LOCK_READ, 0);
+	}
+DB_TEST_RECOVERY_LABEL
+err:
+	return (ret);
+}
+
+/*
+ * __db_get_open_flags --
+ *	Accessor for flags passed into DB->open call
+ *
+ * Illegal before DB->open; returns the flags recorded on the handle
+ * at open time.
+ *
+ * PUBLIC: int __db_get_open_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_open_flags(dbp, flagsp)
+	DB *dbp;
+	u_int32_t *flagsp;
+{
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_open_flags");
+
+	*flagsp = dbp->open_flags;
+	return (0);
+}
+
+/*
+ * __db_new_file --
+ *	Create a new database file.
+ *
+ * Dispatches to the access-method-specific creation routine, then
+ * fsyncs the file handle (when one was supplied) so the file is durable
+ * before being moved into place.  Btree and recno share one creation
+ * path.
+ *
+ * PUBLIC: int __db_new_file __P((DB *,
+ * PUBLIC:      DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__db_new_file(dbp, ip, txn, fhp, name)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DB_FH *fhp;
+	const char *name;
+{
+	int ret;
+
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_new_file(dbp, ip, txn, fhp, name);
+		break;
+	case DB_HASH:
+		ret = __ham_new_file(dbp, ip, txn, fhp, name);
+		break;
+	case DB_QUEUE:
+		ret = __qam_new_file(dbp, ip, txn, fhp, name);
+		break;
+	case DB_UNKNOWN:
+	default:
+		__db_errx(dbp->env,
+		    "%s: Invalid type %d specified", name, dbp->type);
+		ret = EINVAL;
+		break;
+	}
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+	/* Sync the file in preparation for moving it into place. */
+	if (ret == 0 && fhp != NULL)
+		ret = __os_fsync(dbp->env, fhp);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+	return (ret);
+}
+
+/*
+ * __db_init_subdb --
+ *	Initialize the dbp for a subdb.
+ *
+ * If the subdatabase already exists, reads its meta-data page through
+ * the master database's mpool file and initializes the handle from it.
+ * Otherwise creates the subdatabase via the access method's new-subdb
+ * routine (queue subdatabases are not supported and return EINVAL).
+ *
+ * PUBLIC: int __db_init_subdb __P((DB *,
+ * PUBLIC:     DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__db_init_subdb(mdbp, dbp, name, ip, txn)
+	DB *mdbp, *dbp;
+	const char *name;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+{
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	int ret, t_ret;
+
+	ret = 0;
+	if (!F_ISSET(dbp, DB_AM_CREATED)) {
+		/* Subdb exists; read meta-data page and initialize. */
+		mpf = mdbp->mpf;
+		if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+		    ip, txn, 0, &meta)) != 0)
+			goto err;
+		ret = __db_meta_setup(mdbp->env, dbp, name, meta, 0, 0);
+		/* Release the page even on error, preserving the first error. */
+		if ((t_ret = __memp_fput(mpf,
+		    ip, meta, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		/*
+		 * If __db_meta_setup found that the meta-page hadn't
+		 * been written out during recovery, we can just return.
+		 */
+		if (ret == ENOENT)
+			ret = 0;
+		goto err;
+	}
+
+	/* Handle the create case here. */
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_new_subdb(mdbp, dbp, ip, txn);
+		break;
+	case DB_HASH:
+		ret = __ham_new_subdb(mdbp, dbp, ip, txn);
+		break;
+	case DB_QUEUE:
+		/* Queue databases cannot be subdatabases. */
+		ret = EINVAL;
+		break;
+	case DB_UNKNOWN:
+	default:
+		__db_errx(dbp->env,
+		    "Invalid subdatabase type %d specified", dbp->type);
+		return (EINVAL);
+	}
+
+err:	return (ret);
+}
+
+/*
+ * __db_chk_meta --
+ *	Take a buffer containing a meta-data page and check it for a valid LSN,
+ *	checksum (and verify the checksum if necessary) and possibly decrypt it.
+ *
+ * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch.
+ *
+ * The page may come from a database written with the opposite byte
+ * order, so both the checksum and the LSN/magic checks are retried once
+ * with the value byte-swapped before giving up.
+ *
+ * PUBLIC: int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
+ */
+int
+__db_chk_meta(env, dbp, meta, flags)
+	ENV *env;
+	DB *dbp;
+	DBMETA *meta;
+	u_int32_t flags;
+{
+	DB_LSN swap_lsn;
+	int is_hmac, ret, swapped;
+	u_int32_t magic, orig_chk;
+	u_int8_t *chksum;
+
+	ret = 0;
+	swapped = 0;
+
+	if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
+		if (dbp != NULL)
+			F_SET(dbp, DB_AM_CHKSUM);
+
+		/* Encrypted pages carry an HMAC rather than a plain checksum. */
+		is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
+		chksum = ((BTMETA *)meta)->chksum;
+
+		/*
+		 * If we need to swap, the checksum function overwrites the
+		 * original checksum with 0, so we need to save a copy of the
+		 * original for swapping later.
+		 */
+		orig_chk = *(u_int32_t *)chksum;
+
+		/*
+		 * We cannot add this to __db_metaswap because that gets done
+		 * later after we've verified the checksum or decrypted.
+		 */
+		if (LF_ISSET(DB_CHK_META)) {
+			swapped = 0;
+chk_retry:		if ((ret =
+			    __db_check_chksum(env, NULL, env->crypto_handle,
+			    chksum, meta, DBMETASIZE, is_hmac)) != 0) {
+				/* One retry with a byte-swapped checksum. */
+				if (is_hmac || swapped)
+					return (ret);
+
+				M_32_SWAP(orig_chk);
+				swapped = 1;
+				*(u_int32_t *)chksum = orig_chk;
+				goto chk_retry;
+			}
+		}
+	} else if (dbp != NULL)
+		F_CLR(dbp, DB_AM_CHKSUM);
+
+#ifdef HAVE_CRYPTO
+	ret = __crypto_decrypt_meta(env,
+	    dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META));
+#endif
+
+	/* Now that we're decrypted, we can check LSN. */
+	if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
+		/*
+		 * This gets called both before and after swapping, so we
+		 * need to check ourselves.  If we already swapped it above,
+		 * we'll know that here.
+		 */
+
+		swap_lsn = meta->lsn;
+		magic = meta->magic;
+lsn_retry:
+		if (swapped) {
+			M_32_SWAP(swap_lsn.file);
+			M_32_SWAP(swap_lsn.offset);
+			M_32_SWAP(magic);
+		}
+		/* A recognized magic number validates the swap guess. */
+		switch (magic) {
+		case DB_BTREEMAGIC:
+		case DB_HASHMAGIC:
+		case DB_QAMMAGIC:
+		case DB_RENAMEMAGIC:
+			break;
+		default:
+			if (swapped)
+				return (EINVAL);
+			swapped = 1;
+			goto lsn_retry;
+		}
+		if (!IS_REP_CLIENT(env) &&
+		    !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn))
+			/* Need to do check. */
+			ret = __log_check_page_lsn(env, dbp, &swap_lsn);
+	}
+	return (ret);
+}
+
+/*
+ * __db_meta_setup --
+ *
+ * Take a buffer containing a meta-data page and figure out if it's
+ * valid, and if so, initialize the dbp from the meta-data page.
+ *
+ * Determines the access method from the magic number (retrying once
+ * byte-swapped and recording DB_AM_SWAP), verifies checksum/LSN via
+ * __db_chk_meta, then runs the access method's meta-page sanity check
+ * unless the caller passed DB_TRUNCATE in oflags.
+ *
+ * NOTE: the `flags' parameter is forwarded to __db_chk_meta and is then
+ * reused locally to hold the on-disk meta flags in the btree case.
+ *
+ * PUBLIC: int __db_meta_setup __P((ENV *,
+ * PUBLIC:     DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
+ */
+int
+__db_meta_setup(env, dbp, name, meta, oflags, flags)
+	ENV *env;
+	DB *dbp;
+	const char *name;
+	DBMETA *meta;
+	u_int32_t oflags;
+	u_int32_t flags;
+{
+	u_int32_t magic;
+	int ret;
+
+	ret = 0;
+
+	/*
+	 * Figure out what access method we're dealing with, and then
+	 * call access method specific code to check error conditions
+	 * based on conflicts between the found file and application
+	 * arguments.  A found file overrides some user information --
+	 * we don't consider it an error, for example, if the user set
+	 * an expected byte order and the found file doesn't match it.
+	 */
+	F_CLR(dbp, DB_AM_SWAP | DB_AM_IN_RENAME);
+	magic = meta->magic;
+
+swap_retry:
+	switch (magic) {
+	case DB_BTREEMAGIC:
+	case DB_HASHMAGIC:
+	case DB_QAMMAGIC:
+	case DB_RENAMEMAGIC:
+		break;
+	case 0:
+		/*
+		 * The only time this should be 0 is if we're in the
+		 * midst of opening a subdb during recovery and that
+		 * subdatabase had its meta-data page allocated, but
+		 * not yet initialized.
+		 */
+		if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(env) &&
+		    F_ISSET(env->lg_handle, DBLOG_FORCE_OPEN)) ||
+		    meta->pgno != PGNO_INVALID))
+			return (ENOENT);
+
+		goto bad_format;
+	default:
+		/* Unrecognized magic: try once with bytes swapped. */
+		if (F_ISSET(dbp, DB_AM_SWAP))
+			goto bad_format;
+
+		M_32_SWAP(magic);
+		F_SET(dbp, DB_AM_SWAP);
+		goto swap_retry;
+	}
+
+	/*
+	 * We can only check the meta page if we are sure we have a meta page.
+	 * If it is random data, then this check can fail.  So only now can we
+	 * checksum and decrypt.  Don't distinguish between configuration and
+	 * checksum match errors here, because we haven't opened the database
+	 * and even a checksum error isn't a reason to panic the environment.
+	 */
+	if ((ret = __db_chk_meta(env, dbp, meta, flags)) != 0) {
+		if (ret == -1)
+			__db_errx(env,
+			    "%s: metadata page checksum error", name);
+		goto bad_format;
+	}
+
+	switch (magic) {
+	case DB_BTREEMAGIC:
+		if (dbp->type != DB_UNKNOWN &&
+		    dbp->type != DB_RECNO && dbp->type != DB_BTREE)
+			goto bad_format;
+
+		/* BTM_RECNO in the on-disk flags selects recno over btree. */
+		flags = meta->flags;
+		if (F_ISSET(dbp, DB_AM_SWAP))
+			M_32_SWAP(flags);
+		if (LF_ISSET(BTM_RECNO))
+			dbp->type = DB_RECNO;
+		else
+			dbp->type = DB_BTREE;
+		if ((oflags & DB_TRUNCATE) == 0 && (ret =
+		    __bam_metachk(dbp, name, (BTMETA *)meta)) != 0)
+			return (ret);
+		break;
+	case DB_HASHMAGIC:
+		if (dbp->type != DB_UNKNOWN && dbp->type != DB_HASH)
+			goto bad_format;
+
+		dbp->type = DB_HASH;
+		if ((oflags & DB_TRUNCATE) == 0 && (ret =
+		    __ham_metachk(dbp, name, (HMETA *)meta)) != 0)
+			return (ret);
+		break;
+	case DB_QAMMAGIC:
+		if (dbp->type != DB_UNKNOWN && dbp->type != DB_QUEUE)
+			goto bad_format;
+		dbp->type = DB_QUEUE;
+		if ((oflags & DB_TRUNCATE) == 0 && (ret =
+		    __qam_metachk(dbp, name, (QMETA *)meta)) != 0)
+			return (ret);
+		break;
+	case DB_RENAMEMAGIC:
+		F_SET(dbp, DB_AM_IN_RENAME);
+
+		/* Copy the file's ID. */
+		memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+
+		break;
+	default:
+		goto bad_format;
+	}
+
+	if (FLD_ISSET(meta->metaflags,
+	    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK))
+		if ((ret =
+		    __partition_init(dbp, meta->metaflags)) != 0)
+			return (ret);
+	return (0);
+
+bad_format:
+	/* During recovery a missing/garbled page is expected, not an error. */
+	if (F_ISSET(dbp, DB_AM_RECOVER))
+		ret = ENOENT;
+	else
+		__db_errx(env,
+		    "__db_meta_setup: %s: unexpected file type or format",
+		    name);
+	return (ret == 0 ? EINVAL : ret);
+}
diff --git a/db/db_overflow.c b/db/db_overflow.c
new file mode 100644
index 0000000..a718182
--- /dev/null
+++ b/db/db_overflow.c
@@ -0,0 +1,706 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * Big key/data code.
+ *
+ * Big key and data entries are stored on linked lists of pages. The initial
+ * reference is a structure with the total length of the item and the page
+ * number where it begins. Each entry in the linked list contains a pointer
+ * to the next page of data, and so on.
+ */
+
+/*
+ * __db_goff --
+ *	Get an offpage item.
+ *
+ *	Walk the linked chain of overflow pages starting at "pgno" and
+ *	copy the requested bytes of the item (total length "tlen") into
+ *	the DBT, honoring the DBT's DB_DBT_* memory-management flags.
+ *	"bpp"/"bpsz" are an optional caller-owned scratch buffer and its
+ *	current size; the buffer is grown via __os_realloc and reused
+ *	across calls.  Returns 0, DB_BUFFER_SMALL, or an error from the
+ *	memory pool / allocator.
+ *
+ * PUBLIC: int __db_goff __P((DBC *,
+ * PUBLIC:     DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+ */
+int
+__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
+	DBC *dbc;
+	DBT *dbt;
+	u_int32_t tlen;
+	db_pgno_t pgno;
+	void **bpp;
+	u_int32_t *bpsz;
+{
+	DB *dbp;
+	DB_MPOOLFILE *mpf;
+	DB_TXN *txn;
+	DBC_INTERNAL *cp;
+	ENV *env;
+	PAGE *h;
+	DB_THREAD_INFO *ip;
+	db_indx_t bytes;
+	u_int32_t curoff, needed, start;
+	u_int8_t *p, *src;
+	int ret;
+
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+	env = dbp->env;
+	ip = dbc->thread_info;
+	mpf = dbp->mpf;
+	txn = dbc->txn;
+
+	/*
+	 * Check if the buffer is big enough; if it is not and we are
+	 * allowed to malloc space, then we'll malloc it.  If we are
+	 * not (DB_DBT_USERMEM), then we'll set the dbt and return
+	 * appropriately.
+	 */
+	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+		/* Clamp the partial request to the bytes actually present. */
+		start = dbt->doff;
+		if (start > tlen)
+			needed = 0;
+		else if (dbt->dlen > tlen - start)
+			needed = tlen - start;
+		else
+			needed = dbt->dlen;
+	} else {
+		start = 0;
+		needed = tlen;
+	}
+
+	/*
+	 * If the caller has not requested any data, return success. This
+	 * "early-out" also avoids setting up the streaming optimization when
+	 * no page would be retrieved. If it were removed, the streaming code
+	 * should only initialize when needed is not 0.
+	 */
+	if (needed == 0) {
+		dbt->size = 0;
+		return (0);
+	}
+
+	/* With a user-copy callback there is no buffer to allocate. */
+	if (F_ISSET(dbt, DB_DBT_USERCOPY))
+		goto skip_alloc;
+
+	/* Allocate any necessary memory. */
+	if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+		if (needed > dbt->ulen) {
+			dbt->size = needed;
+			return (DB_BUFFER_SMALL);
+		}
+	} else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+		if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0)
+			return (ret);
+	} else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+		if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0)
+			return (ret);
+	} else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) {
+		/* Grow the caller's scratch buffer and record its new size. */
+		if ((ret = __os_realloc(env, needed, bpp)) != 0)
+			return (ret);
+		*bpsz = needed;
+		dbt->data = *bpp;
+	} else if (bpp != NULL)
+		dbt->data = *bpp;
+	else {
+		DB_ASSERT(env,
+		    F_ISSET(dbt,
+		    DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) ||
+		    bpsz != NULL || bpp != NULL);
+		return (DB_BUFFER_SMALL);
+	}
+
+skip_alloc:
+	/*
+	 * Set up a start page in the overflow chain if streaming: if the
+	 * cursor has cached a position in this same chain and the requested
+	 * offset falls at or after it, resume from the cached page instead
+	 * of walking the chain from the beginning.
+	 */
+	if (cp->stream_start_pgno != PGNO_INVALID &&
+	    pgno == cp->stream_start_pgno && start >= cp->stream_off &&
+	    start < cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+		pgno = cp->stream_curr_pgno;
+		curoff = cp->stream_off;
+	} else {
+		cp->stream_start_pgno = cp->stream_curr_pgno = pgno;
+		cp->stream_off = curoff = 0;
+	}
+
+	/*
+	 * Step through the linked list of pages, copying the data on each
+	 * one into the buffer. Never copy more than the total data length.
+	 */
+	dbt->size = needed;
+	for (p = dbt->data; pgno != PGNO_INVALID && needed > 0;) {
+		if ((ret = __memp_fget(mpf,
+		    &pgno, ip, txn, 0, &h)) != 0)
+			return (ret);
+		DB_ASSERT(env, TYPE(h) == P_OVERFLOW);
+
+		/* Check if we need any bytes from this page. */
+		if (curoff + OV_LEN(h) >= start) {
+			bytes = OV_LEN(h);
+			src = (u_int8_t *)h + P_OVERHEAD(dbp);
+			if (start > curoff) {
+				/* Skip the part of the page before "start". */
+				src += start - curoff;
+				bytes -= start - curoff;
+			}
+			if (bytes > needed)
+				bytes = needed;
+			if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+				/*
+				 * The offset into the DBT is the total size
+				 * less the amount of data still needed. Care
+				 * needs to be taken if doing a partial copy
+				 * beginning at an offset other than 0.
+				 */
+				if ((ret = env->dbt_usercopy(
+				    dbt, dbt->size - needed,
+				    src, bytes, DB_USERCOPY_SETDATA)) != 0) {
+					(void)__memp_fput(mpf,
+					    ip, h, dbp->priority);
+					return (ret);
+				}
+			} else
+				memcpy(p, src, bytes);
+			p += bytes;
+			needed -= bytes;
+		}
+		/* Cache this position so a later streaming call can resume. */
+		cp->stream_off = curoff;
+		curoff += OV_LEN(h);
+		cp->stream_curr_pgno = pgno;
+		pgno = h->next_pgno;
+		(void)__memp_fput(mpf, ip, h, dbp->priority);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_poff --
+ *	Put an offpage item.
+ *
+ *	Copy the key/data item in "dbt" onto a chain of newly allocated
+ *	overflow pages, or -- when DB_DBT_STREAMING is set -- append to
+ *	the last page of the chain cached in the cursor.  The first page
+ *	number of the chain is returned through "pgnop".
+ *
+ * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
+ */
+int
+__db_poff(dbc, dbt, pgnop)
+	DBC *dbc;
+	const DBT *dbt;
+	db_pgno_t *pgnop;
+{
+	DB *dbp;
+	DBT tmp_dbt;
+	DB_LSN null_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep, *lastp;
+	db_indx_t pagespace;
+	db_pgno_t pgno;
+	u_int32_t space, sz, tlen;
+	u_int8_t *p;
+	int ret, t_ret;
+
+	/*
+	 * Allocate pages and copy the key/data item into them.  Calculate the
+	 * number of bytes we get for pages we fill completely with a single
+	 * item.
+	 */
+	dbp = dbc->dbp;
+	lastp = NULL;
+	mpf = dbp->mpf;
+	pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+	p = dbt->data;
+	sz = dbt->size;
+
+	/*
+	 * Check whether we are streaming at the end of the overflow item.
+	 * If so, the last pgno and offset will be cached in the cursor.
+	 */
+	if (F_ISSET(dbt, DB_DBT_STREAMING)) {
+		/* tlen is the number of bytes already stored on the chain. */
+		tlen = dbt->size - dbt->dlen;
+		pgno = dbc->internal->stream_curr_pgno;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+		    dbc->txn, DB_MPOOL_DIRTY, &lastp)) != 0)
+			return (ret);
+
+		/*
+		 * Calculate how much we can write on the last page of the
+		 * overflow item.
+		 */
+		DB_ASSERT(dbp->env,
+		    OV_LEN(lastp) == (tlen - dbc->internal->stream_off));
+		space = pagespace - OV_LEN(lastp);
+
+		/* Only copy as much data as we have. */
+		if (space > dbt->dlen)
+			space = dbt->dlen;
+
+		if (DBC_LOGGING(dbc)) {
+			/* Log the appended bytes as a DB_APPEND_BIG record. */
+			tmp_dbt.data = dbt->data;
+			tmp_dbt.size = space;
+			ZERO_LSN(null_lsn);
+			if ((ret = __db_big_log(dbp, dbc->txn,
+			    &LSN(lastp), 0, DB_APPEND_BIG, pgno,
+			    PGNO_INVALID, PGNO_INVALID, &tmp_dbt,
+			    &LSN(lastp), &null_lsn, &null_lsn)) != 0)
+				goto err;
+		} else
+			LSN_NOT_LOGGED(LSN(lastp));
+
+		memcpy((u_int8_t *)lastp + P_OVERHEAD(dbp) + OV_LEN(lastp),
+		    dbt->data, space);
+		OV_LEN(lastp) += space;
+		/*
+		 * NOTE(review): the remaining byte count assumes dbt->doff
+		 * is the length already stored on the chain (streaming
+		 * appends at the end) -- confirm against callers setting
+		 * DB_DBT_STREAMING.
+		 */
+		sz -= space + dbt->doff;
+		p += space;
+		*pgnop = dbc->internal->stream_start_pgno;
+	}
+
+	ret = 0;
+	for (; sz > 0; p += pagespace, sz -= pagespace) {
+		/*
+		 * Reduce pagespace so we terminate the loop correctly and
+		 * don't copy too much data.
+		 */
+		if (sz < pagespace)
+			pagespace = sz;
+
+		/*
+		 * Allocate and initialize a new page and copy all or part of
+		 * the item onto the page.  If sz is less than pagespace, we
+		 * have a partial record.
+		 */
+		if ((ret = __db_new(dbc, P_OVERFLOW, NULL, &pagep)) != 0)
+			break;
+		if (DBC_LOGGING(dbc)) {
+			/* Log the new page and its links (DB_ADD_BIG). */
+			tmp_dbt.data = p;
+			tmp_dbt.size = pagespace;
+			ZERO_LSN(null_lsn);
+			if ((ret = __db_big_log(dbp, dbc->txn,
+			    &LSN(pagep), 0, DB_ADD_BIG, PGNO(pagep),
+			    lastp ? PGNO(lastp) : PGNO_INVALID,
+			    PGNO_INVALID, &tmp_dbt, &LSN(pagep),
+			    lastp == NULL ? &null_lsn : &LSN(lastp),
+			    &null_lsn)) != 0) {
+				(void)__memp_fput(mpf, dbc->thread_info,
+				    pagep, dbc->priority);
+				goto err;
+			}
+		} else
+			LSN_NOT_LOGGED(LSN(pagep));
+
+		/* Move LSN onto page. */
+		if (lastp != NULL)
+			LSN(lastp) = LSN(pagep);
+
+		OV_LEN(pagep) = pagespace;
+		OV_REF(pagep) = 1;
+		memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace);
+
+		/*
+		 * If this is the first entry, update the user's info and
+		 * initialize the cursor to allow for streaming of subsequent
+		 * updates.  Otherwise, update the entry on the last page
+		 * filled in and release that page.
+		 */
+		if (lastp == NULL) {
+			*pgnop = PGNO(pagep);
+			dbc->internal->stream_start_pgno =
+			    dbc->internal->stream_curr_pgno = *pgnop;
+			dbc->internal->stream_off = 0;
+		} else {
+			/* Link the new page onto the end of the chain. */
+			lastp->next_pgno = PGNO(pagep);
+			pagep->prev_pgno = PGNO(lastp);
+			if ((ret = __memp_fput(mpf,
+			    dbc->thread_info, lastp, dbc->priority)) != 0) {
+				lastp = NULL;
+				goto err;
+			}
+		}
+		lastp = pagep;
+	}
+err:	if (lastp != NULL) {
+		if (ret == 0) {
+			/* Cache the chain's tail for later streaming. */
+			dbc->internal->stream_curr_pgno = PGNO(lastp);
+			dbc->internal->stream_off = dbt->size - OV_LEN(lastp);
+		}
+
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info, lastp,
+		    dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
+
+/*
+ * __db_ovref --
+ *	Decrement the reference count on an overflow page.
+ *
+ *	The page is fetched dirty, the adjustment (-1) is logged before
+ *	the page is modified, and the page is put back to the pool.
+ *
+ * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t));
+ */
+int
+__db_ovref(dbc, pgno)
+	DBC *dbc;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+
+	if ((ret = __memp_fget(mpf, &pgno,
+	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &h)) != 0)
+		return (ret);
+
+	if (DBC_LOGGING(dbc)) {
+		/* Write-ahead log the -1 adjustment before changing OV_REF. */
+		if ((ret = __db_ovref_log(dbp,
+		    dbc->txn, &LSN(h), 0, h->pgno, -1, &LSN(h))) != 0) {
+			(void)__memp_fput(mpf,
+			    dbc->thread_info, h, dbc->priority);
+			return (ret);
+		}
+	} else
+		LSN_NOT_LOGGED(LSN(h));
+
+	/*
+	 * In BDB releases before 4.5, the overflow reference counts were
+	 * incremented when an overflow item was split onto an internal
+	 * page.  There was a lock race in that code, and rather than fix
+	 * the race, we changed BDB to copy overflow items when splitting
+	 * them onto internal pages.  The code to decrement reference
+	 * counts remains so databases already in the field continue to
+	 * work.
+	 */
+	--OV_REF(h);
+
+	return (__memp_fput(mpf, dbc->thread_info, h, dbc->priority));
+}
+
+/*
+ * __db_doff --
+ *	Delete an offpage chain of overflow pages.
+ *
+ *	Walk the chain starting at "pgno", freeing each page via
+ *	__db_free.  If a page is shared (OV_REF > 1), only its reference
+ *	count is decremented and the traversal stops.
+ *
+ * PUBLIC: int __db_doff __P((DBC *, db_pgno_t));
+ */
+int
+__db_doff(dbc, pgno)
+	DBC *dbc;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DBT tmp_dbt;
+	DB_LSN null_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+
+	do {
+		if ((ret = __memp_fget(mpf, &pgno,
+		    dbc->thread_info, dbc->txn, 0, &pagep)) != 0)
+			return (ret);
+
+		DB_ASSERT(dbp->env, TYPE(pagep) == P_OVERFLOW);
+		/*
+		 * If it's referenced by more than one key/data item,
+		 * decrement the reference count and return.
+		 */
+		if (OV_REF(pagep) > 1) {
+			(void)__memp_fput(mpf,
+			    dbc->thread_info, pagep, dbc->priority);
+			return (__db_ovref(dbc, pgno));
+		}
+
+		/* Mark the page dirty before logging and freeing it. */
+		if ((ret = __memp_dirty(mpf, &pagep,
+		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+			if (pagep != NULL)
+				(void)__memp_fput(mpf,
+				    dbc->thread_info, pagep, dbc->priority);
+			return (ret);
+		}
+
+		if (DBC_LOGGING(dbc)) {
+			/*
+			 * The log record (DB_REM_BIG) carries the page's
+			 * data so the deletion can be undone.
+			 */
+			tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+			tmp_dbt.size = OV_LEN(pagep);
+			ZERO_LSN(null_lsn);
+			if ((ret = __db_big_log(dbp, dbc->txn,
+			    &LSN(pagep), 0, DB_REM_BIG,
+			    PGNO(pagep), PREV_PGNO(pagep),
+			    NEXT_PGNO(pagep), &tmp_dbt,
+			    &LSN(pagep), &null_lsn, &null_lsn)) != 0) {
+				(void)__memp_fput(mpf,
+				    dbc->thread_info, pagep, dbc->priority);
+				return (ret);
+			}
+		} else
+			LSN_NOT_LOGGED(LSN(pagep));
+		/* Save the link before the page is handed to the free list. */
+		pgno = pagep->next_pgno;
+		OV_LEN(pagep) = 0;
+		if ((ret = __db_free(dbc, pagep)) != 0)
+			return (ret);
+	} while (pgno != PGNO_INVALID);
+
+	return (0);
+}
+
+/*
+ * __db_moff --
+ *	Match on overflow pages.
+ *
+ * Given a starting page number and a key, return <0, 0, >0 to indicate if the
+ * key on the page is less than, equal to or greater than the key specified.
+ * We optimize this by doing chunk at a time comparison unless the user has
+ * specified a comparison function.  In this case, we need to materialize
+ * the entire object and call their comparison routine.
+ *
+ * __db_moff and __db_coff are generic functions useful in searching and
+ * ordering off page items. __db_moff matches an overflow DBT with an offpage
+ * item. __db_coff compares two offpage items for lexicographic sort order.
+ *
+ *	dbt:	the in-memory key to compare.
+ *	pgno:	first page of the offpage item's chain.
+ *	tlen:	total length of the offpage item.
+ *	cmpp:	set to the comparison result.
+ *
+ * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t,
+ * PUBLIC:     int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
+	DBC *dbc;
+	const DBT *dbt;
+	db_pgno_t pgno;
+	u_int32_t tlen;
+	int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+	DB *dbp;
+	DBT local_dbt;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	PAGE *pagep;
+	void *buf;
+	u_int32_t bufsize, cmp_bytes, key_left;
+	u_int8_t *p1, *p2;
+	int ret;
+
+	dbp = dbc->dbp;
+	ip = dbc->thread_info;
+	mpf = dbp->mpf;
+
+	/*
+	 * If there is a user-specified comparison function, build a
+	 * contiguous copy of the key, and call it.
+	 */
+	if (cmpfunc != NULL) {
+		memset(&local_dbt, 0, sizeof(local_dbt));
+		buf = NULL;
+		bufsize = 0;
+
+		/* Materialize the whole offpage item into "buf". */
+		if ((ret = __db_goff(dbc,
+		    &local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
+			return (ret);
+		/* Pass the key as the first argument */
+		*cmpp = cmpfunc(dbp, dbt, &local_dbt);
+		__os_free(dbp->env, buf);
+		return (0);
+	}
+
+	/* While there are both keys to compare. */
+	for (*cmpp = 0, p1 = dbt->data,
+	    key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
+		if ((ret =
+		    __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0)
+			return (ret);
+
+		/* Compare at most one page's worth of bytes per iteration. */
+		cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
+		tlen -= cmp_bytes;
+		key_left -= cmp_bytes;
+		for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+		    cmp_bytes-- > 0; ++p1, ++p2)
+			if (*p1 != *p2) {
+				*cmpp = (long)*p1 - (long)*p2;
+				break;
+			}
+		pgno = NEXT_PGNO(pagep);
+		if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0)
+			return (ret);
+		if (*cmpp != 0)
+			return (0);
+	}
+	/* All compared bytes matched: order by length. */
+	if (key_left > 0)		/* DBT is longer than the page key. */
+		*cmpp = 1;
+	else if (tlen > 0)		/* DBT is shorter than the page key. */
+		*cmpp = -1;
+	else
+		*cmpp = 0;
+
+	return (0);
+}
+
+/*
+ * __db_coff --
+ *	Match two offpage dbts.
+ *
+ * The DBTs must both refer to offpage items.
+ * The match happens a chunk (page) at a time unless a user defined comparison
+ * function exists. It is not possible to optimize this comparison away when
+ * a lexicographic sort order is required on mismatch.
+ *
+ * NOTE: For now this function only works for H_OFFPAGE type items. It would
+ * be simple to extend it for use with B_OVERFLOW type items. It would only
+ * require extracting the total length, and page number, dependent on the
+ * DBT type.
+ *
+ * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *,
+ * PUBLIC:     int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__db_coff(dbc, dbt, match, cmpfunc, cmpp)
+	DBC *dbc;
+	const DBT *dbt, *match;
+	int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_MPOOLFILE *mpf;
+	DB_TXN *txn;
+	DBT local_key, local_match;
+	PAGE *dbt_pagep, *match_pagep;
+	db_pgno_t dbt_pgno, match_pgno;
+	u_int32_t cmp_bytes, dbt_bufsz, dbt_len, match_bufsz;
+	u_int32_t match_len, max_data, page_sz;
+	u_int8_t *p1, *p2;
+	int ret;
+	void *dbt_buf, *match_buf;
+
+	dbp = dbc->dbp;
+	ip = dbc->thread_info;
+	txn = dbc->txn;
+	mpf = dbp->mpf;
+	page_sz = dbp->pgsize;
+	*cmpp = 0;
+	dbt_buf = match_buf = NULL;
+
+	DB_ASSERT(dbp->env, HPAGE_PTYPE(dbt->data) == H_OFFPAGE);
+	DB_ASSERT(dbp->env, HPAGE_PTYPE(match->data) == H_OFFPAGE);
+
+	/* Extract potentially unaligned length and pgno fields from DBTs */
+	memcpy(&dbt_len, HOFFPAGE_TLEN(dbt->data), sizeof(u_int32_t));
+	memcpy(&dbt_pgno, HOFFPAGE_PGNO(dbt->data), sizeof(db_pgno_t));
+	memcpy(&match_len, HOFFPAGE_TLEN(match->data), sizeof(u_int32_t));
+	memcpy(&match_pgno, HOFFPAGE_PGNO(match->data), sizeof(db_pgno_t));
+	max_data = (dbt_len < match_len ? dbt_len : match_len);
+
+	/*
+	 * If there is a custom comparator, fully resolve both DBTs.
+	 * Then call the users comparator.
+	 */
+	if (cmpfunc != NULL) {
+		memset(&local_key, 0, sizeof(local_key));
+		memset(&local_match, 0, sizeof(local_match));
+		dbt_buf = match_buf = NULL;
+		dbt_bufsz = match_bufsz = 0;
+
+		if ((ret = __db_goff(dbc, &local_key, dbt_len,
+		    dbt_pgno, &dbt_buf, &dbt_bufsz)) != 0)
+			goto err1;
+		if ((ret = __db_goff(dbc, &local_match, match_len,
+		    match_pgno, &match_buf, &match_bufsz)) != 0)
+			goto err1;
+		/* The key needs to be the first argument for sort order */
+		*cmpp = cmpfunc(dbp, &local_key, &local_match);
+
+err1:		if (dbt_buf != NULL)
+			__os_free(dbp->env, dbt_buf);
+		if (match_buf != NULL)
+			__os_free(dbp->env, match_buf);
+		return (ret);
+	}
+
+	/* Match the offpage DBTs a page at a time. */
+	while (dbt_pgno != PGNO_INVALID && match_pgno != PGNO_INVALID) {
+		/* Pin one page of each chain; release both on any error. */
+		if ((ret =
+		    __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0)
+			return (ret);
+		if ((ret =
+		    __memp_fget(mpf, &match_pgno,
+		    ip, txn, 0, &match_pagep)) != 0) {
+			(void)__memp_fput(
+			    mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED);
+			return (ret);
+		}
+		cmp_bytes = page_sz < max_data ? page_sz : max_data;
+		for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp),
+		    p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp);
+		    cmp_bytes-- > 0; ++p1, ++p2)
+			if (*p1 != *p2) {
+				*cmpp = (long)*p1 - (long)*p2;
+				break;
+			}
+
+		dbt_pgno = NEXT_PGNO(dbt_pagep);
+		match_pgno = NEXT_PGNO(match_pagep);
+		max_data -= page_sz;
+		if ((ret = __memp_fput(mpf,
+		    ip, dbt_pagep, DB_PRIORITY_UNCHANGED)) != 0) {
+			(void)__memp_fput(mpf,
+			    ip, match_pagep, DB_PRIORITY_UNCHANGED);
+			return (ret);
+		}
+		if ((ret = __memp_fput(mpf,
+		    ip, match_pagep, DB_PRIORITY_UNCHANGED)) != 0)
+			return (ret);
+		if (*cmpp != 0)
+			return (0);
+	}
+
+	/*
+	 * If a lexicographic mismatch was found, then the result has already
+	 * been returned.  If the DBTs matched, consider the lengths of the
+	 * items, and return appropriately.
+	 */
+	if (dbt_len > match_len)	/* DBT is longer than the match key. */
+		*cmpp = 1;
+	else if (match_len > dbt_len)	/* DBT is shorter than the match key. */
+		*cmpp = -1;
+	else
+		*cmpp = 0;
+
+	return (0);
+
+}
diff --git a/db/db_ovfl_vrfy.c b/db/db_ovfl_vrfy.c
new file mode 100644
index 0000000..fdd0201
--- /dev/null
+++ b/db/db_ovfl_vrfy.c
@@ -0,0 +1,409 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_vrfy_overflow --
+ *	Verify overflow page.
+ *
+ *	Runs the generic data-page checks, then records the page's
+ *	reference count and data length in the VRFY_PAGEINFO for use
+ *	by the later structure pass (__db_vrfy_ovfl_structure).
+ *
+ * PUBLIC: int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC:     u_int32_t));
+ */
+int
+__db_vrfy_overflow(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	VRFY_PAGEINFO *pip;
+	int isbad, ret, t_ret;
+
+	isbad = 0;
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+		/* DB_VERIFY_BAD is recoverable; keep checking this page. */
+		if (ret == DB_VERIFY_BAD)
+			isbad = 1;
+		else
+			goto err;
+	}
+
+	pip->refcount = OV_REF(h);
+	if (pip->refcount < 1) {
+		EPRINT((dbp->env,
+		    "Page %lu: overflow page has zero reference count",
+		    (u_long)pgno));
+		isbad = 1;
+	}
+
+	/* Just store for now; the structure pass sums these lengths. */
+	pip->olen = HOFFSET(h);
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0)
+		ret = t_ret;
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_ovfl_structure --
+ *	Walk a list of overflow pages, avoiding cycles and marking
+ *	pages seen.
+ *
+ *	pgno:	first page of the overflow chain.
+ *	tlen:	expected total length of the overflow item.
+ *	flags:	DB_ST_OVFL_LEAF indicates the chain is referenced from a
+ *	    leaf (or Hash data) page; DB_SALVAGE suppresses feedback.
+ *
+ * PUBLIC: int __db_vrfy_ovfl_structure
+ * PUBLIC:     __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t tlen;
+	u_int32_t flags;
+{
+	DB *pgset;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t next, prev;
+	int isbad, ret, seen_cnt, t_ret;
+	u_int32_t refcount;
+
+	env = dbp->env;
+	pgset = vdp->pgset;
+	DB_ASSERT(env, pgset != NULL);
+	isbad = 0;
+
+	/* This shouldn't happen, but just to be sure. */
+	if (!IS_VALID_PGNO(pgno))
+		return (DB_VERIFY_BAD);
+
+	/*
+	 * Check the first prev_pgno; it ought to be PGNO_INVALID,
+	 * since there's no prev page.
+	 */
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	/* The refcount is stored on the first overflow page. */
+	refcount = pip->refcount;
+
+	if (pip->type != P_OVERFLOW) {
+		EPRINT((env,
+		    "Page %lu: overflow page of invalid type %lu",
+		    (u_long)pgno, (u_long)pip->type));
+		ret = DB_VERIFY_BAD;
+		goto err;		/* Unsafe to continue. */
+	}
+
+	prev = pip->prev_pgno;
+	if (prev != PGNO_INVALID) {
+		EPRINT((env,
+	    "Page %lu: first page in overflow chain has a prev_pgno %lu",
+		    (u_long)pgno, (u_long)prev));
+		isbad = 1;
+	}
+
+	for (;;) {
+		/*
+		 * We may have seen this page elsewhere, if the overflow entry
+		 * has been promoted to an internal page; we just want to
+		 * make sure that each overflow page is seen exactly as many
+		 * times as its refcount dictates.
+		 *
+		 * Note that this code also serves to keep us from looping
+		 * infinitely if there's a cycle in an overflow chain.
+		 */
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, pgno, &seen_cnt)) != 0)
+			goto err;
+		if ((u_int32_t)seen_cnt > refcount) {
+			EPRINT((env,
+		"Page %lu: encountered too many times in overflow traversal",
+			    (u_long)pgno));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		if ((ret =
+		    __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
+			goto err;
+
+		/*
+		 * Each overflow page can be referenced multiple times,
+		 * because it's possible for overflow Btree keys to get
+		 * promoted to internal pages.  We want to make sure that
+		 * each page is referenced from a Btree leaf (or Hash data
+		 * page, which we consider a "leaf" here) exactly once; if
+		 * the parent was a leaf, set a flag to indicate that we've
+		 * seen this page in a leaf context.
+		 *
+		 * If the parent is not a leaf--in which case it's a Btree
+		 * internal page--we don't need to bother doing any further
+		 * verification, as we'll do it when we hit the leaf (or
+		 * complain that we never saw the leaf).  Only the first
+		 * page in an overflow chain should ever have a refcount
+		 * greater than 1, and the combination of the LEAFSEEN check
+		 * and the fact that we bail after the first page for
+		 * non-leaves should ensure this.
+		 *
+		 * Note that each "child" of a page, such as an overflow page,
+		 * is stored and verified in a structure check exactly once,
+		 * so this code does not need to contend with the fact that
+		 * overflow chains used as Btree duplicate keys may be
+		 * referenced multiply from a single Btree leaf page.
+		 */
+		if (LF_ISSET(DB_ST_OVFL_LEAF)) {
+			if (F_ISSET(pip, VRFY_OVFL_LEAFSEEN)) {
+				EPRINT((env,
+		"Page %lu: overflow page linked twice from leaf or data page",
+				    (u_long)pgno));
+				ret = DB_VERIFY_BAD;
+				goto err;
+			}
+			F_SET(pip, VRFY_OVFL_LEAFSEEN);
+		}
+
+		/*
+		 * We want to verify each overflow chain only once, and
+		 * although no chain should be linked more than once from a
+		 * leaf page, we can't guarantee that it'll be linked that
+		 * once if it's linked from an internal page and the key
+		 * is gone.
+		 *
+		 * seen_cnt is the number of times we'd encountered this page
+		 * before calling this function.
+		 */
+		if (seen_cnt == 0) {
+			/*
+			 * Keep a running tab on how much of the item we've
+			 * seen.
+			 *
+			 * NOTE(review): tlen is unsigned; if the chain holds
+			 * more data than expected this wraps to a huge value
+			 * and the final "tlen > 0" check flags the item --
+			 * but with an "incomplete" message.
+			 */
+			tlen -= pip->olen;
+
+			/* Send the application feedback about our progress. */
+			if (!LF_ISSET(DB_SALVAGE))
+				__db_vrfy_struct_feedback(dbp, vdp);
+		} else
+			goto done;
+
+		next = pip->next_pgno;
+
+		/* Are we there yet? */
+		if (next == PGNO_INVALID)
+			break;
+
+		/*
+		 * We've already checked this when we saved it, but just
+		 * to be sure...
+		 */
+		if (!IS_VALID_PGNO(next)) {
+			EPRINT((env,
+			    "Page %lu: bad next_pgno %lu on overflow page",
+			    (u_long)pgno, (u_long)next));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+
+		/* Swap the current page's info for the next page's. */
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 ||
+		    (ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0)
+			return (ret);
+		if (pip->prev_pgno != pgno) {
+			EPRINT((env,
+		"Page %lu: bad prev_pgno %lu on overflow page (should be %lu)",
+			    (u_long)next, (u_long)pip->prev_pgno,
+			    (u_long)pgno));
+			isbad = 1;
+			/*
+			 * It's safe to continue because we have separate
+			 * cycle detection.
+			 */
+		}
+
+		pgno = next;
+	}
+
+	if (tlen > 0) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: overflow item incomplete", (u_long)pgno));
+	}
+
+done:
+err:	if ((t_ret =
+	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_safe_goff --
+ *	Get an overflow item, very carefully, from an untrusted database,
+ *	in the context of the salvager.
+ *
+ *	"buf" is really a (void **): the address of the caller's buffer
+ *	pointer, passed as void * so __os_realloc can grow the buffer in
+ *	place; "bufsz" is the buffer's current size.  On success (or, with
+ *	DB_AGGRESSIVE, on partial success) "dbt" is pointed at the
+ *	collected bytes.
+ *
+ * PUBLIC: int __db_safe_goff __P((DB *, VRFY_DBINFO *,
+ * PUBLIC:     db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
+ */
+int
+__db_safe_goff(dbp, vdp, pgno, dbt, buf, bufsz, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	DBT *dbt;
+	void *buf;
+	u_int32_t *bufsz;
+	u_int32_t flags;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret, t_ret;
+	u_int32_t bytesgot, bytes;
+	u_int8_t *src, *dest;
+
+	mpf = dbp->mpf;
+	h = NULL;
+	ret = t_ret = 0;
+	bytesgot = bytes = 0;
+
+	DB_ASSERT(dbp->env, bufsz != NULL);
+
+	/*
+	 * Back up to the start of the overflow chain (if necessary) via the
+	 * prev pointer of the overflow page.  This guarantees we transverse
+	 * the longest possible chains of overflow pages and won't be called
+	 * again with a pgno earlier in the chain, stepping on ourselves.
+	 */
+	for (;;) {
+		if ((ret = __memp_fget(
+		    mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+			return (ret);
+
+		if (PREV_PGNO(h) == PGNO_INVALID ||
+		    !IS_VALID_PGNO(PREV_PGNO(h)))
+			break;
+
+		pgno = PREV_PGNO(h);
+
+		/* Release the page before fetching its predecessor. */
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+			return (ret);
+	}
+	if ((ret = __memp_fput(
+	    mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+		return (ret);
+
+	h = NULL;
+
+	/* Walk forward, accumulating each page's data into the buffer. */
+	while ((pgno != PGNO_INVALID) && (IS_VALID_PGNO(pgno))) {
+		/*
+		 * Mark that we're looking at this page; if we've seen it
+		 * already, quit.
+		 */
+		if ((ret = __db_salvage_markdone(vdp, pgno)) != 0)
+			break;
+
+		if ((ret = __memp_fget(mpf, &pgno,
+		    vdp->thread_info, NULL, 0, &h)) != 0)
+			break;
+
+		/*
+		 * Make sure it's really an overflow page, unless we're
+		 * being aggressive, in which case we pretend it is.
+		 */
+		if (!LF_ISSET(DB_AGGRESSIVE) && TYPE(h) != P_OVERFLOW) {
+			ret = DB_VERIFY_BAD;
+			break;
+		}
+
+		src = (u_int8_t *)h + P_OVERHEAD(dbp);
+		bytes = OV_LEN(h);
+
+		/* Don't trust OV_LEN: never read past the end of the page. */
+		if (bytes + P_OVERHEAD(dbp) > dbp->pgsize)
+			bytes = dbp->pgsize - P_OVERHEAD(dbp);
+
+		/*
+		 * Realloc if buf is too small
+		 */
+		if (bytesgot + bytes > *bufsz) {
+			if ((ret =
+			    __os_realloc(dbp->env, bytesgot + bytes, buf)) != 0)
+				break;
+			*bufsz = bytesgot + bytes;
+		}
+
+		dest = *(u_int8_t **)buf + bytesgot;
+		bytesgot += bytes;
+
+		memcpy(dest, src, bytes);
+
+		pgno = NEXT_PGNO(h);
+
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+			break;
+		h = NULL;
+	}
+
+	/*
+	 * If we're being aggressive, salvage a partial datum if there
+	 * was an error somewhere along the way.
+	 */
+	if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) {
+		dbt->size = bytesgot;
+		dbt->data = *(void **)buf;
+	}
+
+	/* If we broke out on error, don't leave pages pinned. */
+	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
diff --git a/db/db_pr.c b/db/db_pr.c
new file mode 100644
index 0000000..69fb7c7
--- /dev/null
+++ b/db/db_pr.c
@@ -0,0 +1,1659 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/db_verify.h"
+
+/*
+ * __db_loadme --
+ * A nice place to put a breakpoint.
+ *
+ * PUBLIC: void __db_loadme __P((void));
+ */
+void
+__db_loadme()
+{
+    /*
+     * Fetch our own process ID.  The call has no visible effect; the
+     * function exists only so a debugger has a convenient symbol to
+     * break on.
+     */
+    pid_t my_pid;
+
+    __os_id(NULL, &my_pid, NULL);
+}
+
+#ifdef HAVE_STATISTICS
+static int __db_bmeta __P((DB *, BTMETA *, u_int32_t));
+static int __db_hmeta __P((DB *, HMETA *, u_int32_t));
+static void __db_meta __P((DB *, DBMETA *, FN const *, u_int32_t));
+static const char *__db_pagetype_to_string __P((u_int32_t));
+static void __db_prdb __P((DB *, u_int32_t));
+static void __db_proff __P((ENV *, DB_MSGBUF *, void *));
+static int __db_prtree __P((DB *, DB_TXN *, u_int32_t));
+static int __db_qmeta __P((DB *, QMETA *, u_int32_t));
+
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *, char *, char *));
+ */
+int
+__db_dumptree(dbp, txn, op, name)
+    DB *dbp;        /* open database handle */
+    DB_TXN *txn;    /* transaction for page fetches, may be NULL */
+    char *op, *name;/* op: option characters; name: output file or NULL */
+{
+    ENV *env;
+    FILE *fp, *orig_fp;
+    u_int32_t flags;
+    int ret;
+
+    env = dbp->env;
+
+    /*
+     * Parse the option string: 'a' dumps every page body (DB_PR_PAGE),
+     * 'h' is accepted but does nothing, 'r' enables recovery-test mode
+     * (DB_PR_RECOVERYTEST).  Anything else is an error.
+     */
+    for (flags = 0; *op != '\0'; ++op)
+        switch (*op) {
+        case 'a':
+            LF_SET(DB_PR_PAGE);
+            break;
+        case 'h':
+            break;
+        case 'r':
+            LF_SET(DB_PR_RECOVERYTEST);
+            break;
+        default:
+            return (EINVAL);
+        }
+
+    /*
+     * If a file name was given, temporarily redirect the environment's
+     * message stream to it; the original stream is restored below.
+     */
+    if (name != NULL) {
+        if ((fp = fopen(name, "w")) == NULL)
+            return (__os_get_errno());
+
+        orig_fp = dbp->dbenv->db_msgfile;
+        dbp->dbenv->db_msgfile = fp;
+    } else
+        fp = orig_fp = NULL;
+
+    /* Dump the in-memory DB structure, a separator line, then the tree. */
+    __db_prdb(dbp, flags);
+
+    __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+    ret = __db_prtree(dbp, txn, flags);
+
+    /* Restore the message stream; the fclose result is ignored. */
+    if (fp != NULL) {
+        (void)fclose(fp);
+        env->dbenv->db_msgfile = orig_fp;
+    }
+
+    return (ret);
+}
+
+/*
+ * __db_flags_fn --
+ *	Display names for the DB_AM_* access-method flag bits, used when
+ *	pretty-printing dbp->flags.  The table is terminated by a zero mask.
+ */
+static const FN __db_flags_fn[] = {
+    { DB_AM_CHKSUM, "checksumming" },
+    { DB_AM_COMPENSATE, "created by compensating transaction" },
+    { DB_AM_CREATED, "database created" },
+    { DB_AM_CREATED_MSTR, "encompassing file created" },
+    { DB_AM_DBM_ERROR, "dbm/ndbm error" },
+    { DB_AM_DELIMITER, "variable length" },
+    { DB_AM_DISCARD, "discard cached pages" },
+    { DB_AM_DUP, "duplicates" },
+    { DB_AM_DUPSORT, "sorted duplicates" },
+    { DB_AM_ENCRYPT, "encrypted" },
+    { DB_AM_FIXEDLEN, "fixed-length records" },
+    { DB_AM_INMEM, "in-memory" },
+    { DB_AM_IN_RENAME, "file is being renamed" },
+    { DB_AM_NOT_DURABLE, "changes not logged" },
+    { DB_AM_OPEN_CALLED, "open called" },
+    { DB_AM_PAD, "pad value" },
+    { DB_AM_PGDEF, "default page size" },
+    { DB_AM_RDONLY, "read-only" },
+    { DB_AM_READ_UNCOMMITTED, "read-uncommitted" },
+    { DB_AM_RECNUM, "Btree record numbers" },
+    { DB_AM_RECOVER, "opened for recovery" },
+    { DB_AM_RENUMBER, "renumber" },
+    { DB_AM_REVSPLITOFF, "no reverse splits" },
+    { DB_AM_SECONDARY, "secondary" },
+    { DB_AM_SNAPSHOT, "load on open" },
+    { DB_AM_SUBDB, "subdatabases" },
+    { DB_AM_SWAP, "needswap" },
+    { DB_AM_TXN, "transactional" },
+    { DB_AM_VERIFYING, "verifier" },
+    { 0, NULL }
+};
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+    /* Expose the flag-name table to external callers (the Tcl API). */
+    return (__db_flags_fn);
+}
+
+/*
+ * __db_prdb --
+ * Print out the DB structure information.
+ */
+static void
+__db_prdb(dbp, flags)
+    DB *dbp;
+    u_int32_t flags;
+{
+    BTREE *bt;
+    DB_MSGBUF mb;
+    ENV *env;
+    HASH *h;
+    QUEUE *q;
+
+    env = dbp->env;
+
+    /* Header: access-method type and the dbp->flags word, with names. */
+    DB_MSGBUF_INIT(&mb);
+    __db_msg(env, "In-memory DB structure:");
+    __db_msgadd(env, &mb, "%s: %#lx",
+        __db_dbtype_to_string(dbp->type), (u_long)dbp->flags);
+    __db_prflags(env, &mb, dbp->flags, __db_flags_fn, " (", ")");
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    /*
+     * Per-access-method internals.  Function-pointer values are only
+     * shown outside recovery-test mode (DB_PR_RECOVERYTEST).
+     */
+    switch (dbp->type) {
+    case DB_BTREE:
+    case DB_RECNO:
+        bt = dbp->bt_internal;
+        __db_msg(env, "bt_meta: %lu bt_root: %lu",
+            (u_long)bt->bt_meta, (u_long)bt->bt_root);
+        __db_msg(env, "bt_minkey: %lu", (u_long)bt->bt_minkey);
+        if (!LF_ISSET(DB_PR_RECOVERYTEST))
+            __db_msg(env, "bt_compare: %#lx bt_prefix: %#lx",
+                P_TO_ULONG(bt->bt_compare),
+                P_TO_ULONG(bt->bt_prefix));
+#ifdef HAVE_COMPRESSION
+        if (!LF_ISSET(DB_PR_RECOVERYTEST))
+            __db_msg(env, "bt_compress: %#lx bt_decompress: %#lx",
+                P_TO_ULONG(bt->bt_compress),
+                P_TO_ULONG(bt->bt_decompress));
+#endif
+        __db_msg(env, "bt_lpgno: %lu", (u_long)bt->bt_lpgno);
+        /* Recno-only fields. */
+        if (dbp->type == DB_RECNO) {
+            __db_msg(env,
+    "re_pad: %#lx re_delim: %#lx re_len: %lu re_source: %s",
+                (u_long)bt->re_pad, (u_long)bt->re_delim,
+                (u_long)bt->re_len,
+                bt->re_source == NULL ? "" : bt->re_source);
+            __db_msg(env,
+                "re_modified: %d re_eof: %d re_last: %lu",
+                bt->re_modified, bt->re_eof, (u_long)bt->re_last);
+        }
+        break;
+    case DB_HASH:
+        h = dbp->h_internal;
+        __db_msg(env, "meta_pgno: %lu", (u_long)h->meta_pgno);
+        __db_msg(env, "h_ffactor: %lu", (u_long)h->h_ffactor);
+        __db_msg(env, "h_nelem: %lu", (u_long)h->h_nelem);
+        if (!LF_ISSET(DB_PR_RECOVERYTEST))
+            __db_msg(env, "h_hash: %#lx", P_TO_ULONG(h->h_hash));
+        break;
+    case DB_QUEUE:
+        q = dbp->q_internal;
+        __db_msg(env, "q_meta: %lu", (u_long)q->q_meta);
+        __db_msg(env, "q_root: %lu", (u_long)q->q_root);
+        __db_msg(env, "re_pad: %#lx re_len: %lu",
+            (u_long)q->re_pad, (u_long)q->re_len);
+        __db_msg(env, "rec_page: %lu", (u_long)q->rec_page);
+        __db_msg(env, "page_ext: %lu", (u_long)q->page_ext);
+        break;
+    case DB_UNKNOWN:
+    default:
+        /* Nothing to display for an unknown access method. */
+        break;
+    }
+}
+
+/*
+ * __db_prtree --
+ * Print out the entire tree.
+ */
+static int
+__db_prtree(dbp, txn, flags)
+    DB *dbp;
+    DB_TXN *txn;
+    u_int32_t flags;
+{
+    DB_MPOOLFILE *mpf;
+    PAGE *h;
+    db_pgno_t i, last;
+    int ret;
+
+    mpf = dbp->mpf;
+
+    /* Queue databases are laid out differently; use their own dumper. */
+    if (dbp->type == DB_QUEUE)
+        return (__db_prqueue(dbp, flags));
+
+    /*
+     * Find out the page number of the last page in the database, then
+     * dump each page.
+     */
+    if ((ret = __memp_get_last_pgno(mpf, &last)) != 0)
+        return (ret);
+    for (i = 0; i <= last; ++i) {
+        if ((ret = __memp_fget(mpf, &i, NULL, txn, 0, &h)) != 0)
+            return (ret);
+        /* Per-page print errors are ignored; keep dumping. */
+        (void)__db_prpage(dbp, h, flags);
+        if ((ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0)
+            return (ret);
+    }
+
+    return (0);
+}
+
+/*
+ * __db_meta --
+ * Print out common metadata information.
+ */
+static void
+__db_meta(dbp, dbmeta, fn, flags)
+    DB *dbp;
+    DBMETA *dbmeta;  /* generic meta-data header shared by all AMs */
+    FN const *fn;    /* flag-name table for dbmeta->flags, may be NULL */
+    u_int32_t flags;
+{
+    DB_MPOOLFILE *mpf;
+    DB_MSGBUF mb;
+    ENV *env;
+    PAGE *h;
+    db_pgno_t pgno;
+    u_int8_t *p;
+    int cnt, ret;
+    const char *sep;
+
+    env = dbp->env;
+    mpf = dbp->mpf;
+    DB_MSGBUF_INIT(&mb);
+
+    __db_msg(env, "\tmagic: %#lx", (u_long)dbmeta->magic);
+    __db_msg(env, "\tversion: %lu", (u_long)dbmeta->version);
+    __db_msg(env, "\tpagesize: %lu", (u_long)dbmeta->pagesize);
+    __db_msg(env, "\ttype: %lu", (u_long)dbmeta->type);
+    __db_msg(env, "\tmetaflags %#lx", (u_long)dbmeta->metaflags);
+    __db_msg(env, "\tkeys: %lu\trecords: %lu",
+        (u_long)dbmeta->key_count, (u_long)dbmeta->record_count);
+    /* Partition count is only printed when the database is partitioned. */
+    if (dbmeta->nparts)
+        __db_msg(env, "\tnparts: %lu", (u_long)dbmeta->nparts);
+
+    /*
+     * If we're doing recovery testing, don't display the free list,
+     * it may have changed and that makes the dump diff not work.
+     */
+    if (!LF_ISSET(DB_PR_RECOVERYTEST)) {
+        __db_msgadd(
+            env, &mb, "\tfree list: %lu", (u_long)dbmeta->free);
+        /*
+         * Walk the free-page chain via each page's next_pgno link.
+         * NOTE(review): pgno is advanced to next_pgno *before* it is
+         * printed, so the head page appears only in the header value
+         * and the printed chain ends with 0 (PGNO_INVALID) -- confirm
+         * this matches the intended dump format.
+         */
+        for (pgno = dbmeta->free,
+            cnt = 0, sep = ", "; pgno != PGNO_INVALID;) {
+            if ((ret = __memp_fget(mpf,
+                &pgno, NULL, NULL, 0, &h)) != 0) {
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env,
+            "Unable to retrieve free-list page: %lu: %s",
+                    (u_long)pgno, db_strerror(ret));
+                break;
+            }
+            pgno = h->next_pgno;
+            (void)__memp_fput(mpf, NULL, h, dbp->priority);
+            __db_msgadd(env, &mb, "%s%lu", sep, (u_long)pgno);
+            /* Start a fresh output line every ten entries. */
+            if (++cnt % 10 == 0) {
+                DB_MSGBUF_FLUSH(env, &mb);
+                cnt = 0;
+                sep = "\t";
+            } else
+                sep = ", ";
+        }
+        DB_MSGBUF_FLUSH(env, &mb);
+        __db_msg(env, "\tlast_pgno: %lu", (u_long)dbmeta->last_pgno);
+    }
+
+    /* Access-method-specific flag names, when the caller supplied any. */
+    if (fn != NULL) {
+        DB_MSGBUF_FLUSH(env, &mb);
+        __db_msgadd(env, &mb, "\tflags: %#lx", (u_long)dbmeta->flags);
+        __db_prflags(env, &mb, dbmeta->flags, fn, " (", ")");
+    }
+
+    /* The file's unique ID, byte by byte in hex. */
+    DB_MSGBUF_FLUSH(env, &mb);
+    __db_msgadd(env, &mb, "\tuid: ");
+    for (p = (u_int8_t *)dbmeta->uid,
+        cnt = 0; cnt < DB_FILE_ID_LEN; ++cnt) {
+        __db_msgadd(env, &mb, "%x", *p++);
+        if (cnt < DB_FILE_ID_LEN - 1)
+            __db_msgadd(env, &mb, " ");
+    }
+    DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __db_bmeta --
+ * Print out the btree meta-data page.
+ */
+static int
+__db_bmeta(dbp, h, flags)
+    DB *dbp;
+    BTMETA *h;
+    u_int32_t flags;
+{
+    /* Names for the BTM_* meta-page flag bits. */
+    static const FN fn[] = {
+        { BTM_DUP, "duplicates" },
+        { BTM_RECNO, "recno" },
+        { BTM_RECNUM, "btree:recnum" },
+        { BTM_FIXEDLEN, "recno:fixed-length" },
+        { BTM_RENUMBER, "recno:renumber" },
+        { BTM_SUBDB, "multiple-databases" },
+        { BTM_DUPSORT, "sorted duplicates" },
+        { BTM_COMPRESS, "compressed" },
+        { 0, NULL }
+    };
+    ENV *env;
+
+    env = dbp->env;
+
+    /* Common DBMETA fields first, then the btree-specific ones. */
+    __db_meta(dbp, (DBMETA *)h, fn, flags);
+
+    __db_msg(env, "\tminkey: %lu", (u_long)h->minkey);
+    if (dbp->type == DB_RECNO)
+        __db_msg(env, "\tre_len: %#lx re_pad: %#lx",
+            (u_long)h->re_len, (u_long)h->re_pad);
+    __db_msg(env, "\troot: %lu", (u_long)h->root);
+
+    return (0);
+}
+
+/*
+ * __db_hmeta --
+ * Print out the hash meta-data page.
+ */
+static int
+__db_hmeta(dbp, h, flags)
+    DB *dbp;
+    HMETA *h;
+    u_int32_t flags;
+{
+    /* Names for the DB_HASH_* meta-page flag bits. */
+    static const FN fn[] = {
+        { DB_HASH_DUP, "duplicates" },
+        { DB_HASH_SUBDB, "multiple-databases" },
+        { DB_HASH_DUPSORT, "sorted duplicates" },
+        { 0, NULL }
+    };
+    ENV *env;
+    DB_MSGBUF mb;
+    int i;
+
+    env = dbp->env;
+    DB_MSGBUF_INIT(&mb);
+
+    /* Common DBMETA fields first, then the hash-specific ones. */
+    __db_meta(dbp, (DBMETA *)h, fn, flags);
+
+    __db_msg(env, "\tmax_bucket: %lu", (u_long)h->max_bucket);
+    __db_msg(env, "\thigh_mask: %#lx", (u_long)h->high_mask);
+    __db_msg(env, "\tlow_mask: %#lx", (u_long)h->low_mask);
+    __db_msg(env, "\tffactor: %lu", (u_long)h->ffactor);
+    __db_msg(env, "\tnelem: %lu", (u_long)h->nelem);
+    __db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey);
+    /* The spare-page pointers, all on one line. */
+    __db_msgadd(env, &mb, "\tspare points: ");
+    for (i = 0; i < NCACHED; i++)
+        __db_msgadd(env, &mb, "%lu ", (u_long)h->spares[i]);
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    return (0);
+}
+
+/*
+ * __db_qmeta --
+ * Print out the queue meta-data page.
+ */
+static int
+__db_qmeta(dbp, h, flags)
+    DB *dbp;
+    QMETA *h;
+    u_int32_t flags;
+{
+    ENV *env;
+
+    env = dbp->env;
+
+    /* Common DBMETA fields first; queue has no meta-flag name table. */
+    __db_meta(dbp, (DBMETA *)h, NULL, flags);
+
+    __db_msg(env, "\tfirst_recno: %lu", (u_long)h->first_recno);
+    __db_msg(env, "\tcur_recno: %lu", (u_long)h->cur_recno);
+    /*
+     * Print the record length in decimal and the pad byte in hex,
+     * matching __db_prdb's queue output; the conversions were
+     * previously swapped (length in hex, pad byte in decimal).
+     */
+    __db_msg(env, "\tre_len: %lu re_pad: %#lx",
+        (u_long)h->re_len, (u_long)h->re_pad);
+    __db_msg(env, "\trec_page: %lu", (u_long)h->rec_page);
+    __db_msg(env, "\tpage_ext: %lu", (u_long)h->page_ext);
+
+    return (0);
+}
+
+/*
+ * __db_prnpage
+ * -- Print out a specific page.
+ *
+ * PUBLIC: int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
+ */
+int
+__db_prnpage(dbp, txn, pgno)
+    DB *dbp;
+    DB_TXN *txn;
+    db_pgno_t pgno;
+{
+    DB_MPOOLFILE *mpf;
+    PAGE *page;
+    int put_ret, ret;
+
+    mpf = dbp->mpf;
+
+    /* Pin the requested page, print it in full, then release it. */
+    ret = __memp_fget(mpf, &pgno, NULL, txn, 0, &page);
+    if (ret != 0)
+        return (ret);
+
+    ret = __db_prpage(dbp, page, DB_PR_PAGE);
+
+    /* A put failure is only reported if nothing failed earlier. */
+    put_ret = __memp_fput(mpf, NULL, page, dbp->priority);
+    if (ret == 0 && put_ret != 0)
+        ret = put_ret;
+
+    return (ret);
+}
+
+/*
+ * __db_prpage
+ * -- Print out a page.
+ *
+ * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t));
+ */
+int
+__db_prpage(dbp, h, flags)
+    DB *dbp;
+    PAGE *h;
+    u_int32_t flags;  /* DB_PR_PAGE to dump entries, DB_PR_RECOVERYTEST */
+{
+    BINTERNAL *bi;
+    BKEYDATA *bk;
+    DB_MSGBUF mb;
+    ENV *env;
+    HOFFPAGE a_hkd;
+    QAMDATA *qp, *qep;
+    RINTERNAL *ri;
+    db_indx_t dlen, len, i, *inp;
+    db_pgno_t pgno;
+    db_recno_t recno;
+    u_int32_t pagesize, qlen;
+    u_int8_t *ep, *hk, *p;
+    int deleted, ret;
+    const char *s;
+    void *sp;
+
+    env = dbp->env;
+    DB_MSGBUF_INIT(&mb);
+
+    /*
+     * If we're doing recovery testing and this page is P_INVALID,
+     * assume it's a page that's on the free list, and don't display it.
+     */
+    if (LF_ISSET(DB_PR_RECOVERYTEST) && TYPE(h) == P_INVALID)
+        return (0);
+
+    if ((s = __db_pagetype_to_string(TYPE(h))) == NULL) {
+        __db_msg(env, "ILLEGAL PAGE TYPE: page: %lu type: %lu",
+            (u_long)h->pgno, (u_long)TYPE(h));
+        return (EINVAL);
+    }
+
+    /*
+     * !!!
+     * Find out the page size.  We don't want to do it the "right" way,
+     * by reading the value from the meta-data page, that's going to be
+     * slow.  Reach down into the mpool region.
+     */
+    pagesize = (u_int32_t)dbp->mpf->mfp->stat.st_pagesize;
+
+    /* Page number, page type. */
+    __db_msgadd(env, &mb, "page %lu: %s:", (u_long)h->pgno, s);
+
+    /*
+     * LSNs on a metadata page will be different from the original after an
+     * abort, in some cases.  Don't display them if we're testing recovery.
+     */
+    if (!LF_ISSET(DB_PR_RECOVERYTEST) ||
+        (TYPE(h) != P_BTREEMETA && TYPE(h) != P_HASHMETA &&
+        TYPE(h) != P_QAMMETA && TYPE(h) != P_QAMDATA))
+        __db_msgadd(env, &mb, " LSN [%lu][%lu]:",
+            (u_long)LSN(h).file, (u_long)LSN(h).offset);
+
+    /*
+     * Page level (only applicable for Btree/Recno, but we always display
+     * it, for no particular reason.
+     */
+    __db_msgadd(env, &mb, " level %lu", (u_long)h->level);
+
+    /* Record count (internal pages, and the recno root leaf). */
+    if (TYPE(h) == P_IBTREE ||
+        TYPE(h) == P_IRECNO || (TYPE(h) == P_LRECNO &&
+        h->pgno == ((BTREE *)dbp->bt_internal)->bt_root))
+        __db_msgadd(env, &mb, " records: %lu", (u_long)RE_NREC(h));
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    /* Meta pages and queue data pages have dedicated formats. */
+    switch (TYPE(h)) {
+    case P_BTREEMETA:
+        return (__db_bmeta(dbp, (BTMETA *)h, flags));
+    case P_HASHMETA:
+        return (__db_hmeta(dbp, (HMETA *)h, flags));
+    case P_QAMMETA:
+        return (__db_qmeta(dbp, (QMETA *)h, flags));
+    case P_QAMDATA:        /* Should be meta->start. */
+        if (!LF_ISSET(DB_PR_PAGE))
+            return (0);
+
+        /* Walk the fixed-length queue records on the page. */
+        qlen = ((QUEUE *)dbp->q_internal)->re_len;
+        recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
+        i = 0;
+        qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
+        for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
+            recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
+            if (!F_ISSET(qp, QAM_SET))
+                continue;
+
+            /* " D" marks a record that is set but not valid. */
+            __db_msgadd(env, &mb, "%s",
+                F_ISSET(qp, QAM_VALID) ? "\t" : " D");
+            __db_msgadd(env, &mb, "[%03lu] %4lu ", (u_long)recno,
+                (u_long)((u_int8_t *)qp - (u_int8_t *)h));
+            __db_prbytes(env, &mb, qp->data, qlen);
+        }
+        return (0);
+    default:
+        break;
+    }
+
+    /* Sibling links (leaf/overflow pages only) and entry counts. */
+    s = "\t";
+    if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
+        __db_msgadd(env, &mb, "%sprev: %4lu next: %4lu",
+            s, (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
+        s = " ";
+    }
+    if (TYPE(h) == P_OVERFLOW) {
+        __db_msgadd(env, &mb,
+            "%sref cnt: %4lu ", s, (u_long)OV_REF(h));
+        __db_prbytes(env,
+            &mb, (u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h));
+        return (0);
+    }
+    __db_msgadd(env, &mb, "%sentries: %4lu", s, (u_long)NUM_ENT(h));
+    __db_msgadd(env, &mb, " offset: %4lu", (u_long)HOFFSET(h));
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    if (TYPE(h) == P_INVALID || !LF_ISSET(DB_PR_PAGE))
+        return (0);
+
+    /* Dump every entry, guarding against out-of-range in-page offsets. */
+    ret = 0;
+    inp = P_INP(dbp, h);
+    for (i = 0; i < NUM_ENT(h); i++) {
+        if ((uintptr_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) <
+            (uintptr_t)(P_OVERHEAD(dbp)) ||
+            (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) {
+            __db_msg(env,
+                "ILLEGAL PAGE OFFSET: indx: %lu of %lu",
+                (u_long)i, (u_long)inp[i]);
+            ret = EINVAL;
+            continue;
+        }
+        deleted = 0;
+        /* Locate the entry and decide whether it's marked deleted. */
+        switch (TYPE(h)) {
+        case P_HASH_UNSORTED:
+        case P_HASH:
+        case P_IBTREE:
+        case P_IRECNO:
+            sp = P_ENTRY(dbp, h, i);
+            break;
+        case P_LBTREE:
+            /* On btree leaves, items alternate key/data pairs. */
+            sp = P_ENTRY(dbp, h, i);
+            deleted = i % 2 == 0 &&
+                B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type);
+            break;
+        case P_LDUP:
+        case P_LRECNO:
+            sp = P_ENTRY(dbp, h, i);
+            deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type);
+            break;
+        default:
+            goto type_err;
+        }
+        __db_msgadd(env, &mb, "%s", deleted ? " D" : "\t");
+        __db_msgadd(
+            env, &mb, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]);
+        /* Format the entry according to the page type. */
+        switch (TYPE(h)) {
+        case P_HASH_UNSORTED:
+        case P_HASH:
+            hk = sp;
+            switch (HPAGE_PTYPE(hk)) {
+            case H_OFFDUP:
+                memcpy(&pgno,
+                    HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
+                __db_msgadd(env, &mb,
+                    "%4lu [offpage dups]", (u_long)pgno);
+                DB_MSGBUF_FLUSH(env, &mb);
+                break;
+            case H_DUPLICATE:
+                /*
+                 * If this is the first item on a page, then
+                 * we cannot figure out how long it is, so
+                 * we only print the first one in the duplicate
+                 * set.
+                 */
+                if (i != 0)
+                    len = LEN_HKEYDATA(dbp, h, 0, i);
+                else
+                    len = 1;
+
+                __db_msgadd(env, &mb, "Duplicates:");
+                DB_MSGBUF_FLUSH(env, &mb);
+                /* Each dup is a length-prefixed byte string. */
+                for (p = HKEYDATA_DATA(hk),
+                    ep = p + len; p < ep;) {
+                    memcpy(&dlen, p, sizeof(db_indx_t));
+                    p += sizeof(db_indx_t);
+                    __db_msgadd(env, &mb, "\t\t");
+                    __db_prbytes(env, &mb, p, dlen);
+                    p += sizeof(db_indx_t) + dlen;
+                }
+                break;
+            case H_KEYDATA:
+                __db_prbytes(env, &mb, HKEYDATA_DATA(hk),
+                    LEN_HKEYDATA(dbp, h, i == 0 ?
+                    pagesize : 0, i));
+                break;
+            case H_OFFPAGE:
+                /* Copy out: the item may be unaligned. */
+                memcpy(&a_hkd, hk, HOFFPAGE_SIZE);
+                __db_msgadd(env, &mb,
+                    "overflow: total len: %4lu page: %4lu",
+                    (u_long)a_hkd.tlen, (u_long)a_hkd.pgno);
+                DB_MSGBUF_FLUSH(env, &mb);
+                break;
+            default:
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu",
+                    (u_long)HPAGE_PTYPE(hk));
+                ret = EINVAL;
+                break;
+            }
+            break;
+        case P_IBTREE:
+            bi = sp;
+
+            if (F_ISSET(dbp, DB_AM_RECNUM))
+                __db_msgadd(env, &mb,
+                    "count: %4lu ", (u_long)bi->nrecs);
+            __db_msgadd(env, &mb,
+                "pgno: %4lu type: %lu ",
+                (u_long)bi->pgno, (u_long)bi->type);
+            switch (B_TYPE(bi->type)) {
+            case B_KEYDATA:
+                __db_prbytes(env, &mb, bi->data, bi->len);
+                break;
+            case B_DUPLICATE:
+            case B_OVERFLOW:
+                __db_proff(env, &mb, bi->data);
+                break;
+            default:
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu",
+                    (u_long)B_TYPE(bi->type));
+                ret = EINVAL;
+                break;
+            }
+            break;
+        case P_IRECNO:
+            ri = sp;
+            __db_msgadd(env, &mb, "entries %4lu pgno %4lu",
+                (u_long)ri->nrecs, (u_long)ri->pgno);
+            DB_MSGBUF_FLUSH(env, &mb);
+            break;
+        case P_LBTREE:
+        case P_LDUP:
+        case P_LRECNO:
+            bk = sp;
+            switch (B_TYPE(bk->type)) {
+            case B_KEYDATA:
+                __db_prbytes(env, &mb, bk->data, bk->len);
+                break;
+            case B_DUPLICATE:
+            case B_OVERFLOW:
+                __db_proff(env, &mb, bk);
+                break;
+            default:
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env,
+            "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu",
+                    (u_long)B_TYPE(bk->type));
+                ret = EINVAL;
+                break;
+            }
+            break;
+        default:
+type_err:        DB_MSGBUF_FLUSH(env, &mb);
+            __db_msg(env,
+                "ILLEGAL PAGE TYPE: %lu", (u_long)TYPE(h));
+            ret = EINVAL;
+            continue;
+        }
+    }
+    return (ret);
+}
+
+/*
+ * __db_prbytes --
+ * Print out a data element.
+ *
+ * PUBLIC: void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
+ */
+void
+__db_prbytes(env, mbp, bytes, len)
+    ENV *env;
+    DB_MSGBUF *mbp;
+    u_int8_t *bytes;
+    u_int32_t len;
+{
+    u_int8_t *ptr;
+    u_int32_t remaining;
+    int is_text, truncated;
+
+    __db_msgadd(env, mbp, "len: %3lu", (u_long)len);
+    if (len != 0) {
+        __db_msgadd(env, mbp, " data: ");
+
+        /*
+         * Display at most the first 20 bytes.  If every byte in that
+         * window is printable (tabs and newlines count as printable),
+         * show it as text, otherwise show it in hex -- the items we
+         * print here (lock objects, keys, data) can be either text or
+         * binary data.
+         */
+        truncated = 0;
+        if (len > 20) {
+            len = 20;
+            truncated = 1;
+        }
+        is_text = 1;
+        for (ptr = bytes, remaining = len;
+            remaining > 0; --remaining, ++ptr)
+            if (!isprint((int)*ptr) &&
+                *ptr != '\t' && *ptr != '\n') {
+                is_text = 0;
+                break;
+            }
+        for (ptr = bytes, remaining = len;
+            remaining > 0; --remaining, ++ptr)
+            if (is_text)
+                __db_msgadd(env, mbp, "%c", *ptr);
+            else
+                __db_msgadd(env, mbp, "%#.2x", (u_int)*ptr);
+        if (truncated)
+            __db_msgadd(env, mbp, "...");
+    }
+    DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_proff --
+ * Print out an off-page element.
+ */
+static void
+__db_proff(env, mbp, vp)
+    ENV *env;
+    DB_MSGBUF *mbp;
+    void *vp;  /* a BINTERNAL/BKEYDATA item known to be off-page */
+{
+    BOVERFLOW *bop;
+
+    bop = vp;
+    /* Off-page items are either overflow chains or duplicate trees. */
+    if (B_TYPE(bop->type) == B_OVERFLOW)
+        __db_msgadd(env, mbp,
+            "overflow: total len: %4lu page: %4lu",
+            (u_long)bop->tlen, (u_long)bop->pgno);
+    else if (B_TYPE(bop->type) == B_DUPLICATE)
+        __db_msgadd(
+            env, mbp, "duplicate: page: %4lu", (u_long)bop->pgno);
+    /* Any other type is unexpected here; print nothing for it. */
+    DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_prflags --
+ * Print out flags values.
+ *
+ * PUBLIC: void __db_prflags __P((ENV *, DB_MSGBUF *,
+ * PUBLIC: u_int32_t, const FN *, const char *, const char *));
+ */
+void
+__db_prflags(env, mbp, flags, fn, prefix, suffix)
+    ENV *env;
+    DB_MSGBUF *mbp;   /* output buffer; NULL means standalone message */
+    u_int32_t flags;  /* flag word to decode */
+    FN const *fn;     /* zero-mask-terminated name table */
+    const char *prefix, *suffix;
+{
+    DB_MSGBUF mb;
+    const FN *fnp;
+    int found, standalone;
+    const char *sep;
+
+    if (fn == NULL)
+        return;
+
+    /*
+     * If it's a standalone message, output the suffix (which will be the
+     * label), regardless of whether we found anything or not, and flush
+     * the line.
+     */
+    if (mbp == NULL) {
+        standalone = 1;
+        mbp = &mb;
+        DB_MSGBUF_INIT(mbp);
+    } else
+        standalone = 0;
+
+    /* Emit the name of every flag bit that is set, comma-separated. */
+    sep = prefix == NULL ? "" : prefix;
+    for (found = 0, fnp = fn; fnp->mask != 0; ++fnp)
+        if (LF_ISSET(fnp->mask)) {
+            __db_msgadd(env, mbp, "%s%s", sep, fnp->name);
+            sep = ", ";
+            found = 1;
+        }
+
+    if ((standalone || found) && suffix != NULL)
+        __db_msgadd(env, mbp, "%s", suffix);
+    if (standalone)
+        DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_lockmode_to_string --
+ * Return the name of the lock mode.
+ *
+ * PUBLIC: const char * __db_lockmode_to_string __P((db_lockmode_t));
+ */
+const char *
+__db_lockmode_to_string(mode)
+    db_lockmode_t mode;
+{
+    /* Table mapping each lock mode to its display name. */
+    static const struct {
+        db_lockmode_t mode;
+        const char *name;
+    } mode_names[] = {
+        { DB_LOCK_NG,               "Not granted" },
+        { DB_LOCK_READ,             "Shared/read" },
+        { DB_LOCK_WRITE,            "Exclusive/write" },
+        { DB_LOCK_WAIT,             "Wait for event" },
+        { DB_LOCK_IWRITE,           "Intent exclusive/write" },
+        { DB_LOCK_IREAD,            "Intent shared/read" },
+        { DB_LOCK_IWR,              "Intent to read/write" },
+        { DB_LOCK_READ_UNCOMMITTED, "Read uncommitted" },
+        { DB_LOCK_WWRITE,           "Was written" },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(mode_names) / sizeof(mode_names[0]); ++i)
+        if (mode_names[i].mode == mode)
+            return (mode_names[i].name);
+    return ("UNKNOWN LOCK MODE");
+}
+
+/*
+ * __db_pagetype_to_string --
+ * Return the name of the specified page type.
+ */
+static const char *
+__db_pagetype_to_string(type)
+    u_int32_t type;
+{
+    /*
+     * Return the display name for a page type, or NULL when the type
+     * is not recognized (the caller reports NULL as an illegal page
+     * type).
+     */
+    switch (type) {
+    case P_BTREEMETA:
+        return ("btree metadata");
+    case P_LDUP:
+        return ("duplicate");
+    case P_HASH_UNSORTED:
+        return ("hash unsorted");
+    case P_HASH:
+        return ("hash");
+    case P_HASHMETA:
+        return ("hash metadata");
+    case P_IBTREE:
+        return ("btree internal");
+    case P_INVALID:
+        return ("invalid");
+    case P_IRECNO:
+        return ("recno internal");
+    case P_LBTREE:
+        return ("btree leaf");
+    case P_LRECNO:
+        return ("recno leaf");
+    case P_OVERFLOW:
+        return ("overflow");
+    case P_QAMMETA:
+        return ("queue metadata");
+    case P_QAMDATA:
+        return ("queue");
+    default:
+        return (NULL);
+    }
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *, char *, char *));
+ */
+int
+__db_dumptree(dbp, txn, op, name)
+    DB *dbp;
+    DB_TXN *txn;
+    char *op, *name;
+{
+    /*
+     * Stub compiled when HAVE_STATISTICS is not defined: silence the
+     * unused-parameter warnings and report that statistics support
+     * was not built into this library.
+     */
+    COMPQUIET(txn, NULL);
+    COMPQUIET(op, NULL);
+    COMPQUIET(name, NULL);
+
+    return (__db_stat_not_built(dbp->env));
+}
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+    /*
+     * !!!
+     * The Tcl API uses this interface, stub it off: without statistics
+     * support there is no flag-name table, so return NULL.
+     */
+    return (NULL);
+}
+#endif
+
+/*
+ * __db_dump_pp --
+ * DB->dump pre/post processing.
+ *
+ * PUBLIC: int __db_dump_pp __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump_pp(dbp, subname, callback, handle, pflag, keyflag)
+    DB *dbp;
+    const char *subname;
+    int (*callback) __P((void *, const void *));
+    void *handle;
+    int pflag, keyflag;
+{
+    DB_THREAD_INFO *ip;
+    ENV *env;
+    int handle_check, ret, t_ret;
+
+    env = dbp->env;
+
+    DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->dump");
+
+    ENV_ENTER(env, ip);
+
+    /* Check for replication block. */
+    handle_check = IS_ENV_REPLICATED(env);
+    if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 1)) != 0) {
+        /* We never entered; don't try to exit in the cleanup path. */
+        handle_check = 0;
+        goto err;
+    }
+
+    /* The real work happens in __db_dump. */
+    ret = __db_dump(dbp, subname, callback, handle, pflag, keyflag);
+
+    /* Release replication block. */
+    if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+        ret = t_ret;
+
+err:    ENV_LEAVE(env, ip);
+    return (ret);
+}
+
+/*
+ * __db_dump --
+ * DB->dump.
+ *
+ * PUBLIC: int __db_dump __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump(dbp, subname, callback, handle, pflag, keyflag)
+    DB *dbp;
+    const char *subname;  /* subdatabase name, may be NULL */
+    int (*callback) __P((void *, const void *));  /* output sink */
+    void *handle;         /* opaque cookie passed to callback */
+    int pflag, keyflag;   /* printable format; print keys */
+{
+    DBC *dbcp;
+    DBT key, data;
+    DBT keyret, dataret;
+    ENV *env;
+    db_recno_t recno;
+    int is_recno, ret, t_ret;
+    void *pointer;
+
+    env = dbp->env;
+
+    if ((ret = __db_prheader(
+        dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0)
+        return (ret);
+
+    /*
+     * Get a cursor and step through the database, printing out each
+     * key/data pair.
+     */
+    if ((ret = __db_cursor(dbp, NULL, NULL, &dbcp, 0)) != 0)
+        return (ret);
+
+    /* Start with a 1MB user buffer for bulk retrieval; grown on demand. */
+    memset(&key, 0, sizeof(key));
+    memset(&data, 0, sizeof(data));
+    if ((ret = __os_malloc(env, 1024 * 1024, &data.data)) != 0)
+        goto err;
+    data.ulen = 1024 * 1024;
+    data.flags = DB_DBT_USERMEM;
+    /* Record-number access methods always use the recno as the key. */
+    is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE);
+    keyflag = is_recno ? keyflag : 1;
+    if (is_recno) {
+        keyret.data = &recno;
+        keyret.size = sizeof(recno);
+    }
+
+    /* Bulk-read pass; restarted from here after growing the buffer. */
+retry: while ((ret =
+    __dbc_get(dbcp, &key, &data, DB_NEXT | DB_MULTIPLE_KEY)) == 0) {
+        DB_MULTIPLE_INIT(pointer, &data);
+        for (;;) {
+            if (is_recno)
+                DB_MULTIPLE_RECNO_NEXT(pointer, &data,
+                    recno, dataret.data, dataret.size);
+            else
+                DB_MULTIPLE_KEY_NEXT(pointer,
+                    &data, keyret.data,
+                    keyret.size, dataret.data, dataret.size);
+
+            /* NULL data pointer marks the end of the buffer. */
+            if (dataret.data == NULL)
+                break;
+
+            if ((keyflag &&
+                (ret = __db_prdbt(&keyret, pflag, " ",
+                handle, callback, is_recno)) != 0) ||
+                (ret = __db_prdbt(&dataret, pflag, " ",
+                handle, callback, 0)) != 0)
+                goto err;
+        }
+    }
+    /* Buffer too small for some item: grow it and continue the scan. */
+    if (ret == DB_BUFFER_SMALL) {
+        data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
+        if ((ret = __os_realloc(env, data.size, &data.data)) != 0)
+            goto err;
+        data.ulen = data.size;
+        goto retry;
+    }
+    /* Exhausting the cursor is the normal termination. */
+    if (ret == DB_NOTFOUND)
+        ret = 0;
+
+    if ((t_ret = __db_prfooter(handle, callback)) != 0 && ret == 0)
+        ret = t_ret;
+
+err:    if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+        ret = t_ret;
+    if (data.data != NULL)
+        __os_free(env, data.data);
+
+    return (ret);
+}
+
+/*
+ * __db_prdbt --
+ * Print out a DBT data element.
+ *
+ * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC: int (*)(void *, const void *), int));
+ */
+int
+__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno)
+    DBT *dbtp;
+    int checkprint;      /* non-zero: printable format with \-escapes */
+    const char *prefix;  /* optional string written before the item */
+    void *handle;        /* opaque cookie passed to callback */
+    int (*callback) __P((void *, const void *));
+    int is_recno;        /* non-zero: dbtp holds a record number */
+{
+    static const u_char hex[] = "0123456789abcdef";
+    db_recno_t recno;
+    size_t len;
+    int ret;
+#define DBTBUFLEN 100
+    u_int8_t *p, *hp;
+    char buf[DBTBUFLEN], hbuf[DBTBUFLEN];
+
+    /*
+     * !!!
+     * This routine is the routine that dumps out items in the format
+     * used by db_dump(1) and db_load(1).  This means that the format
+     * cannot change.
+     */
+    if (prefix != NULL && (ret = callback(handle, prefix)) != 0)
+        return (ret);
+    if (is_recno) {
+        /*
+         * We're printing a record number, and this has to be done
+         * in a platform-independent way.  So we use the numeral in
+         * straight ASCII.
+         */
+        (void)__ua_memcpy(&recno, dbtp->data, sizeof(recno));
+        snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno);
+
+        /* If we're printing data as hex, print keys as hex too. */
+        if (!checkprint) {
+            for (len = strlen(buf), p = (u_int8_t *)buf,
+                hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
+                *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
+                *hp++ = hex[*p & 0x0f];
+            }
+            *hp = '\0';
+            ret = callback(handle, hbuf);
+        } else
+            ret = callback(handle, buf);
+
+        if (ret != 0)
+            return (ret);
+    } else if (checkprint) {
+        /*
+         * Printable format: emit printable bytes as-is (doubling
+         * backslashes), everything else as a \xx hex escape.
+         */
+        for (len = dbtp->size, p = dbtp->data; len--; ++p)
+            if (isprint((int)*p)) {
+                if (*p == '\\' &&
+                    (ret = callback(handle, "\\")) != 0)
+                    return (ret);
+                snprintf(buf, DBTBUFLEN, "%c", *p);
+                if ((ret = callback(handle, buf)) != 0)
+                    return (ret);
+            } else {
+                snprintf(buf, DBTBUFLEN, "\\%c%c",
+                    hex[(u_int8_t)(*p & 0xf0) >> 4],
+                    hex[*p & 0x0f]);
+                if ((ret = callback(handle, buf)) != 0)
+                    return (ret);
+            }
+    } else
+        /* Byte-value format: every byte as two hex digits. */
+        for (len = dbtp->size, p = dbtp->data; len--; ++p) {
+            snprintf(buf, DBTBUFLEN, "%c%c",
+                hex[(u_int8_t)(*p & 0xf0) >> 4],
+                hex[*p & 0x0f]);
+            if ((ret = callback(handle, buf)) != 0)
+                return (ret);
+        }
+
+    /* Every item is terminated by a newline. */
+    return (callback(handle, "\n"));
+}
+
+/*
+ * __db_prheader --
+ * Write out header information in the format expected by db_load.
+ *
+ * PUBLIC: int __db_prheader __P((DB *, const char *, int, int, void *,
+ * PUBLIC: int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
+ DB *dbp;
+ const char *subname;
+ int pflag, keyflag;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+{
+ DBT dbt;
+ DBTYPE dbtype;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t flags, tmp_u_int32;
+ size_t buflen;
+ char *buf;
+ int using_vdp, ret, t_ret, tmp_int;
+
+ ret = 0;
+ buf = NULL;
+ COMPQUIET(buflen, 0);
+
+ /*
+ * If dbp is NULL, then pip is guaranteed to be non-NULL; we only ever
+ * call __db_prheader with a NULL dbp from one case inside __db_prdbt,
+ * and this is a special subdatabase for "lost" items. In this case
+ * we have a vdp (from which we'll get a pip). In all other cases, we
+ * will have a non-NULL dbp (and vdp may or may not be NULL depending
+ * on whether we're salvaging).
+ */
+ if (dbp == NULL)
+ env = NULL;
+ else
+ env = dbp->env;
+ DB_ASSERT(env, dbp != NULL || vdp != NULL);
+
+ /*
+ * If we've been passed a verifier statistics object, use that; we're
+ * being called in a context where dbp->stat is unsafe.
+ *
+ * Also, the verifier may set the pflag on a per-salvage basis. If so,
+ * respect that.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+ return (ret);
+
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ pflag = 1;
+ using_vdp = 1;
+ } else {
+ pip = NULL;
+ using_vdp = 0;
+ }
+
+ /*
+ * If dbp is NULL, make it a btree. Otherwise, set dbtype to whatever
+ * appropriate type for the specified meta page, or the type of the dbp.
+ */
+ if (dbp == NULL)
+ dbtype = DB_BTREE;
+ else if (using_vdp)
+ switch (pip->type) {
+ case P_BTREEMETA:
+ if (F_ISSET(pip, VRFY_IS_RECNO))
+ dbtype = DB_RECNO;
+ else
+ dbtype = DB_BTREE;
+ break;
+ case P_HASHMETA:
+ dbtype = DB_HASH;
+ break;
+ case P_QAMMETA:
+ dbtype = DB_QUEUE;
+ break;
+ default:
+ /*
+ * If the meta page is of a bogus type, it's because
+ * we have a badly corrupt database. (We must be in
+ * the verifier for pip to be non-NULL.) Pretend we're
+ * a Btree and salvage what we can.
+ */
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_VERIFYING));
+ dbtype = DB_BTREE;
+ break;
+ }
+ else
+ dbtype = dbp->type;
+
+ if ((ret = callback(handle, "VERSION=3\n")) != 0)
+ goto err;
+ if (pflag) {
+ if ((ret = callback(handle, "format=print\n")) != 0)
+ goto err;
+ } else if ((ret = callback(handle, "format=bytevalue\n")) != 0)
+ goto err;
+
+ /*
+ * 64 bytes is long enough, as a minimum bound, for any of the
+ * fields besides subname. Subname uses __db_prdbt and therefore
+ * does not need buffer space here.
+ */
+ buflen = 64;
+ if ((ret = __os_malloc(env, buflen, &buf)) != 0)
+ goto err;
+ if (subname != NULL) {
+ snprintf(buf, buflen, "database=");
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ DB_INIT_DBT(dbt, subname, strlen(subname));
+ if ((ret = __db_prdbt(&dbt, 1, NULL, handle, callback, 0)) != 0)
+ goto err;
+ }
+ switch (dbtype) {
+ case DB_BTREE:
+ if ((ret = callback(handle, "type=btree\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_HAS_RECNUMS) ? 1 : 0;
+ else {
+ if ((ret = __db_get_flags(dbp, &flags)) != 0) {
+ __db_err(env, ret, "DB->get_flags");
+ goto err;
+ }
+ tmp_int = F_ISSET(dbp, DB_AM_RECNUM) ? 1 : 0;
+ }
+ if (tmp_int && (ret = callback(handle, "recnum=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_u_int32 = pip->bt_minkey;
+ else
+ if ((ret =
+ __bam_get_bt_minkey(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_bt_minkey");
+ goto err;
+ }
+ if (tmp_u_int32 != 0 && tmp_u_int32 != DEFMINKEYPAGE) {
+ snprintf(buf, buflen,
+ "bt_minkey=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ if ((ret = callback(handle, "type=hash\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = pip->h_ffactor;
+ else
+ if ((ret =
+ __ham_get_h_ffactor(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_ffactor");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "h_ffactor=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = pip->h_nelem;
+ else
+ if ((ret = __ham_get_h_nelem(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_nelem");
+ goto err;
+ }
+ /*
+ * Hash databases have an h_nelem field of 0 or 1, neither
+ * of those values is interesting.
+ */
+ if (tmp_u_int32 > 1) {
+ snprintf(buf, buflen,
+ "h_nelem=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_hash_am(env);
+ goto err;
+#endif
+ case DB_QUEUE:
+#ifdef HAVE_QUEUE
+ if ((ret = callback(handle, "type=queue\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = vdp->re_len;
+ else
+ if ((ret = __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen, "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)vdp->re_pad;
+ else
+ if ((ret = __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf, buflen, "re_pad=%#x\n", tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = vdp->page_ext;
+ else
+ if ((ret =
+ __qam_get_extentsize(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_q_extentsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "extentsize=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_queue_am(env);
+ goto err;
+#endif
+ case DB_RECNO:
+ if ((ret = callback(handle, "type=recno\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_RRECNO) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_RENUMBER) ? 1 : 0;
+ if (tmp_int != 0 &&
+ (ret = callback(handle, "renumber=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_FIXEDLEN) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 1 : 0;
+ if (tmp_int) {
+ if (using_vdp)
+ tmp_u_int32 = pip->re_len;
+ else
+ if ((ret =
+ __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen,
+ "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)pip->re_pad;
+ else
+ if ((ret =
+ __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf,
+ buflen, "re_pad=%#x\n", (u_int)tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+ break;
+ case DB_UNKNOWN: /* Impossible. */
+ ret = __db_unknown_path(env, "__db_prheader");
+ goto err;
+ }
+
+ if (using_vdp) {
+ if (F_ISSET(pip, VRFY_HAS_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPS))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(pip, VRFY_HAS_COMPRESS))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ /*
+ * !!!
+ * We don't know if the page size was the default if we're
+ * salvaging. It doesn't seem that interesting to have, so
+ * we ignore it for now.
+ */
+ } else {
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ if (!F_ISSET(dbp, DB_AM_PGDEF)) {
+ snprintf(buf, buflen,
+ "db_pagesize=%lu\n", (u_long)dbp->pgsize);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp) &&
+ F_ISSET((DB_PARTITION *)dbp->p_internal, PART_RANGE)) {
+ DBT *keys;
+ u_int32_t i;
+
+ if ((ret = __partition_get_keys(dbp, &tmp_u_int32, &keys)) != 0)
+ goto err;
+ if (tmp_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "nparts=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ for (i = 0; i < tmp_u_int32 - 1; i++)
+ if ((ret = __db_prdbt(&keys[i],
+ pflag, " ", handle, callback, 0)) != 0)
+ goto err;
+ }
+ }
+#endif
+
+ if (keyflag && (ret = callback(handle, "keys=1\n")) != 0)
+ goto err;
+
+ ret = callback(handle, "HEADER=END\n");
+
+err: if (using_vdp &&
+ (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+
+ return (ret);
+}
+
+/*
+ * __db_prfooter --
+ * Print the footer that marks the end of a DB dump. This is trivial,
+ * but for consistency's sake we don't want to put its literal contents
+ * in multiple places.
+ *
+ * PUBLIC: int __db_prfooter __P((void *, int (*)(void *, const void *)));
+ */
+int
+__db_prfooter(handle, callback)
+ void *handle;
+ int (*callback) __P((void *, const void *));
+{
+ return (callback(handle, "DATA=END\n"));
+}
+
+/*
+ * __db_pr_callback --
+ * Callback function for using pr_* functions from C.
+ *
+ * PUBLIC: int __db_pr_callback __P((void *, const void *));
+ */
+int
+__db_pr_callback(handle, str_arg)
+ void *handle;
+ const void *str_arg;
+{
+ char *str;
+ FILE *f;
+
+ str = (char *)str_arg;
+ f = (FILE *)handle;
+
+ if (fprintf(f, "%s", str) != (int)strlen(str))
+ return (EIO);
+
+ return (0);
+}
+
+/*
+ * __db_dbtype_to_string --
+ * Return the name of the database type.
+ *
+ * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE));
+ */
+const char *
+__db_dbtype_to_string(type)
+ DBTYPE type;
+{
+ switch (type) {
+ case DB_BTREE:
+ return ("btree");
+ case DB_HASH:
+ return ("hash");
+ case DB_RECNO:
+ return ("recno");
+ case DB_QUEUE:
+ return ("queue");
+ case DB_UNKNOWN:
+ default:
+ break;
+ }
+ return ("UNKNOWN TYPE");
+}
diff --git a/db/db_rec.c b/db/db_rec.c
new file mode 100644
index 0000000..02fe096
--- /dev/null
+++ b/db/db_rec.c
@@ -0,0 +1,1859 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/hash.h"
+
+static int __db_pg_free_recover_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_args *, DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+static int __db_pg_free_recover_42_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_42_args *,
+ DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+
+/*
+ * PUBLIC: int __db_addrem_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * This log message is generated whenever we add or remove a duplicate
+ * to/from a duplicate page. On recover, we just do the opposite.
+ */
+int
+__db_addrem_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_addrem_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_addrem_print);
+	/* REC_INTRO reads the log record into argp and sets up file_dbp,
+	 * dbc and mpf; on failure it branches to the out/done labels. */
+	REC_INTRO(__db_addrem_read, ip, 1);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+	modified = 0;
+
+	/*
+	 * cmp_p == 0: page LSN equals the logged pre-operation LSN, so the
+	 * change has not been applied yet (eligible for redo).
+	 * cmp_n == 0: page LSN equals this record's LSN, so the change is
+	 * on the page (eligible for undo).
+	 */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_DUP) ||
+	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_DUP)) {
+		/* Need to redo an add, or undo a delete. */
+		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+		if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
+		    argp->hdr.size == 0 ? NULL : &argp->hdr,
+		    argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
+			goto out;
+		modified = 1;
+
+	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_DUP) ||
+	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_DUP)) {
+		/* Need to undo an add, or redo a delete. */
+		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+		if ((ret = __db_ditem(dbc,
+		    pagep, argp->indx, argp->nbytes)) != 0)
+			goto out;
+		modified = 1;
+	}
+
+	/* Roll the page LSN forward on redo, back on undo. */
+	if (modified) {
+		if (DB_REDO(op))
+			LSN(pagep) = *lsnp;
+		else
+			LSN(pagep) = argp->pagelsn;
+	}
+
+	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_big_recover --
+ *	Recovery function for big (overflow) item add/remove/append; may
+ *	touch up to three pages: the target, its predecessor and successor.
+ *
+ * PUBLIC: int __db_big_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_big_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_big_print);
+	REC_INTRO(__db_big_read, ip, 0);
+
+	/* If the target page is gone, fall through to the prev-page fixup. */
+	REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
+	modified = 0;
+
+	/*
+	 * There are three pages we need to check. The one on which we are
+	 * adding data, the previous one whose next_pointer may have
+	 * been updated, and the next one whose prev_pointer may have
+	 * been updated.
+	 */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
+	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
+		/* We are either redo-ing an add, or undoing a delete. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		/* Rebuild the overflow page from the logged data. */
+		P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
+		    argp->next_pgno, 0, P_OVERFLOW);
+		OV_LEN(pagep) = argp->dbt.size;
+		OV_REF(pagep) = 1;
+		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
+		    argp->dbt.size);
+		PREV_PGNO(pagep) = argp->prev_pgno;
+		modified = 1;
+	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
+	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
+		/*
+		 * We are either undo-ing an add or redo-ing a delete.
+		 * The page is about to be reclaimed in either case, so
+		 * there really isn't anything to do here.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		modified = 1;
+	} else if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_APPEND_BIG) {
+		/* We are redoing an append. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+		    OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
+		OV_LEN(pagep) += argp->dbt.size;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_APPEND_BIG) {
+		/* We are undoing an append: shrink the item and zero the
+		 * vacated bytes so the page image stays deterministic. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		OV_LEN(pagep) -= argp->dbt.size;
+		memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+		    OV_LEN(pagep), 0, argp->dbt.size);
+		modified = 1;
+	}
+	if (modified)
+		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+	pagep = NULL;
+	if (ret != 0)
+		goto out;
+
+	/*
+	 * We only delete a whole chain of overflow items, and appends only
+	 * apply to a single page. Adding a page is the only case that
+	 * needs to update the chain.
+	 */
+	if (argp->opcode != DB_ADD_BIG)
+		goto done;
+
+	/* Now check the previous page. */
+ppage:	if (argp->prev_pgno != PGNO_INVALID) {
+		REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+		modified = 0;
+
+		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+		if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
+			/* Redo the add: link prev's next pointer forward. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			NEXT_PGNO(pagep) = argp->pgno;
+			modified = 1;
+		} else if (cmp_n == 0 &&
+		    DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
+			/* Undo the add: restore prev's original next. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			NEXT_PGNO(pagep) = argp->next_pgno;
+			modified = 1;
+		}
+		if (modified)
+			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+		pagep = NULL;
+		if (ret != 0)
+			goto out;
+	}
+	pagep = NULL;
+
+	/* Now check the next page. Can only be set on a delete. */
+npage:	if (argp->next_pgno != PGNO_INVALID) {
+		REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+		modified = 0;
+
+		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+		if (cmp_p == 0 && DB_REDO(op)) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			PREV_PGNO(pagep) = PGNO_INVALID;
+			modified = 1;
+		} else if (cmp_n == 0 && DB_UNDO(op)) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			PREV_PGNO(pagep) = argp->pgno;
+			modified = 1;
+		}
+		if (modified)
+			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+		pagep = NULL;
+		if (ret != 0)
+			goto out;
+	}
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_ovref_recover --
+ *	Recovery function for __db_ovref().
+ *
+ * PUBLIC: int __db_ovref_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_ovref_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_ovref_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_ovref_print);
+	REC_INTRO(__db_ovref_read, ip, 0);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+	cmp = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(env, op, cmp, &LSN(pagep), &argp->lsn);
+	if (cmp == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		/* Re-apply the overflow reference-count adjustment. */
+		OV_REF(pagep) += argp->adjust;
+		pagep->lsn = *lsnp;
+	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		OV_REF(pagep) -= argp->adjust;
+		pagep->lsn = argp->lsn;
+	}
+	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+	pagep = NULL;
+	if (ret != 0)
+		goto out;
+	/* NOTE(review): pagep is already NULL here; this second clear is
+	 * redundant but harmless. */
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_debug_recover --
+ *	Recovery function for debug.
+ *
+ * PUBLIC: int __db_debug_recover __P((ENV *,
+ * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_debug_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_debug_args *argp;
+	int ret;
+
+	/* Debug records carry no page changes: nothing to redo or undo. */
+	COMPQUIET(op, DB_TXN_ABORT);
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__db_debug_print);
+	/* REC_NOOP_INTRO reads the record into argp without opening a DB. */
+	REC_NOOP_INTRO(__db_debug_read);
+
+	/* Just walk the backward chain of the transaction's log records. */
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	REC_NOOP_CLOSE;
+}
+
+/*
+ * __db_noop_recover --
+ *	Recovery function for noop.
+ *
+ * PUBLIC: int __db_noop_recover __P((ENV *,
+ * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_noop_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_noop_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_noop_print);
+	REC_INTRO(__db_noop_read, ip, 0);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+	/* A noop record only moves the page LSN; no page data changes. */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		LSN(pagep) = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		LSN(pagep) = argp->prevlsn;
+	}
+	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf,
+		    ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_alloc_recover --
+ *	Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __db_pg_alloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_alloc_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, created, level, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	meta = NULL;
+	pagep = NULL;
+	created = 0;
+	REC_PRINT(__db_pg_alloc_print);
+	REC_INTRO(__db_pg_alloc_read, ip, 0);
+
+	/*
+	 * Fix up the metadata page. If we're redoing the operation, we have
+	 * to get the metadata page and update its LSN and its free pointer.
+	 * If we're undoing the operation and the page was ever created, we put
+	 * it on the freelist.
+	 */
+	pgno = PGNO_BASE_MD;
+	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+		/* The metadata page must always exist on redo. */
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, pgno, ret);
+			goto out;
+		} else
+			goto done;
+	}
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		LSN(meta) = *lsnp;
+		/* The allocation consumed the head of the free list. */
+		meta->free = argp->next;
+		if (argp->pgno > meta->last_pgno)
+			meta->last_pgno = argp->pgno;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		LSN(meta) = argp->meta_lsn;
+		/*
+		 * If the page has a zero LSN then its newly created and
+		 * will be truncated rather than go on the free list.
+		 */
+		if (!IS_ZERO_LSN(argp->page_lsn))
+			meta->free = argp->pgno;
+		meta->last_pgno = argp->last_pgno;
+	}
+
+#ifdef HAVE_FTRUNCATE
+	/*
+	 * check to see if we are keeping a sorted freelist, if so put
+	 * this back in the in memory list. It must be the first element.
+	 */
+	if (op == DB_TXN_ABORT && !IS_ZERO_LSN(argp->page_lsn)) {
+		db_pgno_t *list;
+		u_int32_t nelem;
+
+		if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+			goto out;
+		if (list != NULL && (nelem == 0 || *list != argp->pgno)) {
+			if ((ret =
+			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+				goto out;
+			/* Shift the list right to prepend this page number. */
+			if (nelem != 0)
+				memmove(list + 1, list, nelem * sizeof(*list));
+			*list = argp->pgno;
+		}
+	}
+#endif
+
+	/*
+	 * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it then don't create it.
+	 * Otherwise if we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist), and update its
+	 * LSN. If we're undoing the operation, we have to reset the page's
+	 * LSN and put it on the free list.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		/*
+		 * We have to be able to identify if a page was newly
+		 * created so we can recover it properly. We cannot simply
+		 * look for an empty header, because hash uses a pgin
+		 * function that will set the header. Instead, we explicitly
+		 * try for the page without CREATE and if that fails, then
+		 * create it.
+		 */
+		if (DB_UNDO(op))
+			goto do_truncate;
+		if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+		    DB_MPOOL_CREATE, &pagep)) != 0) {
+			/*
+			 * NOTE(review): DB_UNDO already branched to
+			 * do_truncate above, so this ENOSPC check looks
+			 * unreachable -- confirm against upstream history.
+			 */
+			if (DB_UNDO(op) && ret == ENOSPC)
+				goto do_truncate;
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+		created = 1;
+	}
+
+	/* Fix up the allocated page. */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+	/*
+	 * If an initial allocation is aborted and then reallocated during
+	 * an archival restore the log record will have an LSN for the page
+	 * but the page will be empty.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)))
+		cmp_p = 0;
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+	/*
+	 * Another special case we have to handle is if we ended up with a
+	 * page of all 0's which can happen if we abort between allocating a
+	 * page in mpool and initializing it. In that case, even if we're
+	 * undoing, we need to re-initialize the page.
+	 */
+	if (DB_REDO(op) && cmp_p == 0) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		/* Leaf page types start at LEAFLEVEL; others at level 0. */
+		switch (argp->ptype) {
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			level = LEAFLEVEL;
+			break;
+		default:
+			level = 0;
+			break;
+		}
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+		pagep->lsn = *lsnp;
+	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+		/*
+		 * This is where we handle the case of a 0'd page (pagep->pgno
+		 * is equal to PGNO_INVALID).
+		 * Undo the allocation, reinitialize the page and
+		 * link its next pointer to the free list.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+		pagep->lsn = argp->page_lsn;
+	}
+
+do_truncate:
+	/*
+	 * If the page was newly created, give it back.
+	 */
+	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+		/* Discard the page. */
+		if (pagep != NULL) {
+			if ((ret = __memp_fput(mpf, ip,
+			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
+				goto out;
+			pagep = NULL;
+		}
+		/* Give the page back to the OS. */
+		if (meta->last_pgno <= argp->pgno && (ret = __memp_ftruncate(
+		    mpf, NULL, ip, argp->pgno, MP_TRUNC_RECOVER)) != 0)
+			goto out;
+	}
+
+	if (pagep != NULL) {
+		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+		pagep = NULL;
+		if (ret != 0)
+			goto out;
+	}
+
+	ret = __memp_fput(mpf, ip, meta, file_dbp->priority);
+	meta = NULL;
+	if (ret != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_int --
+ *	Common guts of pg_free/pg_freedata recovery: fix the meta (or
+ *	previous free-list) page, then reinitialize or restore the freed
+ *	page. When "data" is nonzero the log record also carries the page
+ *	data to restore on undo.
+ */
+static int
+__db_pg_free_recover_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	__db_pg_freedata_args *argp;
+	DB *file_dbp;
+	DB_LSN *lsnp;
+	DB_MPOOLFILE *mpf;
+	db_recops op;
+	int data;
+{
+	DBMETA *meta;
+	DB_LSN copy_lsn;
+	PAGE *pagep, *prevp;
+	int cmp_n, cmp_p, is_meta, ret;
+
+	meta = NULL;
+	pagep = prevp = NULL;
+
+	/*
+	 * Get the "metapage". This will either be the metapage
+	 * or the previous page in the free list if we are doing
+	 * sorted allocations. If its a previous page then
+	 * we will not be truncating.
+	 */
+	is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+	/* On fetch failure this jumps to check_meta with ret set. */
+	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+	if (argp->meta_pgno != PGNO_BASE_MD)
+		prevp = (PAGE *)meta;
+
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+
+	/*
+	 * Fix up the metadata page. If we're redoing or undoing the operation
+	 * we get the page and update its LSN, last and free pointer.
+	 */
+	if (cmp_p == 0 && DB_REDO(op)) {
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		/*
+		 * If we are at the end of the file truncate, otherwise
+		 * put on the free list.
+		 */
+		if (argp->pgno == argp->last_pgno)
+			meta->last_pgno = argp->pgno - 1;
+		else if (is_meta)
+			meta->free = argp->pgno;
+		else
+			NEXT_PGNO(prevp) = argp->pgno;
+		LSN(meta) = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		if (is_meta) {
+			if (meta->last_pgno < argp->pgno)
+				meta->last_pgno = argp->pgno;
+			meta->free = argp->next;
+		} else
+			NEXT_PGNO(prevp) = argp->next;
+		LSN(meta) = argp->meta_lsn;
+	}
+
+check_meta:
+	if (ret != 0 && is_meta) {
+		/* The metadata page must always exist. */
+		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+		goto out;
+	}
+
+	/*
+	 * Get the freed page. Don't create the page if we are going to
+	 * free it. If we're redoing the operation we get the page and
+	 * explicitly discard its contents, then update its LSN. If we're
+	 * undoing the operation, we get the page and restore its header.
+	 */
+	if (DB_REDO(op) || (is_meta && meta->last_pgno < argp->pgno)) {
+		if ((ret = __memp_fget(mpf, &argp->pgno,
+		    ip, NULL, 0, &pagep)) != 0) {
+			if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+			if (is_meta &&
+			    DB_REDO(op) && meta->last_pgno <= argp->pgno)
+				goto trunc;
+			goto done;
+		}
+	} else if ((ret = __memp_fget(mpf, &argp->pgno,
+	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+		goto out;
+
+	/* The logged header may be unaligned; copy its LSN out safely. */
+	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+	/*
+	 * This page got extended by a later allocation,
+	 * but its allocation was not in the scope of this
+	 * recovery pass.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)))
+		cmp_p = 0;
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+	if (DB_REDO(op) &&
+	    (cmp_p == 0 ||
+	    (IS_ZERO_LSN(copy_lsn) &&
+	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+		/* Need to redo the deallocation. */
+		/*
+		 * The page can be truncated if it was truncated at runtime
+		 * and the current metapage reflects the truncation.
+		 */
+		if (is_meta && meta->last_pgno <= argp->pgno &&
+		    argp->last_pgno <= argp->pgno) {
+			if ((ret = __memp_fput(mpf, ip,
+			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
+				goto out;
+			pagep = NULL;
+trunc:			if ((ret = __memp_ftruncate(mpf, NULL, ip,
+			    argp->pgno, MP_TRUNC_RECOVER)) != 0)
+				goto out;
+		} else if (argp->last_pgno == argp->pgno) {
+			/* The page was truncated at runtime, zero it out. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			P_INIT(pagep, 0, PGNO_INVALID,
+			    PGNO_INVALID, PGNO_INVALID, 0, P_INVALID);
+			ZERO_LSN(pagep->lsn);
+		} else {
+			/* Ordinary case: re-init as a free-list page. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			P_INIT(pagep, file_dbp->pgsize,
+			    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+			pagep->lsn = *lsnp;
+
+		}
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to reallocate the page. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->header.data, argp->header.size);
+		if (data)
+			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+			    argp->data.data, argp->data.size);
+	}
+	if (pagep != NULL &&
+	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+	pagep = NULL;
+#ifdef HAVE_FTRUNCATE
+	/*
+	 * If we are keeping an in memory free list remove this
+	 * element from the list.
+	 */
+	if (op == DB_TXN_ABORT && argp->pgno != argp->last_pgno) {
+		db_pgno_t *lp;
+		u_int32_t nelem, pos;
+
+		if ((ret = __memp_get_freelist(mpf, &nelem, &lp)) != 0)
+			goto out;
+		if (lp != NULL) {
+			pos = 0;
+			if (!is_meta) {
+				__db_freelist_pos(argp->pgno, lp, nelem, &pos);
+
+				/*
+				 * If we aborted after logging but before
+				 * updating the free list don't do anything.
+				 */
+				if (argp->pgno != lp[pos]) {
+					DB_ASSERT(env,
+					    argp->meta_pgno == lp[pos]);
+					goto done;
+				}
+				DB_ASSERT(env,
+				    argp->meta_pgno == lp[pos - 1]);
+			} else if (nelem != 0 && argp->pgno != lp[pos])
+				goto done;
+
+			if (pos < nelem)
+				memmove(&lp[pos], &lp[pos + 1],
+				    ((nelem - pos) - 1) * sizeof(*lp));
+
+			/* Shrink the list */
+			if ((ret =
+			    __memp_extend_freelist(mpf, nelem - 1, &lp)) != 0)
+				goto out;
+		}
+	}
+#endif
+done:
+	if (meta != NULL &&
+	    (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+	meta = NULL;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+	return (ret);
+}
+
+/*
+ * __db_pg_free_recover --
+ *	Recovery function for pg_free.
+ *
+ * PUBLIC: int __db_pg_free_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_free_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_free_print);
+	REC_INTRO(__db_pg_free_read, ip, 0);
+
+	/*
+	 * __db_pg_free_args is a prefix of __db_pg_freedata_args, so the
+	 * cast is safe; data == 0 tells the helper no page data was logged.
+	 */
+	ret = __db_pg_free_recover_int(env, ip,
+	    (__db_pg_freedata_args *)argp, file_dbp, lsnp, mpf, op, 0);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_recover --
+ *	Recovery function for pg_freedata.
+ *
+ * PUBLIC: int __db_pg_freedata_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_freedata_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_freedata_print);
+	REC_INTRO(__db_pg_freedata_read, ip, 0);
+
+	/* data == 1: the record carries page data to restore on undo. */
+	ret = __db_pg_free_recover_int(env,
+	    ip, argp, file_dbp, lsnp, mpf, op, 1);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_cksum_recover --
+ *	Recovery function for checksum failure log record.
+ *
+ * PUBLIC: int __db_cksum_recover __P((ENV *,
+ * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_cksum_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_cksum_args *argp;
+	int ret;
+
+	COMPQUIET(info, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+
+	REC_PRINT(__db_cksum_print);
+
+	if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+		return (ret);
+
+	/*
+	 * We had a checksum failure -- the only option is to run catastrophic
+	 * recovery.
+	 */
+	if (F_ISSET(env, ENV_RECOVER_FATAL))
+		ret = 0;
+	else {
+		/* Not running catastrophic recovery: panic the environment. */
+		__db_errx(env,
+		    "Checksum failure requires catastrophic recovery");
+		ret = __env_panic(env, DB_RUNRECOVERY);
+	}
+
+	__os_free(env, argp);
+	return (ret);
+}
+
+/*
+ * __db_pg_init_recover --
+ *	Recovery function to reinit pages after truncation.
+ *
+ * PUBLIC: int __db_pg_init_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_init_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_init_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_LSN copy_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, ret, type;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_init_print);
+	REC_INTRO(__db_pg_init_read, ip, 0);
+
+	/* NOTE(review): REC_INTRO appears to set mpf already; this explicit
+	 * reassignment looks redundant but harmless -- confirm. */
+	mpf = file_dbp->mpf;
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_UNDO(op)) {
+			if (ret == DB_PAGE_NOTFOUND)
+				goto done;
+			else {
+				ret = __db_pgerr(file_dbp, argp->pgno, ret);
+				goto out;
+			}
+		}
+
+		/*
+		 * This page was truncated and may simply not have
+		 * had an item written to it yet. This should only
+		 * happen on hash databases, so confirm that.
+		 */
+		DB_ASSERT(env, file_dbp->type == DB_HASH);
+		if ((ret = __memp_fget(mpf, &argp->pgno,
+		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+	}
+
+	/* The logged header may be unaligned; copy its LSN out safely. */
+	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Re-init the page as an empty page of the proper type. */
+		if (TYPE(pagep) == P_HASH)
+			type = P_HASH;
+		else
+			type = file_dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize, PGNO(pagep), PGNO_INVALID,
+		    PGNO_INVALID, TYPE(pagep) == P_HASH ? 0 : 1, type);
+		pagep->lsn = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Put the data back on the page. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->header.data, argp->header.size);
+		if (argp->data.size > 0)
+			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+			    argp->data.data, argp->data.size);
+	}
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_trunc_recover --
+ *	Recovery function for pg_trunc (freeing a sorted run of pages by
+ *	truncating the file).  Redo re-truncates and fixes the freelist
+ *	links/meta page; undo re-creates the pages and relinks them into
+ *	the free list.  Only built when HAVE_FTRUNCATE is defined.
+ *
+ * PUBLIC: int __db_pg_trunc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_trunc_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+#ifdef HAVE_FTRUNCATE
+	__db_pg_trunc_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pglist_t *pglist, *lp;
+	db_pgno_t last_pgno, *list;
+	u_int32_t felem, nelem, pos;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_trunc_print);
+	REC_INTRO(__db_pg_trunc_read, ip, 1);
+
+	/* The log record carries the sorted list of pages being freed. */
+	pglist = (db_pglist_t *) argp->list.data;
+	nelem = argp->list.size / sizeof(db_pglist_t);
+	if (DB_REDO(op)) {
+		/*
+		 * First call __db_pg_truncate to find the truncation
+		 * point, truncate the file and return the new last_pgno.
+		 */
+		last_pgno = argp->last_pgno;
+		if ((ret = __db_pg_truncate(dbc, NULL, pglist,
+		    NULL, &nelem, argp->next_free, &last_pgno, lsnp, 1)) != 0)
+			goto out;
+
+		if (argp->last_free != PGNO_INVALID) {
+			/*
+			 * Update the next pointer of the last page in
+			 * the freelist. If the truncation point is
+			 * beyond next_free then this is still in the freelist
+			 * otherwise the last_free page is at the end.
+			 */
+			if ((ret = __memp_fget(mpf,
+			    &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta),
+				    &argp->last_lsn) == 0) {
+					REC_DIRTY(mpf,
+					    ip, dbc->priority, &meta);
+					if (pglist->pgno > last_pgno)
+						NEXT_PGNO(meta) = PGNO_INVALID;
+					else
+						NEXT_PGNO(meta) = pglist->pgno;
+					LSN(meta) = *lsnp;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+				meta = NULL;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+		}
+		/* Now fix up the metadata page's free/last_pgno fields. */
+		if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+		    0, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			if (argp->last_free == PGNO_INVALID) {
+				if (nelem == 0)
+					meta->free = PGNO_INVALID;
+				else
+					meta->free = pglist->pgno;
+			}
+			meta->last_pgno = last_pgno;
+			LSN(meta) = *lsnp;
+		}
+	} else {
+		/* Put the free list back in its original order. */
+		for (lp = pglist; lp < &pglist[nelem]; lp++) {
+			if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+			    NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+				goto out;
+			if (IS_ZERO_LSN(LSN(pagep)) ||
+			    LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+				REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+				P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+				    PGNO_INVALID, lp->next_pgno, 0, P_INVALID);
+				LSN(pagep) = lp->lsn;
+			}
+			if ((ret = __memp_fput(mpf,
+			    ip, pagep, file_dbp->priority)) != 0)
+				goto out;
+		}
+		/*
+		 * Link the truncated part back into the free list.
+		 * Its either after the last_free page or direclty
+		 * linked to the metadata page.
+		 */
+		if (argp->last_free != PGNO_INVALID) {
+			if ((ret = __memp_fget(mpf, &argp->last_free,
+			    ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+					NEXT_PGNO(meta) = argp->next_free;
+					LSN(meta) = argp->last_lsn;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+			meta = NULL;
+		}
+		if ((ret = __memp_fget(mpf, &argp->meta,
+		    ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			/*
+			 * If we had to break up the list last_pgno
+			 * may only represent the end of the block.
+			 */
+			if (meta->last_pgno < argp->last_pgno)
+				meta->last_pgno = argp->last_pgno;
+			if (argp->last_free == PGNO_INVALID)
+				meta->free = argp->next_free;
+			LSN(meta) = argp->meta_lsn;
+		}
+	}
+
+	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+
+	if (op == DB_TXN_ABORT) {
+		/*
+		 * Put the pages back on the in memory free list.
+		 * If this is part of a multi-record truncate then
+		 * we need to find this batch, it may not be at the end.
+		 * If we aborted while writing one of the log records
+		 * then this set may still be in the list.
+		 */
+		if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+			goto out;
+		if (list != NULL) {
+			if (felem != 0 && list[felem - 1] > pglist->pgno) {
+				/* Locate the insertion point for this batch. */
+				__db_freelist_pos(
+				    pglist->pgno, list, felem, &pos);
+				DB_ASSERT(env, pos < felem);
+				/* Already present: nothing to re-insert. */
+				if (pglist->pgno == list[pos])
+					goto done;
+				pos++;
+			} else if (felem != 0 &&
+			    list[felem - 1] == pglist->pgno)
+				goto done;
+			else
+				pos = felem;
+			if ((ret = __memp_extend_freelist(
+			    mpf, felem + nelem, &list)) != 0)
+				goto out;
+			/* Shift the tail up and splice this batch in. */
+			if (pos != felem)
+				memmove(&list[nelem + pos], &list[pos],
+				    sizeof(*list) * (felem - pos));
+			for (lp = pglist; lp < &pglist[nelem]; lp++)
+				list[pos++] = lp->pgno;
+		}
+	}
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+#else
+	/*
+	 * If HAVE_FTRUNCATE is not defined, we'll never see pg_trunc records
+	 * to recover.
+	 */
+	COMPQUIET(env, NULL);
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+	COMPQUIET(info, NULL);
+	return (EINVAL);
+#endif
+}
+/*
+ * __db_pg_sort_44_recover --
+ *	Recovery function for pg_sort.
+ *	This is deprecated and kept for replication upgrades: it handles
+ *	log records written by release 4.4, where the page list in the
+ *	record is unsorted and must be sorted before truncating.
+ *
+ * PUBLIC: int __db_pg_sort_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_sort_44_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+#ifdef HAVE_FTRUNCATE
+	__db_pg_sort_44_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pglist_t *pglist, *lp;
+	db_pgno_t pgno, *list;
+	u_int32_t felem, nelem;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_sort_44_print);
+	REC_INTRO(__db_pg_sort_44_read, ip, 1);
+
+	pglist = (db_pglist_t *) argp->list.data;
+	nelem = argp->list.size / sizeof(db_pglist_t);
+	if (DB_REDO(op)) {
+		/* Sort the logged list, then truncate at the right point. */
+		pgno = argp->last_pgno;
+		__db_freelist_sort(pglist, nelem);
+		if ((ret = __db_pg_truncate(dbc, NULL,
+		    pglist, NULL, &nelem, PGNO_INVALID, &pgno, lsnp, 1)) != 0)
+			goto out;
+
+		if (argp->last_free != PGNO_INVALID) {
+			/* Terminate the free list at the old last page. */
+			if ((ret = __memp_fget(mpf,
+			    &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta),
+				    &argp->last_lsn) == 0) {
+					REC_DIRTY(mpf,
+					    ip, dbc->priority, &meta);
+					NEXT_PGNO(meta) = PGNO_INVALID;
+					LSN(meta) = *lsnp;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+				meta = NULL;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+		}
+		if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+		    0, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			if (argp->last_free == PGNO_INVALID) {
+				if (nelem == 0)
+					meta->free = PGNO_INVALID;
+				else
+					meta->free = pglist->pgno;
+			}
+			meta->last_pgno = pgno;
+			LSN(meta) = *lsnp;
+		}
+	} else {
+		/* Put the free list back in its original order. */
+		for (lp = pglist; lp < &pglist[nelem]; lp++) {
+			if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+			    NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+				goto out;
+			if (IS_ZERO_LSN(LSN(pagep)) ||
+			    LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+				REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+				/* Chain each page to its list successor. */
+				if (lp == &pglist[nelem - 1])
+					pgno = PGNO_INVALID;
+				else
+					pgno = lp[1].pgno;
+
+				P_INIT(pagep, file_dbp->pgsize,
+				    lp->pgno, PGNO_INVALID, pgno, 0, P_INVALID);
+				LSN(pagep) = lp->lsn;
+			}
+			if ((ret = __memp_fput(mpf,
+			    ip, pagep, file_dbp->priority)) != 0)
+				goto out;
+		}
+		if (argp->last_free != PGNO_INVALID) {
+			if ((ret = __memp_fget(mpf, &argp->last_free,
+			    ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+					NEXT_PGNO(meta) = pglist->pgno;
+					LSN(meta) = argp->last_lsn;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+			meta = NULL;
+		}
+		if ((ret = __memp_fget(mpf, &argp->meta,
+		    ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			meta->last_pgno = argp->last_pgno;
+			if (argp->last_free == PGNO_INVALID)
+				meta->free = pglist->pgno;
+			LSN(meta) = argp->meta_lsn;
+		}
+	}
+	if (op == DB_TXN_ABORT) {
+		/* Restore this batch to the in-memory free list. */
+		if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+			goto out;
+		if (list != NULL) {
+			DB_ASSERT(env, felem == 0 ||
+			    argp->last_free == list[felem - 1]);
+			if ((ret = __memp_extend_freelist(
+			    mpf, felem + nelem, &list)) != 0)
+				goto out;
+			for (lp = pglist; lp < &pglist[nelem]; lp++)
+				list[felem++] = lp->pgno;
+		}
+	}
+
+	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+#else
+	/*
+	 * If HAVE_FTRUNCATE is not defined, we'll never see pg_sort records
+	 * to recover.
+	 */
+	COMPQUIET(env, NULL);
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+	COMPQUIET(info, NULL);
+	return (EINVAL);
+#endif
+}
+
+/*
+ * __db_pg_alloc_42_recover --
+ *	Recovery function for pg_alloc records written by release 4.2.
+ *	Redo updates the metadata page's free pointer/last_pgno and
+ *	initializes the allocated page; undo of a 4.2 allocation is not
+ *	supported (no limbo processing) and panics.
+ *
+ * PUBLIC: int __db_pg_alloc_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_alloc_42_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, created, level, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	meta = NULL;
+	pagep = NULL;
+	created = 0;
+	REC_PRINT(__db_pg_alloc_42_print);
+	REC_INTRO(__db_pg_alloc_42_read, ip, 0);
+
+	/*
+	 * Fix up the metadata page. If we're redoing the operation, we have
+	 * to get the metadata page and update its LSN and its free pointer.
+	 * If we're undoing the operation and the page was ever created, we put
+	 * it on the freelist.
+	 */
+	pgno = PGNO_BASE_MD;
+	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+		/* The metadata page must always exist on redo. */
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, pgno, ret);
+			goto out;
+		} else
+			goto done;
+	}
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		LSN(meta) = *lsnp;
+		meta->free = argp->next;
+		if (argp->pgno > meta->last_pgno)
+			meta->last_pgno = argp->pgno;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* 4.2 allocations cannot be rolled back; see no_rollback. */
+		goto no_rollback;
+	}
+
+	/*
+	 * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it then don't create it.
+	 * Otherwise if we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist), and update its
+	 * LSN. If we're undoing the operation, we have to reset the page's
+	 * LSN and put it on the free list, or truncate it.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		/*
+		 * We have to be able to identify if a page was newly
+		 * created so we can recover it properly. We cannot simply
+		 * look for an empty header, because hash uses a pgin
+		 * function that will set the header. Instead, we explicitly
+		 * try for the page without CREATE and if that fails, then
+		 * create it.
+		 */
+		if ((ret = __memp_fget(mpf, &argp->pgno,
+		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+			if (DB_UNDO(op) && ret == ENOSPC)
+				goto do_truncate;
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+		created = 1;
+	}
+
+	/* Fix up the allocated page. */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+	/*
+	 * If an initial allocation is aborted and then reallocated during
+	 * an archival restore the log record will have an LSN for the page
+	 * but the page will be empty.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)) ||
+	    (IS_ZERO_LSN(argp->page_lsn) && IS_INIT_LSN(LSN(pagep))))
+		cmp_p = 0;
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+	/*
+	 * Another special case we have to handle is if we ended up with a
+	 * page of all 0's which can happen if we abort between allocating a
+	 * page in mpool and initializing it. In that case, even if we're
+	 * undoing, we need to re-initialize the page.
+	 */
+	if (DB_REDO(op) && cmp_p == 0) {
+		/* Need to redo update described. */
+		switch (argp->ptype) {
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			level = LEAFLEVEL;
+			break;
+		default:
+			level = 0;
+			break;
+		}
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+		pagep->lsn = *lsnp;
+	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+		/*
+		 * This is where we handle the case of a 0'd page (pagep->pgno
+		 * is equal to PGNO_INVALID).
+		 * Undo the allocation, reinitialize the page and
+		 * link its next pointer to the free list.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+		pagep->lsn = argp->page_lsn;
+	}
+
+do_truncate:
+	/*
+	 * We cannot undo things from 4.2 land, because we nolonger
+	 * have limbo processing.
+	 */
+	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+no_rollback:	__db_errx(env,
+"Cannot replicate prepared transactions from master running release 4.2 ");
+		ret = __env_panic(env, EINVAL);
+	}
+
+	if (pagep != NULL &&
+	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+	meta = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_42_int --
+ *	Shared worker for the 4.2-format pg_free/pg_freedata recovery
+ *	functions.  "data" is nonzero when the log record also carries the
+ *	page's data section (pg_freedata); the freedata args layout is a
+ *	superset of the free args layout, so both callers pass it here.
+ */
+static int
+__db_pg_free_recover_42_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	__db_pg_freedata_42_args *argp;
+	DB *file_dbp;
+	DB_LSN *lsnp;
+	DB_MPOOLFILE *mpf;
+	db_recops op;
+	int data;
+{
+	DBMETA *meta;
+	DB_LSN copy_lsn;
+	PAGE *pagep, *prevp;
+	int cmp_n, cmp_p, is_meta, ret;
+
+	meta = NULL;
+	pagep = NULL;
+	prevp = NULL;
+
+	/*
+	 * Get the "metapage". This will either be the metapage
+	 * or the previous page in the free list if we are doing
+	 * sorted allocations. If its a previous page then
+	 * we will not be truncating.
+	 */
+	is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+	/* REC_FGET jumps to check_meta on a fetch failure. */
+	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+	if (argp->meta_pgno != PGNO_BASE_MD)
+		prevp = (PAGE *)meta;
+
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+
+	/*
+	 * Fix up the metadata page. If we're redoing or undoing the operation
+	 * we get the page and update its LSN, last and free pointer.
+	 */
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		if (prevp == NULL)
+			meta->free = argp->pgno;
+		else
+			NEXT_PGNO(prevp) = argp->pgno;
+		/*
+		 * If this was a compensating transaction and
+		 * we are a replica, then we never executed the
+		 * original allocation which incremented meta->free.
+		 */
+		if (prevp == NULL && meta->last_pgno < meta->free)
+			meta->last_pgno = meta->free;
+		LSN(meta) = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		if (prevp == NULL)
+			meta->free = argp->next;
+		else
+			NEXT_PGNO(prevp) = argp->next;
+		LSN(meta) = argp->meta_lsn;
+		if (prevp == NULL && meta->last_pgno < argp->pgno)
+			meta->last_pgno = argp->pgno;
+	}
+
+check_meta:
+	if (ret != 0 && is_meta) {
+		/* The metadata page must always exist. */
+		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+		goto out;
+	}
+
+	/*
+	 * Get the freed page. If we support truncate then don't
+	 * create the page if we are going to free it. If we're
+	 * redoing the operation we get the page and explicitly discard
+	 * its contents, then update its LSN. If we're undoing the
+	 * operation, we get the page and restore its header.
+	 * If we don't support truncate, then we must create the page
+	 * and roll it back.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno,
+	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+		goto out;
+
+	/* copy_lsn is the page LSN saved in the logged page header. */
+	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+	if (DB_REDO(op) &&
+	    (cmp_p == 0 ||
+	    (IS_ZERO_LSN(copy_lsn) &&
+	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+		/* Need to redo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+		pagep->lsn = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to reallocate the page. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->header.data, argp->header.size);
+		if (data)
+			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+			    argp->data.data, argp->data.size);
+	}
+	if (pagep != NULL &&
+	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+	pagep = NULL;
+	if (meta != NULL &&
+	    (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+	meta = NULL;
+
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+	return (ret);
+}
+
+/*
+ * __db_pg_free_42_recover --
+ *	Recovery function for 4.2-format pg_free records.  Thin wrapper
+ *	over __db_pg_free_recover_42_int with the data flag clear; the
+ *	args struct is cast because the freedata layout is a superset of
+ *	the free layout.
+ *
+ * PUBLIC: int __db_pg_free_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_free_42_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_free_42_print);
+	REC_INTRO(__db_pg_free_42_read, ip, 0);
+
+	ret = __db_pg_free_recover_42_int(env, ip,
+	    (__db_pg_freedata_42_args *)argp, file_dbp, lsnp, mpf, op, 0);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_42_recover --
+ *	Recovery function for 4.2-format pg_freedata records.  Thin
+ *	wrapper over __db_pg_free_recover_42_int with the data flag set
+ *	(the record includes the page's data section).
+ *
+ * PUBLIC: int __db_pg_freedata_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_freedata_42_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_freedata_42_print);
+	REC_INTRO(__db_pg_freedata_42_read, ip, 0);
+
+	ret = __db_pg_free_recover_42_int(
+	    env, ip, argp, file_dbp, lsnp, mpf, op, 1);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_relink_42_recover --
+ *	Recovery function for 4.2-format relink records: fixes the
+ *	prev/next pointers of up to three pages (the page itself and its
+ *	neighbors) for a page add/remove in a doubly-linked leaf chain.
+ *
+ * PUBLIC: int __db_relink_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_relink_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_relink_42_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_relink_42_print);
+	REC_INTRO(__db_relink_42_read, ip, 0);
+
+	/*
+	 * There are up to three pages we need to check -- the page, and the
+	 * previous and next pages, if they existed. For a page add operation,
+	 * the current page is the result of a split and is being recovered
+	 * elsewhere, so all we need do is recover the next page.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+		goto next2;
+	}
+	if (argp->opcode == DB_ADD_PAGE_COMPAT)
+		goto next1;
+
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Redo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->lsn = *lsnp;
+	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+		/* Undo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->next_pgno = argp->next;
+		pagep->prev_pgno = argp->prev;
+		pagep->lsn = argp->lsn;
+	}
+next1:	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+	/* Fix the next page's prev pointer. */
+next2:	if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, argp->next, ret);
+			goto out;
+		}
+		goto prev;
+	}
+	modified = 0;
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+	if ((argp->opcode == DB_REM_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op)) ||
+	    (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_n == 0 && DB_UNDO(op))) {
+		/* Redo the remove or undo the add. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->prev_pgno = argp->prev;
+		modified = 1;
+	} else if ((argp->opcode == DB_REM_PAGE_COMPAT &&
+	    cmp_n == 0 && DB_UNDO(op)) ||
+	    (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op))) {
+		/* Undo the remove or redo the add. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->prev_pgno = argp->pgno;
+		modified = 1;
+	}
+	if (modified) {
+		if (DB_UNDO(op))
+			pagep->lsn = argp->lsn_next;
+		else
+			pagep->lsn = *lsnp;
+	}
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+	if (argp->opcode == DB_ADD_PAGE_COMPAT)
+		goto done;
+
+	/* Fix the previous page's next pointer. */
+prev:	if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, argp->prev, ret);
+			goto out;
+		}
+		goto done;
+	}
+	modified = 0;
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Redo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->next_pgno = argp->next;
+		modified = 1;
+	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+		/* Undo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->next_pgno = argp->pgno;
+		modified = 1;
+	}
+	if (modified) {
+		if (DB_UNDO(op))
+			pagep->lsn = argp->lsn_prev;
+		else
+			pagep->lsn = *lsnp;
+	}
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
diff --git a/db/db_reclaim.c b/db/db_reclaim.c
new file mode 100644
index 0000000..a44d054
--- /dev/null
+++ b/db/db_reclaim.c
@@ -0,0 +1,246 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_traverse_big
+ *	Traverse a chain of overflow pages and call the callback routine
+ * on each one. The calling convention for the callback is:
+ *	callback(dbc, page, cookie, did_put),
+ * where did_put is a return value indicating if the page in question has
+ * already been returned to the mpool.  If the callback did not put the
+ * page, it is put here after the callback returns.
+ *
+ * PUBLIC: int __db_traverse_big __P((DBC *, db_pgno_t,
+ * PUBLIC:	int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__db_traverse_big(dbc, pgno, callback, cookie)
+	DBC *dbc;
+	db_pgno_t pgno;
+	int (*callback) __P((DBC *, PAGE *, void *, int *));
+	void *cookie;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *p;
+	int did_put, ret;
+
+	mpf = dbc->dbp->mpf;
+
+	do {
+		did_put = 0;
+		if ((ret = __memp_fget(mpf,
+		    &pgno, dbc->thread_info, dbc->txn, 0, &p)) != 0)
+			return (ret);
+		/*
+		 * If we are freeing pages only process the overflow
+		 * chain if the head of the chain has a refcount of 1.
+		 * (Save NEXT_PGNO before the callback may free the page.)
+		 */
+		pgno = NEXT_PGNO(p);
+		if (callback == __db_truncate_callback && OV_REF(p) != 1)
+			pgno = PGNO_INVALID;
+		if ((ret = callback(dbc, p, cookie, &did_put)) == 0 &&
+		    !did_put)
+			ret = __memp_fput(mpf,
+			    dbc->thread_info, p, dbc->priority);
+	} while (ret == 0 && pgno != PGNO_INVALID);
+
+	return (ret);
+}
+
+/*
+ * __db_reclaim_callback
+ *	Page-traversal callback used while deleting a subdatabase: frees
+ *	each visited page.  Btree and hash share the traversal code for
+ *	duplicates and overflow items, so the actual free lives here; hash
+ *	reuses the same traversal for statistics gathering with a
+ *	different callback.
+ *
+ *	The btree/recno root page is deliberately NOT freed here: if the
+ *	transaction aborts, the subdatabase might not be openable to undo
+ *	the free, so the root's free is logged separately.
+ *
+ * PUBLIC: int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_reclaim_callback(dbc, p, cookie, putp)
+	DBC *dbc;
+	PAGE *p;
+	void *cookie;
+	int *putp;
+{
+	DB *dbp;
+	int is_root, ret;
+
+	COMPQUIET(cookie, NULL);
+	dbp = dbc->dbp;
+
+	/* Leave the root of a btree/recno subdatabase alone. */
+	is_root = (dbp->type == DB_BTREE || dbp->type == DB_RECNO) &&
+	    ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p);
+	if (is_root)
+		return (0);
+
+	/* Free the page; only report it as put if the free succeeded. */
+	ret = __db_free(dbc, p);
+	if (ret == 0)
+		*putp = 1;
+	return (ret);
+}
+
+/*
+ * __db_truncate_callback
+ *	This is the callback routine used during a truncate.
+ * we are traversing a btree or hash table and trying to free all the
+ * pages.  The cookie is a u_int32_t record counter that is incremented
+ * for each live item seen; *putp reports whether the page was freed
+ * (or otherwise returned to the mpool) here.
+ *
+ * PUBLIC: int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_truncate_callback(dbc, p, cookie, putp)
+	DBC *dbc;
+	PAGE *p;
+	void *cookie;
+	int *putp;
+{
+	DB *dbp;
+	DBT ddbt, ldbt;
+	DB_MPOOLFILE *mpf;
+	db_indx_t indx, len, off, tlen, top;
+	u_int8_t *hk, type;
+	u_int32_t *countp;
+	int ret;
+
+	top = NUM_ENT(p);
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	countp = cookie;
+	*putp = 1;
+
+	switch (TYPE(p)) {
+	case P_LBTREE:
+		/* Skip for off-page duplicates and deleted items. */
+		for (indx = 0; indx < top; indx += P_INDX) {
+			type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type;
+			if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
+				++*countp;
+		}
+		/* FALLTHROUGH */
+	case P_IBTREE:
+	case P_IRECNO:
+	case P_INVALID:
+		/* Reinitialize (don't free) the btree/recno root page. */
+		if (dbp->type != DB_HASH &&
+		    ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+			type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+			goto reinit;
+		}
+		break;
+	case P_OVERFLOW:
+		/* Drop one reference; free the page only when it hits 0. */
+		if ((ret = __memp_dirty(mpf,
+		    &p, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			return (ret);
+		if (DBC_LOGGING(dbc)) {
+			if ((ret = __db_ovref_log(dbp, dbc->txn,
+			    &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0)
+				return (ret);
+		} else
+			LSN_NOT_LOGGED(LSN(p));
+		if (--OV_REF(p) != 0)
+			*putp = 0;
+		break;
+	case P_LRECNO:
+		for (indx = 0; indx < top; indx += O_INDX) {
+			type = GET_BKEYDATA(dbp, p, indx)->type;
+			if (!B_DISSET(type))
+				++*countp;
+		}
+
+		if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+			type = P_LRECNO;
+			goto reinit;
+		}
+		break;
+	case P_LDUP:
+		/* Correct for deleted items. */
+		for (indx = 0; indx < top; indx += O_INDX)
+			if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type))
+				++*countp;
+
+		break;
+	case P_HASH:
+		/* Correct for on-page duplicates and deleted items. */
+		for (indx = 0; indx < top; indx += P_INDX) {
+			switch (*H_PAIRDATA(dbp, p, indx)) {
+			case H_OFFDUP:
+				break;
+			case H_OFFPAGE:
+			case H_KEYDATA:
+				++*countp;
+				break;
+			case H_DUPLICATE:
+				/* Count each element of an on-page dup set. */
+				tlen = LEN_HDATA(dbp, p, 0, indx);
+				hk = H_PAIRDATA(dbp, p, indx);
+				for (off = 0; off < tlen;
+				    off += len + 2 * sizeof(db_indx_t)) {
+					++*countp;
+					memcpy(&len,
+					    HKEYDATA_DATA(hk)
+					    + off, sizeof(db_indx_t));
+				}
+				break;
+			default:
+				return (__db_pgfmt(dbp->env, p->pgno));
+			}
+		}
+		/* Don't free the head of the bucket. */
+		if (PREV_PGNO(p) == PGNO_INVALID) {
+			type = P_HASH;
+
+			/*
+			 * reinit: log the old page image (header + data) via
+			 * pg_init, then reset the page to an empty page of
+			 * "type" in place rather than freeing it.
+			 */
+reinit:		if ((ret = __memp_dirty(mpf, &p,
+		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+				return (ret);
+			*putp = 0;
+			if (DBC_LOGGING(dbc)) {
+				memset(&ldbt, 0, sizeof(ldbt));
+				memset(&ddbt, 0, sizeof(ddbt));
+				ldbt.data = p;
+				ldbt.size = P_OVERHEAD(dbp);
+				ldbt.size += p->entries * sizeof(db_indx_t);
+				ddbt.data = (u_int8_t *)p + HOFFSET(p);
+				ddbt.size = dbp->pgsize - HOFFSET(p);
+				if ((ret = __db_pg_init_log(dbp,
+				    dbc->txn, &LSN(p), 0,
+				    p->pgno, &ldbt, &ddbt)) != 0)
+					return (ret);
+			} else
+				LSN_NOT_LOGGED(LSN(p));
+
+			P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID,
+			    PGNO_INVALID, type == P_HASH ? 0 : 1, type);
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp->env, p->pgno));
+	}
+
+	/* Free the page, or just release it if it was kept/reinited. */
+	if (*putp == 1) {
+		if ((ret = __db_free(dbc, p)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __memp_fput(mpf, dbc->thread_info, p,
+		    dbc->priority)) != 0)
+			return (ret);
+		*putp = 1;
+	}
+
+	return (0);
+}
diff --git a/db/db_remove.c b/db/db_remove.c
new file mode 100644
index 0000000..6b59ec3
--- /dev/null
+++ b/db/db_remove.c
@@ -0,0 +1,492 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_dbtxn_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+static int __db_subdb_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+
+/*
+ * __env_dbremove_pp
+ * ENV->dbremove pre/post processing.
+ *
+ * PUBLIC: int __env_dbremove_pp __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbremove_pp(dbenv, txn, name, subdb, flags)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret, txn_local;
+
+	dbp = NULL;
+	env = dbenv->env;
+	txn_local = 0;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbremove");
+
+	/*
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if ((ret = __db_fchk(env,
+	    "DB->remove", flags, DB_AUTO_COMMIT | DB_TXN_NOT_DURABLE)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else
+		if (txn != NULL && !TXN_ON(env) &&
+		    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
+			ret = __db_not_txn_env(env);
+			goto err;
+		}
+	LF_CLR(DB_AUTO_COMMIT);
+
+	/*
+	 * Do the remove through a private DB handle; it is closed below,
+	 * after the transaction (if local) has been resolved.
+	 */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+	if (LF_ISSET(DB_TXN_NOT_DURABLE) &&
+	    (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	LF_CLR(DB_TXN_NOT_DURABLE);
+
+	ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+	if (txn_local) {
+		/*
+		 * We created the DBP here and when we commit/abort, we'll
+		 * release all the transactional locks, including the handle
+		 * lock; mark the handle cleared explicitly.
+		 */
+		LOCK_INIT(dbp->handle_lock);
+		dbp->locker = NULL;
+	} else if (txn != NULL) {
+		/*
+		 * We created this handle locally so we need to close it
+		 * and clean it up.  Unfortunately, it's holding transactional
+		 * locks that need to persist until the end of transaction.
+		 * If we invalidate the locker id (dbp->locker), then the close
+		 * won't free these locks prematurely.
+		 */
+		 dbp->locker = NULL;
+	}
+
+err:	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * We never opened this dbp for real, so don't include a transaction
+	 * handle, and use NOSYNC to avoid calling into mpool.
+	 *
+	 * !!!
+	 * Note we're reversing the order of operations: we started the txn and
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- a DB handle cannot be closed before
+	 * resolving the txn.
+	 */
+	if (dbp != NULL &&
+	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_remove_pp
+ * DB->remove pre/post processing.
+ *
+ * PUBLIC: int __db_remove_pp
+ * PUBLIC: __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_pp(dbp, name, subdb, flags)
+	DB *dbp;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	/*
+	 * Validate arguments, continuing to destroy the handle on failure.
+	 *
+	 * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+	 *
+	 * !!!
+	 * We have a serious problem if we're here with a handle used to open
+	 * a database -- we'll destroy the handle, and the application won't
+	 * ever be able to close the database.
+	 */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		return (__db_mi_open(env, "DB->remove", 1));
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(env, "DB->remove", flags, 0)) != 0)
+		return (ret);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Remove the file.  __db_remove() also closes dbp, so the handle
+	 * must not be used after this call.
+	 */
+	ret = __db_remove(dbp, ip, NULL, name, subdb, flags);
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_remove
+ * DB->remove method.
+ *
+ * PUBLIC: int __db_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove(dbp, ip, txn, name, subdb, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	int ret, t_ret;
+
+	/* Do the remove, then close the handle unconditionally. */
+	ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+	/* The handle is closed even if the remove failed. */
+	if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_remove_int
+ * Worker function for the DB->remove method.
+ *
+ * PUBLIC: int __db_remove_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_int(dbp, ip, txn, name, subdb, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+	char *real_name, *tmpname;
+
+	env = dbp->env;
+	real_name = tmpname = NULL;
+
+	/* Unnamed (temporary) databases cannot be removed. */
+	if (name == NULL && subdb == NULL) {
+		__db_errx(env, "Remove on temporary files invalid");
+		ret = EINVAL;
+		goto err;
+	}
+
+	if (name == NULL) {
+		/*
+		 * Named in-memory database: real_name aliases subdb and is
+		 * deliberately not freed at err (guarded by !DB_AM_INMEM).
+		 */
+		MAKE_INMEM(dbp);
+		real_name = (char *)subdb;
+	} else if (subdb != NULL) {
+		/* A subdatabase remove is handled separately. */
+		ret = __db_subdb_remove(dbp, ip, txn, name, subdb);
+		goto err;
+	}
+
+	/* Handle transactional file removes separately. */
+	if (IS_REAL_TXN(txn)) {
+		ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb);
+		goto err;
+	}
+
+	/*
+	 * The remaining case is a non-transactional file remove.
+	 *
+	 * Find the real name of the file.
+	 */
+	if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = __db_appname(env,
+	    DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+		goto err;
+
+	/*
+	 * If this is a file and force is set, remove the temporary file, which
+	 * may have been left around.  Ignore errors because the temporary file
+	 * might not exist.
+	 */
+	if (!F_ISSET(dbp, DB_AM_INMEM) && LF_ISSET(DB_FORCE) &&
+	    (ret = __db_backup_name(env, real_name, NULL, &tmpname)) == 0)
+		(void)__os_unlink(env, tmpname, 0);
+
+	if ((ret = __fop_remove_setup(dbp, NULL, real_name, 0)) != 0)
+		goto err;
+
+	/* Give the access method a chance to remove auxiliary files. */
+	if (dbp->db_am_remove != NULL &&
+	    (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0)
+		goto err;
+
+	ret = F_ISSET(dbp, DB_AM_INMEM) ?
+	    __db_inmem_remove(dbp, NULL, real_name) :
+	    __fop_remove(env,
+	    NULL, dbp->fileid, name, &dbp->dirname, DB_APP_DATA,
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+err:	if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+		__os_free(env, real_name);
+	if (tmpname != NULL)
+		__os_free(env, tmpname);
+
+	return (ret);
+}
+
+/*
+ * __db_inmem_remove --
+ * Removal of a named in-memory database.
+ *
+ * PUBLIC: int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
+ */
+int
+__db_inmem_remove(dbp, txn, name)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name;
+{
+	DBT fid_dbt, name_dbt;
+	DB_LOCKER *locker;
+	DB_LSN lsn;
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+	locker = NULL;
+
+	DB_ASSERT(env, name != NULL);
+
+	/* This had better exist if we are trying to do a remove. */
+	(void)__memp_set_flags(dbp->mpf, DB_MPOOL_NOFILE, 1);
+	if ((ret = __memp_fopen(dbp->mpf, NULL,
+	    name, &dbp->dirname, 0, 0, 0)) != 0)
+		return (ret);
+	if ((ret = __memp_get_fileid(dbp->mpf, dbp->fileid)) != 0)
+		return (ret);
+	dbp->preserve_fid = 1;
+
+	/* Pick the locker: the transaction's if there is one, else the DB's. */
+	if (LOCKING_ON(env)) {
+		if (dbp->locker == NULL &&
+		    (ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+			return (ret);
+		locker = txn == NULL ? dbp->locker : txn->locker;
+	}
+
+	/*
+	 * In a transactional environment, we'll play the same game we play
+	 * for databases in the file system -- create a temporary database
+	 * and put it in with the current name and then rename this one to
+	 * another name.  We'll then use a commit-time event to remove the
+	 * entry.
+	 */
+	if ((ret =
+	    __fop_lock_handle(env, dbp, locker, DB_LOCK_WRITE, NULL, 0)) != 0)
+		return (ret);
+
+	/* Non-transactional: drop the name mapping immediately. */
+	if (!IS_REAL_TXN(txn))
+		ret = __memp_nameop(env, dbp->fileid, NULL, name, NULL, 1);
+	else if (LOGGING_ON(env)) {
+		/* Transactional: defer the removal to commit time and log it. */
+		if (txn != NULL && (ret =
+		    __txn_remevent(env, txn, name, dbp->fileid, 1)) != 0)
+			return (ret);
+
+		DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+		DB_INIT_DBT(fid_dbt, dbp->fileid, DB_FILE_ID_LEN);
+		ret = __crdel_inmem_remove_log(
+		    env, txn, &lsn, 0, &name_dbt, &fid_dbt);
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_subdb_remove --
+ * Remove a subdatabase.
+ */
+static int
+__db_subdb_remove(dbp, ip, txn, name, subdb)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+{
+	DB *mdbp, *sdbp;
+	int ret, t_ret;
+
+	mdbp = sdbp = NULL;
+
+	/* Open the subdatabase. */
+	if ((ret = __db_create_internal(&sdbp, dbp->env, 0)) != 0)
+		goto err;
+	/* Propagate the caller's durability setting to the new handle. */
+	if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+	    (ret = __db_set_flags(sdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	if ((ret = __db_open(sdbp, ip,
+	    txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);
+
+	/* Free up the pages in the subdatabase. */
+	switch (sdbp->type) {
+		case DB_BTREE:
+		case DB_RECNO:
+			if ((ret = __bam_reclaim(sdbp, ip, txn)) != 0)
+				goto err;
+			break;
+		case DB_HASH:
+			if ((ret = __ham_reclaim(sdbp, ip, txn)) != 0)
+				goto err;
+			break;
+		case DB_QUEUE:
+		case DB_UNKNOWN:
+		default:
+			/* Queue doesn't support subdatabases. */
+			ret = __db_unknown_type(
+			    sdbp->env, "__db_subdb_remove", sdbp->type);
+			goto err;
+	}
+
+	/*
+	 * Remove the entry from the main database and free the subdatabase
+	 * metadata page.
+	 */
+	if ((ret = __db_master_open(sdbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+		goto err;
+
+	if ((ret = __db_master_update(mdbp,
+	    sdbp, ip, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+	/* Close the main and subdatabases. */
+	if ((t_ret = __db_close(sdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (mdbp != NULL &&
+	    (t_ret = __db_close(mdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+static int
+__db_dbtxn_remove(dbp, ip, txn, name, subdb)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+{
+	ENV *env;
+	int ret;
+	char *tmpname;
+
+	env = dbp->env;
+	tmpname = NULL;
+
+	/*
+	 * This is a transactional remove, so we have to keep the name
+	 * of the file locked until the transaction commits.  As a result,
+	 * we implement remove by renaming the file to some other name
+	 * (which creates a dummy named file as a placeholder for the
+	 * file being renamed/removed) and then deleting that file as
+	 * a delayed remove at commit.
+	 */
+	if ((ret = __db_backup_name(env,
+	    F_ISSET(dbp, DB_AM_INMEM) ? subdb : name, txn, &tmpname)) != 0)
+		return (ret);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+	if ((ret = __db_rename_int(dbp,
+	    txn->thread_info, txn, name, subdb, tmpname)) != 0)
+		goto err;
+
+	/*
+	 * The internal removes will also translate into delayed removes.
+	 */
+	if (dbp->db_am_remove != NULL &&
+	    (ret = dbp->db_am_remove(dbp, ip, txn, tmpname, NULL, 0)) != 0)
+		goto err;
+
+	/* Remove the renamed placeholder, in memory or on disk. */
+	ret = F_ISSET(dbp, DB_AM_INMEM) ?
+	     __db_inmem_remove(dbp, txn, tmpname) :
+	    __fop_remove(env,
+	    txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA,
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+err:
+DB_TEST_RECOVERY_LABEL
+	if (tmpname != NULL)
+		__os_free(env, tmpname);
+
+	return (ret);
+}
diff --git a/db/db_rename.c b/db/db_rename.c
new file mode 100644
index 0000000..1fdf721
--- /dev/null
+++ b/db/db_rename.c
@@ -0,0 +1,372 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *));
+static int __db_subdb_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *));
+
+/*
+ * __env_dbrename_pp
+ * ENV->dbrename pre/post processing.
+ *
+ * PUBLIC: int __env_dbrename_pp __P((DB_ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbrename_pp(dbenv, txn, name, subdb, newname, flags)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret, txn_local;
+
+	env = dbenv->env;
+	dbp = NULL;
+	txn_local = 0;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbrename");
+
+	/*
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if ((ret = __db_fchk(env, "DB->rename", flags, DB_AUTO_COMMIT)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else
+		if (txn != NULL && !TXN_ON(env) &&
+		    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
+			ret = __db_not_txn_env(env);
+			goto err;
+		}
+
+	LF_CLR(DB_AUTO_COMMIT);
+
+	/* Do the rename through a private DB handle; it is closed below. */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+
+	ret = __db_rename_int(dbp, ip, txn, name, subdb, newname);
+
+	if (txn_local) {
+		/*
+		 * We created the DBP here and when we commit/abort, we'll
+		 * release all the transactional locks, including the handle
+		 * lock; mark the handle cleared explicitly.
+		 */
+		LOCK_INIT(dbp->handle_lock);
+		dbp->locker = NULL;
+	} else if (txn != NULL) {
+		/*
+		 * We created this handle locally so we need to close it and
+		 * clean it up.  Unfortunately, it's holding transactional
+		 * or CDS group locks that need to persist until the end of
+		 * transaction.  If we invalidate the locker (dbp->locker),
+		 * then the close won't free these locks prematurely.
+		 */
+		dbp->locker = NULL;
+	}
+
+err:	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * We never opened this dbp for real, so don't include a transaction
+	 * handle, and use NOSYNC to avoid calling into mpool.
+	 *
+	 * !!!
+	 * Note we're reversing the order of operations: we started the txn and
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- it's safer.
+	 */
+	if (dbp != NULL &&
+	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_rename_pp
+ * DB->rename pre/post processing.
+ *
+ * PUBLIC: int __db_rename_pp __P((DB *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename_pp(dbp, name, subdb, newname, flags)
+	DB *dbp;
+	const char *name, *subdb, *newname;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+	/* Initialized here so the err path is safe before ENV_ENTER. */
+	handle_check = 0;
+
+	/*
+	 * Validate arguments, continuing to destroy the handle on failure.
+	 *
+	 * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+	 *
+	 * !!!
+	 * We have a serious problem if we're here with a handle used to open
+	 * a database -- we'll destroy the handle, and the application won't
+	 * ever be able to close the database.
+	 */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		return (__db_mi_open(env, "DB->rename", 1));
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(env, "DB->rename", flags, 0)) != 0)
+		return (ret);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Rename the file.  __db_rename() also closes dbp, so the handle
+	 * must not be used after this call.
+	 */
+	ret = __db_rename(dbp, ip, NULL, name, subdb, newname);
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_rename
+ * DB->rename method.
+ *
+ */
+static int
+__db_rename(dbp, ip, txn, name, subdb, newname)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+{
+	int ret, t_ret;
+
+	/* Do the rename, then close the handle unconditionally. */
+	ret = __db_rename_int(dbp, ip, txn, name, subdb, newname);
+
+	/* The handle is closed even if the rename failed. */
+	if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_rename_int
+ * Worker function for DB->rename method; the close of the dbp is
+ * left in the wrapper routine.
+ *
+ * PUBLIC: int __db_rename_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__db_rename_int(dbp, ip, txn, name, subdb, newname)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+{
+	ENV *env;
+	int ret;
+	char *old, *real_name;
+
+	env = dbp->env;
+	real_name = NULL;
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+	/* Unnamed (temporary) databases cannot be renamed. */
+	if (name == NULL && subdb == NULL) {
+		__db_errx(env, "Rename on temporary files invalid");
+		ret = EINVAL;
+		goto err;
+	}
+
+	if (name == NULL)
+		MAKE_INMEM(dbp);
+	else if (subdb != NULL) {
+		/* A subdatabase rename is handled separately. */
+		ret = __db_subdb_rename(dbp, ip, txn, name, subdb, newname);
+		goto err;
+	}
+
+	/*
+	 * From here on down, this pertains to files or in-memory databases.
+	 *
+	 * Find the real name of the file.
+	 */
+	if (F_ISSET(dbp, DB_AM_INMEM)) {
+		/*
+		 * In-memory: real_name aliases subdb and is deliberately
+		 * not freed at err (guarded by !DB_AM_INMEM).
+		 */
+		old = (char *)subdb;
+		real_name = (char *)subdb;
+	} else {
+		if ((ret = __db_appname(env, DB_APP_DATA,
+		    name, &dbp->dirname, &real_name)) != 0)
+			goto err;
+		old = (char *)name;
+	}
+	DB_ASSERT(env, old != NULL);
+
+	if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0)
+		goto err;
+
+	/* Give the access method a chance to rename auxiliary files. */
+	if (dbp->db_am_rename != NULL &&
+	    (ret = dbp->db_am_rename(dbp, ip, txn, name, subdb, newname)) != 0)
+		goto err;
+
+	/*
+	 * The transactional case and non-transactional case are
+	 * quite different.  In the non-transactional case, we simply
+	 * do the rename.  In the transactional case, since we need
+	 * the ability to back out and maintain locking, we have to
+	 * create a temporary object as a placeholder.  This is all
+	 * taken care of in the fop layer.
+	 */
+	if (IS_REAL_TXN(txn)) {
+		if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0)
+			goto err;
+	} else {
+		if ((ret = __fop_dbrename(dbp, old, newname)) != 0)
+			goto err;
+	}
+
+	/*
+	 * I am pretty sure that we haven't gotten a dbreg id, so calling
+	 * dbreg_filelist_update is not necessary.
+	 */
+	DB_ASSERT(env, dbp->log_filename == NULL ||
+	    dbp->log_filename->id == DB_LOGFILEID_INVALID);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname);
+
+DB_TEST_RECOVERY_LABEL
+err:	if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+		__os_free(env, real_name);
+
+	return (ret);
+}
+
+/*
+ * __db_subdb_rename --
+ * Rename a subdatabase.
+ */
+static int
+__db_subdb_rename(dbp, ip, txn, name, subdb, newname)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+{
+	DB *mdbp;
+	ENV *env;
+	PAGE *meta;
+	int ret, t_ret;
+
+	mdbp = NULL;
+	meta = NULL;
+	env = dbp->env;
+
+	/*
+	 * We have not opened this dbp so it isn't marked as a subdb,
+	 * but it ought to be.
+	 */
+	F_SET(dbp, DB_AM_SUBDB);
+
+	/*
+	 * Rename the entry in the main database.  We need to first
+	 * get the meta-data page number (via MU_OPEN) so that we can
+	 * read the meta-data page and obtain a handle lock.  Once we've
+	 * done that, we can proceed to do the rename in the master.
+	 */
+	if ((ret = __db_master_open(dbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+		goto err;
+
+	if ((ret = __db_master_update(mdbp, dbp, ip, txn, subdb, dbp->type,
+	    MU_OPEN, NULL, 0)) != 0)
+		goto err;
+
+	/* Read the subdatabase's file ID off its meta-data page. */
+	if ((ret = __memp_fget(mdbp->mpf, &dbp->meta_pgno,
+	    ip, txn, 0, &meta)) != 0)
+		goto err;
+	memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+	if ((ret = __fop_lock_handle(env,
+	    dbp, mdbp->locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn))) != 0)
+		goto err;
+
+	/* Release the meta page before updating the master. */
+	ret = __memp_fput(mdbp->mpf, ip, meta, dbp->priority);
+	meta = NULL;
+	if (ret != 0)
+		goto err;
+
+	if ((ret = __db_master_update(mdbp, dbp, ip, txn,
+	    subdb, dbp->type, MU_RENAME, newname, 0)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+	if (meta != NULL && (t_ret =
+	    __memp_fput(mdbp->mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (mdbp != NULL &&
+	    (t_ret = __db_close(mdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
diff --git a/db/db_ret.c b/db/db_ret.c
new file mode 100644
index 0000000..5ff60d1
--- /dev/null
+++ b/db/db_ret.c
@@ -0,0 +1,156 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * __db_ret --
+ * Build return DBT.
+ *
+ * PUBLIC: int __db_ret __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
+ */
+int
+__db_ret(dbc, h, indx, dbt, memp, memsize)
+	DBC *dbc;
+	PAGE *h;
+	u_int32_t indx;
+	DBT *dbt;
+	void **memp;
+	u_int32_t *memsize;
+{
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+	DB *dbp;
+	HOFFPAGE ho;
+	u_int32_t len;
+	u_int8_t *hk;
+	void *data;
+
+	dbp = dbc->dbp;
+
+	/* Locate the item's data and length, by page type. */
+	switch (TYPE(h)) {
+	case P_HASH_UNSORTED:
+	case P_HASH:
+		hk = P_ENTRY(dbp, h, indx);
+		/* Off-page items are fetched through the overflow chain. */
+		if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+			memcpy(&ho, hk, sizeof(HOFFPAGE));
+			return (__db_goff(dbc, dbt,
+			    ho.tlen, ho.pgno, memp, memsize));
+		}
+		len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
+		data = HKEYDATA_DATA(hk);
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+	case P_LRECNO:
+		bk = GET_BKEYDATA(dbp, h, indx);
+		/* Overflow items are fetched through the overflow chain. */
+		if (B_TYPE(bk->type) == B_OVERFLOW) {
+			bo = (BOVERFLOW *)bk;
+			return (__db_goff(dbc, dbt,
+			    bo->tlen, bo->pgno, memp, memsize));
+		}
+		len = bk->len;
+		data = bk->data;
+		break;
+	default:
+		return (__db_pgfmt(dbp->env, h->pgno));
+	}
+
+	/* Copy the on-page item out to the user's DBT. */
+	return (__db_retcopy(dbp->env, dbt, data, len, memp, memsize));
+}
+
+/*
+ * __db_retcopy --
+ * Copy the returned data into the user's DBT, handling special flags.
+ *
+ * PUBLIC: int __db_retcopy __P((ENV *, DBT *,
+ * PUBLIC: void *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__db_retcopy(env, dbt, data, len, memp, memsize)
+	ENV *env;
+	DBT *dbt;
+	void *data;
+	u_int32_t len;
+	void **memp;
+	u_int32_t *memsize;
+{
+	int ret;
+
+	ret = 0;
+
+	/*
+	 * If returning a partial record, reset the length.  A doff past the
+	 * end of the record yields a zero-length result.
+	 */
+	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+		data = (u_int8_t *)data + dbt->doff;
+		if (len > dbt->doff) {
+			len -= dbt->doff;
+			if (len > dbt->dlen)
+				len = dbt->dlen;
+		} else
+			len = 0;
+	}
+
+	/*
+	 * Allocate memory to be owned by the application: DB_DBT_MALLOC,
+	 * DB_DBT_REALLOC.
+	 *
+	 * !!!
+	 * We always allocate memory, even if we're copying out 0 bytes. This
+	 * guarantees consistency, i.e., the application can always free memory
+	 * without concern as to how many bytes of the record were requested.
+	 *
+	 * Use the memory specified by the application: DB_DBT_USERMEM.
+	 *
+	 * !!!
+	 * If the length we're going to copy is 0, the application-supplied
+	 * memory pointer is allowed to be NULL.
+	 */
+	if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+		/* Hand the bytes to the application's copy callback. */
+		dbt->size = len;
+		return (len == 0 ? 0 : env->dbt_usercopy(dbt, 0, data,
+		    len, DB_USERCOPY_SETDATA));
+
+	} else if (F_ISSET(dbt, DB_DBT_MALLOC))
+		ret = __os_umalloc(env, len, &dbt->data);
+	else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+		if (dbt->data == NULL || dbt->size == 0 || dbt->size < len)
+			ret = __os_urealloc(env, len, &dbt->data);
+	} else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+		/* Caller's buffer too small: report DB_BUFFER_SMALL. */
+		if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
+			ret = DB_BUFFER_SMALL;
+	} else if (memp == NULL || memsize == NULL)
+		ret = EINVAL;
+	else {
+		/* Default: use (and grow) the DB-owned scratch buffer. */
+		if (len != 0 && (*memsize == 0 || *memsize < len)) {
+			if ((ret = __os_realloc(env, len, memp)) == 0)
+				*memsize = len;
+			else
+				*memsize = 0;
+		}
+		if (ret == 0)
+			dbt->data = *memp;
+	}
+
+	if (ret == 0 && len != 0)
+		memcpy(dbt->data, data, len);
+
+	/*
+	 * Return the length of the returned record in the DBT size field.
+	 * This satisfies the requirement that if we're using user memory
+	 * and insufficient memory was provided, return the amount necessary
+	 * in the size field.
+	 */
+	dbt->size = len;
+
+	return (ret);
+}
diff --git a/db/db_setid.c b/db/db_setid.c
new file mode 100644
index 0000000..a78977e
--- /dev/null
+++ b/db/db_setid.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * __env_fileid_reset_pp --
+ * ENV->fileid_reset pre/post processing.
+ *
+ * PUBLIC: int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_fileid_reset_pp(dbenv, name, flags)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->fileid_reset");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.  DB_ENCRYPT is the only legal flag.
+	 */
+	if (flags != 0 && flags != DB_ENCRYPT)
+		return (__db_ferr(env, "DB_ENV->fileid_reset", 0));
+
+	ENV_ENTER(env, ip);
+	REPLICATION_WRAP(env,
+	    (__env_fileid_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+	    1, ret);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __env_fileid_reset --
+ * Reset the file IDs for every database in the file.
+ * PUBLIC: int __env_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, int));
+ */
+int
+__env_fileid_reset(env, ip, name, encrypted)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	const char *name;
+	int encrypted;
+{
+	DB *dbp;
+	DBC *dbcp;
+	DBMETA *meta;
+	DBT key, data;
+	DB_FH *fhp;
+	DB_MPOOLFILE *mpf;
+	DB_PGINFO cookie;
+	db_pgno_t pgno;
+	int t_ret, ret;
+	size_t n;
+	char *real_name;
+	u_int8_t fileid[DB_FILE_ID_LEN], mbuf[DBMETASIZE];
+	void *pagep;
+
+	dbp = NULL;
+	dbcp = NULL;
+	fhp = NULL;
+	real_name = NULL;
+
+	/* Get the real backing file name. */
+	if ((ret = __db_appname(env,
+	    DB_APP_DATA, name, NULL, &real_name)) != 0)
+		return (ret);
+
+	/* Get a new file ID. */
+	if ((ret = __os_fileid(env, real_name, 1, fileid)) != 0)
+		goto err;
+
+	/*
+	 * The user may have physically copied a file currently open in the
+	 * cache, which means if we open this file through the cache before
+	 * updating the file ID on page 0, we might connect to the file from
+	 * which the copy was made.  So, rewrite page 0 directly through the
+	 * file system first.
+	 */
+	if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+		__db_err(env, ret, "%s", real_name);
+		goto err;
+	}
+	if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+		goto err;
+
+	/* A file shorter than a meta-data page is not a database. */
+	if (n != sizeof(mbuf)) {
+		ret = EINVAL;
+		__db_errx(env,
+		    "__env_fileid_reset: %s: unexpected file type or format",
+		    real_name);
+		goto err;
+	}
+
+	/*
+	 * Create the DB object.
+	 */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+
+	/* If configured with a password, the databases are encrypted. */
+	if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+		goto err;
+
+	/* Verify/decrypt the meta-data page in mbuf. */
+	if ((ret = __db_meta_setup(env,
+	    dbp, real_name, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0)
+		goto err;
+
+	/* Partitioned databases keep their data in separate files; recurse. */
+	meta = (DBMETA *)mbuf;
+	if (FLD_ISSET(meta->metaflags,
+	    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && (ret =
+	    __part_fileid_reset(env, ip, name, meta->nparts, encrypted)) != 0)
+		goto err;
+
+	/*
+	 * Stamp the new ID and write the page back out.  The DBT carries
+	 * the DB_PGINFO cookie __db_pgout needs to checksum/encrypt.
+	 */
+	memcpy(meta->uid, fileid, DB_FILE_ID_LEN);
+	cookie.db_pagesize = sizeof(mbuf);
+	cookie.flags = dbp->flags;
+	cookie.type = dbp->type;
+	key.data = &cookie;
+
+	if ((ret = __db_pgout(env->dbenv, 0, mbuf, &key)) != 0)
+		goto err;
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+		goto err;
+	if ((ret = __os_write(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+		goto err;
+	if ((ret = __os_fsync(env, fhp)) != 0)
+		goto err;
+
+	/*
+	 * Page 0 of the file has an updated file ID, and we can open it in
+	 * the cache without connecting to a different, existing file.  Open
+	 * the file in the cache, and update the file IDs for subdatabases.
+	 * (No existing code, as far as I know, actually uses the file ID of
+	 * a subdatabase, but it's cleaner to get them all.)
+	 */
+
+	/*
+	 * If the database file doesn't support subdatabases, we only have
+	 * to update a single metadata page.  Otherwise, we have to open a
+	 * cursor and step through the master database, and update all of
+	 * the subdatabases' metadata pages.
+	 */
+	if (meta->type != P_BTREEMETA || !F_ISSET(meta, BTM_SUBDB))
+		goto err;
+
+	/*
+	 * Open the DB file.
+	 *
+	 * !!!
+	 * Note DB_RDWRMASTER flag, we need to open the master database file
+	 * for writing in this case.
+	 */
+	if ((ret = __db_open(dbp, ip, NULL,
+	    name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	mpf = dbp->mpf;
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	if ((ret = __db_cursor(dbp, ip, NULL, &dbcp, 0)) != 0)
+		goto err;
+	while ((ret = __dbc_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+		/*
+		 * XXX
+		 * We're handling actual data, not on-page meta-data, so it
+		 * hasn't been converted to/from opposite endian architectures.
+		 * Do it explicitly, now.
+		 */
+		memcpy(&pgno, data.data, sizeof(db_pgno_t));
+		DB_NTOHL_SWAP(env, &pgno);
+		if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+		    DB_MPOOL_DIRTY, &pagep)) != 0)
+			goto err;
+		memcpy(((DBMETA *)pagep)->uid, fileid, DB_FILE_ID_LEN);
+		if ((ret = __memp_fput(mpf, ip, pagep, dbcp->priority)) != 0)
+			goto err;
+	}
+	/* DB_NOTFOUND just means we walked off the end of the master DB. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+err:	if (dbcp != NULL && (t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+		ret = t_ret;
+	if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (fhp != NULL &&
+	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+		ret = t_ret;
+	if (real_name != NULL)
+		__os_free(env, real_name);
+
+	return (ret);
+}
diff --git a/db/db_setlsn.c b/db/db_setlsn.c
new file mode 100644
index 0000000..51ee7d3
--- /dev/null
+++ b/db/db_setlsn.c
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+
+static int __env_lsn_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
+
+/*
+ * __env_lsn_reset_pp --
+ *	ENV->lsn_reset pre/post processing.
+ *
+ * "name" is the physical database file whose page LSNs will be cleared;
+ * DB_ENCRYPT is the only legal flag.  The real work happens in the static
+ * __env_lsn_reset, wrapped for replication re-entry handling.
+ *
+ * PUBLIC: int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_lsn_reset_pp(dbenv, name, flags)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->lsn_reset");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0 && flags != DB_ENCRYPT)
+		return (__db_ferr(env, "DB_ENV->lsn_reset", 0));
+
+	ENV_ENTER(env, ip);
+	REPLICATION_WRAP(env,
+	    (__env_lsn_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+	    1, ret);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __env_lsn_reset --
+ *	Reset the LSNs for every page in the file.
+ *
+ * Opens "name" as a database of unknown type, walks its mpool pages via
+ * __db_lsn_reset, then does extra per-partition or per-extent passes for
+ * partitioned and queue databases, whose data live in additional files.
+ * "encrypted" is non-zero if the file must be opened with DB_ENCRYPT.
+ */
+static int
+__env_lsn_reset(env, ip, name, encrypted)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	const char *name;
+	int encrypted;
+{
+	DB *dbp;
+	int t_ret, ret;
+
+	/* Create the DB object. */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		return (ret);
+
+	/* If configured with a password, the databases are encrypted. */
+	if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+		goto err;
+
+	/*
+	 * Open the DB file.
+	 *
+	 * !!!
+	 * Note DB_RDWRMASTER flag, we need to open the master database file
+	 * for writing in this case.
+	 */
+	if ((ret = __db_open(dbp, ip, NULL,
+	    name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0) {
+		__db_err(env, ret, "%s", name);
+		goto err;
+	}
+
+	ret = __db_lsn_reset(dbp->mpf, ip);
+#ifdef HAVE_PARTITION
+	if (ret == 0 && DB_IS_PARTITIONED(dbp))
+		ret = __part_lsn_reset(dbp, ip);
+	else
+#endif
+	if (ret == 0 && dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+		ret = __qam_lsn_reset(dbp, ip);
+#else
+		ret = __db_no_queue_am(env);
+#endif
+
+	/* Close the handle in all cases; preserve the first error seen. */
+err:	if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_lsn_reset -- reset the lsn for a db mpool handle.
+ *
+ * Fetches every page of the file in order, starting from page 0, marks it
+ * dirty and stamps a "not logged" LSN on it.  Iteration stops when
+ * __memp_fget returns DB_PAGE_NOTFOUND (normal end of file, mapped to
+ * success) or on any other error.
+ *
+ * PUBLIC: int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *));
+ */
+int
+__db_lsn_reset(mpf, ip)
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+{
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int ret;
+
+	/* Reset the LSN on every page of the database file. */
+	for (pgno = 0;
+	    (ret = __memp_fget(mpf,
+	    &pgno, ip, NULL, DB_MPOOL_DIRTY, &pagep)) == 0;
+	    ++pgno) {
+		LSN_NOT_LOGGED(pagep->lsn);
+		if ((ret = __memp_fput(mpf,
+		    ip, pagep, DB_PRIORITY_UNCHANGED)) != 0)
+			break;
+	}
+
+	/* DB_PAGE_NOTFOUND simply means we walked off the end of the file. */
+	if (ret == DB_PAGE_NOTFOUND)
+		ret = 0;
+
+	return (ret);
+}
diff --git a/db/db_sort_multiple.c b/db/db_sort_multiple.c
new file mode 100644
index 0000000..32ae2df
--- /dev/null
+++ b/db/db_sort_multiple.c
@@ -0,0 +1,287 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_quicksort __P((DB *, DBT *, DBT *, u_int32_t *, u_int32_t *,
+ u_int32_t *, u_int32_t *, u_int32_t));
+
+/*
+ * __db_compare_both --
+ *	Use the comparison functions from db to compare akey and bkey, and if
+ *	DB_DUPSORT adata and bdata.
+ *
+ * Returns <0, 0 or >0 per the usual comparator convention.  A NULL data
+ * DBT sorts before a non-NULL one; two NULL data DBTs compare equal.
+ *
+ * PUBLIC: int __db_compare_both __P((DB *, const DBT *, const DBT *,
+ * PUBLIC:	 const DBT *, const DBT *));
+ */
+int
+__db_compare_both(db, akey, adata, bkey, bdata)
+	DB *db;
+	const DBT *akey;
+	const DBT *adata;
+	const DBT *bkey;
+	const DBT *bdata;
+{
+	BTREE *t;
+	int cmp;
+
+	t = (BTREE *)db->bt_internal;
+
+	/* Keys decide first; data only breaks ties for DB_DUPSORT handles. */
+	cmp = t->bt_compare(db, akey, bkey);
+	if (cmp != 0) return cmp;
+	if (!F_ISSET(db, DB_AM_DUPSORT)) return 0;
+
+	if (adata == 0) return bdata == 0 ? 0 : -1;
+	if (bdata == 0) return 1;
+
+#ifdef HAVE_COMPRESSION
+	/* Compressed handles keep their own duplicate comparator. */
+	if (DB_IS_COMPRESSED(db))
+		return t->compress_dup_compare(db, adata, bdata);
+#endif
+	return db->dup_compare(db, adata, bdata);
+}
+
+/*
+ * Helper macros for __db_quicksort.  They operate on the offset/length
+ * pairs of a DB_MULTIPLE(_KEY) result buffer: ptr[0] is a data offset,
+ * ptr[-1] the corresponding length.  NOTE: they deliberately rely on the
+ * variables `tmp', `key' and `data' being in scope at the point of use
+ * (they are locals/parameters of __db_quicksort).
+ */
+
+/* Swap the offset/length pairs at a/b (and ad/bd when data is present). */
+#define DB_SORT_SWAP(a, ad, b, bd) \
+do { \
+	tmp = (a)[0]; (a)[0] = (b)[0]; (b)[0] = tmp; \
+	tmp = (a)[-1]; (a)[-1] = (b)[-1]; (b)[-1] = tmp; \
+	if (data != NULL) { \
+		tmp = (ad)[0]; (ad)[0] = (bd)[0]; (bd)[0] = tmp; \
+		tmp = (ad)[-1]; (ad)[-1] = (bd)[-1]; (bd)[-1] = tmp; \
+	} \
+} while (0)
+
+/* Point DBT a (and ad) at the key (and data) item described by aptr/adptr. */
+#define DB_SORT_LOAD_DBT(a, ad, aptr, adptr) \
+do { \
+	(a).data = (u_int8_t*)key->data + (aptr)[0]; \
+	(a).size = (aptr)[-1]; \
+	if (data != NULL) { \
+		(ad).data = (u_int8_t*)data->data + (adptr)[0]; \
+		(ad).size = (adptr)[-1]; \
+	} \
+} while (0)
+
+/* Compare two items, including data portions only when data is present. */
+#define DB_SORT_COMPARE(a, ad, b, bd) (data != NULL ? \
+	__db_compare_both(db, &(a), &(ad), &(b), &(bd)) : \
+	__db_compare_both(db, &(a), 0, &(b), 0))
+
+/* Initial (stack-allocated) depth of the explicit quicksort stack. */
+#define DB_SORT_STACKSIZE 32
+
+/*
+ * __db_quicksort --
+ *	The quicksort implementation for __db_sort_multiple() and
+ *	__db_sort_multiple_key().
+ *
+ * kstart/kend (and dstart/dend for separate data) delimit the offset/length
+ * index at the END of a DB_MULTIPLE(_KEY) buffer; note the index grows
+ * downward, so "end" is a SMALLER address than "start" and the walk uses
+ * pointer decrements of `size' u_int32_t slots per item (2 for DB_MULTIPLE,
+ * 4 for DB_MULTIPLE_KEY).  Recursion is simulated with an explicit stack
+ * that starts on the C stack and is moved to the heap if it overflows.
+ */
+static int
+__db_quicksort(db, key, data, kstart, kend, dstart, dend, size)
+	DB *db;
+	DBT *key, *data;
+	u_int32_t *kstart, *kend, *dstart, *dend;
+	u_int32_t size;
+{
+	int ret;
+	u_int32_t tmp;
+	u_int32_t *kmiddle, *dmiddle, *kptr, *dptr;
+	DBT a, ad, b, bd, m, md;
+	ENV *env;
+
+	struct DB_SORT_quicksort_stack {
+		u_int32_t *kstart;
+		u_int32_t *kend;
+		u_int32_t *dstart;
+		u_int32_t *dend;
+	} stackbuf[DB_SORT_STACKSIZE], *stack;
+	u_int32_t soff, slen;
+
+	ret = 0;
+	env = db->env;
+
+	memset(&a, 0, sizeof(DBT));
+	memset(&ad, 0, sizeof(DBT));
+	memset(&b, 0, sizeof(DBT));
+	memset(&bd, 0, sizeof(DBT));
+	memset(&m, 0, sizeof(DBT));
+	memset(&md, 0, sizeof(DBT));
+
+	/* NB end is smaller than start */
+
+	stack = stackbuf;
+	soff = 0;
+	slen = DB_SORT_STACKSIZE;
+
+ start:
+	/* Empty range: nothing to sort in this partition. */
+	if (kend >= kstart) goto pop;
+
+	/* If there's only one value, it's already sorted */
+	tmp = (u_int32_t)(kstart - kend) / size;
+	if (tmp == 1) goto pop;
+
+	DB_SORT_LOAD_DBT(a, ad, kstart, dstart);
+	DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+
+	if (tmp == 2) {
+		/* Special case the sorting of two value sequences */
+		if (DB_SORT_COMPARE(a, ad, b, bd) > 0) {
+			DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+		}
+		goto pop;
+	}
+
+	kmiddle = kstart - (tmp / 2) * size;
+	dmiddle = dstart - (tmp / 2) * size;
+	DB_SORT_LOAD_DBT(m, md, kmiddle, dmiddle);
+
+	/*
+	 * Find the median of three: move it to the "end" slot (kend + size)
+	 * so it becomes the partition pivot below.
+	 */
+	if (DB_SORT_COMPARE(a, ad, b, bd) < 0) {
+		if (DB_SORT_COMPARE(m, md, a, ad) < 0) {
+			/* m < a < b */
+			DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+		} else if (DB_SORT_COMPARE(m, md, b, bd) < 0) {
+			/* a < m < b */
+			DB_SORT_SWAP(kmiddle,
+			    dmiddle, kend + size, dend + size);
+		} else {
+			/* a < b < m */
+			/* Do nothing */
+		}
+	} else {
+		if (DB_SORT_COMPARE(a, ad, m, md) < 0) {
+			/* b < a < m */
+			DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+		} else if (DB_SORT_COMPARE(b, bd, m, md) < 0) {
+			/* b < m < a */
+			DB_SORT_SWAP(kmiddle,
+			    dmiddle, kend + size, dend + size);
+		} else {
+			/* m < b < a */
+			/* Do nothing */
+		}
+	}
+
+	/* partition */
+	DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+	kmiddle = kstart;
+	dmiddle = dstart;
+	for (kptr = kstart, dptr = dstart; kptr > kend;
+	    kptr -= size, dptr -= size) {
+		DB_SORT_LOAD_DBT(a, ad, kptr, dptr);
+		if (DB_SORT_COMPARE(a, ad, b, bd) < 0) {
+			DB_SORT_SWAP(kmiddle, dmiddle, kptr, dptr);
+			kmiddle -= size;
+			dmiddle -= size;
+		}
+	}
+
+	/* Put the pivot into its final position. */
+	DB_SORT_SWAP(kmiddle, dmiddle, kend + size, dend + size);
+
+	if (soff == slen) {
+		/* Grow the stack */
+		slen = slen * 2;
+		if (stack == stackbuf) {
+			/* First overflow: migrate from C stack to heap. */
+			ret = __os_malloc(env, slen *
+			    sizeof(struct DB_SORT_quicksort_stack), &stack);
+			if (ret != 0) goto error;
+			memcpy(stack, stackbuf, soff *
+			    sizeof(struct DB_SORT_quicksort_stack));
+		} else {
+			ret = __os_realloc(env, slen *
+			    sizeof(struct DB_SORT_quicksort_stack), &stack);
+			if (ret != 0) goto error;
+		}
+	}
+
+	/* divide and conquer */
+	stack[soff].kstart = kmiddle - size;
+	stack[soff].kend = kend;
+	stack[soff].dstart = dmiddle - size;
+	stack[soff].dend = dend;
+	++soff;
+
+	/* Loop on the upper partition; the lower one was pushed above. */
+	kend = kmiddle;
+	dend = dmiddle;
+
+	goto start;
+
+ pop:
+	if (soff != 0) {
+		--soff;
+		kstart = stack[soff].kstart;
+		kend = stack[soff].kend;
+		dstart = stack[soff].dstart;
+		dend = stack[soff].dend;
+		goto start;
+	}
+
+	/* Stack exhausted: fall through with ret == 0 on the success path. */
+ error:
+	if (stack != stackbuf)
+		__os_free(env, stack);
+
+	return ret;
+}
+
+#undef DB_SORT_SWAP
+#undef DB_SORT_LOAD_DBT
+
+/*
+ * __db_sort_multiple --
+ *	If flags == DB_MULTIPLE_KEY, sorts a DB_MULTIPLE_KEY format DBT using
+ *	the BTree comparison function and duplicate comparison function.
+ *
+ *	If flags == DB_MULTIPLE, sorts one or two DB_MULTIPLE format DBTs using
+ *	the BTree comparison function and duplicate comparison function. Will
+ *	assume key and data specifies pairs of key/data to sort together. If
+ *	data is NULL, will just sort key according to the btree comparison
+ *	function.
+ *
+ *	Uses an in-place quicksort algorithm, with median of three for the pivot
+ *	point.
+ *
+ * The offset/length index sits at the end of the buffer (key->ulen bytes
+ * in) and grows downward, terminated by a (u_int32_t)-1 sentinel; items are
+ * 2 u_int32_t wide for DB_MULTIPLE and 4 wide for DB_MULTIPLE_KEY.
+ *
+ * PUBLIC: int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_sort_multiple(db, key, data, flags)
+	DB *db;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	u_int32_t *kstart, *kend, *dstart, *dend;
+
+	/* TODO: sanity checks on the DBTs */
+	/* DB_ILLEGAL_METHOD(db, DB_OK_BTREE); */
+
+	/* Last u_int32_t slot of the buffer: top of the downward index. */
+	kstart = (u_int32_t*)((u_int8_t *)key->data + key->ulen) - 1;
+
+	switch (flags) {
+	case DB_MULTIPLE:
+		if (data != NULL)
+			dstart = (u_int32_t*)((u_int8_t *)data->data +
+			    data->ulen) - 1;
+		else
+			dstart = kstart;
+
+		/* Find the end */
+		for (kend = kstart, dend = dstart;
+		    *kend != (u_int32_t)-1 && *dend != (u_int32_t)-1;
+		    kend -= 2, dend -= 2)
+			;
+
+		return (__db_quicksort(db, key, data, kstart, kend, dstart,
+		    dend, 2));
+	case DB_MULTIPLE_KEY:
+		/* Find the end */
+		for (kend = kstart; *kend != (u_int32_t)-1; kend -= 4)
+			;
+
+		/* Key and data interleave in one buffer, data 2 slots below. */
+		return (__db_quicksort(db, key, key, kstart, kend, kstart - 2,
+		    kend - 2, 4));
+	default:
+		return (__db_ferr(db->env, "DB->sort_multiple", 0));
+	}
+}
diff --git a/db/db_stati.c b/db/db_stati.c
new file mode 100644
index 0000000..b8d3a3f
--- /dev/null
+++ b/db/db_stati.c
@@ -0,0 +1,494 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+static int __db_print_all __P((DB *, u_int32_t));
+static int __db_print_citem __P((DBC *));
+static int __db_print_cursor __P((DB *));
+static int __db_print_stats __P((DB *, DB_THREAD_INFO *, u_int32_t));
+static int __db_stat __P((DB *, DB_THREAD_INFO *, DB_TXN *, void *, u_int32_t));
+static int __db_stat_arg __P((DB *, u_int32_t));
+
+/*
+ * __db_stat_pp --
+ *	DB->stat pre/post processing.
+ *
+ * spp receives a pointer to an access-method-specific statistics structure
+ * allocated by the underlying stat call.
+ *
+ * PUBLIC: int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t));
+ */
+int
+__db_stat_pp(dbp, txn, spp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	void *spp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat");
+
+	if ((ret = __db_stat_arg(dbp, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+	    txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_stat(dbp, ip, txn, spp, flags);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_stat --
+ *	DB->stat.
+ *
+ * Acquires a cursor (passing through the isolation flags), then dispatches
+ * to the partition handler or the access method's stat routine.
+ */
+static int
+__db_stat(dbp, ip, txn, spp, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	void *spp;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	/* Acquire a cursor. */
+	if ((ret = __db_cursor(dbp, ip, txn,
+	    &dbc, LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, NULL, "DB->stat", NULL, NULL, flags);
+	/* Isolation flags were consumed by the cursor; drop them here. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __partition_stat(dbc, spp, flags);
+	else
+#endif
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_stat(dbc, spp, flags);
+		break;
+	case DB_HASH:
+		ret = __ham_stat(dbc, spp, flags);
+		break;
+	case DB_QUEUE:
+		ret = __qam_stat(dbc, spp, flags);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = (__db_unknown_type(env, "DB->stat", dbp->type));
+		break;
+	}
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_stat_arg --
+ *	Check DB->stat arguments.
+ *
+ * Isolation flags are legal and stripped before checking; the remainder
+ * must be 0 or DB_FAST_STAT.
+ */
+static int
+__db_stat_arg(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	ENV *env;
+
+	env = dbp->env;
+
+	/* Check for invalid function flags. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+	switch (flags) {
+	case 0:
+	case DB_FAST_STAT:
+		break;
+	default:
+		return (__db_ferr(env, "DB->stat", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_stat_print_pp --
+ *	DB->stat_print pre/post processing.
+ *
+ * Legal flags are DB_FAST_STAT and DB_STAT_ALL; output goes through the
+ * environment's message channel.
+ *
+ * PUBLIC: int __db_stat_print_pp __P((DB *, u_int32_t));
+ */
+int
+__db_stat_print_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat_print");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline.
+	 */
+	if ((ret = __db_fchk(env,
+	    "DB->stat_print", flags, DB_FAST_STAT | DB_STAT_ALL)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_stat_print(dbp, ip, flags);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_stat_print --
+ *	DB->stat_print.
+ *
+ * Prints a timestamp, optionally the debugging handle dump (DB_STAT_ALL),
+ * then the access-method statistics.
+ *
+ * PUBLIC: int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t));
+ */
+int
+__db_stat_print(dbp, ip, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	u_int32_t flags;
+{
+	time_t now;
+	int ret;
+	char time_buf[CTIME_BUFLEN];
+
+	(void)time(&now);
+	__db_msg(dbp->env, "%.24s\tLocal time", __os_ctime(&now, time_buf));
+
+	if (LF_ISSET(DB_STAT_ALL) && (ret = __db_print_all(dbp, flags)) != 0)
+		return (ret);
+
+	if ((ret = __db_print_stats(dbp, ip, flags)) != 0)
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __db_print_stats --
+ *	Display default DB handle statistics.
+ *
+ * Opens a throwaway cursor and dispatches to the access method's
+ * stat_print routine.
+ */
+static int
+__db_print_stats(dbp, ip, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	/* Acquire a cursor. */
+	if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, NULL, "DB->stat_print", NULL, NULL, 0);
+
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_stat_print(dbc, flags);
+		break;
+	case DB_HASH:
+		ret = __ham_stat_print(dbc, flags);
+		break;
+	case DB_QUEUE:
+		ret = __qam_stat_print(dbc, flags);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = (__db_unknown_type(env, "DB->stat_print", dbp->type));
+		break;
+	}
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_print_all --
+ *	Display debugging DB handle statistics.
+ *
+ * Dumps the raw fields of the DB handle (callbacks set, file/database
+ * names, lockers, flags, etc.) followed by the cursor queues.  Intended
+ * for DB_STAT_ALL debugging output, not for end users.
+ */
+static int
+__db_print_all(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	/* Name table used to render dbp->flags symbolically. */
+	static const FN fn[] = {
+		{ DB_AM_CHKSUM,			"DB_AM_CHKSUM" },
+		{ DB_AM_COMPENSATE,		"DB_AM_COMPENSATE" },
+		{ DB_AM_CREATED,		"DB_AM_CREATED" },
+		{ DB_AM_CREATED_MSTR,		"DB_AM_CREATED_MSTR" },
+		{ DB_AM_DBM_ERROR,		"DB_AM_DBM_ERROR" },
+		{ DB_AM_DELIMITER,		"DB_AM_DELIMITER" },
+		{ DB_AM_DISCARD,		"DB_AM_DISCARD" },
+		{ DB_AM_DUP,			"DB_AM_DUP" },
+		{ DB_AM_DUPSORT,		"DB_AM_DUPSORT" },
+		{ DB_AM_ENCRYPT,		"DB_AM_ENCRYPT" },
+		{ DB_AM_FIXEDLEN,		"DB_AM_FIXEDLEN" },
+		{ DB_AM_INMEM,			"DB_AM_INMEM" },
+		{ DB_AM_IN_RENAME,		"DB_AM_IN_RENAME" },
+		{ DB_AM_NOT_DURABLE,		"DB_AM_NOT_DURABLE" },
+		{ DB_AM_OPEN_CALLED,		"DB_AM_OPEN_CALLED" },
+		{ DB_AM_PAD,			"DB_AM_PAD" },
+		{ DB_AM_PGDEF,			"DB_AM_PGDEF" },
+		{ DB_AM_RDONLY,			"DB_AM_RDONLY" },
+		{ DB_AM_READ_UNCOMMITTED,	"DB_AM_READ_UNCOMMITTED" },
+		{ DB_AM_RECNUM,			"DB_AM_RECNUM" },
+		{ DB_AM_RECOVER,		"DB_AM_RECOVER" },
+		{ DB_AM_RENUMBER,		"DB_AM_RENUMBER" },
+		{ DB_AM_REVSPLITOFF,		"DB_AM_REVSPLITOFF" },
+		{ DB_AM_SECONDARY,		"DB_AM_SECONDARY" },
+		{ DB_AM_SNAPSHOT,		"DB_AM_SNAPSHOT" },
+		{ DB_AM_SUBDB,			"DB_AM_SUBDB" },
+		{ DB_AM_SWAP,			"DB_AM_SWAP" },
+		{ DB_AM_TXN,			"DB_AM_TXN" },
+		{ DB_AM_VERIFYING,		"DB_AM_VERIFYING" },
+		{ 0,				NULL }
+	};
+	ENV *env;
+	char time_buf[CTIME_BUFLEN];
+
+	env = dbp->env;
+
+	__db_msg(env, "%s", DB_GLOBAL(db_line));
+	__db_msg(env, "DB handle information:");
+	STAT_ULONG("Page size", dbp->pgsize);
+	STAT_ISSET("Append recno", dbp->db_append_recno);
+	STAT_ISSET("Feedback", dbp->db_feedback);
+	STAT_ISSET("Dup compare", dbp->dup_compare);
+	STAT_ISSET("App private", dbp->app_private);
+	STAT_ISSET("DbEnv", dbp->env);
+	STAT_STRING("Type", __db_dbtype_to_string(dbp->type));
+
+	__mutex_print_debug_single(env, "Thread mutex", dbp->mutex, flags);
+
+	STAT_STRING("File", dbp->fname);
+	STAT_STRING("Database", dbp->dname);
+	STAT_HEX("Open flags", dbp->open_flags);
+
+	__db_print_fileid(env, dbp->fileid, "\tFile ID");
+
+	STAT_ULONG("Cursor adjust ID", dbp->adj_fileid);
+	STAT_ULONG("Meta pgno", dbp->meta_pgno);
+	if (dbp->locker != NULL)
+		STAT_ULONG("Locker ID", dbp->locker->id);
+	if (dbp->cur_locker != NULL)
+		STAT_ULONG("Handle lock", dbp->cur_locker->id);
+	if (dbp->associate_locker != NULL)
+		STAT_ULONG("Associate lock", dbp->associate_locker->id);
+	STAT_ULONG("RPC remote ID", dbp->cl_id);
+
+	__db_msg(env,
+	    "%.24s\tReplication handle timestamp",
+	    dbp->timestamp == 0 ? "0" : __os_ctime(&dbp->timestamp, time_buf));
+
+	STAT_ISSET("Secondary callback", dbp->s_callback);
+	STAT_ISSET("Primary handle", dbp->s_primary);
+
+	STAT_ISSET("api internal", dbp->api_internal);
+	STAT_ISSET("Btree/Recno internal", dbp->bt_internal);
+	STAT_ISSET("Hash internal", dbp->h_internal);
+	STAT_ISSET("Queue internal", dbp->q_internal);
+
+	__db_prflags(env, NULL, dbp->flags, fn, NULL, "\tFlags");
+
+	if (dbp->log_filename == NULL)
+		STAT_ISSET("File naming information", dbp->log_filename);
+	else
+		__dbreg_print_fname(env, dbp->log_filename);
+
+	(void)__db_print_cursor(dbp);
+
+	return (0);
+}
+
+/*
+ * __db_print_cursor --
+ *	Display the cursor active and free queues.
+ *
+ * Walks the handle's active, join and free cursor queues under the handle
+ * mutex, printing each cursor; returns the first per-cursor error seen.
+ */
+static int
+__db_print_cursor(dbp)
+	DB *dbp;
+{
+	DBC *dbc;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	__db_msg(env, "%s", DB_GLOBAL(db_line));
+	__db_msg(env, "DB handle cursors:");
+
+	ret = 0;
+	MUTEX_LOCK(dbp->env, dbp->mutex);
+	__db_msg(env, "Active queue:");
+	TAILQ_FOREACH(dbc, &dbp->active_queue, links)
+		if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	__db_msg(env, "Join queue:");
+	TAILQ_FOREACH(dbc, &dbp->join_queue, links)
+		if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	__db_msg(env, "Free queue:");
+	TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+		if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	MUTEX_UNLOCK(dbp->env, dbp->mutex);
+
+	return (ret);
+}
+
+/*
+ * __db_print_citem --
+ *	Print one cursor's debugging state: identifying pointers, locker
+ *	information, current position and flags, plus any access-method
+ *	specific detail.  Always returns 0.
+ */
+static int
+__db_print_citem(dbc)
+	DBC *dbc;
+{
+	/* Name table used to render dbc->flags symbolically. */
+	static const FN fn[] = {
+		{ DBC_ACTIVE,		"DBC_ACTIVE" },
+		{ DBC_DONTLOCK,		"DBC_DONTLOCK" },
+		{ DBC_MULTIPLE,		"DBC_MULTIPLE" },
+		{ DBC_MULTIPLE_KEY,	"DBC_MULTIPLE_KEY" },
+		{ DBC_OPD,		"DBC_OPD" },
+		{ DBC_OWN_LID,		"DBC_OWN_LID" },
+		{ DBC_READ_COMMITTED,	"DBC_READ_COMMITTED" },
+		{ DBC_READ_UNCOMMITTED,	"DBC_READ_UNCOMMITTED" },
+		{ DBC_RECOVER,		"DBC_RECOVER" },
+		{ DBC_RMW,		"DBC_RMW" },
+		{ DBC_TRANSIENT,	"DBC_TRANSIENT" },
+		{ DBC_WAS_READ_COMMITTED,"DBC_WAS_READ_COMMITTED" },
+		{ DBC_WRITECURSOR,	"DBC_WRITECURSOR" },
+		{ DBC_WRITER,		"DBC_WRITER" },
+		{ 0,			NULL }
+	};
+	DB *dbp;
+	DBC_INTERNAL *cp;
+	ENV *env;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	cp = dbc->internal;
+
+	STAT_POINTER("DBC", dbc);
+	STAT_POINTER("Associated dbp", dbc->dbp);
+	STAT_POINTER("Associated txn", dbc->txn);
+	STAT_POINTER("Internal", cp);
+	STAT_HEX("Default locker ID", dbc->lref == NULL ? 0 : dbc->lref->id);
+	STAT_HEX("Locker", P_TO_ULONG(dbc->locker));
+	STAT_STRING("Type", __db_dbtype_to_string(dbc->dbtype));
+
+	STAT_POINTER("Off-page duplicate cursor", cp->opd);
+	STAT_POINTER("Referenced page", cp->page);
+	STAT_ULONG("Root", cp->root);
+	STAT_ULONG("Page number", cp->pgno);
+	STAT_ULONG("Page index", cp->indx);
+	STAT_STRING("Lock mode", __db_lockmode_to_string(cp->lock_mode));
+	__db_prflags(env, NULL, dbc->flags, fn, NULL, "\tFlags");
+
+	switch (dbc->dbtype) {
+	case DB_BTREE:
+	case DB_RECNO:
+		__bam_print_cursor(dbc);
+		break;
+	case DB_HASH:
+		__ham_print_cursor(dbc);
+		break;
+	case DB_UNKNOWN:
+		DB_ASSERT(env, dbp->type != DB_UNKNOWN);
+		/* FALLTHROUGH */
+	case DB_QUEUE:
+	default:
+		/* Queue cursors have no extra state worth printing. */
+		break;
+	}
+	return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * Stub used when the library is built without HAVE_STATISTICS; always
+ * reports that statistics support was not compiled in.
+ */
+int
+__db_stat_pp(dbp, txn, spp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	void *spp;
+	u_int32_t flags;
+{
+	COMPQUIET(spp, NULL);
+	COMPQUIET(txn, NULL);
+	COMPQUIET(flags, 0);
+
+	return (__db_stat_not_built(dbp->env));
+}
+
+/*
+ * Stub used when the library is built without HAVE_STATISTICS; always
+ * reports that statistics support was not compiled in.
+ */
+int
+__db_stat_print_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	COMPQUIET(flags, 0);
+
+	return (__db_stat_not_built(dbp->env));
+}
+#endif
diff --git a/db/db_truncate.c b/db/db_truncate.c
new file mode 100644
index 0000000..66f4180
--- /dev/null
+++ b/db/db_truncate.c
@@ -0,0 +1,225 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_cursor_check __P((DB *));
+
+/*
+ * __db_truncate_pp
+ *	DB->truncate pre/post processing.
+ *
+ * Validates arguments (no flags allowed; forbidden on secondaries and on
+ * handles with active cursors), enters the environment, handles the
+ * replication block, enforces read-only checks, creates a local
+ * transaction when the handle is auto-commit, and calls __db_truncate.
+ * *countp receives the number of records discarded from the primary.
+ *
+ * PUBLIC: int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ */
+int
+__db_truncate_pp(dbp, txn, countp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	u_int32_t *countp, flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret, txn_local;
+
+	env = dbp->env;
+	handle_check = txn_local = 0;
+
+	STRIP_AUTO_COMMIT(flags);
+
+	/* Check for invalid flags. */
+	if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env, "DB->truncate forbidden on secondary indices");
+		return (EINVAL);
+	}
+	if ((ret = __db_fchk(env, "DB->truncate", flags, 0)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Make sure there are no active cursors on this db.  Since we drop
+	 * pages we cannot really adjust cursors.
+	 */
+	if ((ret = __db_cursor_check(dbp)) != 0) {
+		__db_errx(env,
+		    "DB->truncate not permitted with active cursors");
+		goto err;
+	}
+
+#ifdef CONFIG_TEST
+	if (IS_REP_MASTER(env))
+		DB_TEST_WAIT(env, env->test_check);
+#endif
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Check for changes to a read-only database.  This must be after the
+	 * replication block so that we cannot race master/client state changes.
+	 */
+	if (DB_IS_READONLY(dbp)) {
+		ret = __db_rdonly(env, "DB->truncate");
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+			goto err;
+		txn_local = 1;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	ret = __db_truncate(dbp, ip, txn, countp);
+
+	/* Commit or abort the local transaction based on ret. */
+err:	if (txn_local &&
+	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_truncate
+ *	DB->truncate.
+ *
+ * Truncates all secondaries first (their counts are discarded into a
+ * scratch variable), then dispatches to the partition handler or the
+ * primary's access-method truncate routine.  *countp receives the
+ * primary's discarded-record count only.
+ *
+ * PUBLIC: int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC:     u_int32_t *));
+ */
+int
+__db_truncate(dbp, ip, txn, countp)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	u_int32_t *countp;
+{
+	DB *sdbp;
+	DBC *dbc;
+	ENV *env;
+	u_int32_t scount;
+	int ret, t_ret;
+
+	env = dbp->env;
+	dbc = NULL;
+	ret = 0;
+
+	/*
+	 * Run through all secondaries and truncate them first.  The count
+	 * returned is the count of the primary only.  QUEUE uses normal
+	 * processing to truncate so it will update the secondaries normally.
+	 */
+	if (dbp->type != DB_QUEUE && DB_IS_PRIMARY(dbp)) {
+		if ((ret = __db_s_first(dbp, &sdbp)) != 0)
+			return (ret);
+		for (; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, txn))
+			if ((ret = __db_truncate(sdbp, ip, txn, &scount)) != 0)
+				break;
+		if (sdbp != NULL)
+			(void)__db_s_done(sdbp, txn);
+		if (ret != 0)
+			return (ret);
+	}
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL);
+
+	/* Acquire a cursor. */
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, txn, "DB->truncate", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __part_truncate(dbc, countp);
+	else
+#endif
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_truncate(dbc, countp);
+		break;
+	case DB_HASH:
+		ret = __ham_truncate(dbc, countp);
+		break;
+	case DB_QUEUE:
+		ret = __qam_truncate(dbc, countp);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_type(env, "DB->truncate", dbp->type);
+		break;
+	}
+
+	/* Discard the cursor. */
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL);
+
+DB_TEST_RECOVERY_LABEL
+
+	return (ret);
+}
+
+/*
+ * __db_cursor_check --
+ *	See if there are any active cursors on this db.
+ *
+ * Walks every DB handle in the environment open on the same underlying
+ * file (matched via adj_fileid) and checks each handle's active cursor
+ * queue.  Returns EINVAL if any initialized cursor is found, 0 otherwise.
+ */
+static int
+__db_cursor_check(dbp)
+	DB *dbp;
+{
+	DB *ldbp;
+	DBC *dbc;
+	ENV *env;
+	int found;
+
+	env = dbp->env;
+
+	MUTEX_LOCK(env, env->mtx_dblist);
+	FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+	for (found = 0;
+	    !found && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+		/*
+		 * NOTE(review): this locks dbp->mutex while iterating
+		 * ldbp's queue — presumably intentional (matches upstream),
+		 * but verify ldbp->mutex isn't required here.
+		 */
+		MUTEX_LOCK(env, dbp->mutex);
+		TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
+			if (IS_INITIALIZED(dbc)) {
+				found = 1;
+				break;
+			}
+		MUTEX_UNLOCK(env, dbp->mutex);
+	}
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+
+	return (found ? EINVAL : 0);
+}
diff --git a/db/db_upg.c b/db/db_upg.c
new file mode 100644
index 0000000..5a6db94
--- /dev/null
+++ b/db/db_upg.c
@@ -0,0 +1,510 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_upgrade_pp --
+ * DB->upgrade pre/post processing.
+ *
+ * PUBLIC: int __db_upgrade_pp __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade_pp(dbp, fname, flags)
+	DB *dbp;
+	const char *fname;
+	u_int32_t flags;
+{
+#ifdef HAVE_UPGRADE_SUPPORT
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline.
+	 */
+	if ((ret = __db_fchk(env, "DB->upgrade", flags, DB_DUPSORT)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+	ret = __db_upgrade(dbp, fname, flags);
+	ENV_LEAVE(env, ip);
+	return (ret);
+#else
+	/*
+	 * Report the error before the COMPQUIET() macros run:
+	 * COMPQUIET(dbp, NULL) assigns NULL to dbp, so the previous
+	 * ordering dereferenced a null pointer in the __db_errx() call.
+	 */
+	__db_errx(dbp->env, "upgrade not supported");
+
+	COMPQUIET(dbp, NULL);
+	COMPQUIET(fname, NULL);
+	COMPQUIET(flags, 0);
+	return (EINVAL);
+#endif
+}
+
+#ifdef HAVE_UPGRADE_SUPPORT
+/*
+ * Per-page-type conversion functions for the 3.0 -> 3.1 upgrade pass,
+ * indexed by page type; a NULL slot means pages of that type need no
+ * conversion.
+ */
+static int (* const func_31_list[P_PAGETYPE_MAX])
+    __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+	NULL,			/* P_INVALID */
+	NULL,			/* __P_DUPLICATE */
+	__ham_31_hash,		/* P_HASH_UNSORTED */
+	NULL,			/* P_IBTREE */
+	NULL,			/* P_IRECNO */
+	__bam_31_lbtree,	/* P_LBTREE */
+	NULL,			/* P_LRECNO */
+	NULL,			/* P_OVERFLOW */
+	__ham_31_hashmeta,	/* P_HASHMETA */
+	__bam_31_btreemeta,	/* P_BTREEMETA */
+	NULL,			/* P_QAMMETA */
+	NULL,			/* P_QAMDATA */
+	NULL,			/* P_LDUP */
+	NULL,			/* P_HASH */
+};
+
+/*
+ * Per-page-type conversion functions for the 4.6 upgrade pass (unsorted
+ * hash -> sorted hash), indexed by page type; a NULL slot means pages of
+ * that type need no conversion.
+ */
+static int (* const func_46_list[P_PAGETYPE_MAX])
+    __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+	NULL,			/* P_INVALID */
+	NULL,			/* __P_DUPLICATE */
+	__ham_46_hash,		/* P_HASH_UNSORTED */
+	NULL,			/* P_IBTREE */
+	NULL,			/* P_IRECNO */
+	NULL,			/* P_LBTREE */
+	NULL,			/* P_LRECNO */
+	NULL,			/* P_OVERFLOW */
+	__ham_46_hashmeta,	/* P_HASHMETA */
+	NULL,			/* P_BTREEMETA */
+	NULL,			/* P_QAMMETA */
+	NULL,			/* P_QAMDATA */
+	NULL,			/* P_LDUP */
+	NULL,			/* P_HASH */
+};
+
+static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const [])
+ (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *));
+static int __db_set_lastpgno __P((DB *, char *, DB_FH *));
+
+/*
+ * __db_upgrade --
+ * Upgrade an existing database.
+ *
+ * PUBLIC: int __db_upgrade __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade(dbp, fname, flags)
+	DB *dbp;
+	const char *fname;
+	u_int32_t flags;
+{
+	DBMETA *meta;
+	DB_FH *fhp;
+	ENV *env;
+	size_t n;
+	int ret, t_ret, use_mp_open;
+	u_int8_t mbuf[256], tmpflags;
+	char *real_name;
+
+	use_mp_open = 0;
+	env = dbp->env;
+	fhp = NULL;
+
+	/* Get the real backing file name. */
+	if ((ret = __db_appname(env,
+	    DB_APP_DATA, fname, NULL, &real_name)) != 0)
+		return (ret);
+
+	/*
+	 * Open the file.
+	 *
+	 * Once real_name has been allocated, every failure must go
+	 * through "err" so it is freed -- the previous version returned
+	 * directly from several places below, leaking real_name and, in
+	 * the Queue cases, the open file handle as well.
+	 */
+	if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+		__db_err(env, ret, "%s", real_name);
+		goto err;
+	}
+
+	/* Initialize the feedback. */
+	if (dbp->db_feedback != NULL)
+		dbp->db_feedback(dbp, DB_UPGRADE, 0);
+
+	/*
+	 * Read the metadata page.  We read 256 bytes, which is larger than
+	 * any access method's metadata page and smaller than any disk sector.
+	 */
+	if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+		goto err;
+
+	switch (((DBMETA *)mbuf)->magic) {
+	case DB_BTREEMAGIC:
+		switch (((DBMETA *)mbuf)->version) {
+		case 6:
+			/*
+			 * Before V7 not all pages had page types, so we do the
+			 * single meta-data page by hand.
+			 */
+			if ((ret =
+			    __bam_30_btreemeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 7:
+			/*
+			 * We need the page size to do more.  Rip it out of
+			 * the meta-data page.
+			 */
+			memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+			if ((ret = __db_page_pass(
+			    dbp, real_name, flags, func_31_list, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 8:
+			if ((ret =
+			    __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 9:
+			break;
+		default:
+			__db_errx(env, "%s: unsupported btree version: %lu",
+			    real_name, (u_long)((DBMETA *)mbuf)->version);
+			ret = DB_OLD_VERSION;
+			goto err;
+		}
+		break;
+	case DB_HASHMAGIC:
+		switch (((DBMETA *)mbuf)->version) {
+		case 4:
+		case 5:
+			/*
+			 * Before V6 not all pages had page types, so we do the
+			 * single meta-data page by hand.
+			 */
+			if ((ret =
+			    __ham_30_hashmeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+				goto err;
+
+			/*
+			 * Before V6, we created hash pages one by one as they
+			 * were needed, using hashhdr.ovfl_point to reserve
+			 * a block of page numbers for them.  A consequence
+			 * of this was that, if no overflow pages had been
+			 * created, the current doubling might extend past
+			 * the end of the database file.
+			 *
+			 * In DB 3.X, we now create all the hash pages
+			 * belonging to a doubling atomically; it's not
+			 * safe to just save them for later, because when
+			 * we create an overflow page we'll just create
+			 * a new last page (whatever that may be).  Grow
+			 * the database to the end of the current doubling.
+			 */
+			if ((ret =
+			    __ham_30_sizefix(dbp, fhp, real_name, mbuf)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 6:
+			/*
+			 * We need the page size to do more.  Rip it out of
+			 * the meta-data page.
+			 */
+			memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+			if ((ret = __db_page_pass(
+			    dbp, real_name, flags, func_31_list, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 7:
+			if ((ret =
+			    __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 8:
+			/*
+			 * Any upgrade that has proceeded this far has metadata
+			 * pages compatible with hash version 8 metadata pages,
+			 * so casting mbuf to a dbmeta is safe.
+			 * If a newer revision moves the pagesize, checksum or
+			 * encrypt_alg flags in the metadata, then the
+			 * extraction of the fields will need to use hard coded
+			 * offsets.
+			 */
+			meta = (DBMETA*)mbuf;
+			/*
+			 * We need the page size to do more.  Extract it from
+			 * the meta-data page.
+			 */
+			memcpy(&dbp->pgsize, &meta->pagesize,
+			    sizeof(u_int32_t));
+			/*
+			 * Rip out metadata and encrypt_alg fields from the
+			 * metadata page.  So the upgrade can know how big
+			 * the page metadata pre-amble is.  Any upgrade that has
+			 * proceeded this far has metadata pages compatible
+			 * with hash version 8 metadata pages, so extracting
+			 * the fields is safe.
+			 */
+			memcpy(&tmpflags, &meta->metaflags, sizeof(u_int8_t));
+			if (FLD_ISSET(tmpflags, DBMETA_CHKSUM))
+				F_SET(dbp, DB_AM_CHKSUM);
+			memcpy(&tmpflags, &meta->encrypt_alg, sizeof(u_int8_t));
+			if (tmpflags != 0) {
+				if (!CRYPTO_ON(dbp->env)) {
+					__db_errx(env,
+"Attempt to upgrade an encrypted database without providing a password.");
+					ret = EINVAL;
+					goto err;
+				}
+				F_SET(dbp, DB_AM_ENCRYPT);
+			}
+
+			/*
+			 * This is ugly.  It is necessary to have a usable
+			 * mpool in the dbp to upgrade from an unsorted
+			 * to a sorted hash database.  The mpool file is used
+			 * to resolve offpage key items, which are needed to
+			 * determine sort order.  Having mpool open and access
+			 * the file does not affect the page pass, since the
+			 * page pass only updates DB_HASH_UNSORTED pages
+			 * in-place, and the mpool file is only used to read
+			 * OFFPAGE items.
+			 *
+			 * Set use_mp_open before closing the raw handle so
+			 * the "err" path won't close it a second time; from
+			 * here on the database close owns the file handle.
+			 */
+			use_mp_open = 1;
+			if ((ret = __os_closehandle(env, fhp)) != 0)
+				goto err;
+			dbp->type = DB_HASH;
+			if ((ret = __env_mpool(dbp, fname,
+			    DB_AM_NOT_DURABLE | DB_AM_VERIFYING)) != 0)
+				goto err;
+			fhp = dbp->mpf->fhp;
+
+			/* Do the actual conversion pass. */
+			if ((ret = __db_page_pass(
+			    dbp, real_name, flags, func_46_list, fhp)) != 0)
+				goto err;
+
+			/* FALLTHROUGH */
+		case 9:
+			break;
+		default:
+			__db_errx(env, "%s: unsupported hash version: %lu",
+			    real_name, (u_long)((DBMETA *)mbuf)->version);
+			ret = DB_OLD_VERSION;
+			goto err;
+		}
+		break;
+	case DB_QAMMAGIC:
+		switch (((DBMETA *)mbuf)->version) {
+		case 1:
+			/*
+			 * If we're in a Queue database, the only page that
+			 * needs upgrading is the meta-database page, don't
+			 * bother with a full pass.
+			 */
+			if ((ret = __qam_31_qammeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 2:
+			if ((ret = __qam_32_qammeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 3:
+		case 4:
+			break;
+		default:
+			__db_errx(env, "%s: unsupported queue version: %lu",
+			    real_name, (u_long)((DBMETA *)mbuf)->version);
+			ret = DB_OLD_VERSION;
+			goto err;
+		}
+		break;
+	default:
+		/* Try the magic number byte-swapped to diagnose the error. */
+		M_32_SWAP(((DBMETA *)mbuf)->magic);
+		switch (((DBMETA *)mbuf)->magic) {
+		case DB_BTREEMAGIC:
+		case DB_HASHMAGIC:
+		case DB_QAMMAGIC:
+			__db_errx(env,
+	"%s: DB->upgrade only supported on native byte-order systems",
+			    real_name);
+			break;
+		default:
+			__db_errx(env,
+			    "%s: unrecognized file type", real_name);
+			break;
+		}
+		ret = EINVAL;
+		goto err;
+	}
+
+	ret = __os_fsync(env, fhp);
+
+	/*
+	 * If mp_open was used, then rely on the database close to clean up
+	 * any file handles.
+	 */
+err:	if (use_mp_open == 0 && fhp != NULL &&
+	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+		ret = t_ret;
+	__os_free(env, real_name);
+
+	/* We're done. */
+	if (dbp->db_feedback != NULL)
+		dbp->db_feedback(dbp, DB_UPGRADE, 100);
+
+	return (ret);
+}
+
+/*
+ * __db_page_pass --
+ * Walk the pages of the database, upgrading whatever needs it.
+ */
+static int
+__db_page_pass(dbp, real_name, flags, fl, fhp)
+	DB *dbp;
+	char *real_name;
+	u_int32_t flags;
+	int (* const fl[P_PAGETYPE_MAX])
+	    __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+	DB_FH *fhp;
+{
+	ENV *env;
+	PAGE *page;
+	db_pgno_t pgno, pgno_last;
+	size_t nr;
+	int modified, ret;
+
+	env = dbp->env;
+
+	/* Find out how many pages the file currently holds. */
+	if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+		return (ret);
+
+	/* Get a scratch buffer large enough for one database page. */
+	if ((ret = __os_malloc(env, dbp->pgsize, &page)) != 0)
+		return (ret);
+
+	/*
+	 * Visit every page, handing each one to the conversion function
+	 * registered for its page type (if any); modified pages are
+	 * rewritten in place.
+	 */
+	for (pgno = 0; pgno < pgno_last; ++pgno) {
+		if (dbp->db_feedback != NULL)
+			dbp->db_feedback(dbp,
+			    DB_UPGRADE, (int)((pgno * 100)/pgno_last));
+		if ((ret = __os_seek(env, fhp, pgno, dbp->pgsize, 0)) != 0)
+			break;
+		if ((ret = __os_read(env, fhp, page, dbp->pgsize, &nr)) != 0)
+			break;
+		modified = 0;
+		/* Pages are always decrypted before conversion. */
+		if ((ret = __db_decrypt_pg(env, dbp, page)) != 0)
+			break;
+		if (fl[TYPE(page)] != NULL && (ret = fl[TYPE(page)](dbp,
+		    real_name, flags, fhp, page, &modified)) != 0)
+			break;
+		if (!modified)
+			continue;
+		/* The converter changed the page: re-protect and rewrite. */
+		if ((ret = __db_encrypt_and_checksum_pg(env, dbp, page)) != 0)
+			break;
+		if ((ret = __os_seek(env, fhp, pgno, dbp->pgsize, 0)) != 0)
+			break;
+		if ((ret = __os_write(env, fhp, page, dbp->pgsize, &nr)) != 0)
+			break;
+	}
+
+	__os_free(env, page);
+	return (ret);
+}
+
+/*
+ * __db_lastpgno --
+ * Return the current last page number of the file.
+ *
+ * PUBLIC: int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *));
+ */
+int
+__db_lastpgno(dbp, real_name, fhp, pgno_lastp)
+	DB *dbp;
+	char *real_name;
+	DB_FH *fhp;
+	db_pgno_t *pgno_lastp;
+{
+	ENV *env;
+	u_int32_t bytes, mbytes;
+	int ret;
+
+	env = dbp->env;
+
+	/* Ask the OS layer for the file size (megabytes plus bytes). */
+	if ((ret = __os_ioinfo(env,
+	    real_name, fhp, &mbytes, &bytes, NULL)) != 0) {
+		__db_err(env, ret, "%s", real_name);
+		return (ret);
+	}
+
+	/*
+	 * Page sizes have to be a power-of-two, so a well-formed file
+	 * must be an exact multiple of the page size.
+	 */
+	if (bytes % dbp->pgsize != 0) {
+		__db_errx(env,
+		    "%s: file size not a multiple of the pagesize", real_name);
+		return (EINVAL);
+	}
+
+	/* Convert the size into a count of pages. */
+	*pgno_lastp = (db_pgno_t)(mbytes * (MEGABYTE / dbp->pgsize) +
+	    bytes / dbp->pgsize);
+	return (0);
+}
+
+/*
+ * __db_set_lastpgno --
+ * Update the meta->last_pgno field.
+ *
+ * Code assumes that we do not have checksums/crypto on the page.
+ */
+static int
+__db_set_lastpgno(dbp, real_name, fhp)
+	DB *dbp;
+	char *real_name;
+	DB_FH *fhp;
+{
+	DBMETA meta;
+	ENV *env;
+	size_t nr;
+	int ret;
+
+	env = dbp->env;
+
+	/* Read the meta page from the front of the file. */
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+	    (ret = __os_read(env, fhp, &meta, sizeof(meta), &nr)) != 0)
+		return (ret);
+
+	/* Recompute last_pgno from the on-disk file size. */
+	dbp->pgsize = meta.pagesize;
+	if ((ret = __db_lastpgno(dbp, real_name, fhp, &meta.last_pgno)) != 0)
+		return (ret);
+
+	/* Rewrite the updated meta page in place. */
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+	    (ret = __os_write(env, fhp, &meta, sizeof(meta), &nr)) != 0)
+		return (ret);
+
+	return (0);
+}
+#endif /* HAVE_UPGRADE_SUPPORT */
diff --git a/db/db_upg_opd.c b/db/db_upg_opd.c
new file mode 100644
index 0000000..ea143cf
--- /dev/null
+++ b/db/db_upg_opd.c
@@ -0,0 +1,343 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t));
+
+/*
+ * GET_PAGE/PUT_PAGE --
+ *	Read/write one page-sized buffer at the given page-number offset.
+ *	Both macros rely on locals "ret" and "n" being in scope in the
+ *	caller and jump to the caller's "err" label on failure.
+ */
+#define	GET_PAGE(dbp, fhp, pgno, page) {				\
+	if ((ret = __os_seek(						\
+	    dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0)		\
+		goto err;						\
+	if ((ret = __os_read(dbp->env,					\
+	    fhp, page, (dbp)->pgsize, &n)) != 0)			\
+		goto err;						\
+}
+#define	PUT_PAGE(dbp, fhp, pgno, page) {				\
+	if ((ret = __os_seek(						\
+	    dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0)		\
+		goto err;						\
+	if ((ret = __os_write(dbp->env,					\
+	    fhp, page, (dbp)->pgsize, &n)) != 0)			\
+		goto err;						\
+}
+
+/*
+ * __db_31_offdup --
+ * Convert 3.0 off-page duplicates to 3.1 off-page duplicates.
+ *
+ * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
+ */
+int
+__db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
+	DB *dbp;
+	char *real_name;
+	DB_FH *fhp;
+	int sorted;
+	db_pgno_t *pgnop;
+{
+	PAGE *ipage, *page;
+	db_indx_t indx;
+	db_pgno_t cur_cnt, i, next_cnt, pgno, *pgno_cur, pgno_last;
+	db_pgno_t *pgno_next, pgno_max, *tmp;
+	db_recno_t nrecs;
+	size_t n;
+	int level, nomem, ret;
+
+	ipage = page = NULL;
+	pgno_cur = pgno_next = NULL;
+
+	/* Allocate room to hold a page. */
+	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0)
+		goto err;
+
+	/*
+	 * Walk the chain of 3.0 off-page duplicates.  Each one is converted
+	 * in place to a 3.1 off-page duplicate page.  If the duplicates are
+	 * sorted, they are converted to a Btree leaf page, otherwise to a
+	 * Recno leaf page.
+	 */
+	for (nrecs = 0, cur_cnt = pgno_max = 0,
+	    pgno = *pgnop; pgno != PGNO_INVALID;) {
+		/* Grow the page-number list in batches of 20 entries. */
+		if (pgno_max == cur_cnt) {
+			pgno_max += 20;
+			if ((ret = __os_realloc(dbp->env, pgno_max *
+			    sizeof(db_pgno_t), &pgno_cur)) != 0)
+				goto err;
+		}
+		pgno_cur[cur_cnt++] = pgno;
+
+		GET_PAGE(dbp, fhp, pgno, page);
+		nrecs += NUM_ENT(page);
+		LEVEL(page) = LEAFLEVEL;
+		TYPE(page) = sorted ? P_LDUP : P_LRECNO;
+		/*
+		 * !!!
+		 * DB didn't zero the LSNs on off-page duplicates pages.
+		 */
+		ZERO_LSN(LSN(page));
+		PUT_PAGE(dbp, fhp, pgno, page);
+
+		pgno = NEXT_PGNO(page);
+	}
+
+	/*
+	 * If we only have a single page, it's easy.
+	 *
+	 * NOTE(review): if *pgnop was PGNO_INVALID on entry, cur_cnt is 0
+	 * and pgno_cur is still NULL here, so the dereference at "done"
+	 * would fault -- presumably callers never pass an empty chain;
+	 * confirm against the call sites.
+	 */
+	if (cur_cnt <= 1)
+		goto done;
+
+	/*
+	 * pgno_cur is the list of pages we just converted.  We're
+	 * going to walk that list, but we'll need to create a new
+	 * list while we do so.
+	 */
+	if ((ret = __os_malloc(dbp->env,
+	    cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0)
+		goto err;
+
+	/* Figure out where we can start allocating new pages. */
+	if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+		goto err;
+
+	/* Allocate room for an internal page. */
+	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &ipage)) != 0)
+		goto err;
+	PGNO(ipage) = PGNO_INVALID;
+
+	/*
+	 * Repeatedly walk the list of pages, building internal pages, until
+	 * there's only one page at a level.
+	 */
+	for (level = LEAFLEVEL + 1; cur_cnt > 1; ++level) {
+		for (indx = 0, i = next_cnt = 0; i < cur_cnt;) {
+			/* An empty slot starts a fresh internal page. */
+			if (indx == 0) {
+				P_INIT(ipage, dbp->pgsize, pgno_last,
+				    PGNO_INVALID, PGNO_INVALID,
+				    level, sorted ? P_IBTREE : P_IRECNO);
+				ZERO_LSN(LSN(ipage));
+
+				pgno_next[next_cnt++] = pgno_last++;
+			}
+
+			GET_PAGE(dbp, fhp, pgno_cur[i], page);
+
+			/*
+			 * If the duplicates are sorted, put the first item on
+			 * the lower-level page onto a Btree internal page. If
+			 * the duplicates are not sorted, create an internal
+			 * Recno structure on the page.  If either case doesn't
+			 * fit, push out the current page and start a new one.
+			 */
+			nomem = 0;
+			if (sorted) {
+				if ((ret = __db_build_bi(
+				    dbp, fhp, ipage, page, indx, &nomem)) != 0)
+					goto err;
+			} else
+				if ((ret = __db_build_ri(
+				    dbp, fhp, ipage, page, indx, &nomem)) != 0)
+					goto err;
+			if (nomem) {
+				/* Full: flush and retry child i on a new page. */
+				indx = 0;
+				PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+			} else {
+				++indx;
+				++NUM_ENT(ipage);
+				++i;
+			}
+		}
+
+		/*
+		 * Push out the last internal page.  Set the top-level record
+		 * count if we've reached the top.
+		 */
+		if (next_cnt == 1)
+			RE_NREC_SET(ipage, nrecs);
+		PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+
+		/* Swap the current and next page number arrays. */
+		cur_cnt = next_cnt;
+		tmp = pgno_cur;
+		pgno_cur = pgno_next;
+		pgno_next = tmp;
+	}
+
+	/* Hand the (single) root of the new structure back to the caller. */
+done:	*pgnop = pgno_cur[0];
+
+err:	if (pgno_cur != NULL)
+		__os_free(dbp->env, pgno_cur);
+	if (pgno_next != NULL)
+		__os_free(dbp->env, pgno_next);
+	if (ipage != NULL)
+		__os_free(dbp->env, ipage);
+	if (page != NULL)
+		__os_free(dbp->env, page);
+
+	return (ret);
+}
+
+/*
+ * __db_build_bi --
+ * Build a BINTERNAL entry for a parent page.
+ */
+static int
+__db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
+	DB *dbp;
+	DB_FH *fhp;
+	PAGE *ipage, *page;
+	u_int32_t indx;
+	int *nomemp;
+{
+	BINTERNAL bi, *child_bi;
+	BKEYDATA *child_bk;
+	u_int8_t *p;
+	int ret;
+	db_indx_t *inp;
+
+	inp = P_INP(dbp, ipage);
+	switch (TYPE(page)) {
+	case P_IBTREE:
+		/* Copy the child's first internal entry up to the parent. */
+		child_bi = GET_BINTERNAL(dbp, page, 0);
+		if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) {
+			/* Doesn't fit: ask the caller to start a new page. */
+			*nomemp = 1;
+			return (0);
+		}
+		/* Carve space from the end of the page and index it. */
+		inp[indx] =
+		    HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len);
+		p = P_ENTRY(dbp, ipage, indx);
+
+		/* Build the BINTERNAL header, then append the key bytes. */
+		bi.len = child_bi->len;
+		B_TSET(bi.type, child_bi->type);
+		bi.pgno = PGNO(page);
+		bi.nrecs = __bam_total(dbp, page);
+		memcpy(p, &bi, SSZA(BINTERNAL, data));
+		p += SSZA(BINTERNAL, data);
+		memcpy(p, child_bi->data, child_bi->len);
+
+		/* Increment the overflow ref count. */
+		if (B_TYPE(child_bi->type) == B_OVERFLOW)
+			if ((ret = __db_up_ovref(dbp, fhp,
+			    ((BOVERFLOW *)(child_bi->data))->pgno)) != 0)
+				return (ret);
+		break;
+	case P_LDUP:
+		/* Promote the child leaf's first key into the parent. */
+		child_bk = GET_BKEYDATA(dbp, page, 0);
+		switch (B_TYPE(child_bk->type)) {
+		case B_KEYDATA:
+			if (P_FREESPACE(dbp, ipage) <
+			    BINTERNAL_PSIZE(child_bk->len)) {
+				*nomemp = 1;
+				return (0);
+			}
+			inp[indx] =
+			    HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len);
+			p = P_ENTRY(dbp, ipage, indx);
+
+			bi.len = child_bk->len;
+			B_TSET(bi.type, child_bk->type);
+			bi.pgno = PGNO(page);
+			bi.nrecs = __bam_total(dbp, page);
+			memcpy(p, &bi, SSZA(BINTERNAL, data));
+			p += SSZA(BINTERNAL, data);
+			memcpy(p, child_bk->data, child_bk->len);
+			break;
+		case B_OVERFLOW:
+			/* The key lives off-page: copy the BOVERFLOW stub. */
+			if (P_FREESPACE(dbp, ipage) <
+			    BINTERNAL_PSIZE(BOVERFLOW_SIZE)) {
+				*nomemp = 1;
+				return (0);
+			}
+			inp[indx] =
+			    HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE);
+			p = P_ENTRY(dbp, ipage, indx);
+
+			bi.len = BOVERFLOW_SIZE;
+			B_TSET(bi.type, child_bk->type);
+			bi.pgno = PGNO(page);
+			bi.nrecs = __bam_total(dbp, page);
+			memcpy(p, &bi, SSZA(BINTERNAL, data));
+			p += SSZA(BINTERNAL, data);
+			memcpy(p, child_bk, BOVERFLOW_SIZE);
+
+			/* Increment the overflow ref count. */
+			if ((ret = __db_up_ovref(dbp, fhp,
+			    ((BOVERFLOW *)child_bk)->pgno)) != 0)
+				return (ret);
+			break;
+		default:
+			return (__db_pgfmt(dbp->env, PGNO(page)));
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp->env, PGNO(page)));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_build_ri --
+ * Build a RINTERNAL entry for an internal parent page.
+ */
+static int
+__db_build_ri(dbp, fhp, ipage, page, indx, nomemp)
+	DB *dbp;
+	DB_FH *fhp;
+	PAGE *ipage, *page;
+	u_int32_t indx;
+	int *nomemp;
+{
+	RINTERNAL ri;
+	db_indx_t *entries;
+
+	COMPQUIET(fhp, NULL);
+
+	/* If the entry won't fit, ask the caller to start a new page. */
+	entries = P_INP(dbp, ipage);
+	if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) {
+		*nomemp = 1;
+		return (0);
+	}
+
+	/* Point the entry at the child page and record its record count. */
+	ri.pgno = PGNO(page);
+	ri.nrecs = __bam_total(dbp, page);
+	entries[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE;
+	memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE);
+
+	return (0);
+}
+
+/*
+ * __db_up_ovref --
+ * Increment/decrement the reference count on an overflow page.
+ */
+static int
+__db_up_ovref(dbp, fhp, pgno)
+	DB *dbp;
+	DB_FH *fhp;
+	db_pgno_t pgno;
+{
+	PAGE *pagep;
+	size_t n;			/* Used by GET_PAGE/PUT_PAGE. */
+	int ret;
+
+	/* Grab a scratch buffer to hold the overflow page. */
+	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &pagep)) != 0)
+		return (ret);
+
+	/* Bump the reference count and push the page back out. */
+	GET_PAGE(dbp, fhp, pgno, pagep);
+	++OV_REF(pagep);
+	PUT_PAGE(dbp, fhp, pgno, pagep);
+
+err:	__os_free(dbp->env, pagep);
+
+	return (ret);
+}
diff --git a/db/db_vrfy.c b/db/db_vrfy.c
new file mode 100644
index 0000000..7ea9c62
--- /dev/null
+++ b/db/db_vrfy.c
@@ -0,0 +1,2894 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * This is the code for DB->verify, the DB database consistency checker.
+ * For now, it checks all subdatabases in a database, and verifies
+ * everything it knows how to (i.e. it's all-or-nothing, and one can't
+ * check only for a subset of possible problems).
+ */
+
+static u_int __db_guesspgsize __P((ENV *, DB_FH *));
+static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *));
+static int __db_meta2pgset
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *));
+static int __db_salvage __P((DB *, VRFY_DBINFO *,
+ db_pgno_t, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *,
+ PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_all __P((DB *, VRFY_DBINFO *, void *,
+ int(*)(void *, const void *), u_int32_t, int *));
+static int __db_salvage_unknowns __P((DB *, VRFY_DBINFO *, void *,
+ int (*)(void *, const void *), u_int32_t));
+static int __db_verify_arg __P((DB *, const char *, void *, u_int32_t));
+static int __db_vrfy_freelist
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+static int __db_vrfy_invalid
+ __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+static int __db_vrfy_orderchkonly __P((DB *,
+ VRFY_DBINFO *, const char *, const char *, u_int32_t));
+static int __db_vrfy_pagezero __P((DB *, VRFY_DBINFO *, DB_FH *, u_int32_t));
+static int __db_vrfy_subdbs
+ __P((DB *, VRFY_DBINFO *, const char *, u_int32_t));
+static int __db_vrfy_structure __P((DB *, VRFY_DBINFO *,
+ const char *, db_pgno_t, void *, void *, u_int32_t));
+static int __db_vrfy_walkpages __P((DB *, VRFY_DBINFO *,
+ void *, int (*)(void *, const void *), u_int32_t));
+
+#define VERIFY_FLAGS \
+ (DB_AGGRESSIVE | \
+ DB_NOORDERCHK | DB_ORDERCHKONLY | DB_PRINTABLE | DB_SALVAGE | DB_UNREF)
+
+/*
+ * __db_verify_pp --
+ * DB->verify public interface.
+ *
+ * PUBLIC: int __db_verify_pp
+ * PUBLIC: __P((DB *, const char *, const char *, FILE *, u_int32_t));
+ */
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+	DB *dbp;
+	const char *file, *database;
+	FILE *outfile;
+	u_int32_t flags;
+{
+	/*
+	 * This is a thin shim over __db_verify_internal, which the non-C
+	 * APIs call directly with their own FILE * equivalents -- that is
+	 * also why the usual ENV_ENTER macros live there, not here.
+	 */
+	return (__db_verify_internal(dbp,
+	    file, database, outfile, __db_pr_callback, flags));
+}
+
+/*
+ * __db_verify_internal --
+ *
+ * PUBLIC: int __db_verify_internal __P((DB *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_verify_internal(dbp, fname, dname, handle, callback, flags)
+	DB *dbp;
+	const char *fname, *dname;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->verify");
+
+	/* Unless salvaging, also check for unreferenced pages. */
+	if (!LF_ISSET(DB_SALVAGE))
+		LF_SET(DB_UNREF);
+
+	ENV_ENTER(env, ip);
+
+	/* Validate the arguments, then do the real work. */
+	ret = __db_verify_arg(dbp, dname, handle, flags);
+	if (ret == 0)
+		ret = __db_verify(dbp, ip,
+		    fname, dname, handle, callback, NULL, NULL, flags);
+
+	/* Db.verify is a DB handle destructor. */
+	if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_verify_arg --
+ * Check DB->verify arguments.
+ */
+static int
+__db_verify_arg(dbp, dname, handle, flags)
+	DB *dbp;
+	const char *dname;
+	void *handle;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_fchk(env, "DB->verify", flags, VERIFY_FLAGS)) != 0)
+		return (ret);
+
+	/*
+	 * DB_SALVAGE is mutually exclusive with the other flags except
+	 * DB_AGGRESSIVE, DB_PRINTABLE.
+	 *
+	 * DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging.
+	 *
+	 * DB_SALVAGE requires an output stream.
+	 */
+	if (LF_ISSET(DB_SALVAGE)) {
+		if (LF_ISSET(~(DB_AGGRESSIVE | DB_PRINTABLE | DB_SALVAGE)))
+			return (__db_ferr(env, "DB->verify", 1));
+		if (handle == NULL) {
+			/* Fixed typo: was "requires a an output handle". */
+			__db_errx(env,
+			    "DB_SALVAGE requires an output handle");
+			return (EINVAL);
+		}
+	} else
+		if (LF_ISSET(DB_AGGRESSIVE | DB_PRINTABLE))
+			return (__db_ferr(env, "DB->verify", 1));
+
+	/*
+	 * DB_ORDERCHKONLY is mutually exclusive with DB_SALVAGE and
+	 * DB_NOORDERCHK, and requires a database name.
+	 */
+	if ((ret = __db_fcchk(env, "DB->verify", flags,
+	    DB_ORDERCHKONLY, DB_SALVAGE | DB_NOORDERCHK)) != 0)
+		return (ret);
+	if (LF_ISSET(DB_ORDERCHKONLY) && dname == NULL) {
+		__db_errx(env, "DB_ORDERCHKONLY requires a database name");
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __db_verify --
+ * Walk the entire file page-by-page, either verifying with or without
+ * dumping in db_dump -d format, or DB_SALVAGE-ing whatever key/data
+ * pairs can be found and dumping them in standard (db_load-ready)
+ * dump format.
+ *
+ * (Salvaging isn't really a verification operation, but we put it
+ * here anyway because it requires essentially identical top-level
+ * code.)
+ *
+ * flags may be 0, DB_NOORDERCHK, DB_ORDERCHKONLY, or DB_SALVAGE
+ * (and optionally DB_AGGRESSIVE).
+ * PUBLIC: int __db_verify __P((DB *, DB_THREAD_INFO *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *),
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__db_verify(dbp, ip, name, subdb, handle, callback, lp, rp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ const char *name, *subdb;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ u_int32_t sflags;
+ int has_subdbs, isbad, ret, t_ret;
+ char *real_name;
+
+ env = dbp->env;
+ fhp = NULL;
+ vdp = NULL;
+ real_name = NULL;
+ has_subdbs = isbad = ret = t_ret = 0;
+
+ F_SET(dbp, DB_AM_VERIFYING);
+
+ /* Initialize any feedback function. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 0);
+
+ /*
+ * We don't know how large the cache is, and if the database
+ * in question uses a small page size--which we don't know
+ * yet!--it may be uncomfortably small for the default page
+ * size [#2143]. However, the things we need temporary
+ * databases for in dbinfo are largely tiny, so using a
+ * 1024-byte pagesize is probably not going to be a big hit,
+ * and will make us fit better into small spaces.
+ */
+ if ((ret = __db_vrfy_dbinfo_create(env, ip, 1024, &vdp)) != 0)
+ goto err;
+
+ /*
+ * Note whether the user has requested that we use printable
+ * chars where possible. We won't get here with this flag if
+ * we're not salvaging.
+ */
+ if (LF_ISSET(DB_PRINTABLE))
+ F_SET(vdp, SALVAGE_PRINTABLE);
+
+ /* Find the real name of the file. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /*
+ * Our first order of business is to verify page 0, which is
+ * the metadata page for the master database of subdatabases
+ * or of the only database in the file. We want to do this by hand
+ * rather than just calling __db_open in case it's corrupt--various
+ * things in __db_open might act funny.
+ *
+ * Once we know the metadata page is healthy, I believe that it's
+ * safe to open the database normally and then use the page swapping
+ * code, which makes life easier.
+ */
+ if ((ret = __os_open(env, real_name, 0, DB_OSO_RDONLY, 0, &fhp)) != 0)
+ goto err;
+
+ /* Verify the metadata page 0; set pagesize and type. */
+ if ((ret = __db_vrfy_pagezero(dbp, vdp, fhp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * We can assume at this point that dbp->pagesize and dbp->type are
+ * set correctly, or at least as well as they can be, and that
+ * locking, logging, and txns are not in use. Thus we can trust
+ * the memp code not to look at the page, and thus to be safe
+ * enough to use.
+ *
+ * The dbp is not open, but the file is open in the fhp, and we
+ * cannot assume that __db_open is safe. Call __env_setup,
+ * the [safe] part of __db_open that initializes the environment--
+ * and the mpool--manually.
+ */
+ if ((ret = __env_setup(dbp, NULL,
+ name, subdb, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0)
+ goto err;
+
+ /*
+ * Set our name in the Queue subsystem; we may need it later
+ * to deal with extents.
+ */
+ if (dbp->type == DB_QUEUE &&
+ (ret = __qam_set_ext_data(dbp, name)) != 0)
+ goto err;
+
+ /* Mark the dbp as opened, so that we correctly handle its close. */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+
+ /* Find out the page number of the last page in the database. */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &vdp->last_pgno)) != 0)
+ goto err;
+
+ /*
+ * DB_ORDERCHKONLY is a special case; our file consists of
+ * several subdatabases, which use different hash, bt_compare,
+ * and/or dup_compare functions. Consequently, we couldn't verify
+ * sorting and hashing simply by calling DB->verify() on the file.
+ * DB_ORDERCHKONLY allows us to come back and check those things; it
+ * requires a subdatabase, and assumes that everything but that
+ * database's sorting/hashing is correct.
+ */
+ if (LF_ISSET(DB_ORDERCHKONLY)) {
+ ret = __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags);
+ goto done;
+ }
+
+ sflags = flags;
+ if (dbp->p_internal != NULL)
+ LF_CLR(DB_SALVAGE);
+
+ /*
+ * When salvaging, we use a db to keep track of whether we've seen a
+ * given overflow or dup page in the course of traversing normal data.
+ * If in the end we have not, we assume its key got lost and print it
+ * with key "UNKNOWN".
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_init(vdp)) != 0)
+ goto err;
+
+ /*
+ * If we're not being aggressive, salvage by walking the tree
+ * and only printing the leaves we find. "has_subdbs" will
+ * indicate whether we found subdatabases.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) && __db_salvage_all(
+ dbp, vdp, handle, callback, flags, &has_subdbs) != 0)
+ isbad = 1;
+
+ /*
+ * If we have subdatabases, flag if any keys are found that
+ * don't belong to a subdatabase -- they'll need to have an
+ * "__OTHER__" subdatabase header printed first.
+ */
+ if (has_subdbs) {
+ F_SET(vdp, SALVAGE_PRINTHEADER);
+ F_SET(vdp, SALVAGE_HASSUBDBS);
+ }
+ }
+
+ /* Walk all the pages, if a page cannot be read, verify structure. */
+ if ((ret =
+ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != DB_PAGE_NOTFOUND)
+ goto err;
+ }
+
+ /* If we're verifying, verify inter-page structure. */
+ if (!LF_ISSET(DB_SALVAGE) && isbad == 0)
+ if ((t_ret = __db_vrfy_structure(dbp,
+ vdp, name, 0, lp, rp, flags)) != 0) {
+ if (t_ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * If we're salvaging, output with key UNKNOWN any overflow or dup pages
+ * we haven't been able to put in context. Then destroy the salvager's
+ * state-saving database.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_unknowns(dbp,
+ vdp, handle, callback, flags)) != 0)
+ isbad = 1;
+ }
+
+ flags = sflags;
+
+#ifdef HAVE_PARTITION
+ if (t_ret == 0 && dbp->p_internal != NULL)
+ t_ret = __part_verify(dbp, vdp, name, handle, callback, flags);
+#endif
+
+ if (ret == 0)
+ ret = t_ret;
+
+ /* Don't display a footer for a database holding other databases. */
+ if (LF_ISSET(DB_SALVAGE | DB_VERIFY_PARTITION) == DB_SALVAGE &&
+ (!has_subdbs || F_ISSET(vdp, SALVAGE_PRINTFOOTER)))
+ (void)__db_prfooter(handle, callback);
+
+done: err:
+ /* Send feedback that we're done. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 100);
+
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_destroy(vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (vdp != NULL &&
+ (t_ret = __db_vrfy_dbinfo_destroy(env, vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (real_name != NULL)
+ __os_free(env, real_name);
+
+ /*
+ * DB_VERIFY_FATAL is a private error, translate to a public one.
+ *
+ * If we didn't find a page, it's probably a page number was corrupted.
+ * Return the standard corruption error.
+ *
+ * Otherwise, if we found corruption along the way, set the return.
+ */
+ if (ret == DB_VERIFY_FATAL ||
+ ret == DB_PAGE_NOTFOUND || (ret == 0 && isbad == 1))
+ ret = DB_VERIFY_BAD;
+
+ /* Make sure there's a public complaint if we found corruption. */
+ if (ret != 0)
+ __db_err(env, ret, "%s", name);
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_pagezero --
+ * Verify the master metadata page. Use seek, read, and a local buffer
+ * rather than the DB paging code, for safety.
+ *
+ * Must correctly (or best-guess) set dbp->type and dbp->pagesize.
+ */
+static int
+__db_vrfy_pagezero(dbp, vdp, fhp, flags)
+	DB *dbp;			/* Handle whose type/pgsize we set here. */
+	VRFY_DBINFO *vdp;		/* Verifier state; receives meta_last_pgno. */
+	DB_FH *fhp;			/* Raw file handle -- we bypass mpool. */
+	u_int32_t flags;
+{
+	DBMETA *meta;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t freelist;
+	size_t nr;
+	int isbad, ret, swapped;
+	u_int8_t mbuf[DBMETASIZE];	/* Local buffer for the raw meta page. */
+
+	isbad = ret = swapped = 0;
+	freelist = 0;
+	env = dbp->env;
+	meta = (DBMETA *)mbuf;
+	dbp->type = DB_UNKNOWN;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+		return (ret);
+
+	/*
+	 * Seek to the metadata page.
+	 * Note that if we're just starting a verification, dbp->pgsize
+	 * may be zero; this is okay, as we want page zero anyway and
+	 * 0*0 == 0.
+	 */
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+	    (ret = __os_read(env, fhp, mbuf, DBMETASIZE, &nr)) != 0) {
+		__db_err(env, ret,
+		    "Metadata page %lu cannot be read", (u_long)PGNO_BASE_MD);
+		return (ret);
+	}
+
+	if (nr != DBMETASIZE) {
+		EPRINT((env,
+		    "Page %lu: Incomplete metadata page",
+		    (u_long)PGNO_BASE_MD));
+		return (DB_VERIFY_FATAL);
+	}
+
+	/*
+	 * NOTE(review): __db_chk_meta returning -1 is treated here as
+	 * "corrupt but we can keep going"; any other nonzero return is
+	 * fatal -- confirm against __db_chk_meta's contract.
+	 */
+	if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) {
+		EPRINT((env,
+		    "Page %lu: metadata page corrupted", (u_long)PGNO_BASE_MD));
+		isbad = 1;
+		if (ret != -1) {
+			EPRINT((env,
+			    "Page %lu: could not check metadata page",
+			    (u_long)PGNO_BASE_MD));
+			return (DB_VERIFY_FATAL);
+		}
+	}
+
+	/*
+	 * Check all of the fields that we can.
+	 *
+	 * 08-11: Current page number.  Must == pgno.
+	 * Note that endianness doesn't matter--it's zero.
+	 */
+	if (meta->pgno != PGNO_BASE_MD) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: pgno incorrectly set to %lu",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->pgno));
+	}
+
+	/* 12-15: Magic number.  Must be one of valid set. */
+	if (__db_is_valid_magicno(meta->magic, &dbp->type))
+		swapped = 0;
+	else {
+		/* Retry with the bytes swapped: maybe an other-endian file. */
+		M_32_SWAP(meta->magic);
+		if (__db_is_valid_magicno(meta->magic,
+		    &dbp->type))
+			swapped = 1;
+		else {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: bad magic number %lu",
+			    (u_long)PGNO_BASE_MD, (u_long)meta->magic));
+		}
+	}
+
+	/*
+	 * 16-19: Version.  Must be current; for now, we
+	 * don't support verification of old versions.
+	 */
+	if (swapped)
+		M_32_SWAP(meta->version);
+	if ((dbp->type == DB_BTREE &&
+	    (meta->version > DB_BTREEVERSION ||
+	    meta->version < DB_BTREEOLDVER)) ||
+	    (dbp->type == DB_HASH &&
+	    (meta->version > DB_HASHVERSION ||
+	    meta->version < DB_HASHOLDVER)) ||
+	    (dbp->type == DB_QUEUE &&
+	    (meta->version > DB_QAMVERSION ||
+	    meta->version < DB_QAMOLDVER))) {
+		isbad = 1;
+		EPRINT((env,
+	    "Page %lu: unsupported DB version %lu; extraneous errors may result",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->version));
+	}
+
+	/*
+	 * 20-23: Pagesize.  Must be power of two,
+	 * greater than 512, and less than 64K.
+	 */
+	if (swapped)
+		M_32_SWAP(meta->pagesize);
+	if (IS_VALID_PAGESIZE(meta->pagesize))
+		dbp->pgsize = meta->pagesize;
+	else {
+		isbad = 1;
+		EPRINT((env, "Page %lu: bad page size %lu",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->pagesize));
+
+		/*
+		 * Now try to settle on a pagesize to use.
+		 * If the user-supplied one is reasonable,
+		 * use it; else, guess.
+		 */
+		if (!IS_VALID_PAGESIZE(dbp->pgsize))
+			dbp->pgsize = __db_guesspgsize(env, fhp);
+	}
+
+	/*
+	 * 25: Page type.  Must be correct for dbp->type,
+	 * which is by now set as well as it can be.
+	 */
+	/* Needs no swapping--only one byte! */
+	if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) ||
+	    (dbp->type == DB_HASH && meta->type != P_HASHMETA) ||
+	    (dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: bad page type %lu",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->type));
+	}
+
+	/*
+	 * 26: Meta-flags.
+	 */
+	if (meta->metaflags != 0) {
+		/* Only the checksum and partitioning flags are legal. */
+		if (FLD_ISSET(meta->metaflags,
+		    ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: bad meta-data flags value %#lx",
+			    (u_long)PGNO_BASE_MD, (u_long)meta->metaflags));
+		}
+		if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+			F_SET(pip, VRFY_HAS_CHKSUM);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+			F_SET(pip, VRFY_HAS_PART_RANGE);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+			F_SET(pip, VRFY_HAS_PART_CALLBACK);
+
+		if (FLD_ISSET(meta->metaflags,
+		    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) &&
+		    (ret = __partition_init(dbp, meta->metaflags)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * 28-31: Free list page number.
+	 * 32-35: Last page in database file.
+	 * We'll verify its sensibility when we do inter-page
+	 * verification later; for now, just store it.
+	 */
+	if (swapped)
+	    M_32_SWAP(meta->free);
+	freelist = meta->free;
+	if (swapped)
+	    M_32_SWAP(meta->last_pgno);
+	vdp->meta_last_pgno = meta->last_pgno;
+
+	/*
+	 * Initialize vdp->pages to fit a single pageinfo structure for
+	 * this one page.  We'll realloc later when we know how many
+	 * pages there are.
+	 */
+	pip->pgno = PGNO_BASE_MD;
+	pip->type = meta->type;
+
+	/*
+	 * Signal that we still have to check the info specific to
+	 * a given type of meta page.
+	 */
+	F_SET(pip, VRFY_INCOMPLETE);
+
+	pip->free = freelist;
+
+	if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+		return (ret);
+
+	/* Set up the dbp's fileid.  We don't use the regular open path. */
+	memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN);
+
+	/* Remember the file's byte order for the rest of verification. */
+	if (swapped == 1)
+		F_SET(dbp, DB_AM_SWAP);
+
+	return (isbad ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __db_vrfy_walkpages --
+ * Main loop of the verifier/salvager. Walks through,
+ * page by page, and verifies all pages and/or prints all data pages.
+ */
+static int
+__db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	void *handle;			/* Opaque handle passed to callback. */
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;		/* DB_SALVAGE selects salvage mode. */
+{
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t i;
+	int ret, t_ret, isbad;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	h = NULL;
+	ret = isbad = t_ret = 0;
+
+	/* Pass 1: visit every page from 0 through the last known page. */
+	for (i = 0; i <= vdp->last_pgno; i++) {
+		/*
+		 * If DB_SALVAGE is set, we inspect our database of completed
+		 * pages, and skip any we've already printed in the subdb pass.
+		 */
+		if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
+			continue;
+
+		/*
+		 * An individual page get can fail if:
+		 * * This is a hash database, it is expected to find
+		 * empty buckets, which don't have allocated pages. Create
+		 * a dummy page so the verification can proceed.
+		 * * We are salvaging, flag the error and continue.
+		 */
+		if ((t_ret = __memp_fget(mpf, &i,
+		    vdp->thread_info, NULL, 0, &h)) != 0) {
+			if (dbp->type == DB_HASH) {
+				/* Record a synthetic P_INVALID page. */
+				if ((t_ret =
+				    __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+					goto err1;
+				pip->type = P_INVALID;
+				pip->pgno = i;
+				F_CLR(pip, VRFY_IS_ALLZEROES);
+				if ((t_ret = __db_vrfy_putpageinfo(
+				    env, vdp, pip)) != 0)
+					goto err1;
+				continue;
+			}
+			if (t_ret == DB_PAGE_NOTFOUND) {
+				EPRINT((env,
+	    "Page %lu: beyond the end of the file, metadata page has last page as %lu",
+				    (u_long)i, (u_long)vdp->last_pgno));
+				if (ret == 0)
+					return (t_ret);
+			}
+
+err1:			if (ret == 0)
+				ret = t_ret;
+			if (LF_ISSET(DB_SALVAGE))
+				continue;
+			return (ret);
+		}
+
+		if (LF_ISSET(DB_SALVAGE)) {
+			/*
+			 * We pretty much don't want to quit unless a
+			 * bomb hits.  May as well return that something
+			 * was screwy, however.
+			 */
+			if ((t_ret = __db_salvage_pg(dbp,
+			    vdp, i, h, handle, callback, flags)) != 0) {
+				if (ret == 0)
+					ret = t_ret;
+				isbad = 1;
+			}
+		} else {
+			/*
+			 * If we are not salvaging, and we get any error
+			 * other than DB_VERIFY_BAD, return immediately;
+			 * it may not be safe to proceed.  If we get
+			 * DB_VERIFY_BAD, keep going; listing more errors
+			 * may make it easier to diagnose problems and
+			 * determine the magnitude of the corruption.
+			 *
+			 * Verify info common to all page types.
+			 */
+			if (i != PGNO_BASE_MD) {
+				ret = __db_vrfy_common(dbp, vdp, h, i, flags);
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else if (ret != 0)
+					goto err;
+			}
+
+			/* Dispatch to the per-page-type verifier. */
+			switch (TYPE(h)) {
+			case P_INVALID:
+				ret = __db_vrfy_invalid(dbp, vdp, h, i, flags);
+				break;
+			case __P_DUPLICATE:
+				isbad = 1;
+				EPRINT((env,
+				    "Page %lu: old-style duplicate page",
+				    (u_long)i));
+				break;
+			case P_HASH_UNSORTED:
+			case P_HASH:
+				ret = __ham_vrfy(dbp, vdp, h, i, flags);
+				break;
+			case P_IBTREE:
+			case P_IRECNO:
+			case P_LBTREE:
+			case P_LDUP:
+				ret = __bam_vrfy(dbp, vdp, h, i, flags);
+				break;
+			case P_LRECNO:
+				ret = __ram_vrfy_leaf(dbp, vdp, h, i, flags);
+				break;
+			case P_OVERFLOW:
+				ret = __db_vrfy_overflow(dbp, vdp, h, i, flags);
+				break;
+			case P_HASHMETA:
+				ret = __ham_vrfy_meta(dbp,
+				    vdp, (HMETA *)h, i, flags);
+				break;
+			case P_BTREEMETA:
+				ret = __bam_vrfy_meta(dbp,
+				    vdp, (BTMETA *)h, i, flags);
+				break;
+			case P_QAMMETA:
+				ret = __qam_vrfy_meta(dbp,
+				    vdp, (QMETA *)h, i, flags);
+				break;
+			case P_QAMDATA:
+				ret = __qam_vrfy_data(dbp,
+				    vdp, (QPAGE *)h, i, flags);
+				break;
+			default:
+				EPRINT((env,
+				    "Page %lu: unknown page type %lu",
+				    (u_long)i, (u_long)TYPE(h)));
+				isbad = 1;
+				break;
+			}
+
+			/*
+			 * Set up error return.
+			 */
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else if (ret != 0)
+				goto err;
+
+			/*
+			 * Provide feedback to the application about our
+			 * progress.  The range 0-50% comes from the fact
+			 * that this is the first of two passes through the
+			 * database (front-to-back, then top-to-bottom).
+			 */
+			if (dbp->db_feedback != NULL)
+				dbp->db_feedback(dbp, DB_VERIFY,
+				    (int)((i + 1) * 50 / (vdp->last_pgno + 1)));
+		}
+
+		/*
+		 * Just as with the page get, bail if and only if we're
+		 * not salvaging.
+		 */
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			if (!LF_ISSET(DB_SALVAGE))
+				return (ret);
+		}
+	}
+
+	/*
+	 * If we've seen a Queue metadata page, we may need to walk Queue
+	 * extent pages that won't show up between 0 and vdp->last_pgno.
+	 */
+	if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret =
+	    __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) {
+		if (ret == 0)
+			ret = t_ret;
+		if (t_ret == DB_VERIFY_BAD)
+			isbad = 1;
+		else if (!LF_ISSET(DB_SALVAGE))
+			return (ret);
+	}
+
+	/* Error path: release the pinned page before returning. */
+	if (0) {
+err:		if (h != NULL && (t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0)
+			return (ret == 0 ? t_ret : ret);
+	}
+
+	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_structure--
+ * After a beginning-to-end walk through the database has been
+ * completed, put together the information that has been collected
+ * to verify the overall database structure.
+ *
+ * Should only be called if we want to do a database verification,
+ * i.e. if DB_SALVAGE is not set.
+ */
+static int
+__db_vrfy_structure(dbp, vdp, dbname, meta_pgno, lp, rp, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *dbname;
+	db_pgno_t meta_pgno;
+	void *lp, *rp;			/* Opaque bounds passed to btree check. */
+	u_int32_t flags;
+{
+	DB *pgset;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t i;
+	int ret, isbad, hassubs, p;	/* p: pgset reference count per page. */
+
+	isbad = 0;
+	pip = NULL;
+	env = dbp->env;
+	pgset = vdp->pgset;
+
+	/*
+	 * Providing feedback here is tricky; in most situations,
+	 * we fetch each page one more time, but we do so in a top-down
+	 * order that depends on the access method.  Worse, we do this
+	 * recursively in btree, such that on any call where we're traversing
+	 * a subtree we don't know where that subtree is in the whole database;
+	 * worse still, any given database may be one of several subdbs.
+	 *
+	 * The solution is to decrement a counter vdp->pgs_remaining each time
+	 * we verify (and call feedback on) a page.  We may over- or
+	 * under-count, but the structure feedback function will ensure that we
+	 * never give a percentage under 50 or over 100.  (The first pass
+	 * covered the range 0-50%.)
+	 */
+	if (dbp->db_feedback != NULL)
+		vdp->pgs_remaining = vdp->last_pgno + 1;
+
+	/*
+	 * Call the appropriate function to downwards-traverse the db type.
+	 */
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		if ((ret =
+		    __bam_vrfy_structure(dbp, vdp, 0, lp, rp, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else
+				goto err;
+		}
+
+		/*
+		 * If we have subdatabases and we know that the database is,
+		 * thus far, sound, it's safe to walk the tree of subdatabases.
+		 * Do so, and verify the structure of the databases within.
+		 */
+		if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0)
+			goto err;
+		hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 1 : 0;
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			goto err;
+		pip = NULL;
+
+		if (isbad == 0 && hassubs)
+			if ((ret =
+			    __db_vrfy_subdbs(dbp, vdp, dbname, flags)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto err;
+			}
+		break;
+	case DB_HASH:
+		if ((ret = __ham_vrfy_structure(dbp, vdp, 0, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else
+				goto err;
+		}
+		break;
+	case DB_QUEUE:
+		if ((ret = __qam_vrfy_structure(dbp, vdp, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+		}
+
+		/*
+		 * Queue pages may be unreferenced and totally zeroed, if
+		 * they're empty; queue doesn't have much structure, so
+		 * this is unlikely to be wrong in any troublesome sense.
+		 * Skip to "err".
+		 */
+		goto err;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_path(env, "__db_vrfy_structure");
+		goto err;
+	}
+
+	/* Walk free list. */
+	if ((ret =
+	    __db_vrfy_freelist(dbp, vdp, meta_pgno, flags)) == DB_VERIFY_BAD)
+		isbad = 1;
+
+	/*
+	 * If structure checks up until now have failed, it's likely that
+	 * checking what pages have been missed will result in oodles of
+	 * extraneous error messages being EPRINTed.  Skip to the end
+	 * if this is the case; we're going to be printing at least one
+	 * error anyway, and probably all the more salient ones.
+	 */
+	if (ret != 0 || isbad == 1)
+		goto err;
+
+	/*
+	 * Make sure no page has been missed and that no page is still marked
+	 * "all zeroes" (only certain hash pages can be, and they're unmarked
+	 * in __ham_vrfy_structure).
+	 */
+	for (i = 0; i < vdp->last_pgno + 1; i++) {
+		if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+			goto err;
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, i, &p)) != 0)
+			goto err;
+		if (pip->type == P_OVERFLOW) {
+			/* Overflow pages must be referenced refcount times. */
+			if ((u_int32_t)p != pip->refcount) {
+				EPRINT((env,
+		    "Page %lu: overflow refcount %lu, referenced %lu times",
+				    (u_long)i,
+				    (u_long)pip->refcount, (u_long)p));
+				isbad = 1;
+			}
+		} else if (p == 0 &&
+#ifndef HAVE_FTRUNCATE
+		    !(i > vdp->meta_last_pgno &&
+		    (F_ISSET(pip, VRFY_IS_ALLZEROES) || pip->type == P_HASH)) &&
+#endif
+		    !(dbp->type == DB_HASH && pip->type == P_INVALID)) {
+			/*
+			 * It is OK for unreferenced hash buckets to be
+			 * marked invalid and unreferenced.
+			 */
+			EPRINT((env,
+			    "Page %lu: unreferenced page", (u_long)i));
+			isbad = 1;
+		}
+
+		if (F_ISSET(pip, VRFY_IS_ALLZEROES)
+#ifndef HAVE_FTRUNCATE
+		    && i <= vdp->meta_last_pgno
+#endif
+		    ) {
+			EPRINT((env,
+			    "Page %lu: totally zeroed page", (u_long)i));
+			isbad = 1;
+		}
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			goto err;
+		pip = NULL;
+	}
+
+err:	if (pip != NULL)
+		(void)__db_vrfy_putpageinfo(env, vdp, pip);
+
+	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_is_valid_magicno --
+ *	Map a metadata magic number to its access method.  Returns 1 and
+ *	sets *typep when the magic is recognized; otherwise returns 0
+ *	with *typep set to DB_UNKNOWN.
+ */
+static int
+__db_is_valid_magicno(magic, typep)
+	u_int32_t magic;
+	DBTYPE *typep;
+{
+	if (magic == DB_BTREEMAGIC) {
+		*typep = DB_BTREE;
+		return (1);
+	}
+	if (magic == DB_HASHMAGIC) {
+		*typep = DB_HASH;
+		return (1);
+	}
+	if (magic == DB_QAMMAGIC) {
+		*typep = DB_QUEUE;
+		return (1);
+	}
+	*typep = DB_UNKNOWN;
+	return (0);
+}
+
+/*
+ * __db_vrfy_common --
+ * Verify info common to all page types.
+ *
+ * PUBLIC: int __db_vrfy_common
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_common(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	u_int8_t *cp, *end;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	pip->pgno = pgno;
+	F_CLR(pip, VRFY_IS_ALLZEROES);
+
+	/*
+	 * A non-zero page whose stored page number is zero may be one of
+	 * the pages hash leaves zeroed when it extends the table (they may
+	 * not be all zero if they were used, freed and then reallocated),
+	 * or a hole in a sparse queue file.  Detect the truly all-zero
+	 * case, provisionally type the page as hash, and let the later
+	 * structure pass decide whether that makes sense.
+	 */
+	if (pgno != 0 && PGNO(h) == 0) {
+		F_SET(pip, VRFY_IS_ALLZEROES);
+		end = (u_int8_t *)h + dbp->pgsize;
+		for (cp = (u_int8_t *)h; cp < end; cp++)
+			if (*cp != 0) {
+				F_CLR(pip, VRFY_IS_ALLZEROES);
+				break;
+			}
+		pip->type = P_HASH;
+		ret = 0;
+		goto err;		/* Not really an error. */
+	}
+
+	/* The on-page number must match the page's position in the file. */
+	if (PGNO(h) != pgno) {
+		EPRINT((env, "Page %lu: bad page number %lu",
+		    (u_long)pgno, (u_long)h->pgno));
+		ret = DB_VERIFY_BAD;
+	}
+
+	/* The page type must be one we know about. */
+	switch (h->type) {
+	case P_INVALID:			/* Order matches ordinal value. */
+	case P_HASH_UNSORTED:
+	case P_IBTREE:
+	case P_IRECNO:
+	case P_LBTREE:
+	case P_LRECNO:
+	case P_OVERFLOW:
+	case P_HASHMETA:
+	case P_BTREEMETA:
+	case P_QAMMETA:
+	case P_QAMDATA:
+	case P_LDUP:
+	case P_HASH:
+		break;
+	default:
+		EPRINT((env, "Page %lu: bad page type %lu",
+		    (u_long)pgno, (u_long)h->type));
+		ret = DB_VERIFY_BAD;
+	}
+	pip->type = h->type;
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_vrfy_invalid --
+ *	Verify a P_INVALID (free) page.  About all we can check is that
+ *	its next-page link is plausible; record it for the free-list walk.
+ */
+static int
+__db_vrfy_invalid(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t next;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	/* Start from clean links; fill in next_pgno only if it's sane. */
+	pip->next_pgno = 0;
+	pip->prev_pgno = 0;
+
+	next = NEXT_PGNO(h);
+	if (IS_VALID_PGNO(next))
+		pip->next_pgno = next;
+	else {
+		EPRINT((env, "Page %lu: invalid next_pgno %lu",
+		    (u_long)pgno, (u_long)next));
+		ret = DB_VERIFY_BAD;
+	}
+
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_vrfy_datapage --
+ * Verify elements common to data pages (P_HASH, P_LBTREE,
+ * P_IBTREE, P_IRECNO, P_LRECNO, P_OVERFLOW, P_DUPLICATE)--i.e.,
+ * those defined in the PAGE structure.
+ *
+ * Called from each of the per-page routines, after the
+ * all-page-type-common elements of pip have been verified and filled
+ * in.
+ *
+ * PUBLIC: int __db_vrfy_datapage
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_datapage(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	u_int32_t smallest_entry;	/* Minimum on-page size of one entry. */
+	int isbad, ret, t_ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+	isbad = 0;
+
+	/*
+	 * prev_pgno and next_pgno:  store for inter-page checks,
+	 * verify that they point to actual pages and not to self.
+	 *
+	 * !!!
+	 * Internal btree pages do not maintain these fields (indeed,
+	 * they overload them).  Skip.
+	 */
+	if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
+		if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) {
+			isbad = 1;
+			EPRINT((env, "Page %lu: invalid prev_pgno %lu",
+			    (u_long)pip->pgno, (u_long)PREV_PGNO(h)));
+		}
+		if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) {
+			isbad = 1;
+			EPRINT((env, "Page %lu: invalid next_pgno %lu",
+			    (u_long)pip->pgno, (u_long)NEXT_PGNO(h)));
+		}
+		pip->prev_pgno = PREV_PGNO(h);
+		pip->next_pgno = NEXT_PGNO(h);
+	}
+
+	/*
+	 * Verify the number of entries on the page: there's no good way to
+	 * determine if this is accurate.  The best we can do is verify that
+	 * it's not more than can, in theory, fit on the page.  Then, we make
+	 * sure there are at least this many valid elements in inp[], and
+	 * hope the test catches most cases.
+	 */
+	switch (TYPE(h)) {
+	case P_HASH_UNSORTED:
+	case P_HASH:
+		smallest_entry = HKEYDATA_PSIZE(0);
+		break;
+	case P_IBTREE:
+		smallest_entry = BINTERNAL_PSIZE(0);
+		break;
+	case P_IRECNO:
+		smallest_entry = RINTERNAL_PSIZE;
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+	case P_LRECNO:
+		smallest_entry = BKEYDATA_PSIZE(0);
+		break;
+	default:
+		smallest_entry = 0;
+		break;
+	}
+	/*
+	 * NOTE(review): the /2 presumably allows slack for access methods
+	 * that count two inp[] slots per logical item -- confirm.
+	 */
+	if (smallest_entry * NUM_ENT(h) / 2 > dbp->pgsize) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: too many entries: %lu",
+		    (u_long)pgno, (u_long)NUM_ENT(h)));
+	}
+
+	if (TYPE(h) != P_OVERFLOW)
+		pip->entries = NUM_ENT(h);
+
+	/*
+	 * btree level.  Should be zero unless we're a btree;
+	 * if we are a btree, should be between LEAFLEVEL and MAXBTREELEVEL,
+	 * and we need to save it off.
+	 */
+	switch (TYPE(h)) {
+	case P_IBTREE:
+	case P_IRECNO:
+		if (LEVEL(h) < LEAFLEVEL + 1) {
+			isbad = 1;
+			EPRINT((env, "Page %lu: bad btree level %lu",
+			    (u_long)pgno, (u_long)LEVEL(h)));
+		}
+		pip->bt_level = LEVEL(h);
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+	case P_LRECNO:
+		if (LEVEL(h) != LEAFLEVEL) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: btree leaf page has incorrect level %lu",
+			    (u_long)pgno, (u_long)LEVEL(h)));
+		}
+		break;
+	default:
+		if (LEVEL(h) != 0) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: nonzero level %lu in non-btree database",
+			    (u_long)pgno, (u_long)LEVEL(h)));
+		}
+		break;
+	}
+
+	/*
+	 * Even though inp[] occurs in all PAGEs, we look at it in the
+	 * access-method-specific code, since btree and hash treat
+	 * item lengths very differently, and one of the most important
+	 * things we want to verify is that the data--as specified
+	 * by offset and length--cover the right part of the page
+	 * without overlaps, gaps, or violations of the page boundary.
+	 */
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_meta--
+ * Verify the access-method common parts of a meta page, using
+ * normal mpool routines.
+ *
+ * PUBLIC: int __db_vrfy_meta
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_meta(dbp, vdp, meta, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	DBMETA *meta;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	DBTYPE dbtype, magtype;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int isbad, ret, t_ret;
+
+	isbad = 0;
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	/* type plausible for a meta page */
+	switch (meta->type) {
+	case P_BTREEMETA:
+		dbtype = DB_BTREE;
+		break;
+	case P_HASHMETA:
+		dbtype = DB_HASH;
+		break;
+	case P_QAMMETA:
+		dbtype = DB_QUEUE;
+		break;
+	default:
+		/* Callers should only hand us recognized meta pages. */
+		ret = __db_unknown_path(env, "__db_vrfy_meta");
+		goto err;
+	}
+
+	/* magic number valid */
+	if (!__db_is_valid_magicno(meta->magic, &magtype)) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: invalid magic number", (u_long)pgno));
+	}
+	if (magtype != dbtype) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: magic number does not match database type",
+		    (u_long)pgno));
+	}
+
+	/* version */
+	if ((dbtype == DB_BTREE &&
+	    (meta->version > DB_BTREEVERSION ||
+	    meta->version < DB_BTREEOLDVER)) ||
+	    (dbtype == DB_HASH &&
+	    (meta->version > DB_HASHVERSION ||
+	    meta->version < DB_HASHOLDVER)) ||
+	    (dbtype == DB_QUEUE &&
+	    (meta->version > DB_QAMVERSION ||
+	    meta->version < DB_QAMOLDVER))) {
+		isbad = 1;
+		EPRINT((env,
+    "Page %lu: unsupported database version %lu; extraneous errors may result",
+		    (u_long)pgno, (u_long)meta->version));
+	}
+
+	/* pagesize */
+	if (meta->pagesize != dbp->pgsize) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: invalid pagesize %lu",
+		    (u_long)pgno, (u_long)meta->pagesize));
+	}
+
+	/* Flags */
+	if (meta->metaflags != 0) {
+		/* Only checksum and partitioning flags are legal. */
+		if (FLD_ISSET(meta->metaflags,
+		    ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: bad meta-data flags value %#lx",
+			    (u_long)PGNO_BASE_MD, (u_long)meta->metaflags));
+		}
+		if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+			F_SET(pip, VRFY_HAS_CHKSUM);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+			F_SET(pip, VRFY_HAS_PART_RANGE);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+			F_SET(pip, VRFY_HAS_PART_CALLBACK);
+	}
+
+	/*
+	 * Free list.
+	 *
+	 * If this is not the main, master-database meta page, it
+	 * should not have a free list.
+	 */
+	if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: nonempty free list on subdatabase metadata page",
+		    (u_long)pgno));
+	}
+
+	/* Can correctly be PGNO_INVALID--that's just the end of the list. */
+	if (meta->free != PGNO_INVALID && IS_VALID_PGNO(meta->free))
+		pip->free = meta->free;
+	else if (!IS_VALID_PGNO(meta->free)) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: nonsensical free list pgno %lu",
+		    (u_long)pgno, (u_long)meta->free));
+	}
+
+	/*
+	 * Check that the meta page agrees with what we got from mpool.
+	 * If we don't have FTRUNCATE then mpool could include some
+	 * zeroed pages at the end of the file, we assume the meta page
+	 * is correct.
+	 */
+	if (pgno == PGNO_BASE_MD && meta->last_pgno != vdp->last_pgno) {
+#ifdef HAVE_FTRUNCATE
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: last_pgno is not correct: %lu != %lu",
+		    (u_long)pgno,
+		    (u_long)meta->last_pgno, (u_long)vdp->last_pgno));
+#endif
+		vdp->meta_last_pgno = meta->last_pgno;
+	}
+
+	/*
+	 * We have now verified the common fields of the metadata page.
+	 * Clear the flag that told us they had been incompletely checked.
+	 */
+	F_CLR(pip, VRFY_INCOMPLETE);
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_freelist --
+ * Walk free list, checking off pages and verifying absence of
+ * loops.
+ */
+static int
+__db_vrfy_freelist(dbp, vdp, meta, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t meta;			/* Meta page whose free list we walk. */
+	u_int32_t flags;
+{
+	DB *pgset;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t cur_pgno, next_pgno;
+	int p, ret, t_ret;
+
+	env = dbp->env;
+	pgset = vdp->pgset;
+	DB_ASSERT(env, pgset != NULL);
+
+	/*
+	 * Loop invariant: pip holds the info of the page we're standing on;
+	 * each iteration releases it and fetches the next page's info.
+	 */
+	if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0)
+		return (ret);
+	for (next_pgno = pip->free;
+	    next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) {
+		cur_pgno = pip->pgno;
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			return (ret);
+
+		/* This shouldn't happen, but just in case. */
+		if (!IS_VALID_PGNO(next_pgno)) {
+			EPRINT((env,
+			    "Page %lu: invalid next_pgno %lu on free list page",
+			    (u_long)cur_pgno, (u_long)next_pgno));
+			return (DB_VERIFY_BAD);
+		}
+
+		/* Detect cycles. */
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, next_pgno, &p)) != 0)
+			return (ret);
+		if (p != 0) {
+			EPRINT((env,
+		    "Page %lu: page %lu encountered a second time on free list",
+			    (u_long)cur_pgno, (u_long)next_pgno));
+			return (DB_VERIFY_BAD);
+		}
+		if ((ret = __db_vrfy_pgset_inc(pgset,
+		    vdp->thread_info, next_pgno)) != 0)
+			return (ret);
+
+		if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
+			return (ret);
+
+		/* Free pages must all be P_INVALID. */
+		if (pip->type != P_INVALID) {
+			EPRINT((env,
+			    "Page %lu: non-invalid page %lu on free list",
+			    (u_long)cur_pgno, (u_long)next_pgno));
+			ret = DB_VERIFY_BAD;	/* unsafe to continue */
+			break;
+		}
+	}
+
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_vrfy_subdbs --
+ * Walk the known-safe master database of subdbs with a cursor,
+ * verifying the structure of each subdatabase we encounter.
+ */
+static int
+__db_vrfy_subdbs(dbp, vdp, dbname, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *dbname;		/* File name of the master database. */
+	u_int32_t flags;
+{
+	DB *mdbp;
+	DBC *dbc;
+	DBT key, data;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t meta_pgno;
+	int ret, t_ret, isbad;
+	u_int8_t type;
+
+	isbad = 0;
+	dbc = NULL;
+	env = dbp->env;
+
+	/* Open the master database read-only and walk its entries. */
+	if ((ret = __db_master_open(dbp,
+	    vdp->thread_info, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0)
+		return (ret);
+
+	if ((ret = __db_cursor_int(mdbp, NULL,
+	    NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+		goto err;
+
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	/* Each data item is the subdatabase's meta page number. */
+	while ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) == 0) {
+		if (data.size != sizeof(db_pgno_t)) {
+			EPRINT((env,
+			    "Subdatabase entry not page-number size"));
+			isbad = 1;
+			goto err;
+		}
+		memcpy(&meta_pgno, data.data, data.size);
+		/*
+		 * Subdatabase meta pgnos are stored in network byte
+		 * order for cross-endian compatibility.  Swap if appropriate.
+		 */
+		DB_NTOHL_SWAP(env, &meta_pgno);
+		if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) {
+			EPRINT((env,
+			    "Subdatabase entry references invalid page %lu",
+			    (u_long)meta_pgno));
+			isbad = 1;
+			goto err;
+		}
+		if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+			goto err;
+		type = pip->type;
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			goto err;
+		/* Verify the subdatabase rooted at meta_pgno by type. */
+		switch (type) {
+		case P_BTREEMETA:
+			if ((ret = __bam_vrfy_structure(
+			    dbp, vdp, meta_pgno, NULL, NULL, flags)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto err;
+			}
+			break;
+		case P_HASHMETA:
+			if ((ret = __ham_vrfy_structure(
+			    dbp, vdp, meta_pgno, flags)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto err;
+			}
+			break;
+		case P_QAMMETA:
+		default:
+			/* Queue databases cannot appear as subdatabases. */
+			EPRINT((env,
+		    "Subdatabase entry references page %lu of invalid type %lu",
+			    (u_long)meta_pgno, (u_long)type));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+	}
+
+	/* DB_NOTFOUND just means we walked off the end of the master db. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+err:	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_struct_feedback --
+ *	Report progress through the user's feedback callback during the
+ *	top-down database structure traversal.
+ *	(See comment at the beginning of __db_vrfy_structure.)
+ *
+ * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+ */
+void
+__db_vrfy_struct_feedback(dbp, vdp)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+{
+	int pct;
+
+	/* Without a registered callback there is nothing to do. */
+	if (dbp->db_feedback == NULL)
+		return;
+
+	if (vdp->pgs_remaining > 0)
+		--vdp->pgs_remaining;
+
+	/*
+	 * The factor of 50 scales this pass over roughly the top half of
+	 * the progress range.  Never report 100 until we're really done.
+	 */
+	pct = 100 - (int)(vdp->pgs_remaining * 50 / (vdp->last_pgno + 1));
+	if (pct == 100)
+		pct = 99;
+	dbp->db_feedback(dbp, DB_VERIFY, pct);
+}
+
+/*
+ * __db_vrfy_orderchkonly --
+ *	Do an sort-order/hashing check on a known-otherwise-good subdb.
+ *
+ *	name is the master database file, subdb the subdatabase entry to
+ *	check.  Returns 0, DB_VERIFY_BAD, ENOENT (subdb missing), or a
+ *	system error.
+ */
+static int
+__db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	BTMETA *btmeta;
+	DB *mdbp, *pgset;
+	DBC *pgsc;
+	DBT key, data;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	HASH *h_internal;
+	HMETA *hmeta;
+	PAGE *h, *currpg;
+	db_pgno_t meta_pgno, p, pgno;
+	u_int32_t bucket;
+	int t_ret, ret;
+
+	mdbp = NULL;
+	pgset = NULL;
+	pgsc = NULL;
+	env = dbp->env;
+	mpf = dbp->mpf;
+	currpg = h = NULL;
+
+	/* The whole point of this routine is the order check. */
+	LF_CLR(DB_NOORDERCHK);
+
+	/* Open the master database and get the meta_pgno for the subdb. */
+	if ((ret = __db_master_open(dbp,
+	    vdp->thread_info, NULL, name, DB_RDONLY, 0, &mdbp)) != 0)
+		goto err;
+
+	DB_INIT_DBT(key, subdb, strlen(subdb));
+	memset(&data, 0, sizeof(data));
+	if ((ret = __db_get(mdbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) != 0) {
+		if (ret == DB_NOTFOUND)
+			ret = ENOENT;
+		goto err;
+	}
+
+	if (data.size != sizeof(db_pgno_t)) {
+		EPRINT((env, "Subdatabase entry of invalid size"));
+		ret = DB_VERIFY_BAD;
+		goto err;
+	}
+
+	memcpy(&meta_pgno, data.data, data.size);
+
+	/*
+	 * Subdatabase meta pgnos are stored in network byte
+	 * order for cross-endian compatibility.  Swap if appropriate.
+	 */
+	DB_NTOHL_SWAP(env, &meta_pgno);
+
+	if ((ret = __memp_fget(mpf,
+	    &meta_pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+		goto err;
+
+	if ((ret = __db_vrfy_pgset(env,
+	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+		goto err;
+
+	switch (TYPE(h)) {
+	case P_BTREEMETA:
+		btmeta = (BTMETA *)h;
+		if (F_ISSET(&btmeta->dbmeta, BTM_RECNO)) {
+			/* Recnos have no order to check. */
+			ret = 0;
+			goto err;
+		}
+		if ((ret =
+		    __db_meta2pgset(dbp, vdp, meta_pgno, flags, pgset)) != 0)
+			goto err;
+		if ((ret = __db_cursor_int(pgset, NULL, NULL, dbp->type,
+		    PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+			goto err;
+		/* Check item order on every page belonging to this subdb. */
+		while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+			if ((ret = __memp_fget(mpf, &p,
+			    vdp->thread_info, NULL, 0, &currpg)) != 0)
+				goto err;
+			if ((ret = __bam_vrfy_itemorder(dbp, NULL,
+			    vdp->thread_info, currpg, p, NUM_ENT(currpg), 1,
+			    F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0)
+				goto err;
+			if ((ret = __memp_fput(mpf,
+			    vdp->thread_info, currpg, dbp->priority)) != 0)
+				goto err;
+			currpg = NULL;
+		}
+
+		/*
+		 * The normal exit condition for the loop above is DB_NOTFOUND.
+		 * If we see that, zero it and continue on to cleanup.
+		 * Otherwise, it's a real error and will be returned.
+		 */
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+		break;
+	case P_HASHMETA:
+		hmeta = (HMETA *)h;
+		h_internal = (HASH *)dbp->h_internal;
+		/*
+		 * Make sure h_charkey is right.
+		 */
+		if (h_internal == NULL) {
+			EPRINT((env,
+			    "Page %lu: DB->h_internal field is NULL",
+			    (u_long)meta_pgno));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		if (h_internal->h_hash == NULL)
+			h_internal->h_hash = hmeta->dbmeta.version < 5
+			    ? __ham_func4 : __ham_func5;
+		if (hmeta->h_charkey !=
+		    h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) {
+			EPRINT((env,
+			    "Page %lu: incorrect hash function for database",
+			    (u_long)meta_pgno));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+
+		/*
+		 * Foreach bucket, verify hashing on each page in the
+		 * corresponding chain of pages.
+		 */
+		if ((ret = __db_cursor_int(dbp, NULL, NULL, dbp->type,
+		    PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+			goto err;
+		for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
+			pgno = BS_TO_PAGE(bucket, hmeta->spares);
+			while (pgno != PGNO_INVALID) {
+				if ((ret = __memp_fget(mpf, &pgno,
+				    vdp->thread_info, NULL, 0, &currpg)) != 0)
+					goto err;
+				if ((ret = __ham_vrfy_hashing(pgsc,
+				    NUM_ENT(currpg), hmeta, bucket, pgno,
+				    flags, h_internal->h_hash)) != 0)
+					goto err;
+				/* Read the chain link before releasing. */
+				pgno = NEXT_PGNO(currpg);
+				if ((ret = __memp_fput(mpf, vdp->thread_info,
+				    currpg, dbp->priority)) != 0)
+					goto err;
+				currpg = NULL;
+			}
+		}
+		break;
+	default:
+		EPRINT((env, "Page %lu: database metapage of bad type %lu",
+		    (u_long)meta_pgno, (u_long)TYPE(h)));
+		ret = DB_VERIFY_BAD;
+		break;
+	}
+
+	/*
+	 * Cleanup.  Every branch preserves the first error seen (the
+	 * original code let later cleanup failures clobber ret and closed
+	 * mdbp even when __db_master_open had failed).
+	 */
+err:	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pgset != NULL &&
+	    (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (currpg != NULL &&
+	    (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, currpg, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (mdbp != NULL &&
+	    (t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_pg --
+ *	Walk through a page, salvaging all likely or plausible (w/
+ * DB_AGGRESSIVE) key/data pairs and marking seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_pg(dbp, vdp, pgno, h, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	PAGE *h;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int keyflag, ret, t_ret;
+
+	env = dbp->env;
+	DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+	/*
+	 * !!!
+	 * We dump record numbers when salvaging Queue databases, but not for
+	 * immutable Recno databases.  The problem is we can't figure out the
+	 * record number from the database page in the Recno case, while the
+	 * offset in the file is sufficient for Queue.
+	 */
+	keyflag = 0;
+
+	/* If we got this page in the subdb pass, we can safely skip it. */
+	if (__db_salvage_isdone(vdp, pgno))
+		return (0);
+
+	/*
+	 * Dispatch on page type: metadata pages are verified here and fall
+	 * through to print a dump header below; leaf-like pages are dumped
+	 * immediately via __db_salvage_leaf; internal/dup/overflow pages are
+	 * deferred by marking them needed for a later pass.
+	 */
+	switch (TYPE(h)) {
+	case P_BTREEMETA:
+		ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)h, pgno, flags);
+		break;
+	case P_HASH:
+	case P_HASH_UNSORTED:
+	case P_LBTREE:
+	case P_QAMDATA:
+		return (__db_salvage_leaf(dbp,
+		    vdp, pgno, h, handle, callback, flags));
+	case P_HASHMETA:
+		ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)h, pgno, flags);
+		break;
+	case P_IBTREE:
+		/*
+		 * We need to mark any overflow keys on internal pages as seen,
+		 * so we don't print them out in __db_salvage_unknowns.  But if
+		 * we're an upgraded database, a P_LBTREE page may very well
+		 * have a reference to the same overflow pages (this practice
+		 * stopped somewhere around db4.5).  To give P_LBTREEs a chance
+		 * to print out any keys on shared pages, mark the page now and
+		 * deal with it at the end.
+		 */
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_IBTREE));
+	case P_LDUP:
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LDUP));
+	case P_LRECNO:
+		/*
+		 * Recno leaves are tough, because the leaf could be (1) a dup
+		 * page, or it could be (2) a regular database leaf page.
+		 * Fortunately, RECNO databases are not allowed to have
+		 * duplicates.
+		 *
+		 * If there are no subdatabases, dump the page immediately if
+		 * it's a leaf in a RECNO database, otherwise wait and hopefully
+		 * it will be dumped by the leaf page that refers to it,
+		 * otherwise we'll get it with the unknowns.
+		 *
+		 * If there are subdatabases, there might be mixed types and
+		 * dbp->type can't be trusted.  We'll only get here after
+		 * salvaging each database, though, so salvaging this page
+		 * immediately isn't important.  If this page is a dup, it might
+		 * get salvaged later on, otherwise the unknowns pass will pick
+		 * it up.  Note that SALVAGE_HASSUBDBS won't get set if we're
+		 * salvaging aggressively.
+		 *
+		 * If we're salvaging aggressively, we don't know whether or not
+		 * there's subdatabases, so we wait on all recno pages.
+		 */
+		if (!LF_ISSET(DB_AGGRESSIVE) &&
+		    !F_ISSET(vdp, SALVAGE_HASSUBDBS) && dbp->type == DB_RECNO)
+			return (__db_salvage_leaf(dbp,
+			    vdp, pgno, h, handle, callback, flags));
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LRECNODUP));
+	case P_OVERFLOW:
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_OVERFLOW));
+	case P_QAMMETA:
+		/* Queue dumps record numbers as keys (see !!! note above). */
+		keyflag = 1;
+		ret = __qam_vrfy_meta(dbp, vdp, (QMETA *)h, pgno, flags);
+		break;
+	case P_INVALID:
+	case P_IRECNO:
+	case __P_DUPLICATE:
+	default:
+		/*
+		 * There's no need to display an error, the page type was
+		 * already checked and reported on.
+		 */
+		return (0);
+	}
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * We have to display the dump header if it's a metadata page.  It's
+	 * our last chance as the page was marked "seen" in the vrfy routine,
+	 * and we won't see the page again.  We don't display headers for
+	 * the first database in a multi-database file, that database simply
+	 * contains a list of subdatabases.
+	 */
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+	if (!F_ISSET(pip, VRFY_HAS_SUBDBS) && !LF_ISSET(DB_VERIFY_PARTITION))
+		ret = __db_prheader(
+		    dbp, NULL, 0, keyflag, handle, callback, vdp, pgno);
+	/* Release the pageinfo; keep the first error seen. */
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_leaf --
+ *	Dump the likely key/data pairs from a single leaf-level page via
+ *	the appropriate access method's salvager, marking seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_leaf(dbp, vdp, pgno, h, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	PAGE *h;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	ENV *env;
+	u_int32_t pgtype;
+
+	env = dbp->env;
+	DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+	/* Pages already handled in the subdb pass need no further work. */
+	if (__db_salvage_isdone(vdp, pgno))
+		return (0);
+
+	pgtype = TYPE(h);
+	if (pgtype == P_HASH_UNSORTED || pgtype == P_HASH)
+		return (__ham_salvage(dbp, vdp,
+		    pgno, h, handle, callback, flags));
+	if (pgtype == P_LBTREE || pgtype == P_LRECNO)
+		return (__bam_salvage(dbp, vdp,
+		    pgno, pgtype, h, handle, callback, NULL, flags));
+	if (pgtype == P_QAMDATA)
+		return (__qam_salvage(dbp, vdp,
+		    pgno, h, handle, callback, flags));
+
+	/*
+	 * Any other type needs no error display here; the page type was
+	 * already checked and reported on.
+	 */
+	return (0);
+}
+
+/*
+ * __db_salvage_unknowns --
+ *	Walk through the salvager database, printing with key "UNKNOWN"
+ *	any pages we haven't dealt with.
+ */
+static int
+__db_salvage_unknowns(dbp, vdp, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT unkdbt, key, *dbt;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t pgtype, ovfl_bufsz, tmp_flags;
+	int ret, t_ret;
+	void *ovflbuf;
+
+	dbc = NULL;
+	env = dbp->env;
+	mpf = dbp->mpf;
+
+	/* Placeholder key for items whose real key cannot be recovered. */
+	DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
+
+	if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+		return (ret);
+	ovfl_bufsz = dbp->pgsize;
+
+	/*
+	 * We make two passes -- in the first pass, skip SALVAGE_OVERFLOW
+	 * pages, because they may be referenced by the standard database
+	 * pages that we're resolving.
+	 *
+	 * Throughout both passes, ret preserves the first error seen and
+	 * t_ret carries the status of the current call; per-page failures
+	 * do not stop the walk.
+	 */
+	while ((t_ret =
+	    __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 1)) == 0) {
+		if ((t_ret = __memp_fget(mpf,
+		    &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			continue;
+		}
+
+		dbt = NULL;
+		tmp_flags = 0;
+		switch (pgtype) {
+		case SALVAGE_LDUP:
+		case SALVAGE_LRECNODUP:
+			/* Dup pages have no recoverable key; use "UNKNOWN". */
+			dbt = &unkdbt;
+			tmp_flags = DB_SA_UNKNOWNKEY;
+			/* FALLTHROUGH */
+		case SALVAGE_IBTREE:
+		case SALVAGE_LBTREE:
+		case SALVAGE_LRECNO:
+			if ((t_ret = __bam_salvage(
+			    dbp, vdp, pgno, pgtype, h, handle,
+			    callback, dbt, tmp_flags | flags)) != 0 && ret == 0)
+				ret = t_ret;
+			break;
+		case SALVAGE_OVERFLOW:
+			DB_ASSERT(env, 0);	/* Shouldn't ever happen. */
+			break;
+		case SALVAGE_HASH:
+			if ((t_ret = __ham_salvage(dbp, vdp,
+			    pgno, h, handle, callback, flags)) != 0 && ret == 0)
+				ret = t_ret;
+			break;
+		case SALVAGE_INVALID:
+		case SALVAGE_IGNORE:
+		default:
+			/*
+			 * Shouldn't happen, but if it does, just do what the
+			 * nice man says.
+			 */
+			DB_ASSERT(env, 0);
+			break;
+		}
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	/* We should have reached the end of the database. */
+	if (t_ret == DB_NOTFOUND)
+		t_ret = 0;
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Re-open the cursor so we traverse the database again. */
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+	dbc = NULL;
+
+	/* Second pass: deal with any remaining overflow pages. */
+	while ((t_ret =
+	    __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 0)) == 0) {
+		if ((t_ret = __memp_fget(mpf,
+		    &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			continue;
+		}
+
+		switch (pgtype) {
+		case SALVAGE_OVERFLOW:
+			/*
+			 * XXX:
+			 * This may generate multiple "UNKNOWN" keys in
+			 * a database with no dups.  What to do?
+			 */
+			if ((t_ret = __db_safe_goff(dbp, vdp,
+			    pgno, &key, &ovflbuf, &ovfl_bufsz, flags)) != 0 ||
+			    ((vdp->type == DB_BTREE || vdp->type == DB_HASH) &&
+			    (t_ret = __db_vrfy_prdbt(&unkdbt,
+			    0, " ", handle, callback, 0, vdp)) != 0) ||
+			    (t_ret = __db_vrfy_prdbt(
+			    &key, 0, " ", handle, callback, 0, vdp)) != 0)
+				if (ret == 0)
+					ret = t_ret;
+			break;
+		default:
+			DB_ASSERT(env, 0);	/* Shouldn't ever happen. */
+			break;
+		}
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	/* We should have reached the end of the database. */
+	if (t_ret == DB_NOTFOUND)
+		t_ret = 0;
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	__os_free(env, ovflbuf);
+
+	return (ret);
+}
+
+/*
+ * Offset of the ith inp array entry, which we can compare to the offset
+ * the entry stores.  Used by __db_vrfy_inpitem to detect item offsets
+ * that point back into the inp array itself.
+ */
+#define	INP_OFFSET(dbp, h, i) \
+    ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h)))
+
+/*
+ * __db_vrfy_inpitem --
+ *	Verify that a single entry in the inp array is sane, and update
+ *	the high water mark and current item offset.  (The former of these is
+ *	used for state information between calls, and is required; it must
+ *	be initialized to the pagesize before the first call.)
+ *
+ *	Returns DB_VERIFY_FATAL if inp has collided with the data,
+ *	since verification can't continue from there; returns DB_VERIFY_BAD
+ *	if anything else is wrong.
+ *
+ * PUBLIC: int __db_vrfy_inpitem __P((DB *, PAGE *,
+ * PUBLIC:     db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
+ */
+int
+__db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
+	DB *dbp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t i;
+	int is_btree;
+	u_int32_t flags, *himarkp, *offsetp;
+{
+	BKEYDATA *bk;
+	ENV *env;
+	db_indx_t *inp, offset, len;
+
+	env = dbp->env;
+
+	DB_ASSERT(env, himarkp != NULL);
+	inp = P_INP(dbp, h);
+
+	/*
+	 * Check that the inp array, which grows from the beginning of the
+	 * page forward, has not collided with the data, which grow from the
+	 * end of the page backward.
+	 */
+	if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
+		/* We've collided with the data.  We need to bail. */
+		EPRINT((env, "Page %lu: entries listing %lu overlaps data",
+		    (u_long)pgno, (u_long)i));
+		return (DB_VERIFY_FATAL);
+	}
+
+	offset = inp[i];
+
+	/*
+	 * Check that the item offset is reasonable: it points somewhere
+	 * after the inp array and before the end of the page.
+	 */
+	if (offset <= INP_OFFSET(dbp, h, i) || offset >= dbp->pgsize) {
+		EPRINT((env, "Page %lu: bad offset %lu at page index %lu",
+		    (u_long)pgno, (u_long)offset, (u_long)i));
+		return (DB_VERIFY_BAD);
+	}
+
+	/* Update the high-water mark (what HOFFSET should be) */
+	if (offset < *himarkp)
+		*himarkp = offset;
+
+	/*
+	 * Only btree-format items carry a BKEYDATA header we can use to
+	 * bounds-check the item length; other formats skip these checks.
+	 */
+	if (is_btree) {
+		/*
+		 * Check alignment; if it's unaligned, it's unsafe to
+		 * manipulate this item.
+		 */
+		if (offset != DB_ALIGN(offset, sizeof(u_int32_t))) {
+			EPRINT((env,
+			    "Page %lu: unaligned offset %lu at page index %lu",
+			    (u_long)pgno, (u_long)offset, (u_long)i));
+			return (DB_VERIFY_BAD);
+		}
+
+		/*
+		 * Check that the item length remains on-page.
+		 */
+		bk = GET_BKEYDATA(dbp, h, i);
+
+		/*
+		 * We need to verify the type of the item here;
+		 * we can't simply assume that it will be one of the
+		 * expected three.  If it's not a recognizable type,
+		 * it can't be considered to have a verifiable
+		 * length, so it's not possible to certify it as safe.
+		 */
+		switch (B_TYPE(bk->type)) {
+		case B_KEYDATA:
+			len = bk->len;
+			break;
+		case B_DUPLICATE:
+		case B_OVERFLOW:
+			/* Both are fixed-size on-page structures. */
+			len = BOVERFLOW_SIZE;
+			break;
+		default:
+			EPRINT((env,
+			    "Page %lu: item %lu of unrecognizable type",
+			    (u_long)pgno, (u_long)i));
+			return (DB_VERIFY_BAD);
+		}
+
+		if ((size_t)(offset + len) > dbp->pgsize) {
+			EPRINT((env,
+			    "Page %lu: item %lu extends past page boundary",
+			    (u_long)pgno, (u_long)i));
+			return (DB_VERIFY_BAD);
+		}
+	}
+
+	if (offsetp != NULL)
+		*offsetp = offset;
+	return (0);
+}
+
+/*
+ * __db_vrfy_duptype--
+ *	Given a page number and a set of flags to __bam_vrfy_subtree,
+ *	verify that the dup tree type is correct--i.e., it's a recno
+ *	if DUPSORT is not set and a btree if it is.
+ *
+ * PUBLIC: int __db_vrfy_duptype
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_duptype(dbp, vdp, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int bad, ret;
+
+	env = dbp->env;
+	bad = 0;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	if (pip->type == P_IBTREE || pip->type == P_LDUP) {
+		/* Btree-format dup pages require DB_ST_DUPSORT. */
+		if (!LF_ISSET(DB_ST_DUPSORT)) {
+			EPRINT((env,
+	    "Page %lu: sorted duplicate set in unsorted-dup database",
+			    (u_long)pgno));
+			bad = 1;
+		}
+	} else if (pip->type == P_IRECNO || pip->type == P_LRECNO) {
+		/* Recno-format dup pages require DB_ST_DUPSORT be clear. */
+		if (LF_ISSET(DB_ST_DUPSORT)) {
+			EPRINT((env,
+	    "Page %lu: unsorted duplicate set in sorted-dup database",
+			    (u_long)pgno));
+			bad = 1;
+		}
+	} else {
+		/*
+		 * If the page is entirely zeroed, its pip->type will be a lie
+		 * (we assumed it was a hash page, as they're allowed to be
+		 * zeroed); handle this case specially.
+		 */
+		if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+			ZEROPG_ERR_PRINT(env, pgno, "duplicate page");
+		else
+			EPRINT((env,
+	    "Page %lu: duplicate page of inappropriate type %lu",
+			    (u_long)pgno, (u_long)pip->type));
+		bad = 1;
+	}
+
+	if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+		return (ret);
+	return (bad ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __db_salvage_duptree --
+ *	Attempt to salvage a given duplicate tree, given its alleged root.
+ *
+ *	The key that corresponds to this dup set has been passed to us
+ *	in DBT *key.  Because data items follow keys, though, it has been
+ *	printed once already.
+ *
+ *	The basic idea here is that pgno ought to be a P_LDUP, a P_LRECNO, a
+ *	P_IBTREE, or a P_IRECNO.  If it's an internal page, use the verifier
+ *	functions to make sure it's safe; if it's not, we simply bail and the
+ *	data will have to be printed with no key later on.  if it is safe,
+ *	recurse on each of its children.
+ *
+ *	Whether or not it's safe, if it's a leaf page, __bam_salvage it.
+ *
+ *	At all times, use the DB hanging off vdp to mark and check what we've
+ *	done, so each page gets printed exactly once and we don't get caught
+ *	in any cycles.
+ *
+ * PUBLIC: int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	DBT *key;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret, t_ret;
+
+	mpf = dbp->mpf;
+
+	/* Reject obviously bogus page numbers before touching the mpool. */
+	if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno))
+		return (DB_VERIFY_BAD);
+
+	/* We have a plausible page.  Try it. */
+	if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+		return (ret);
+
+	switch (TYPE(h)) {
+	case P_IBTREE:
+	case P_IRECNO:
+		if ((ret = __db_vrfy_common(dbp, vdp, h, pgno, flags)) != 0)
+			goto err;
+		if ((ret = __bam_vrfy(dbp,
+		    vdp, h, pgno, flags | DB_NOORDERCHK)) != 0 ||
+		    (ret = __db_salvage_markdone(vdp, pgno)) != 0)
+			goto err;
+		/*
+		 * We have a known-healthy internal page.  Walk it.
+		 */
+		if ((ret = __bam_salvage_walkdupint(dbp, vdp, h, key,
+		    handle, callback, flags)) != 0)
+			goto err;
+		break;
+	case P_LRECNO:
+	case P_LDUP:
+		if ((ret = __bam_salvage(dbp,
+		    vdp, pgno, TYPE(h), h, handle, callback, key, flags)) != 0)
+			goto err;
+		break;
+	default:
+		ret = DB_VERIFY_BAD;
+		goto err;
+	}
+
+	/* Release the page on every path; preserve the first error. */
+err:	if ((t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_all --
+ *	Salvage only the leaves we find by walking the tree.  If we have
+ *	subdbs, salvage each of them individually.  *hassubsp is set to 1
+ *	iff the base metadata page says the file contains subdatabases.
+ */
+static int
+__db_salvage_all(dbp, vdp, handle, callback, flags, hassubsp)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+	int *hassubsp;
+{
+	DB *pgset;
+	DBC *pgsc;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t p, meta_pgno;
+	int ret, t_ret;
+
+	*hassubsp = 0;
+
+	env = dbp->env;
+	pgset = NULL;
+	pgsc = NULL;
+	mpf = dbp->mpf;
+	h = NULL;
+	pip = NULL;
+	ret = 0;
+
+	/*
+	 * Check to make sure the page is OK and find out if it contains
+	 * subdatabases.  Failures in this chain are deliberately not
+	 * propagated into ret: salvage is best-effort, so we press on even
+	 * if the metadata page is unreadable.
+	 */
+	meta_pgno = PGNO_BASE_MD;
+	if ((t_ret = __memp_fget(mpf,
+	    &meta_pgno, vdp->thread_info, NULL, 0, &h)) == 0 &&
+	    (t_ret = __db_vrfy_common(dbp, vdp, h, PGNO_BASE_MD, flags)) == 0 &&
+	    (t_ret = __db_salvage_pg(
+	    dbp, vdp, PGNO_BASE_MD, h, handle, callback, flags)) == 0 &&
+	    (t_ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) == 0)
+		if (F_ISSET(pip, VRFY_HAS_SUBDBS))
+			*hassubsp = 1;
+	if (pip != NULL &&
+	    (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	if (h != NULL) {
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		h = NULL;
+	}
+	if (ret != 0)
+		return (ret);
+
+	/* Without subdatabases, we can just dump from the meta pgno. */
+	if (*hassubsp == 0)
+		return (__db_salvage(dbp,
+		    vdp, PGNO_BASE_MD, handle, callback, flags));
+
+	/*
+	 * We have subdbs.  Try to crack them.
+	 *
+	 * To do so, get a set of leaf pages in the master database, and then
+	 * walk each of the valid ones, salvaging subdbs as we go.  If any
+	 * prove invalid, just drop them; we'll pick them up on a later pass.
+	 */
+	if ((ret = __db_vrfy_pgset(env,
+	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+		goto err;
+	if ((ret = __db_meta2pgset(dbp, vdp, PGNO_BASE_MD, flags, pgset)) != 0)
+		goto err;
+	if ((ret = __db_cursor(pgset, vdp->thread_info, NULL, &pgsc, 0)) != 0)
+		goto err;
+	while ((t_ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+		if ((t_ret = __memp_fget(mpf,
+		    &p, vdp->thread_info, NULL, 0, &h)) == 0 &&
+		    (t_ret = __db_vrfy_common(dbp, vdp, h, p, flags)) == 0 &&
+		    (t_ret =
+		    __bam_vrfy(dbp, vdp, h, p, flags | DB_NOORDERCHK)) == 0)
+			t_ret = __db_salvage_subdbpg(
+			    dbp, vdp, h, handle, callback, flags);
+		if (t_ret != 0 && ret == 0)
+			ret = t_ret;
+		if (h != NULL) {
+			if ((t_ret = __memp_fput(mpf, vdp->thread_info,
+			    h, dbp->priority)) != 0 && ret == 0)
+				ret = t_ret;
+			h = NULL;
+		}
+	}
+
+	/* DB_NOTFOUND is the expected end-of-cursor return. */
+	if (t_ret != DB_NOTFOUND && ret == 0)
+		ret = t_ret;
+
+err:	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pgset != NULL &&
+	    (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (h != NULL &&
+	    (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_subdbpg --
+ *	Given a known-good leaf page in the master database, salvage all
+ *	leaf pages corresponding to each subdb.
+ */
+static int
+__db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *master;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	BKEYDATA *bkkey, *bkdata;
+	BOVERFLOW *bo;
+	DBT key;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *subpg;
+	db_indx_t i;
+	db_pgno_t meta_pgno;
+	int ret, err_ret, t_ret;
+	char *subdbname;
+	u_int32_t ovfl_bufsz;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	ret = err_ret = 0;
+	subdbname = NULL;
+	ovfl_bufsz = 0;
+
+	/*
+	 * For each key/data pair on the master page, get and salvage the
+	 * set of pages corresponding to that subdatabase entry.  err_ret
+	 * remembers the first per-entry failure while we keep going.
+	 */
+	for (i = 0; i < NUM_ENT(master); i += P_INDX) {
+		bkkey = GET_BKEYDATA(dbp, master, i);
+		bkdata = GET_BKEYDATA(dbp, master, i + O_INDX);
+
+		/* Get the subdatabase name. */
+		if (B_TYPE(bkkey->type) == B_OVERFLOW) {
+			/*
+			 * We can, in principle anyway, have a subdb
+			 * name so long it overflows.  Ick.
+			 */
+			bo = (BOVERFLOW *)bkkey;
+			if ((ret = __db_safe_goff(dbp, vdp, bo->pgno,
+			    &key, &subdbname, &ovfl_bufsz, flags)) != 0) {
+				err_ret = DB_VERIFY_BAD;
+				continue;
+			}
+
+			/* Nul-terminate it. */
+			if (ovfl_bufsz < key.size + 1) {
+				if ((ret = __os_realloc(env,
+				    key.size + 1, &subdbname)) != 0)
+					goto err;
+				ovfl_bufsz = key.size + 1;
+			}
+			subdbname[key.size] = '\0';
+		} else if (B_TYPE(bkkey->type) == B_KEYDATA) {
+			if (ovfl_bufsz < (u_int32_t)bkkey->len + 1) {
+				if ((ret = __os_realloc(env,
+				    bkkey->len + 1, &subdbname)) != 0)
+					goto err;
+				ovfl_bufsz = bkkey->len + 1;
+			}
+			DB_ASSERT(env, subdbname != NULL);
+			memcpy(subdbname, bkkey->data, bkkey->len);
+			subdbname[bkkey->len] = '\0';
+		}
+
+		/* Get the corresponding pgno. */
+		if (bkdata->len != sizeof(db_pgno_t)) {
+			err_ret = DB_VERIFY_BAD;
+			continue;
+		}
+		memcpy(&meta_pgno,
+		    (db_pgno_t *)bkdata->data, sizeof(db_pgno_t));
+
+		/*
+		 * Subdatabase meta pgnos are stored in network byte
+		 * order for cross-endian compatibility.  Swap if appropriate.
+		 */
+		DB_NTOHL_SWAP(env, &meta_pgno);
+
+		/* If we can't get the subdb meta page, just skip the subdb. */
+		if (!IS_VALID_PGNO(meta_pgno) || (ret = __memp_fget(mpf,
+		    &meta_pgno, vdp->thread_info, NULL, 0, &subpg)) != 0) {
+			err_ret = ret;
+			continue;
+		}
+
+		/*
+		 * Verify the subdatabase meta page.  This has two functions.
+		 * First, if it's bad, we have no choice but to skip the subdb
+		 * and let the pages just get printed on a later pass.  Second,
+		 * the access-method-specific meta verification routines record
+		 * the various state info (such as the presence of dups)
+		 * that we need for __db_prheader().
+		 */
+		if ((ret =
+		    __db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) {
+			err_ret = ret;
+			(void)__memp_fput(mpf,
+			    vdp->thread_info, subpg, dbp->priority);
+			continue;
+		}
+		switch (TYPE(subpg)) {
+		case P_BTREEMETA:
+			if ((ret = __bam_vrfy_meta(dbp,
+			    vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) {
+				err_ret = ret;
+				(void)__memp_fput(mpf,
+				    vdp->thread_info, subpg, dbp->priority);
+				continue;
+			}
+			break;
+		case P_HASHMETA:
+			if ((ret = __ham_vrfy_meta(dbp,
+			    vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) {
+				err_ret = ret;
+				(void)__memp_fput(mpf,
+				    vdp->thread_info, subpg, dbp->priority);
+				continue;
+			}
+			break;
+		default:
+			/* This isn't an appropriate page; skip this subdb. */
+			err_ret = DB_VERIFY_BAD;
+			/*
+			 * Bug fix: release the pinned page before moving on.
+			 * The original code skipped this __memp_fput, leaking
+			 * a buffer reference for every entry whose meta page
+			 * had an unexpected type.
+			 */
+			(void)__memp_fput(mpf,
+			    vdp->thread_info, subpg, dbp->priority);
+			continue;
+		}
+
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, subpg, dbp->priority)) != 0) {
+			err_ret = ret;
+			continue;
+		}
+
+		/* Print a subdatabase header. */
+		if ((ret = __db_prheader(dbp,
+		    subdbname, 0, 0, handle, callback, vdp, meta_pgno)) != 0)
+			goto err;
+
+		/* Salvage meta_pgno's tree. */
+		if ((ret = __db_salvage(dbp,
+		    vdp, meta_pgno, handle, callback, flags)) != 0)
+			err_ret = ret;
+
+		/* Print a subdatabase footer. */
+		if ((ret = __db_prfooter(handle, callback)) != 0)
+			goto err;
+	}
+
+err:	if (subdbname != NULL)
+		__os_free(env, subdbname);
+
+	/* Mark the master page itself as handled. */
+	if ((t_ret = __db_salvage_markdone(vdp, PGNO(master))) != 0)
+		return (t_ret);
+
+	return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_salvage --
+ *	Given a meta page number, salvage all data from leaf pages found by
+ *	walking the meta page's tree.
+ */
+static int
+__db_salvage(dbp, vdp, meta_pgno, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t meta_pgno;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+
+{
+	DB *pgset;
+	DBC *dbc, *pgsc;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *subpg;
+	db_pgno_t p;
+	int err_ret, ret, t_ret;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	err_ret = ret = t_ret = 0;
+	pgsc = NULL;
+	pgset = NULL;
+	dbc = NULL;
+
+	if ((ret = __db_vrfy_pgset(env,
+	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+		goto err;
+
+	/* Get all page numbers referenced from this meta page. */
+	if ((ret = __db_meta2pgset(dbp, vdp, meta_pgno,
+	    flags, pgset)) != 0) {
+		err_ret = ret;
+		goto err;
+	}
+
+	if ((ret = __db_cursor(pgset,
+	    vdp->thread_info, NULL, &pgsc, 0)) != 0)
+		goto err;
+
+	/* Queue page get/put goes through a cursor, so open one up front. */
+	if (dbp->type == DB_QUEUE &&
+	    (ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+		goto err;
+
+	/*
+	 * Salvage every page in pgset.  err_ret records the first per-page
+	 * failure; the walk itself continues regardless.
+	 */
+	while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+		if (dbp->type == DB_QUEUE) {
+#ifdef HAVE_QUEUE
+			ret = __qam_fget(dbc, &p, 0, &subpg);
+#else
+			ret = __db_no_queue_am(env);
+#endif
+			/* Don't report an error for pages not found in a queue.
+			 * The pgset is a best guess, it doesn't know about
+			 * deleted extents which leads to this error.
+			 */
+			if (ret == ENOENT || ret == DB_PAGE_NOTFOUND)
+				continue;
+		} else
+			ret = __memp_fget(mpf,
+			    &p, vdp->thread_info, NULL, 0, &subpg);
+		if (ret != 0) {
+			err_ret = ret;
+			continue;
+		}
+
+		if ((ret = __db_salvage_pg(dbp, vdp, p, subpg,
+		    handle, callback, flags)) != 0)
+			err_ret = ret;
+
+		if (dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+			ret = __qam_fput(dbc, p, subpg, dbp->priority);
+#else
+			ret = __db_no_queue_am(env);
+#endif
+		else
+			ret = __memp_fput(mpf,
+			    vdp->thread_info, subpg, dbp->priority);
+		if (ret != 0)
+			err_ret = ret;
+	}
+
+	/* DB_NOTFOUND is the normal end-of-set indication. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+	/*
+	 * NOTE(review): these closes overwrite ret unconditionally (no
+	 * "ret == 0" guard as used elsewhere in this file); err_ret still
+	 * carries the first per-page error and takes precedence below.
+	 */
+err:
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0)
+		ret = t_ret;
+	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
+		ret = t_ret;
+	if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
+		ret = t_ret;
+
+	return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_meta2pgset --
+ *	Given a known-safe meta page number, return the set of pages
+ *	corresponding to the database it represents.  Return DB_VERIFY_BAD if
+ *	it's not a suitable meta page or is invalid.
+ */
+static int
+__db_meta2pgset(dbp, vdp, pgno, flags, pgset)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t flags;
+	DB *pgset;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret, t_ret;
+
+	mpf = dbp->mpf;
+
+	if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+		return (ret);
+
+	switch (TYPE(h)) {
+	case P_BTREEMETA:
+		ret = __bam_meta2pgset(dbp, vdp, (BTMETA *)h, flags, pgset);
+		break;
+	case P_HASHMETA:
+		ret = __ham_meta2pgset(dbp, vdp, (HMETA *)h, flags, pgset);
+		break;
+	case P_QAMMETA:
+#ifdef HAVE_QUEUE
+		ret = __qam_meta2pgset(dbp, vdp, pgset);
+		break;
+#endif
+		/*
+		 * Without queue support compiled in, a queue meta page is
+		 * as unusable as any other unexpected type.
+		 */
+		/* FALLTHROUGH */
+	default:
+		ret = DB_VERIFY_BAD;
+		break;
+	}
+
+	/*
+	 * Release the meta page, keeping the first error seen (the original
+	 * returned the fput error even when ret was already set, losing the
+	 * primary verification result).
+	 */
+	if ((t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_guesspgsize --
+ *	Try to guess what the pagesize is if the one on the meta page
+ *	and the one in the db are invalid.
+ */
+static u_int
+__db_guesspgsize(env, fhp)
+	ENV *env;
+	DB_FH *fhp;
+{
+	db_pgno_t i;
+	size_t nr;
+	u_int32_t guess;
+	u_int8_t type;
+
+	/* Probe candidate sizes from the largest down, halving each time. */
+	for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) {
+		/*
+		 * We try to read three pages ahead after the first one
+		 * and make sure we have plausible types for all of them.
+		 * If the seeks fail, continue with a smaller size;
+		 * we're probably just looking past the end of the database.
+		 * If they succeed and the types are reasonable, also continue
+		 * with a size smaller; we may be looking at pages N,
+		 * 2N, and 3N for some N > 1.
+		 *
+		 * As soon as we hit an invalid type, we stop and return
+		 * our previous guess; that last one was probably the page size.
+		 */
+		for (i = 1; i <= 3; i++) {
+			/* Seek to the type byte of probe page i at "guess". */
+			if (__os_seek(
+			    env, fhp, i, guess, SSZ(DBMETA, type)) != 0)
+				break;
+			if (__os_read(env,
+			    fhp, &type, 1, &nr) != 0 || nr == 0)
+				break;
+			/*
+			 * guess << 1 is the previous (twice as large) guess,
+			 * the last one whose probes all looked plausible.
+			 *
+			 * NOTE(review): if the very first guess
+			 * (DB_MAX_PGSIZE) hits an invalid type, this returns
+			 * DB_MAX_PGSIZE << 1 -- confirm callers tolerate an
+			 * out-of-range result.
+			 */
+			if (type == P_INVALID || type >= P_PAGETYPE_MAX)
+				return (guess << 1);
+		}
+	}
+
+	/*
+	 * If we're just totally confused--the corruption takes up most of the
+	 * beginning pages of the database--go with the default size.
+	 */
+	return (DB_DEF_IOSIZE);
+}
diff --git a/db/db_vrfy_stub.c b/db/db_vrfy_stub.c
new file mode 100644
index 0000000..9ed5acd
--- /dev/null
+++ b/db/db_vrfy_stub.c
@@ -0,0 +1,117 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_VERIFY
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+
+/*
+ * If the library wasn't compiled with the verification support, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+static int __db_novrfy __P((ENV *));
+
+/*
+ * __db_novrfy --
+ *	Error when a Berkeley DB build doesn't include the access method.
+ *
+ *	Reports the missing-support message on the environment and
+ *	always returns DB_OPNOTSUP.
+ */
+static int
+__db_novrfy(env)
+	ENV *env;
+{
+	__db_errx(env,
+	    "library build did not include support for database verification");
+	return (DB_OPNOTSUP);
+}
+
+/*
+ * __db_verify_pp --
+ *	DB->verify stub used when verification support was not compiled in.
+ *
+ *	Reports DB_OPNOTSUP, but still closes the DB handle first: like
+ *	the real verify method, this call is a destructor for dbp.
+ */
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+	DB *dbp;
+	const char *file, *database;
+	FILE *outfile;
+	u_int32_t flags;
+{
+	int ret;
+
+	COMPQUIET(file, NULL);
+	COMPQUIET(database, NULL);
+	COMPQUIET(outfile, NULL);
+	COMPQUIET(flags, 0);
+
+	ret = __db_novrfy(dbp->env);
+
+	/* The verify method is a destructor. */
+	(void)__db_close(dbp, NULL, 0);
+
+	return (ret);
+}
+
+/*
+ * __db_verify_internal --
+ *	Stub for the internal verification entry point.
+ *
+ *	NOTE(review): unlike the other stubs this quietly returns 0
+ *	rather than DB_OPNOTSUP -- presumably so internal callers treat
+ *	verification as a no-op; confirm against callers before changing.
+ */
+int
+__db_verify_internal(dbp, name, subdb, handle, callback, flags)
+	DB *dbp;
+	const char *name, *subdb;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	COMPQUIET(dbp, NULL);
+	COMPQUIET(name, NULL);
+	COMPQUIET(subdb, NULL);
+	COMPQUIET(handle, NULL);
+	COMPQUIET(callback, NULL);
+	COMPQUIET(flags, 0);
+	return (0);
+}
+
+/*
+ * __db_vrfy_getpageinfo --
+ *	Stub: always fails with DB_OPNOTSUP via __db_novrfy.
+ */
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	VRFY_PAGEINFO **pipp;
+{
+	COMPQUIET(pgno, 0);
+	COMPQUIET(pipp, NULL);
+	return (__db_novrfy(vdp->pgdbp->env));
+}
+
+/*
+ * __db_vrfy_putpageinfo --
+ *	Stub: always fails with DB_OPNOTSUP via __db_novrfy.
+ */
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+	ENV *env;
+	VRFY_DBINFO *vdp;
+	VRFY_PAGEINFO *pip;
+{
+	COMPQUIET(vdp, NULL);
+	COMPQUIET(pip, NULL);
+	return (__db_novrfy(env));
+}
+
+/*
+ * __db_vrfy_prdbt --
+ *	Stub: always fails with DB_OPNOTSUP via __db_novrfy.
+ */
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
+	DBT *dbtp;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	int is_recno;
+	VRFY_DBINFO *vdp;
+{
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(checkprint, 0);
+	COMPQUIET(prefix, NULL);
+	COMPQUIET(handle, NULL);
+	COMPQUIET(callback, NULL);
+	COMPQUIET(is_recno, 0);
+	return (__db_novrfy(vdp->pgdbp->env));
+}
+#endif /* !HAVE_VERIFY */
diff --git a/db/db_vrfyutil.c b/db/db_vrfyutil.c
new file mode 100644
index 0000000..04d73d9
--- /dev/null
+++ b/db/db_vrfyutil.c
@@ -0,0 +1,916 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+
+static int __db_vrfy_childinc __P((DBC *, VRFY_CHILDINFO *));
+static int __db_vrfy_pageinfo_create __P((ENV *, VRFY_PAGEINFO **));
+
+/*
+ * __db_vrfy_dbinfo_create --
+ *	Allocate and initialize a VRFY_DBINFO structure.
+ *
+ *	Creates two working btrees -- cdbp (child info, with duplicates)
+ *	and pgdbp (per-page info) -- plus a pgset database, all opened
+ *	non-durable when transactions are on.  On error every handle
+ *	created so far is closed and the error is returned.
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_create
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **));
+ */
+int
+__db_vrfy_dbinfo_create(env, ip, pgsize, vdpp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t pgsize;
+	VRFY_DBINFO **vdpp;
+{
+	DB *cdbp, *pgdbp, *pgset;
+	VRFY_DBINFO *vdp;
+	int ret;
+
+	vdp = NULL;
+	cdbp = pgdbp = pgset = NULL;
+
+	/*
+	 * NOTE(review): allocated with a NULL env but freed below with
+	 * env -- presumably equivalent here; confirm.
+	 */
+	if ((ret = __os_calloc(NULL, 1, sizeof(VRFY_DBINFO), &vdp)) != 0)
+		goto err;
+
+	/* Child-info database: one parent page may have many children. */
+	if ((ret = __db_create_internal(&cdbp, env, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0)
+		goto err;
+
+	if ((ret = __db_set_pagesize(cdbp, pgsize)) != 0)
+		goto err;
+
+	/* If transactional, make sure we don't log. */
+	if (TXN_ON(env) &&
+	    (ret = __db_set_flags(cdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	if ((ret = __db_open(cdbp, ip,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	/* Per-page info database. */
+	if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0)
+		goto err;
+
+	/* If transactional, make sure we don't log. */
+	if (TXN_ON(env) &&
+	    (ret = __db_set_flags(pgdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+
+	if ((ret = __db_open(pgdbp, ip,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	/* pgset is created last; nothing after it can fail, so the error
+	 * path below never has to close it. */
+	if ((ret = __db_vrfy_pgset(env, ip, pgsize, &pgset)) != 0)
+		goto err;
+
+	LIST_INIT(&vdp->subdbs);
+	LIST_INIT(&vdp->activepips);
+
+	vdp->cdbp = cdbp;
+	vdp->pgdbp = pgdbp;
+	vdp->pgset = pgset;
+	vdp->thread_info = ip;
+	*vdpp = vdp;
+	return (0);
+
+err:	if (cdbp != NULL)
+		(void)__db_close(cdbp, NULL, 0);
+	if (pgdbp != NULL)
+		(void)__db_close(pgdbp, NULL, 0);
+	if (vdp != NULL)
+		__os_free(env, vdp);
+	return (ret);
+}
+
+/*
+ * __db_vrfy_dbinfo_destroy --
+ *	Destructor for VRFY_DBINFO.  Destroys VRFY_PAGEINFOs and deallocates
+ *	structure.
+ *
+ *	Always frees vdp itself; returns the FIRST error encountered
+ *	while releasing page structures and closing the working
+ *	databases (a later failure no longer overwrites an earlier one).
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_dbinfo_destroy(env, vdp)
+	ENV *env;
+	VRFY_DBINFO *vdp;
+{
+	VRFY_CHILDINFO *c;
+	int t_ret, ret;
+
+	ret = 0;
+
+	/*
+	 * Discard active page structures.  Ideally there wouldn't be any,
+	 * but in some error cases we may not have cleared them all out.
+	 */
+	while (LIST_FIRST(&vdp->activepips) != NULL)
+		if ((t_ret = __db_vrfy_putpageinfo(
+		    env, vdp, LIST_FIRST(&vdp->activepips))) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			break;
+		}
+
+	/* Discard subdatabase list structures. */
+	while ((c = LIST_FIRST(&vdp->subdbs)) != NULL) {
+		LIST_REMOVE(c, links);
+		__os_free(NULL, c);
+	}
+
+	/*
+	 * Close all three working databases; the "&& ret == 0" guard
+	 * keeps the first error (the pgdbp close previously clobbered
+	 * any earlier error code).
+	 */
+	if ((t_ret = __db_close(vdp->pgdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __db_close(vdp->cdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __db_close(vdp->pgset, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (vdp->extents != NULL)
+		__os_free(env, vdp->extents);
+	__os_free(env, vdp);
+	return (ret);
+}
+
+/*
+ * __db_vrfy_getpageinfo --
+ *	Get a PAGEINFO structure for a given page, creating it if necessary.
+ *
+ *	On success the returned structure has its reference count
+ *	bumped; release it with __db_vrfy_putpageinfo.
+ *
+ * PUBLIC: int __db_vrfy_getpageinfo
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **));
+ */
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	VRFY_PAGEINFO **pipp;
+{
+	DB *pgdbp;
+	DBT key, data;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int ret;
+
+	/*
+	 * We want a page info struct.  There are three places to get it from,
+	 * in decreasing order of preference:
+	 *
+	 * 1. vdp->activepips.  If it's already "checked out", we're
+	 *    already using it, we return the same exact structure with a
+	 *    bumped refcount.  This is necessary because this code is
+	 *    replacing array accesses, and it's common for f() to make some
+	 *    changes to a pip, and then call g() and h() which each make
+	 *    changes to the same pip.  vdps are never shared between threads
+	 *    (they're never returned to the application), so this is safe.
+	 * 2. The pgdbp.  It's not in memory, but it's in the database, so
+	 *    get it, give it a refcount of 1, and stick it on activepips.
+	 * 3. malloc.  It doesn't exist yet;  create it, then stick it on
+	 *    activepips.  We'll put it in the database when we putpageinfo
+	 *    later.
+	 */
+
+	/* Case 1. */
+	LIST_FOREACH(pip, &vdp->activepips, links)
+		if (pip->pgno == pgno)
+			goto found;
+
+	/* Case 2. */
+	pgdbp = vdp->pgdbp;
+	env = pgdbp->env;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	/* DB allocates the returned record; freed later via __os_ufree. */
+	F_SET(&data, DB_DBT_MALLOC);
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	if ((ret = __db_get(pgdbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) == 0) {
+		/* Found it. */
+		DB_ASSERT(env, data.size == sizeof(VRFY_PAGEINFO));
+		pip = data.data;
+		LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+		goto found;
+	} else if (ret != DB_NOTFOUND)	/* Something nasty happened. */
+		return (ret);
+
+	/* Case 3 */
+	if ((ret = __db_vrfy_pageinfo_create(env, &pip)) != 0)
+		return (ret);
+
+	LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+found:	pip->pi_refcount++;
+
+	*pipp = pip;
+	return (0);
+}
+
+/*
+ * __db_vrfy_putpageinfo --
+ *	Put back a VRFY_PAGEINFO that we're done with.
+ *
+ *	Drops a reference; when the count reaches zero the structure is
+ *	written back to the pgdbp database, unlinked from activepips,
+ *	and freed.
+ *
+ * PUBLIC: int __db_vrfy_putpageinfo __P((ENV *,
+ * PUBLIC:     VRFY_DBINFO *, VRFY_PAGEINFO *));
+ */
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+	ENV *env;
+	VRFY_DBINFO *vdp;
+	VRFY_PAGEINFO *pip;
+{
+	DB *pgdbp;
+	DBT key, data;
+	VRFY_PAGEINFO *p;
+	int ret;
+
+	/* Still referenced elsewhere: nothing more to do. */
+	if (--pip->pi_refcount > 0)
+		return (0);
+
+	pgdbp = vdp->pgdbp;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.data = &pip->pgno;
+	key.size = sizeof(db_pgno_t);
+	data.data = pip;
+	data.size = sizeof(VRFY_PAGEINFO);
+
+	/* Persist the struct so a later getpageinfo finds it. */
+	if ((ret = __db_put(pgdbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) != 0)
+		return (ret);
+
+	LIST_FOREACH(p, &vdp->activepips, links)
+		if (p == pip)
+			break;
+	if (p != NULL)
+		LIST_REMOVE(p, links);
+
+	/*
+	 * Free pip itself: when found on the list p == pip, and when it
+	 * was somehow not on the list, freeing p (NULL) would have
+	 * silently leaked pip.
+	 */
+	__os_ufree(env, pip);
+	return (0);
+}
+
+/*
+ * __db_vrfy_pgset --
+ *	Create a temporary database for the storing of sets of page numbers.
+ *	(A mapping from page number to int, used by the *_meta2pgset functions,
+ *	as well as for keeping track of which pages the verifier has seen.)
+ *
+ *	On success *dbpp holds the open handle; on failure the handle is
+ *	closed and the error returned.
+ *
+ * PUBLIC: int __db_vrfy_pgset __P((ENV *,
+ * PUBLIC:     DB_THREAD_INFO *, u_int32_t, DB **));
+ */
+int
+__db_vrfy_pgset(env, ip, pgsize, dbpp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t pgsize;
+	DB **dbpp;
+{
+	DB *dbp;
+	int ret;
+
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		return (ret);
+	if ((ret = __db_set_pagesize(dbp, pgsize)) != 0)
+		goto err;
+
+	/* If transactional, make sure we don't log. */
+	if (TXN_ON(env) &&
+	    (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	/*
+	 * Note the label in the else arm below: the close runs only on
+	 * failure paths (either via goto err or an __db_open error).
+	 */
+	if ((ret = __db_open(dbp, ip,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) == 0)
+		*dbpp = dbp;
+	else
+err:		(void)__db_close(dbp, NULL, 0);
+
+	return (ret);
+}
+
+/*
+ * __db_vrfy_pgset_get --
+ *	Look up the count stored for a page number in a page-set
+ *	database.  A page we have never recorded reads back as 0,
+ *	and the call still succeeds.
+ *
+ * PUBLIC: int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, db_pgno_t,
+ * PUBLIC: int *));
+ */
+int
+__db_vrfy_pgset_get(dbp, ip, pgno, valp)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	db_pgno_t pgno;
+	int *valp;
+{
+	DBT key, data;
+	int count, ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.size = sizeof(db_pgno_t);
+	key.data = &pgno;
+	data.ulen = sizeof(int);
+	data.data = &count;
+	F_SET(&data, DB_DBT_USERMEM);
+
+	ret = __db_get(dbp, ip, NULL, &key, &data, 0);
+	if (ret == DB_NOTFOUND)
+		count = 0;		/* Unseen page: report zero. */
+	else if (ret != 0)
+		return (ret);
+	else
+		DB_ASSERT(dbp->env, data.size == sizeof(int));
+
+	*valp = count;
+	return (0);
+}
+
+/*
+ * __db_vrfy_pgset_inc --
+ *	Increment the value associated with a pgno by 1.
+ *
+ *	A page never seen before starts from 0 and is stored as 1.
+ *
+ * PUBLIC: int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, db_pgno_t));
+ */
+int
+__db_vrfy_pgset_inc(dbp, ip, pgno)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	db_pgno_t pgno;
+{
+	DBT key, data;
+	int ret;
+	int val;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Starting count for a page with no entry yet. */
+	val = 0;
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+	data.data = &val;
+	data.ulen = sizeof(int);
+	F_SET(&data, DB_DBT_USERMEM);
+
+	/* Fetch the current count into val; DB_NOTFOUND leaves it at 0. */
+	if ((ret = __db_get(dbp, ip, NULL, &key, &data, 0)) == 0) {
+		DB_ASSERT(dbp->env, data.size == sizeof(int));
+	} else if (ret != DB_NOTFOUND)
+		return (ret);
+
+	/* Set data.size explicitly: the get may not have filled it in. */
+	data.size = sizeof(int);
+	++val;
+
+	return (__db_put(dbp, ip, NULL, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_pgset_next --
+ *	Advance a cursor open in a pgset database and return the next
+ *	page number in the set.
+ *
+ * PUBLIC: int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *));
+ */
+int
+__db_vrfy_pgset_next(dbc, pgnop)
+	DBC *dbc;
+	db_pgno_t *pgnop;
+{
+	DBT key, data;
+	db_pgno_t next_pgno;
+	int ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Only the keys matter; request no data bytes at all. */
+	F_SET(&key, DB_DBT_USERMEM);
+	F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+	key.ulen = sizeof(db_pgno_t);
+	key.data = &next_pgno;
+
+	ret = __dbc_get(dbc, &key, &data, DB_NEXT);
+	if (ret != 0)
+		return (ret);
+
+	DB_ASSERT(dbc->env, key.size == sizeof(db_pgno_t));
+	*pgnop = next_pgno;
+	return (0);
+}
+
+/*
+ * __db_vrfy_childcursor --
+ *	Open a cursor on the child-info database so the caller can walk
+ *	a page's child list.
+ *
+ * PUBLIC: int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **));
+ */
+int
+__db_vrfy_childcursor(vdp, dbcp)
+	VRFY_DBINFO *vdp;
+	DBC **dbcp;
+{
+	DBC *new_dbc;
+	int ret;
+
+	ret = __db_cursor(vdp->cdbp, vdp->thread_info, NULL, &new_dbc, 0);
+	if (ret == 0)
+		*dbcp = new_dbc;
+	return (ret);
+}
+
+/*
+ * __db_vrfy_childput --
+ *	Add a child structure to the set for a given page.
+ *
+ *	A child already present under this parent is not added again;
+ *	its refcnt is bumped instead, which preserves the on-page
+ *	reference order of the duplicate set.
+ *
+ * PUBLIC: int __db_vrfy_childput
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *));
+ */
+int
+__db_vrfy_childput(vdp, pgno, cip)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	VRFY_CHILDINFO *cip;
+{
+	DB *cdbp;
+	DBC *cc;
+	DBT key, data;
+	VRFY_CHILDINFO *oldcip;
+	int ret;
+
+	cdbp = vdp->cdbp;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	/*
+	 * We want to avoid adding multiple entries for a single child page;
+	 * we only need to verify each child once, even if a child (such
+	 * as an overflow key) is multiply referenced.
+	 *
+	 * However, we also need to make sure that when walking the list
+	 * of children, we encounter them in the order they're referenced
+	 * on a page.  (This permits us, for example, to verify the
+	 * prev_pgno/next_pgno chain of Btree leaf pages.)
+	 *
+	 * Check the child database to make sure that this page isn't
+	 * already a child of the specified page number.  If it's not,
+	 * put it at the end of the duplicate set.
+	 */
+	if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+		return (ret);
+	/* Scan the existing duplicate set for a matching child pgno. */
+	for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0;
+	    ret = __db_vrfy_ccnext(cc, &oldcip))
+		if (oldcip->pgno == cip->pgno) {
+			/*
+			 * Found a matching child.  Increment its reference
+			 * count--we've run into it again--but don't put it
+			 * again.
+			 */
+			if ((ret = __db_vrfy_childinc(cc, oldcip)) != 0 ||
+			    (ret = __db_vrfy_ccclose(cc)) != 0)
+				return (ret);
+			return (0);
+		}
+	/* DB_NOTFOUND just means the scan is exhausted; anything else fails. */
+	if (ret != DB_NOTFOUND) {
+		(void)__db_vrfy_ccclose(cc);
+		return (ret);
+	}
+	if ((ret = __db_vrfy_ccclose(cc)) != 0)
+		return (ret);
+
+	/* First sighting of this child: store it with a refcount of 1. */
+	cip->refcnt = 1;
+	data.data = cip;
+	data.size = sizeof(VRFY_CHILDINFO);
+
+	return (__db_put(cdbp, vdp->thread_info, NULL, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_childinc --
+ *	Bump the refcount of the VRFY_CHILDINFO record under the child
+ *	cursor.  (The caller has just fetched this struct and passes it
+ *	in as cip to save us a get.)
+ */
+static int
+__db_vrfy_childinc(dbc, cip)
+	DBC *dbc;
+	VRFY_CHILDINFO *cip;
+{
+	DBT key, data;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Increment in place, then overwrite the current record. */
+	++cip->refcnt;
+	data.size = sizeof(VRFY_CHILDINFO);
+	data.data = cip;
+	return (__dbc_put(dbc, &key, &data, DB_CURRENT));
+}
+
+/*
+ * __db_vrfy_ccset --
+ *	Position a cursor created with __db_vrfy_childcursor on the
+ *	first child of the given pgno and return that child through the
+ *	third argument.
+ *
+ * PUBLIC: int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccset(dbc, pgno, cipp)
+	DBC *dbc;
+	db_pgno_t pgno;
+	VRFY_CHILDINFO **cipp;
+{
+	DBT key, data;
+	int ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.size = sizeof(db_pgno_t);
+	key.data = &pgno;
+
+	ret = __dbc_get(dbc, &key, &data, DB_SET);
+	if (ret != 0)
+		return (ret);
+
+	DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+	*cipp = (VRFY_CHILDINFO *)data.data;
+	return (0);
+}
+
+/*
+ * __db_vrfy_ccnext --
+ *	Advance a cursor created with __db_vrfy_childcursor to the next
+ *	duplicate (i.e. the next child of the same parent) and return it
+ *	through the second argument.
+ *
+ * PUBLIC: int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccnext(dbc, cipp)
+	DBC *dbc;
+	VRFY_CHILDINFO **cipp;
+{
+	DBT key, data;
+	int ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP);
+	if (ret != 0)
+		return (ret);
+
+	DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+	*cipp = (VRFY_CHILDINFO *)data.data;
+	return (0);
+}
+
+/*
+ * __db_vrfy_ccclose --
+ *	Close a cursor created with __db_vrfy_childcursor.
+ *
+ *	Currently a plain cursor close; kept as a separate entry point
+ *	so the internal database usage can change without touching the
+ *	callers.
+ *
+ * PUBLIC: int __db_vrfy_ccclose __P((DBC *));
+ */
+int
+__db_vrfy_ccclose(dbc)
+	DBC *dbc;
+{
+	return (__dbc_close(dbc));
+}
+
+/*
+ * __db_vrfy_pageinfo_create --
+ *	Constructor for VRFY_PAGEINFO; allocates and zero-fills.
+ */
+static int
+__db_vrfy_pageinfo_create(env, pipp)
+	ENV *env;
+	VRFY_PAGEINFO **pipp;
+{
+	VRFY_PAGEINFO *pip;
+	int ret;
+
+	/*
+	 * Some pageinfo structs come back from __db_get with
+	 * DB_DBT_MALLOC and some are created here; the destructor can't
+	 * tell the two apart, so this one also uses the user-memory
+	 * allocator (__os_umalloc) so that everything can be released
+	 * uniformly with __os_ufree.
+	 */
+	ret = __os_umalloc(env, sizeof(VRFY_PAGEINFO), &pip);
+	if (ret != 0)
+		return (ret);
+	memset(pip, 0, sizeof(VRFY_PAGEINFO));
+
+	*pipp = pip;
+	return (0);
+}
+
+/*
+ * __db_salvage_init --
+ *	Set up salvager database.
+ *
+ *	Creates a small (1KB-page) working btree used to track pages
+ *	that still need printing and stores it in vdp->salvage_pages.
+ *	On failure the handle is closed and the error returned.
+ *
+ * PUBLIC: int __db_salvage_init __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_init(vdp)
+	VRFY_DBINFO *vdp;
+{
+	DB *dbp;
+	int ret;
+
+	if ((ret = __db_create_internal(&dbp, NULL, 0)) != 0)
+		return (ret);
+
+	if ((ret = __db_set_pagesize(dbp, 1024)) != 0)
+		goto err;
+
+	if ((ret = __db_open(dbp, vdp->thread_info,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	vdp->salvage_pages = dbp;
+	return (0);
+
+err:	(void)__db_close(dbp, NULL, 0);
+	return (ret);
+}
+
+/*
+ * __db_salvage_destroy --
+ *	Close the salvager database, if one was created.
+ * PUBLIC: int __db_salvage_destroy __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_destroy(vdp)
+	VRFY_DBINFO *vdp;
+{
+	if (vdp->salvage_pages == NULL)
+		return (0);
+	return (__db_close(vdp->salvage_pages, NULL, 0));
+}
+
+/*
+ * __db_salvage_getnext --
+ *	Get the next (first) unprinted page in the database of pages we need to
+ *	print still.  Delete entries for any already-printed pages we encounter
+ *	in this search, as well as the page we're returning.
+ *
+ *	With skip_overflow set, SALVAGE_OVERFLOW entries are left in
+ *	place (not deleted) for a later pass.  Returns DB_NOTFOUND when
+ *	the set is exhausted.
+ *
+ * PUBLIC: int __db_salvage_getnext
+ * PUBLIC: __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int));
+ */
+int
+__db_salvage_getnext(vdp, dbcp, pgnop, pgtypep, skip_overflow)
+	VRFY_DBINFO *vdp;
+	DBC **dbcp;
+	db_pgno_t *pgnop;
+	u_int32_t *pgtypep;
+	int skip_overflow;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+	u_int32_t pgtype;
+
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Open the cursor on first use; the caller keeps it across calls. */
+	if (*dbcp == NULL &&
+	    (ret = __db_cursor(dbp, vdp->thread_info, NULL, dbcp, 0)) != 0)
+		return (ret);
+
+	while ((ret = __dbc_get(*dbcp, &key, &data, DB_NEXT)) == 0) {
+		DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+		memcpy(&pgtype, data.data, sizeof(pgtype));
+
+		if (skip_overflow && pgtype == SALVAGE_OVERFLOW)
+			continue;
+
+		/* Consume the entry whether it is returned or ignored. */
+		if ((ret = __dbc_del(*dbcp, 0)) != 0)
+			return (ret);
+		if (pgtype != SALVAGE_IGNORE) {
+			DB_ASSERT(dbp->env, key.size == sizeof(db_pgno_t));
+			DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+
+			*pgnop = *(db_pgno_t *)key.data;
+			*pgtypep = *(u_int32_t *)data.data;
+			break;
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_salvage_isdone --
+ *	Return whether or not the given pgno is already marked
+ *	SALVAGE_IGNORE (meaning that we don't need to print it again).
+ *
+ *	Returns DB_KEYEXIST if it is marked, 0 if not, or another error on
+ *	error.
+ *
+ * PUBLIC: int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_isdone(vdp, pgno)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+	u_int32_t currtype;
+
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Read the page's current marking, if any, into currtype. */
+	currtype = SALVAGE_INVALID;
+	data.data = &currtype;
+	data.ulen = sizeof(u_int32_t);
+	data.flags = DB_DBT_USERMEM;
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	/*
+	 * Pure lookup: nothing is written here.  If an entry exists and
+	 * is marked SALVAGE_IGNORE the page has already been printed;
+	 * any other marking, or no entry at all, means it has not.
+	 */
+	if ((ret = __db_get(dbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) == 0) {
+		/*
+		 * The key's already here.  Check and see if it's already
+		 * marked done.  If it is, return DB_KEYEXIST.  If it's not,
+		 * return 0.
+		 */
+		if (currtype == SALVAGE_IGNORE)
+			return (DB_KEYEXIST);
+		else
+			return (0);
+	} else if (ret != DB_NOTFOUND)
+		return (ret);
+
+	/* The pgno is not yet marked anything; return 0. */
+	return (0);
+}
+
+/*
+ * __db_salvage_markdone --
+ *	Mark as done a given page.
+ *
+ *	Returns DB_VERIFY_BAD if the page was already marked done (a
+ *	probable sign of a multiply-linked page), or an error from the
+ *	underlying get/put.
+ *
+ * PUBLIC: int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_markdone(vdp, pgno)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+	u_int32_t currtype, pgtype;
+
+	/*
+	 * The record is written with data.size == sizeof(u_int32_t), so
+	 * pgtype is a u_int32_t (it was previously declared int, which
+	 * only matched by accident of int's typical width).
+	 */
+	pgtype = SALVAGE_IGNORE;
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/*
+	 * NOTE(review): this USERMEM setup looks left over from a direct
+	 * get; the put below only uses data.data/data.size.  Kept as-is.
+	 */
+	currtype = SALVAGE_INVALID;
+	data.data = &currtype;
+	data.ulen = sizeof(u_int32_t);
+	data.flags = DB_DBT_USERMEM;
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	/*
+	 * Put an entry for this page, with pgno as key and type as data,
+	 * unless it's already there and is marked done.
+	 * If it's there and is marked anything else, that's fine--we
+	 * want to mark it done, but db_salvage_isdone only lets
+	 * us know if it's marked IGNORE.
+	 *
+	 * We don't want to return DB_KEYEXIST, though;  this will
+	 * likely get passed up all the way and make no sense to the
+	 * application.  Instead, use DB_VERIFY_BAD to indicate that
+	 * we've seen this page already--it probably indicates a
+	 * multiply-linked page.
+	 */
+	if ((ret = __db_salvage_isdone(vdp, pgno)) != 0)
+		return (ret == DB_KEYEXIST ? DB_VERIFY_BAD : ret);
+
+	data.size = sizeof(u_int32_t);
+	data.data = &pgtype;
+
+	return (__db_put(dbp, vdp->thread_info, NULL, &key, &data, 0));
+}
+
+/*
+ * __db_salvage_markneeded --
+ *	If it has not yet been printed, make note of the fact that a page
+ *	must be dealt with later.
+ *
+ * PUBLIC: int __db_salvage_markneeded
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_salvage_markneeded(vdp, pgno, pgtype)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t pgtype;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.size = sizeof(db_pgno_t);
+	key.data = &pgno;
+	data.size = sizeof(u_int32_t);
+	data.data = &pgtype;
+
+	/*
+	 * DB_NOOVERWRITE: an existing entry means the page is already
+	 * tracked (or was presumably already marked done), which is not
+	 * an error here.
+	 */
+	ret = __db_put(dbp,
+	    vdp->thread_info, NULL, &key, &data, DB_NOOVERWRITE);
+	return (ret == DB_KEYEXIST ? 0 : ret);
+}
+
+/*
+ * __db_vrfy_prdbt --
+ *	Print out a DBT data element from a verification routine.
+ *
+ *	Thin wrapper over __db_prdbt that, when a VRFY_DBINFO is
+ *	supplied, first emits the "__OTHER__" subdatabase header if
+ *	pending and honors a salvage-wide printable setting.
+ *
+ * PUBLIC: int __db_vrfy_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC:     int (*)(void *, const void *), int, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
+	DBT *dbtp;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	int is_recno;
+	VRFY_DBINFO *vdp;
+{
+	if (vdp != NULL) {
+		/*
+		 * If vdp is non-NULL, we might be the first key in the
+		 * "fake" subdatabase used for key/data pairs we can't
+		 * associate with a known subdb.
+		 *
+		 * Check and clear the SALVAGE_PRINTHEADER flag;  if
+		 * it was set, print a subdatabase header.
+		 */
+		if (F_ISSET(vdp, SALVAGE_PRINTHEADER)) {
+			(void)__db_prheader(
+			    NULL, "__OTHER__", 0, 0, handle, callback, vdp, 0);
+			F_CLR(vdp, SALVAGE_PRINTHEADER);
+			F_SET(vdp, SALVAGE_PRINTFOOTER);
+		}
+
+		/*
+		 * Even if the printable flag wasn't set by our immediate
+		 * caller, it may be set on a salvage-wide basis.
+		 */
+		if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+			checkprint = 1;
+	}
+	return (
+	    __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno));
+}
diff --git a/db/partition.c b/db/partition.c
new file mode 100644
index 0000000..4e89ede
--- /dev/null
+++ b/db/partition.c
@@ -0,0 +1,2048 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#ifdef HAVE_HASH
+#include "dbinc/hash.h"
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+#ifdef HAVE_PARTITION
+
+static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ const char *, const char *, const char *, u_int32_t));
+static int __partc_close __P((DBC *, db_pgno_t, int *));
+static int __partc_del __P((DBC*, u_int32_t));
+static int __partc_destroy __P((DBC*));
+static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t));
+static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __partc_writelock __P((DBC*));
+static int __partition_chk_meta __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+static int __partition_setup_keys __P((DBC *,
+ DB_PARTITION *, DBMETA *, u_int32_t));
+static int __part_key_cmp __P((const void *, const void *));
+static inline void __part_search __P((DB *,
+ DB_PARTITION *, DBT *, u_int32_t *));
+
+static char *Alloc_err = "Partition open failed to allocate %d bytes";
+
+/*
+ * Allocate a partition cursor and copy flags to the partition cursor.
+ * Not passed:
+ * DBC_PARTITIONED -- the subcursors are not.
+ * DBC_OWN_LID -- the arg dbc owns the lock id.
+ * DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on
+ * the whole DB, not the partition.
+ */
+#define GET_PART_CURSOR(dbc, new_dbc, part_id) do { \
+ DB *__part_dbp; \
+ __part_dbp = part->handles[part_id]; \
+ if ((ret = __db_cursor_int(__part_dbp, \
+ (dbc)->thread_info, (dbc)->txn, __part_dbp->type, \
+ PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0) \
+ goto err; \
+ (new_dbc)->flags = (dbc)->flags & \
+ ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER); \
+} while (0)
+
+/*
+ * Search for the correct partition.
+ *
+ *	Binary-searches part->keys using the btree comparison function.
+ *	An exact match selects that slot; otherwise the slot just below
+ *	the insertion point is chosen, bounded below by partition 0.
+ */
+static inline void __part_search(dbp, part, key, part_idp)
+	DB *dbp;
+	DB_PARTITION *part;
+	DBT *key;
+	u_int32_t *part_idp;
+{
+	db_indx_t base, indx, limit;
+	int cmp;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+
+	DB_ASSERT(dbp->env, part->nparts != 0);
+	COMPQUIET(cmp, 0);
+	COMPQUIET(indx, 0);
+
+	/* Use the same comparator the underlying btree uses. */
+	func = ((BTREE *)dbp->bt_internal)->bt_compare;
+	DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
+		DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
+		cmp = func(dbp, key, &part->keys[indx]);
+		if (cmp == 0)
+			break;
+		if (cmp > 0)
+			DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
+	}
+	if (cmp == 0)
+		*part_idp = indx;
+	else if ((*part_idp = base) != 0)
+		(*part_idp)--;
+}
+
+/*
+ * __partition_init --
+ *	Initialize the partition structure.
+ *	Called when the meta data page is read in during database open or
+ *	when partition keys or a callback are set.
+ *
+ *	Range keys and a partition callback are mutually exclusive;
+ *	requesting both returns EINVAL.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_PARTITION *part;
+	int ret;
+
+	/* Reuse an existing structure, checking for a conflicting mode. */
+	if ((part = dbp->p_internal) != NULL) {
+		if ((LF_ISSET(DBMETA_PART_RANGE) &&
+		    F_ISSET(part, PART_CALLBACK)) ||
+		    (LF_ISSET(DBMETA_PART_CALLBACK) &&
+		    F_ISSET(part, PART_RANGE))) {
+			__db_errx(dbp->env,
+			    "Cannot specify callback and range keys.");
+			return (EINVAL);
+		}
+	} else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0)
+		return (ret);
+
+	if (LF_ISSET(DBMETA_PART_RANGE))
+		F_SET(part, PART_RANGE);
+	if (LF_ISSET(DBMETA_PART_CALLBACK))
+		F_SET(part, PART_CALLBACK);
+	dbp->p_internal = part;
+	/* Set up AM-specific methods that do not require an open. */
+	dbp->db_am_rename = __part_rename;
+	dbp->db_am_remove = __part_remove;
+	return (0);
+}
+/*
+ * __partition_set --
+ *	Set the partitioning keys or callback function.
+ *	This routine must be called prior to creating the database.
+ *
+ *	Exactly one of keys/callback must be given; parts must be >= 2.
+ *
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC:	 u_int32_t (*callback)(DB *, DBT *key)));
+ */
+
+int
+__partition_set(dbp, parts, keys, callback)
+	DB *dbp;
+	u_int32_t parts;
+	DBT *keys;
+	u_int32_t (*callback)(DB *, DBT *key);
+{
+	DB_PARTITION *part;
+	ENV *env;
+	int ret;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition");
+	env = dbp->dbenv->env;
+
+	if (parts < 2) {
+		__db_errx(env, "Must specify at least 2 partitions.");
+		return (EINVAL);
+	}
+
+	if (keys == NULL && callback == NULL) {
+		__db_errx(env, "Must specify either keys or a callback.");
+		return (EINVAL);
+	}
+	if (keys != NULL && callback != NULL) {
+		/* "bad" is also jumped to from the conflict check below. */
+bad:		__db_errx(env, "May not specify both keys and a callback.");
+		return (EINVAL);
+	}
+
+	if ((part = dbp->p_internal) == NULL) {
+		if ((ret = __partition_init(dbp,
+		    keys != NULL ?
+		    DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0)
+			return (ret);
+		part = dbp->p_internal;
+	} else if ((part->keys != NULL && callback != NULL) ||
+	    (part->callback != NULL && keys != NULL))
+		goto bad;
+
+	part->nparts = parts;
+	part->keys = keys;
+	part->callback = callback;
+
+	return (0);
+}
+
+/*
+ * __partition_set_dirs --
+ *	Set the directories for creating the partition databases.
+ *	They must be in the environment.
+ *
+ *	Builds a NULL-terminated vector of directory pointers.  For
+ *	DBLOCAL environments the strings are copied into the same
+ *	allocation; otherwise the vector aliases the environment's own
+ *	data-directory strings.
+ *
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+	DB *dbp;
+	const char **dirp;
+{
+	DB_ENV *dbenv;
+	DB_PARTITION *part;
+	ENV *env;
+	u_int32_t ndirs, slen;
+	int i, ret;
+	const char **dir;
+	char *cp, **part_dirs, **pd;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs");
+	dbenv = dbp->dbenv;
+	env = dbp->env;
+
+	/* Count directories; ndirs starts at 1 for the NULL terminator. */
+	ndirs = 1;
+	slen = 0;
+	for (dir = dirp; *dir != NULL; dir++) {
+		if (F_ISSET(env, ENV_DBLOCAL))
+			slen += (u_int32_t)strlen(*dir) + 1;
+		ndirs++;
+	}
+
+	slen += sizeof(char *) * ndirs;
+	if ((ret = __os_malloc(env, slen, &part_dirs)) != 0)
+		return (ret);	/* Was EINVAL; report the allocator's error. */
+	memset(part_dirs, 0, slen);
+
+	/* String storage begins right after the pointer vector. */
+	cp = (char *) part_dirs + (sizeof(char *) * ndirs);
+	pd = part_dirs;
+	for (dir = dirp; *dir != NULL; dir++, pd++) {
+		if (F_ISSET(env, ENV_DBLOCAL)) {
+			(void)strcpy(cp, *dir);
+			*pd = cp;
+			cp += strlen(*dir) + 1;
+			continue;
+		}
+		/* Each directory must be in the environment's list. */
+		for (i = 0; i < dbenv->data_next; i++)
+			if (strcmp(*dir, dbenv->db_data_dir[i]) == 0)
+				break;
+		if (i == dbenv->data_next) {
+			__db_errx(dbp->env,
+			    "Directory not in environment list %s", *dir);
+			__os_free(env, part_dirs);
+			return (EINVAL);
+		}
+		*pd = dbenv->db_data_dir[i];
+	}
+
+	if ((part = dbp->p_internal) == NULL) {
+		if ((ret = __partition_init(dbp, 0)) != 0) {
+			/* Don't leak the vector on failure. */
+			__os_free(env, part_dirs);
+			return (ret);
+		}
+		part = dbp->p_internal;
+	}
+
+	part->dirs = (const char **)part_dirs;
+
+	return (0);
+}
+
+/*
+ * __partition_open --
+ *	Open/create a partitioned database.
+ * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
+ */
+int
+__partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *fname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode, do_open;
+{
+ DB *part_db;
+ DB_PARTITION *part;
+ DBC *dbc;
+ ENV *env;
+ u_int32_t part_id;
+ int ret;
+ char *name, *sp;
+ const char **dirp, *np;
+
+ part = dbp->p_internal;
+ env = dbp->dbenv->env;
+ name = NULL;
+
+ /*
+ * A metadata mismatch is fatal only when we are really opening;
+ * callers with do_open == 0 (e.g. verify paths) press on.
+ */
+ if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open)
+ goto err;
+
+ if ((ret = __os_calloc(env,
+ part->nparts, sizeof(*part->handles), &part->handles)) != 0) {
+ __db_errx(env,
+ Alloc_err, part->nparts * sizeof(*part->handles));
+ goto err;
+ }
+
+ /*
+ * Build a reusable filename buffer: the directory prefix of fname
+ * (if any) followed by per-partition names formatted with PART_NAME.
+ */
+ DB_ASSERT(env, fname != NULL);
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ goto err;
+ }
+
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ /* Recovery does not open the partition handles here. */
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ goto done;
+ dirp = part->dirs;
+ for (part_id = 0; part_id < part->nparts; part_id++) {
+ if ((ret = __db_create_internal(
+ &part->handles[part_id], dbp->env, 0)) != 0)
+ goto err;
+
+ /* Clone the master handle's configuration into each part. */
+ part_db = part->handles[part_id];
+ part_db->flags = F_ISSET(dbp,
+ ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED));
+ part_db->adj_fileid = dbp->adj_fileid;
+ part_db->pgsize = dbp->pgsize;
+ part_db->priority = dbp->priority;
+ part_db->db_append_recno = dbp->db_append_recno;
+ part_db->db_feedback = dbp->db_feedback;
+ part_db->dup_compare = dbp->dup_compare;
+ part_db->app_private = dbp->app_private;
+ part_db->api_internal = dbp->api_internal;
+
+ if (dbp->type == DB_BTREE)
+ __bam_copy_config(dbp, part_db, part->nparts);
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH)
+ __ham_copy_config(dbp, part_db, part->nparts);
+#endif
+
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ if ((ret = __os_strdup(env, name, &part_db->fname)) != 0)
+ goto err;
+ if (do_open) {
+ /*
+ * Cycle through the directory names passed in,
+ * if any, wrapping back to the first when exhausted.
+ */
+ if (dirp != NULL &&
+ (part_db->dirname = *dirp++) == NULL)
+ part_db->dirname = *(dirp = part->dirs);
+ if ((ret = __db_open(part_db, ip, txn,
+ name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * Get rid of the free cursors used to open the database --
+ * they are the wrong type for partitioned access.
+ */
+done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((ret = __dbc_destroy(dbc)) != 0)
+ break;
+
+ if (0) {
+err: (void)__partition_close(dbp, txn, 0);
+ }
+ if (name != NULL)
+ __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __partition_chk_meta --
+ *	Check for a consistent meta data page and parameters when opening a
+ * partitioned database.  Validates the magic number, the partition
+ * metaflags against the handle's PART_RANGE/PART_CALLBACK settings and
+ * the partition count, then loads/creates the range keys.
+ */
+static int
+__partition_chk_meta(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ DB_PARTITION *part;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t base_pgno;
+ int ret, t_ret;
+
+ dbc = NULL;
+ meta = NULL;
+ LOCK_INIT(metalock);
+ part = dbp->p_internal;
+ mpf = dbp->mpf;
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Get a cursor on the main db.  Hide the partition info first so
+ * the cursor operates on the master database, not the partitions.
+ * It is restored before returning.
+ */
+ dbp->p_internal = NULL;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ /* Get the metadata page. */
+ base_pgno = PGNO_BASE_MD;
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (meta->magic != DB_HASHMAGIC &&
+ (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) {
+ __db_errx(env,
+ "Partitioning may only specified on BTREE and HASH databases.");
+ ret = EINVAL;
+ goto err;
+ }
+ if (!FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) {
+ __db_errx(env,
+ "Partitioning specified on a non-partitioned database.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* The on-disk partition style must match what the handle was told. */
+ if ((F_ISSET(part, PART_RANGE) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) ||
+ (F_ISSET(part, PART_CALLBACK) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) {
+ __db_errx(env, "Incompatible partitioning specified.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) &&
+ part->callback == NULL && !IS_RECOVERING(env) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, "Partition callback not specified.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env,
+ "Record numbers are not supported in partitioned databases.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Reconcile the requested partition count with the stored one. */
+ if (part->nparts == 0) {
+ if (LF_ISSET(DB_CREATE) && meta->nparts == 0) {
+ __db_errx(env, "Zero paritions specified.");
+ ret = EINVAL;
+ goto err;
+ } else
+ part->nparts = meta->nparts;
+ } else if (meta->nparts != 0 && part->nparts != meta->nparts) {
+ __db_errx(env, "Number of partitions does not match.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (meta->magic == DB_HASHMAGIC) {
+ if (!F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env,
+ "Hash database must specify a partition callback.");
+ ret = EINVAL;
+ }
+ } else if (meta->magic != DB_BTREEMAGIC) {
+ __db_errx(env,
+ "Partitioning only supported on BTREE nad HASH.");
+ ret = EINVAL;
+ } else
+ ret = __partition_setup_keys(dbc, part, meta, flags);
+
+err: /*
+ * Put the metadata page back.  meta is only non-NULL once dbc
+ * exists, so the dbc->priority dereference below is safe.
+ */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Restore the partition info hidden above. */
+ dbp->p_internal = part;
+ return (ret);
+}
+
+/*
+ * Support for sorting keys.  Keys must be sorted using the btree
+ * compare function so if we call qsort in __partition_setup_keys
+ * we use this structure to pass the DBP and compare function.
+ */
+struct key_sort {
+ DB *dbp; /* Database whose comparator is used. */
+ DBT *key; /* Partition boundary key. */
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+};
+
+/*
+ * __part_key_cmp --
+ *	qsort comparator: order two partition boundary keys with the
+ * tree's btree comparison function carried in struct key_sort.
+ */
+static int __part_key_cmp(a, b)
+ const void *a, *b;
+{
+ const struct key_sort *ka, *kb;
+
+ ka = a;
+ kb = b;
+ return (ka->compare(ka->dbp, ka->key, kb->key));
+}
+/*
+ * __partition_setup_keys --
+ *	Get the partition keys into memory, or put them to disk if we
+ * are creating a partitioned database.
+ */
+static int
+__partition_setup_keys(dbc, part, meta, flags)
+ DBC *dbc;
+ DB_PARTITION *part;
+ DBMETA *meta;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, key, *keys, *kp;
+ ENV *env;
+ u_int32_t ds, i, j;
+ u_int8_t *dd;
+ struct key_sort *ks;
+ int have_keys, ret;
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+ void *dp;
+
+ COMPQUIET(dd, NULL);
+ COMPQUIET(ds, 0);
+ memset(&data, 0, sizeof(data));
+ memset(&key, 0, sizeof(key));
+ ks = NULL;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Need to just read the main database. */
+ dbp->p_internal = NULL;
+ have_keys = 0;
+
+ /* First verify that things are what we expect. */
+ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ /* An empty master is fine for callback partitioning. */
+ if (F_ISSET(part, PART_CALLBACK)) {
+ ret = 0;
+ goto done;
+ }
+ if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+ !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, "No range keys found.");
+ ret = EINVAL;
+ goto err;
+ }
+ } else {
+ if (F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env, "Keys found and callback set.");
+ ret = EINVAL;
+ goto err;
+ }
+ /* The stored key 0 must be the implicit empty key. */
+ if (key.size != 0) {
+ __db_errx(env, "Partition key 0 is not empty.");
+ ret = EINVAL;
+ goto err;
+ }
+ have_keys = 1;
+ }
+
+ if (LF_ISSET(DB_CREATE) && have_keys == 0) {
+ /* Insert the keys into the master database. */
+ for (i = 0; i < part->nparts - 1; i++) {
+ if ((ret = __db_put(dbp, dbc->thread_info,
+ dbc->txn, &part->keys[i], &data, 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * Insert the "0" pointer. All records less than the first
+ * given key go into this partition. We must use the default
+ * compare to insert this key, otherwise it might not be first.
+ */
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ t->bt_compare = __bam_defcmp;
+ memset(&key, 0, sizeof(key));
+ ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0);
+ t->bt_compare = compare;
+ if (ret != 0)
+ goto err;
+ }
+done: if (F_ISSET(part, PART_RANGE)) {
+ /*
+ * Allocate one page to hold the keys plus space at the
+ * end of the buffer to put an array of DBTs. If there
+ * is not enough space __dbc_get will return how much
+ * is needed and we realloc.
+ */
+ if ((ret = __os_malloc(env,
+ meta->pagesize + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0) {
+ __db_errx(env, Alloc_err, meta->pagesize);
+ goto err;
+ }
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ data.data = part->data;
+ data.ulen = meta->pagesize;
+ data.flags = DB_DBT_USERMEM;
+again: if ((ret = __dbc_get(dbc, &key, &data,
+ DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) {
+ if ((ret = __os_realloc(env,
+ data.size + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0)
+ goto err;
+ data.data = part->data;
+ data.ulen = data.size;
+ goto again;
+ }
+ if (ret == 0) {
+ /*
+ * They passed in keys, they must match.
+ * Sort the caller's copy with the tree comparator
+ * so it lines up with the on-disk order.
+ */
+ keys = NULL;
+ compare = NULL;
+ if (have_keys == 1 && (keys = part->keys) != NULL) {
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ if ((ret = __os_malloc(env, (part->nparts - 1)
+ * sizeof(struct key_sort), &ks)) != 0)
+ goto err;
+ for (j = 0; j < part->nparts - 1; j++) {
+ ks[j].dbp = dbc->dbp;
+ ks[j].compare = compare;
+ ks[j].key = &keys[j];
+ }
+
+ qsort(ks, (size_t)part->nparts - 1,
+ sizeof(struct key_sort), __part_key_cmp);
+ }
+ /*
+ * Walk the bulk buffer; the DBT array lives at the
+ * tail of part->data and its entries point into the
+ * buffer itself, so both share one allocation.
+ */
+ DB_MULTIPLE_INIT(dp, &data);
+ part->keys = (DBT *)
+ ((u_int8_t *)part->data + data.size);
+ j = 0;
+ for (kp = part->keys;
+ kp < &part->keys[part->nparts]; kp++, j++) {
+ DB_MULTIPLE_KEY_NEXT(dp,
+ &data, kp->data, kp->size, dd, ds);
+ if (dp == NULL) {
+ ret = DB_NOTFOUND;
+ break;
+ }
+ if (keys != NULL && j != 0 &&
+ compare(dbc->dbp, ks[j - 1].key, kp) != 0) {
+ if (kp->data == NULL &&
+ F_ISSET(dbp, DB_AM_RECOVER))
+ goto err;
+ __db_errx(env,
+ "Partition key %d does not match", j);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ }
+ }
+ if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER))
+ ret = 0;
+
+err: dbp->p_internal = part;
+ if (ks != NULL)
+ __os_free(env, ks);
+ return (ret);
+}
+
+/*
+ * __partition_get_callback --
+ *	Get the partition callback function.  Returns zero/NULL unless
+ * the database is partitioned by callback.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC:	 u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+ DB *dbp;
+ u_int32_t *parts;
+ u_int32_t (**callback)(DB *, DBT *key);
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using callbacks. */
+ if (part != NULL && !F_ISSET(part, PART_CALLBACK))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ if (callback != NULL)
+ *callback = (part != NULL ? part->callback : NULL);
+
+ return (0);
+}
+
+/*
+ * __partition_get_keys --
+ *	Get partition keys.  Returns zero/NULL unless the database is
+ * partitioned by range.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+ DB *dbp;
+ u_int32_t *parts;
+ DBT **keys;
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using ranges. */
+ if (part != NULL && !F_ISSET(part, PART_RANGE))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ /* Skip keys[0]: it is the implicit empty key for partition 0. */
+ if (keys != NULL)
+ *keys = (part != NULL ? &part->keys[1] : NULL);
+
+ return (0);
+}
+
+/*
+ * __partition_get_dirs --
+ *	Get partition dirs.  Before open, returns whatever the caller
+ * configured; after open, builds (once) a NULL-terminated list from
+ * the per-partition handles.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+ DB *dbp;
+ const char ***dirpp;
+{
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ env = dbp->env;
+ if ((part = dbp->p_internal) == NULL) {
+ *dirpp = NULL;
+ return (0);
+ }
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+ *dirpp = part->dirs;
+ return (0);
+ }
+
+ /*
+ * We build a list once when asked. The original directory list,
+ * if any, was discarded at open time.
+ */
+ if ((*dirpp = part->dirs) != NULL)
+ return (0);
+
+ if ((ret = __os_calloc(env,
+ sizeof(char *), part->nparts + 1, (char **)&part->dirs)) != 0)
+ return (ret);
+
+ for (i = 0; i < part->nparts; i++)
+ part->dirs[i] = part->handles[i]->dirname;
+
+ *dirpp = part->dirs;
+ return (0);
+}
+
+/*
+ * __partc_init --
+ *	Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __partc_init __P((DBC *));
+ */
+int
+__partc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL && (ret =
+ __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+ /*
+ * Initialize methods.  Only get/put/del need partition-aware
+ * handlers; the rest route through the sub-cursor generically.
+ */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __partc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = NULL;
+ dbc->am_close = __partc_close;
+ dbc->am_del = __partc_del;
+ dbc->am_destroy = __partc_destroy;
+ dbc->am_get = NULL;
+ dbc->am_put = __partc_put;
+ dbc->am_writelock = __partc_writelock;
+
+ /* We avoid swapping partition cursors since we swap the sub cursors */
+ F_SET(dbc, DBC_PARTITIONED);
+
+ return (0);
+}
+/*
+ * __partc_get_pp --
+ *	cursor get operation on a partitioned database.
+ * Pre-processing wrapper: argument checking, env entry and
+ * replication lease checking around __partc_get.
+ */
+static int
+__partc_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+ ret = __partc_get(dbc, key, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+/*
+ * __partc_get --
+ *	cursor get operation on a partitioned database.  Picks the
+ * partition for the request, walks to neighboring partitions when a
+ * directional get exhausts one, and swaps the sub-cursor on success.
+ *
+ * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
+ */
+int
+__partc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *orig_dbc, *new_dbc;
+ DB_PARTITION *part;
+ PART_CURSOR *cp;
+ u_int32_t multi, part_id;
+ int ret, retry, search;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ orig_dbc = cp->sub_cursor;
+ part = dbp->p_internal;
+
+ new_dbc = NULL;
+ retry = search = 0;
+ part_id = cp->part_id;
+ /* Preserve DB_MULTIPLE* modifiers across partition switches. */
+ multi = flags & ~DB_OPFLAGS_MASK;
+
+ /*
+ * retry: may continue into the next/previous partition on
+ * DB_NOTFOUND.  search: must locate the starting partition by key.
+ */
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ break;
+ case DB_FIRST:
+ part_id = 0;
+ retry = 1;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ search = 1;
+ break;
+ case DB_SET_RANGE:
+ search = 1;
+ retry = 1;
+ break;
+ case DB_LAST:
+ part_id = part->nparts - 1;
+ retry = 1;
+ break;
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (orig_dbc == NULL)
+ part_id = 0;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_NEXT_DUP:
+ break;
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (orig_dbc == NULL)
+ part_id = part->nparts - 1;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_PREV_DUP:
+ break;
+ case DB_SET:
+ search = 1;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__partc_get", flags));
+ }
+
+ /*
+ * If we need to find the partition to start on, then
+ * do a binary search of the in memory partition table.
+ */
+ if (search == 1 && F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbp, key) % part->nparts;
+ else if (search == 1)
+ __part_search(dbp, part, key, &part_id);
+
+ /* Get a new cursor if necessary */
+ if (orig_dbc == NULL || cp->part_id != part_id) {
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ } else
+ new_dbc = orig_dbc;
+
+ /* On DB_NOTFOUND, step to the adjacent partition and restart. */
+ while ((ret = __dbc_get(new_dbc,
+ key, data, flags)) == DB_NOTFOUND && retry == 1) {
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET_RANGE:
+ if (++part_id < part->nparts) {
+ flags = DB_FIRST | multi;
+ break;
+ }
+ goto err;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (part_id-- > 0) {
+ flags = DB_LAST | multi;
+ break;
+ }
+ goto err;
+ default:
+ goto err;
+ }
+
+ if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /* Success: swap original and new cursors. */
+ if (new_dbc != orig_dbc) {
+ if (orig_dbc != NULL) {
+ cp->sub_cursor = NULL;
+ if ((ret = __dbc_close(orig_dbc)) != 0)
+ goto err;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && new_dbc != orig_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_put --
+ *	cursor put operation on a partitioned cursor.  For key-based
+ * put flags the target partition is chosen from the key; otherwise
+ * the current sub-cursor's partition is used.
+ */
+static int
+__partc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ PART_CURSOR *cp;
+ u_int32_t part_id;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ part_id = cp->part_id;
+ part = dbp->p_internal;
+ *pgnop = PGNO_INVALID;
+
+ switch (flags) {
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ /* Key-based put: route to the partition owning the key. */
+ if (F_ISSET(part, PART_CALLBACK)) {
+ part_id = part->callback(dbp, key) % part->nparts;
+ break;
+ }
+ __part_search(dbp, part, key, &part_id);
+ break;
+ default:
+ break;
+ }
+
+ /* Open a cursor on the target partition if we don't have one. */
+ if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) {
+ if ((ret = __db_cursor_int(part->handles[part_id],
+ dbc->thread_info, dbc->txn, part->handles[part_id]->type,
+ PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ }
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(new_dbc, DBC_WRITER);
+ if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0)
+ goto err;
+
+ /* Success: adopt the new sub-cursor, closing the old one. */
+ if (new_dbc != cp->sub_cursor) {
+ if (cp->sub_cursor != NULL) {
+ if ((ret = __dbc_close(cp->sub_cursor)) != 0)
+ goto err;
+ cp->sub_cursor = NULL;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && cp->sub_cursor != new_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_del
+ *	Delete interface to partitioned cursors.  Forwards to the
+ * current sub-cursor, propagating writer status.
+ */
+static int
+__partc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(cp->sub_cursor, DBC_WRITER);
+ return (__dbc_del(cp->sub_cursor, flags));
+}
+
+/*
+ * __partc_writelock
+ *	Writelock interface to partitioned cursors.  Forwards to the
+ * current sub-cursor's access-method handler.
+ */
+static int
+__partc_writelock(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ return (cp->sub_cursor->am_writelock(cp->sub_cursor));
+}
+
+/*
+ * __partc_close
+ *	Close interface to partitioned cursors.  Closes the sub-cursor
+ * if one is open; root_pgno/rmroot are unused here.
+ */
+static int
+__partc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ PART_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, NULL);
+
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (cp->sub_cursor == NULL)
+ return (0);
+ ret = __dbc_close(cp->sub_cursor);
+ cp->sub_cursor = NULL;
+ return (ret);
+}
+
+/*
+ * __partc_destroy --
+ *	Destroy a single cursor.  Frees only the PART_CURSOR struct;
+ * the sub-cursor is assumed already closed by __partc_close.
+ */
+static int
+__partc_destroy(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ ENV *env;
+
+ cp = (PART_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structure. Don't recurse. */
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __partition_close
+ *	Close a partitioned database: close every partition handle,
+ * then free the handle array, directory list, key buffer and the
+ * DB_PARTITION itself.  Returns the first error seen.
+ *
+ * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__partition_close(dbp, txn, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+
+ if ((part = dbp->p_internal) == NULL)
+ return (0);
+
+ env = dbp->env;
+ ret = 0;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL && (t_ret =
+ __db_close(*pdbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, part->handles);
+ }
+ if (part->dirs != NULL)
+ __os_free(env, (char **)part->dirs);
+ if (part->data != NULL)
+ __os_free(env, (char **)part->data);
+ __os_free(env, part);
+ dbp->p_internal = NULL;
+
+ return (ret);
+}
+
+/*
+ * __partition_sync
+ *	Sync a partitioned database: flush each opened partition's
+ * mpool file, then the master's.  Returns the first error seen.
+ *
+ * PUBLIC: int __partition_sync __P((DB *));
+ */
+int
+__partition_sync(dbp)
+ DB *dbp;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret, t_ret;
+
+ ret = 0;
+ part = dbp->p_internal;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL &&
+ F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret =
+ __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __partition_stat
+ *	Stat a partitioned database: gather per-partition statistics
+ * and aggregate them into a single stat structure returned in *spp
+ * (the first partition's buffer is reused as the accumulator).
+ *
+ * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__partition_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp, **pdbp;
+ DB_BTREE_STAT *fsp, *bsp;
+#ifdef HAVE_HASH
+ DB_HASH_STAT *hfsp, *hsp;
+#endif
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ env = dbp->env;
+ fsp = NULL;
+#ifdef HAVE_HASH
+ hfsp = NULL;
+#endif
+
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ switch (new_dbc->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0)
+ goto err;
+ if (fsp == NULL) {
+ /* First partition: its stats seed the total. */
+ fsp = bsp;
+ *(DB_BTREE_STAT **)spp = fsp;
+ } else {
+ /* Counts sum; tree height takes the maximum. */
+ fsp->bt_nkeys += bsp->bt_nkeys;
+ fsp->bt_ndata += bsp->bt_ndata;
+ fsp->bt_pagecnt += bsp->bt_pagecnt;
+ if (fsp->bt_levels < bsp->bt_levels)
+ fsp->bt_levels = bsp->bt_levels;
+ fsp->bt_int_pg += bsp->bt_int_pg;
+ fsp->bt_leaf_pg += bsp->bt_leaf_pg;
+ fsp->bt_dup_pg += bsp->bt_dup_pg;
+ fsp->bt_over_pg += bsp->bt_over_pg;
+ fsp->bt_free += bsp->bt_free;
+ fsp->bt_int_pgfree += bsp->bt_int_pgfree;
+ fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree;
+ fsp->bt_dup_pgfree += bsp->bt_dup_pgfree;
+ fsp->bt_over_pgfree += bsp->bt_over_pgfree;
+ __os_ufree(env, bsp);
+ }
+ break;
+#ifdef HAVE_HASH
+ case DB_HASH:
+ if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0)
+ goto err;
+ if (hfsp == NULL) {
+ hfsp = hsp;
+ *(DB_HASH_STAT **)spp = hfsp;
+ } else {
+ hfsp->hash_nkeys += hsp->hash_nkeys;
+ hfsp->hash_ndata += hsp->hash_ndata;
+ hfsp->hash_pagecnt += hsp->hash_pagecnt;
+ hfsp->hash_ffactor += hsp->hash_ffactor;
+ hfsp->hash_buckets += hsp->hash_buckets;
+ hfsp->hash_free += hsp->hash_free;
+ hfsp->hash_bfree += hsp->hash_bfree;
+ hfsp->hash_bigpages += hsp->hash_bigpages;
+ hfsp->hash_big_bfree += hsp->hash_big_bfree;
+ hfsp->hash_overflows += hsp->hash_overflows;
+ hfsp->hash_ovfl_free += hsp->hash_ovfl_free;
+ hfsp->hash_dup += hsp->hash_dup;
+ hfsp->hash_dup_free += hsp->hash_dup_free;
+ __os_ufree(env, hsp);
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ }
+ return (0);
+
+err:
+ if (fsp != NULL)
+ __os_ufree(env, fsp);
+ *(DB_BTREE_STAT **)spp = NULL;
+ return (ret);
+}
+
+/*
+ * __part_truncate --
+ *	Truncate a database.  Truncates each partition in turn and
+ * accumulates the per-partition record counts into *countp.
+ *
+ * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__part_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp, **pdbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ u_int32_t count, i;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ if (countp != NULL)
+ *countp = 0;
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ break;
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_truncate(new_dbc, &count);
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ ret = __ham_truncate(new_dbc, &count);
+ break;
+#endif
+ /* Without HAVE_HASH, DB_HASH falls into the error case. */
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(dbp->env,
+ "DB->truncate", dbp->type);
+ count = 0;
+ break;
+ }
+ if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (countp != NULL)
+ *countp += count;
+ }
+
+ return (ret);
+}
+/*
+ * __part_compact -- compact a partitioned database.
+ *	Runs the btree compaction over every partition handle; hash
+ * partitions are only eligible for DB_FREELIST_ONLY.
+ *
+ * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC:     DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__part_compact(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ switch (dbp->type) {
+ case DB_HASH:
+ if (!LF_ISSET(DB_FREELIST_ONLY))
+ goto err;
+ /* FALLTHROUGH */
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_compact(*pdbp,
+ ip, txn, start, stop, c_data, flags, end);
+ break;
+
+ default:
+ /* Report the access-method mismatch as the error. */
+ err: ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __part_lsn_reset --
+ *	reset the lsns on each partition.  Stops at the first failure.
+ *
+ * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
+ */
+int
+__part_lsn_reset(dbp, ip)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++)
+ ret = __db_lsn_reset((*pdbp)->mpf, ip);
+
+ return (ret);
+}
+
+/*
+ * __part_fileid_reset --
+ *	reset the fileid on each partition.  Rebuilds each partition's
+ * file name from fname via PART_NAME and resets its id in turn.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC:	 __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *fname;
+ u_int32_t nparts;
+ int encrypted;
+{
+ int ret;
+ u_int32_t part_id;
+ char *name, *sp;
+ const char *np;
+
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ return (ret);
+ }
+
+ /* Split fname into directory prefix (kept) and base name (np). */
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ for (part_id = 0; ret == 0 && part_id < nparts; part_id++) {
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ ret = __env_fileid_reset(env, ip, sp, encrypted);
+ }
+
+ __os_free(env, name);
+ return (ret);
+}
+#ifndef HAVE_BREW
+/*
+ * __part_key_range --
+ *	Return proportion of keys relative to given key.  Computes the
+ * exact range within the partition holding the key, then scales the
+ * less/equal/greater fractions using root-page entry counts and tree
+ * heights of the other partitions as a population estimate.
+ *
+ * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__part_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *new_dbc;
+ DB_PARTITION *part;
+ PAGE *h;
+ u_int32_t id, part_id;
+ u_int32_t elems, empty, less_elems, my_elems, greater_elems;
+ u_int32_t levels, max_levels, my_levels;
+ int ret;
+ double total_elems;
+
+ COMPQUIET(flags, 0);
+
+ part = dbc->dbp->p_internal;
+
+ /*
+ * First we find the key range for the partition that contains the
+ * key. Then we scale based on estimates of the other partitions.
+ */
+ if (F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbc->dbp, dbt) % part->nparts;
+ else
+ __part_search(dbc->dbp, part, dbt, &part_id);
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+
+ if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0)
+ goto err;
+
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+
+ /* Sample this partition's root: entry count and tree height. */
+ if ((ret = __memp_fget(new_dbc->dbp->mpf,
+ &cp->root, new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ my_elems = NUM_ENT(h);
+ my_levels = LEVEL(h);
+ max_levels = my_levels;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ /*
+ * We have the range within one subtree. Now estimate
+ * what part of the whole range that subtree is. Figure
+ * out how many levels each part has and how many entries
+ * in the level below the root.
+ */
+ empty = less_elems = greater_elems = 0;
+ for (id = 0; id < part->nparts; id++) {
+ if (id == part_id) {
+ empty = 0;
+ continue;
+ }
+ GET_PART_CURSOR(dbc, new_dbc, id);
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+ if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root,
+ new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ elems = NUM_ENT(h);
+ levels = LEVEL(h);
+ /* A leaf root holds key/data pairs: halve to count keys. */
+ if (levels == 1)
+ elems /= 2;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+
+ /* If the tree is empty, ignore it. */
+ if (elems == 0) {
+ empty++;
+ continue;
+ }
+
+ /*
+ * If a tree has fewer levels than the max just count
+ * it as a single element in the higher level.
+ */
+ if (id < part_id) {
+ if (levels > max_levels) {
+ max_levels = levels;
+ less_elems = id + elems - empty;
+ } else if (levels < max_levels)
+ less_elems++;
+ else
+ less_elems += elems;
+ } else {
+ if (levels > max_levels) {
+ max_levels = levels;
+ greater_elems = (id - part_id) + elems - empty;
+ } else if (levels < max_levels)
+ greater_elems++;
+ else
+ greater_elems += elems;
+ }
+
+ }
+
+ if (my_levels < max_levels) {
+ /*
+ * The subtree containing the key is not the tallest one.
+ * Reduce its share by the number of records at the highest
+ * level. Scale the greater and lesser components up
+ * by the number of records on either side of this
+ * subtree.
+ */
+ total_elems = 1 + greater_elems + less_elems;
+ kp->equal /= total_elems;
+ kp->less /= total_elems;
+ kp->less += less_elems/total_elems;
+ kp->greater /= total_elems;
+ kp->greater += greater_elems/total_elems;
+ } else if (my_levels == max_levels) {
+ /*
+ * The key is in one of the tallest subtrees. We will
+ * scale the values by the ratio of the records at the
+ * top of this subtree to the number of records at the
+ * highest level.
+ */
+ total_elems = greater_elems + less_elems;
+ if (total_elems != 0) {
+ /*
+ * First scale down by the fraction of elements
+ * in this subtree.
+ */
+ total_elems += my_elems;
+ kp->equal *= my_elems;
+ kp->equal /= total_elems;
+ kp->less *= my_elems;
+ kp->less /= total_elems;
+ kp->greater *= my_elems;
+ kp->greater /= total_elems;
+ /*
+ * Proportionally add weight from the subtrees to the
+ * left and right of this one.
+ */
+ kp->less += less_elems / total_elems;
+ kp->greater += greater_elems / total_elems;
+ }
+ }
+
+ if (0) {
+c_err: (void)__dbc_close(new_dbc);
+ }
+
+err: return (ret);
+}
+#endif
+
+/*
+ * __part_remove --
+ *	Remove method for a partitioned database.  Thin wrapper over
+ * the shared remove/rename worker (newname == NULL selects remove).
+ *
+ * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC:      DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__part_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags));
+}
+
+/*
+ * __part_rename --
+ *	Rename method for a partitioned database.  Thin wrapper over
+ * the shared remove/rename worker (non-NULL newname selects rename).
+ *
+ * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC:      DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__part_rename(dbp, ip, txn, name, subdb, newname)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, newname, 0));
+}
+
+/*
+ * __part_rr --
+ *	Remove/Rename method for a partitioned database.  Opens a
+ * temporary master handle to learn the partition names, then removes
+ * (newname == NULL) or renames each partition file.
+ */
+static int
+__part_rr(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB **pdbp, *ptmpdbp, *tmpdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+ char *np;
+
+ env = dbp->env;
+ ret = 0;
+
+ if (subdb != NULL && name != NULL) {
+ __db_errx(env,
+ "A partitioned database can not be in a multiple databases file");
+ return (EINVAL);
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+ * Since rename no longer opens the database, we have
+ * to do it here.
+ */
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to make sure we don't self-deadlock, so give
+ * this dbp the same locker as the incoming one.
+ */
+ tmpdbp->locker = dbp->locker;
+ if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type,
+ DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ part = tmpdbp->p_internal;
+ pdbp = part->handles;
+ COMPQUIET(np, NULL);
+ if (newname != NULL && (ret = __os_malloc(env,
+ strlen(newname) + PART_LEN + 1, &np)) != 0) {
+ __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1);
+ goto err;
+ }
+ /* Remove/rename each partition file with a short-lived handle. */
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0)
+ break;
+ ptmpdbp->locker = (*pdbp)->locker;
+ if (newname == NULL)
+ ret = __db_remove_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, flags);
+ else {
+ DB_ASSERT(env, np != NULL);
+ (void)sprintf(np, PART_NAME, newname, i);
+ ret = __db_rename_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, np);
+ }
+ ptmpdbp->locker = NULL;
+ (void)__db_close(ptmpdbp, NULL, DB_NOSYNC);
+ if (ret != 0)
+ break;
+ }
+
+ if (newname != NULL)
+ __os_free(env, np);
+
+ /*
+ * NOTE(review): when DB_AM_OPEN_CALLED is set on the caller's dbp,
+ * tmpdbp is not closed on this path -- looks like a handle leak;
+ * confirm against upstream before changing.
+ */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+err: /*
+ * Since we copied the locker ID from the dbp, we'd better not
+ * free it here.
+ */
+ tmpdbp->locker = NULL;
+
+ /* We need to remove the lock event we associated with this. */
+ if (txn != NULL)
+ __txn_remlock(env,
+ txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ if ((t_ret = __db_close(tmpdbp,
+ txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+#ifdef HAVE_VERIFY
+/*
+ * __part_verify --
+ *	Verify a partitioned database.
+ *
+ *	Verifies the master database and then each partition in turn.
+ *	For range-partitioned databases each child verify pass is handed
+ *	a pair of synthetic BINTERNAL boundary keys (lp, rp) bounding the
+ *	keys that partition may legally contain.
+ *
+ * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *,
+ * PUBLIC:     void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__part_verify(dbp, vdp, fname, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *fname;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	BINTERNAL *lp, *rp;
+	DB **pdbp;
+	DB_PARTITION *part;
+	DBC *dbc;
+	DBT *key;
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t i;
+	int ret, t_ret;
+
+	env = dbp->env;
+	/* lp/rp are the heap-allocated range-boundary keys, built below. */
+	lp = rp = NULL;
+	dbc = NULL;
+	ip = vdp->thread_info;
+
+	/* Open the master database's access-method information. */
+	if (dbp->type == DB_BTREE) {
+		if ((ret = __bam_open(dbp, ip,
+		    NULL, fname, PGNO_BASE_MD, flags)) != 0)
+			goto err;
+	}
+#ifdef HAVE_HASH
+	else if ((ret = __ham_open(dbp, ip,
+	    NULL, fname, PGNO_BASE_MD, flags)) != 0)
+		goto err;
+#endif
+
+	/*
+	 * Initialize partition db handles and get the names. Set DB_RDWRMASTER
+	 * because we may not have the partition callback, but we can still
+	 * look at the structure of the tree.
+	 */
+	if ((ret = __partition_open(dbp,
+	    ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0)
+		goto err;
+	part = dbp->p_internal;
+
+	if (LF_ISSET(DB_SALVAGE)) {
+		/* If we are being aggressive we don't want to dump the keys. */
+		if (LF_ISSET(DB_AGGRESSIVE))
+			dbp->p_internal = NULL;
+		ret = __db_prheader(dbp,
+		    NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD);
+		dbp->p_internal = part;
+		if (ret != 0)
+			goto err;
+	}
+
+	/*
+	 * NOTE(review): dbc is opened but never explicitly closed in this
+	 * function -- presumably released when dbp is closed; confirm.
+	 */
+	if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+		goto err;
+
+	/*
+	 * Verify each partition.  lp inherits the previous iteration's
+	 * right boundary; rp is rebuilt from the next partition key.
+	 */
+	pdbp = part->handles;
+	for (i = 0; i < part->nparts; i++, pdbp++) {
+		if (!F_ISSET(part, PART_RANGE) || part->keys == NULL)
+			goto vrfy;
+		if (lp != NULL)
+			__os_free(env, lp);
+		lp = rp;
+		rp = NULL;
+		if (i + 1 < part->nparts) {
+			key = &part->keys[i + 1];
+			if ((ret = __os_malloc(env,
+			    BINTERNAL_SIZE(key->size), &rp)) != 0)
+				goto err;
+			rp->len = key->size;
+			memcpy(rp->data, key->data, key->size);
+			B_TSET(rp->type, B_KEYDATA);
+		}
+vrfy:		if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname,
+		    NULL, handle, callback,
+		    lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+err:	if (lp != NULL)
+		__os_free(env, lp);
+	if (rp != NULL)
+		__os_free(env, rp);
+	return (ret);
+}
+#endif
+
+#ifdef CONFIG_TEST
+/*
+ * __part_testdocopy -- copy all partitions for testing purposes.
+ *
+ * PUBLIC: int __part_testdocopy __P((DB *, const char *));
+ */
+int
+__part_testdocopy(dbp, name)
+	DB *dbp;
+	const char *name;
+{
+	DB_PARTITION *part;
+	DB **pdbp;
+	u_int32_t i;
+	int ret;
+
+	/* Copy the master database file first. */
+	if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+		return (ret);
+
+	/* Then copy each partition's backing file. */
+	part = dbp->p_internal;
+	for (i = 0, pdbp = part->handles; i < part->nparts; i++, pdbp++)
+		if ((ret =
+		    __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0)
+			return (ret);
+
+	return (0);
+}
+#endif
+#else
+/*
+ * __db_no_partition --
+ *	Error when a Berkeley DB build doesn't include partitioning.
+ *
+ *	This #else section supplies stubs for builds configured without
+ *	partition support; every partition entry point reports
+ *	DB_OPNOTSUP through this function.
+ *
+ * PUBLIC: int __db_no_partition __P((ENV *));
+ */
+int
+__db_no_partition(env)
+	ENV *env;
+{
+	__db_errx(env,
+    "library build did not include support for the database partitioning");
+	return (DB_OPNOTSUP);
+}
+/*
+ * __partition_set --
+ *	Set the partitioning keys or callback function.
+ *	This routine must be called prior to creating the database.
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC:	u_int32_t (*callback)(DB *, DBT *key)));
+ */
+int
+__partition_set(dbp, parts, keys, callback)
+	DB *dbp;
+	u_int32_t parts;
+	DBT *keys;
+	u_int32_t (*callback)(DB *, DBT *key);
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(callback, NULL);
+	COMPQUIET(keys, NULL);
+	COMPQUIET(parts, 0);
+
+	return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_callback --
+ *	Set the partition callback function. This routine must be called
+ *	prior to opening a partition database that requires a function.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC:	u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+	DB *dbp;
+	u_int32_t *parts;
+	u_int32_t (**callback)(DB *, DBT *key);
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(callback, NULL);
+	COMPQUIET(parts, NULL);
+
+	return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_dirs --
+ *	Get partition dirs.
+ *	Stub used when partition support is not compiled in; always
+ *	fails via __db_no_partition.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+	DB *dbp;
+	const char ***dirpp;
+{
+	/* Quiet the unused parameter. */
+	COMPQUIET(dirpp, NULL);
+	return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_keys --
+ *	Get partition keys.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+	DB *dbp;
+	u_int32_t *parts;
+	DBT **keys;
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(keys, NULL);
+	COMPQUIET(parts, NULL);
+
+	return (__db_no_partition(dbp->env));
+}
+/*
+ * __partition_init --
+ *	Initialize the partition structure.
+ *	Called when the meta data page is read in during database open or
+ *	when partition keys or a callback are set.
+ *	Stub used when partition support is not compiled in; always
+ *	fails via __db_no_partition.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	/* Quiet the unused parameter. */
+	COMPQUIET(flags, 0);
+
+	return (__db_no_partition(dbp->env));
+}
+/*
+ * __part_fileid_reset --
+ *	reset the fileid on each partition.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC:	__P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	const char *fname;
+	u_int32_t nparts;
+	int encrypted;
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(encrypted, 0);
+	COMPQUIET(nparts, 0);
+	COMPQUIET(fname, NULL);
+	COMPQUIET(ip, NULL);
+
+	return (__db_no_partition(env));
+}
+/*
+ * __partition_set_dirs --
+ *	Set the directories for creating the partition databases.
+ *	They must be in the environment.
+ *	Stub used when partition support is not compiled in; always
+ *	fails via __db_no_partition.
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+	DB *dbp;
+	const char **dirp;
+{
+	/* Quiet the unused parameter. */
+	COMPQUIET(dirp, NULL);
+
+	return (__db_no_partition(dbp->env));
+}
+#endif