summaryrefslogtreecommitdiff
path: root/db
diff options
context:
space:
mode:
authorKim Kibum <kb0929.kim@samsung.com>2012-05-21 17:40:46 +0900
committerKim Kibum <kb0929.kim@samsung.com>2012-05-21 17:40:46 +0900
commit2e082c838d2ca750f5daac6dcdabecc22dfd4e46 (patch)
tree01c1dd87d4cc0b62a655c0d768ff695d2d244728 /db
parenta86e3ca152fb414b376e64c449c201d762e414dd (diff)
downloaddb4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.gz
db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.bz2
db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.zip
Upload Tizen:Base source
Diffstat (limited to 'db')
-rw-r--r--db/crdel.src72
-rw-r--r--db/crdel_auto.c945
-rw-r--r--db/crdel_autop.c227
-rw-r--r--db/crdel_rec.c298
-rw-r--r--db/db.c1539
-rw-r--r--db/db.src328
-rw-r--r--db/db_am.c1015
-rw-r--r--db/db_auto.c3267
-rw-r--r--db/db_autop.c802
-rw-r--r--db/db_cam.c3460
-rw-r--r--db/db_cds.c177
-rw-r--r--db/db_conv.c733
-rw-r--r--db/db_dispatch.c953
-rw-r--r--db/db_dup.c203
-rw-r--r--db/db_iface.c2817
-rw-r--r--db/db_join.c940
-rw-r--r--db/db_meta.c1299
-rw-r--r--db/db_method.c1052
-rw-r--r--db/db_open.c628
-rw-r--r--db/db_overflow.c706
-rw-r--r--db/db_ovfl_vrfy.c409
-rw-r--r--db/db_pr.c1659
-rw-r--r--db/db_rec.c1859
-rw-r--r--db/db_reclaim.c246
-rw-r--r--db/db_remove.c492
-rw-r--r--db/db_rename.c372
-rw-r--r--db/db_ret.c156
-rw-r--r--db/db_setid.c213
-rw-r--r--db/db_setlsn.c137
-rw-r--r--db/db_sort_multiple.c287
-rw-r--r--db/db_stati.c494
-rw-r--r--db/db_truncate.c225
-rw-r--r--db/db_upg.c510
-rw-r--r--db/db_upg_opd.c343
-rw-r--r--db/db_vrfy.c2894
-rw-r--r--db/db_vrfy_stub.c117
-rw-r--r--db/db_vrfyutil.c916
-rw-r--r--db/partition.c2048
38 files changed, 34838 insertions, 0 deletions
diff --git a/db/crdel.src b/db/crdel.src
new file mode 100644
index 0000000..cd0b02f
--- /dev/null
+++ b/db/crdel.src
@@ -0,0 +1,72 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __crdel
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * Metasub: log the creation of a subdatabase meta data page.
+ *
+ * fileid: identifies the file being acted upon.
+ * pgno: page number on which to write this meta-data page
+ * page: the actual meta-data page
+ * lsn: lsn of the page.
+ */
+BEGIN metasub 42 142
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT page DBT s
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * Inmem_create: Log the creation of an in-memory database.
+ *
+ * name: Name of the database
+ * fid: File id of the database
+ */
+BEGIN inmem_create 44 138
+ARG fileid int32_t ld
+DBT name DBT s
+DBT fid DBT s
+ARG pgsize u_int32_t lu
+END
+
+/*
+ * Inmem_rename: Log the renaming of an in-memory only database.
+ *
+ * oldname: database's starting name
+ * newname: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_rename 44 139
+DBT oldname DBT s
+DBT newname DBT s
+DBT fid DBT s
+END
+
+/*
+ * Inmem_remove: Log the removal of an in-memory only database.
+ *
+ * name: database's ending name
+ * fid: fileid
+ */
+BEGIN inmem_remove 44 140
+DBT name DBT s
+DBT fid DBT s
+END
+
diff --git a/db/crdel_auto.c b/db/crdel_auto.c
new file mode 100644
index 0000000..801a0a5
--- /dev/null
+++ b/db/crdel_auto.c
@@ -0,0 +1,945 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __crdel_metasub_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __crdel_metasub_args **));
+ */
+int
+__crdel_metasub_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __crdel_metasub_args **argpp;
+{
+ __crdel_metasub_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_metasub_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->page, 0, sizeof(argp->page));
+ LOGCOPY_32(env,&argp->page.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->page.data = bp;
+ bp += argp->page.size;
+ if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+ int t_ret;
+ if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->page.data,
+ (size_t)argp->page.size, NULL, 1)) != 0)
+ return (t_ret);
+ }
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_metasub_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, const DBT *, DB_LSN *));
+ */
+int
+__crdel_metasub_log(dbp, txnp, ret_lsnp, flags, pgno, page, lsn)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ const DBT *page;
+ DB_LSN * lsn;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_metasub;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (page == NULL ? 0 : page->size)
+ + sizeof(*lsn);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (page == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &page->size);
+ bp += sizeof(page->size);
+ memcpy(bp, page->data, page->size);
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)page->size, (DBT *)NULL, 0)) != 0)
+ return (ret);
+ bp += page->size;
+ }
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_metasub_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_read __P((ENV *, void *,
+ * PUBLIC: __crdel_inmem_create_args **));
+ */
+int
+__crdel_inmem_create_read(env, recbuf, argpp)
+ ENV *env;
+ void *recbuf;
+ __crdel_inmem_create_args **argpp;
+{
+ __crdel_inmem_create_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_inmem_create_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->name, 0, sizeof(argp->name));
+ LOGCOPY_32(env,&argp->name.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->name.data = bp;
+ bp += argp->name.size;
+
+ memset(&argp->fid, 0, sizeof(argp->fid));
+ LOGCOPY_32(env,&argp->fid.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->fid.data = bp;
+ bp += argp->fid.size;
+
+ LOGCOPY_32(env, &argp->pgsize, bp);
+ bp += sizeof(argp->pgsize);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_log __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_LSN *, u_int32_t, int32_t, const DBT *, const DBT *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__crdel_inmem_create_log(env, txnp, ret_lsnp, flags,
+ fileid, name, fid, pgsize)
+ ENV *env;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ const DBT *name;
+ const DBT *fid;
+ u_int32_t pgsize;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_inmem_create;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (name == NULL ? 0 : name->size)
+ + sizeof(u_int32_t) + (fid == NULL ? 0 : fid->size)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)fileid;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (name == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &name->size);
+ bp += sizeof(name->size);
+ memcpy(bp, name->data, name->size);
+ bp += name->size;
+ }
+
+ if (fid == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &fid->size);
+ bp += sizeof(fid->size);
+ memcpy(bp, fid->data, fid->size);
+ bp += fid->size;
+ }
+
+ LOGCOPY_32(env, bp, &pgsize);
+ bp += sizeof(pgsize);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_inmem_create_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_read __P((ENV *, void *,
+ * PUBLIC: __crdel_inmem_rename_args **));
+ */
+int
+__crdel_inmem_rename_read(env, recbuf, argpp)
+ ENV *env;
+ void *recbuf;
+ __crdel_inmem_rename_args **argpp;
+{
+ __crdel_inmem_rename_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_inmem_rename_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ memset(&argp->oldname, 0, sizeof(argp->oldname));
+ LOGCOPY_32(env,&argp->oldname.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->oldname.data = bp;
+ bp += argp->oldname.size;
+
+ memset(&argp->newname, 0, sizeof(argp->newname));
+ LOGCOPY_32(env,&argp->newname.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->newname.data = bp;
+ bp += argp->newname.size;
+
+ memset(&argp->fid, 0, sizeof(argp->fid));
+ LOGCOPY_32(env,&argp->fid.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->fid.data = bp;
+ bp += argp->fid.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_log __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_LSN *, u_int32_t, const DBT *, const DBT *, const DBT *));
+ */
+int
+__crdel_inmem_rename_log(env, txnp, ret_lsnp, flags,
+ oldname, newname, fid)
+ ENV *env;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ const DBT *oldname;
+ const DBT *newname;
+ const DBT *fid;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ u_int32_t zero, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_inmem_rename;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t) + (oldname == NULL ? 0 : oldname->size)
+ + sizeof(u_int32_t) + (newname == NULL ? 0 : newname->size)
+ + sizeof(u_int32_t) + (fid == NULL ? 0 : fid->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ if (oldname == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &oldname->size);
+ bp += sizeof(oldname->size);
+ memcpy(bp, oldname->data, oldname->size);
+ bp += oldname->size;
+ }
+
+ if (newname == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &newname->size);
+ bp += sizeof(newname->size);
+ memcpy(bp, newname->data, newname->size);
+ bp += newname->size;
+ }
+
+ if (fid == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &fid->size);
+ bp += sizeof(fid->size);
+ memcpy(bp, fid->data, fid->size);
+ bp += fid->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_inmem_rename_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_read __P((ENV *, void *,
+ * PUBLIC: __crdel_inmem_remove_args **));
+ */
+int
+__crdel_inmem_remove_read(env, recbuf, argpp)
+ ENV *env;
+ void *recbuf;
+ __crdel_inmem_remove_args **argpp;
+{
+ __crdel_inmem_remove_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__crdel_inmem_remove_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ memset(&argp->name, 0, sizeof(argp->name));
+ LOGCOPY_32(env,&argp->name.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->name.data = bp;
+ bp += argp->name.size;
+
+ memset(&argp->fid, 0, sizeof(argp->fid));
+ LOGCOPY_32(env,&argp->fid.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->fid.data = bp;
+ bp += argp->fid.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_log __P((ENV *, DB_TXN *,
+ * PUBLIC: DB_LSN *, u_int32_t, const DBT *, const DBT *));
+ */
+int
+__crdel_inmem_remove_log(env, txnp, ret_lsnp, flags,
+ name, fid)
+ ENV *env;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ const DBT *name;
+ const DBT *fid;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ u_int32_t zero, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ rlsnp = ret_lsnp;
+ rectype = DB___crdel_inmem_remove;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t) + (name == NULL ? 0 : name->size)
+ + sizeof(u_int32_t) + (fid == NULL ? 0 : fid->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ if (name == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &name->size);
+ bp += sizeof(name->size);
+ memcpy(bp, name->data, name->size);
+ bp += name->size;
+ }
+
+ if (fid == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &fid->size);
+ bp += sizeof(fid->size);
+ memcpy(bp, fid->data, fid->size);
+ bp += fid->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__crdel_inmem_remove_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __crdel_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_recover, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_recover, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_recover, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_recover, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/crdel_autop.c b/db/crdel_autop.c
new file mode 100644
index 0000000..6bf4bb6
--- /dev/null
+++ b/db/crdel_autop.c
@@ -0,0 +1,227 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__crdel_metasub_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_metasub_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __crdel_metasub_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_metasub%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tpage: ");
+ for (i = 0; i < argp->page.size; i++) {
+ ch = ((u_int8_t *)argp->page.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_create_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_inmem_create_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_inmem_create%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tname: ");
+ for (i = 0; i < argp->name.size; i++) {
+ ch = ((u_int8_t *)argp->name.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfid: ");
+ for (i = 0; i < argp->fid.size; i++) {
+ ch = ((u_int8_t *)argp->fid.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpgsize: %lu\n", (u_long)argp->pgsize);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_rename_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_inmem_rename_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_inmem_rename%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\toldname: ");
+ for (i = 0; i < argp->oldname.size; i++) {
+ ch = ((u_int8_t *)argp->oldname.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnewname: ");
+ for (i = 0; i < argp->newname.size; i++) {
+ ch = ((u_int8_t *)argp->newname.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfid: ");
+ for (i = 0; i < argp->fid.size; i++) {
+ ch = ((u_int8_t *)argp->fid.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_inmem_remove_print __P((ENV *, DBT *,
+ * PUBLIC: DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __crdel_inmem_remove_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__crdel_inmem_remove%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tname: ");
+ for (i = 0; i < argp->name.size; i++) {
+ ch = ((u_int8_t *)argp->name.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfid: ");
+ for (i = 0; i < argp->fid.size; i++) {
+ ch = ((u_int8_t *)argp->fid.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __crdel_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__crdel_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_metasub_print, DB___crdel_metasub)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_create_print, DB___crdel_inmem_create)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_rename_print, DB___crdel_inmem_rename)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __crdel_inmem_remove_print, DB___crdel_inmem_remove)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/crdel_rec.c b/db/crdel_rec.c
new file mode 100644
index 0000000..285b965
--- /dev/null
+++ b/db/crdel_rec.c
@@ -0,0 +1,298 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+/*
+ * __crdel_metasub_recover --
+ *	Recovery function for metasub.
+ *
+ *	Redo re-applies the full page image of a subdatabase meta-data
+ *	page saved in the log record; undo only restores the page's LSN,
+ *	since __db_pg_alloc's own recovery frees the page itself.
+ *
+ * PUBLIC: int __crdel_metasub_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_metasub_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_metasub_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_p, ret, t_ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__crdel_metasub_print);
+	/* REC_INTRO decodes argp and opens file_dbp, mpf and dbc. */
+	REC_INTRO(__crdel_metasub_read, ip, 0);
+
+	/*
+	 * If we are undoing this operation, but the DB that we got back
+	 * was never really opened, then this open was an in-memory open
+	 * that did not finish.  We can let the file creation take care
+	 * of any necessary undo/cleanup.
+	 */
+	if (DB_UNDO(op) && !F_ISSET(file_dbp, DB_AM_OPEN_CALLED))
+		goto done;
+
+	if ((ret = __memp_fget(mpf, &argp->pgno,
+	    ip, NULL, 0, &pagep)) != 0) {
+		/* If this is an in-memory file, this might be OK. */
+		if (F_ISSET(file_dbp, DB_AM_INMEM) &&
+		    (ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+		    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) == 0) {
+			LSN_NOT_LOGGED(LSN(pagep));
+		} else {
+			/*
+			 * The page is gone: nothing to apply, just move
+			 * recovery past this record and report success.
+			 */
+			*lsnp = argp->prev_lsn;
+			ret = 0;
+			goto out;
+		}
+	}
+
+	/* Compare the on-page LSN against the one saved in the record. */
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Re-apply the logged full page image. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->page.data, argp->page.size);
+		LSN(pagep) = *lsnp;
+
+		/*
+		 * If this was an in-memory database and we are re-creating
+		 * and this is the meta-data page, then we need to set up a
+		 * bunch of fields in the dbo as well.
+		 */
+		if (F_ISSET(file_dbp, DB_AM_INMEM) &&
+		    argp->pgno == PGNO_BASE_MD &&
+		    (ret = __db_meta_setup(file_dbp->env, file_dbp,
+		    file_dbp->dname, (DBMETA *)pagep, 0, DB_CHK_META)) != 0)
+			goto out;
+	} else if (DB_UNDO(op)) {
+		/*
+		 * We want to undo this page creation.  The page creation
+		 * happened in two parts.  First, we called __db_pg_alloc which
+		 * was logged separately.  Then we wrote the meta-data onto
+		 * the page.  So long as we restore the LSN, then the recovery
+		 * for __db_pg_alloc will do everything else.
+		 *
+		 * Don't bother checking the lsn on the page.  If we are
+		 * rolling back the next thing is that this page will get
+		 * freed.  Opening the subdb will have reinitialized the
+		 * page, but not the lsn.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		LSN(pagep) = argp->lsn;
+	}
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL && (t_ret = __memp_fput(mpf,
+	    ip, pagep, file_dbp->priority)) != 0 &&
+	    ret == 0)
+		ret = t_ret;
+
+	/* REC_CLOSE releases dbc and returns ret. */
+	REC_CLOSE;
+}
+
+/*
+ * __crdel_inmem_create_recover --
+ *	Recovery function for inmem_create.
+ *
+ *	Redo re-creates the named in-memory database in the memory pool
+ *	(constructing a fresh handle for a temp file that has no dbreg
+ *	id); undo removes the file from the pool.
+ *
+ * PUBLIC: int __crdel_inmem_create_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_create_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_inmem_create_args *argp;
+	DB *dbp;
+	int do_close, ret, t_ret;
+
+	COMPQUIET(info, NULL);
+
+	dbp = NULL;
+	do_close = 0;
+	REC_PRINT(__crdel_inmem_create_print);
+	REC_NOOP_INTRO(__crdel_inmem_create_read);
+
+	/* First, see if the DB handle already exists. */
+	if (argp->fileid == DB_LOGFILEID_INVALID) {
+		if (DB_REDO(op))
+			ret = ENOENT;
+		else
+			ret = 0;
+	} else
+		ret = __dbreg_id_to_db(env, argp->txnp, &dbp, argp->fileid, 0);
+
+	if (DB_REDO(op)) {
+		/*
+		 * If the dbreg failed, that means that we're creating a
+		 * tmp file.
+		 */
+		if (ret != 0) {
+			if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+				goto out;
+
+			F_SET(dbp, DB_AM_RECOVER | DB_AM_INMEM);
+			memcpy(dbp->fileid, argp->fid.data, DB_FILE_ID_LEN);
+			if (((ret = __os_strdup(env,
+			    argp->name.data, &dbp->dname)) != 0))
+				goto out;
+
+			/*
+			 * This DBP is never going to be entered into the
+			 * dbentry table, so if we leave it open here,
+			 * then we're going to lose it.
+			 */
+			do_close = 1;
+		}
+
+		/* Now, set the fileid. */
+		memcpy(dbp->fileid, argp->fid.data, argp->fid.size);
+		if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
+			goto out;
+		dbp->preserve_fid = 1;
+		MAKE_INMEM(dbp);
+		if ((ret = __env_setup(dbp,
+		    NULL, NULL, argp->name.data, TXN_INVALID, 0)) != 0)
+			goto out;
+		ret = __env_mpool(dbp, argp->name.data, 0);
+
+		/* ENOENT means the file must be (re)created in the pool. */
+		if (ret == ENOENT) {
+			dbp->pgsize = argp->pgsize;
+			if ((ret = __env_mpool(dbp,
+			    argp->name.data, DB_CREATE)) != 0)
+				goto out;
+		} else if (ret != 0)
+			goto out;
+	}
+
+	if (DB_UNDO(op)) {
+		/* Remove the in-memory file from the pool, if present. */
+		if (ret == 0)
+			ret = __memp_nameop(env, argp->fid.data, NULL,
+			    (const char *)argp->name.data, NULL, 1);
+
+		/*
+		 * A missing or already-deleted file is fine on undo.
+		 * NOTE(review): when the nameop succeeds (returns 0) the
+		 * else branch jumps to out without setting *lsnp --
+		 * presumably __memp_nameop reports ENOENT/DB_DELETED in
+		 * the common undo path; confirm this is intended.
+		 */
+		if (ret == ENOENT || ret == DB_DELETED)
+			ret = 0;
+		else
+			goto out;
+	}
+
+	*lsnp = argp->prev_lsn;
+
+out:	if (dbp != NULL) {
+		t_ret = 0;
+
+		/* Close the handle if it is temporary or on any error. */
+		if (do_close || ret != 0)
+			t_ret = __db_close(dbp, NULL, DB_NOSYNC);
+		if (t_ret != 0 && ret == 0)
+			ret = t_ret;
+	}
+	/* REC_NOOP_CLOSE frees argp and returns ret. */
+	REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_rename_recover --
+ *	Recovery function for inmem_rename: redo re-applies the rename,
+ *	undo renames the file back to its original name.
+ *
+ * PUBLIC: int __crdel_inmem_rename_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_rename_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_inmem_rename_args *argp;
+	const char *oldname, *newname;
+	u_int8_t *fileid;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__crdel_inmem_rename_print);
+	REC_NOOP_INTRO(__crdel_inmem_rename_read);
+	fileid = argp->fid.data;
+	oldname = argp->oldname.data;
+	newname = argp->newname.data;
+
+	/*
+	 * Ignore __memp_nameop's return: the files may or may not still
+	 * exist, and either state is acceptable during recovery.
+	 */
+	if (DB_REDO(op))
+		(void)__memp_nameop(env,
+		    fileid, newname, oldname, newname, 1);
+
+	if (DB_UNDO(op))
+		(void)__memp_nameop(env,
+		    fileid, oldname, newname, oldname, 1);
+
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	REC_NOOP_CLOSE;
+}
+
+/*
+ * __crdel_inmem_remove_recover --
+ *	Recovery function for inmem_remove.
+ *
+ * PUBLIC: int __crdel_inmem_remove_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__crdel_inmem_remove_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__crdel_inmem_remove_args *argp;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__crdel_inmem_remove_print);
+	REC_NOOP_INTRO(__crdel_inmem_remove_read);
+
+	/*
+	 * Removes are delayed, so there is no undo for a remove, only
+	 * redo.  The remove itself may fail, which is OK; ignore it.
+	 */
+	if (DB_REDO(op))
+		(void)__memp_nameop(env,
+		    argp->fid.data, NULL, argp->name.data, NULL, 1);
+
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	REC_NOOP_CLOSE;
+}
diff --git a/db/db.c b/db/db.c
new file mode 100644
index 0000000..9caa1aa
--- /dev/null
+++ b/db/db.c
@@ -0,0 +1,1539 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_disassociate __P((DB *));
+static int __db_disassociate_foreign __P ((DB *));
+
+#ifdef CONFIG_TEST
+static int __db_makecopy __P((ENV *, const char *, const char *));
+static int __qam_testdocopy __P((DB *, const char *));
+#endif
+
+/*
+ * DB.C --
+ * This file contains the utility functions for the DBP layer.
+ */
+
+/*
+ * __db_master_open --
+ *	Open up a handle on a master database.
+ *
+ * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **));
+ */
+int
+__db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
+	DB *subdbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name;
+	u_int32_t flags;
+	int mode;
+	DB **dbpp;
+{
+	DB *dbp;
+	int ret;
+
+	*dbpp = NULL;
+
+	/* Create the handle for the master (outer) database. */
+	if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
+		return (ret);
+
+	/*
+	 * The master database is always a btree, runs in the caller's
+	 * transaction and inherits the caller's page size (in case we
+	 * create the file).  Flag it as containing subdatabases and
+	 * propagate the handle flags that must agree between the two.
+	 */
+	dbp->pgsize = subdbp->pgsize;
+	F_SET(dbp, DB_AM_SUBDB);
+	F_SET(dbp, F_ISSET(subdbp,
+	    DB_AM_RECOVER | DB_AM_SWAP |
+	    DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
+
+	/*
+	 * DB_EXCL applies to the subdatabase only, not the physical
+	 * file: we only got here because a subdatabase was specified.
+	 */
+	LF_CLR(DB_EXCL);
+	LF_SET(DB_RDWRMASTER);
+	if ((ret = __db_open(dbp, ip,
+	    txn, name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0) {
+		/* Destroy the handle unless the open marked it discarded. */
+		if (!F_ISSET(dbp, DB_AM_DISCARD))
+			(void)__db_close(dbp, txn, 0);
+		return (ret);
+	}
+
+	/*
+	 * Checksum and encryption settings were validated when the
+	 * meta-page was read; the only adjustment needed here is to
+	 * turn checksumming on in the subdatabase handle if the
+	 * meta-page enabled it.
+	 */
+	if (F_ISSET(dbp, DB_AM_CHKSUM))
+		F_SET(subdbp, DB_AM_CHKSUM);
+
+	/* An existing file's page size overrides the user's request. */
+	subdbp->pgsize = dbp->pgsize;
+	*dbpp = dbp;
+
+	return (0);
+}
+
+/*
+ * __db_master_update --
+ *	Add/Open/Remove a subdatabase from a master database.
+ *
+ *	action selects the operation: MU_REMOVE deletes the entry and
+ *	frees the subdatabase's meta page (and btree root, if any);
+ *	MU_RENAME moves the entry to newname; MU_OPEN looks up -- or,
+ *	with DB_CREATE, allocates -- the subdatabase's meta page,
+ *	returned in sdbp->meta_pgno.
+ *
+ * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: const char *, DBTYPE, mu_action, const char *, u_int32_t));
+ */
+/*
+ * NOTE(review): the old-style declarations below list txn before ip;
+ * the parameter order is fixed by the identifier list, so this is
+ * legal, merely inconsistent with the prototype above.
+ */
+int
+__db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags)
+	DB *mdbp, *sdbp;
+	DB_TXN *txn;
+	DB_THREAD_INFO *ip;
+	const char *subdb;
+	DBTYPE type;
+	mu_action action;
+	const char *newname;
+	u_int32_t flags;
+{
+	DBC *dbc, *ndbc;
+	DBT key, data, ndata;
+	ENV *env;
+	PAGE *p, *r;
+	db_pgno_t t_pgno;
+	int modify, ret, t_ret;
+
+	env = mdbp->env;
+	dbc = ndbc = NULL;
+	p = NULL;
+
+	/*
+	 * Open up a cursor.  If this is CDB and we're creating the database,
+	 * make it an update cursor.
+	 *
+	 * Might we modify the master database?  If so, we'll need to lock.
+	 */
+	modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0;
+
+	if ((ret = __db_cursor(mdbp, ip, txn, &dbc,
+	    (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0)
+		return (ret);
+
+	/*
+	 * Point the cursor at the record.
+	 *
+	 * If we're removing or potentially creating an entry, lock the page
+	 * with DB_RMW.
+	 *
+	 * We do multiple cursor operations with the cursor in some cases and
+	 * subsequently access the data DBT information.  Set DB_DBT_MALLOC so
+	 * we don't risk modification of the data between our uses of it.
+	 *
+	 * !!!
+	 * We don't include the name's nul termination in the database.
+	 */
+	DB_INIT_DBT(key, subdb, strlen(subdb));
+	memset(&data, 0, sizeof(data));
+	F_SET(&data, DB_DBT_MALLOC);
+
+	ret = __dbc_get(dbc, &key, &data,
+	    DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
+
+	/*
+	 * What we do next--whether or not we found a record for the
+	 * specified subdatabase--depends on what the specified action is.
+	 * Handle ret appropriately as the first statement of each case.
+	 */
+	switch (action) {
+	case MU_REMOVE:
+		/*
+		 * We should have found something if we're removing it.  Note
+		 * that in the common case where the DB we're asking to remove
+		 * doesn't exist, we won't get this far; __db_subdb_remove
+		 * will already have returned an error from __db_open.
+		 */
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * Delete the subdatabase entry first; if this fails,
+		 * we don't want to touch the actual subdb pages.
+		 */
+		if ((ret = __dbc_del(dbc, 0)) != 0)
+			goto err;
+
+		/*
+		 * We're handling actual data, not on-page meta-data,
+		 * so it hasn't been converted to/from opposite
+		 * endian architectures.  Do it explicitly, now.
+		 */
+		memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+		DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+		if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno,
+		    ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0)
+			goto err;
+
+		/* Free the root on the master db if it was created. */
+		if (TYPE(p) == P_BTREEMETA &&
+		    ((BTMETA *)p)->root != PGNO_INVALID) {
+			if ((ret = __memp_fget(mdbp->mpf,
+			    &((BTMETA *)p)->root, ip, dbc->txn,
+			    DB_MPOOL_DIRTY, &r)) != 0)
+				goto err;
+
+			/* Free and put the page. */
+			if ((ret = __db_free(dbc, r)) != 0) {
+				/* __db_free discarded r even on failure. */
+				r = NULL;
+				goto err;
+			}
+		}
+		/* Free and put the page. */
+		if ((ret = __db_free(dbc, p)) != 0) {
+			p = NULL;
+			goto err;
+		}
+		p = NULL;
+		break;
+	case MU_RENAME:
+		/* We should have found something if we're renaming it. */
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * Before we rename, we need to make sure we're not
+		 * overwriting another subdatabase, or else this operation
+		 * won't be undoable.  Open a second cursor and check
+		 * for the existence of newname; it shouldn't appear under
+		 * us since we hold the metadata lock.
+		 */
+		if ((ret = __db_cursor(mdbp, ip, txn, &ndbc,
+		    CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+			goto err;
+		DB_SET_DBT(key, newname, strlen(newname));
+
+		/*
+		 * We don't actually care what the meta page of the potentially-
+		 * overwritten DB is; we just care about existence.
+		 */
+		memset(&ndata, 0, sizeof(ndata));
+		F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+		if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) {
+			/* A subdb called newname exists.  Bail. */
+			ret = EEXIST;
+			__db_errx(env, "rename: database %s exists", newname);
+			goto err;
+		} else if (ret != DB_NOTFOUND)
+			goto err;
+
+		/*
+		 * Now do the put first; we don't want to lose our only
+		 * reference to the subdb.  Use the second cursor so the
+		 * first one continues to point to the old record.
+		 */
+		if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0)
+			goto err;
+		if ((ret = __dbc_del(dbc, 0)) != 0) {
+			/*
+			 * If the delete fails, try to delete the record
+			 * we just put, in case we're not txn-protected.
+			 */
+			(void)__dbc_del(ndbc, 0);
+			goto err;
+		}
+
+		break;
+	case MU_OPEN:
+		/*
+		 * Get the subdatabase information.  If it already exists,
+		 * copy out the page number and we're done.
+		 */
+		switch (ret) {
+		case 0:
+			if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
+				ret = EEXIST;
+				goto err;
+			}
+			memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
+			DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
+			goto done;
+		case DB_NOTFOUND:
+			if (LF_ISSET(DB_CREATE))
+				break;
+			/*
+			 * No db_err, it is reasonable to remove a
+			 * nonexistent db.
+			 */
+			ret = ENOENT;
+			goto err;
+		default:
+			goto err;
+		}
+
+		/* Create a subdatabase. */
+		if ((ret = __db_new(dbc,
+		    type == DB_HASH ? P_HASHMETA : P_BTREEMETA, NULL, &p)) != 0)
+			goto err;
+		sdbp->meta_pgno = PGNO(p);
+
+		/*
+		 * XXX
+		 * We're handling actual data, not on-page meta-data, so it
+		 * hasn't been converted to/from opposite endian architectures.
+		 * Do it explicitly, now.
+		 */
+		t_pgno = PGNO(p);
+		DB_HTONL_SWAP(env, &t_pgno);
+		memset(&ndata, 0, sizeof(ndata));
+		ndata.data = &t_pgno;
+		ndata.size = sizeof(db_pgno_t);
+		if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
+			goto err;
+		F_SET(sdbp, DB_AM_CREATED);
+		break;
+	}
+
+err:
+done:	/*
+	 * If we allocated a page: if we're successful, mark the page dirty
+	 * and return it to the cache, otherwise, discard/free it.
+	 */
+	if (p != NULL && (t_ret = __memp_fput(mdbp->mpf,
+	    dbc->thread_info, p, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the cursor(s) and data. */
+	if (data.data != NULL)
+		__os_ufree(env, data.data);
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __env_setup --
+ *	Set up the underlying environment during a db_open: create a
+ *	private environment on demand, join the cache, allocate the
+ *	handle mutex, register with the log region, and insert the
+ *	handle into the environment's DB list.
+ *
+ * PUBLIC: int __env_setup __P((DB *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
+ */
+int
+__env_setup(dbp, txn, fname, dname, id, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	u_int32_t id, flags;
+{
+	DB *ldbp;
+	DB_ENV *dbenv;
+	ENV *env;
+	u_int32_t maxid;
+	int ret;
+
+	env = dbp->env;
+	dbenv = env->dbenv;
+
+	/* If we don't yet have an environment, it's time to create it. */
+	if (!F_ISSET(env, ENV_OPEN_CALLED)) {
+		/* Make sure we have at least DB_MINCACHE pages in our cache. */
+		if (dbenv->mp_gbytes == 0 &&
+		    dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE &&
+		    (ret = __memp_set_cachesize(
+		    dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0)
+			return (ret);
+
+		/* Create a private, mpool-only environment on the fly. */
+		if ((ret = __env_open(dbenv, NULL, DB_CREATE |
+		    DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0)
+			return (ret);
+	}
+
+	/* Join the underlying cache. */
+	if ((!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) &&
+	    (ret = __env_mpool(dbp, fname, flags)) != 0)
+		return (ret);
+
+	/* We may need a per-thread mutex. */
+	if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc(
+	    env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0)
+		return (ret);
+
+	/*
+	 * Set up a bookkeeping entry for this database in the log region,
+	 * if such a region exists.  Note that even if we're in recovery
+	 * or a replication client, where we won't log registries, we'll
+	 * still need an FNAME struct, so LOGGING_ON is the correct macro.
+	 *
+	 * The #if clauses below skip the registration for read-only or
+	 * non-transactional handles only in production builds; debug
+	 * builds register everything.
+	 */
+	if (LOGGING_ON(env) && dbp->log_filename == NULL
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+	    && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER))
+#endif
+#if !defined(DEBUG_ROP)
+	    && !F_ISSET(dbp, DB_AM_RDONLY)
+#endif
+	    ) {
+		if ((ret = __dbreg_setup(dbp,
+		    F_ISSET(dbp, DB_AM_INMEM) ? dname : fname,
+		    F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0)
+			return (ret);
+
+		/*
+		 * If we're actively logging and our caller isn't a
+		 * recovery function that already did so, then assign
+		 * this dbp a log fileid.
+		 */
+		if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+		    (ret = __dbreg_new_id(dbp, txn)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * Insert ourselves into the ENV's dblist.  We allocate a
+	 * unique ID to each {fileid, meta page number} pair, and to
+	 * each temporary file (since they all have a zero fileid).
+	 * This ID gives us something to use to tell which DB handles
+	 * go with which databases in all the cursor adjustment
+	 * routines, where we don't want to do a lot of ugly and
+	 * expensive memcmps.
+	 */
+	MUTEX_LOCK(env, env->mtx_dblist);
+	maxid = 0;
+	TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) {
+		/*
+		 * There are three cases: on-disk database (first clause),
+		 * named in-memory database (second clause), temporary database
+		 * (never matches; no clause).
+		 */
+		if (!F_ISSET(dbp, DB_AM_INMEM)) {
+			if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN)
+			    == 0 && ldbp->meta_pgno == dbp->meta_pgno)
+				break;
+		} else if (dname != NULL) {
+			if (F_ISSET(ldbp, DB_AM_INMEM) &&
+			    ldbp->dname != NULL &&
+			    strcmp(ldbp->dname, dname) == 0)
+				break;
+		}
+		if (ldbp->adj_fileid > maxid)
+			maxid = ldbp->adj_fileid;
+	}
+
+	/*
+	 * If ldbp is NULL, we didn't find a match.  Assign the dbp an
+	 * adj_fileid one higher than the largest we found, and
+	 * insert it at the head of the master dbp list.
+	 *
+	 * If ldbp is not NULL, it is a match for our dbp.  Give dbp
+	 * the same ID that ldbp has, and add it after ldbp so they're
+	 * together in the list.
+	 */
+	if (ldbp == NULL) {
+		dbp->adj_fileid = maxid + 1;
+		TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks);
+	} else {
+		dbp->adj_fileid = ldbp->adj_fileid;
+		TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks);
+	}
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+
+	return (0);
+}
+
+/*
+ * __env_mpool --
+ *	Set up the underlying environment cache during a db_open:
+ *	configure the DB_MPOOLFILE (ftype, clear length, LSN offset,
+ *	page cookie) from the access-method type and open it.
+ *
+ * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t));
+ */
+int
+__env_mpool(dbp, fname, flags)
+	DB *dbp;
+	const char *fname;
+	u_int32_t flags;
+{
+	DBT pgcookie;
+	DB_MPOOLFILE *mpf;
+	DB_PGINFO pginfo;
+	ENV *env;
+	int fidset, ftype, ret;
+	int32_t lsn_off;
+	u_int8_t nullfid[DB_FILE_ID_LEN];
+	u_int32_t clear_len;
+
+	env = dbp->env;
+
+	/* The LSN is the first entry on a DB page, byte offset 0. */
+	lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0;
+
+	/* It's possible that this database is already open. */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		return (0);
+
+	/*
+	 * If we need to pre- or post-process a file's pages on I/O, set the
+	 * file type.  If it's a hash file, always call the pgin and pgout
+	 * routines.  This means that hash files can never be mapped into
+	 * process memory.  If it's a btree file and requires swapping, we
+	 * need to page the file in and out.  This has to be right -- we can't
+	 * mmap files that are being paged in and out.
+	 */
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
+		    ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
+		clear_len = CRYPTO_ON(env) ?
+		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+		    DB_PAGE_DB_LEN;
+		break;
+	case DB_HASH:
+		ftype = DB_FTYPE_SET;
+		clear_len = CRYPTO_ON(env) ?
+		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
+		    DB_PAGE_DB_LEN;
+		break;
+	case DB_QUEUE:
+		ftype = F_ISSET(dbp,
+		    DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ?
+		    DB_FTYPE_SET : DB_FTYPE_NOTSET;
+
+		/*
+		 * If we came in here without a pagesize set, then we need
+		 * to mark the in-memory handle as having clear_len not
+		 * set, because we don't really know the clear length or
+		 * the page size yet (since the file doesn't yet exist).
+		 */
+		clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET;
+		break;
+	case DB_UNKNOWN:
+		/*
+		 * If we're running in the verifier, our database might
+		 * be corrupt and we might not know its type--but we may
+		 * still want to be able to verify and salvage.
+		 *
+		 * If we can't identify the type, it's not going to be safe
+		 * to call __db_pgin--we pretty much have to give up all
+		 * hope of salvaging cross-endianness.  Proceed anyway;
+		 * at worst, the database will just appear more corrupt
+		 * than it actually is, but at best, we may be able
+		 * to salvage some data even with no metadata page.
+		 */
+		if (F_ISSET(dbp, DB_AM_VERIFYING)) {
+			ftype = DB_FTYPE_NOTSET;
+			clear_len = DB_PAGE_DB_LEN;
+			break;
+		}
+
+		/*
+		 * This might be an in-memory file and we won't know its
+		 * file type until after we open it and read the meta-data
+		 * page.
+		 */
+		if (F_ISSET(dbp, DB_AM_INMEM)) {
+			clear_len = DB_CLEARLEN_NOTSET;
+			ftype = DB_FTYPE_NOTSET;
+			lsn_off = DB_LSN_OFF_NOTSET;
+			break;
+		}
+		/* FALLTHROUGH */
+	default:
+		return (__db_unknown_type(env, "DB->open", dbp->type));
+	}
+
+	mpf = dbp->mpf;
+
+	/* Push a pre-set file id (if any) down into the mpool file. */
+	memset(nullfid, 0, DB_FILE_ID_LEN);
+	fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN);
+	if (fidset)
+		(void)__memp_set_fileid(mpf, dbp->fileid);
+
+	(void)__memp_set_clear_len(mpf, clear_len);
+	(void)__memp_set_ftype(mpf, ftype);
+	(void)__memp_set_lsn_offset(mpf, lsn_off);
+
+	/* The page cookie tells the pgin/pgout routines how to convert. */
+	pginfo.db_pagesize = dbp->pgsize;
+	pginfo.flags =
+	    F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+	pginfo.type = dbp->type;
+	pgcookie.data = &pginfo;
+	pgcookie.size = sizeof(DB_PGINFO);
+	(void)__memp_set_pgcookie(mpf, &pgcookie);
+
+	/*
+	 * Normally the DB_ENV_MULTIVERSION flag gates MVCC; when
+	 * DIAG_MVCC is defined the gate is compiled out, so every
+	 * transactional non-queue database is opened multi-version.
+	 */
+#ifndef DIAG_MVCC
+	if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
+#endif
+		if (F_ISSET(dbp, DB_AM_TXN) &&
+		    dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
+			LF_SET(DB_MULTIVERSION);
+
+	if ((ret = __memp_fopen(mpf, NULL, fname, &dbp->dirname,
+	    LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION |
+	    DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) |
+	    (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) |
+	    (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0),
+	    0, dbp->pgsize)) != 0) {
+		/*
+		 * The open didn't work; we need to reset the mpf,
+		 * retaining the in-memory semantics (if any).
+		 */
+		(void)__memp_fclose(dbp->mpf, 0);
+		(void)__memp_fcreate(env, &dbp->mpf);
+		if (F_ISSET(dbp, DB_AM_INMEM))
+			MAKE_INMEM(dbp);
+		return (ret);
+	}
+
+	/*
+	 * Set the open flag.  We use it to mean that the dbp has gone
+	 * through mpf setup, including dbreg_register.  Also, below,
+	 * the underlying access method open functions may want to do
+	 * things like acquire cursors, so the open flag has to be set
+	 * before calling them.
+	 */
+	F_SET(dbp, DB_AM_OPEN_CALLED);
+	if (!fidset && fname != NULL) {
+		(void)__memp_get_fileid(dbp->mpf, dbp->fileid);
+		dbp->preserve_fid = 1;
+	}
+
+	return (0);
+}
+
+/*
+ * __db_close --
+ *	DB->close method: release the handle's resources, drop the
+ *	environment reference, and destroy the handle itself.
+ *
+ * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__db_close(dbp, txn, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	u_int32_t flags;
+{
+	ENV *env;
+	int db_ref, deferred, ret, t_ret;
+
+	env = dbp->env;
+	deferred = ret = 0;
+
+	/*
+	 * As a handle destructor we cannot fail on validation, so check
+	 * for consistent transaction usage but discard any error.  Only
+	 * internal callers supply transactions; a message here would
+	 * indicate a serious internal problem.
+	 */
+	if (txn != NULL)
+		(void)__db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0);
+
+	/* Release everything the handle holds open. */
+	ret = __db_refresh(dbp, txn, flags, &deferred, 0);
+
+	/*
+	 * If logging the close failed, the close was deferred: report
+	 * the failure right away and keep the handle alive.
+	 */
+	if (deferred)
+		return (ret);
+
+	/* !!!
+	 * There is an apparent race between reading/decrementing
+	 * env->db_ref and testing it for zero.  However, with a DBLOCAL
+	 * environment the user holds no reference to the env handle;
+	 * multiple dbps can only share a local env via internal opens
+	 * (e.g. subdatabases).  Anyone closing the original dbp while
+	 * that is in flight is already badly broken, so we don't
+	 * engineer around the possibility.
+	 */
+	MUTEX_LOCK(env, env->mtx_dblist);
+	db_ref = --env->db_ref;
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+	if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 &&
+	    (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Scribble on the handle to catch stale uses, then free it. */
+	memset(dbp, CLEAR_BYTE, sizeof(*dbp));
+	__os_free(env, dbp);
+
+	return (ret);
+}
+
+/*
+ * __db_refresh --
+ * Refresh the DB structure, releasing any allocated resources.
+ * This does most of the work of closing files now because refresh
+ * is what is used during abort processing (since we can't destroy
+ * the actual handle) and during abort processing, we may have a
+ * fully opened handle.
+ *
+ * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
+ */
+int
+__db_refresh(dbp, txn, flags, deferred_closep, reuse)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+ int *deferred_closep, reuse;
+{
+ DB *sdbp;
+ DBC *dbc;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ REGENV *renv;
+ REGINFO *infop;
+ u_int32_t save_flags;
+ int resync, ret, t_ret;
+
+ ret = 0;
+
+ env = dbp->env;
+ infop = env->reginfo;
+ if (infop != NULL)
+ renv = infop->primary;
+ else
+ renv = NULL;
+
+ /*
+ * If this dbp is not completely open, avoid trapping by trying to
+ * sync without an mpool file.
+ */
+ if (dbp->mpf == NULL)
+ LF_SET(DB_NOSYNC);
+
+ /* If never opened, or not currently open, it's easy. */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
+ goto never_opened;
+
+ /*
+ * If we have any secondary indices, disassociate them from us.
+ * We don't bother with the mutex here; it only protects some
+ * of the ops that will make us core-dump mid-close anyway, and
+ * if you're trying to do something with a secondary *while* you're
+ * closing the primary, you deserve what you get. The disassociation
+ * is mostly done just so we can close primaries and secondaries in
+ * any order--but within one thread of control.
+ */
+ LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) {
+ LIST_REMOVE(sdbp, s_links);
+ if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ /*
+ * Disassociate ourself from any databases using us as a foreign key
+ * database by clearing the referring db's pointer. Reclaim memory.
+ */
+ f_info = LIST_FIRST(&dbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ LIST_REMOVE(f_info, f_links);
+ f_info->dbp->s_foreign = NULL;
+ __os_free(env, f_info);
+ f_info = tmp;
+ }
+
+ if (dbp->s_foreign != NULL &&
+ (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Sync the underlying access method. Do before closing the cursors
+ * because DB->sync allocates cursors in order to write Recno backing
+ * source text files.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (!LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __db_sync(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Go through the active cursors and call the cursor recycle routine,
+ * which resolves pending operations and moves the cursors onto the
+ * free list. Then, walk the free list and call the cursor destroy
+ * routine. Note that any failure on a close is considered "really
+ * bad" and we just break out of the loop and force forward.
+ */
+ resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1;
+ while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Close any outstanding join cursors. Join cursors destroy themselves
+ * on close and have no separate destroy routine. We don't have to set
+ * the resync flag here, because join cursors aren't write cursors.
+ */
+ while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
+ if ((t_ret = __db_join_close(dbc)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ break;
+ }
+
+ /*
+ * Sync the memory pool, even though we've already called DB->sync,
+ * because closing cursors can dirty pages by deleting items they
+ * referenced.
+ *
+ * Sync is slow on some systems, notably Solaris filesystems where the
+ * entire buffer cache is searched. If we're in recovery, don't flush
+ * the file, it's not necessary.
+ */
+ if (resync && !LF_ISSET(DB_NOSYNC) &&
+ !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
+ (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+never_opened:
+ /*
+ * At this point, we haven't done anything to render the DB handle
+ * unusable, at least by a transaction abort. Take the opportunity
+ * now to log the file close if we have initialized the logging
+ * information. If this log fails and we're in a transaction,
+ * we have to bail out of the attempted close; we'll need a dbp in
+ * order to successfully abort the transaction, and we can't conjure
+ * a new one up because we haven't gotten out the dbreg_register
+ * record that represents the close. In this case, we put off
+ * actually closing the dbp until we've performed the abort.
+ */
+ if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) {
+ /*
+ * Discard the log file id, if any. We want to log the close
+ * if and only if this is not a recovery dbp or a client dbp,
+ * or a dead dbp handle.
+ */
+ DB_ASSERT(env, renv != NULL);
+ if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) ||
+ dbp->timestamp != renv->rep_timestamp) {
+ if ((t_ret = __dbreg_revoke_id(dbp,
+ 0, DB_LOGFILEID_INVALID)) == 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if ((t_ret = __dbreg_close_id(dbp,
+ txn, DBREG_CLOSE)) != 0 && txn != NULL) {
+ /*
+ * We're in a txn and the attempt to log the
+ * close failed; let the txn subsystem know
+ * that we need to destroy this dbp once we're
+ * done with the abort, then bail from the
+ * close.
+ *
+ * Note that if the attempt to put off the
+ * close -also- fails--which it won't unless
+ * we're out of heap memory--we're really
+ * screwed. Panic.
+ */
+ if ((ret =
+ __txn_closeevent(env, txn, dbp)) != 0)
+ return (__env_panic(env, ret));
+ if (deferred_closep != NULL)
+ *deferred_closep = 1;
+ return (t_ret);
+ }
+ /*
+ * If dbreg_close_id failed and we were not in a
+ * transaction, then we need to finish this close
+ * because the caller can't do anything with the
+ * handle after we return an error. We rely on
+ * dbreg_close_id to mark the entry in some manner
+ * so that we do not do a clean shutdown of this
+ * environment. If shutdown isn't clean, then the
+ * application *must* run recovery and that will
+ * generate the RCLOSE record.
+ */
+ }
+
+ }
+
+ /* Close any handle we've been holding since the open. */
+ if (dbp->saved_open_fhp != NULL &&
+ (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /*
+ * Remove this DB handle from the ENV's dblist, if it's been added.
+ *
+ * Close our reference to the underlying cache while locked, we don't
+ * want to race with a thread searching for our underlying cache link
+ * while opening a DB handle.
+ *
+ * The DB handle may not yet have been added to the ENV list, don't
+ * blindly call the underlying TAILQ_REMOVE macro. Explicitly reset
+ * the field values to NULL so that we can't call TAILQ_REMOVE twice.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ if (!reuse &&
+ (dbp->dblistlinks.tqe_next != NULL ||
+ dbp->dblistlinks.tqe_prev != NULL)) {
+ TAILQ_REMOVE(&env->dblist, dbp, dblistlinks);
+ dbp->dblistlinks.tqe_next = NULL;
+ dbp->dblistlinks.tqe_prev = NULL;
+ }
+
+ /* Close the memory pool file handle. */
+ if (dbp->mpf != NULL) {
+ if ((t_ret = __memp_fclose(dbp->mpf,
+ F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ dbp->mpf = NULL;
+ if (reuse &&
+ (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Call the access specific close function.
+ *
+ * We do this here rather than in __db_close as we need to do this when
+ * aborting an open so that file descriptors are closed and abort of
+ * renames can succeed on platforms that lock open files (such as
+ * Windows). In particular, we need to ensure that all the extents
+ * associated with a queue are closed so that queue renames can be
+ * aborted.
+ *
+ * It is also important that we do this before releasing the handle
+ * lock, because dbremove and dbrename assume that once they have the
+ * handle lock, it is safe to modify the underlying file(s).
+ *
+ * !!!
+ * Because of where these functions are called in the DB handle close
+ * process, these routines can't do anything that would dirty pages or
+ * otherwise affect closing down the database. Specifically, we can't
+ * abort and recover any of the information they control.
+ */
+#ifdef HAVE_PARTITION
+ if (dbp->p_internal != NULL &&
+ (t_ret = __partition_close(dbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+#endif
+ if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * !!!
+ * At this point, the access-method specific information has been
+ * freed. From now on, we can use the dbp, but not touch any
+ * access-method specific data.
+ */
+
+ if (!reuse && dbp->locker != NULL) {
+ /* We may have pending trade operations on this dbp. */
+ if (txn == NULL)
+ txn = dbp->cur_txn;
+ if (IS_REAL_TXN(txn))
+ __txn_remlock(env,
+ txn, &dbp->handle_lock, dbp->locker);
+
+ /* We may be holding the handle lock; release it. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ if ((t_ret = __lock_vec(env,
+ dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret =
+ __lock_id_free(env, dbp->locker)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp->locker = NULL;
+ LOCK_INIT(dbp->handle_lock);
+ }
+
+ /*
+ * If this is a temporary file (un-named in-memory file), then
+ * discard the locker ID allocated as the fileid.
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid &&
+ *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) {
+ if ((t_ret = __lock_getlocker(env->lk_handle,
+ *(u_int32_t *)dbp->fileid, 0, &locker)) == 0)
+ t_ret = __lock_id_free(env, locker);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ if (reuse) {
+ /*
+ * If we are reusing this dbp, then we're done now. Re-init
+ * the handle, preserving important flags, and then return.
+ * This code is borrowed from __db_init, which does more
+ * than we can do here.
+ */
+ save_flags = F_ISSET(dbp, DB_AM_INMEM | DB_AM_TXN);
+
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+ /* Restore flags */
+ dbp->flags = dbp->orig_flags | save_flags;
+
+ if (FLD_ISSET(save_flags, DB_AM_INMEM)) {
+ /*
+ * If this is inmem, then it may have a fileid
+ * even if it was never opened, and we need to
+ * clear out that fileid.
+ */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ MAKE_INMEM(dbp);
+ }
+ return (ret);
+ }
+
+ dbp->type = DB_UNKNOWN;
+
+ /*
+ * The thread mutex may have been invalidated in __dbreg_close_id if the
+ * fname refcount did not go to 0. If not, discard the thread mutex.
+ */
+ if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard any memory allocated for the file and database names. */
+ if (dbp->fname != NULL) {
+ __os_free(dbp->env, dbp->fname);
+ dbp->fname = NULL;
+ }
+ if (dbp->dname != NULL) {
+ __os_free(dbp->env, dbp->dname);
+ dbp->dname = NULL;
+ }
+
+ /* Discard any memory used to store returned data. */
+ if (dbp->my_rskey.data != NULL)
+ __os_free(dbp->env, dbp->my_rskey.data);
+ if (dbp->my_rkey.data != NULL)
+ __os_free(dbp->env, dbp->my_rkey.data);
+ if (dbp->my_rdata.data != NULL)
+ __os_free(dbp->env, dbp->my_rdata.data);
+
+ /* For safety's sake; we may refresh twice. */
+ memset(&dbp->my_rskey, 0, sizeof(DBT));
+ memset(&dbp->my_rkey, 0, sizeof(DBT));
+ memset(&dbp->my_rdata, 0, sizeof(DBT));
+
+ /* Clear out fields that normally get set during open. */
+ memset(dbp->fileid, 0, sizeof(dbp->fileid));
+ dbp->adj_fileid = 0;
+ dbp->meta_pgno = 0;
+ dbp->cur_locker = NULL;
+ dbp->cur_txn = NULL;
+ dbp->associate_locker = NULL;
+ dbp->cl_id = 0;
+ dbp->open_flags = 0;
+
+ /*
+ * If we are being refreshed with a txn specified, then we need
+ * to make sure that we clear out the lock handle field, because
+ * releasing all the locks for this transaction will release this
+ * lock and we don't want close to stumble upon this handle and
+ * try to close it.
+ */
+ if (txn != NULL)
+ LOCK_INIT(dbp->handle_lock);
+
+ /* Reset flags to whatever the user configured. */
+ dbp->flags = dbp->orig_flags;
+
+ return (ret);
+}
+
+/*
+ * __db_disassociate --
+ * Destroy the association between a given secondary and its primary.
+ */
+static int
+__db_disassociate(sdbp)
+ DB *sdbp;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ ret = 0;
+
+ sdbp->s_callback = NULL;
+ sdbp->s_primary = NULL;
+ sdbp->get = sdbp->stored_get;
+ sdbp->close = sdbp->stored_close;
+
+ /*
+ * Complain, but proceed, if we have any active cursors. (We're in
+ * the middle of a close, so there's really no turning back.)
+ */
+ if (sdbp->s_refcnt != 1 ||
+ TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+ TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+ __db_errx(sdbp->env,
+ "Closing a primary DB while a secondary DB has active cursors is unsafe");
+ ret = EINVAL;
+ }
+ sdbp->s_refcnt = 0;
+
+ while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+ if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(sdbp, DB_AM_SECONDARY);
+ return (ret);
+}
+
+/*
+ * __db_disassociate_foreign --
+ * Destroy the association between a given secondary and its foreign.
+ */
+static int
+__db_disassociate_foreign(sdbp)
+ DB *sdbp;
+{
+ DB *fdbp;
+ DB_FOREIGN_INFO *f_info, *tmp;
+ int ret;
+
+ if (sdbp->s_foreign == NULL)
+ return (0);
+ if ((ret = __os_malloc(sdbp->env, sizeof(DB_FOREIGN_INFO), &tmp)) != 0)
+ return (ret);
+
+ fdbp = sdbp->s_foreign;
+ ret = 0;
+ f_info = LIST_FIRST(&fdbp->f_primaries);
+ while (f_info != NULL) {
+ tmp = LIST_NEXT(f_info, f_links);
+ if (f_info ->dbp == sdbp) {
+ LIST_REMOVE(f_info, f_links);
+ __os_free(sdbp->env, f_info);
+ }
+ f_info = tmp;
+ }
+
+ return (ret);
+}
+
+/*
+ * __db_log_page
+ * Log a meta-data or root page during a subdatabase create operation.
+ *
+ * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
+ */
+int
+__db_log_page(dbp, txn, lsn, pgno, page)
+ DB *dbp;
+ DB_TXN *txn;
+ DB_LSN *lsn;
+ db_pgno_t pgno;
+ PAGE *page;
+{
+ DBT page_dbt;
+ DB_LSN new_lsn;
+ int ret;
+
+ if (!LOGGING_ON(dbp->env) || txn == NULL)
+ return (0);
+
+ memset(&page_dbt, 0, sizeof(page_dbt));
+ page_dbt.size = dbp->pgsize;
+ page_dbt.data = page;
+
+ ret = __crdel_metasub_log(dbp, txn, &new_lsn, 0, pgno, &page_dbt, lsn);
+
+ if (ret == 0)
+ page->lsn = new_lsn;
+ return (ret);
+}
+
+/*
+ * __db_backup_name
+ * Create the backup file name for a given file.
+ *
+ * PUBLIC: int __db_backup_name __P((ENV *,
+ * PUBLIC: const char *, DB_TXN *, char **));
+ */
+#undef BACKUP_PREFIX
+#define BACKUP_PREFIX "__db."
+
+#undef MAX_INT_TO_HEX
+#define MAX_INT_TO_HEX 8
+
+int
+__db_backup_name(env, name, txn, backup)
+ ENV *env;
+ const char *name;
+ DB_TXN *txn;
+ char **backup;
+{
+ u_int32_t id;
+ size_t len;
+ int ret;
+ char *p, *retp;
+
+ *backup = NULL;
+
+ /*
+ * Part of the name may be a full path, so we need to make sure that
+ * we allocate enough space for it, even in the case where we don't
+ * use the entire filename for the backup name.
+ */
+ len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1;
+ if ((ret = __os_malloc(env, len, &retp)) != 0)
+ return (ret);
+
+ /*
+ * Create the name. Backup file names are in one of 2 forms: in a
+ * transactional env "__db.TXNID.ID", where ID is a random number,
+ * and in any other env "__db.FILENAME".
+ *
+ * In addition, the name passed may contain an env-relative path.
+ * In that case, put the "__db." in the right place (in the last
+ * component of the pathname).
+ *
+ * There are four cases here:
+ * 1. simple path w/out transaction
+ * 2. simple path + transaction
+ * 3. multi-component path w/out transaction
+ * 4. multi-component path + transaction
+ */
+ p = __db_rpath(name);
+ if (IS_REAL_TXN(txn)) {
+ __os_unique_id(env, &id);
+ if (p == NULL) /* Case 2. */
+ snprintf(retp, len, "%s%x.%x",
+ BACKUP_PREFIX, txn->txnid, id);
+ else /* Case 4. */
+ snprintf(retp, len, "%.*s%x.%x",
+ (int)(p - name) + 1, name, txn->txnid, id);
+ } else {
+ if (p == NULL) /* Case 1. */
+ snprintf(retp, len, "%s%s", BACKUP_PREFIX, name);
+ else /* Case 3. */
+ snprintf(retp, len, "%.*s%s%s",
+ (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1);
+ }
+
+ *backup = retp;
+ return (0);
+}
+
+#ifdef CONFIG_TEST
+/*
+ * __db_testcopy
+ * Create a copy of all backup files and our "main" DB.
+ *
+ * PUBLIC: #ifdef CONFIG_TEST
+ * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *));
+ * PUBLIC: #endif
+ */
+int
+__db_testcopy(env, dbp, name)
+ ENV *env;
+ DB *dbp;
+ const char *name;
+{
+ DB_MPOOL *dbmp;
+ DB_MPOOLFILE *mpf;
+
+ DB_ASSERT(env, dbp != NULL || name != NULL);
+
+ if (name == NULL) {
+ dbmp = env->mp_handle;
+ mpf = dbp->mpf;
+ name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off);
+ }
+
+ if (dbp != NULL && dbp->type == DB_QUEUE)
+ return (__qam_testdocopy(dbp, name));
+ else
+#ifdef HAVE_PARTITION
+ if (dbp != NULL && DB_IS_PARTITIONED(dbp))
+ return (__part_testdocopy(dbp, name));
+ else
+#endif
+ return (__db_testdocopy(env, name));
+}
+
+static int
+__qam_testdocopy(dbp, name)
+ DB *dbp;
+ const char *name;
+{
+ DB_THREAD_INFO *ip;
+ QUEUE_FILELIST *filelist, *fp;
+ int ret;
+ char buf[DB_MAXPATHLEN], *dir;
+
+ filelist = NULL;
+ if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+ return (ret);
+
+ /* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */
+ ENV_GET_THREAD_INFO(dbp->env, ip);
+ if (dbp->mpf != NULL &&
+ (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
+ goto done;
+
+ if (filelist == NULL)
+ return (0);
+ dir = ((QUEUE *)dbp->q_internal)->dir;
+ for (fp = filelist; fp->mpf != NULL; fp++) {
+ snprintf(buf, sizeof(buf),
+ QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
+ if ((ret = __db_testdocopy(dbp->env, buf)) != 0)
+ return (ret);
+ }
+
+done: __os_free(dbp->env, filelist);
+ return (0);
+}
+
/*
 * __db_testdocopy
 *	Create a copy of all backup files and our "main" DB.
 *
 *	Copies the named file to "<real_name>.afterop", then scans the
 *	file's directory for transactional backup files (names of the form
 *	BACKUP_PREFIX.TXNID.ID, both components hex) and copies each of
 *	those to "<backup>.afterop" as well.
 * PUBLIC: int __db_testdocopy __P((ENV *, const char *));
 */
int
__db_testdocopy(env, name)
	ENV *env;
	const char *name;
{
	size_t len;
	int dircnt, i, ret;
	char *copy, **namesp, *p, *real_name;

	/* NULL/0 so the err: cleanup below is safe from any goto. */
	dircnt = 0;
	copy = NULL;
	namesp = NULL;

	/* Create the real backing file name. */
	if ((ret = __db_appname(env,
	    DB_APP_DATA, name, NULL, &real_name)) != 0)
		return (ret);

	/*
	 * !!!
	 * There are tests that attempt to copy non-existent files. I'd guess
	 * it's a testing bug, but I don't have time to figure it out. Block
	 * the case here.
	 */
	if (__os_exists(env, real_name, NULL) != 0) {
		__os_free(env, real_name);
		return (0);
	}

	/*
	 * Copy the file itself.
	 *
	 * Allocate space for the file name, including adding an ".afterop" and
	 * trailing nul byte.  (sizeof of the string literal covers both the
	 * suffix and the terminator.)
	 */
	len = strlen(real_name) + sizeof(".afterop");
	if ((ret = __os_malloc(env, len, &copy)) != 0)
		goto err;
	snprintf(copy, len, "%s.afterop", real_name);
	if ((ret = __db_makecopy(env, real_name, copy)) != 0)
		goto err;

	/*
	 * Get the directory path to call __os_dirlist().
	 * Truncating real_name at its last path separator leaves just the
	 * directory portion; real_name is not used as a file name after this.
	 */
	if ((p = __db_rpath(real_name)) != NULL)
		*p = '\0';
	if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0)
		goto err;

	/*
	 * Walk the directory looking for backup files. Backup file names in
	 * transactional environments are of the form:
	 *
	 * BACKUP_PREFIX.TXNID.ID
	 */
	for (i = 0; i < dircnt; i++) {
		/* Check for a related backup file name. */
		if (strncmp(
		    namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0)
			continue;
		/*
		 * NOTE(review): sizeof(BACKUP_PREFIX) includes the trailing
		 * NUL, so p starts one character PAST the prefix, i.e. the
		 * first TXNID hex digit is skipped.  The strspn() below
		 * absorbs the remaining digits either way, so matching is
		 * mostly unaffected -- confirm whether "- 1" was intended.
		 */
		p = namesp[i] + sizeof(BACKUP_PREFIX);
		p += strspn(p, "0123456789ABCDEFabcdef");
		if (*p != '.')
			continue;
		++p;
		p += strspn(p, "0123456789ABCDEFabcdef");
		/* Accept only names ending right after the second hex run. */
		if (*p != '\0')
			continue;

		/*
		 * Copy the backup file.
		 *
		 * Allocate space for the file name, including adding a
		 * ".afterop" and trailing nul byte.
		 * real_name and copy are freed and re-allocated on each
		 * iteration; the NULL resets keep the err: cleanup safe.
		 */
		if (real_name != NULL) {
			__os_free(env, real_name);
			real_name = NULL;
		}
		if ((ret = __db_appname(env,
		    DB_APP_DATA, namesp[i], NULL, &real_name)) != 0)
			goto err;
		if (copy != NULL) {
			__os_free(env, copy);
			copy = NULL;
		}
		len = strlen(real_name) + sizeof(".afterop");
		if ((ret = __os_malloc(env, len, &copy)) != 0)
			goto err;
		snprintf(copy, len, "%s.afterop", real_name);
		if ((ret = __db_makecopy(env, real_name, copy)) != 0)
			goto err;
	}

	/* Success falls through: ret is 0 here. */
err:	if (namesp != NULL)
		__os_dirfree(env, namesp, dircnt);
	if (copy != NULL)
		__os_free(env, copy);
	if (real_name != NULL)
		__os_free(env, real_name);
	return (ret);
}
+
+static int
+__db_makecopy(env, src, dest)
+ ENV *env;
+ const char *src, *dest;
+{
+ DB_FH *rfhp, *wfhp;
+ size_t rcnt, wcnt;
+ int ret;
+ char *buf;
+
+ rfhp = wfhp = NULL;
+
+ if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0)
+ goto err;
+
+ if ((ret = __os_open(env, src, 0,
+ DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0)
+ goto err;
+ if ((ret = __os_open(env, dest, 0,
+ DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0)
+ goto err;
+
+ for (;;) {
+ if ((ret =
+ __os_read(env, rfhp, buf, sizeof(buf), &rcnt)) != 0)
+ goto err;
+ if (rcnt == 0)
+ break;
+ if ((ret =
+ __os_write(env, wfhp, buf, sizeof(buf), &wcnt)) != 0)
+ goto err;
+ }
+
+ if (0) {
+err: __db_err(env, ret, "__db_makecopy: %s -> %s", src, dest);
+ }
+
+ if (buf != NULL)
+ __os_free(env, buf);
+ if (rfhp != NULL)
+ (void)__os_closehandle(env, rfhp);
+ if (wfhp != NULL)
+ (void)__os_closehandle(env, wfhp);
+ return (ret);
+}
+#endif
diff --git a/db/db.src b/db/db.src
new file mode 100644
index 0000000..2136b79
--- /dev/null
+++ b/db/db.src
@@ -0,0 +1,328 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __db
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_dispatch.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * addrem -- Add or remove an entry from a duplicate page.
+ *
+ * opcode: identifies if this is an add or delete.
+ * fileid: file identifier of the file being modified.
+ * pgno: duplicate page number.
+ * indx: location at which to insert or delete.
+ * nbytes: number of bytes added/removed to/from the page.
+ * hdr: header for the data item.
+ * dbt: data that is deleted or is to be added.
+ * pagelsn: former lsn of the page.
+ *
+ * If the hdr was NULL then, the dbt is a regular B_KEYDATA.
+ * If the dbt was NULL then the hdr is a complete item to be
+ * pasted on the page.
+ */
+BEGIN addrem 42 41
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG indx u_int32_t lu
+ARG nbytes u_int32_t lu
+DBT hdr DBT s
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+END
+
+/*
+ * big -- Handles addition and deletion of big key/data items.
+ *
+ * opcode: identifies get/put.
+ * fileid: file identifier of the file being modified.
+ * pgno: page onto which data is being added/removed.
+ * prev_pgno: the page before the one we are logging.
+ * next_pgno: the page after the one we are logging.
+ * dbt: data being written onto the page.
+ * pagelsn: former lsn of the orig_page.
+ * prevlsn: former lsn of the prev_pgno.
+ * nextlsn: former lsn of the next_pgno. This is not currently used, but
+ * may be used later if we actually do overwrites of big key/
+ * data items in place.
+ */
+BEGIN big 42 43
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG prev_pgno db_pgno_t lu
+ARG next_pgno db_pgno_t lu
+DBT dbt DBT s
+POINTER pagelsn DB_LSN * lu
+POINTER prevlsn DB_LSN * lu
+POINTER nextlsn DB_LSN * lu
+END
+
+/*
+ * ovref -- Handles increment/decrement of overflow page reference count.
+ *
+ * fileid: identifies the file being modified.
+ * pgno: page number whose ref count is being incremented/decremented.
+ * adjust: the adjustment being made.
+ * lsn: the page's original lsn.
+ */
+BEGIN ovref 42 44
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+ARG adjust int32_t ld
+POINTER lsn DB_LSN * lu
+END
+
+/*
+ * relink -- Handles relinking around a page.
+ *
+ * opcode: indicates if this is an addpage or delete page
+ * pgno: the page being changed.
+ * lsn: the page's original lsn.
+ * prev: the previous page.
+ * lsn_prev: the previous page's original lsn.
+ * next: the next page.
+ * lsn_next: the next page's original lsn.
+ */
+BEGIN_COMPAT relink 42 45
+ARG opcode u_int32_t lu
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG prev db_pgno_t lu
+POINTER lsn_prev DB_LSN * lu
+ARG next db_pgno_t lu
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * Debug -- log an operation upon entering an access method.
+ * op: Operation (cursor, c_close, c_get, c_put, c_del,
+ * get, put, delete).
+ * fileid: identifies the file being acted upon.
+ * key: key parameter
+ * data: data parameter
+ * flags: flags parameter
+ */
+BEGIN debug 42 47
+DBT op DBT s
+ARG fileid int32_t ld
+DBT key DBT s
+DBT data DBT s
+ARG arg_flags u_int32_t lu
+END
+
+/*
+ * noop -- do nothing, but get an LSN.
+ */
+BEGIN noop 42 48
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER prevlsn DB_LSN * lu
+END
+
+/*
+ * pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the original lsn of the page reference by meta_pgno.
+ * meta_pgno the page pointing at the allocated page in the free list.
+ * If the list is unsorted this is the metadata page.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the page allocated.
+ * next: the next page on the free list.
+ * last_pgno: the last page in the file after this op (4.3+).
+ */
+BEGIN_COMPAT pg_alloc 42 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_alloc 43 49
+DB fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * pg_free: used to record freeing a page.
+ * If we are maintaining a sorted free list (during compact) meta_pgno
+ * will be non-zero and refer to the page that precedes the one we are freeing
+ * in the free list. Meta_lsn will then be the lsn of that page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_free 42 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+END
+
+BEGIN pg_free 43 50
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+END
+
+/*
+ * cksum --
+ * This log record is written when we're unable to checksum a page,
+ * before returning DB_RUNRECOVERY. This log record causes normal
+ * recovery to itself return DB_RUNRECOVERY, as only catastrophic
+ * recovery can fix things.
+ */
+BEGIN cksum 42 51
+END
+
+/*
+ * pg_freedata: used to record freeing a page with data on it.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header and index entries from the free'd page.
+ * data: the data from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ * last_pgno: the last page in the file before this op (4.3+).
+ */
+BEGIN_COMPAT pg_freedata 42 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+PGDDBT data DBT s
+END
+
+BEGIN pg_freedata 43 52
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG meta_pgno db_pgno_t lu
+PGDBT header DBT s
+ARG next db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+PGDDBT data DBT s
+END
+
+/*
+ * pg_prepare: used to record an aborted page in a prepared transaction.
+ *
+ * pgno: the page being freed.
+ */
+X BEGIN pg_prepare 42 53
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X END
+
+/*
+ * pg_new: used to record a new page put on the free list.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * meta_pgno: the meta-data page number.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ */
+X BEGIN pg_new 42 54
+X DB fileid int32_t ld
+X ARG pgno db_pgno_t lu
+X POINTER meta_lsn DB_LSN * lu
+X ARG meta_pgno db_pgno_t lu
+X PGDBT header DBT s
+X ARG next db_pgno_t lu
+X END
+
+/*
+ * pg_init: used to reinitialize a page during truncate.
+ *
+ * pgno: the page being initialized.
+ * header: the header from the page.
+ * data: data that used to be on the page.
+ */
+BEGIN pg_init 43 60
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT header DBT s
+PGDDBT data DBT s
+END
+
+/*
+ * pg_sort: sort the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns to sort.
+ */
+BEGIN_COMPAT pg_sort 44 61
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG last_pgno db_pgno_t lu
+DBT list DBT s
+END
+
+
+/*
+ * pg_trunc: truncate the free list
+ *
+ * meta: meta page number
+ * meta_lsn: lsn on meta page.
+ * last_free: page number of new last free page.
+ * last_lsn: lsn of last free page.
+ * last_pgno: current last page number.
+ * list: list of pages and lsns on free list.
+ */
+BEGIN pg_trunc 49 66
+DB fileid int32_t ld
+ARG meta db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+ARG last_free db_pgno_t lu
+POINTER last_lsn DB_LSN * lu
+ARG next_free db_pgno_t lu
+ARG last_pgno db_pgno_t lu
+DBT list DBT s
+END
+
diff --git a/db/db_am.c b/db/db_am.c
new file mode 100644
index 0000000..c453ea9
--- /dev/null
+++ b/db/db_am.c
@@ -0,0 +1,1015 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __dbc_set_priority __P((DBC *, DB_CACHE_PRIORITY));
+static int __dbc_get_priority __P((DBC *, DB_CACHE_PRIORITY* ));
+
+/*
+ * __db_cursor_int --
+ * Internal routine to create a cursor.
+ *
+ * PUBLIC: int __db_cursor_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **));
+ */
+int
+__db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBTYPE dbtype;
+	db_pgno_t root;
+	int flags;
+	DB_LOCKER *locker;
+	DBC **dbcp;
+{
+	DBC *dbc;
+	DBC_INTERNAL *cp;
+	ENV *env;
+	db_threadid_t tid;
+	int allocated, ret;
+	pid_t pid;
+
+	env = dbp->env;
+	allocated = 0;
+
+	/*
+	 * If dbcp is non-NULL it is assumed to point to an area to initialize
+	 * as a cursor.
+	 *
+	 * Take one from the free list if it's available.  Take only the
+	 * right type.  With off page dups we may have different kinds
+	 * of cursors on the queue for a single database.
+	 *
+	 * NOTE(review): the "flags" parameter mixes DB_CURSOR_* open flags
+	 * with DBC_* cursor-state flags (callers pass DBC_OPD etc.); the
+	 * LF_ISSET tests below rely on the two namespaces not colliding.
+	 */
+	MUTEX_LOCK(env, dbp->mutex);
+
+#ifndef HAVE_NO_DB_REFCOUNT
+	/*
+	 * If this DBP is being logged then refcount the log filename
+	 * relative to this transaction.  We do this here because we have
+	 * the dbp->mutex which protects the refcount.  We want to avoid
+	 * calling the function if we are duplicating a cursor.  This includes
+	 * the case of creating an off page duplicate cursor.  If we know this
+	 * cursor will not be used in an update, we could avoid this,
+	 * but we don't have that information.
+	 */
+	if (txn != NULL && !LF_ISSET(DBC_OPD | DBC_DUPLICATE)
+	    && !F_ISSET(dbp, DB_AM_RECOVER) &&
+	    dbp->log_filename != NULL && !IS_REP_CLIENT(env) &&
+	    (ret = __txn_record_fname(env, txn, dbp->log_filename)) != 0) {
+		MUTEX_UNLOCK(env, dbp->mutex);
+		return (ret);
+	}
+
+#endif
+
+	/* Reuse a parked cursor of the matching access-method type. */
+	TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+		if (dbtype == dbc->dbtype) {
+			TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+			/* Preserve only locker ownership across reuse. */
+			F_CLR(dbc, ~DBC_OWN_LID);
+			break;
+		}
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	/* No reusable cursor: allocate and fully initialize a new one. */
+	if (dbc == NULL) {
+		if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+			return (ret);
+		allocated = 1;
+		dbc->flags = 0;
+
+		dbc->dbp = dbp;
+		dbc->dbenv = dbp->dbenv;
+		dbc->env = dbp->env;
+
+		/* Set up locking information. */
+		if (LOCKING_ON(env)) {
+			/*
+			 * If we are not threaded, we share a locker ID among
+			 * all cursors opened in the environment handle,
+			 * allocating one if this is the first cursor.
+			 *
+			 * This relies on the fact that non-threaded DB handles
+			 * always have non-threaded environment handles, since
+			 * we set DB_THREAD on DB handles created with threaded
+			 * environment handles.
+			 */
+			if (!DB_IS_THREADED(dbp)) {
+				if (env->env_lref == NULL && (ret =
+				    __lock_id(env, NULL, &env->env_lref)) != 0)
+					goto err;
+				dbc->lref = env->env_lref;
+			} else {
+				if ((ret =
+				    __lock_id(env, NULL, &dbc->lref)) != 0)
+					goto err;
+				F_SET(dbc, DBC_OWN_LID);
+			}
+
+			/*
+			 * In CDB, secondary indices should share a lock file
+			 * ID with the primary; otherwise we're susceptible
+			 * to deadlocks.  We also use __db_cursor_int rather
+			 * than __db_cursor to create secondary update cursors
+			 * in c_put and c_del; these won't acquire a new lock.
+			 *
+			 * !!!
+			 * Since this is in the one-time cursor allocation
+			 * code, we need to be sure to destroy, not just
+			 * close, all cursors in the secondary when we
+			 * associate.
+			 */
+			if (CDB_LOCKING(env) &&
+			    F_ISSET(dbp, DB_AM_SECONDARY))
+				memcpy(dbc->lock.fileid,
+				    dbp->s_primary->fileid, DB_FILE_ID_LEN);
+			else
+				memcpy(dbc->lock.fileid,
+				    dbp->fileid, DB_FILE_ID_LEN);
+
+			if (CDB_LOCKING(env)) {
+				if (F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+					/*
+					 * If we are doing a single lock per
+					 * environment, set up the global
+					 * lock object just like we do to
+					 * single thread creates.
+					 */
+					DB_ASSERT(env, sizeof(db_pgno_t) ==
+					    sizeof(u_int32_t));
+					dbc->lock_dbt.size = sizeof(u_int32_t);
+					dbc->lock_dbt.data = &dbc->lock.pgno;
+					dbc->lock.pgno = 0;
+				} else {
+					dbc->lock_dbt.size = DB_FILE_ID_LEN;
+					dbc->lock_dbt.data = dbc->lock.fileid;
+				}
+			} else {
+				dbc->lock.type = DB_PAGE_LOCK;
+				dbc->lock_dbt.size = sizeof(dbc->lock);
+				dbc->lock_dbt.data = &dbc->lock;
+			}
+		}
+		/* Init the DBC internal structure. */
+#ifdef HAVE_PARTITION
+		if (DB_IS_PARTITIONED(dbp)) {
+			if ((ret = __partc_init(dbc)) != 0)
+				goto err;
+		} else
+#endif
+		switch (dbtype) {
+		case DB_BTREE:
+		case DB_RECNO:
+			if ((ret = __bamc_init(dbc, dbtype)) != 0)
+				goto err;
+			break;
+		case DB_HASH:
+			if ((ret = __hamc_init(dbc)) != 0)
+				goto err;
+			break;
+		case DB_QUEUE:
+			if ((ret = __qamc_init(dbc)) != 0)
+				goto err;
+			break;
+		case DB_UNKNOWN:
+		default:
+			ret = __db_unknown_type(env, "DB->cursor", dbtype);
+			goto err;
+		}
+
+		/* Redundant with the refresh below, but harmless. */
+		cp = dbc->internal;
+	}
+
+	/* Refresh the DBC structure. */
+	dbc->dbtype = dbtype;
+	RESET_RET_MEM(dbc);
+	dbc->set_priority = __dbc_set_priority;
+	dbc->get_priority = __dbc_get_priority;
+	dbc->priority = dbp->priority;
+
+	if ((dbc->txn = txn) != NULL)
+		dbc->locker = txn->locker;
+	else if (LOCKING_ON(env)) {
+		/*
+		 * There are certain cases in which we want to create a
+		 * new cursor with a particular locker ID that is known
+		 * to be the same as (and thus not conflict with) an
+		 * open cursor.
+		 *
+		 * The most obvious case is cursor duplication; when we
+		 * call DBC->dup or __dbc_idup, we want to use the original
+		 * cursor's locker ID.
+		 *
+		 * Another case is when updating secondary indices.  Standard
+		 * CDB locking would mean that we might block ourself: we need
+		 * to open an update cursor in the secondary while an update
+		 * cursor in the primary is open, and when the secondary and
+		 * primary are subdatabases or we're using env-wide locking,
+		 * this is disastrous.
+		 *
+		 * In these cases, our caller will pass a nonzero locker
+		 * ID into this function.  Use this locker ID instead of
+		 * the default as the locker ID for our new cursor.
+		 */
+		if (locker != NULL)
+			dbc->locker = locker;
+		else {
+			/*
+			 * If we are threaded then we need to set the
+			 * proper thread id into the locker.
+			 */
+			if (DB_IS_THREADED(dbp)) {
+				env->dbenv->thread_id(env->dbenv, &pid, &tid);
+				__lock_set_thread_id(dbc->lref, pid, tid);
+			}
+			dbc->locker = dbc->lref;
+		}
+	}
+
+	/*
+	 * These fields change when we are used as a secondary index, so
+	 * if the DB is a secondary, make sure they're set properly just
+	 * in case we opened some cursors before we were associated.
+	 *
+	 * __dbc_get is used by all access methods, so this should be safe.
+	 */
+	if (F_ISSET(dbp, DB_AM_SECONDARY))
+		dbc->get = dbc->c_get = __dbc_secondary_get_pp;
+
+	if (LF_ISSET(DB_CURSOR_BULK) && dbtype == DB_BTREE)
+		F_SET(dbc, DBC_BULK);
+	if (LF_ISSET(DB_CURSOR_TRANSIENT))
+		F_SET(dbc, DBC_TRANSIENT);
+	if (LF_ISSET(DBC_OPD))
+		F_SET(dbc, DBC_OPD);
+	if (F_ISSET(dbp, DB_AM_RECOVER))
+		F_SET(dbc, DBC_RECOVER);
+	if (F_ISSET(dbp, DB_AM_COMPENSATE))
+		F_SET(dbc, DBC_DONTLOCK);
+#ifdef HAVE_REPLICATION
+	/*
+	 * If we are replicating from a down rev version then we must
+	 * use old locking protocols.
+	 */
+	if (LOGGING_ON(env) &&
+	    ((LOG *)env->lg_handle->
+	    reginfo.primary)->persist.version < DB_LOGVERSION_LATCHING)
+		F_SET(dbc, DBC_DOWNREV);
+#endif
+
+	/* Refresh the DBC internal structure. */
+	cp = dbc->internal;
+	cp->opd = NULL;
+	cp->pdbc = NULL;
+
+	cp->indx = 0;
+	cp->page = NULL;
+	cp->pgno = PGNO_INVALID;
+	cp->root = root;
+	cp->stream_start_pgno = cp->stream_curr_pgno = PGNO_INVALID;
+	cp->stream_off = 0;
+
+	if (DB_IS_PARTITIONED(dbp)) {
+		DBC_PART_REFRESH(dbc);
+	} else switch (dbtype) {
+	case DB_BTREE:
+	case DB_RECNO:
+		if ((ret = __bamc_refresh(dbc)) != 0)
+			goto err;
+		break;
+	case DB_HASH:
+	case DB_QUEUE:
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_type(env, "DB->cursor", dbp->type);
+		goto err;
+	}
+
+	/*
+	 * The transaction keeps track of how many cursors were opened within
+	 * it to catch application errors where the cursor isn't closed when
+	 * the transaction is resolved.
+	 */
+	if (txn != NULL)
+		++txn->cursors;
+	if (ip != NULL)
+		dbc->thread_info = ip;
+	else if (txn != NULL)
+		dbc->thread_info = txn->thread_info;
+	else
+		ENV_GET_THREAD_INFO(env, dbc->thread_info);
+
+	/* Publish the cursor on the handle's active queue. */
+	MUTEX_LOCK(env, dbp->mutex);
+	TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links);
+	F_SET(dbc, DBC_ACTIVE);
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	*dbcp = dbc;
+	return (0);
+
+	/* Creation failed: discard only a cursor we allocated here. */
+err:	if (allocated)
+		__os_free(env, dbc);
+	return (ret);
+}
+
+/*
+ * __db_put --
+ * Store a key/data pair.
+ *
+ * PUBLIC: int __db_put __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put(dbp, ip, txn, key, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT tdata, tkey;
+	ENV *env;
+	void *bulk_kptr, *bulk_ptr;
+	db_recno_t recno;
+	u_int32_t cursor_flags;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	/*
+	 * See the comment in __db_get() regarding DB_CURSOR_TRANSIENT.
+	 *
+	 * Note that the get in the DB_NOOVERWRITE case is safe to do with this
+	 * flag set; if it errors in any way other than DB_NOTFOUND, we're
+	 * going to close the cursor without doing anything else, and if it
+	 * returns DB_NOTFOUND then it's safe to do a c_put(DB_KEYLAST) even if
+	 * an access method moved the cursor, since that's not
+	 * position-dependent.
+	 */
+	cursor_flags = DB_WRITELOCK;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+		cursor_flags |= DB_CURSOR_BULK;
+	else
+		cursor_flags |= DB_CURSOR_TRANSIENT;
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, txn, "DB->put", key, data, flags);
+
+	SET_RET_MEM(dbc, dbp);
+
+	if (flags == DB_APPEND && !DB_IS_PRIMARY(dbp)) {
+		/*
+		 * If there is an append callback, the value stored in
+		 * data->data may be replaced and then freed.  To avoid
+		 * passing a freed pointer back to the user, just operate
+		 * on a copy of the data DBT.
+		 */
+		tdata = *data;
+
+		/*
+		 * Append isn't a normal put operation; call the appropriate
+		 * access method's append function.
+		 */
+		switch (dbp->type) {
+		case DB_QUEUE:
+			if ((ret = __qam_append(dbc, key, &tdata)) != 0)
+				goto err;
+			break;
+		case DB_RECNO:
+			if ((ret = __ram_append(dbc, key, &tdata)) != 0)
+				goto err;
+			break;
+		case DB_BTREE:
+		case DB_HASH:
+		case DB_UNKNOWN:
+		default:
+			/* The interface should prevent this. */
+			DB_ASSERT(env,
+			    dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+			ret = __db_ferr(env, "DB->put", 0);
+			goto err;
+		}
+
+		/*
+		 * The append callback, if one exists, may have allocated
+		 * a new tdata.data buffer.  If so, free it.
+		 */
+		FREE_IF_NEEDED(env, &tdata);
+
+		/* No need for a cursor put; we're done. */
+#ifdef HAVE_COMPRESSION
+	} else if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+	    !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+		/* Compressed btree with no secondaries: direct cursor put. */
+		ret = __dbc_put(dbc, key, data, flags);
+#endif
+	} else if (LF_ISSET(DB_MULTIPLE)) {
+		/* Bulk put: separate key and data streams. */
+		ret = 0;
+		memset(&tkey, 0, sizeof(tkey));
+		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+			tkey.data = &recno;
+			tkey.size = sizeof(recno);
+		}
+		memset(&tdata, 0, sizeof(tdata));
+		DB_MULTIPLE_INIT(bulk_kptr, key);
+		DB_MULTIPLE_INIT(bulk_ptr, data);
+		/* key->doff reports how many pairs were successfully put. */
+		key->doff = 0;
+		while (ret == 0) {
+			if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+				DB_MULTIPLE_RECNO_NEXT(bulk_kptr, key,
+				    recno, tdata.data, tdata.size);
+			else
+				DB_MULTIPLE_NEXT(bulk_kptr, key,
+				    tkey.data, tkey.size);
+			DB_MULTIPLE_NEXT(bulk_ptr, data,
+			    tdata.data, tdata.size);
+			if (bulk_kptr == NULL || bulk_ptr == NULL)
+				break;
+			ret = __dbc_put(dbc, &tkey, &tdata,
+			    LF_ISSET(DB_OPFLAGS_MASK));
+			if (ret == 0)
+				++key->doff;
+		}
+	} else if (LF_ISSET(DB_MULTIPLE_KEY)) {
+		/* Bulk put: single stream of interleaved key/data pairs. */
+		ret = 0;
+		memset(&tkey, 0, sizeof(tkey));
+		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+			tkey.data = &recno;
+			tkey.size = sizeof(recno);
+		}
+		memset(&tdata, 0, sizeof(tdata));
+		DB_MULTIPLE_INIT(bulk_ptr, key);
+		while (ret == 0) {
+			if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+				DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, recno,
+				    tdata.data, tdata.size);
+			else
+				DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, tkey.data,
+				    tkey.size, tdata.data, tdata.size);
+			if (bulk_ptr == NULL)
+				break;
+			ret = __dbc_put(dbc, &tkey, &tdata,
+			    LF_ISSET(DB_OPFLAGS_MASK));
+			if (ret == 0)
+				++key->doff;
+		}
+	} else
+		ret = __dbc_put(dbc, key, data, flags);
+
+err:	/* Close the cursor. */
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_del --
+ * Delete the items referenced by a key.
+ *
+ * PUBLIC: int __db_del __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_del(dbp, ip, txn, key, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *key;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT data, tkey;
+	void *bulk_ptr;
+	db_recno_t recno;
+	u_int32_t cursor_flags, f_init, f_next;
+	int ret, t_ret;
+
+	COMPQUIET(bulk_ptr, NULL);
+	/*
+	 * Initialize dbc so the error path can tell whether a cursor was
+	 * ever allocated: previously a failure in __db_cursor jumped to
+	 * "err" and passed an uninitialized pointer to __dbc_close.
+	 */
+	dbc = NULL;
+	/* Allocate a cursor. */
+	cursor_flags = DB_WRITELOCK;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY))
+		cursor_flags |= DB_CURSOR_BULK;
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, txn, "DB->del", key, NULL, flags);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) &&
+	    !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) {
+		F_SET(dbc, DBC_TRANSIENT);
+		ret = __dbc_bulk_del(dbc, key, flags);
+		goto err;
+	}
+#endif
+
+	/*
+	 * Walk a cursor through the key/data pairs, deleting as we go.  Set
+	 * the DB_DBT_USERMEM flag, as this might be a threaded application
+	 * and the flags checking will catch us.  We don't actually want the
+	 * keys or data, set DB_DBT_ISSET.  We rely on __dbc_get to clear
+	 * this.
+	 */
+	memset(&data, 0, sizeof(data));
+	F_SET(&data, DB_DBT_USERMEM);
+	tkey = *key;
+
+	f_init = LF_ISSET(DB_MULTIPLE_KEY) ? DB_GET_BOTH : DB_SET;
+	f_next = DB_NEXT_DUP;
+
+	/*
+	 * If locking (and we haven't already acquired CDB locks), set the
+	 * read-modify-write flag.
+	 */
+	if (STD_LOCKING(dbc)) {
+		f_init |= DB_RMW;
+		f_next |= DB_RMW;
+	}
+
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) {
+			memset(&tkey, 0, sizeof(tkey));
+			tkey.data = &recno;
+			tkey.size = sizeof(recno);
+		}
+		DB_MULTIPLE_INIT(bulk_ptr, key);
+		/* We return the number of keys deleted in doff. */
+		key->doff = 0;
+bulk_next:	if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO)
+			DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key,
+			    recno, data.data, data.size);
+		else if (LF_ISSET(DB_MULTIPLE))
+			DB_MULTIPLE_NEXT(bulk_ptr, key, tkey.data, tkey.size);
+		else
+			DB_MULTIPLE_KEY_NEXT(bulk_ptr, key,
+			    tkey.data, tkey.size, data.data, data.size);
+		if (bulk_ptr == NULL)
+			goto err;
+	}
+
+	/* We're not interested in the data -- do not return it. */
+	F_SET(&tkey, DB_DBT_ISSET);
+	F_SET(&data, DB_DBT_ISSET);
+
+	/*
+	 * Optimize the simple cases.  For all AMs if we don't have secondaries
+	 * and are not a secondary and we aren't a foreign database and there
+	 * are no dups then we can avoid a bunch of overhead.  For queue we
+	 * don't need to fetch the record since we delete by direct calculation
+	 * from the record number.
+	 *
+	 * Hash permits an optimization in DB->del: since on-page duplicates are
+	 * stored in a single HKEYDATA structure, it's possible to delete an
+	 * entire set of them at once, and as the HKEYDATA has to be rebuilt
+	 * and re-put each time it changes, this is much faster than deleting
+	 * the duplicates one by one.  Thus, if not pointing at an off-page
+	 * duplicate set, and we're not using secondary indices (in which case
+	 * we'd have to examine the items one by one anyway), let hash do this
+	 * "quick delete".
+	 *
+	 * !!!
+	 * Note that this is the only application-executed delete call in
+	 * Berkeley DB that does not go through the __dbc_del function.
+	 * If anything other than the delete itself (like a secondary index
+	 * update) has to happen there in a particular situation, the
+	 * conditions here should be modified not to use these optimizations.
+	 * The ordinary AM-independent alternative will work just fine;
+	 * it'll just be slower.
+	 */
+	if (!F_ISSET(dbp, DB_AM_SECONDARY) && !DB_IS_PRIMARY(dbp) &&
+	    LIST_FIRST(&dbp->f_primaries) == NULL) {
+#ifdef HAVE_QUEUE
+		if (dbp->type == DB_QUEUE) {
+			ret = __qam_delete(dbc, &tkey, flags);
+			goto next;
+		}
+#endif
+
+		/* Fetch the first record. */
+		if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+			goto err;
+
+#ifdef HAVE_HASH
+		/*
+		 * Hash "quick delete" removes all on-page duplicates.  We
+		 * can't do that if deleting specific key/data pairs.
+		 */
+		if (dbp->type == DB_HASH && !LF_ISSET(DB_MULTIPLE_KEY)) {
+			DBC *sdbc;
+			sdbc = dbc;
+#ifdef HAVE_PARTITION
+			if (F_ISSET(dbc, DBC_PARTITIONED))
+				sdbc =
+				    ((PART_CURSOR*)dbc->internal)->sub_cursor;
+#endif
+			if (sdbc->internal->opd == NULL) {
+				ret = __ham_quick_delete(sdbc);
+				goto next;
+			}
+		}
+#endif
+
+		if (!F_ISSET(dbp, DB_AM_DUP)) {
+			ret = dbc->am_del(dbc, 0);
+			goto next;
+		}
+	} else if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0)
+		goto err;
+
+	/* Walk through the set of key/data pairs, deleting as we go. */
+	for (;;) {
+		if ((ret = __dbc_del(dbc, flags)) != 0)
+			break;
+		/*
+		 * With DB_MULTIPLE_KEY, the application has specified the
+		 * exact records they want deleted.  We don't need to walk
+		 * through a set of duplicates.
+		 */
+		if (LF_ISSET(DB_MULTIPLE_KEY))
+			break;
+
+		F_SET(&tkey, DB_DBT_ISSET);
+		F_SET(&data, DB_DBT_ISSET);
+		if ((ret = __dbc_get(dbc, &tkey, &data, f_next)) != 0) {
+			if (ret == DB_NOTFOUND)
+				ret = 0;
+			break;
+		}
+	}
+
+next:	if (ret == 0 && LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		++key->doff;
+		goto bulk_next;
+	}
+err:	/* Discard the cursor, if one was acquired. */
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_sync --
+ * Flush the database cache.
+ *
+ * PUBLIC: int __db_sync __P((DB *));
+ */
+int
+__db_sync(dbp)
+	DB *dbp;
+{
+	int ret, t_ret;
+
+	ret = 0;
+
+	/* If the database was read-only, we're done. */
+	if (F_ISSET(dbp, DB_AM_RDONLY))
+		return (0);
+
+	/* If it's a Recno tree, write the backing source text file. */
+	if (dbp->type == DB_RECNO)
+		ret = __ram_writeback(dbp);
+
+	/* If the database was never backed by a database file, we're done. */
+	if (F_ISSET(dbp, DB_AM_INMEM))
+		return (ret);
+	/* Partition and queue AMs flush each of their backing files. */
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __partition_sync(dbp);
+	else
+#endif
+	if (dbp->type == DB_QUEUE)
+		ret = __qam_sync(dbp);
+	else
+		/* Flush any dirty pages from the cache to the backing file. */
+		if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+			ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_associate --
+ * Associate another database as a secondary index to this one.
+ *
+ * PUBLIC: int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate(dbp, ip, txn, sdbp, callback, flags)
+	DB *dbp, *sdbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+	u_int32_t flags;
+{
+	DBC *pdbc, *sdbc;
+	DBT key, data, skey, *tskeyp;
+	ENV *env;
+	int build, ret, t_ret;
+	u_int32_t nskey;
+
+	env = dbp->env;
+	pdbc = sdbc = NULL;
+	ret = 0;
+
+	memset(&skey, 0, sizeof(DBT));
+	/* nskey/tskeyp track unconsumed callback keys for err cleanup. */
+	nskey = 0;
+	tskeyp = NULL;
+
+	/*
+	 * Check to see if the secondary is empty -- and thus if we should
+	 * build it -- before we link it in and risk making it show up in other
+	 * threads.  Do this first so that the databases remain unassociated on
+	 * error.
+	 */
+	build = 0;
+	if (LF_ISSET(DB_CREATE)) {
+		if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, 0)) != 0)
+			goto err;
+
+		/*
+		 * We don't care about key or data; we're just doing
+		 * an existence check.
+		 */
+		memset(&key, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+		F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+		if ((ret = __dbc_get(sdbc, &key, &data,
+		    (STD_LOCKING(sdbc) ? DB_RMW : 0) |
+		    DB_FIRST)) == DB_NOTFOUND) {
+			build = 1;
+			ret = 0;
+		}
+
+		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+
+		/* Reset for later error check. */
+		sdbc = NULL;
+
+		if (ret != 0)
+			goto err;
+	}
+
+	/*
+	 * Set up the database handle as a secondary.
+	 */
+	sdbp->s_callback = callback;
+	sdbp->s_primary = dbp;
+
+	/* Redirect get/close through the secondary-aware wrappers. */
+	sdbp->stored_get = sdbp->get;
+	sdbp->get = __db_secondary_get;
+
+	sdbp->stored_close = sdbp->close;
+	sdbp->close = __db_secondary_close_pp;
+
+	F_SET(sdbp, DB_AM_SECONDARY);
+
+	if (LF_ISSET(DB_IMMUTABLE_KEY))
+		FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY);
+
+	/*
+	 * Add the secondary to the list on the primary.  Do it here
+	 * so that we see any updates that occur while we're walking
+	 * the primary.
+	 */
+	MUTEX_LOCK(env, dbp->mutex);
+
+	/* See __db_s_next for an explanation of secondary refcounting. */
+	DB_ASSERT(env, sdbp->s_refcnt == 0);
+	sdbp->s_refcnt = 1;
+	LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	if (build) {
+		/*
+		 * We loop through the primary, putting each item we
+		 * find into the new secondary.
+		 *
+		 * If we're using CDB, opening these two cursors puts us
+		 * in a bit of a locking tangle: CDB locks are done on the
+		 * primary, so that we stay deadlock-free, but that means
+		 * that updating the secondary while we have a read cursor
+		 * open on the primary will self-block.  To get around this,
+		 * we force the primary cursor to use the same locker ID
+		 * as the secondary, so they won't conflict.  This should
+		 * be harmless even if we're not using CDB.
+		 */
+		if ((ret = __db_cursor(sdbp, ip, txn, &sdbc,
+		    CDB_LOCKING(sdbp->env) ? DB_WRITECURSOR : 0)) != 0)
+			goto err;
+		if ((ret = __db_cursor_int(dbp, ip,
+		    txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+			goto err;
+
+		/* Lock out other threads, now that we have a locker. */
+		dbp->associate_locker = sdbc->locker;
+
+		memset(&key, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		while ((ret = __dbc_get(pdbc, &key, &data, DB_NEXT)) == 0) {
+			if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
+				if (ret == DB_DONOTINDEX)
+					continue;
+				goto err;
+			}
+			/* The callback may return one key or a set of them. */
+			if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+				__db_check_skeyset(sdbp, &skey);
+#endif
+				nskey = skey.size;
+				tskeyp = (DBT *)skey.data;
+			} else {
+				nskey = 1;
+				tskeyp = &skey;
+			}
+			SWAP_IF_NEEDED(sdbp, &key);
+			for (; nskey > 0; nskey--, tskeyp++) {
+				if ((ret = __dbc_put(sdbc,
+				    tskeyp, &key, DB_UPDATE_SECONDARY)) != 0)
+					goto err;
+				FREE_IF_NEEDED(env, tskeyp);
+			}
+			SWAP_IF_NEEDED(sdbp, &key);
+			FREE_IF_NEEDED(env, &skey);
+		}
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+	}
+
+err:	if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	dbp->associate_locker = NULL;
+
+	/* On error, free any secondary keys the callback left allocated. */
+	for (; nskey > 0; nskey--, tskeyp++)
+		FREE_IF_NEEDED(env, tskeyp);
+	FREE_IF_NEEDED(env, &skey);
+
+	return (ret);
+}
+
+/*
+ * __db_secondary_get --
+ * This wrapper function for DB->pget() is the DB->get() function
+ * on a database which has been made into a secondary index.
+ */
+static int
+__db_secondary_get(sdbp, txn, skey, data, flags)
+	DB *sdbp;
+	DB_TXN *txn;
+	DBT *skey, *data;
+	u_int32_t flags;
+{
+	DB_ASSERT(sdbp->env, F_ISSET(sdbp, DB_AM_SECONDARY));
+	/* A secondary's get() is a pget() that discards the primary key. */
+	return (__db_pget_pp(sdbp, txn, skey, NULL, data, flags));
+}
+
+/*
+ * __db_secondary_close --
+ * Wrapper function for DB->close() which we use on secondaries to
+ * manage refcounting and make sure we don't close them underneath
+ * a primary that is updating.
+ *
+ * PUBLIC: int __db_secondary_close __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close(sdbp, flags)
+	DB *sdbp;
+	u_int32_t flags;
+{
+	DB *primary;
+	ENV *env;
+	int doclose;
+
+	doclose = 0;
+	primary = sdbp->s_primary;
+	env = primary->env;
+
+	MUTEX_LOCK(env, primary->mutex);
+	/*
+	 * Check the refcount--if it was at 1 when we were called, no
+	 * thread is currently updating this secondary through the primary,
+	 * so it's safe to close it for real.
+	 *
+	 * If it's not safe to do the close now, we do nothing; the
+	 * database will actually be closed when the refcount is decremented,
+	 * which can happen in either __db_s_next or __db_s_done.
+	 */
+	DB_ASSERT(env, sdbp->s_refcnt != 0);
+	if (--sdbp->s_refcnt == 0) {
+		LIST_REMOVE(sdbp, s_links);
+		/* We don't want to call close while the mutex is held. */
+		doclose = 1;
+	}
+	MUTEX_UNLOCK(env, primary->mutex);
+
+	/*
+	 * sdbp->close is this function; call the real one explicitly if
+	 * need be.
+	 */
+	return (doclose ? __db_close(sdbp, NULL, flags) : 0);
+}
+
+/*
+ * __db_associate_foreign --
+ * Associate this database (fdbp) as a foreign constraint to another
+ * database (pdbp). That is, dbp's keys appear as foreign key values in
+ * pdbp.
+ *
+ * PUBLIC: int __db_associate_foreign __P((DB *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC: u_int32_t));
+ */
+int
+__db_associate_foreign(fdbp, pdbp, callback, flags)
+	DB *fdbp, *pdbp;
+	int (*callback)(DB *, const DBT *, DBT *, const DBT *, int *);
+	u_int32_t flags;
+{
+	DB_FOREIGN_INFO *f_info;
+	ENV *env;
+	int ret;
+
+	env = fdbp->env;
+	ret = 0;
+
+	/*
+	 * Associate fdbp as pdbp's foreign db, for referential integrity
+	 * checks.  We don't allow the foreign db to be changed, because we
+	 * currently have no way of removing pdbp from the old foreign db's
+	 * list of primaries.
+	 *
+	 * Check this before allocating or linking anything so that failure
+	 * leaves both handles untouched: the previous code returned EINVAL
+	 * only after inserting f_info into fdbp->f_primaries, leaking the
+	 * allocation and leaving a stale entry on the list.
+	 */
+	if (pdbp->s_foreign != NULL)
+		return (EINVAL);
+
+	if ((ret = __os_malloc(env, sizeof(DB_FOREIGN_INFO), &f_info)) != 0)
+		return (ret);
+	memset(f_info, 0, sizeof(DB_FOREIGN_INFO));
+
+	f_info->dbp = pdbp;
+	f_info->callback = callback;
+
+	/*
+	 * It might be wise to filter this, but for now the flags only
+	 * set the delete action type.
+	 */
+	FLD_SET(f_info->flags, flags);
+
+	/*
+	 * Add f_info to the foreign database's list of primaries.  That is to
+	 * say, fdbp->f_primaries lists all databases for which fdbp is a
+	 * foreign constraint.
+	 */
+	MUTEX_LOCK(env, fdbp->mutex);
+	LIST_INSERT_HEAD(&fdbp->f_primaries, f_info, f_links);
+	MUTEX_UNLOCK(env, fdbp->mutex);
+
+	pdbp->s_foreign = fdbp;
+
+	return (ret);
+}
+
+/*
+ * __dbc_set_priority --
+ *	DBC->set_priority method: record the cache priority to use for
+ *	pages accessed through this cursor.
+ */
+static int
+__dbc_set_priority(dbc, priority)
+	DBC *dbc;
+	DB_CACHE_PRIORITY priority;
+{
+	dbc->priority = priority;
+	return (0);
+}
+
+/*
+ * __dbc_get_priority --
+ *	DBC->get_priority method: return the cursor's current cache
+ *	priority through *priority.
+ */
+static int
+__dbc_get_priority(dbc, priority)
+	DBC *dbc;
+	DB_CACHE_PRIORITY *priority;
+{
+	*priority = dbc->priority;
+	return (0);
+}
diff --git a/db/db_auto.c b/db/db_auto.c
new file mode 100644
index 0000000..2ce4199
--- /dev/null
+++ b/db/db_auto.c
@@ -0,0 +1,3267 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __db_addrem_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_addrem_args **));
+ */
+/*
+ * __db_addrem_read --
+ *	Unmarshal a __db_addrem log record from the flat buffer recbuf into
+ *	a newly allocated __db_addrem_args; a zeroed DB_TXN is carved out
+ *	of the same allocation and hung off argp->txnp.  The DBT fields
+ *	(hdr, dbt) point directly into recbuf rather than copying it, so
+ *	recbuf must stay valid for as long as *argpp is used.  If dbpp is
+ *	non-NULL, the record's file id is also resolved to a DB handle.
+ */
+int
+__db_addrem_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_addrem_args **argpp;
+{
+	__db_addrem_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation holds both the args struct and its dummy DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_addrem_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &argp->opcode, bp);
+	bp += sizeof(argp->opcode);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &argp->indx, bp);
+	bp += sizeof(argp->indx);
+
+	LOGCOPY_32(env, &argp->nbytes, bp);
+	bp += sizeof(argp->nbytes);
+
+	memset(&argp->hdr, 0, sizeof(argp->hdr));
+	LOGCOPY_32(env,&argp->hdr.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->hdr.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->hdr.size;
+
+	memset(&argp->dbt, 0, sizeof(argp->dbt));
+	LOGCOPY_32(env,&argp->dbt.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->dbt.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->dbt.size;
+
+	LOGCOPY_TOLSN(env, &argp->pagelsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_addrem_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, u_int32_t, u_int32_t,
+ * PUBLIC: const DBT *, const DBT *, DB_LSN *));
+ */
+/*
+ * __db_addrem_log --
+ *	Marshal a __db_addrem log record (an item add/remove on a db page)
+ *	and write it.  Durable records (or a NULL txnp) go through
+ *	__log_put; non-durable transactional records are instead queued on
+ *	txnp->logs and *ret_lsnp is marked as not logged.  Returns 0 on
+ *	success or a non-zero error code.
+ */
+int
+__db_addrem_log(dbp, txnp, ret_lsnp, flags,
+    opcode, pgno, indx, nbytes, hdr,
+    dbt, pagelsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t opcode;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	u_int32_t nbytes;
+	const DBT *hdr;
+	const DBT *dbt;
+	DB_LSN * pagelsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_addrem;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header + each field, DBTs prefixed by size. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size)
+	    + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size)
+	    + sizeof(*pagelsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, bp, &opcode);
+	bp += sizeof(opcode);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, bp, &indx);
+	bp += sizeof(indx);
+
+	LOGCOPY_32(env, bp, &nbytes);
+	bp += sizeof(nbytes);
+
+	if (hdr == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &hdr->size);
+		bp += sizeof(hdr->size);
+		memcpy(bp, hdr->data, hdr->size);
+		bp += hdr->size;
+	}
+
+	if (dbt == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &dbt->size);
+		bp += sizeof(dbt->size);
+		memcpy(bp, dbt->data, dbt->size);
+		bp += dbt->size;
+	}
+
+	if (pagelsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(pagelsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, pagelsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, pagelsn);
+	} else
+		memset(bp, 0, sizeof(*pagelsn));
+	bp += sizeof(*pagelsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_addrem_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_big_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_big_args **));
+ */
+/*
+ * __db_big_read --
+ *	Unmarshal a __db_big log record from recbuf into a newly allocated
+ *	__db_big_args; a zeroed DB_TXN is carved out of the same
+ *	allocation.  The dbt field points into recbuf rather than copying
+ *	it, so recbuf must outlive *argpp.  If dbpp is non-NULL, the
+ *	record's file id is also resolved to a DB handle.
+ */
+int
+__db_big_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_big_args **argpp;
+{
+	__db_big_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_big_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &argp->opcode, bp);
+	bp += sizeof(argp->opcode);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->prev_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	memset(&argp->dbt, 0, sizeof(argp->dbt));
+	LOGCOPY_32(env,&argp->dbt.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->dbt.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->dbt.size;
+
+	LOGCOPY_TOLSN(env, &argp->pagelsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_TOLSN(env, &argp->prevlsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_TOLSN(env, &argp->nextlsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_big_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, db_pgno_t, db_pgno_t,
+ * PUBLIC: const DBT *, DB_LSN *, DB_LSN *, DB_LSN *));
+ */
+/*
+ * __db_big_log --
+ *	Marshal a __db_big log record (big-item / overflow page operation)
+ *	and write it.  Durable records (or a NULL txnp) go through
+ *	__log_put; non-durable transactional records are queued on
+ *	txnp->logs instead and *ret_lsnp is marked as not logged.
+ */
+int
+__db_big_log(dbp, txnp, ret_lsnp, flags,
+    opcode, pgno, prev_pgno, next_pgno, dbt,
+    pagelsn, prevlsn, nextlsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	u_int32_t opcode;
+	db_pgno_t pgno;
+	db_pgno_t prev_pgno;
+	db_pgno_t next_pgno;
+	const DBT *dbt;
+	DB_LSN * pagelsn;
+	DB_LSN * prevlsn;
+	DB_LSN * nextlsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_big;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header + each field, DBT prefixed by size. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size)
+	    + sizeof(*pagelsn)
+	    + sizeof(*prevlsn)
+	    + sizeof(*nextlsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, bp, &opcode);
+	bp += sizeof(opcode);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)prev_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)next_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (dbt == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &dbt->size);
+		bp += sizeof(dbt->size);
+		memcpy(bp, dbt->data, dbt->size);
+		bp += dbt->size;
+	}
+
+	if (pagelsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(pagelsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, pagelsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, pagelsn);
+	} else
+		memset(bp, 0, sizeof(*pagelsn));
+	bp += sizeof(*pagelsn);
+
+	if (prevlsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(prevlsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, prevlsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, prevlsn);
+	} else
+		memset(bp, 0, sizeof(*prevlsn));
+	bp += sizeof(*prevlsn);
+
+	if (nextlsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(nextlsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, nextlsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, nextlsn);
+	} else
+		memset(bp, 0, sizeof(*nextlsn));
+	bp += sizeof(*nextlsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_big_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_ovref_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_ovref_args **));
+ */
+/*
+ * __db_ovref_read --
+ *	Unmarshal a __db_ovref log record from recbuf into a newly
+ *	allocated __db_ovref_args; a zeroed DB_TXN is carved out of the
+ *	same allocation.  If dbpp is non-NULL, the record's file id is
+ *	also resolved to a DB handle.
+ */
+int
+__db_ovref_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_ovref_args **argpp;
+{
+	__db_ovref_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_ovref_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->adjust = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_ovref_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, int32_t, DB_LSN *));
+ */
+/*
+ * __db_ovref_log --
+ *	Marshal a __db_ovref log record (overflow-page reference count
+ *	adjustment) and write it.  Durable records (or a NULL txnp) go
+ *	through __log_put; non-durable transactional records are queued
+ *	on txnp->logs instead and *ret_lsnp is marked as not logged.
+ */
+int
+__db_ovref_log(dbp, txnp, ret_lsnp, flags, pgno, adjust, lsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	int32_t adjust;
+	DB_LSN * lsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_ovref;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*lsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)adjust;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, lsn);
+	} else
+		memset(bp, 0, sizeof(*lsn));
+	bp += sizeof(*lsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_ovref_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_relink_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_relink_42_args **));
+ */
+/*
+ * __db_relink_42_read --
+ *	Unmarshal a release-4.2-format __db_relink log record (kept for
+ *	recovery of old logs) from recbuf into a newly allocated
+ *	__db_relink_42_args; a zeroed DB_TXN is carved out of the same
+ *	allocation.  If dbpp is non-NULL, the record's file id is also
+ *	resolved to a DB handle.
+ */
+int
+__db_relink_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_relink_42_args **argpp;
+{
+	__db_relink_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_relink_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &argp->opcode, bp);
+	bp += sizeof(argp->opcode);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->prev = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn_prev, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->lsn_next, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_debug_read __P((ENV *, void *, __db_debug_args **));
+ */
+/*
+ * __db_debug_read --
+ *	Unmarshal a __db_debug log record from recbuf into a newly
+ *	allocated __db_debug_args; a zeroed DB_TXN is carved out of the
+ *	same allocation.  The DBT fields (op, key, data) point into recbuf
+ *	rather than copying it, so recbuf must outlive *argpp.  Unlike the
+ *	other _read routines, this record carries no td and no DB handle
+ *	lookup.
+ */
+int
+__db_debug_read(env, recbuf, argpp)
+	ENV *env;
+	void *recbuf;
+	__db_debug_args **argpp;
+{
+	__db_debug_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_debug_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	memset(&argp->op, 0, sizeof(argp->op));
+	LOGCOPY_32(env,&argp->op.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->op.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->op.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	memset(&argp->key, 0, sizeof(argp->key));
+	LOGCOPY_32(env,&argp->key.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->key.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->key.size;
+
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;		/* aliases recbuf, not a copy */
+	bp += argp->data.size;
+
+	LOGCOPY_32(env, &argp->arg_flags, bp);
+	bp += sizeof(argp->arg_flags);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_debug_log __P((ENV *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, const DBT *, int32_t, const DBT *, const DBT *,
+ * PUBLIC: u_int32_t));
+ */
+/*
+ * __db_debug_log --
+ *	Marshal a __db_debug log record and write it.  This record is
+ *	environment-wide (takes an ENV, not a DB), so there is no
+ *	log-file-id registration or active-kids check.  Durable records
+ *	(or a NULL txnp) go through __log_put; non-durable transactional
+ *	records are queued on txnp->logs instead and *ret_lsnp is marked
+ *	as not logged.
+ */
+int
+__db_debug_log(env, txnp, ret_lsnp, flags,
+    op, fileid, key, data, arg_flags)
+	ENV *env;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	const DBT *op;
+	int32_t fileid;
+	const DBT *key;
+	const DBT *data;
+	u_int32_t arg_flags;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	rlsnp = ret_lsnp;
+	rectype = DB___db_debug;
+	npad = 0;
+	ret = 0;
+
+	if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* Record size: fixed header + each field, DBTs prefixed by size. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t) + (op == NULL ? 0 : op->size)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (key == NULL ? 0 : key->size)
+	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size)
+	    + sizeof(u_int32_t);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	if (op == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &op->size);
+		bp += sizeof(op->size);
+		memcpy(bp, op->data, op->size);
+		bp += op->size;
+	}
+
+	uinttmp = (u_int32_t)fileid;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (key == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &key->size);
+		bp += sizeof(key->size);
+		memcpy(bp, key->data, key->size);
+		bp += key->size;
+	}
+
+	if (data == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &data->size);
+		bp += sizeof(data->size);
+		memcpy(bp, data->data, data->size);
+		bp += data->size;
+	}
+
+	LOGCOPY_32(env, bp, &arg_flags);
+	bp += sizeof(arg_flags);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_debug_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_noop_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_noop_args **));
+ */
+/*
+ * __db_noop_read --
+ *	Unmarshal a __db_noop log record from recbuf into a newly
+ *	allocated __db_noop_args; a zeroed DB_TXN is carved out of the
+ *	same allocation.  If dbpp is non-NULL, the record's file id is
+ *	also resolved to a DB handle.
+ */
+int
+__db_noop_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_noop_args **argpp;
+{
+	__db_noop_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_noop_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->prevlsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_noop_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *));
+ */
+/*
+ * __db_noop_log --
+ *	Marshal a __db_noop log record (a page-touch record with no
+ *	semantic change) and write it.  Durable records (or a NULL txnp)
+ *	go through __log_put; non-durable transactional records are
+ *	queued on txnp->logs instead and *ret_lsnp is marked as not
+ *	logged.
+ */
+int
+__db_noop_log(dbp, txnp, ret_lsnp, flags, pgno, prevlsn)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	DB_LSN * prevlsn;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_noop;
+	npad = 0;
+	ret = 0;
+
+	/* Non-durable handles with no transaction log nothing at all. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* The handle must be registered with the log before we can log. */
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*prevlsn);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (prevlsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(prevlsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, prevlsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, prevlsn);
+	} else
+		memset(bp, 0, sizeof(*prevlsn));
+	bp += sizeof(*prevlsn);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				*ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_noop_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_alloc_42_args **));
+ */
+/*
+ * __db_pg_alloc_42_read --
+ *	Unmarshal a release-4.2-format __db_pg_alloc log record (kept for
+ *	recovery of old logs) from recbuf into a newly allocated
+ *	__db_pg_alloc_42_args; a zeroed DB_TXN is carved out of the same
+ *	allocation.  If dbpp is non-NULL, the record's file id is also
+ *	resolved to a DB handle.
+ */
+int
+__db_pg_alloc_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_alloc_42_args **argpp;
+{
+	__db_pg_alloc_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_alloc_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->page_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &argp->ptype, bp);
+	bp += sizeof(argp->ptype);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_alloc_args **));
+ */
+/*
+ * __db_pg_alloc_read --
+ *	Decode a current-format __db_pg_alloc log record from recbuf into
+ *	a freshly allocated __db_pg_alloc_args; the caller owns *argpp and
+ *	releases it with __os_free().  Differs from the _42 variant only
+ *	by the trailing last_pgno field.  Looks machine-generated
+ *	(gen_rec.awk style) -- regenerate rather than hand-edit; confirm.
+ */
+int
+__db_pg_alloc_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_alloc_args **argpp;
+{
+	__db_pg_alloc_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_alloc_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->page_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &argp->ptype, bp);
+	bp += sizeof(argp->ptype);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->last_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, DB_LSN *, db_pgno_t, DB_LSN *, db_pgno_t, u_int32_t,
+ * PUBLIC: db_pgno_t, db_pgno_t));
+ */
+/*
+ * __db_pg_alloc_log --
+ *	Marshal and write a __db_pg_alloc log record (page allocation).
+ *	Durable records (or records outside any transaction) go straight
+ *	to __log_put(); non-durable transactional records are queued on
+ *	txnp->logs instead and *ret_lsnp is marked not-logged.
+ */
+int
+__db_pg_alloc_log(dbp, txnp, ret_lsnp, flags, meta_lsn, meta_pgno, page_lsn, pgno, ptype,
+    next, last_pgno)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	DB_LSN * meta_lsn;
+	db_pgno_t meta_pgno;
+	DB_LSN * page_lsn;
+	db_pgno_t pgno;
+	u_int32_t ptype;
+	db_pgno_t next;
+	db_pgno_t last_pgno;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_pg_alloc;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header plus one 32-bit/LSN slot per field. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(*meta_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(*page_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (meta_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			/*
+			 * NOTE(review): this early return leaks the buffer
+			 * allocated above (logrec.data or lr); generated
+			 * code -- fix belongs in the generator.
+			 */
+			if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, meta_lsn);
+	} else
+		memset(bp, 0, sizeof(*meta_lsn));
+	bp += sizeof(*meta_lsn);
+
+	uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (page_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			if (LOG_COMPARE(page_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, page_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, page_lsn);
+	} else
+		memset(bp, 0, sizeof(*page_lsn));
+	bp += sizeof(*page_lsn);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, bp, &ptype);
+	bp += sizeof(ptype);
+
+	uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)last_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_pg_alloc_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_free_42_args **));
+ */
+/*
+ * __db_pg_free_42_read --
+ *	Decode a __db_pg_free record in the release-4.2 log layout from
+ *	recbuf into a freshly allocated __db_pg_free_42_args; the caller
+ *	owns *argpp and releases it with __os_free().  The header DBT
+ *	aliases recbuf (no copy), so recbuf must outlive *argpp.
+ */
+int
+__db_pg_free_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_free_42_args **argpp;
+{
+	__db_pg_free_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_free_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_free_args **));
+ */
+/*
+ * __db_pg_free_read --
+ *	Decode a current-format __db_pg_free log record from recbuf into
+ *	a freshly allocated __db_pg_free_args; the caller owns *argpp and
+ *	releases it with __os_free().  The header DBT aliases recbuf (no
+ *	copy); on byte-swapped logs the embedded page image is swapped in
+ *	place, so recbuf is modified as well as read.
+ */
+int
+__db_pg_free_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_free_args **argpp;
+{
+	__db_pg_free_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_free_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		/* NOTE(review): this early return leaks argp (generated). */
+		if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->header.data,
+		    (size_t)argp->header.size, NULL, 1)) != 0)
+			return (t_ret);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->last_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, const DBT *,
+ * PUBLIC: db_pgno_t, db_pgno_t));
+ */
+/*
+ * __db_pg_free_log --
+ *	Marshal and write a __db_pg_free log record (page free, header
+ *	image included).  Durable records (or records outside any
+ *	transaction) go straight to __log_put(); non-durable
+ *	transactional records are queued on txnp->logs instead and
+ *	*ret_lsnp is marked not-logged.
+ */
+int
+__db_pg_free_log(dbp, txnp, ret_lsnp, flags, pgno, meta_lsn, meta_pgno, header, next,
+    last_pgno)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	DB_LSN * meta_lsn;
+	db_pgno_t meta_pgno;
+	const DBT *header;
+	db_pgno_t next;
+	db_pgno_t last_pgno;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_pg_free;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed header, fixed fields, variable header DBT. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*meta_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (meta_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			/*
+			 * NOTE(review): this early return leaks the buffer
+			 * allocated above (logrec.data or lr); generated
+			 * code -- fix belongs in the generator.
+			 */
+			if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, meta_lsn);
+	} else
+		memset(bp, 0, sizeof(*meta_lsn));
+	bp += sizeof(*meta_lsn);
+
+	uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	/* Header DBT: size word then bytes; zero size when absent. */
+	if (header == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &header->size);
+		bp += sizeof(header->size);
+		memcpy(bp, header->data, header->size);
+		/* On byte-swapped logs, swap the copied page image. */
+		if (LOG_SWAPPED(env))
+			if ((ret = __db_pageswap(dbp,
+			    (PAGE *)bp, (size_t)header->size, (DBT *)NULL, 0)) != 0)
+				return (ret);
+		bp += header->size;
+	}
+
+	uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)last_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_pg_free_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_cksum_read __P((ENV *, void *, __db_cksum_args **));
+ */
+/*
+ * __db_cksum_read --
+ *	Decode a __db_cksum log record (header-only: type, txnid and
+ *	previous LSN; no payload fields) from recbuf into a freshly
+ *	allocated __db_cksum_args; the caller owns *argpp and releases
+ *	it with __os_free().
+ */
+int
+__db_cksum_read(env, recbuf, argpp)
+	ENV *env;
+	void *recbuf;
+	__db_cksum_args **argpp;
+{
+	__db_cksum_args *argp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_cksum_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_cksum_log __P((ENV *, DB_TXN *, DB_LSN *, u_int32_t));
+ */
+/*
+ * __db_cksum_log --
+ *	Marshal and write a __db_cksum log record.  The record carries
+ *	only the standard header (type, txnid, previous LSN).  Unlike the
+ *	per-DB loggers this takes the ENV directly, so durability is
+ *	controlled solely by the DB_LOG_NOT_DURABLE flag.
+ */
+int
+__db_cksum_log(env, txnp, ret_lsnp, flags)
+	ENV *env;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	u_int32_t rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	rlsnp = ret_lsnp;
+	rectype = DB___db_cksum;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	/* Header-only record: no payload fields follow. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_cksum_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_freedata_42_args **));
+ */
+/*
+ * __db_pg_freedata_42_read --
+ *	Decode a __db_pg_freedata record in the release-4.2 log layout
+ *	from recbuf into a freshly allocated __db_pg_freedata_42_args;
+ *	the caller owns *argpp and releases it with __os_free().  The
+ *	header and data DBTs alias recbuf (no copy), so recbuf must
+ *	outlive *argpp.
+ */
+int
+__db_pg_freedata_42_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_freedata_42_args **argpp;
+{
+	__db_pg_freedata_42_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_freedata_42_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page data: size word, then raw bytes in place. */
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_freedata_args **));
+ */
+/*
+ * __db_pg_freedata_read --
+ *	Decode a current-format __db_pg_freedata log record from recbuf
+ *	into a freshly allocated __db_pg_freedata_args; the caller owns
+ *	*argpp and releases it with __os_free().  The header and data
+ *	DBTs alias recbuf (no copy); on byte-swapped logs the embedded
+ *	page image is swapped in place, so recbuf is modified as well.
+ */
+int
+__db_pg_freedata_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_freedata_args **argpp;
+{
+	__db_pg_freedata_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_freedata_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->meta_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->next = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->last_pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page data: size word, then raw bytes in place. */
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		/* NOTE(review): this early return leaks argp (generated). */
+		if ((t_ret = __db_pageswap(*dbpp,
+		    (PAGE *)argp->header.data, (size_t)argp->header.size,
+		    &argp->data, 1)) != 0)
+			return (t_ret);
+	}
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, const DBT *,
+ * PUBLIC: db_pgno_t, db_pgno_t, const DBT *));
+ */
+/*
+ * __db_pg_freedata_log --
+ *	Marshal and write a __db_pg_freedata log record (page free with
+ *	both header and data images).  Durable records (or records
+ *	outside any transaction) go straight to __log_put(); non-durable
+ *	transactional records are queued on txnp->logs instead and
+ *	*ret_lsnp is marked not-logged.
+ */
+int
+__db_pg_freedata_log(dbp, txnp, ret_lsnp, flags, pgno, meta_lsn, meta_pgno, header, next,
+    last_pgno, data)
+	DB *dbp;
+	DB_TXN *txnp;
+	DB_LSN *ret_lsnp;
+	u_int32_t flags;
+	db_pgno_t pgno;
+	DB_LSN * meta_lsn;
+	db_pgno_t meta_pgno;
+	const DBT *header;
+	db_pgno_t next;
+	db_pgno_t last_pgno;
+	const DBT *data;
+{
+	DBT logrec;
+	DB_LSN *lsnp, null_lsn, *rlsnp;
+	DB_TXNLOGREC *lr;
+	ENV *env;
+	u_int32_t zero, uinttmp, rectype, txn_num;
+	u_int npad;
+	u_int8_t *bp;
+	int is_durable, ret;
+
+	COMPQUIET(lr, NULL);
+
+	env = dbp->env;
+	rlsnp = ret_lsnp;
+	rectype = DB___db_pg_freedata;
+	npad = 0;
+	ret = 0;
+
+	/* A non-durable update outside a transaction needs no record. */
+	if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+		if (txnp == NULL)
+			return (0);
+		is_durable = 0;
+	} else
+		is_durable = 1;
+
+	if (txnp == NULL) {
+		txn_num = 0;
+		lsnp = &null_lsn;
+		null_lsn.file = null_lsn.offset = 0;
+	} else {
+		if (TAILQ_FIRST(&txnp->kids) != NULL &&
+		    (ret = __txn_activekids(env, rectype, txnp)) != 0)
+			return (ret);
+		/*
+		 * We need to assign begin_lsn while holding region mutex.
+		 * That assignment is done inside the DbEnv->log_put call,
+		 * so pass in the appropriate memory location to be filled
+		 * in by the log_put code.
+		 */
+		DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+		txn_num = txnp->txnid;
+	}
+
+	DB_ASSERT(env, dbp->log_filename != NULL);
+	if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+	    (ret = __dbreg_lazy_id(dbp)) != 0)
+		return (ret);
+
+	/* Record size: fixed fields plus the two variable-length DBTs. */
+	logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(*meta_lsn)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t)
+	    + sizeof(u_int32_t) + (data == NULL ? 0 : data->size);
+	if (CRYPTO_ON(env)) {
+		npad = env->crypto_handle->adj_size(logrec.size);
+		logrec.size += npad;
+	}
+
+	if (is_durable || txnp == NULL) {
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __os_malloc(env,
+		    logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+			return (ret);
+#ifdef DIAGNOSTIC
+		if ((ret =
+		    __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+			__os_free(env, lr);
+			return (ret);
+		}
+#else
+		logrec.data = lr->data;
+#endif
+	}
+	if (npad > 0)
+		memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+	bp = logrec.data;
+
+	LOGCOPY_32(env, bp, &rectype);
+	bp += sizeof(rectype);
+
+	LOGCOPY_32(env, bp, &txn_num);
+	bp += sizeof(txn_num);
+
+	LOGCOPY_FROMLSN(env, bp, lsnp);
+	bp += sizeof(DB_LSN);
+
+	uinttmp = (u_int32_t)dbp->log_filename->id;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	if (meta_lsn != NULL) {
+		if (txnp != NULL) {
+			LOG *lp = env->lg_handle->reginfo.primary;
+			/*
+			 * NOTE(review): this early return leaks the buffer
+			 * allocated above (logrec.data or lr); generated
+			 * code -- fix belongs in the generator.
+			 */
+			if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+			    __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+				return (ret);
+		}
+		LOGCOPY_FROMLSN(env, bp, meta_lsn);
+	} else
+		memset(bp, 0, sizeof(*meta_lsn));
+	bp += sizeof(*meta_lsn);
+
+	uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	/* Header DBT: size word then bytes; zero size when absent. */
+	if (header == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &header->size);
+		bp += sizeof(header->size);
+		memcpy(bp, header->data, header->size);
+		/* Swapping the header also swaps the companion data DBT. */
+		if (LOG_SWAPPED(env))
+			if ((ret = __db_pageswap(dbp,
+			    (PAGE *)bp, (size_t)header->size, (DBT *)data, 0)) != 0)
+				return (ret);
+		bp += header->size;
+	}
+
+	uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)last_pgno;
+	LOGCOPY_32(env,bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	/* Data DBT: size word then bytes; zero size when absent. */
+	if (data == NULL) {
+		zero = 0;
+		LOGCOPY_32(env, bp, &zero);
+		bp += sizeof(u_int32_t);
+	} else {
+		LOGCOPY_32(env, bp, &data->size);
+		bp += sizeof(data->size);
+		memcpy(bp, data->data, data->size);
+		/* Free the swap-allocated copy made by __db_pageswap above. */
+		if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC))
+			__os_free(env, data->data);
+		bp += data->size;
+	}
+
+	DB_ASSERT(env,
+	    (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+	/* Durable (or non-transactional): write through __log_put(). */
+	if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+		    flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+			*lsnp = *rlsnp;
+			if (rlsnp != ret_lsnp)
+				 *ret_lsnp = *rlsnp;
+		}
+	} else {
+		ret = 0;
+#ifdef DIAGNOSTIC
+		/*
+		 * Set the debug bit if we are going to log non-durable
+		 * transactions so they will be ignored by recovery.
+		 */
+		memcpy(lr->data, logrec.data, logrec.size);
+		rectype |= DB_debug_FLAG;
+		LOGCOPY_32(env, logrec.data, &rectype);
+
+		if (!IS_REP_CLIENT(env))
+			ret = __log_put(env,
+			    rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+		/* Non-durable txn: queue the record on the transaction. */
+		STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+		F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+		LSN_NOT_LOGGED(*ret_lsnp);
+	}
+
+#ifdef LOG_DIAGNOSTIC
+	if (ret != 0)
+		(void)__db_pg_freedata_print(env,
+		    (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+	__os_free(env, logrec.data);
+#else
+	if (is_durable || txnp == NULL)
+		__os_free(env, logrec.data);
+#endif
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_init_args **));
+ */
+/*
+ * __db_pg_init_read --
+ *	Decode a __db_pg_init log record from recbuf into a freshly
+ *	allocated __db_pg_init_args; the caller owns *argpp and releases
+ *	it with __os_free().  The header and data DBTs alias recbuf (no
+ *	copy); on byte-swapped logs the embedded page image is swapped
+ *	in place, so recbuf is modified as well as read.
+ */
+int
+__db_pg_init_read(env, dbpp, td, recbuf, argpp)
+	ENV *env;
+	DB **dbpp;
+	void *td;
+	void *recbuf;
+	__db_pg_init_args **argpp;
+{
+	__db_pg_init_args *argp;
+	u_int32_t uinttmp;
+	u_int8_t *bp;
+	int ret;
+
+	/* One allocation carries the args struct plus a scratch DB_TXN. */
+	if ((ret = __os_malloc(env,
+	    sizeof(__db_pg_init_args) + sizeof(DB_TXN), &argp)) != 0)
+		return (ret);
+	bp = recbuf;
+	argp->txnp = (DB_TXN *)&argp[1];
+	memset(argp->txnp, 0, sizeof(DB_TXN));
+
+	argp->txnp->td = td;
+	/* Fixed header: record type, transaction id, previous LSN. */
+	LOGCOPY_32(env, &argp->type, bp);
+	bp += sizeof(argp->type);
+
+	LOGCOPY_32(env, &argp->txnp->txnid, bp);
+	bp += sizeof(argp->txnp->txnid);
+
+	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+	bp += sizeof(DB_LSN);
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->fileid = (int32_t)uinttmp;
+	bp += sizeof(uinttmp);
+	/* Optionally resolve the log file id to an open DB handle. */
+	if (dbpp != NULL) {
+		*dbpp = NULL;
+		ret = __dbreg_id_to_db(
+		    env, argp->txnp, dbpp, argp->fileid, 1);
+	}
+
+	LOGCOPY_32(env, &uinttmp, bp);
+	argp->pgno = (db_pgno_t)uinttmp;
+	bp += sizeof(uinttmp);
+
+	/* Variable-length page header: size word, then raw bytes in place. */
+	memset(&argp->header, 0, sizeof(argp->header));
+	LOGCOPY_32(env,&argp->header.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->header.data = bp;
+	bp += argp->header.size;
+
+	/* Variable-length page data: size word, then raw bytes in place. */
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env,&argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		/* NOTE(review): this early return leaks argp (generated). */
+		if ((t_ret = __db_pageswap(*dbpp,
+		    (PAGE *)argp->header.data, (size_t)argp->header.size,
+		    &argp->data, 1)) != 0)
+			return (t_ret);
+	}
+
+	*argpp = argp;
+	return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, const DBT *, const DBT *));
+ */
+/*
+ * __db_pg_init_log --
+ *	Marshal and write a __db_pg_init log record (page pgno reinitialized,
+ *	with optional page "header" and "data" DBTs).  Auto-generated by
+ *	gen_rec.awk -- do not hand-edit the logic.  Returns 0 or a BDB error.
+ */
+int
+__db_pg_init_log(dbp, txnp, ret_lsnp, flags, pgno, header, data)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ const DBT *header;
+ const DBT *data;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___db_pg_init;
+ npad = 0;
+ ret = 0;
+
+ /* Non-durable records outside a transaction are simply dropped. */
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ /* Fixed-size prefix plus length-prefixed header and data payloads. */
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+ + sizeof(u_int32_t) + (data == NULL ? 0 : data->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (header == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &header->size);
+ bp += sizeof(header->size);
+ memcpy(bp, header->data, header->size);
+ /*
+ * NOTE(review): if __db_pageswap fails here, logrec.data
+ * (and lr in the non-durable path) is leaked.  Upstream
+ * gen_rec.awk output behaves identically; confirm before
+ * changing generated code.
+ */
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)header->size, (DBT *)data, 0)) != 0)
+ return (ret);
+ bp += header->size;
+ }
+
+ if (data == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &data->size);
+ bp += sizeof(data->size);
+ memcpy(bp, data->data, data->size);
+ if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC))
+ __os_free(env, data->data);
+ bp += data->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__db_pg_init_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __db_pg_sort_44_args **));
+ */
+/*
+ * __db_pg_sort_44_read --
+ *	Unmarshal a (release 4.4 format) __db_pg_sort log record from recbuf
+ *	into a freshly allocated __db_pg_sort_44_args; caller frees *argpp.
+ *	Auto-generated by gen_rec.awk -- do not hand-edit.
+ */
+int
+__db_pg_sort_44_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __db_pg_sort_44_args **argpp;
+{
+ __db_pg_sort_44_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ /* One allocation covers both the args struct and its DB_TXN shim. */
+ if ((ret = __os_malloc(env,
+ sizeof(__db_pg_sort_44_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->meta = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_free = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->last_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ /* list.data aliases recbuf; it is not a separate allocation. */
+ memset(&argp->list, 0, sizeof(argp->list));
+ LOGCOPY_32(env,&argp->list.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->list.data = bp;
+ bp += argp->list.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __db_pg_trunc_args **));
+ */
+/*
+ * __db_pg_trunc_read --
+ *	Unmarshal a __db_pg_trunc log record from recbuf into a freshly
+ *	allocated __db_pg_trunc_args; caller frees *argpp.  Auto-generated
+ *	by gen_rec.awk -- do not hand-edit.
+ */
+int
+__db_pg_trunc_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __db_pg_trunc_args **argpp;
+{
+ __db_pg_trunc_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ /* One allocation covers both the args struct and its DB_TXN shim. */
+ if ((ret = __os_malloc(env,
+ sizeof(__db_pg_trunc_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->meta = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_free = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->last_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->next_free = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->last_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ /* list.data aliases recbuf; it is not a separate allocation. */
+ memset(&argp->list, 0, sizeof(argp->list));
+ LOGCOPY_32(env,&argp->list.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->list.data = bp;
+ bp += argp->list.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_pg_trunc_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, db_pgno_t,
+ * PUBLIC: db_pgno_t, const DBT *));
+ */
+/*
+ * __db_pg_trunc_log --
+ *	Marshal and write a __db_pg_trunc log record describing a file
+ *	truncation (meta page, last free page, free list).  Auto-generated
+ *	by gen_rec.awk -- do not hand-edit the logic.
+ */
+int
+__db_pg_trunc_log(dbp, txnp, ret_lsnp, flags, meta, meta_lsn, last_free, last_lsn, next_free,
+ last_pgno, list)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t meta;
+ DB_LSN * meta_lsn;
+ db_pgno_t last_free;
+ DB_LSN * last_lsn;
+ db_pgno_t next_free;
+ db_pgno_t last_pgno;
+ const DBT *list;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___db_pg_trunc;
+ npad = 0;
+ ret = 0;
+
+ /* Non-durable records outside a transaction are simply dropped. */
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ /* Fixed-size prefix plus length-prefixed free-page list payload. */
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*meta_lsn)
+ + sizeof(u_int32_t)
+ + sizeof(*last_lsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (list == NULL ? 0 : list->size);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)meta;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ /*
+ * NOTE(review): the early returns after __log_check_page_lsn below
+ * leak logrec.data (and lr in the non-durable path).  Upstream
+ * gen_rec.awk output behaves identically; confirm before changing
+ * generated code.
+ */
+ if (meta_lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, meta_lsn);
+ } else
+ memset(bp, 0, sizeof(*meta_lsn));
+ bp += sizeof(*meta_lsn);
+
+ uinttmp = (u_int32_t)last_free;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (last_lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(last_lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, last_lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, last_lsn);
+ } else
+ memset(bp, 0, sizeof(*last_lsn));
+ bp += sizeof(*last_lsn);
+
+ uinttmp = (u_int32_t)next_free;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)last_pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (list == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &list->size);
+ bp += sizeof(list->size);
+ memcpy(bp, list->data, list->size);
+ bp += list->size;
+ }
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__db_pg_trunc_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __db_init_recover __P((ENV *, DB_DISTAB *));
+ */
+/*
+ * __db_init_recover --
+ *	Register the recovery function for every current-format __db log
+ *	record type in the dispatch table.  Auto-generated by gen_rec.awk.
+ *	Returns 0 or the first registration error.
+ */
+int
+__db_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_recover, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_recover, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_recover, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_recover, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_recover, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_recover, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_recover, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_recover, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_recover, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_recover, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_recover, DB___db_pg_trunc)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/db_autop.c b/db/db_autop.c
new file mode 100644
index 0000000..f3b0635
--- /dev/null
+++ b/db/db_autop.c
@@ -0,0 +1,802 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_dispatch.h"
+#include "dbinc/db_am.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __db_addrem_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_addrem_print --
+ *	Print a __db_addrem log record in human-readable form (db_printlog).
+ *	Auto-generated by gen_rec.awk; notused2/notused3 satisfy the common
+ *	recovery-function signature.
+ */
+int
+__db_addrem_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_addrem_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_addrem%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\topcode: %lu\n", (u_long)argp->opcode);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tnbytes: %lu\n", (u_long)argp->nbytes);
+ (void)printf("\thdr: ");
+ /* Dump binary DBTs byte-by-byte: printable chars as-is, rest hex. */
+ for (i = 0; i < argp->hdr.size; i++) {
+ ch = ((u_int8_t *)argp->hdr.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdbt: ");
+ for (i = 0; i < argp->dbt.size; i++) {
+ ch = ((u_int8_t *)argp->dbt.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpagelsn: [%lu][%lu]\n",
+ (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+/*
+ * __db_big_print --
+ *	Print a __db_big (overflow page) log record in human-readable form.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_big_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_big_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_big_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_big%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\topcode: %lu\n", (u_long)argp->opcode);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tprev_pgno: %lu\n", (u_long)argp->prev_pgno);
+ (void)printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno);
+ (void)printf("\tdbt: ");
+ for (i = 0; i < argp->dbt.size; i++) {
+ ch = ((u_int8_t *)argp->dbt.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpagelsn: [%lu][%lu]\n",
+ (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset);
+ (void)printf("\tprevlsn: [%lu][%lu]\n",
+ (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset);
+ (void)printf("\tnextlsn: [%lu][%lu]\n",
+ (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_ovref_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_ovref_print --
+ *	Print a __db_ovref (overflow reference-count adjust) log record.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_ovref_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_ovref_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_ovref_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_ovref%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tadjust: %ld\n", (long)argp->adjust);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_relink_42_print --
+ *	Print a release 4.2 format __db_relink log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_relink_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_relink_42_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_relink_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_relink_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\topcode: %lu\n", (u_long)argp->opcode);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tprev: %lu\n", (u_long)argp->prev);
+ (void)printf("\tlsn_prev: [%lu][%lu]\n",
+ (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlsn_next: [%lu][%lu]\n",
+ (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_debug_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_debug_print --
+ *	Print a __db_debug log record.  Note __db_debug_read takes only
+ *	(env, recbuf, argpp) -- debug records carry no fileid lookup.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_debug_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_debug_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_debug%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\top: ");
+ for (i = 0; i < argp->op.size; i++) {
+ ch = ((u_int8_t *)argp->op.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tkey: ");
+ for (i = 0; i < argp->key.size; i++) {
+ ch = ((u_int8_t *)argp->key.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\targ_flags: %lu\n", (u_long)argp->arg_flags);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_noop_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_noop_print --
+ *	Print a __db_noop log record.  Auto-generated by gen_rec.awk.
+ */
+int
+__db_noop_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_noop_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_noop_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_noop%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tprevlsn: [%lu][%lu]\n",
+ (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_alloc_42_print --
+ *	Print a release 4.2 format __db_pg_alloc log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_alloc_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_alloc_42_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_alloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_alloc_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\tpage_lsn: [%lu][%lu]\n",
+ (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tptype: %lu\n", (u_long)argp->ptype);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_alloc_print --
+ *	Print a current-format __db_pg_alloc log record (adds last_pgno
+ *	relative to the 4.2 layout).  Auto-generated by gen_rec.awk.
+ */
+int
+__db_pg_alloc_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_alloc_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_alloc%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\tpage_lsn: [%lu][%lu]\n",
+ (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tptype: %lu\n", (u_long)argp->ptype);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_free_42_print --
+ *	Print a release 4.2 format __db_pg_free log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_free_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_free_42_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_free_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_free_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_free_print --
+ *	Print a current-format __db_pg_free log record.  Auto-generated by
+ *	gen_rec.awk.
+ */
+int
+__db_pg_free_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_free_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_free_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_free%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_cksum_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_cksum_print --
+ *	Print a __db_cksum log record (header fields only; the record has
+ *	no payload).  Auto-generated by gen_rec.awk.
+ */
+int
+__db_cksum_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_cksum_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_cksum%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_freedata_42_print --
+ *	Print a release 4.2 format __db_pg_freedata log record.
+ *	Auto-generated by gen_rec.awk.
+ */
+int
+__db_pg_freedata_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_freedata_42_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_freedata_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_freedata_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_freedata_print --
+ *	Print a current-format __db_pg_freedata log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_freedata_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_freedata_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_freedata_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_freedata%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_init_print --
+ *	Print a __db_pg_init log record.  Auto-generated by gen_rec.awk.
+ */
+int
+__db_pg_init_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_init_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_init_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_init%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+/*
+ * __db_pg_sort_44_print --
+ *	Print a release 4.4 format __db_pg_sort log record.  Auto-generated
+ *	by gen_rec.awk.
+ */
+int
+__db_pg_sort_44_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_sort_44_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_sort_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_sort_44%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta: %lu\n", (u_long)argp->meta);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tlast_free: %lu\n", (u_long)argp->last_free);
+ (void)printf("\tlast_lsn: [%lu][%lu]\n",
+ (u_long)argp->last_lsn.file, (u_long)argp->last_lsn.offset);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\tlist: ");
+ for (i = 0; i < argp->list.size; i++) {
+ ch = ((u_int8_t *)argp->list.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * __db_pg_trunc_print --
+ * Pretty-print a __db_pg_trunc (page truncation) log record for the
+ * log-print utility.
+ *
+ * PUBLIC: int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__db_pg_trunc_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __db_pg_trunc_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ /* The recops/arg parameters are unused; assign to quiet compilers. */
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __db_pg_trunc_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__db_pg_trunc%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta: %lu\n", (u_long)argp->meta);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\tlast_free: %lu\n", (u_long)argp->last_free);
+ (void)printf("\tlast_lsn: [%lu][%lu]\n",
+ (u_long)argp->last_lsn.file, (u_long)argp->last_lsn.offset);
+ (void)printf("\tnext_free: %lu\n", (u_long)argp->next_free);
+ (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno);
+ (void)printf("\tlist: ");
+ /* Dump the DBT byte-by-byte, escaping unprintable characters. */
+ for (i = 0; i < argp->list.size; i++) {
+ ch = ((u_int8_t *)argp->list.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * __db_init_print --
+ * Register all of the db subsystem's log-record print routines in the
+ * dispatch table, one entry per record type.
+ *
+ * NOTE(review): __db_pg_sort_44_print is defined above but not registered
+ * here; presumably it is only dispatched when replaying old-format (4.4)
+ * logs -- confirm against the recovery dispatch setup.
+ *
+ * PUBLIC: int __db_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__db_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_addrem_print, DB___db_addrem)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_big_print, DB___db_big)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_ovref_print, DB___db_ovref)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_debug_print, DB___db_debug)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_noop_print, DB___db_noop)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_alloc_print, DB___db_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_free_print, DB___db_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_cksum_print, DB___db_cksum)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_freedata_print, DB___db_pg_freedata)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_init_print, DB___db_pg_init)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __db_pg_trunc_print, DB___db_pg_trunc)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/db/db_cam.c b/db/db_cam.c
new file mode 100644
index 0000000..4c1322d
--- /dev/null
+++ b/db/db_cam.c
@@ -0,0 +1,3460 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+static int __db_s_count __P((DB *));
+static int __db_wrlock_err __P((ENV *));
+static int __dbc_del_foreign __P((DBC *));
+static int __dbc_del_oldskey __P((DB *, DBC *, DBT *, DBT *, DBT *));
+static int __dbc_del_secondary __P((DBC *));
+static int __dbc_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_append __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_fixed_len __P((DBC *, DBT *, DBT *));
+static inline int __dbc_put_partial __P((DBC *,
+ DBT *, DBT *, DBT *, DBT *, u_int32_t *, u_int32_t));
+static int __dbc_put_primary __P((DBC *, DBT *, DBT *, u_int32_t));
+static inline int __dbc_put_resolve_key __P((DBC *,
+ DBT *, DBT *, u_int32_t *, u_int32_t));
+static inline int __dbc_put_secondaries __P((DBC *,
+ DBT *, DBT *, DBT *, int, DBT *, u_int32_t *));
+
+/*
+ * CDB_LOCKING_INIT / CDB_LOCKING_DONE --
+ * Acquire/release the CDB write lock around a cursor update.
+ *
+ * NOTE: CDB_LOCKING_INIT is deliberately NOT wrapped in do {} while (0):
+ * it expands to an if-statement that can "return" from the enclosing
+ * function, and it assigns to a local variable named "ret" which every
+ * caller must declare.
+ */
+#define CDB_LOCKING_INIT(env, dbc) \
+ /* \
+ * If we are running CDB, this had better be either a write \
+ * cursor or an immediate writer. If it's a regular writer, \
+ * that means we have an IWRITE lock and we need to upgrade \
+ * it to a write lock. \
+ */ \
+ if (CDB_LOCKING(env)) { \
+ if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER)) \
+ return (__db_wrlock_err(env)); \
+ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR) && \
+ (ret = __lock_get(env, \
+ (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \
+ DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \
+ return (ret); \
+ }
+#define CDB_LOCKING_DONE(env, dbc) \
+ /* Release the upgraded lock. */ \
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) \
+ (void)__lock_downgrade( \
+ env, &(dbc)->mylock, DB_LOCK_IWRITE, 0);
+
+/*
+ * SET_READ_LOCKING_FLAGS --
+ * Map the caller's DB_READ_COMMITTED/DB_READ_UNCOMMITTED request flags
+ * into temporary DBC_* cursor flags (in "var"), then clear them from the
+ * local "flags" variable via LF_CLR. Existing cursor isolation settings
+ * take precedence over per-call flags.
+ */
+#define SET_READ_LOCKING_FLAGS(dbc, var) do { \
+ var = 0; \
+ if (!F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED)) { \
+ if (LF_ISSET(DB_READ_COMMITTED)) \
+ var = DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED; \
+ if (LF_ISSET(DB_READ_UNCOMMITTED)) \
+ var = DBC_READ_UNCOMMITTED; \
+ } \
+ LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); \
+} while (0)
+
+/*
+ * __dbc_close --
+ * DBC->close.
+ *
+ * Removes the cursor (and any off-page duplicate cursor hanging off it)
+ * from the database's active queue, runs the access-method close routine,
+ * releases CDB locks, and parks the cursor structures on the free queue
+ * for reuse.
+ *
+ * PUBLIC: int __dbc_close __P((DBC *));
+ */
+int
+__dbc_close(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBC *opd;
+ DBC_INTERNAL *cp;
+ DB_TXN *txn;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = dbc->internal;
+ opd = cp->opd;
+ ret = 0;
+
+ /*
+ * Remove the cursor(s) from the active queue. We may be closing two
+ * cursors at once here, a top-level one and a lower-level, off-page
+ * duplicate one. The access-method specific cursor close routine must
+ * close both of them in a single call.
+ *
+ * !!!
+ * Cursors must be removed from the active queue before calling the
+ * access specific cursor close routine, btree depends on having that
+ * order of operations.
+ */
+ MUTEX_LOCK(env, dbp->mutex);
+
+ if (opd != NULL) {
+ DB_ASSERT(env, F_ISSET(opd, DBC_ACTIVE));
+ F_CLR(opd, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, opd, links);
+ }
+ DB_ASSERT(env, F_ISSET(dbc, DBC_ACTIVE));
+ F_CLR(dbc, DBC_ACTIVE);
+ TAILQ_REMOVE(&dbp->active_queue, dbc, links);
+
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Call the access specific cursor close routine. */
+ if ((t_ret =
+ dbc->am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * Release the lock after calling the access method specific close
+ * routine, a Btree cursor may have had pending deletes.
+ */
+ if (CDB_LOCKING(env)) {
+ /*
+ * Also, be sure not to free anything if mylock.off is
+ * INVALID; in some cases, such as idup'ed read cursors
+ * and secondary update cursors, a cursor in a CDB
+ * environment may not have a lock at all.
+ */
+ if ((t_ret = __LPUT(dbc, dbc->mylock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* For safety's sake, since this is going on the free queue. */
+ memset(&dbc->mylock, 0, sizeof(dbc->mylock));
+ if (opd != NULL)
+ memset(&opd->mylock, 0, sizeof(opd->mylock));
+ }
+
+ /* The top-level cursor counts against its transaction's cursor count. */
+ if ((txn = dbc->txn) != NULL)
+ txn->cursors--;
+
+ /* Move the cursor(s) to the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ if (opd != NULL) {
+ /* The off-page duplicate cursor was counted separately. */
+ if (txn != NULL)
+ txn->cursors--;
+ TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links);
+ opd = NULL;
+ }
+ TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /*
+ * A private (internally created) transaction is committed as soon as
+ * its last cursor is closed.
+ */
+ if (txn != NULL && F_ISSET(txn, TXN_PRIVATE) && txn->cursors == 0 &&
+ (t_ret = __txn_commit(txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __dbc_destroy --
+ * Destroy the cursor, called after DBC->close.
+ *
+ * Frees the cursor's cached return-memory buffers, runs the access-method
+ * destroy routine, releases the cursor's locker id (if it owns one), and
+ * frees the cursor structure itself.
+ *
+ * PUBLIC: int __dbc_destroy __P((DBC *));
+ */
+int
+__dbc_destroy(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Remove the cursor from the free queue. */
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_REMOVE(&dbp->free_queue, dbc, links);
+ MUTEX_UNLOCK(env, dbp->mutex);
+
+ /* Free up allocated memory. */
+ if (dbc->my_rskey.data != NULL)
+ __os_free(env, dbc->my_rskey.data);
+ if (dbc->my_rkey.data != NULL)
+ __os_free(env, dbc->my_rkey.data);
+ if (dbc->my_rdata.data != NULL)
+ __os_free(env, dbc->my_rdata.data);
+
+ /* Call the access specific cursor destroy routine. */
+ ret = dbc->am_destroy == NULL ? 0 : dbc->am_destroy(dbc);
+
+ /*
+ * Release the lock id for this cursor, but only if this cursor
+ * allocated its own locker id (DBC_OWN_LID).
+ */
+ if (LOCKING_ON(env) &&
+ F_ISSET(dbc, DBC_OWN_LID) &&
+ (t_ret = __lock_id_free(env, dbc->lref)) != 0 && ret == 0)
+ ret = t_ret;
+
+ __os_free(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __dbc_cmp --
+ * Compare the position of two cursors. Return whether two cursors are
+ * pointing to the same key/data pair.
+ *
+ * result == 0 if both cursors refer to the same item.
+ * result == 1 otherwise
+ *
+ * PUBLIC: int __dbc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__dbc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DBC *curr_dbc, *curr_odbc;
+ DBC_INTERNAL *dbc_int, *odbc_int;
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+ ret = 0;
+
+#ifdef HAVE_PARTITION
+ /* For partitioned databases compare the underlying sub-cursors. */
+ if (DB_IS_PARTITIONED(dbc->dbp)) {
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+ other_dbc = ((PART_CURSOR *)other_dbc->internal)->sub_cursor;
+ }
+ /* Both cursors must still be valid. */
+ if (dbc == NULL || other_dbc == NULL) {
+ __db_errx(env,
+"Both cursors must be initialized before calling DBC->cmp.");
+ return (EINVAL);
+ }
+
+ /* Cursors on different partitions cannot reference the same item. */
+ if (dbc->dbp != other_dbc->dbp) {
+ *result = 1;
+ return (0);
+ }
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_cmp(dbc, other_dbc, result));
+#endif
+
+ curr_dbc = dbc;
+ curr_odbc = other_dbc;
+ dbc_int = dbc->internal;
+ odbc_int = other_dbc->internal;
+
+ /* Both cursors must be on valid positions. */
+ if (dbc_int->pgno == PGNO_INVALID || odbc_int->pgno == PGNO_INVALID) {
+ __db_errx(env,
+"Both cursors must be initialized before calling DBC->cmp.");
+ return (EINVAL);
+ }
+
+ /*
+ * Use a loop since cursors can be nested. Off page duplicate
+ * sets can only be nested one level deep, so it is safe to use a
+ * while (true) loop.
+ */
+ while (1) {
+ if (dbc_int->pgno == odbc_int->pgno &&
+ dbc_int->indx == odbc_int->indx) {
+ /*
+ * If one cursor is sitting on an off page duplicate
+ * set, the other will be pointing to the same set. Be
+ * careful, and check anyway.
+ */
+ if (dbc_int->opd != NULL && odbc_int->opd != NULL) {
+ /* Descend into the off-page duplicate tree. */
+ curr_dbc = dbc_int->opd;
+ curr_odbc = odbc_int->opd;
+ dbc_int = dbc_int->opd->internal;
+ odbc_int= odbc_int->opd->internal;
+ continue;
+ } else if (dbc_int->opd == NULL &&
+ odbc_int->opd == NULL)
+ *result = 0;
+ else {
+ __db_errx(env,
+ "DBCursor->cmp mismatched off page duplicate cursor pointers.");
+ return (EINVAL);
+ }
+
+ /* Let the access method refine the match. */
+ switch (curr_dbc->dbtype) {
+ case DB_HASH:
+ /*
+ * Make sure that on-page duplicate data
+ * indexes match, and that the deleted
+ * flags are consistent.
+ */
+ ret = __hamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ /*
+ * Check for consistent deleted flags on btree
+ * specific cursors.
+ */
+ ret = __bamc_cmp(curr_dbc, curr_odbc, result);
+ break;
+ default:
+ /* NO-OP break out. */
+ break;
+ }
+ } else
+ *result = 1;
+ return (ret);
+ }
+ /* NOTREACHED. */
+ return (ret);
+}
+
+/*
+ * __dbc_count --
+ * Return a count of duplicate data items.
+ *
+ * Queue and Recno databases never have duplicates, so the count is
+ * always 1; Hash on-page duplicates are counted by the hash code, and
+ * everything else (including Hash off-page duplicates) goes through
+ * the btree counter.
+ *
+ * PUBLIC: int __dbc_count __P((DBC *, db_recno_t *));
+ */
+int
+__dbc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbc->dbp))
+ dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor;
+#endif
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+ switch (dbc->dbtype) {
+ case DB_QUEUE:
+ case DB_RECNO:
+ *recnop = 1;
+ break;
+ case DB_HASH:
+ if (dbc->internal->opd == NULL) {
+ if ((ret = __hamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_BTREE:
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_count(dbc, recnop));
+#endif
+ if ((ret = __bamc_count(dbc, recnop)) != 0)
+ return (ret);
+ break;
+ case DB_UNKNOWN:
+ default:
+ return (__db_unknown_type(env, "__dbc_count", dbc->dbtype));
+ }
+ return (0);
+}
+
+/*
+ * __dbc_del --
+ * DBC->del.
+ *
+ * Handles secondary-index redirection, foreign-key constraint checks and
+ * secondary-key maintenance before delegating the actual delete to
+ * __dbc_idel (or the compressed-btree equivalent).
+ *
+ * PUBLIC: int __dbc_del __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ ENV *env;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* May return early; uses the local "ret" on the CDB upgrade path. */
+ CDB_LOCKING_INIT(env, dbc);
+
+ /*
+ * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set
+ * (which it only is if we're being called from a primary update),
+ * then we need to call through to the primary and delete the item.
+ *
+ * Note that this will delete the current item; we don't need to
+ * delete it ourselves as well, so we can just goto done.
+ */
+ if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) {
+ ret = __dbc_del_secondary(dbc);
+ goto done;
+ }
+
+ /*
+ * If we are a foreign db, go through and check any foreign key
+ * constraints first, which will make rolling back changes on an abort
+ * simpler.
+ */
+ if (LIST_FIRST(&dbp->f_primaries) != NULL &&
+ (ret = __dbc_del_foreign(dbc)) != 0)
+ goto done;
+
+ /*
+ * If we are a primary and have secondary indices, go through
+ * and delete any secondary keys that point at the current record.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ (ret = __dbc_del_primary(dbc)) != 0)
+ goto done;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ ret = __bamc_compress_del(dbc, flags);
+ else
+#endif
+ ret = __dbc_idel(dbc, flags);
+
+done: CDB_LOCKING_DONE(env, dbc);
+
+ return (ret);
+}
+
+/*
+ * __dbc_idel --
+ * Implementation of DBC->del.
+ *
+ * PUBLIC: int __dbc_idel __P((DBC *, u_int32_t));
+ */
+int
+__dbc_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *opd;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are not duplicated and will not be cleaned up on return.
+ * So, pages/locks that the cursor references must be resolved by the
+ * underlying functions.
+ */
+
+ /*
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the del operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ opd = dbc->internal->opd;
+ if (opd == NULL)
+ ret = dbc->am_del(dbc, flags);
+ else if ((ret = dbc->am_writelock(dbc)) == 0)
+ ret = opd->am_del(opd, flags);
+
+ /*
+ * If this was an update that is supporting dirty reads
+ * then we may have just swapped our read for a write lock
+ * which is held by the surviving cursor. We need
+ * to explicitly downgrade this lock. The closed cursor
+ * may only have had a read lock.
+ */
+ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ dbc->internal->lock_mode == DB_LOCK_WRITE) {
+ if ((t_ret =
+ __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ /* Only record the downgrade if the lock release succeeded. */
+ if (t_ret == 0)
+ dbc->internal->lock_mode = DB_LOCK_WWRITE;
+ if (dbc->internal->page != NULL && (t_ret =
+ __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+#ifdef HAVE_COMPRESSION
+/*
+ * __dbc_bulk_del --
+ * Bulk del for a cursor.
+ *
+ * Only implemented for compressed BTrees. In this file in order to
+ * use the CDB_LOCKING_* macros.
+ *
+ * PUBLIC: #ifdef HAVE_COMPRESSION
+ * PUBLIC: int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__dbc_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ DB_ASSERT(env, DB_IS_COMPRESSED(dbc->dbp));
+
+ /* May return early; uses the local "ret" on the CDB upgrade path. */
+ CDB_LOCKING_INIT(env, dbc);
+
+ ret = __bamc_compress_bulk_del(dbc, key, flags);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ return (ret);
+}
+#endif
+
+/*
+ * __dbc_dup --
+ * Duplicate a cursor
+ *
+ * Duplicates the top-level cursor and, if it has an off-page duplicate
+ * cursor, that cursor as well, linking the pair together. On any error
+ * both new cursors are closed and no cursor is returned.
+ *
+ * PUBLIC: int __dbc_dup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig;
+ DBC **dbcp;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *dbc_nopd;
+ int ret;
+
+ dbc_n = dbc_nopd = NULL;
+
+ /* Allocate a new cursor and initialize it. */
+ if ((ret = __dbc_idup(dbc_orig, &dbc_n, flags)) != 0)
+ goto err;
+ *dbcp = dbc_n;
+
+ /*
+ * If the cursor references an off-page duplicate tree, allocate a
+ * new cursor for that tree and initialize it.
+ */
+ if (dbc_orig->internal->opd != NULL) {
+ if ((ret =
+ __dbc_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0)
+ goto err;
+ dbc_n->internal->opd = dbc_nopd;
+ dbc_nopd->internal->pdbc = dbc_n;
+ }
+ return (0);
+
+err: if (dbc_n != NULL)
+ (void)__dbc_close(dbc_n);
+ if (dbc_nopd != NULL)
+ (void)__dbc_close(dbc_nopd);
+
+ return (ret);
+}
+
+/*
+ * __dbc_idup --
+ * Internal version of __dbc_dup.
+ *
+ * Allocates a new cursor sharing the original's locker, optionally
+ * copies its position (DB_POSITION), and carries over its locking
+ * related flags. Does not duplicate an off-page duplicate cursor;
+ * that is __dbc_dup's job.
+ *
+ * PUBLIC: int __dbc_idup __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_idup(dbc_orig, dbcp, flags)
+ DBC *dbc_orig, **dbcp;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *dbc_n;
+ DBC_INTERNAL *int_n, *int_orig;
+ ENV *env;
+ int ret;
+
+ dbp = dbc_orig->dbp;
+ dbc_n = *dbcp;
+ env = dbp->env;
+
+ if ((ret = __db_cursor_int(dbp, dbc_orig->thread_info,
+ dbc_orig->txn, dbc_orig->dbtype, dbc_orig->internal->root,
+ F_ISSET(dbc_orig, DBC_OPD) | DBC_DUPLICATE,
+ dbc_orig->locker, &dbc_n)) != 0)
+ return (ret);
+
+ /* Position the cursor if requested, acquiring the necessary locks. */
+ if (LF_ISSET(DB_POSITION)) {
+ int_n = dbc_n->internal;
+ int_orig = dbc_orig->internal;
+
+ /* The new cursor never owns the locker id; mask that flag. */
+ dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID;
+
+ int_n->indx = int_orig->indx;
+ int_n->pgno = int_orig->pgno;
+ int_n->root = int_orig->root;
+ int_n->lock_mode = int_orig->lock_mode;
+
+ /* Carry over blob/overflow streaming position as well. */
+ int_n->stream_start_pgno = int_orig->stream_start_pgno;
+ int_n->stream_off = int_orig->stream_off;
+ int_n->stream_curr_pgno = int_orig->stream_curr_pgno;
+
+ switch (dbc_orig->dbtype) {
+ case DB_QUEUE:
+ if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_BTREE:
+ case DB_RECNO:
+ if ((ret = __bamc_dup(dbc_orig, dbc_n, flags)) != 0)
+ goto err;
+ break;
+ case DB_HASH:
+ if ((ret = __hamc_dup(dbc_orig, dbc_n)) != 0)
+ goto err;
+ break;
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(env,
+ "__dbc_idup", dbc_orig->dbtype);
+ goto err;
+ }
+ } else if (F_ISSET(dbc_orig, DBC_BULK)) {
+ /*
+ * For bulk cursors, remember what page we're on, even if we
+ * don't know that the next operation will be nearby.
+ */
+ dbc_n->internal->pgno = dbc_orig->internal->pgno;
+ }
+
+ /* Copy the locking flags to the new cursor. */
+ F_SET(dbc_n, F_ISSET(dbc_orig, DBC_BULK |
+ DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED | DBC_WRITECURSOR));
+
+ /*
+ * If we're in CDB and this isn't an offpage dup cursor, then
+ * we need to get a lock for the duplicated cursor.
+ */
+ if (CDB_LOCKING(env) && !F_ISSET(dbc_n, DBC_OPD) &&
+ (ret = __lock_get(env, dbc_n->locker, 0,
+ &dbc_n->lock_dbt, F_ISSET(dbc_orig, DBC_WRITECURSOR) ?
+ DB_LOCK_IWRITE : DB_LOCK_READ, &dbc_n->mylock)) != 0)
+ goto err;
+
+ dbc_n->priority = dbc_orig->priority;
+ dbc_n->internal->pdbc = dbc_orig->internal->pdbc;
+ *dbcp = dbc_n;
+ return (0);
+
+err: (void)__dbc_close(dbc_n);
+ return (ret);
+}
+
+/*
+ * __dbc_newopd --
+ * Create a new off-page duplicate cursor.
+ *
+ * Off-page duplicate sets are Recno trees unless the application
+ * supplied a duplicate comparison function, in which case they are
+ * Btrees.
+ *
+ * PUBLIC: int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **));
+ */
+int
+__dbc_newopd(dbc_parent, root, oldopd, dbcp)
+ DBC *dbc_parent;
+ db_pgno_t root;
+ DBC *oldopd;
+ DBC **dbcp;
+{
+ DB *dbp;
+ DBC *opd;
+ DBTYPE dbtype;
+ int ret;
+
+ dbp = dbc_parent->dbp;
+ dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE;
+
+ /*
+ * On failure, we want to default to returning the old off-page dup
+ * cursor, if any; our caller can't be left with a dangling pointer
+ * to a freed cursor. On error the only allowable behavior is to
+ * close the cursor (and the old OPD cursor it in turn points to), so
+ * this should be safe.
+ */
+ *dbcp = oldopd;
+
+ if ((ret = __db_cursor_int(dbp, dbc_parent->thread_info,
+ dbc_parent->txn,
+ dbtype, root, DBC_OPD, dbc_parent->locker, &opd)) != 0)
+ return (ret);
+
+ opd->priority = dbc_parent->priority;
+ opd->internal->pdbc = dbc_parent;
+ *dbcp = opd;
+
+ /*
+ * Check to see if we already have an off-page dup cursor that we've
+ * passed in. If we do, close it. It'd be nice to use it again
+ * if it's a cursor belonging to the right tree, but if we're doing
+ * a cursor-relative operation this might not be safe, so for now
+ * we'll take the easy way out and always close and reopen.
+ *
+ * Note that under no circumstances do we want to close the old
+ * cursor without returning a valid new one; we don't want to
+ * leave the main cursor in our caller with a non-NULL pointer
+ * to a freed off-page dup cursor.
+ */
+ if (oldopd != NULL && (ret = __dbc_close(oldopd)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __dbc_get --
+ * Get using a cursor.
+ *
+ * Thin dispatcher: partitioned databases and compressed btrees have
+ * their own get paths; everything else goes to __dbc_iget.
+ *
+ * PUBLIC: int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+#ifdef HAVE_PARTITION
+ if (F_ISSET(dbc, DBC_PARTITIONED))
+ return (__partc_get(dbc, key, data, flags));
+#endif
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp))
+ return (__bamc_compress_get(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iget(dbc, key, data, flags));
+}
+
+/*
+ * __dbc_iget --
+ * Implementation of get using a cursor.
+ *
+ * Works on a duplicate of the caller's cursor so the caller's position
+ * is untouched on error; handles off-page duplicate (OPD) sub-cursors,
+ * DB_MULTIPLE bulk retrieval, CDB locking for DB_CONSUME, and the
+ * DB_BUFFER_SMALL size-reporting protocol.
+ *
+ * PUBLIC: int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *ddbc, *dbc_n, *opd;
+ DBC_INTERNAL *cp, *cp_n;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t pgno;
+ db_indx_t indx_off;
+ u_int32_t multi, orig_ulen, tmp_flags, tmp_read_locking, tmp_rmw;
+ u_int8_t type;
+ int key_small, ret, t_ret;
+
+ COMPQUIET(orig_ulen, 0);
+
+ key_small = 0;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ dbc_n = NULL;
+ opd = NULL;
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ tmp_rmw = LF_ISSET(DB_RMW);
+ LF_CLR(DB_RMW);
+
+ SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ /*
+ * Return a cursor's record number. It has nothing to do with the
+ * cursor get code except that it was put into the interface.
+ */
+ if (flags == DB_GET_RECNO) {
+ if (tmp_rmw)
+ F_SET(dbc, DBC_RMW);
+ F_SET(dbc, tmp_read_locking);
+ ret = __bamc_rget(dbc, data);
+ if (tmp_rmw)
+ F_CLR(dbc, DBC_RMW);
+ /* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+ F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+ return (ret);
+ }
+
+ /* DB_CONSUME is a write operation, so CDB needs the write lock. */
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_INIT(env, dbc);
+
+ /* Don't return the key or data if it was passed to us. */
+ if (!DB_RETURNS_A_KEY(dbp, flags))
+ F_SET(key, DB_DBT_ISSET);
+ if (flags == DB_GET_BOTH &&
+ (dbp->dup_compare == NULL || dbp->dup_compare == __bam_defcmp))
+ F_SET(data, DB_DBT_ISSET);
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the DB_RMW flag was specified and the get
+ * operation is done in an off-page duplicate tree, call the primary
+ * cursor's upgrade routine first.
+ */
+ cp = dbc->internal;
+ if (cp->opd != NULL &&
+ (flags == DB_CURRENT || flags == DB_GET_BOTHC ||
+ flags == DB_NEXT || flags == DB_NEXT_DUP ||
+ flags == DB_PREV || flags == DB_PREV_DUP)) {
+ if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0)
+ goto err;
+ /* Transient cursors can be used directly, no dup needed. */
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ opd = cp->opd;
+ else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0)
+ goto err;
+
+ if ((ret = opd->am_get(opd, key, data, flags, NULL)) == 0)
+ goto done;
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND &&
+ (flags == DB_PREV || flags == DB_NEXT)) {
+ ret = __dbc_close(opd);
+ opd = NULL;
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ cp->opd = NULL;
+ }
+ if (ret != 0)
+ goto err;
+ } else if (cp->opd != NULL && F_ISSET(dbc, DBC_TRANSIENT)) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto err;
+ cp->opd = NULL;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * upgrade the lock as required, and call the underlying function.
+ */
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ /* Relative moves need the duplicate positioned at the same item. */
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ tmp_flags = 0;
+ break;
+ }
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ ret = __dbc_idup(dbc, &dbc_n, tmp_flags);
+
+ if (ret != 0)
+ goto err;
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if (tmp_rmw)
+ F_SET(dbc_n, DBC_RMW);
+ F_SET(dbc_n, tmp_read_locking);
+
+ switch (multi) {
+ case DB_MULTIPLE:
+ F_SET(dbc_n, DBC_MULTIPLE);
+ break;
+ case DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE_KEY);
+ break;
+ case DB_MULTIPLE | DB_MULTIPLE_KEY:
+ F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ break;
+ case 0:
+ default:
+ break;
+ }
+
+retry: pgno = PGNO_INVALID;
+ ret = dbc_n->am_get(dbc_n, key, data, flags, &pgno);
+ if (tmp_rmw)
+ F_CLR(dbc_n, DBC_RMW);
+ /*
+ * Clear the temporary locking flags in the new cursor. The user's
+ * (old) cursor needs to have the WAS_READ_COMMITTED flag because this
+ * is used on the next call on that cursor.
+ */
+ F_CLR(dbc_n, tmp_read_locking);
+ F_SET(dbc, tmp_read_locking & DBC_WAS_READ_COMMITTED);
+ F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+ if (ret != 0)
+ goto err;
+
+ cp_n = dbc_n->internal;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ if ((ret = __dbc_newopd(dbc,
+ pgno, cp_n->opd, &cp_n->opd)) != 0)
+ goto err;
+
+ /* Map the top-level operation onto the OPD sub-tree. */
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET:
+ case DB_SET_RECNO:
+ case DB_SET_RANGE:
+ tmp_flags = DB_FIRST;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ tmp_flags = DB_LAST;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ tmp_flags = flags;
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__dbc_get", flags);
+ goto err;
+ }
+ ret = cp_n->opd->am_get(cp_n->opd, key, data, tmp_flags, NULL);
+ /*
+ * Another cursor may have deleted all of the off-page
+ * duplicates, so for operations that are moving a cursor, we
+ * need to skip the empty tree and retry on the parent cursor.
+ */
+ if (ret == DB_NOTFOUND) {
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ flags = DB_NEXT;
+ break;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ flags = DB_PREV;
+ break;
+ default:
+ goto err;
+ }
+
+ ret = __dbc_close(cp_n->opd);
+ cp_n->opd = NULL;
+ if (ret == 0)
+ goto retry;
+ }
+ if (ret != 0)
+ goto err;
+ }
+
+done: /*
+ * Return a key/data item. The only exception is that we don't return
+ * a key if the user already gave us one, that is, if the DB_SET flag
+ * was set. The DB_SET flag is necessary. In a Btree, the user's key
+ * doesn't have to be the same as the key stored in the tree, depending
+ * on the magic performed by the comparison function. As we may not
+ * have done any key-oriented operation here, the page reference may
+ * not be valid. Fill it in as necessary. We don't have to worry
+ * about any locks, the cursor must already be holding appropriate
+ * locks.
+ *
+ * XXX
+ * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key
+ * either, should we?
+ */
+ cp_n = dbc_n == NULL ? dbc->internal : dbc_n->internal;
+ if (!F_ISSET(key, DB_DBT_ISSET)) {
+ if (cp_n->page == NULL && (ret = __memp_fget(mpf, &cp_n->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp_n->page)) != 0)
+ goto err;
+
+ if ((ret = __db_ret(dbc, cp_n->page, cp_n->indx, key,
+ &dbc->rkey->data, &dbc->rkey->ulen)) != 0) {
+ /*
+ * If the key DBT is too small, we still want to return
+ * the size of the data. Otherwise applications are
+ * forced to check each one with a separate call. We
+ * don't want to copy the data, so we set the ulen to
+ * zero before calling __db_ret.
+ */
+ if (ret == DB_BUFFER_SMALL &&
+ F_ISSET(data, DB_DBT_USERMEM)) {
+ key_small = 1;
+ orig_ulen = data->ulen;
+ data->ulen = 0;
+ } else
+ goto err;
+ }
+ }
+ if (multi != 0 && dbc->am_bulk != NULL) {
+ /*
+ * Even if fetching from the OPD cursor we need a duplicate
+ * primary cursor if we are going after multiple keys.
+ */
+ if (dbc_n == NULL) {
+ /*
+ * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor,
+ * so it's safe to just use dbc, unless the cursor
+ * has an open off-page duplicate cursor whose state
+ * might need to be preserved.
+ */
+ if ((!(multi & DB_MULTIPLE_KEY) &&
+ dbc->internal->opd == NULL) ||
+ F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_idup(dbc,
+ &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ if ((ret = dbc_n->am_get(dbc_n,
+ key, data, DB_CURRENT, &pgno)) != 0)
+ goto err;
+ }
+ cp_n = dbc_n->internal;
+ }
+
+ /*
+ * If opd is set then we dupped the opd that we came in with.
+ * When we return we may have a new opd if we went to another
+ * key.
+ */
+ if (opd != NULL) {
+ DB_ASSERT(env, cp_n->opd == NULL);
+ cp_n->opd = opd;
+ opd = NULL;
+ }
+
+ /*
+ * Bulk get doesn't use __db_retcopy, so data.size won't
+ * get set up unless there is an error. Assume success
+ * here. This is the only call to am_bulk, and it avoids
+ * setting it exactly the same everywhere. If we have an
+ * DB_BUFFER_SMALL error, it'll get overwritten with the
+ * needed value.
+ */
+ data->size = data->ulen;
+ ret = dbc_n->am_bulk(dbc_n, data, flags | multi);
+ } else if (!F_ISSET(data, DB_DBT_ISSET)) {
+ /* Fetch the data from whichever cursor holds the item. */
+ ddbc = opd != NULL ? opd :
+ cp_n->opd != NULL ? cp_n->opd : dbc_n;
+ cp = ddbc->internal;
+ if (cp->page == NULL &&
+ (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, ddbc->txn, 0, &cp->page)) != 0)
+ goto err;
+
+ type = TYPE(cp->page);
+ indx_off = ((type == P_LBTREE ||
+ type == P_HASH || type == P_HASH_UNSORTED) ? O_INDX : 0);
+ ret = __db_ret(ddbc, cp->page, cp->indx + indx_off,
+ data, &dbc->rdata->data, &dbc->rdata->ulen);
+ }
+
+err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ /* Cleanup and cursor resolution. */
+ if (opd != NULL) {
+ /*
+ * To support dirty reads we must reget the write lock
+ * if we have just stepped off a deleted record.
+ * Since the OPD cursor does not know anything
+ * about the referencing page or cursor we need
+ * to peek at the OPD cursor and get the lock here.
+ */
+ if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ if ((t_ret =
+ dbc->am_writelock(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_cleanup(
+ dbc->internal->opd, opd, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ if (key_small) {
+ data->ulen = orig_ulen;
+ if (ret == 0)
+ ret = DB_BUFFER_SMALL;
+ }
+
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+
+ if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+ CDB_LOCKING_DONE(env, dbc);
+ return (ret);
+}
+
+/* Internal flags shared by the dbc_put functions. */
+#define DBC_PUT_RMW 0x001 /* Use DB_RMW on internal gets. */
+#define DBC_PUT_NODEL 0x002 /* No old record exists; skip deletes. */
+#define DBC_PUT_HAVEREC 0x004 /* Old primary record has been fetched. */
+
+/*
+ * __dbc_put_resolve_key --
+ * Get the current key and data so that we can correctly update the
+ * secondary and foreign databases.
+ *
+ * dbc: primary cursor, already positioned on the record to overwrite.
+ * oldkey/olddata: filled in with the record currently under the cursor.
+ * put_statep: in/out DBC_PUT_* state; DBC_PUT_HAVEREC is set on success.
+ * flags: must be DB_CURRENT (asserted below).
+ *
+ * Returns 0 on success; DB_KEYEMPTY from the underlying get is mapped
+ * to DB_NOTFOUND because the cursor sits on a deleted item.
+ */
+static inline int
+__dbc_put_resolve_key(dbc, oldkey, olddata, put_statep, flags)
+ DBC *dbc;
+ DBT *oldkey, *olddata;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ int ret, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ /* Ask for a write lock on the get when the caller set DBC_PUT_RMW. */
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ DB_ASSERT(env, flags == DB_CURRENT);
+ COMPQUIET(flags, 0);
+
+ /*
+ * This is safe to do on the cursor we already have;
+ * error or no, it won't move.
+ *
+ * We use DB_RMW for all of these gets because we'll be
+ * writing soon enough in the "normal" put code. In
+ * transactional databases we'll hold those write locks
+ * even if we close the cursor we're reading with.
+ *
+ * The DB_KEYEMPTY return needs special handling -- if the
+ * cursor is on a deleted key, we return DB_NOTFOUND.
+ */
+ memset(oldkey, 0, sizeof(DBT));
+ if ((ret = __dbc_get(dbc, oldkey, olddata, rmw | DB_CURRENT)) != 0)
+ return (ret == DB_KEYEMPTY ? DB_NOTFOUND : ret);
+
+ /* Record that we've looked for the old record. */
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ return (0);
+}
+
+/*
+ * __dbc_put_append --
+ * Handle an append to a primary.
+ *
+ * key: output; receives the key generated by the append.
+ * data: the datum to append (operated on via a local copy).
+ * put_statep: in/out DBC_PUT_* state; DBC_PUT_NODEL is set on success
+ * because an append can never be overwriting an existing item.
+ * flags: must be DB_APPEND (asserted below).
+ */
+static inline int
+__dbc_put_append(dbc, key, data, put_statep, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags, *put_statep;
+{
+ DB *dbp;
+ ENV *env;
+ DBC *dbc_n;
+ DBT tdata;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+ dbc_n = NULL;
+
+ DB_ASSERT(env, flags == DB_APPEND);
+ COMPQUIET(flags, 0);
+
+ /*
+ * With DB_APPEND, we need to do the insert to populate the key value.
+ * So we swap the 'normal' order of updating secondary / verifying
+ * foreign databases and inserting.
+ *
+ * If there is an append callback, the value stored in data->data may
+ * be replaced and then freed. To avoid passing a freed pointer back
+ * to the user, just operate on a copy of the data DBT.
+ */
+ tdata = *data;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * Append isn't a normal put operation; call the appropriate access
+ * method's append function. Only Queue and Recno support append.
+ */
+ switch (dbp->type) {
+ case DB_QUEUE:
+ if ((ret = __qam_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ case DB_RECNO:
+ if ((ret = __ram_append(dbc_n, key, &tdata)) != 0)
+ goto err;
+ break;
+ default:
+ /* The interface should prevent this. */
+ DB_ASSERT(env,
+ dbp->type == DB_QUEUE || dbp->type == DB_RECNO);
+
+ ret = __db_ferr(env, "DBC->put", 0);
+ goto err;
+ }
+
+ /*
+ * The append callback, if one exists, may have allocated a new
+ * tdata.data buffer. If so, free it.
+ */
+ FREE_IF_NEEDED(env, &tdata);
+
+ /*
+ * The key value may have been generated by the above operation, but
+ * not set in the data buffer. Make sure it is there so that secondary
+ * updates can complete.
+ */
+ if ((ret = __dbt_usercopy(env, key)) != 0)
+ goto err;
+
+ /* An append cannot be replacing an existing item. */
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+
+err: if (dbc_n != NULL &&
+ (t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_put_partial --
+ * Ensure that the data item we are using is complete and correct.
+ * Otherwise we could break the secondary constraints.
+ *
+ * pkey: the primary key being put.
+ * data: the caller's partial data DBT (DB_DBT_PARTIAL is set).
+ * orig_data: filled in with the current record, if one exists.
+ * out_data: on success, the complete record built from orig_data
+ * overlaid with the partial data (zero-padded if needed).
+ * put_statep: in/out DBC_PUT_* state; DBC_PUT_HAVEREC and/or
+ * DBC_PUT_NODEL may be set here.
+ */
+static inline int
+__dbc_put_partial(dbc, pkey, data, orig_data, out_data, put_statep, flags)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *out_data;
+ u_int32_t *put_statep, flags;
+{
+ DB *dbp;
+ DBC *pdbc;
+ ENV *env;
+ int ret, rmw, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ if (!FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL)) {
+ /*
+ * We're going to have to search the tree for the
+ * specified key. Dup a cursor (so we have the same
+ * locking info) and do a c_get.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * When doing a put with DB_CURRENT, partial data items have
+ * already been resolved.
+ */
+ DB_ASSERT(env, flags != DB_CURRENT);
+
+ F_SET(pkey, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, pkey, orig_data, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ /* No existing record: nothing to delete later. */
+ FLD_SET(*put_statep, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ /*
+ * Don't let a failure closing the duplicate cursor clobber
+ * a more meaningful earlier error; this matches the error
+ * accumulation convention used elsewhere in this file.
+ */
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ FLD_SET(*put_statep, DBC_PUT_HAVEREC);
+ }
+
+ COMPQUIET(flags, 0);
+
+ /*
+ * Now build the new datum from orig_data and the partial data
+ * we were given. It's okay to do this if no record was
+ * returned above: a partial put on an empty record is allowed,
+ * if a little strange. The data is zero-padded.
+ */
+ return (__db_buildpartial(dbp, orig_data, data, out_data));
+}
+
+/*
+ * __dbc_put_fixed_len --
+ * Handle padding for fixed-length records.
+ *
+ * data: the caller's data DBT.
+ * out_data: the record to be stored; its buffer may be (re)allocated
+ * here so the record is exactly re_len bytes, padded with re_pad.
+ *
+ * Returns 0, an allocation error, or the __db_rec_toobig error if the
+ * record exceeds the fixed record length.
+ */
+static inline int
+__dbc_put_fixed_len(dbc, data, out_data)
+ DBC *dbc;
+ DBT *data, *out_data;
+{
+ DB *dbp;
+ ENV *env;
+ int re_pad, ret;
+ u_int32_t re_len, size;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Handle fixed-length records. If the primary database has
+ * fixed-length records, we need to pad out the datum before
+ * we pass it into the callback function; we always index the
+ * "real" record.
+ */
+ if (dbp->type == DB_QUEUE) {
+ re_len = ((QUEUE *)dbp->q_internal)->re_len;
+ re_pad = ((QUEUE *)dbp->q_internal)->re_pad;
+ } else {
+ re_len = ((BTREE *)dbp->bt_internal)->re_len;
+ re_pad = ((BTREE *)dbp->bt_internal)->re_pad;
+ }
+
+ size = data->size;
+ if (size > re_len) {
+ ret = __db_rec_toobig(env, size, re_len);
+ return (ret);
+ } else if (size < re_len) {
+ /*
+ * If we're not doing a partial put, copy data->data into
+ * out_data->data, then pad out out_data->data. This overrides
+ * the assignment made above, which is used in the more common
+ * case when padding is not needed.
+ *
+ * If we're doing a partial put, the data we want are already
+ * in out_data.data; we just need to pad.
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __os_realloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ /*
+ * In the partial case, we have built the item into
+ * out_data already using __db_buildpartial. Just need
+ * to pad from the end of out_data, not from data->size.
+ */
+ size = out_data->size;
+ /*
+ * The record built by the partial put may itself be
+ * longer than the fixed record length (a partial put
+ * can extend the record). Catch that here: otherwise
+ * re_len - size underflows below and the memset
+ * scribbles far past the end of the buffer.
+ */
+ if (size > re_len)
+ return (__db_rec_toobig(env, size, re_len));
+ } else {
+ if ((ret = __os_malloc(
+ env, re_len, &out_data->data)) != 0)
+ return (ret);
+ memcpy(out_data->data, data->data, size);
+ }
+ memset((u_int8_t *)out_data->data + size, re_pad,
+ re_len - size);
+ out_data->size = re_len;
+ }
+
+ return (ret);
+}
+
+/*
+ * __dbc_put_secondaries --
+ * Insert the secondary keys, and validate the foreign key constraints.
+ *
+ * pkey/data: the key/data pair about to go into the primary.
+ * orig_data: the old primary record; valid iff DBC_PUT_HAVEREC is set.
+ * s_count: number of DBT slots in s_keys_buf.
+ * s_keys_buf: caller-allocated array, one DBT per secondary; filled in
+ * here with the computed secondary key(s) so the caller can later
+ * delete any stale old keys.
+ * put_statep: in/out DBC_PUT_* state flags.
+ */
+static inline int
+__dbc_put_secondaries(dbc,
+ pkey, data, orig_data, s_count, s_keys_buf, put_statep)
+ DBC *dbc;
+ DBT *pkey, *data, *orig_data, *s_keys_buf;
+ int s_count;
+ u_int32_t *put_statep;
+{
+ DB *dbp, *sdbp;
+ DBC *fdbc, *sdbc;
+ DBT fdata, oldpkey, *skeyp, temppkey, tempskey, *tskeyp;
+ ENV *env;
+ int cmp, ret, rmw, t_ret;
+ u_int32_t nskey;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ fdbc = sdbc = NULL;
+ sdbp = NULL;
+ ret = t_ret = 0;
+ rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;
+
+ /*
+ * Loop through the secondaries. (Step 3.)
+ *
+ * Note that __db_s_first and __db_s_next will take care of
+ * thread-locking and refcounting issues.
+ */
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = s_keys_buf;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), ++skeyp) {
+ DB_ASSERT(env, skeyp - s_keys_buf < s_count);
+ /*
+ * Don't process this secondary if the key is immutable and we
+ * know that the old record exists. This optimization can't be
+ * used if we have not checked for the old record yet.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) &&
+ !FLD_ISSET(*put_statep, DBC_PUT_NODEL) &&
+ FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ /*
+ * Call the callback for this secondary, to get the
+ * appropriate secondary key.
+ */
+ if ((ret = sdbp->s_callback(sdbp,
+ pkey, data, skeyp)) != 0) {
+ /* Not indexing is equivalent to an empty key set. */
+ if (ret == DB_DONOTINDEX) {
+ F_SET(skeyp, DB_DBT_MULTIPLE);
+ skeyp->size = 0;
+ ret = 0;
+ } else
+ goto err;
+ }
+
+ /* Open a cursor on the foreign database, if there is one. */
+ if (sdbp->s_foreign != NULL &&
+ (ret = __db_cursor_int(sdbp->s_foreign,
+ dbc->thread_info, dbc->txn, sdbp->s_foreign->type,
+ PGNO_INVALID, 0, dbc->locker, &fdbc)) != 0)
+ goto err;
+
+ /*
+ * Mark the secondary key DBT(s) as set -- that is, the
+ * callback returned at least one secondary key.
+ *
+ * Also, if this secondary index is associated with a foreign
+ * database, check that the foreign db contains the key(s) to
+ * maintain referential integrity. Set flags in fdata to avoid
+ * mem copying, we just need to know existence. We need to do
+ * this check before setting DB_DBT_ISSET, otherwise __dbc_get
+ * will overwrite the flag values.
+ */
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, skeyp);
+#endif
+ for (tskeyp = (DBT *)skeyp->data, nskey = skeyp->size;
+ nskey > 0; nskey--, tskeyp++) {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata,
+ DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(
+ fdbc, tskeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY) {
+ ret = DB_FOREIGN_CONFLICT;
+ break;
+ }
+ }
+ F_SET(tskeyp, DB_DBT_ISSET);
+ }
+ tskeyp = (DBT *)skeyp->data;
+ nskey = skeyp->size;
+ } else {
+ if (fdbc != NULL) {
+ memset(&fdata, 0, sizeof(DBT));
+ F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+ if ((ret = __dbc_get(fdbc, skeyp, &fdata,
+ DB_SET | rmw)) == DB_NOTFOUND ||
+ ret == DB_KEYEMPTY)
+ ret = DB_FOREIGN_CONFLICT;
+ }
+ F_SET(skeyp, DB_DBT_ISSET);
+ tskeyp = skeyp;
+ nskey = 1;
+ }
+ /* Close the foreign cursor, preserving the first error. */
+ if (fdbc != NULL && (t_ret = __dbc_close(fdbc)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ fdbc = NULL;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If we have the old record, we can generate and remove any
+ * old secondary key(s) now. We can also skip the secondary
+ * put if there is no change.
+ */
+ if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC)) {
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, pkey, orig_data)) == DB_KEYEXIST)
+ continue;
+ else if (ret != 0)
+ goto err;
+ }
+ if (nskey == 0)
+ continue;
+
+ /*
+ * Open a cursor in this secondary.
+ *
+ * Use the same locker ID as our primary cursor, so that
+ * we're guaranteed that the locks don't conflict (e.g. in CDB
+ * or if we're subdatabases that share and want to lock a
+ * metadata page).
+ */
+ if ((ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+ sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+
+ /*
+ * If we're in CDB, updates will fail since the new cursor
+ * isn't a writer. However, we hold the WRITE lock in the
+ * primary and will for as long as our new cursor lasts,
+ * and the primary and secondary share a lock file ID,
+ * so it's safe to consider this a WRITER. The close
+ * routine won't try to put anything because we don't
+ * really have a lock.
+ */
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+
+ /*
+ * Swap the primary key to the byte order of this secondary, if
+ * necessary. By doing this now, we can compare directly
+ * against the data already in the secondary without having to
+ * swap it after reading.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ for (; nskey > 0 && ret == 0; nskey--, tskeyp++) {
+ /* Skip this key if it is already in the database. */
+ if (!F_ISSET(tskeyp, DB_DBT_ISSET))
+ continue;
+
+ /*
+ * There are three cases here--
+ * 1) The secondary supports sorted duplicates.
+ * If we attempt to put a secondary/primary pair
+ * that already exists, that's a duplicate
+ * duplicate, and c_put will return DB_KEYEXIST
+ * (see __db_duperr). This will leave us with
+ * exactly one copy of the secondary/primary pair,
+ * and this is just right--we'll avoid deleting it
+ * later, as the old and new secondaries will
+ * match (since the old secondary is the dup dup
+ * that's already there).
+ * 2) The secondary supports duplicates, but they're not
+ * sorted. We need to avoid putting a duplicate
+ * duplicate, because the matching old and new
+ * secondaries will prevent us from deleting
+ * anything and we'll wind up with two secondary
+ * records that point to the same primary key. Do
+ * a c_get(DB_GET_BOTH); only do the put if the
+ * secondary doesn't exist.
+ * 3) The secondary doesn't support duplicates at all.
+ * In this case, secondary keys must be unique;
+ * if another primary key already exists for this
+ * secondary key, we have to either overwrite it
+ * or not put this one, and in either case we've
+ * corrupted the secondary index. Do a
+ * c_get(DB_SET). If the secondary/primary pair
+ * already exists, do nothing; if the secondary
+ * exists with a different primary, return an
+ * error; and if the secondary does not exist,
+ * put it.
+ */
+ if (!F_ISSET(sdbp, DB_AM_DUP)) {
+ /* Case 3. */
+ memset(&oldpkey, 0, sizeof(DBT));
+ F_SET(&oldpkey, DB_DBT_MALLOC);
+ ret = __dbc_get(sdbc,
+ tskeyp, &oldpkey, rmw | DB_SET);
+ if (ret == 0) {
+ cmp = __bam_defcmp(sdbp,
+ &oldpkey, pkey);
+ __os_ufree(env, oldpkey.data);
+ /*
+ * If the secondary key is unchanged,
+ * skip the put and go on to the next
+ * one.
+ */
+ if (cmp == 0)
+ continue;
+
+ __db_errx(env, "%s%s",
+ "Put results in a non-unique secondary key in an ",
+ "index not configured to support duplicates");
+ ret = EINVAL;
+ }
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) {
+ /* Case 2. */
+ DB_INIT_DBT(tempskey,
+ tskeyp->data, tskeyp->size);
+ DB_INIT_DBT(temppkey,
+ pkey->data, pkey->size);
+ ret = __dbc_get(sdbc, &tempskey, &temppkey,
+ rmw | DB_GET_BOTH);
+ if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+ break;
+ }
+
+ ret = __dbc_put(sdbc, tskeyp, pkey,
+ DB_UPDATE_SECONDARY);
+
+ /*
+ * We don't know yet whether this was a put-overwrite
+ * that in fact changed nothing. If it was, we may get
+ * DB_KEYEXIST. This is not an error.
+ */
+ if (ret == DB_KEYEXIST)
+ ret = 0;
+ }
+
+ /* Make sure the primary key is back in native byte-order. */
+ SWAP_IF_NEEDED(sdbp, pkey);
+
+ if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Mark that we have a key for this secondary so we can check
+ * it later before deleting the old one. We can't set it
+ * earlier or it would be cleared in the calls above.
+ */
+ F_SET(skeyp, DB_DBT_ISSET);
+ }
+err: if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+ COMPQUIET(s_count, 0);
+ return (ret);
+}
+
+/*
+ * __dbc_put_primary --
+ * Do the secondary-index and foreign-constraint work for a put into a
+ * primary database, following the five-step protocol described below.
+ * For DB_CURRENT the old record is resolved here; for DB_APPEND the
+ * primary insert itself is performed here as well.
+ */
+static int
+__dbc_put_primary(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp, *sdbp;
+ DBC *dbc_n, *pdbc;
+ DBT oldkey, olddata, newdata;
+ DBT *all_skeys, *skeyp, *tskeyp;
+ ENV *env;
+ int ret, t_ret, s_count;
+ u_int32_t nskey, put_state, rmw;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ ret = t_ret = s_count = 0;
+ put_state = 0;
+ sdbp = NULL;
+ pdbc = dbc_n = NULL;
+ all_skeys = NULL;
+ memset(&newdata, 0, sizeof(DBT));
+ memset(&olddata, 0, sizeof(DBT));
+
+ /*
+ * We do multiple cursor operations in some cases and subsequently
+ * access the data DBT information. Set DB_DBT_MALLOC so we don't risk
+ * modification of the data between our uses of it.
+ */
+ F_SET(&olddata, DB_DBT_MALLOC);
+
+ /*
+ * We have at least one secondary which we may need to update.
+ *
+ * There is a rather vile locking issue here. Secondary gets
+ * will always involve acquiring a read lock in the secondary,
+ * then acquiring a read lock in the primary. Ideally, we
+ * would likewise perform puts by updating all the secondaries
+ * first, then doing the actual put in the primary, to avoid
+ * deadlock (since having multiple threads doing secondary
+ * gets and puts simultaneously is probably a common case).
+ *
+ * However, if this put is a put-overwrite--and we have no way to
+ * tell in advance whether it will be--we may need to delete
+ * an outdated secondary key. In order to find that old
+ * secondary key, we need to get the record we're overwriting,
+ * before we overwrite it.
+ *
+ * (XXX: It would be nice to avoid this extra get, and have the
+ * underlying put routines somehow pass us the old record
+ * since they need to traverse the tree anyway. I'm saving
+ * this optimization for later, as it's a lot of work, and it
+ * would be hard to fit into this locking paradigm anyway.)
+ *
+ * The simple thing to do would be to go get the old record before
+ * we do anything else. Unfortunately, though, doing so would
+ * violate our "secondary, then primary" lock acquisition
+ * ordering--even in the common case where no old primary record
+ * exists, we'll still acquire and keep a lock on the page where
+ * we're about to do the primary insert.
+ *
+ * To get around this, we do the following gyrations, which
+ * hopefully solve this problem in the common case:
+ *
+ * 1) If this is a c_put(DB_CURRENT), go ahead and get the
+ * old record. We already hold the lock on this page in
+ * the primary, so no harm done, and we'll need the primary
+ * key (which we weren't passed in this case) to do any
+ * secondary puts anyway.
+ * If this is a put(DB_APPEND), then we need to insert the item,
+ * so that we can know the key value. So go ahead and insert. In
+ * the case of a put(DB_APPEND) without secondaries it is
+ * implemented in the __db_put method as an optimization.
+ *
+ * 2) If we're doing a partial put, we need to perform the
+ * get on the primary key right away, since we don't have
+ * the whole datum that the secondary key is based on.
+ * We may also need to pad out the record if the primary
+ * has a fixed record length.
+ *
+ * 3) Loop through the secondary indices, putting into each a
+ * new secondary key that corresponds to the new record.
+ *
+ * 4) If we haven't done so in (1) or (2), get the old primary
+ * key/data pair. If one does not exist--the common case--we're
+ * done with secondary indices, and can go straight on to the
+ * primary put.
+ *
+ * 5) If we do have an old primary key/data pair, however, we need
+ * to loop through all the secondaries a second time and delete
+ * the old secondary in each.
+ */
+ s_count = __db_s_count(dbp);
+ if ((ret = __os_calloc(env,
+ (u_int)s_count, sizeof(DBT), &all_skeys)) != 0)
+ goto err;
+
+ /*
+ * Primary indices can't have duplicates, so only DB_APPEND,
+ * DB_CURRENT, DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags
+ * should have been caught by the checking routine, but
+ * add a sprinkling of paranoia.
+ */
+ DB_ASSERT(env, flags == DB_APPEND || flags == DB_CURRENT ||
+ flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP);
+
+ /*
+ * We'll want to use DB_RMW in a few places, but it's only legal
+ * when locking is on.
+ */
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+ if (rmw)
+ FLD_SET(put_state, DBC_PUT_RMW);
+
+ /* Resolve the primary key if required (Step 1). */
+ if (flags == DB_CURRENT) {
+ if ((ret = __dbc_put_resolve_key(dbc,
+ &oldkey, &olddata, &put_state, flags)) != 0)
+ goto err;
+ /* From here on, use the key the cursor was positioned on. */
+ key = &oldkey;
+ } else if (flags == DB_APPEND) {
+ if ((ret = __dbc_put_append(dbc,
+ key, data, &put_state, flags)) != 0)
+ goto err;
+ }
+
+ /*
+ * PUT_NOOVERWRITE with secondaries is a troublesome case. We need
+ * to check that the insert will work prior to making any changes
+ * to secondaries. Try to work within the locking constraints outlined
+ * above.
+ *
+ * This is DB->put (DB_NOOVERWRITE). DBC->put(DB_NODUPDATA) is not
+ * relevant since it is only valid on DBs that support duplicates,
+ * which primaries with secondaries can't have.
+ */
+ if (flags == DB_NOOVERWRITE) {
+ /* Don't bother retrieving the data. */
+ F_SET(key, DB_DBT_ISSET);
+ olddata.dlen = 0;
+ olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+ if (__dbc_get(dbc, key, &olddata, DB_SET) != DB_NOTFOUND) {
+ ret = DB_KEYEXIST;
+ goto done;
+ }
+ }
+
+ /*
+ * Check for partial puts using DB_DBT_PARTIAL (Step 2).
+ */
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __dbc_put_partial(dbc,
+ key, data, &olddata, &newdata, &put_state, flags)) != 0)
+ goto err;
+ } else {
+ /* Not a partial put: use the caller's datum as-is. */
+ newdata = *data;
+ }
+
+ /*
+ * Check for partial puts, with fixed length record databases (Step 2).
+ */
+ if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) ||
+ (dbp->type == DB_QUEUE)) {
+ if ((ret = __dbc_put_fixed_len(dbc, data, &newdata)) != 0)
+ goto err;
+ }
+
+ /* Validate any foreign databases, and update secondaries. (Step 3). */
+ if ((ret = __dbc_put_secondaries(dbc, key, &newdata,
+ &olddata, s_count, all_skeys, &put_state))
+ != 0)
+ goto err;
+ /*
+ * If we've already got the old primary key/data pair, the secondary
+ * updates are already done.
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_HAVEREC))
+ goto done;
+
+ /*
+ * If still necessary, go get the old primary key/data. (Step 4.)
+ *
+ * See the comments in step 2. This is real familiar.
+ */
+ if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
+ goto err;
+ DB_ASSERT(env, flags != DB_CURRENT);
+ F_SET(key, DB_DBT_ISSET);
+ ret = __dbc_get(pdbc, key, &olddata, rmw | DB_SET);
+ if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
+ FLD_SET(put_state, DBC_PUT_NODEL);
+ ret = 0;
+ }
+ if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /*
+ * Check whether we do in fact have an old record we may need to
+ * delete. (Step 5).
+ */
+ if (FLD_ISSET(put_state, DBC_PUT_NODEL))
+ goto done;
+
+ for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys;
+ sdbp != NULL && ret == 0;
+ ret = __db_s_next(&sdbp, dbc->txn), skeyp++) {
+ DB_ASSERT(env, skeyp - all_skeys < s_count);
+ /*
+ * Don't process this secondary if the key is immutable. We
+ * know that the old record exists, so this optimization can
+ * always be used.
+ */
+ if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
+ continue;
+
+ if ((ret = __dbc_del_oldskey(sdbp, dbc,
+ skeyp, key, &olddata)) != 0 && ret != DB_KEYEXIST)
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+done:
+err:
+ /* Cleanup and cursor resolution (ret may be DB_KEYEXIST here). */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* If newdata or olddata were used, free their buffers. */
+ if (newdata.data != NULL && newdata.data != data->data)
+ __os_free(env, newdata.data);
+ if (olddata.data != NULL)
+ __os_ufree(env, olddata.data);
+
+ CDB_LOCKING_DONE(env, dbc);
+
+ if (sdbp != NULL &&
+ (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free all callback-allocated secondary keys. */
+ for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) {
+ if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
+ for (nskey = skeyp->size, tskeyp = (DBT *)skeyp->data;
+ nskey > 0;
+ nskey--, tskeyp++)
+ FREE_IF_NEEDED(env, tskeyp);
+ }
+ FREE_IF_NEEDED(env, skeyp);
+ }
+ if (all_skeys != NULL)
+ __os_free(env, all_skeys);
+ return (ret);
+}
+
+/*
+ * __dbc_put --
+ * Put using a cursor.
+ *
+ * Normalizes the flags, handles CDB locking, routes primaries with
+ * secondary indices through __dbc_put_primary, then performs the
+ * actual put via __dbc_iput (or the compression path).
+ *
+ * PUBLIC: int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+ ret = 0;
+
+ /*
+ * Putting to secondary indices is forbidden; when we need to
+ * internally update one, we're called with a private flag,
+ * DB_UPDATE_SECONDARY, which does the right thing but won't return an
+ * error during flag checking.
+ *
+ * As a convenience, many places that want the default DB_KEYLAST
+ * behavior call DBC->put with flags == 0. Protect lower-level code
+ * here by translating that.
+ *
+ * Lastly, the DB_OVERWRITE_DUP flag is equivalent to DB_KEYLAST unless
+ * there are sorted duplicates. Limit the number of places that need
+ * to test for it explicitly.
+ */
+ if (flags == DB_UPDATE_SECONDARY || flags == 0 ||
+ (flags == DB_OVERWRITE_DUP && !F_ISSET(dbp, DB_AM_DUPSORT)))
+ flags = DB_KEYLAST;
+
+ CDB_LOCKING_INIT(dbc->env, dbc);
+
+ /*
+ * Check to see if we are a primary and have secondary indices.
+ * If we are not, we save ourselves a good bit of trouble and
+ * just skip to the "normal" put. On success (ret == 0) we fall
+ * through to do the primary put itself below.
+ */
+ if (DB_IS_PRIMARY(dbp) &&
+ ((ret = __dbc_put_primary(dbc, key, data, flags)) != 0))
+ return (ret);
+
+ /*
+ * If this is an append operation, the insert was done prior to the
+ * secondary updates, so we are finished.
+ */
+ if (flags == DB_APPEND)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ return (__bamc_compress_put(dbc, key, data, flags));
+#endif
+
+ return (__dbc_iput(dbc, key, data, flags));
+}
+
+/*
+ * __dbc_iput --
+ * Implementation of put using a cursor.
+ *
+ * Duplicates the cursor, dispatches to the access method's am_put,
+ * and handles creation of / descent into off-page duplicate trees
+ * (the access method signals a new one via the pgno out-parameter).
+ *
+ * PUBLIC: int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n, *oldopd, *opd;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ /*
+ * Cursor Cleanup Note:
+ * All of the cursors passed to the underlying access methods by this
+ * routine are duplicated cursors. On return, any referenced pages
+ * will be discarded, and, if the cursor is not intended to be used
+ * again, the close function will be called. So, pages/locks that
+ * the cursor references do not need to be resolved by the underlying
+ * functions.
+ */
+ dbc_n = NULL;
+ ret = t_ret = 0;
+
+ /*
+ * If we have an off-page duplicates cursor, and the operation applies
+ * to it, perform the operation. Duplicate the cursor and call the
+ * underlying function.
+ *
+ * Off-page duplicate trees are locked in the primary tree, that is,
+ * we acquire a write lock in the primary tree and no locks in the
+ * off-page dup tree. If the put operation is done in an off-page
+ * duplicate tree, call the primary cursor's upgrade routine first.
+ */
+ if (dbc->internal->opd != NULL &&
+ (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) {
+ /*
+ * A special case for hash off-page duplicates. Hash doesn't
+ * support (and is documented not to support) put operations
+ * relative to a cursor which references an already deleted
+ * item. For consistency, apply the same criteria to off-page
+ * duplicates as well.
+ */
+ if (dbc->dbtype == DB_HASH && F_ISSET(
+ ((BTREE_CURSOR *)(dbc->internal->opd->internal)),
+ C_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Upgrade the primary's lock before touching the dup tree. */
+ if ((ret = dbc->am_writelock(dbc)) != 0 ||
+ (ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+ opd = dbc_n->internal->opd;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * Perform an operation on the main cursor. Duplicate the cursor,
+ * and call the underlying function.
+ */
+ if (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)
+ tmp_flags = DB_POSITION;
+ else
+ tmp_flags = 0;
+
+ /*
+ * If this cursor is going to be closed immediately, we don't
+ * need to take precautions to clean it up on error.
+ */
+ if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
+ dbc_n = dbc;
+ else if ((ret = __dbc_idup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ /* am_put sets pgno when the put created an off-page dup tree. */
+ pgno = PGNO_INVALID;
+ if ((ret = dbc_n->am_put(dbc_n, key, data, flags, &pgno)) != 0)
+ goto err;
+
+ /*
+ * We may be referencing a new off-page duplicates tree. Acquire
+ * a new cursor and call the underlying function.
+ */
+ if (pgno != PGNO_INVALID) {
+ oldopd = dbc_n->internal->opd;
+ if ((ret = __dbc_newopd(dbc, pgno, oldopd, &opd)) != 0) {
+ dbc_n->internal->opd = opd;
+ goto err;
+ }
+
+ dbc_n->internal->opd = opd;
+ opd->internal->pdbc = dbc_n;
+
+ /* The key already exists; the dup put is a plain insert. */
+ if (flags == DB_NOOVERWRITE)
+ flags = DB_KEYLAST;
+ if ((ret = opd->am_put(
+ opd, key, data, flags, NULL)) != 0)
+ goto err;
+ }
+
+done:
+err: /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __dbc_del_oldskey --
+ * Delete an old secondary key, if necessary.
+ * Returns DB_KEYEXIST if the new and old keys match..
+ *
+ * sdbp: the secondary database being maintained.
+ * dbc: the primary cursor (supplies txn/locker for the internal cursor).
+ * skey: the new secondary key(s) computed for the record being put;
+ * DB_DBT_ISSET is cleared on any new key that matches an old one.
+ * pkey/olddata: the primary key and the old primary record.
+ */
+static int
+__dbc_del_oldskey(sdbp, dbc, skey, pkey, olddata)
+ DB *sdbp;
+ DBC *dbc;
+ DBT *skey, *pkey, *olddata;
+{
+ DB *dbp;
+ DBC *sdbc;
+ DBT *toldskeyp, *tskeyp;
+ DBT oldskey, temppkey, tempskey;
+ ENV *env;
+ int ret, t_ret;
+ u_int32_t i, noldskey, nsame, nskey, rmw;
+
+ sdbc = NULL;
+ dbp = sdbp->s_primary;
+ env = dbp->env;
+ nsame = 0;
+ rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+ /*
+ * Get the old secondary key.
+ */
+ memset(&oldskey, 0, sizeof(DBT));
+ if ((ret = sdbp->s_callback(sdbp, pkey, olddata, &oldskey)) != 0) {
+ if (ret == DB_DONOTINDEX ||
+ (F_ISSET(&oldskey, DB_DBT_MULTIPLE) && oldskey.size == 0))
+ /* There's no old key to delete. */
+ ret = 0;
+ return (ret);
+ }
+
+ /* Normalize single-key and multi-key callback results. */
+ if (F_ISSET(&oldskey, DB_DBT_MULTIPLE)) {
+#ifdef DIAGNOSTIC
+ __db_check_skeyset(sdbp, &oldskey);
+#endif
+ toldskeyp = (DBT *)oldskey.data;
+ noldskey = oldskey.size;
+ } else {
+ toldskeyp = &oldskey;
+ noldskey = 1;
+ }
+
+ if (F_ISSET(skey, DB_DBT_MULTIPLE)) {
+ nskey = skey->size;
+ skey = (DBT *)skey->data;
+ } else
+ nskey = F_ISSET(skey, DB_DBT_ISSET) ? 1 : 0;
+
+ for (; noldskey > 0 && ret == 0; noldskey--, toldskeyp++) {
+ /*
+ * Check whether this old secondary key is also a new key
+ * before we delete it. Note that bt_compare is (and must be)
+ * set no matter what access method we're in.
+ */
+ for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++)
+ if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+ toldskeyp, tskeyp) == 0) {
+ nsame++;
+ F_CLR(tskeyp, DB_DBT_ISSET);
+ break;
+ }
+
+ /* Old key survives as a new key: keep it, free our copy. */
+ if (i < nskey) {
+ FREE_IF_NEEDED(env, toldskeyp);
+ continue;
+ }
+
+ /* Lazily open the secondary cursor on first real delete. */
+ if (sdbc == NULL) {
+ if ((ret = __db_cursor_int(sdbp,
+ dbc->thread_info, dbc->txn, sdbp->type,
+ PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+ goto err;
+ if (CDB_LOCKING(env)) {
+ DB_ASSERT(env,
+ sdbc->mylock.off == LOCK_INVALID);
+ F_SET(sdbc, DBC_WRITER);
+ }
+ }
+
+ /*
+ * Don't let c_get(DB_GET_BOTH) stomp on our data. Use
+ * temporary DBTs instead.
+ */
+ SWAP_IF_NEEDED(sdbp, pkey);
+ DB_INIT_DBT(temppkey, pkey->data, pkey->size);
+ DB_INIT_DBT(tempskey, toldskeyp->data, toldskeyp->size);
+ if ((ret = __dbc_get(sdbc,
+ &tempskey, &temppkey, rmw | DB_GET_BOTH)) == 0)
+ ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+ else if (ret == DB_NOTFOUND)
+ ret = __db_secondary_corrupt(dbp);
+ SWAP_IF_NEEDED(sdbp, pkey);
+ FREE_IF_NEEDED(env, toldskeyp);
+ }
+
+err: for (; noldskey > 0; noldskey--, toldskeyp++)
+ FREE_IF_NEEDED(env, toldskeyp);
+ FREE_IF_NEEDED(env, &oldskey);
+ if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ /* All new keys already present: tell the caller to skip the put. */
+ if (ret == 0 && nsame == nskey)
+ return (DB_KEYEXIST);
+ return (ret);
+}
+
+/*
+ * __db_duperr()
+ *	Error message: we don't currently support sorted duplicate duplicates.
+ *
+ * PUBLIC: int __db_duperr __P((DB *, u_int32_t));
+ */
+int
+__db_duperr(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	/*
+	 * Stay quiet while updating a secondary index: there is no clean
+	 * way to pass DB_NODUPDATA down along with DB_UPDATE_SECONDARY, and
+	 * hitting a duplicate duplicate there is a normal, non-error event.
+	 *
+	 * !!!
+	 * If duplicate duplicates are ever permitted in sorted-dup
+	 * databases, either teach the secondary index code to check for
+	 * dup dups, or preserve the implicit "DB_NODUPDATA" behavior for
+	 * databases with DB_AM_SECONDARY set.
+	 */
+	if (!F_ISSET(dbp, DB_AM_SECONDARY) && flags != DB_NODUPDATA)
+		__db_errx(dbp->env,
+		    "Duplicate data items are not supported with sorted data");
+	return (DB_KEYEXIST);
+}
+
+/*
+ * __dbc_cleanup --
+ *	Clean up duplicate cursors.
+ *
+ *	dbc is the caller's original cursor, dbc_n the duplicate the
+ *	operation actually ran on (possibly NULL, or the same handle).
+ *	If "failed" is zero and no error occurs here, the two cursors
+ *	swap internals so dbc adopts the new position; dbc_n is closed
+ *	either way.
+ *
+ * PUBLIC: int __dbc_cleanup __P((DBC *, DBC *, int));
+ */
+int
+__dbc_cleanup(dbc, dbc_n, failed)
+	DBC *dbc, *dbc_n;
+	int failed;
+{
+	DB *dbp;
+	DBC *opd;
+	DBC_INTERNAL *internal;
+	DB_MPOOLFILE *mpf;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	internal = dbc->internal;
+	ret = 0;
+
+	/* Discard any pages we're holding. */
+	if (internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		internal->page = NULL;
+	}
+	/* Ditto for the off-page duplicate cursor, if any. */
+	opd = internal->opd;
+	if (opd != NULL && opd->internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    opd->internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		opd->internal->page = NULL;
+	}
+
+	/*
+	 * If dbc_n is NULL, there's no internal cursor swapping to be done
+	 * and no dbc_n to close--we probably did the entire operation on an
+	 * offpage duplicate cursor.  Just return.
+	 *
+	 * If dbc and dbc_n are the same, we're either inside a DB->{put/get}
+	 * operation, and as an optimization we performed the operation on
+	 * the main cursor rather than on a duplicated one, or we're in a
+	 * bulk get that can't have moved the cursor (DB_MULTIPLE with the
+	 * initial c_get operation on an off-page dup cursor).  Just
+	 * return--either we know we didn't move the cursor, or we're going
+	 * to close it before we return to application code, so we're sure
+	 * not to visibly violate the "cursor stays put on error" rule.
+	 */
+	if (dbc_n == NULL || dbc == dbc_n)
+		return (ret);
+
+	/* Release the duplicate cursor's pages as well. */
+	if (dbc_n->internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    dbc_n->internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		dbc_n->internal->page = NULL;
+	}
+	opd = dbc_n->internal->opd;
+	if (opd != NULL && opd->internal->page != NULL) {
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+		    opd->internal->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		opd->internal->page = NULL;
+	}
+
+	/*
+	 * If we didn't fail before entering this routine or just now when
+	 * freeing pages, swap the interesting contents of the old and new
+	 * cursors.
+	 */
+	if (!failed && ret == 0) {
+		if (opd != NULL)
+			opd->internal->pdbc = dbc;
+		if (internal->opd != NULL)
+			internal->opd->internal->pdbc = dbc_n;
+		dbc->internal = dbc_n->internal;
+		dbc_n->internal = internal;
+	}
+
+	/*
+	 * Close the cursor we don't care about anymore.  The close can fail,
+	 * but we only expect DB_LOCK_DEADLOCK failures.  This violates our
+	 * "the cursor is unchanged on error" semantics, but since all you can
+	 * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe
+	 * that's OK.
+	 *
+	 * XXX
+	 * There's no way to recover from failure to close the old cursor.
+	 * All we can do is move to the new position and return an error.
+	 *
+	 * XXX
+	 * We might want to consider adding a flag to the cursor, so that any
+	 * subsequent operations other than close just return an error?
+	 */
+	if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * If this was an update that is supporting dirty reads
+	 * then we may have just swapped our read for a write lock
+	 * which is held by the surviving cursor.  We need
+	 * to explicitly downgrade this lock.  The closed cursor
+	 * may only have had a read lock.
+	 */
+	if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
+	    dbc->internal->lock_mode == DB_LOCK_WRITE) {
+		if ((t_ret =
+		    __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0)
+			ret = t_ret;
+		if (t_ret == 0)
+			dbc->internal->lock_mode = DB_LOCK_WWRITE;
+		if (dbc->internal->page != NULL && (t_ret =
+		    __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
+		    ret == 0)
+			ret = t_ret;
+	}
+
+	return (ret);
+}
+
+/*
+ * __dbc_secondary_get_pp --
+ *	The DBC->get() entry point for a secondary index cursor: a thin
+ * wrapper that forwards to DBC->pget() with no primary-key DBT.
+ *
+ * PUBLIC: int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_secondary_get_pp(dbc, skey, data, flags)
+	DBC *dbc;
+	DBT *skey, *data;
+	u_int32_t flags;
+{
+	DB *sdbp;
+
+	/* Only cursors on secondary indices may come through here. */
+	sdbp = dbc->dbp;
+	DB_ASSERT(dbc->env, F_ISSET(sdbp, DB_AM_SECONDARY));
+	return (__dbc_pget_pp(dbc, skey, NULL, data, flags));
+}
+
+/*
+ * __dbc_pget --
+ *	Get a primary key/data pair through a secondary index.
+ *
+ *	On success, skey/pkey/data hold the secondary key, the primary key
+ *	and the primary datum, respectively (pkey may be NULL when wrapped
+ *	by the 2-DBT secondary get path).
+ *
+ * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget(dbc, skey, pkey, data, flags)
+	DBC *dbc;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DB *pdbp, *sdbp;
+	DBC *dbc_n, *pdbc;
+	DBT nullpkey;
+	u_int32_t save_pkey_flags, tmp_flags, tmp_read_locking, tmp_rmw;
+	int pkeymalloc, ret, t_ret;
+
+	sdbp = dbc->dbp;
+	pdbp = sdbp->s_primary;
+	dbc_n = NULL;
+	pkeymalloc = t_ret = 0;
+
+	/*
+	 * The challenging part of this function is getting the behavior
+	 * right for all the various permutations of DBT flags.  The
+	 * next several blocks handle the various cases we need to
+	 * deal with specially.
+	 */
+
+	/*
+	 * We may be called with a NULL pkey argument, if we've been
+	 * wrapped by a 2-DBT get call.  If so, we need to use our
+	 * own DBT.
+	 */
+	if (pkey == NULL) {
+		memset(&nullpkey, 0, sizeof(DBT));
+		pkey = &nullpkey;
+	}
+
+	/* Clear OR'd in additional bits so we can check for flag equality. */
+	tmp_rmw = LF_ISSET(DB_RMW);
+	LF_CLR(DB_RMW);
+
+	SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
+	/*
+	 * DB_GET_RECNO is a special case, because we're interested not in
+	 * the primary key/data pair, but rather in the primary's record
+	 * number.
+	 */
+	if (flags == DB_GET_RECNO) {
+		if (tmp_rmw)
+			F_SET(dbc, DBC_RMW);
+		F_SET(dbc, tmp_read_locking);
+		ret = __dbc_pget_recno(dbc, pkey, data, flags);
+		if (tmp_rmw)
+			F_CLR(dbc, DBC_RMW);
+		/* Clear the temp flags, but leave WAS_READ_COMMITTED. */
+		F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
+		return (ret);
+	}
+
+	/*
+	 * If the DBTs we've been passed don't have any of the
+	 * user-specified memory management flags set, we want to make sure
+	 * we return values using the DBTs dbc->rskey, dbc->rkey, and
+	 * dbc->rdata, respectively.
+	 *
+	 * There are two tricky aspects to this: first, we need to pass
+	 * skey and pkey *in* to the initial c_get on the secondary key,
+	 * since either or both may be looked at by it (depending on the
+	 * get flag).  Second, we must not use a normal DB->get call
+	 * on the secondary, even though that's what we want to accomplish,
+	 * because the DB handle may be free-threaded.  Instead,
+	 * we open a cursor, then take steps to ensure that we actually use
+	 * the rkey/rdata from the *secondary* cursor.
+	 *
+	 * We accomplish all this by passing in the DBTs we started out
+	 * with to the c_get, but swapping the contents of rskey and rkey,
+	 * respectively, into rkey and rdata; __db_ret will treat them like
+	 * the normal key/data pair in a c_get call, and will realloc them as
+	 * need be (this is "step 1").  Then, for "step 2", we swap back
+	 * rskey/rkey/rdata to normal, and do a get on the primary with the
+	 * secondary dbc appointed as the owner of the returned-data memory.
+	 *
+	 * Note that in step 2, we copy the flags field in case we need to
+	 * pass down a DB_DBT_PARTIAL or other flag that is compatible with
+	 * letting DB do the memory management.
+	 */
+
+	/*
+	 * It is correct, though slightly sick, to attempt a partial get of a
+	 * primary key.  However, if we do so here, we'll never find the
+	 * primary record; clear the DB_DBT_PARTIAL field of pkey just for the
+	 * duration of the next call.
+	 */
+	save_pkey_flags = pkey->flags;
+	F_CLR(pkey, DB_DBT_PARTIAL);
+
+	/*
+	 * Now we can go ahead with the meat of this call.  First, get the
+	 * primary key from the secondary index.  (What exactly we get depends
+	 * on the flags, but the underlying cursor get will take care of the
+	 * dirty work.)  Duplicate the cursor, in case the later get on the
+	 * primary fails.
+	 */
+	switch (flags) {
+	case DB_CURRENT:
+	case DB_GET_BOTHC:
+	case DB_NEXT:
+	case DB_NEXT_DUP:
+	case DB_NEXT_NODUP:
+	case DB_PREV:
+	case DB_PREV_DUP:
+	case DB_PREV_NODUP:
+		tmp_flags = DB_POSITION;
+		break;
+	default:
+		tmp_flags = 0;
+		break;
+	}
+
+	if (F_ISSET(dbc, DBC_PARTITIONED | DBC_TRANSIENT))
+		dbc_n = dbc;
+	else if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+		return (ret);
+
+	F_SET(dbc_n, DBC_TRANSIENT);
+
+	if (tmp_rmw)
+		F_SET(dbc_n, DBC_RMW);
+	F_SET(dbc_n, tmp_read_locking);
+
+	/*
+	 * If we've been handed a primary key, it will be in native byte order,
+	 * so we need to swap it before reading from the secondary.
+	 */
+	if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+	    flags == DB_GET_BOTH_RANGE)
+		SWAP_IF_NEEDED(sdbp, pkey);
+
+retry:	/* Step 1. */
+	dbc_n->rdata = dbc->rkey;
+	dbc_n->rkey = dbc->rskey;
+	ret = __dbc_get(dbc_n, skey, pkey, flags);
+	/* Restore pkey's flags in case we stomped the PARTIAL flag. */
+	pkey->flags = save_pkey_flags;
+
+	/*
+	 * We need to swap the primary key to native byte order if we read it
+	 * successfully, or if we swapped it on entry above.  We can't return
+	 * with the application's data modified.
+	 */
+	if (ret == 0 || flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
+	    flags == DB_GET_BOTH_RANGE)
+		SWAP_IF_NEEDED(sdbp, pkey);
+
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * Now we're ready for "step 2".  If either or both of pkey and data do
+	 * not have memory management flags set--that is, if DB is managing
+	 * their memory--we need to swap around the rkey/rdata structures so
+	 * that we don't wind up trying to use memory managed by the primary
+	 * database cursor, which we'll close before we return.
+	 *
+	 * !!!
+	 * If you're carefully following the bouncing ball, you'll note that in
+	 * the DB-managed case, the buffer hanging off of pkey is the same as
+	 * dbc->rkey->data.  This is just fine; we may well realloc and stomp
+	 * on it when we return, if we're doing a DB_GET_BOTH and need to
+	 * return a different partial or key (depending on the comparison
+	 * function), but this is safe.
+	 *
+	 * !!!
+	 * We need to use __db_cursor_int here rather than simply calling
+	 * pdbp->cursor, because otherwise, if we're in CDB, we'll allocate a
+	 * new locker ID and leave ourselves open to deadlocks.  (Even though
+	 * we're only acquiring read locks, we'll still block if there are any
+	 * waiters.)
+	 */
+	if ((ret = __db_cursor_int(pdbp, dbc->thread_info,
+	    dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+		goto err;
+
+	F_SET(pdbc, tmp_read_locking |
+	    F_ISSET(dbc, DBC_READ_UNCOMMITTED | DBC_READ_COMMITTED | DBC_RMW));
+
+	/*
+	 * We're about to use pkey a second time.  If DB_DBT_MALLOC is set on
+	 * it, we'll leak the memory we allocated the first time.  Thus, set
+	 * DB_DBT_REALLOC instead so that we reuse that memory instead of
+	 * leaking it.
+	 *
+	 * Alternatively, if the application is handling copying for pkey, we
+	 * need to take a copy now.  The copy will be freed on exit from
+	 * __dbc_pget_pp (and we must be coming through there if DB_DBT_USERCOPY
+	 * is set).  In the case of DB_GET_BOTH_RANGE, the pkey supplied by
+	 * the application has already been copied in but the value may have
+	 * changed in the search.  In that case, free the original copy and get
+	 * a new one.
+	 *
+	 * !!!
+	 * This assumes that the user must always specify a compatible realloc
+	 * function if a malloc function is specified.  I think this is a
+	 * reasonable requirement.
+	 */
+	if (F_ISSET(pkey, DB_DBT_MALLOC)) {
+		F_CLR(pkey, DB_DBT_MALLOC);
+		F_SET(pkey, DB_DBT_REALLOC);
+		pkeymalloc = 1;
+	} else if (F_ISSET(pkey, DB_DBT_USERCOPY)) {
+		if (flags == DB_GET_BOTH_RANGE)
+			__dbt_userfree(sdbp->env, NULL, pkey, NULL);
+		if ((ret = __dbt_usercopy(sdbp->env, pkey)) != 0)
+			goto err;
+	}
+
+	/*
+	 * Do the actual get.  Set DBC_TRANSIENT since we don't care about
+	 * preserving the position on error, and it's faster.  SET_RET_MEM so
+	 * that the secondary DBC owns any returned-data memory.
+	 */
+	F_SET(pdbc, DBC_TRANSIENT);
+	SET_RET_MEM(pdbc, dbc);
+	ret = __dbc_get(pdbc, pkey, data, DB_SET);
+
+	/*
+	 * If the item wasn't found in the primary, this is a bug; our
+	 * secondary has somehow gotten corrupted, and contains elements that
+	 * don't correspond to anything in the primary.  Complain.
+	 */
+
+	/* Now close the primary cursor. */
+	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* NOTE(review): this "else" pairs with the close-failure "if" above. */
+	else if (ret == DB_NOTFOUND) {
+		if (!F_ISSET(pdbc, DBC_READ_UNCOMMITTED))
+			ret = __db_secondary_corrupt(pdbp);
+		else switch (flags) {
+		case DB_GET_BOTHC:
+		case DB_NEXT:
+		case DB_NEXT_DUP:
+		case DB_NEXT_NODUP:
+		case DB_PREV:
+		case DB_PREV_DUP:
+		case DB_PREV_NODUP:
+			/* Dirty read raced an update; move on and retry. */
+			goto retry;
+		default:
+			break;
+		}
+	}
+
+err:	/* Cleanup and cursor resolution. */
+	if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pkeymalloc) {
+		/*
+		 * If pkey had a MALLOC flag, we need to restore it; otherwise,
+		 * if the user frees the buffer but reuses the DBT without
+		 * NULL'ing its data field or changing the flags, we may drop
+		 * core.
+		 */
+		F_CLR(pkey, DB_DBT_REALLOC);
+		F_SET(pkey, DB_DBT_MALLOC);
+	}
+
+	return (ret);
+}
+
+/*
+ * __dbc_pget_recno --
+ *	Perform a DB_GET_RECNO c_pget on a secondary index.  Returns
+ *	the secondary's record number in the pkey field and the primary's
+ *	in the data field.
+ *
+ *	When a handle is not a record-number btree, RECNO_OOB is returned
+ *	in the corresponding field instead.
+ */
+static int
+__dbc_pget_recno(sdbc, pkey, data, flags)
+	DBC *sdbc;
+	DBT *pkey, *data;
+	u_int32_t flags;
+{
+	DB *pdbp, *sdbp;
+	DBC *pdbc;
+	DBT discardme, primary_key;
+	ENV *env;
+	db_recno_t oob;
+	u_int32_t rmw;
+	int ret, t_ret;
+
+	sdbp = sdbc->dbp;
+	pdbp = sdbp->s_primary;
+	env = sdbp->env;
+	pdbc = NULL;
+	ret = t_ret = 0;
+
+	rmw = LF_ISSET(DB_RMW);
+
+	/* A throwaway DBT for returns whose contents we don't want. */
+	memset(&discardme, 0, sizeof(DBT));
+	F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+	oob = RECNO_OOB;
+
+	/*
+	 * If the primary is an rbtree, we want its record number, whether
+	 * or not the secondary is one too.  Fetch the recno into "data".
+	 *
+	 * If it's not an rbtree, return RECNO_OOB in "data".
+	 */
+	if (F_ISSET(pdbp, DB_AM_RECNUM)) {
+		/*
+		 * Get the primary key, so we can find the record number
+		 * in the primary. (We're uninterested in the secondary key.)
+		 */
+		memset(&primary_key, 0, sizeof(DBT));
+		F_SET(&primary_key, DB_DBT_MALLOC);
+		if ((ret = __dbc_get(sdbc,
+		    &discardme, &primary_key, rmw | DB_CURRENT)) != 0)
+			return (ret);
+
+		/*
+		 * Open a cursor on the primary, set it to the right record,
+		 * and fetch its recno into "data".
+		 *
+		 * (See __dbc_pget for comments on the use of __db_cursor_int.)
+		 *
+		 * SET_RET_MEM so that the secondary DBC owns any returned-data
+		 * memory.
+		 */
+		if ((ret = __db_cursor_int(pdbp, sdbc->thread_info, sdbc->txn,
+		    pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+			goto perr;
+		SET_RET_MEM(pdbc, sdbc);
+		if ((ret = __dbc_get(pdbc,
+		    &primary_key, &discardme, rmw | DB_SET)) != 0)
+			goto perr;
+
+		ret = __dbc_get(pdbc, &discardme, data, rmw | DB_GET_RECNO);
+
+		/* primary_key was malloc'd by the DB_CURRENT get above. */
+perr:		__os_ufree(env, primary_key.data);
+		if (pdbc != NULL &&
+		    (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			return (ret);
+	} else if ((ret = __db_retcopy(env, data, &oob,
+	    sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0)
+		return (ret);
+
+	/*
+	 * If the secondary is an rbtree, we want its record number, whether
+	 * or not the primary is one too.  Fetch the recno into "pkey".
+	 *
+	 * If it's not an rbtree, return RECNO_OOB in "pkey".
+	 */
+	if (F_ISSET(sdbp, DB_AM_RECNUM))
+		return (__dbc_get(sdbc, &discardme, pkey, flags));
+	else
+		return (__db_retcopy(env, pkey, &oob,
+		    sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen));
+}
+
+/*
+ * __db_wrlock_err -- do not have a write lock.
+ *	Report an attempted write through a read-only cursor; always
+ *	returns EPERM.
+ */
+static int
+__db_wrlock_err(env)
+	ENV *env;
+{
+	__db_errx(env, "Write attempted on read-only cursor");
+	return (EPERM);
+}
+
+/*
+ * __dbc_del_secondary --
+ *	Perform a delete operation on a secondary index: call through
+ *	to the primary and delete the primary record that this record
+ *	points to.
+ *
+ *	Note that deleting the primary record will call c_del on all
+ *	the secondaries, including this one; thus, it is not necessary
+ *	to execute both this function and an actual delete.
+ */
+static int
+__dbc_del_secondary(dbc)
+	DBC *dbc;
+{
+	DB *pdbp;
+	DBC *pdbc;
+	DBT skey, pkey;
+	ENV *env;
+	int ret, t_ret;
+	u_int32_t rmw;
+
+	pdbp = dbc->dbp->s_primary;
+	env = pdbp->env;
+	rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+	/*
+	 * Get the current item that we're pointing at.
+	 * We don't actually care about the secondary key, just
+	 * the primary.
+	 */
+	memset(&skey, 0, sizeof(DBT));
+	memset(&pkey, 0, sizeof(DBT));
+	F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+	if ((ret = __dbc_get(dbc, &skey, &pkey, DB_CURRENT)) != 0)
+		return (ret);
+
+	/* The stored primary key may be byte-swapped; fix it up. */
+	SWAP_IF_NEEDED(dbc->dbp, &pkey);
+
+	/*
+	 * Create a cursor on the primary with our locker ID,
+	 * so that when it calls back, we don't conflict.
+	 *
+	 * We create a cursor explicitly because there's no
+	 * way to specify the same locker ID if we're using
+	 * locking but not transactions if we use the DB->del
+	 * interface.  This shouldn't be any less efficient
+	 * anyway.
+	 */
+	if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn,
+	    pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
+		return (ret);
+
+	/*
+	 * See comment in __dbc_put--if we're in CDB,
+	 * we already hold the locks we need, and we need to flag
+	 * the cursor as a WRITER so we don't run into errors
+	 * when we try to delete.
+	 */
+	if (CDB_LOCKING(env)) {
+		DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID);
+		F_SET(pdbc, DBC_WRITER);
+	}
+
+	/*
+	 * Set the new cursor to the correct primary key.  Then
+	 * delete it.  We don't really care about the datum;
+	 * just reuse our skey DBT.
+	 *
+	 * If the primary get returns DB_NOTFOUND, something is amiss--
+	 * every record in the secondary should correspond to some record
+	 * in the primary.
+	 */
+	if ((ret = __dbc_get(pdbc, &pkey, &skey, DB_SET | rmw)) == 0)
+		ret = __dbc_del(pdbc, 0);
+	else if (ret == DB_NOTFOUND)
+		ret = __db_secondary_corrupt(pdbp);
+
+	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __dbc_del_primary --
+ *	Perform a delete operation on a primary index.  Loop through
+ *	all the secondary indices which correspond to this primary
+ *	database, and delete any secondary keys that point at the current
+ *	record.
+ *
+ * PUBLIC: int __dbc_del_primary __P((DBC *));
+ */
+int
+__dbc_del_primary(dbc)
+	DBC *dbc;
+{
+	DB *dbp, *sdbp;
+	DBC *sdbc;
+	DBT *tskeyp;
+	DBT data, pkey, skey, temppkey, tempskey;
+	ENV *env;
+	u_int32_t nskey, rmw;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	sdbp = NULL;
+	rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
+
+	/*
+	 * If we're called at all, we have at least one secondary.
+	 * (Unfortunately, we can't assert this without grabbing the mutex.)
+	 * Get the current record so that we can construct appropriate
+	 * secondary keys as needed.
+	 */
+	memset(&pkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = __dbc_get(dbc, &pkey, &data, DB_CURRENT)) != 0)
+		return (ret);
+
+	/*
+	 * Walk the secondary list; __db_s_first/__db_s_next handle
+	 * refcounting so a handle can't be closed out from under us.
+	 */
+	memset(&skey, 0, sizeof(DBT));
+	for (ret = __db_s_first(dbp, &sdbp);
+	    sdbp != NULL && ret == 0;
+	    ret = __db_s_next(&sdbp, dbc->txn)) {
+		/*
+		 * Get the secondary key for this secondary and the current
+		 * item.
+		 */
+		if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) {
+			/* Not indexing is equivalent to an empty key set. */
+			if (ret == DB_DONOTINDEX) {
+				F_SET(&skey, DB_DBT_MULTIPLE);
+				skey.size = 0;
+			} else /* We had a substantive error.  Bail. */
+				goto err;
+		}
+
+#ifdef DIAGNOSTIC
+		if (F_ISSET(&skey, DB_DBT_MULTIPLE))
+			__db_check_skeyset(sdbp, &skey);
+#endif
+
+		/* Normalize: treat a single key as a one-element key set. */
+		if (F_ISSET(&skey, DB_DBT_MULTIPLE)) {
+			tskeyp = (DBT *)skey.data;
+			nskey = skey.size;
+			if (nskey == 0)
+				continue;
+		} else {
+			tskeyp = &skey;
+			nskey = 1;
+		}
+
+		/* Open a secondary cursor. */
+		if ((ret = __db_cursor_int(sdbp,
+		    dbc->thread_info, dbc->txn, sdbp->type,
+		    PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0)
+			goto err;
+		/* See comment above and in __dbc_put. */
+		if (CDB_LOCKING(env)) {
+			DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+			F_SET(sdbc, DBC_WRITER);
+		}
+
+		for (; nskey > 0; nskey--, tskeyp++) {
+			/*
+			 * Set the secondary cursor to the appropriate item.
+			 * Delete it.
+			 *
+			 * We want to use DB_RMW if locking is on; it's only
+			 * legal then, though.
+			 *
+			 * !!!
+			 * Don't stomp on any callback-allocated buffer in skey
+			 * when we do a c_get(DB_GET_BOTH); use a temp DBT
+			 * instead.  Similarly, don't allow pkey to be
+			 * invalidated when the cursor is closed.
+			 */
+			DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size);
+			SWAP_IF_NEEDED(sdbp, &pkey);
+			DB_INIT_DBT(temppkey, pkey.data, pkey.size);
+			if ((ret = __dbc_get(sdbc, &tempskey, &temppkey,
+			    DB_GET_BOTH | rmw)) == 0)
+				ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY);
+			else if (ret == DB_NOTFOUND)
+				ret = __db_secondary_corrupt(dbp);
+			SWAP_IF_NEEDED(sdbp, &pkey);
+			FREE_IF_NEEDED(env, tskeyp);
+		}
+
+		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * In the common case where there is a single secondary key, we
+		 * will have freed any application-allocated data in skey
+		 * already.  In the multiple key case, we need to free it here.
+		 * It is safe to do this twice as the macro resets the data
+		 * field.
+		 */
+		FREE_IF_NEEDED(env, &skey);
+	}
+
+err:	if (sdbp != NULL &&
+	    (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
+		ret = t_ret;
+	FREE_IF_NEEDED(env, &skey);
+	return (ret);
+}
+
+/*
+ * __dbc_del_foreign --
+ *	Apply the foreign database constraints for a particular foreign
+ *	database when an item is being deleted (dbc points at item being deleted
+ *	in the foreign database.)
+ *
+ *	Delete happens in dbp, check for occurrences of key in pdpb.
+ *	Terminology:
+ *		Foreign db = Where delete occurs (dbp).
+ *		Secondary db = Where references to dbp occur (sdbp, a secondary)
+ *		Primary db = sdbp's primary database, references to dbp are
+ *		secondary keys here
+ *		Foreign Key = Key being deleted in dbp (fkey)
+ *		Primary Key = Key of the corresponding entry in sdbp's
+ *		primary (pkey).
+ */
+static int
+__dbc_del_foreign(dbc)
+	DBC *dbc;
+{
+	DB_FOREIGN_INFO *f_info;
+	DB *dbp, *pdbp, *sdbp;
+	DBC *pdbc, *sdbc;
+	DBT data, fkey, pkey;
+	ENV *env;
+	u_int32_t flags, rmw;
+	int changed, ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Read the foreign key being deleted. */
+	memset(&fkey, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	if ((ret = __dbc_get(dbc, &fkey, &data, DB_CURRENT)) != 0)
+		return (ret);
+
+	LIST_FOREACH(f_info, &(dbp->f_primaries), f_links) {
+		sdbp = f_info->dbp;
+		pdbp = sdbp->s_primary;
+		flags = f_info->flags;
+
+		rmw = (STD_LOCKING(dbc) &&
+		    !LF_ISSET(DB_FOREIGN_ABORT)) ? DB_RMW : 0;
+
+		/*
+		 * Handle CDB locking.  Some of this is copied from
+		 * __dbc_del_primary, but a bit more acrobatics are required.
+		 * If we're not going to abort, then we need to get a write
+		 * cursor.  If CDB_ALLDB is set, then only one write cursor is
+		 * allowed and we hold it, so we fudge things and promote the
+		 * cursor on the other DBs manually, it won't cause a problem.
+		 * If CDB_ALLDB is not set, then we go through the usual route
+		 * to make sure we block as necessary.  If there are any open
+		 * read cursors on sdbp, the delete or put call later will
+		 * block.
+		 *
+		 * If NULLIFY is set, we'll need a cursor on the primary to
+		 * update it with the nullified data.  Because primary and
+		 * secondary dbs share a lock file ID in CDB, we open a cursor
+		 * on the secondary and then get another writeable cursor on the
+		 * primary via __db_cursor_int to avoid deadlocking.
+		 */
+		sdbc = pdbc = NULL;
+		if (!LF_ISSET(DB_FOREIGN_ABORT) && CDB_LOCKING(env) &&
+		    !F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+			ret = __db_cursor(sdbp,
+			    dbc->thread_info, dbc->txn, &sdbc, DB_WRITECURSOR);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) {
+				ret = __db_cursor_int(pdbp,
+				    dbc->thread_info, dbc->txn, pdbp->type,
+				    PGNO_INVALID, 0, dbc->locker, &pdbc);
+				/*
+				 * BUGFIX: only flag the primary cursor as a
+				 * writer if it was actually created; pdbc is
+				 * still NULL when the open fails, and the
+				 * unconditional F_SET dereferenced it.
+				 */
+				if (ret == 0)
+					F_SET(pdbc, DBC_WRITER);
+			}
+		} else {
+			ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn,
+			    sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0)
+				ret = __db_cursor_int(pdbp, dbc->thread_info,
+				    dbc->txn, pdbp->type, PGNO_INVALID, 0,
+				    dbc->locker, &pdbc);
+		}
+		if (ret != 0) {
+			if (sdbc != NULL)
+				(void)__dbc_close(sdbc);
+			return (ret);
+		}
+		if (CDB_LOCKING(env) && F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) {
+			DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID);
+			F_SET(sdbc, DBC_WRITER);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) && pdbc != NULL) {
+				DB_ASSERT(env,
+				    pdbc->mylock.off == LOCK_INVALID);
+				F_SET(pdbc, DBC_WRITER);
+			}
+		}
+
+		/*
+		 * There are three actions possible when a foreign database has
+		 * items corresponding to a deleted item:
+		 * DB_FOREIGN_ABORT - The delete operation should be aborted.
+		 * DB_FOREIGN_CASCADE - All corresponding foreign items should
+		 * be deleted.
+		 * DB_FOREIGN_NULLIFY - A callback needs to be made, allowing
+		 * the application to modify the data DBT from the
+		 * associated database.  If the callback makes a
+		 * modification, the updated item needs to replace the
+		 * original item in the foreign db
+		 */
+		memset(&pkey, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_SET|rmw);
+
+		if (ret == DB_NOTFOUND) {
+			/* No entry means no constraint */
+			ret = __dbc_close(sdbc);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+			    (t_ret = __dbc_close(pdbc)) != 0)
+				ret = t_ret;
+			if (ret != 0)
+				return (ret);
+			continue;
+		} else if (ret != 0) {
+			/* Just return the error code from the pget */
+			(void)__dbc_close(sdbc);
+			if (LF_ISSET(DB_FOREIGN_NULLIFY))
+				(void)__dbc_close(pdbc);
+			return (ret);
+		} else if (LF_ISSET(DB_FOREIGN_ABORT)) {
+			/* If the record exists and ABORT is set, we're done */
+			if ((ret = __dbc_close(sdbc)) != 0)
+				return (ret);
+			return (DB_FOREIGN_CONFLICT);
+		}
+
+		/*
+		 * There were matching items in the primary DB, and the action
+		 * is either DB_FOREIGN_CASCADE or DB_FOREIGN_NULLIFY.
+		 */
+		while (ret == 0) {
+			if (LF_ISSET(DB_FOREIGN_CASCADE)) {
+				/*
+				 * Don't use the DB_UPDATE_SECONDARY flag,
+				 * since we want the delete to cascade into the
+				 * secondary's primary.
+				 */
+				if ((ret = __dbc_del(sdbc, 0)) != 0) {
+					__db_err(env, ret,
+		    "Attempt to execute cascading delete in a foreign index failed");
+					break;
+				}
+			} else if (LF_ISSET(DB_FOREIGN_NULLIFY)) {
+				changed = 0;
+				if ((ret = f_info->callback(sdbp,
+				    &pkey, &data, &fkey, &changed)) != 0) {
+					__db_err(env, ret,
+					    "Foreign database application callback");
+					break;
+				}
+
+				/*
+				 * If the user callback modified the DBT and
+				 * a put on the primary failed.
+				 */
+				if (changed && (ret = __dbc_put(pdbc,
+				    &pkey, &data, DB_KEYFIRST)) != 0) {
+					__db_err(env, ret,
+		    "Attempt to overwrite item in foreign database with nullified value failed");
+					break;
+				}
+			}
+			/* retrieve the next matching item from the prim. db */
+			memset(&pkey, 0, sizeof(DBT));
+			memset(&data, 0, sizeof(DBT));
+			ret = __dbc_pget(sdbc,
+			    &fkey, &pkey, &data, DB_NEXT_DUP|rmw);
+		}
+
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+		if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (LF_ISSET(DB_FOREIGN_NULLIFY) &&
+		    (t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			return (ret);
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_s_first --
+ *	Get the first secondary, if any are present, from the primary.
+ *
+ * PUBLIC: int __db_s_first __P((DB *, DB **));
+ */
+int
+__db_s_first(pdbp, sdbpp)
+	DB *pdbp, **sdbpp;
+{
+	DB *first;
+
+	/*
+	 * The secondary list may only be touched with the primary's mutex
+	 * held; take a reference under that mutex so the handle cannot be
+	 * closed out from under us (see __db_s_next).
+	 */
+	MUTEX_LOCK(pdbp->env, pdbp->mutex);
+	first = LIST_FIRST(&pdbp->s_secondaries);
+	if (first != NULL)
+		first->s_refcnt++;
+	MUTEX_UNLOCK(pdbp->env, pdbp->mutex);
+
+	*sdbpp = first;
+
+	return (0);
+}
+
+/*
+ * __db_s_next --
+ *	Get the next secondary in the list.  Drops the reference taken on
+ *	*sdbpp (closing it if we held the last reference) and returns the
+ *	following handle, referenced, in *sdbpp.
+ *
+ * PUBLIC: int __db_s_next __P((DB **, DB_TXN *));
+ */
+int
+__db_s_next(sdbpp, txn)
+	DB **sdbpp;
+	DB_TXN *txn;
+{
+	DB *sdbp, *pdbp, *closeme;
+	ENV *env;
+	int ret;
+
+	/*
+	 * Secondary indices are kept in a linked list, s_secondaries,
+	 * off each primary DB handle.  If a primary is free-threaded,
+	 * this list may only be traversed or modified while the primary's
+	 * thread mutex is held.
+	 *
+	 * The tricky part is that we don't want to hold the thread mutex
+	 * across the full set of secondary puts necessary for each primary
+	 * put, or we'll wind up essentially single-threading all the puts
+	 * to the handle; the secondary puts will each take about as
+	 * long as the primary does, and may require I/O.  So we instead
+	 * hold the thread mutex only long enough to follow one link to the
+	 * next secondary, and then we release it before performing the
+	 * actual secondary put.
+	 *
+	 * The only danger here is that we might legitimately close a
+	 * secondary index in one thread while another thread is performing
+	 * a put and trying to update that same secondary index.  To
+	 * prevent this from happening, we refcount the secondary handles.
+	 * If close is called on a secondary index handle while we're putting
+	 * to it, it won't really be closed--the refcount will simply drop,
+	 * and we'll be responsible for closing it here.
+	 */
+	sdbp = *sdbpp;
+	pdbp = sdbp->s_primary;
+	env = pdbp->env;
+	closeme = NULL;
+
+	/* Release our reference and take one on the next handle. */
+	MUTEX_LOCK(env, pdbp->mutex);
+	DB_ASSERT(env, sdbp->s_refcnt != 0);
+	if (--sdbp->s_refcnt == 0) {
+		LIST_REMOVE(sdbp, s_links);
+		closeme = sdbp;
+	}
+	sdbp = LIST_NEXT(sdbp, s_links);
+	if (sdbp != NULL)
+		sdbp->s_refcnt++;
+	MUTEX_UNLOCK(env, pdbp->mutex);
+
+	*sdbpp = sdbp;
+
+	/*
+	 * closeme->close() is a wrapper; call __db_close explicitly.
+	 */
+	if (closeme == NULL)
+		ret = 0;
+	else
+		ret = __db_close(closeme, txn, 0);
+
+	return (ret);
+}
+
+/*
+ * __db_s_done --
+ *	Properly decrement the refcount on a secondary database handle we're
+ *	using, without calling __db_s_next.
+ *
+ * PUBLIC: int __db_s_done __P((DB *, DB_TXN *));
+ */
+int
+__db_s_done(sdbp, txn)
+	DB *sdbp;
+	DB_TXN *txn;
+{
+	DB *pdbp;
+	ENV *env;
+	int last;
+
+	pdbp = sdbp->s_primary;
+	env = pdbp->env;
+
+	/*
+	 * Drop our reference under the primary's mutex.  If we held the
+	 * last one, it falls to us to unlink and close the handle (see
+	 * __db_s_next for the refcounting scheme).
+	 */
+	MUTEX_LOCK(env, pdbp->mutex);
+	DB_ASSERT(env, sdbp->s_refcnt != 0);
+	if ((last = (--sdbp->s_refcnt == 0)) != 0)
+		LIST_REMOVE(sdbp, s_links);
+	MUTEX_UNLOCK(env, pdbp->mutex);
+
+	return (last ? __db_close(sdbp, txn, 0) : 0);
+}
+
+/*
+ * __db_s_count --
+ *	Count the number of secondaries associated with a given primary.
+ */
+static int
+__db_s_count(pdbp)
+	DB *pdbp;
+{
+	DB *sdbp;
+	ENV *env;
+	int n;
+
+	env = pdbp->env;
+
+	/* Walk the list under the primary's mutex; see __db_s_next. */
+	n = 0;
+	MUTEX_LOCK(env, pdbp->mutex);
+	LIST_FOREACH(sdbp, &pdbp->s_secondaries, s_links)
+		++n;
+	MUTEX_UNLOCK(env, pdbp->mutex);
+
+	return (n);
+}
+
+/*
+ * __db_buildpartial --
+ *	Build the record that will result after a partial put is applied to
+ *	an existing record.
+ *
+ *	This should probably be merged with __bam_build, but that requires
+ *	a little trickery if we plan to keep the overflow-record optimization
+ *	in that function.
+ *
+ * PUBLIC: int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *));
+ */
+int
+__db_buildpartial(dbp, oldrec, partial, newrec)
+	DB *dbp;
+	DBT *oldrec, *partial, *newrec;
+{
+	ENV *env;
+	u_int32_t end, total;
+	u_int8_t *p;
+	int ret;
+
+	env = dbp->env;
+
+	DB_ASSERT(env, F_ISSET(partial, DB_DBT_PARTIAL));
+
+	/* Size the result and allocate a buffer to hold it. */
+	memset(newrec, 0, sizeof(DBT));
+	total = __db_partsize(oldrec->size, partial);
+	newrec->size = total;
+
+	if ((ret = __os_malloc(env, total, &p)) != 0)
+		return (ret);
+	newrec->data = p;
+
+	/*
+	 * Fill the whole buffer with the pad byte (fixed-length records)
+	 * or nul, so any gap between the old data and partial->doff is
+	 * well-defined.
+	 */
+	memset(p, F_ISSET(dbp, DB_AM_FIXEDLEN) ?
+	    ((BTREE *)dbp->bt_internal)->re_pad : 0, total);
+
+	/* Leading bytes of the original record, up to the partial offset. */
+	memcpy(p, oldrec->data,
+	    partial->doff > oldrec->size ? oldrec->size : partial->doff);
+
+	/* The replacement bytes themselves. */
+	memcpy(p + partial->doff, partial->data, partial->size);
+
+	/* Any original bytes surviving past the replaced region. */
+	end = partial->doff + partial->dlen;
+	if (oldrec->size > end)
+		memcpy(p + partial->doff + partial->size,
+		    (u_int8_t *)oldrec->data + end, oldrec->size - end);
+
+	return (0);
+}
+
+/*
+ * __db_partsize --
+ *	Given the number of bytes in an existing record and a DBT that
+ *	is about to be partial-put, calculate the size of the record
+ *	after the put.
+ *
+ *	This code is called from __bam_partsize.
+ *
+ * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *));
+ */
+u_int32_t
+__db_partsize(nbytes, data)
+	u_int32_t nbytes;
+	DBT *data;
+{
+	/*
+	 * If every byte being replaced ([doff, doff + dlen)) lies within
+	 * the existing record, the new size is the old size, minus the
+	 * replaced bytes, plus the new bytes.
+	 *
+	 * Otherwise some of the "replaced" bytes are past the end of the
+	 * record, so how many of them exist is irrelevant: the record
+	 * ends wherever the new bytes end, at doff + size.
+	 */
+	if (nbytes >= data->doff + data->dlen)
+		return (nbytes + data->size - data->dlen);
+
+	return (data->doff + data->size);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_check_skeyset --
+ *	Diagnostic check that the application's callback returns a set of
+ *	secondary keys without repeats.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: void __db_check_skeyset __P((DB *, DBT *));
+ * PUBLIC: #endif
+ */
+void
+__db_check_skeyset(sdbp, skeyp)
+	DB *sdbp;
+	DBT *skeyp;
+{
+	DBT *firstkey, *lastkey, *key1, *key2;
+	ENV *env;
+
+	env = sdbp->env;
+
+	/*
+	 * skeyp is a DB_DBT_MULTIPLE set: data points at an array of
+	 * skeyp->size DBTs.  Assert all pairs compare unequal.
+	 */
+	firstkey = (DBT *)skeyp->data;
+	lastkey = firstkey + skeyp->size;
+	for (key1 = firstkey; key1 < lastkey; key1++)
+		for (key2 = key1 + 1; key2 < lastkey; key2++)
+			DB_ASSERT(env,
+			    ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp,
+			    key1, key2) != 0);
+}
+#endif
diff --git a/db/db_cds.c b/db/db_cds.c
new file mode 100644
index 0000000..5efda31
--- /dev/null
+++ b/db/db_cds.c
@@ -0,0 +1,177 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+
+static int __cdsgroup_abort __P((DB_TXN *txn));
+static int __cdsgroup_commit __P((DB_TXN *txn, u_int32_t flags));
+static int __cdsgroup_discard __P((DB_TXN *txn, u_int32_t flags));
+static u_int32_t __cdsgroup_id __P((DB_TXN *txn));
+static int __cdsgroup_notsup __P((ENV *env, const char *meth));
+static int __cdsgroup_prepare __P((DB_TXN *txn, u_int8_t *gid));
+static int __cdsgroup_set_name __P((DB_TXN *txn, const char *name));
+static int __cdsgroup_set_timeout
+ __P((DB_TXN *txn, db_timeout_t timeout, u_int32_t flags));
+
/*
 * __cdsgroup_notsup --
 *	Error when CDS groups don't support a method.
 *
 *	Reports the unsupported method name through the environment's
 *	error channel and returns DB_OPNOTSUP to the caller.
 */
static int
__cdsgroup_notsup(env, meth)
	ENV *env;
	const char *meth;
{
	__db_errx(env, "CDS groups do not support %s", meth);
	return (DB_OPNOTSUP);
}
+
+static int
+__cdsgroup_abort(txn)
+ DB_TXN *txn;
+{
+ return (__cdsgroup_notsup(txn->mgrp->env, "abort"));
+}
+
+static int
+__cdsgroup_commit(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB_LOCKER *locker;
+ DB_LOCKREQ lreq;
+ ENV *env;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ env = txn->mgrp->env;
+
+ /* Check for live cursors. */
+ if (txn->cursors != 0) {
+ __db_errx(env, "CDS group has active cursors");
+ return (EINVAL);
+ }
+
+ /* We may be holding handle locks; release them. */
+ lreq.op = DB_LOCK_PUT_ALL;
+ lreq.obj = NULL;
+ ret = __lock_vec(env, txn->locker, 0, &lreq, 1, NULL);
+
+ env = txn->mgrp->env;
+ locker = txn->locker;
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int __cdsgroup_discard(txn, flags)
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "discard"));
+}
+
+static u_int32_t __cdsgroup_id(txn)
+ DB_TXN *txn;
+{
+ return (txn->txnid);
+}
+
+static int __cdsgroup_prepare(txn, gid)
+ DB_TXN *txn;
+ u_int8_t *gid;
+{
+ COMPQUIET(gid, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "prepare"));
+}
+
+static int __cdsgroup_set_name(txn, name)
+ DB_TXN *txn;
+ const char *name;
+{
+ COMPQUIET(name, NULL);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_name"));
+}
+
+static int __cdsgroup_set_timeout(txn, timeout, flags)
+ DB_TXN *txn;
+ db_timeout_t timeout;
+ u_int32_t flags;
+{
+ COMPQUIET(timeout, 0);
+ COMPQUIET(flags, 0);
+ return (__cdsgroup_notsup(txn->mgrp->env, "set_timeout"));
+}
+
+/*
+ * __cds_txn_begin --
+ * ENV->cdsgroup_begin
+ *
+ * PUBLIC: int __cdsgroup_begin __P((DB_ENV *, DB_TXN **));
+ */
+int
+__cdsgroup_begin(dbenv, txnpp)
+ DB_ENV *dbenv;
+ DB_TXN **txnpp;
+{
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ILLEGAL_BEFORE_OPEN(env, "cdsgroup_begin");
+ if (!CDB_LOCKING(env))
+ return (__env_not_config(env, "cdsgroup_begin", DB_INIT_CDB));
+
+ ENV_ENTER(env, ip);
+ *txnpp = txn = NULL;
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0)
+ goto err;
+ /*
+ * We need a dummy DB_TXNMGR -- it's the only way to get from a
+ * transaction handle to the environment handle.
+ */
+ if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &txn->mgrp)) != 0)
+ goto err;
+ txn->mgrp->env = env;
+
+ if ((ret = __lock_id(env, &txn->txnid, &txn->locker)) != 0)
+ goto err;
+
+ txn->flags = TXN_CDSGROUP;
+ txn->abort = __cdsgroup_abort;
+ txn->commit = __cdsgroup_commit;
+ txn->discard = __cdsgroup_discard;
+ txn->id = __cdsgroup_id;
+ txn->prepare = __cdsgroup_prepare;
+ txn->set_name = __cdsgroup_set_name;
+ txn->set_timeout = __cdsgroup_set_timeout;
+
+ *txnpp = txn;
+
+ if (0) {
+err: if (txn != NULL) {
+ if (txn->mgrp != NULL)
+ __os_free(env, txn->mgrp);
+ __os_free(env, txn);
+ }
+ }
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
diff --git a/db/db_conv.c b/db/db_conv.c
new file mode 100644
index 0000000..4572683
--- /dev/null
+++ b/db/db_conv.c
@@ -0,0 +1,733 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+
/*
 * __db_pgin --
 *	Primary page-swap routine.
 *
 *	Called by the memory pool as a page is read from disk: verify the
 *	page checksum, decrypt the page if encryption is configured, then
 *	dispatch to the access method's page-in routine for byte-swapping.
 *
 * PUBLIC: int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
 */
int
__db_pgin(dbenv, pg, pp, cookie)
	DB_ENV *dbenv;
	db_pgno_t pg;
	void *pp;
	DBT *cookie;
{
	DB dummydb, *dbp;
	DB_CIPHER *db_cipher;
	DB_LSN not_used;
	DB_PGINFO *pginfo;
	ENV *env;
	PAGE *pagep;
	size_t sum_len;
	int is_hmac, ret;
	u_int8_t *chksum;

	pginfo = (DB_PGINFO *)cookie->data;
	env = dbenv->env;
	pagep = (PAGE *)pp;

	ret = is_hmac = 0;
	chksum = NULL;
	/*
	 * Build a throwaway stack DB handle carrying only the flags and
	 * page size the swap/crypto macros below need.
	 */
	memset(&dummydb, 0, sizeof(DB));
	dbp = &dummydb;
	dbp->dbenv = dbenv;
	dbp->env = env;
	dbp->flags = pginfo->flags;
	dbp->pgsize = pginfo->db_pagesize;
	db_cipher = env->crypto_handle;
	/* Locate the checksum and decide whether it is an HMAC. */
	switch (pagep->type) {
	case P_HASHMETA:
	case P_BTREEMETA:
	case P_QAMMETA:
		/*
		 * If checksumming is set on the meta-page, we must set
		 * it in the dbp.
		 */
		if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM))
			F_SET(dbp, DB_AM_CHKSUM);
		else
			F_CLR(dbp, DB_AM_CHKSUM);
		if (((DBMETA *)pp)->encrypt_alg != 0 ||
		    F_ISSET(dbp, DB_AM_ENCRYPT))
			is_hmac = 1;
		/*
		 * !!!
		 * For all meta pages it is required that the chksum
		 * be at the same location. Use BTMETA to get to it
		 * for any meta type.
		 */
		chksum = ((BTMETA *)pp)->chksum;
		sum_len = DBMETASIZE;
		break;
	case P_INVALID:
		/*
		 * We assume that we've read a file hole if we have
		 * a zero LSN, zero page number and P_INVALID. Otherwise
		 * we have an invalid page that might contain real data.
		 */
		if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) {
			sum_len = 0;
			break;
		}
		/* FALLTHROUGH */
	default:
		chksum = P_CHKSUM(dbp, pagep);
		sum_len = pginfo->db_pagesize;
		/*
		 * If we are reading in a non-meta page, then if we have
		 * a db_cipher then we are using hmac.
		 */
		is_hmac = CRYPTO_ON(env) ? 1 : 0;
		break;
	}

	/*
	 * We expect a checksum error if there was a configuration problem.
	 * If there is no configuration problem and we don't get a match,
	 * it's fatal: panic the system.
	 */
	if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0) {
		/* Plain (non-HMAC) checksums were stored in native order. */
		if (F_ISSET(dbp, DB_AM_SWAP) && is_hmac == 0)
			P_32_SWAP(chksum);
		switch (ret = __db_check_chksum(
		    env, NULL, db_cipher, chksum, pp, sum_len, is_hmac)) {
		case 0:
			break;
		case -1:
			if (DBENV_LOGGING(env))
				(void)__db_cksum_log(
				    env, NULL, &not_used, DB_FLUSH);
			__db_errx(env,
	    "checksum error: page %lu: catastrophic recovery required",
			    (u_long)pg);
			return (__env_panic(env, DB_RUNRECOVERY));
		default:
			return (ret);
		}
	}
	if ((ret = __db_decrypt_pg(env, dbp, pagep)) != 0)
		return (ret);
	/* Dispatch to the access method's page-in routine. */
	switch (pagep->type) {
	case P_INVALID:
		if (pginfo->type == DB_QUEUE)
			return (__qam_pgin_out(env, pg, pp, cookie));
		else
			return (__ham_pgin(dbp, pg, pp, cookie));
	case P_HASH_UNSORTED:
	case P_HASH:
	case P_HASHMETA:
		return (__ham_pgin(dbp, pg, pp, cookie));
	case P_BTREEMETA:
	case P_IBTREE:
	case P_IRECNO:
	case P_LBTREE:
	case P_LDUP:
	case P_LRECNO:
	case P_OVERFLOW:
		return (__bam_pgin(dbp, pg, pp, cookie));
	case P_QAMMETA:
	case P_QAMDATA:
		return (__qam_pgin_out(env, pg, pp, cookie));
	default:
		break;
	}
	return (__db_pgfmt(env, pg));
}
+
+/*
+ * __db_pgout --
+ * Primary page-swap routine.
+ *
+ * PUBLIC: int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__db_pgout(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB dummydb, *dbp;
+ DB_PGINFO *pginfo;
+ ENV *env;
+ PAGE *pagep;
+ int ret;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ env = dbenv->env;
+ pagep = (PAGE *)pp;
+
+ memset(&dummydb, 0, sizeof(DB));
+ dbp = &dummydb;
+ dbp->dbenv = dbenv;
+ dbp->env = env;
+ dbp->flags = pginfo->flags;
+ dbp->pgsize = pginfo->db_pagesize;
+ ret = 0;
+ switch (pagep->type) {
+ case P_INVALID:
+ if (pginfo->type == DB_QUEUE)
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ else
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_HASH:
+ case P_HASH_UNSORTED:
+ /*
+ * Support pgout of unsorted hash pages - since online
+ * replication upgrade can cause pages of this type to be
+ * written out.
+ *
+ * FALLTHROUGH
+ */
+ case P_HASHMETA:
+ ret = __ham_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_BTREEMETA:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ case P_OVERFLOW:
+ ret = __bam_pgout(dbp, pg, pp, cookie);
+ break;
+ case P_QAMMETA:
+ case P_QAMDATA:
+ ret = __qam_pgin_out(env, pg, pp, cookie);
+ break;
+ default:
+ return (__db_pgfmt(env, pg));
+ }
+ if (ret)
+ return (ret);
+
+ return (__db_encrypt_and_checksum_pg(env, dbp, pagep));
+}
+
/*
 * __db_decrypt_pg --
 *	Utility function to decrypt a db page.
 *
 *	A no-op unless the handle has DB_AM_ENCRYPT set.  The page header
 *	(P_OVERHEAD bytes) is never encrypted; only the body from pg_off
 *	to pg_len is decrypted in place, using the IV stored on the page.
 *
 * PUBLIC: int __db_decrypt_pg __P((ENV *, DB *, PAGE *));
 */
int
__db_decrypt_pg (env, dbp, pagep)
	ENV *env;
	DB *dbp;
	PAGE *pagep;
{
	DB_CIPHER *db_cipher;
	size_t pg_len, pg_off;
	u_int8_t *iv;
	int ret;

	db_cipher = env->crypto_handle;
	ret = 0;
	iv = NULL;
	if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
		DB_ASSERT(env, db_cipher != NULL);
		/* Encryption always implies checksumming. */
		DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));

		pg_off = P_OVERHEAD(dbp);
		DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);

		/* Locate the IV and the length of the encrypted region. */
		switch (pagep->type) {
		case P_HASHMETA:
		case P_BTREEMETA:
		case P_QAMMETA:
			/*
			 * !!!
			 * For all meta pages it is required that the iv
			 * be at the same location. Use BTMETA to get to it
			 * for any meta type.
			 */
			iv = ((BTMETA *)pagep)->iv;
			pg_len = DBMETASIZE;
			break;
		case P_INVALID:
			/* A file hole (zero LSN/pgno) has nothing to decrypt. */
			if (IS_ZERO_LSN(LSN(pagep)) &&
			    pagep->pgno == PGNO_INVALID) {
				pg_len = 0;
				break;
			}
			/* FALLTHROUGH */
		default:
			iv = P_IV(dbp, pagep);
			pg_len = dbp->pgsize;
			break;
		}
		if (pg_len != 0)
			ret = db_cipher->decrypt(env, db_cipher->data,
			    iv, ((u_int8_t *)pagep) + pg_off,
			    pg_len - pg_off);
	}
	return (ret);
}
+
/*
 * __db_encrypt_and_checksum_pg --
 *	Utility function to encrypt and checksum a db page.
 *
 *	Encryption (if DB_AM_ENCRYPT) happens first, then the checksum
 *	(if DB_AM_CHKSUM) is computed over the encrypted image -- the
 *	inverse of the order used by __db_pgin on the way in.
 *
 * PUBLIC: int __db_encrypt_and_checksum_pg
 * PUBLIC: __P((ENV *, DB *, PAGE *));
 */
int
__db_encrypt_and_checksum_pg (env, dbp, pagep)
	ENV *env;
	DB *dbp;
	PAGE *pagep;
{
	DB_CIPHER *db_cipher;
	int ret;
	size_t pg_off, pg_len, sum_len;
	u_int8_t *chksum, *iv, *key;

	/* key stays NULL for checksum-only (non-encrypted) databases. */
	chksum = iv = key = NULL;
	db_cipher = env->crypto_handle;

	if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
		DB_ASSERT(env, db_cipher != NULL);
		/* Encryption always implies checksumming. */
		DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM));

		pg_off = P_OVERHEAD(dbp);
		DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0);

		key = db_cipher->mac_key;

		/* Locate the IV and the length of the region to encrypt. */
		switch (pagep->type) {
		case P_HASHMETA:
		case P_BTREEMETA:
		case P_QAMMETA:
			/*
			 * !!!
			 * For all meta pages it is required that the iv
			 * be at the same location. Use BTMETA to get to it
			 * for any meta type.
			 */
			iv = ((BTMETA *)pagep)->iv;
			pg_len = DBMETASIZE;
			break;
		default:
			iv = P_IV(dbp, pagep);
			pg_len = dbp->pgsize;
			break;
		}
		if ((ret = db_cipher->encrypt(env, db_cipher->data,
		    iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0)
			return (ret);
	}
	if (F_ISSET(dbp, DB_AM_CHKSUM)) {
		/* Locate the checksum field and the region it covers. */
		switch (pagep->type) {
		case P_HASHMETA:
		case P_BTREEMETA:
		case P_QAMMETA:
			/*
			 * !!!
			 * For all meta pages it is required that the chksum
			 * be at the same location. Use BTMETA to get to it
			 * for any meta type.
			 */
			chksum = ((BTMETA *)pagep)->chksum;
			sum_len = DBMETASIZE;
			break;
		default:
			chksum = P_CHKSUM(dbp, pagep);
			sum_len = dbp->pgsize;
			break;
		}
		__db_chksum(NULL, (u_int8_t *)pagep, sum_len, key, chksum);
		/* Non-HMAC checksums are stored byte-swapped on swapped DBs. */
		if (F_ISSET(dbp, DB_AM_SWAP) && !F_ISSET(dbp, DB_AM_ENCRYPT))
			P_32_SWAP(chksum);
	}
	return (0);
}
+
/*
 * __db_metaswap --
 *	Byteswap the common part of the meta-data page.
 *
 *	Walks the fixed layout of DBMETA field by field, swapping each
 *	32-bit field in place.  The field order here must match the
 *	DBMETA structure exactly.
 */
void
__db_metaswap(pg)
	PAGE *pg;
{
	u_int8_t *p;

	p = (u_int8_t *)pg;

	/* Swap the meta-data information. */
	SWAP32(p);		/* lsn.file */
	SWAP32(p);		/* lsn.offset */
	SWAP32(p);		/* pgno */
	SWAP32(p);		/* magic */
	SWAP32(p);		/* version */
	SWAP32(p);		/* pagesize */
	p += 4;			/* unused, page type, unused, unused */
	SWAP32(p);		/* free */
	SWAP32(p);		/* alloc_lsn part 1 */
	SWAP32(p);		/* alloc_lsn part 2 */
	SWAP32(p);		/* cached key count */
	SWAP32(p);		/* cached record count */
	SWAP32(p);		/* flags */
}
+
/*
 * __db_byteswap --
 *	Byteswap an ordinary database page.
 *
 *	pgin is non-zero when converting a page read from disk to host
 *	order, zero when converting to on-disk order.  The direction
 *	matters: on the way in, the inp index array must be swapped
 *	before it can be used to find entries; on the way out, the
 *	header and index array must be swapped last, after the entries
 *	they locate have been processed.
 *
 * PUBLIC: int __db_byteswap
 * PUBLIC: __P((DB *, db_pgno_t, PAGE *, size_t, int));
 */
int
__db_byteswap(dbp, pg, h, pagesize, pgin)
	DB *dbp;
	db_pgno_t pg;
	PAGE *h;
	size_t pagesize;
	int pgin;
{
	ENV *env;
	BINTERNAL *bi;
	BKEYDATA *bk;
	BOVERFLOW *bo;
	RINTERNAL *ri;
	db_indx_t i, *inp, len, tmp;
	u_int8_t *end, *p, *pgend;

	if (pagesize == 0)
		return (0);

	env = dbp->env;

	/* Incoming pages: make the header usable before touching entries. */
	if (pgin) {
		M_32_SWAP(h->lsn.file);
		M_32_SWAP(h->lsn.offset);
		M_32_SWAP(h->pgno);
		M_32_SWAP(h->prev_pgno);
		M_32_SWAP(h->next_pgno);
		M_16_SWAP(h->entries);
		M_16_SWAP(h->hf_offset);
	}

	pgend = (u_int8_t *)h + pagesize;

	inp = P_INP(dbp, h);
	/* Out-of-bounds index array: nothing more we can safely swap. */
	if ((u_int8_t *)inp >= pgend)
		goto out;

	switch (TYPE(h)) {
	case P_HASH_UNSORTED:
	case P_HASH:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			/* Skip entries whose offsets point off the page. */
			if (P_ENTRY(dbp, h, i) >= pgend)
				continue;

			switch (HPAGE_TYPE(dbp, h, i)) {
			case H_KEYDATA:
				break;
			case H_DUPLICATE:
				/*
				 * On-page duplicates are a sequence of
				 * [len][data][len] triples; swap each
				 * leading and trailing length.
				 */
				len = LEN_HKEYDATA(dbp, h, pagesize, i);
				p = HKEYDATA_DATA(P_ENTRY(dbp, h, i));
				for (end = p + len; p < end;) {
					if (pgin) {
						P_16_SWAP(p);
						memcpy(&tmp,
						    p, sizeof(db_indx_t));
						p += sizeof(db_indx_t);
					} else {
						memcpy(&tmp,
						    p, sizeof(db_indx_t));
						SWAP16(p);
					}
					p += tmp;
					SWAP16(p);
				}
				break;
			case H_OFFDUP:
				p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
				SWAP32(p);			/* pgno */
				break;
			case H_OFFPAGE:
				p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i));
				SWAP32(p);			/* pgno */
				SWAP32(p);			/* tlen */
				break;
			default:
				return (__db_pgfmt(env, pg));
			}

		}

		/*
		 * The offsets in the inp array are used to determine
		 * the size of entries on a page; therefore they
		 * cannot be converted until we've done all the
		 * entries.
		 */
		if (!pgin)
			for (i = 0; i < NUM_ENT(h); i++)
				M_16_SWAP(inp[i]);
		break;
	case P_LBTREE:
	case P_LDUP:
	case P_LRECNO:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			/*
			 * In the case of on-page duplicates, key information
			 * should only be swapped once.
			 */
			if (h->type == P_LBTREE && i > 1) {
				if (pgin) {
					if (inp[i] == inp[i - 2])
						continue;
				} else {
					/*
					 * Outgoing: offsets are still in host
					 * order, so swap temporarily to
					 * compare, then restore.
					 */
					M_16_SWAP(inp[i]);
					if (inp[i] == inp[i - 2])
						continue;
					M_16_SWAP(inp[i]);
				}
			}

			bk = GET_BKEYDATA(dbp, h, i);
			if ((u_int8_t *)bk >= pgend)
				continue;
			switch (B_TYPE(bk->type)) {
			case B_KEYDATA:
				M_16_SWAP(bk->len);
				break;
			case B_DUPLICATE:
			case B_OVERFLOW:
				bo = (BOVERFLOW *)bk;
				M_32_SWAP(bo->pgno);
				M_32_SWAP(bo->tlen);
				break;
			default:
				return (__db_pgfmt(env, pg));
			}

			if (!pgin)
				M_16_SWAP(inp[i]);
		}
		break;
	case P_IBTREE:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			bi = GET_BINTERNAL(dbp, h, i);
			if ((u_int8_t *)bi >= pgend)
				continue;

			M_16_SWAP(bi->len);
			M_32_SWAP(bi->pgno);
			M_32_SWAP(bi->nrecs);

			switch (B_TYPE(bi->type)) {
			case B_KEYDATA:
				break;
			case B_DUPLICATE:
			case B_OVERFLOW:
				bo = (BOVERFLOW *)bi->data;
				M_32_SWAP(bo->pgno);
				M_32_SWAP(bo->tlen);
				break;
			default:
				return (__db_pgfmt(env, pg));
			}

			if (!pgin)
				M_16_SWAP(inp[i]);
		}
		break;
	case P_IRECNO:
		for (i = 0; i < NUM_ENT(h); i++) {
			if (pgin)
				M_16_SWAP(inp[i]);

			ri = GET_RINTERNAL(dbp, h, i);
			if ((u_int8_t *)ri >= pgend)
				continue;

			M_32_SWAP(ri->pgno);
			M_32_SWAP(ri->nrecs);

			if (!pgin)
				M_16_SWAP(inp[i]);
		}
		break;
	case P_INVALID:
	case P_OVERFLOW:
	case P_QAMDATA:
		/* Nothing to do. */
		break;
	default:
		return (__db_pgfmt(env, pg));
	}

	/* Outgoing pages: swap the header only after entries are done. */
out:	if (!pgin) {
		/* Swap the header information. */
		M_32_SWAP(h->lsn.file);
		M_32_SWAP(h->lsn.offset);
		M_32_SWAP(h->pgno);
		M_32_SWAP(h->prev_pgno);
		M_32_SWAP(h->next_pgno);
		M_16_SWAP(h->entries);
		M_16_SWAP(h->hf_offset);
	}
	return (0);
}
+
/*
 * __db_pageswap --
 *	Byteswap any database page.  Normally, the page to be swapped will be
 *	referenced by the "pp" argument and the pdata argument will be NULL.
 *	This function is also called by automatically generated log functions,
 *	where the page may be split into separate header and data parts.  In
 *	that case, pdata is not NULL and we reconstitute a full page image
 *	from the two pieces, swap it, and then split it apart again.
 *
 * PUBLIC: int __db_pageswap
 * PUBLIC: __P((DB *, void *, size_t, DBT *, int));
 */
int
__db_pageswap(dbp, pp, len, pdata, pgin)
	DB *dbp;
	void *pp;
	size_t len;
	DBT *pdata;
	int pgin;
{
	ENV *env;
	db_pgno_t pg;
	size_t pgsize;
	void *pgcopy;
	int ret;
	u_int16_t hoffset;

	env = dbp->env;

	switch (TYPE(pp)) {
	case P_BTREEMETA:
		return (__bam_mswap(env, pp));

	case P_HASHMETA:
		return (__ham_mswap(env, pp));

	case P_QAMMETA:
		return (__qam_mswap(env, pp));

	case P_INVALID:
	case P_OVERFLOW:
	case P_QAMDATA:
		/*
		 * We may have been passed an invalid page, or a queue data
		 * page, or an overflow page where fields like hoffset have a
		 * special meaning.  In that case, no swapping of the page data
		 * is required, just the fields in the page header.
		 */
		pdata = NULL;
		break;

	default:
		break;
	}

	/*
	 * We need pgno and hoffset in host order; on the way in they are
	 * still in on-disk order and must be copy-swapped first.
	 */
	if (pgin) {
		P_32_COPYSWAP(&PGNO(pp), &pg);
		P_16_COPYSWAP(&HOFFSET(pp), &hoffset);
	} else {
		pg = PGNO(pp);
		hoffset = HOFFSET(pp);
	}

	if (pdata == NULL)
		ret = __db_byteswap(dbp, pg, (PAGE *)pp, len, pgin);
	else {
		/* Rebuild a full page image: header at 0, data at hoffset. */
		pgsize = hoffset + pdata->size;
		if ((ret = __os_malloc(env, pgsize, &pgcopy)) != 0)
			return (ret);
		memset(pgcopy, 0, pgsize);
		memcpy(pgcopy, pp, len);
		memcpy((u_int8_t *)pgcopy + hoffset, pdata->data, pdata->size);

		ret = __db_byteswap(dbp, pg, (PAGE *)pgcopy, pgsize, pgin);
		memcpy(pp, pgcopy, len);

		/*
		 * If we are swapping data to be written to the log, we can't
		 * overwrite the buffer that was passed in: it may be a pointer
		 * into a page in cache.  We set DB_DBT_APPMALLOC here so that
		 * the calling code can free the memory we allocate here.
		 */
		if (!pgin) {
			if ((ret =
			    __os_malloc(env, pdata->size, &pdata->data)) != 0) {
				__os_free(env, pgcopy);
				return (ret);
			}
			F_SET(pdata, DB_DBT_APPMALLOC);
		}
		memcpy(pdata->data, (u_int8_t *)pgcopy + hoffset, pdata->size);
		__os_free(env, pgcopy);
	}

	return (ret);
}
diff --git a/db/db_dispatch.c b/db/db_dispatch.c
new file mode 100644
index 0000000..65dc260
--- /dev/null
+++ b/db/db_dispatch.c
@@ -0,0 +1,953 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1995, 1996
+ * The President and Fellows of Harvard University. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *,
+ db_txnlist_type, u_int32_t, DB_TXNLIST **,
+ int, u_int32_t *));
+
/*
 * __db_dispatch --
 *
 * This is the transaction dispatch function used by the db access methods.
 * It is designed to handle the record format used by all the access
 * methods (the one automatically generated by the db_{h,log,read}.sh
 * scripts in the tools directory). An application using a different
 * recovery paradigm will supply a different dispatch function to txn_open.
 *
 * Decides, per recovery pass (redo), whether this log record should be
 * passed to its recovery function at all, then routes it to either the
 * internal dispatch table, the application's app_dispatch callback, or
 * the external (application-registered) dispatch table.
 *
 * PUBLIC: int __db_dispatch __P((ENV *,
 * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *));
 */
int
__db_dispatch(env, dtab, db, lsnp, redo, info)
	ENV *env;		/* The environment. */
	DB_DISTAB *dtab;
	DBT *db;		/* The log record upon which to dispatch. */
	DB_LSN *lsnp;		/* The lsn of the record being dispatched. */
	db_recops redo;		/* Redo this op (or undo it). */
	DB_TXNHEAD *info;	/* Transaction list. */
{
	DB_ENV *dbenv;
	DB_LSN prev_lsn;
	u_int32_t rectype, status, txnid, urectype;
	int make_call, ret;

	dbenv = env->dbenv;
	/* Every log record starts with its type followed by its txnid. */
	LOGCOPY_32(env, &rectype, db->data);
	LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype));

	make_call = ret = 0;

	/* If we don't have a dispatch table, it's hard to dispatch. */
	DB_ASSERT(env, dtab != NULL);

	/*
	 * If we find a record that is in the user's number space and they
	 * have specified a recovery routine, let them handle it. If they
	 * didn't specify a recovery routine, then we expect that they've
	 * followed all our rules and registered new recovery functions.
	 */
	switch (redo) {
	case DB_TXN_ABORT:
	case DB_TXN_APPLY:
	case DB_TXN_PRINT:
		make_call = 1;
		break;
	case DB_TXN_OPENFILES:
		/*
		 * We collect all the transactions that have
		 * "begin" records, those with no previous LSN,
		 * so that we do not abort partial transactions.
		 * These are known to be undone, otherwise the
		 * log would not have been freeable.
		 */
		LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data +
		    sizeof(rectype) + sizeof(txnid));
		if (txnid != 0 && prev_lsn.file == 0 && (ret =
		    __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0)
			return (ret);

		/* FALLTHROUGH */
	case DB_TXN_POPENFILES:
		if (rectype == DB___dbreg_register ||
		    rectype == DB___txn_child ||
		    rectype == DB___txn_ckp || rectype == DB___txn_recycle)
			return ((dtab->int_dispatch[rectype])(env,
			    db, lsnp, redo, info));
		break;
	case DB_TXN_BACKWARD_ROLL:
		/*
		 * Running full recovery in the backward pass. In general,
		 * we only process records during this pass that belong
		 * to aborted transactions. Unfortunately, there are several
		 * exceptions:
		 * 1. If this is a meta-record, one not associated with
		 *    a transaction, then we must always process it.
		 * 2. If this is a transaction commit/abort, we must
		 *    always process it, so that we know the status of
		 *    every transaction.
		 * 3. If this is a child commit, we need to process it
		 *    because the outcome of the child transaction depends
		 *    on the outcome of the parent.
		 * 4. If this is a dbreg_register record, we must always
		 *    process is because they contain non-transactional
		 *    closes that must be properly handled.
		 * 5. If this is a noop, we must always undo it so that we
		 *    properly handle any aborts before a file was closed.
		 * 6. If this a file remove, we need to process it to
		 *    determine if the on-disk file is the same as the
		 *    one being described.
		 */
		switch (rectype) {
		/*
		 * These either do not belong to a transaction or (regop)
		 * must be processed regardless of the status of the
		 * transaction.
		 */
		case DB___txn_regop:
		case DB___txn_recycle:
		case DB___txn_ckp:
			make_call = 1;
			break;
		/*
		 * These belong to a transaction whose status must be
		 * checked.
		 */
		case DB___txn_child:
		case DB___db_noop:
		case DB___fop_file_remove:
		case DB___dbreg_register:
			make_call = 1;

			/* FALLTHROUGH */
		default:
			if (txnid == 0)
				break;

			ret = __db_txnlist_find(env, info, txnid, &status);

			/* If not found, this is an incomplete abort. */
			if (ret == DB_NOTFOUND)
				return (__db_txnlist_add(env,
				    info, txnid, TXN_IGNORE, lsnp));
			if (ret != 0)
				return (ret);

			/*
			 * If we ignore the transaction, ignore the operation
			 * UNLESS this is a child commit in which case we need
			 * to make sure that the child also gets marked as
			 * ignore.
			 */
			if (status == TXN_IGNORE && rectype != DB___txn_child) {
				make_call = 0;
				break;
			}
			if (status == TXN_COMMIT)
				break;

			/* Set make_call in case we came through default */
			make_call = 1;
			if (status == TXN_OK &&
			    (ret = __db_txnlist_update(env,
			    info, txnid, rectype == DB___txn_prepare ?
			    TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0)
				return (ret);
		}
		break;
	case DB_TXN_FORWARD_ROLL:
		/*
		 * In the forward pass, if we haven't seen the transaction,
		 * do nothing, else recover it.
		 *
		 * We need to always redo DB___db_noop records, so that we
		 * properly handle any commits after the file was closed.
		 */
		switch (rectype) {
		case DB___txn_recycle:
		case DB___txn_ckp:
		case DB___db_noop:
		case DB___dbreg_register:
			make_call = 1;
			break;

		default:
			if (txnid == 0)
				status = 0;
			else {
				ret = __db_txnlist_find(env,
				    info, txnid, &status);

				if (ret == DB_NOTFOUND)
					/* Break out of the if clause. */
					;
				else if (ret != 0)
					return (ret);
				else if (status == TXN_COMMIT) {
					make_call = 1;
					break;
				}
			}

		}
		break;
	default:
		return (__db_unknown_flag(
		    env, "__db_dispatch", (u_int32_t)redo));
	}

	if (make_call) {
		/*
		 * If the debug flag is set then we are logging
		 * records for a non-durable update so that they
		 * may be examined for diagnostic purposes.
		 * So only make the call if we are printing,
		 * otherwise we need to extract the previous
		 * lsn so undo will work properly.
		 */
		if (rectype & DB_debug_FLAG) {
			if (redo == DB_TXN_PRINT)
				rectype &= ~DB_debug_FLAG;
			else {
				LOGCOPY_TOLSN(env, lsnp,
				    (u_int8_t *)db->data +
				    sizeof(rectype) +
				    sizeof(txnid));
				return (0);
			}
		}
		/* Route to the application or internal dispatch table. */
		if (rectype >= DB_user_BEGIN) {
			if (dbenv->app_dispatch != NULL)
				return (dbenv->app_dispatch(dbenv,
				    db, lsnp, redo));

			/* No application-specific dispatch */
			urectype = rectype - DB_user_BEGIN;
			if (urectype > dtab->ext_size ||
			    dtab->ext_dispatch[urectype] == NULL) {
				__db_errx(env,
		    "Illegal application-specific record type %lu in log",
				    (u_long)rectype);
				return (EINVAL);
			}
			return ((dtab->ext_dispatch[urectype])(dbenv,
			    db, lsnp, redo));
		} else {
			if (rectype > dtab->int_size ||
			    dtab->int_dispatch[rectype] == NULL) {
				__db_errx(env,
				    "Illegal record type %lu in log",
				    (u_long)rectype);
				return (EINVAL);
			}
			return ((dtab->int_dispatch[rectype])(env,
			    db, lsnp, redo, info));
		}
	}

	return (0);
}
+
+/*
+ * __db_add_recovery -- Add recovery functions to the dispatch table.
+ *
+ * We have two versions of this, an external one and an internal one,
+ * because application-specific functions take different arguments
+ * for dispatch (ENV versus DB_ENV).
+ *
+ * This is the external version.
+ *
+ * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t));
+ */
+int
+__db_add_recovery(dbenv, dtab, func, ndx)
+ DB_ENV *dbenv;
+ DB_DISTAB *dtab;
+ int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ /* Make sure this is an application-specific record. */
+ if (ndx < DB_user_BEGIN) {
+ __db_errx(dbenv->env,
+ "Attempting to add application-specific record with invalid type %lu",
+ (u_long)ndx);
+ return (EINVAL);
+ }
+ ndx -= DB_user_BEGIN;
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->ext_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(dbenv->env, nsize *
+ sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch))
+ != 0)
+ return (ret);
+ for (i = dtab->ext_size; i < nsize; ++i)
+ (dtab->ext_dispatch)[i] = NULL;
+ dtab->ext_size = nsize;
+ }
+
+ (dtab->ext_dispatch)[ndx] = func;
+ return (0);
+}
+
+/*
+ * __db_add_recovery_int --
+ *
+ * Internal version of dispatch addition function.
+ *
+ *
+ * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *,
+ * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+ */
+int
+__db_add_recovery_int(env, dtab, func, ndx)
+ ENV *env;
+ DB_DISTAB *dtab;
+ int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ u_int32_t ndx;
+{
+ size_t i, nsize;
+ int ret;
+
+ if (ndx >= DB_user_BEGIN) {
+ __db_errx(env,
+ "Attempting to add internal record with invalid type %lu",
+ (u_long)ndx);
+ return (EINVAL);
+ }
+
+ /* Check if we have to grow the table. */
+ if (ndx >= dtab->int_size) {
+ nsize = ndx + 40;
+ if ((ret =
+ __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]),
+ &dtab->int_dispatch)) != 0)
+ return (ret);
+ for (i = dtab->int_size; i < nsize; ++i)
+ (dtab->int_dispatch)[i] = NULL;
+ dtab->int_size = nsize;
+ }
+
+ (dtab->int_dispatch)[ndx] = func;
+ return (0);
+}
+
+/*
+ * __db_txnlist_init --
+ * Initialize transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **));
+ */
int
__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp)
	ENV *env;
	DB_THREAD_INFO *ip;
	u_int32_t low_txn, hi_txn;
	DB_LSN *trunc_lsn;
	DB_TXNHEAD **retp;
{
	DB_TXNHEAD *headp;
	u_int32_t size, tmp;
	int ret;

	/*
	 * Size a hash table.
	 * If low is zero then we are being called during rollback
	 * and we need only one slot.
	 * Hi may be lower than low if we have recycled txnid's.
	 * The numbers here are guesses about txn density, we can afford
	 * to look at a few entries in each slot.
	 */
	if (low_txn == 0)
		size = 1;
	else {
		/* Normalize the range so low_txn <= hi_txn. */
		if (hi_txn < low_txn) {
			tmp = hi_txn;
			hi_txn = low_txn;
			low_txn = tmp;
		}
		tmp = hi_txn - low_txn;
		/* See if we wrapped around. */
		if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
			tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
		/* One slot per ~5 ids, with a floor of 100 slots. */
		size = tmp / 5;
		if (size < 100)
			size = 100;
	}
	/* The bucket array is allocated inline, after the header struct. */
	if ((ret = __os_malloc(env,
	    sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
		return (ret);

	memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
	headp->maxid = hi_txn;
	headp->generation = 0;
	headp->nslots = size;
	headp->gen_alloc = 8;	/* Initial generation-array capacity. */
	headp->thread_info = ip;
	if ((ret = __os_malloc(env, headp->gen_alloc *
	    sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
		/* Don't leak the header on a partial failure. */
		__os_free(env, headp);
		return (ret);
	}
	/* Generation 0 covers the whole txnid space until ids are recycled. */
	headp->gen_array[0].generation = 0;
	headp->gen_array[0].txn_min = TXN_MINIMUM;
	headp->gen_array[0].txn_max = TXN_MAXIMUM;
	if (trunc_lsn != NULL) {
		headp->trunc_lsn = *trunc_lsn;
		headp->maxlsn = *trunc_lsn;
	} else {
		ZERO_LSN(headp->trunc_lsn);
		ZERO_LSN(headp->maxlsn);
	}
	ZERO_LSN(headp->ckplsn);

	*retp = headp;
	return (0);
}
+
/*
 * FIND_GENERATION --
 *	Scan the generation array for the first range containing txnid and
 *	store that range's generation number in gen.  Entry 0 is the most
 *	recently pushed range (see __db_txnlist_gen), so an id appearing in
 *	several recycled ranges resolves to the newest one.
 *	NOTE(review): the DB_ASSERT references a variable named "env" that
 *	must exist in the caller's scope -- macro capture, not a parameter.
 */
#define	FIND_GENERATION(hp, txnid, gen) do {				\
	u_int32_t __i;							\
	for (__i = 0; __i <= (hp)->generation; __i++)			\
		/* The range may wrap around the end. */		\
		if ((hp)->gen_array[__i].txn_min <			\
		    (hp)->gen_array[__i].txn_max ?			\
		    ((txnid) >= (hp)->gen_array[__i].txn_min &&		\
		    (txnid) <= (hp)->gen_array[__i].txn_max) :		\
		    ((txnid) >= (hp)->gen_array[__i].txn_min ||		\
		    (txnid) <= (hp)->gen_array[__i].txn_max))		\
			break;						\
	DB_ASSERT(env, __i <= (hp)->generation);			\
	gen = (hp)->gen_array[__i].generation;				\
} while (0)
+
+/*
+ * __db_txnlist_add --
+ * Add an element to our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_add __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *));
+ */
int
__db_txnlist_add(env, hp, txnid, status, lsn)
	ENV *env;
	DB_TXNHEAD *hp;
	u_int32_t txnid, status;
	DB_LSN *lsn;
{
	DB_TXNLIST *elp;
	int ret;

	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
		return (ret);

	/* Hash on the txnid to pick a bucket; newest entries go first. */
	LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);

	/* Find the most recent generation containing this ID */
	FIND_GENERATION(hp, txnid, elp->u.t.generation);
	elp->type = TXNLIST_TXNID;
	elp->u.t.txnid = txnid;
	elp->u.t.status = status;
	if (txnid > hp->maxid)
		hp->maxid = txnid;
	/*
	 * Only the first commit LSN seen is recorded as the maximum --
	 * presumably commits arrive newest-first during the backward
	 * recovery scan, so the first one is the largest; TODO confirm.
	 */
	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
		hp->maxlsn = *lsn;

	DB_ASSERT(env, lsn == NULL ||
	    status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0);

	return (0);
}
+
+/*
+ * __db_txnlist_remove --
+ * Remove an element from our transaction linked list.
+ *
+ * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t));
+ */
+int
+__db_txnlist_remove(env, hp, txnid)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid;
+{
+ DB_TXNLIST *entry;
+ u_int32_t status;
+
+ return (__db_txnlist_find_internal(env,
+ hp, TXNLIST_TXNID, txnid, &entry, 1, &status));
+}
+
+/*
+ * __db_txnlist_ckp --
+ * Used to record the maximum checkpoint that will be retained
+ * after recovery. Typically this is simply the max checkpoint, but
+ * if we are doing client replication recovery or timestamp-based
+ * recovery, we are going to virtually truncate the log and we need
+ * to retain the last checkpoint before the truncation point.
+ *
+ * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
+void
+__db_txnlist_ckp(env, hp, ckp_lsn)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *ckp_lsn;
+{
+
+ COMPQUIET(env, NULL);
+
+ if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
+ LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0)
+ hp->ckplsn = *ckp_lsn;
+}
+
+/*
+ * __db_txnlist_end --
+ * Discard transaction linked list.
+ *
+ * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *));
+ */
void
__db_txnlist_end(env, hp)
	ENV *env;
	DB_TXNHEAD *hp;
{
	u_int32_t i;
	DB_TXNLIST *p;

	if (hp == NULL)
		return;

	/* Drain every hash bucket, freeing per-entry resources first. */
	for (i = 0; i < hp->nslots; i++)
		while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
			switch (p->type) {
			case TXNLIST_LSN:
				/* LSN entries own a separately allocated stack. */
				__os_free(env, p->u.l.lsn_stack);
				break;
			case TXNLIST_DELETE:
			case TXNLIST_TXNID:
			default:
				/*
				 * Possibly an incomplete DB_TXNLIST; just
				 * free it.
				 */
				break;
			}
			LIST_REMOVE(p, links);
			__os_free(env, p);
		}

	/* The generation array may be NULL if init failed part-way. */
	if (hp->gen_array != NULL)
		__os_free(env, hp->gen_array);
	__os_free(env, hp);
}
+
+/*
+ * __db_txnlist_find --
+ * Checks to see if a txnid with the current generation is in the
+ * txnid list. This returns DB_NOTFOUND if the item isn't in the
+ * list otherwise it returns (like __db_txnlist_find_internal)
+ * the status of the transaction. A txnid of 0 means the record
+ * was generated while not in a transaction.
+ *
+ * PUBLIC: int __db_txnlist_find __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *));
+ */
+int
+__db_txnlist_find(env, hp, txnid, statusp)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ u_int32_t txnid, *statusp;
+{
+ DB_TXNLIST *entry;
+
+ if (txnid == 0)
+ return (DB_NOTFOUND);
+
+ return (__db_txnlist_find_internal(env, hp,
+ TXNLIST_TXNID, txnid, &entry, 0, statusp));
+}
+
+/*
+ * __db_txnlist_update --
+ * Change the status of an existing transaction entry.
+ * Returns DB_NOTFOUND if no such entry exists.
+ *
+ * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int));
+ */
int
__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok)
	ENV *env;
	DB_TXNHEAD *hp;
	u_int32_t txnid, status;
	DB_LSN *lsn;
	u_int32_t *ret_status;
	int add_ok;
{
	DB_TXNLIST *elp;
	int ret;

	/* Txnid 0 marks a non-transactional record; never on the list. */
	if (txnid == 0)
		return (DB_NOTFOUND);

	ret = __db_txnlist_find_internal(env,
	    hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status);

	/* Optionally create the entry when it's missing. */
	if (ret == DB_NOTFOUND && add_ok) {
		*ret_status = status;
		return (__db_txnlist_add(env, hp, txnid, status, lsn));
	}
	if (ret != 0)
		return (ret);

	/* Ignored transactions keep their status unchanged. */
	if (*ret_status == TXN_IGNORE)
		return (0);

	elp->u.t.status = status;

	/* Record the first commit LSN seen as the maximum LSN. */
	if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
		hp->maxlsn = *lsn;

	return (ret);
}
+
+/*
+ * __db_txnlist_find_internal --
+ * Find an entry on the transaction list. If the entry is not there or
+ * the list pointer is not initialized we return DB_NOTFOUND. If the
+ * item is found, we return the status. Currently we always call this
+ * with an initialized list pointer but checking for NULL keeps it general.
+ */
static int
__db_txnlist_find_internal(env,
    hp, type, txnid, txnlistp, delete, statusp)
	ENV *env;
	DB_TXNHEAD *hp;
	db_txnlist_type type;
	u_int32_t txnid;
	DB_TXNLIST **txnlistp;
	int delete;
	u_int32_t *statusp;
{
	struct __db_headlink *head;
	DB_TXNLIST *p;
	u_int32_t generation, hash;
	int ret;

	ret = 0;

	if (hp == NULL)
		return (DB_NOTFOUND);

	/* Only txnid lookups are supported; anything else is a caller bug. */
	switch (type) {
	case TXNLIST_TXNID:
		hash = txnid;
		/* Resolve the txnid to its most recent generation. */
		FIND_GENERATION(hp, txnid, generation);
		break;
	case TXNLIST_DELETE:
	case TXNLIST_LSN:
	default:
		return (__env_panic(env, EINVAL));
	}

	head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
	LIST_FOREACH(p, head, links) {
		if (p->type != type)
			continue;
		switch (type) {
		case TXNLIST_TXNID:
			/* A match requires both the id and its generation. */
			if (p->u.t.txnid != txnid ||
			    generation != p->u.t.generation)
				continue;
			*statusp = p->u.t.status;
			break;

		case TXNLIST_DELETE:
		case TXNLIST_LSN:
		default:
			return (__env_panic(env, EINVAL));
		}
		if (delete == 1) {
			/* Caller asked us to remove the entry. */
			LIST_REMOVE(p, links);
			__os_free(env, p);
			*txnlistp = NULL;
		} else if (p != LIST_FIRST(head)) {
			/* Move it to head of list. */
			LIST_REMOVE(p, links);
			LIST_INSERT_HEAD(head, p, links);
			*txnlistp = p;
		} else
			*txnlistp = p;
		return (ret);
	}

	return (DB_NOTFOUND);
}
+
+/*
+ * __db_txnlist_gen --
+ * Change the current generation number.
+ *
+ * PUBLIC: int __db_txnlist_gen __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t));
+ */
int
__db_txnlist_gen(env, hp, incr, min, max)
	ENV *env;
	DB_TXNHEAD *hp;
	int incr;
	u_int32_t min, max;
{
	int ret;

	/*
	 * During recovery generation numbers keep track of "restart"
	 * checkpoints and recycle records.  Restart checkpoints occur
	 * whenever we take a checkpoint and there are no outstanding
	 * transactions.  When that happens, we can reset transaction IDs
	 * back to TXNID_MINIMUM.  Currently we only do the reset
	 * at the end of recovery.  Recycle records occur when txnids
	 * are exhausted during runtime.  A free range of ids is identified
	 * and logged.  This code maintains a stack of ranges.  A txnid
	 * is given the generation number of the first range it falls into
	 * in the stack.
	 */
	if (incr < 0) {
		/* Pop the newest range off the stack (entry 0). */
		--hp->generation;
		memmove(hp->gen_array, &hp->gen_array[1],
		    (hp->generation + 1) * sizeof(hp->gen_array[0]));
	} else {
		/* Push a new range, doubling the array when it is full. */
		++hp->generation;
		if (hp->generation >= hp->gen_alloc) {
			hp->gen_alloc *= 2;
			if ((ret = __os_realloc(env, hp->gen_alloc *
			    sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
				return (ret);
		}
		memmove(&hp->gen_array[1], &hp->gen_array[0],
		    hp->generation * sizeof(hp->gen_array[0]));
		hp->gen_array[0].generation = hp->generation;
		hp->gen_array[0].txn_min = min;
		hp->gen_array[0].txn_max = max;
	}
	return (0);
}
+
+/*
+ * __db_txnlist_lsnadd --
+ * Save the prev_lsn from a txn_child record.
+ *
+ * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
int
__db_txnlist_lsnadd(env, hp, lsnp)
	ENV *env;
	DB_TXNHEAD *hp;
	DB_LSN *lsnp;
{
	DB_TXNLIST *elp;
	int ret;

	/* A zero LSN carries no information; nothing to save. */
	if (IS_ZERO_LSN(*lsnp))
		return (0);

	/* The LSN-stack entry, if one exists, lives in bucket 0. */
	LIST_FOREACH(elp, &hp->head[0], links)
		if (elp->type == TXNLIST_LSN)
			break;

	if (elp == NULL) {
		/* First LSN: create the stack entry and tell the caller. */
		if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0)
			return (ret);
		return (DB_SURPRISE_KID);
	}

	/* Grow the stack by doubling when it is full. */
	if (elp->u.l.stack_indx == elp->u.l.stack_size) {
		elp->u.l.stack_size <<= 1;
		if ((ret = __os_realloc(env, sizeof(DB_LSN) *
		    elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) {
			/* On failure the whole list is torn down. */
			__db_txnlist_end(env, hp);
			return (ret);
		}
	}
	elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp;

	return (0);
}
+
+/*
+ * __db_txnlist_lsnget --
+ *
+ * PUBLIC: int __db_txnlist_lsnget __P((ENV *,
+ * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t));
+ * Get the lsn saved from a txn_child record.
+ */
+int
+__db_txnlist_lsnget(env, hp, lsnp, flags)
+ ENV *env;
+ DB_TXNHEAD *hp;
+ DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_TXNLIST *elp;
+
+ COMPQUIET(env, NULL);
+ COMPQUIET(flags, 0);
+
+ LIST_FOREACH(elp, &hp->head[0], links)
+ if (elp->type == TXNLIST_LSN)
+ break;
+
+ if (elp == NULL || elp->u.l.stack_indx == 0) {
+ ZERO_LSN(*lsnp);
+ return (0);
+ }
+
+ *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx];
+
+ return (0);
+}
+
+/*
+ * __db_txnlist_lsninit --
+ * Initialize a transaction list with an lsn array entry.
+ *
+ * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *));
+ */
int
__db_txnlist_lsninit(env, hp, lsnp)
	ENV *env;
	DB_TXNHEAD *hp;
	DB_LSN *lsnp;
{
	DB_TXNLIST *elp;
	int ret;

	elp = NULL;

	if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0)
		goto err;
	/* The LSN-stack entry always lives in bucket 0. */
	LIST_INSERT_HEAD(&hp->head[0], elp, links);

	elp->type = TXNLIST_LSN;
	if ((ret = __os_malloc(env,
	    sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0)
		goto err;
	/* Seed the stack with the caller's LSN. */
	elp->u.l.stack_indx = 1;
	elp->u.l.stack_size = DB_LSN_STACK_SIZE;
	elp->u.l.lsn_stack[0] = *lsnp;

	return (0);

	/* On any failure, the entire transaction list is torn down. */
err:	__db_txnlist_end(env, hp);
	return (ret);
}
+
#ifdef DEBUG
/*
 * __db_txnlist_print --
 *	Print out the transaction list.
 *
 * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *));
 */
void
__db_txnlist_print(hp)
	DB_TXNHEAD *hp;
{
	DB_TXNLIST *elp;
	u_int32_t slot;
	const char *label;

	printf("Maxid: %lu Generation: %lu\n",
	    (u_long)hp->maxid, (u_long)hp->generation);
	/* Walk every hash bucket and dump each txnid entry. */
	for (slot = 0; slot < hp->nslots; slot++)
		LIST_FOREACH(elp, &hp->head[slot], links) {
			if (elp->type != TXNLIST_TXNID) {
				printf("Unrecognized type: %d\n", elp->type);
				continue;
			}
			/* Map the status code to a human-readable label. */
			if (elp->u.t.status == TXN_OK)
				label = "OK";
			else if (elp->u.t.status == TXN_COMMIT)
				label = "commit";
			else if (elp->u.t.status == TXN_PREPARE)
				label = "prepare";
			else if (elp->u.t.status == TXN_ABORT)
				label = "abort";
			else if (elp->u.t.status == TXN_IGNORE)
				label = "ignore";
			else if (elp->u.t.status == TXN_EXPECTED)
				label = "expected";
			else if (elp->u.t.status == TXN_UNEXPECTED)
				label = "unexpected";
			else
				label = "UNKNOWN";
			printf("TXNID: %lx(%lu): %s\n",
			    (u_long)elp->u.t.txnid,
			    (u_long)elp->u.t.generation, label);
		}
}
#endif
diff --git a/db/db_dup.c b/db/db_dup.c
new file mode 100644
index 0000000..b789e03
--- /dev/null
+++ b/db/db_dup.c
@@ -0,0 +1,203 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/mp.h"
+#include "dbinc/db_am.h"
+
+/*
+ * __db_ditem_nolog --
+ * Remove an item from a page without affecting its recoverability.
+ *
+ * PUBLIC: int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
int
__db_ditem_nolog(dbc, pagep, indx, nbytes)
	DBC *dbc;
	PAGE *pagep;
	u_int32_t indx, nbytes;
{
	DB *dbp;
	db_indx_t cnt, *inp, offset;
	u_int8_t *from;

	dbp = dbc->dbp;
	DB_ASSERT(dbp->env, IS_DIRTY(pagep));
	DB_ASSERT(dbp->env, indx < NUM_ENT(pagep));

	/*
	 * If there's only a single item on the page, we don't have to
	 * work hard.
	 */
	if (NUM_ENT(pagep) == 1) {
		NUM_ENT(pagep) = 0;
		HOFFSET(pagep) = dbp->pgsize;
		return (0);
	}

	inp = P_INP(dbp, pagep);
	/*
	 * Pack the remaining key/data items at the end of the page.  Use
	 * memmove(3), the regions may overlap.
	 */
	from = (u_int8_t *)pagep + HOFFSET(pagep);
	DB_ASSERT(dbp->env, inp[indx] >= HOFFSET(pagep));
	memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep));
	HOFFSET(pagep) += nbytes;

	/*
	 * Adjust the indices' offsets: every entry stored below the removed
	 * item (smaller offset) just moved up by nbytes.
	 */
	offset = inp[indx];
	for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt)
		if (inp[cnt] < offset)
			inp[cnt] += nbytes;

	/* Shift the indices down, closing the hole in the index table. */
	--NUM_ENT(pagep);
	if (indx != NUM_ENT(pagep))
		memmove(&inp[indx], &inp[indx + 1],
		    sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));

	return (0);
}
+
+/*
+ * __db_ditem --
+ * Remove an item from a page, logging it if enabled.
+ *
+ * PUBLIC: int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__db_ditem(dbc, pagep, indx, nbytes)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx, nbytes;
+{
+ DB *dbp;
+ DBT ldbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ if (DBC_LOGGING(dbc)) {
+ ldbt.data = P_ENTRY(dbp, pagep, indx);
+ ldbt.size = nbytes;
+ if ((ret = __db_addrem_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_REM_DUP, PGNO(pagep),
+ (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_ditem_nolog(dbc, pagep, indx, nbytes));
+}
+
+/*
+ * __db_pitem_nolog --
+ * Put an item on a page without logging.
+ *
+ * PUBLIC: int __db_pitem_nolog
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
int
__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data)
	DBC *dbc;
	PAGE *pagep;
	u_int32_t indx;
	u_int32_t nbytes;
	DBT *hdr, *data;
{
	BKEYDATA bk;
	DB *dbp;
	DBT thdr;
	db_indx_t *inp;
	u_int8_t *p;

	dbp = dbc->dbp;

	DB_ASSERT(dbp->env, IS_DIRTY(pagep));

	/*
	 * The caller is responsible for ensuring the item fits; assert in
	 * diagnostic builds, fail with EINVAL otherwise.
	 */
	if (nbytes > P_FREESPACE(dbp, pagep)) {
		DB_ASSERT(dbp->env, nbytes <= P_FREESPACE(dbp, pagep));
		return (EINVAL);
	}

	/* No header supplied: synthesize a B_KEYDATA header for the data. */
	if (hdr == NULL) {
		B_TSET(bk.type, B_KEYDATA);
		bk.len = data == NULL ? 0 : data->size;

		thdr.data = &bk;
		thdr.size = SSZA(BKEYDATA, data);
		hdr = &thdr;
	}
	inp = P_INP(dbp, pagep);

	/* Adjust the index table, then put the item on the page. */
	if (indx != NUM_ENT(pagep))
		memmove(&inp[indx + 1], &inp[indx],
		    sizeof(db_indx_t) * (NUM_ENT(pagep) - indx));
	HOFFSET(pagep) -= nbytes;
	inp[indx] = HOFFSET(pagep);
	++NUM_ENT(pagep);

	/* Copy the header, then any data, into the reserved space. */
	p = P_ENTRY(dbp, pagep, indx);
	memcpy(p, hdr->data, hdr->size);
	if (data != NULL)
		memcpy(p + hdr->size, data->data, data->size);

	return (0);
}
+
+/*
+ * __db_pitem --
+ * Put an item on a page.
+ *
+ * PUBLIC: int __db_pitem
+ * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ */
+int
+__db_pitem(dbc, pagep, indx, nbytes, hdr, data)
+ DBC *dbc;
+ PAGE *pagep;
+ u_int32_t indx;
+ u_int32_t nbytes;
+ DBT *hdr, *data;
+{
+ DB *dbp;
+ int ret;
+
+ dbp = dbc->dbp;
+ /*
+ * Put a single item onto a page. The logic figuring out where to
+ * insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling. We cheat a little bit in that
+ * we don't want to copy the dbt on a normal put twice. If hdr is
+ * NULL, we create a BKEYDATA structure on the page, otherwise, just
+ * copy the caller's information onto the page.
+ *
+ * This routine is also used to put entries onto the page where the
+ * entry is pre-built, e.g., during recovery. In this case, the hdr
+ * will point to the entry, and the data argument will be NULL.
+ *
+ * !!!
+ * There's a tremendous potential for off-by-one errors here, since
+ * the passed in header sizes must be adjusted for the structure's
+ * placeholder for the trailing variable-length data field.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_addrem_log(dbp, dbc->txn,
+ &LSN(pagep), 0, DB_ADD_DUP, PGNO(pagep),
+ (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(pagep));
+
+ return (__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data));
+}
diff --git a/db/db_iface.c b/db/db_iface.c
new file mode 100644
index 0000000..55f3e2a
--- /dev/null
+++ b/db/db_iface.c
@@ -0,0 +1,2817 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#ifndef HAVE_QUEUE
+#include "dbinc/qam.h" /* For __db_no_queue_am(). */
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_associate_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+static int __dbc_del_arg __P((DBC *, u_int32_t));
+static int __dbc_pget_arg __P((DBC *, DBT *, u_int32_t));
+static int __dbc_put_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_curinval __P((const ENV *));
+static int __db_cursor_arg __P((DB *, u_int32_t));
+static int __db_del_arg __P((DB *, DBT *, u_int32_t));
+static int __db_get_arg __P((const DB *, DBT *, DBT *, u_int32_t));
+static int __db_join_arg __P((DB *, DBC **, u_int32_t));
+static int __db_open_arg __P((DB *,
+ DB_TXN *, const char *, const char *, DBTYPE, u_int32_t));
+static int __db_pget_arg __P((DB *, DBT *, u_int32_t));
+static int __db_put_arg __P((DB *, DBT *, DBT *, u_int32_t));
+static int __dbt_ferr __P((const DB *, const char *, const DBT *, int));
+static int __db_associate_foreign_arg __P((DB *, DB *,
+ int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ u_int32_t));
+
+/*
+ * These functions implement the Berkeley DB API. They are organized in a
+ * layered fashion. The interface functions (XXX_pp) perform all generic
+ * error checks (for example, PANIC'd region, replication state change
+ * in progress, inconsistent transaction usage), call function-specific
+ * check routines (_arg) to check for proper flag usage, etc., do pre-amble
+ * processing (incrementing handle counts, handling local transactions),
+ * call the function and then do post-amble processing (local transactions,
+ * decrement handle counts).
+ *
+ * The basic structure is:
+ * Check for simple/generic errors (PANIC'd region)
+ * Check if replication is changing state (increment handle count).
+ * Call function-specific argument checking routine
+ * Create internal transaction if necessary
+ * Call underlying worker function
+ * Commit/abort internal transaction if necessary
+ * Decrement handle count
+ */
+
+/*
+ * __db_associate_pp --
+ * DB->associate pre/post processing.
+ *
+ * PUBLIC: int __db_associate_pp __P((DB *, DB_TXN *, DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
int
__db_associate_pp(dbp, txn, sdbp, callback, flags)
	DB *dbp, *sdbp;
	DB_TXN *txn;
	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
	u_int32_t flags;
{
	DBC *sdbc;
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret, txn_local;

	env = dbp->env;
	txn_local = 0;

	STRIP_AUTO_COMMIT(flags);

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check &&
	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
		/* Don't try to release a block we failed to acquire. */
		handle_check = 0;
		goto err;
	}

	/*
	 * Secondary cursors may have the primary's lock file ID, so we need
	 * to make sure that no older cursors are lying around when we make
	 * the transition.
	 */
	if (TAILQ_FIRST(&sdbp->active_queue) != NULL ||
	    TAILQ_FIRST(&sdbp->join_queue) != NULL) {
		__db_errx(env,
    "Databases may not become secondary indices while cursors are open");
		ret = EINVAL;
		goto err;
	}

	if ((ret = __db_associate_arg(dbp, sdbp, callback, flags)) != 0)
		goto err;

	/*
	 * Create a local transaction as necessary, check for consistent
	 * transaction usage, and, if we have no transaction but do have
	 * locking on, acquire a locker id for the handle lock acquisition.
	 */
	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
			goto err;
		txn_local = 1;
	}

	/* Check for consistent transaction usage. */
	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
		goto err;

	/* Destroy any cached free cursors on the secondary handle. */
	while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
		if ((ret = __dbc_destroy(sdbc)) != 0)
			goto err;

	ret = __db_associate(dbp, ip, txn, sdbp, callback, flags);

	/* Resolve any local transaction we opened above. */
err:	if (txn_local &&
	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
		ret = t_ret;

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;
	ENV_LEAVE(env, ip);
	return (ret);
}
+
+/*
+ * __db_associate_arg --
+ * Check DB->associate arguments.
+ */
+static int
+__db_associate_arg(dbp, sdbp, callback, flags)
+ DB *dbp, *sdbp;
+ int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+ u_int32_t flags;
+{
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ if (F_ISSET(sdbp, DB_AM_SECONDARY)) {
+ __db_errx(env,
+ "Secondary index handles may not be re-associated");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+ __db_errx(env,
+ "Secondary indices may not be used as primary databases");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env,
+ "Primary databases may not be configured with duplicates");
+ return (EINVAL);
+ }
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env,
+ "Renumbering recno databases may not be used as primary databases");
+ return (EINVAL);
+ }
+
+ /*
+ * It's OK for the primary and secondary to not share an environment IFF
+ * the environments are local to the DB handle. (Specifically, cursor
+ * adjustment will work correctly in this case.) The environment being
+ * local implies the environment is not configured for either locking or
+ * transactions, as neither of those could work correctly.
+ */
+ if (dbp->env != sdbp->env &&
+ (!F_ISSET(dbp->env, ENV_DBLOCAL) ||
+ !F_ISSET(sdbp->env, ENV_DBLOCAL))) {
+ __db_errx(env,
+ "The primary and secondary must be opened in the same environment");
+ return (EINVAL);
+ }
+ if ((DB_IS_THREADED(dbp) && !DB_IS_THREADED(sdbp)) ||
+ (!DB_IS_THREADED(dbp) && DB_IS_THREADED(sdbp))) {
+ __db_errx(env,
+ "The DB_THREAD setting must be the same for primary and secondary");
+ return (EINVAL);
+ }
+ if (callback == NULL &&
+ (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) {
+ __db_errx(env,
+ "Callback function may be NULL only when database handles are read-only");
+ return (EINVAL);
+ }
+
+ if ((ret = __db_fchk(env, "DB->associate", flags, DB_CREATE |
+ DB_IMMUTABLE_KEY)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __db_close_pp --
+ * DB->close pre/post processing.
+ *
+ * PUBLIC: int __db_close_pp __P((DB *, u_int32_t));
+ */
int
__db_close_pp(dbp, flags)
	DB *dbp;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret;

	env = dbp->env;
	ret = 0;

	/*
	 * Close a DB handle -- as a handle destructor, we can't fail.
	 *
	 * !!!
	 * The actual argument checking is simple, do it inline, outside of
	 * the replication block.
	 */
	if (flags != 0 && flags != DB_NOSYNC)
		ret = __db_ferr(env, "DB->close", 0);

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
		/* Remember the error but destroy the handle regardless. */
		handle_check = 0;
		if (ret == 0)
			ret = t_ret;
	}

	/* The handle is closed even if an earlier step failed. */
	if ((t_ret = __db_close(dbp, NULL, flags)) != 0 && ret == 0)
		ret = t_ret;

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}
+
+/*
+ * __db_cursor_pp --
+ * DB->cursor pre/post processing.
+ *
+ * PUBLIC: int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t));
+ */
int
__db_cursor_pp(dbp, txn, dbcp, flags)
	DB *dbp;
	DB_TXN *txn;
	DBC **dbcp;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	REGENV *renv;
	int rep_blocked, ret;

	env = dbp->env;

	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	rep_blocked = 0;
	if (txn == NULL && IS_ENV_REPLICATED(env)) {
		if ((ret = __op_rep_enter(env)) != 0)
			goto err;
		rep_blocked = 1;
		/*
		 * A timestamp mismatch means replication recovery rolled
		 * back transactions since this handle was opened, so the
		 * handle is dead.
		 */
		renv = env->reginfo->primary;
		if (dbp->timestamp != renv->rep_timestamp) {
			__db_errx(env, "%s %s",
	    "replication recovery unrolled committed transactions;",
	    "open DB and DBcursor handles must be closed");
			ret = DB_REP_HANDLE_DEAD;
			goto err;
		}
	}
	if ((ret = __db_cursor_arg(dbp, flags)) != 0)
		goto err;

	/*
	 * Check for consistent transaction usage.  For now, assume this
	 * cursor might be used for read operations only (in which case
	 * it may not require a txn).  We'll check more stringently in
	 * c_del and c_put.  (Note this means the read-op txn tests have
	 * to be a subset of the write-op ones.)
	 */
	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
		goto err;

	ret = __db_cursor(dbp, ip, txn, dbcp, flags);

err:	/* Release replication block on error. */
	if (ret != 0 && rep_blocked)
		(void)__op_rep_exit(env);

	ENV_LEAVE(env, ip);
	return (ret);
}
+
+/*
+ * __db_cursor --
+ * DB->cursor.
+ *
+ * PUBLIC: int __db_cursor __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t));
+ */
int
__db_cursor(dbp, ip, txn, dbcp, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	DBC **dbcp;
	u_int32_t flags;
{
	DBC *dbc;
	ENV *env;
	db_lockmode_t mode;
	int ret;

	env = dbp->env;

	/*
	 * MVCC database with no caller transaction: open a private snapshot
	 * transaction so the cursor reads a consistent view.
	 * NOTE(review): if __db_cursor_int fails below, this private
	 * transaction is not resolved here -- presumably handled by the
	 * cursor/txn teardown path; confirm.
	 */
	if (MULTIVERSION(dbp) && txn == NULL && (LF_ISSET(DB_TXN_SNAPSHOT) ||
	    F_ISSET(env->dbenv, DB_ENV_TXN_SNAPSHOT))) {
		if ((ret =
		    __txn_begin(env, ip, NULL, &txn, DB_TXN_SNAPSHOT)) != 0)
			return (ret);
		F_SET(txn, TXN_PRIVATE);
	}

	if ((ret = __db_cursor_int(dbp, ip, txn, dbp->type, PGNO_INVALID,
	    LF_ISSET(DB_CURSOR_BULK | DB_CURSOR_TRANSIENT), NULL, &dbc)) != 0)
		return (ret);

	/*
	 * If this is CDB, do all the locking in the interface, which is
	 * right here.
	 */
	if (CDB_LOCKING(env)) {
		/*
		 * DB_WRITELOCK cursors take a write lock up front,
		 * DB_WRITECURSOR (or transactional) cursors take an
		 * intent-to-write lock, everything else reads.
		 */
		mode = (LF_ISSET(DB_WRITELOCK)) ? DB_LOCK_WRITE :
		    ((LF_ISSET(DB_WRITECURSOR) || txn != NULL) ?
		    DB_LOCK_IWRITE : DB_LOCK_READ);
		if ((ret = __lock_get(env, dbc->locker, 0,
		    &dbc->lock_dbt, mode, &dbc->mylock)) != 0)
			goto err;
		if (LF_ISSET(DB_WRITECURSOR))
			F_SET(dbc, DBC_WRITECURSOR);
		if (LF_ISSET(DB_WRITELOCK))
			F_SET(dbc, DBC_WRITER);
	}

	/* Inherit the isolation level from the flags or the transaction. */
	if (LF_ISSET(DB_READ_UNCOMMITTED) ||
	    (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
		F_SET(dbc, DBC_READ_UNCOMMITTED);

	if (LF_ISSET(DB_READ_COMMITTED) ||
	    (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
		F_SET(dbc, DBC_READ_COMMITTED);

	*dbcp = dbc;
	return (0);

	/* Discard the partially constructed cursor on lock failure. */
err:	(void)__dbc_close(dbc);
	return (ret);
}
+
+/*
+ * __db_cursor_arg --
+ * Check DB->cursor arguments.
+ */
+static int
+__db_cursor_arg(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ ENV *env;
+
+ env = dbp->env;
+
+ /*
+ * DB_READ_COMMITTED and DB_READ_UNCOMMITTED require locking.
+ */
+ if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED)) {
+ if (!LOCKING_ON(env))
+ return (__db_fnl(env, "DB->cursor"));
+ }
+
+ LF_CLR(DB_CURSOR_BULK |
+ DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT);
+
+ /* Check for invalid function flags. */
+ if (LF_ISSET(DB_WRITECURSOR)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ if (!CDB_LOCKING(env))
+ return (__db_ferr(env, "DB->cursor", 0));
+ LF_CLR(DB_WRITECURSOR);
+ } else if (LF_ISSET(DB_WRITELOCK)) {
+ if (DB_IS_READONLY(dbp))
+ return (__db_rdonly(env, "DB->cursor"));
+ LF_CLR(DB_WRITELOCK);
+ }
+
+ if (flags != 0)
+ return (__db_ferr(env, "DB->cursor", 0));
+
+ return (0);
+}
+
+/*
+ * __db_del_pp --
+ * DB->del pre/post processing.
+ *
+ * PUBLIC: int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
int
__db_del_pp(dbp, txn, key, flags)
	DB *dbp;
	DB_TXN *txn;
	DBT *key;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret, txn_local;

	env = dbp->env;
	txn_local = 0;

	STRIP_AUTO_COMMIT(flags);
	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");

#ifdef CONFIG_TEST
	if (IS_REP_MASTER(env))
		DB_TEST_WAIT(env, env->test_check);
#endif
	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check &&
	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
		/* Don't try to release a block we failed to acquire. */
		handle_check = 0;
		goto err;
	}

	if ((ret = __db_del_arg(dbp, key, flags)) != 0)
		goto err;

	/* Create local transaction as necessary. */
	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
			goto err;
		txn_local = 1;
	}

	/* Check for consistent transaction usage. */
	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
		goto err;

	ret = __db_del(dbp, ip, txn, key, flags);

	/* Resolve any local transaction we opened above. */
err:	if (txn_local &&
	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
		ret = t_ret;

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;
	ENV_LEAVE(env, ip);
	/* Free any memory __dbt_usercopy allocated for the key. */
	__dbt_userfree(env, key, NULL, NULL);
	return (ret);
}
+
+/*
+ * __db_del_arg --
+ *	Check DB->delete arguments.
+ *
+ *	Validates the flag combination, rejects deletes on a read-only
+ *	handle, and copies the user's key DBT in for the flag cases
+ *	that carry a key.
+ */
+static int
+__db_del_arg(dbp, key, flags)
+	DB *dbp;
+	DBT *key;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DB->del"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_CONSUME:
+		/* DB_CONSUME is only legal on queue databases. */
+		if (dbp->type != DB_QUEUE)
+			return (__db_ferr(env, "DB->del", 0));
+		goto copy;
+	case DB_MULTIPLE:
+	case DB_MULTIPLE_KEY:
+		if (!F_ISSET(key, DB_DBT_BULK)) {
+			__db_errx(env,
+	    "DB->del with DB_MULTIPLE(_KEY) requires multiple key records");
+			return (EINVAL);
+		}
+		/* FALL THROUGH */
+	case 0:
+copy:		if ((ret = __dbt_usercopy(env, key)) != 0)
+			return (ret);
+		break;
+	default:
+		return (__db_ferr(env, "DB->del", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_exists --
+ *	DB->exists implementation.
+ *
+ *	Implemented as a DB->get that returns no data bytes; the return
+ *	value is whatever the underlying get returns (0 when the key is
+ *	present, an error such as DB_NOTFOUND otherwise).
+ *
+ * PUBLIC: int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_exists(dbp, txn, key, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	u_int32_t flags;
+{
+	DBT data;
+	int ret;
+
+	/*
+	 * Most flag checking is done in the DB->get call, we only check for
+	 * specific incompatibilities here.  This saves making __get_arg
+	 * aware of the exist method's API constraints.
+	 */
+	STRIP_AUTO_COMMIT(flags);
+	if ((ret = __db_fchk(dbp->env, "DB->exists", flags,
+	    DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) != 0)
+		return (ret);
+
+	/*
+	 * Configure a data DBT that returns no bytes so there's no copy
+	 * of the data.
+	 */
+	memset(&data, 0, sizeof(data));
+	data.dlen = 0;
+	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+	return (dbp->get(dbp, txn, key, &data, flags));
+}
+
+/*
+ * db_fd_pp --
+ *	DB->fd pre/post processing.
+ *
+ *	Returns the file descriptor of the database's underlying file
+ *	handle through *fdp, or ENOENT (with *fdp set to -1) when the
+ *	handle has no valid file descriptor.
+ *
+ * PUBLIC: int __db_fd_pp __P((DB *, int *));
+ */
+int
+__db_fd_pp(dbp, fdp)
+	DB *dbp;
+	int *fdp;
+{
+	DB_FH *fhp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->fd");
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  Note the err label is placed after
+	 * the rep-exit call below: if entry fails we never call the exit.
+	 */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0)
+		goto err;
+
+	/*
+	 * !!!
+	 * There's no argument checking to be done.
+	 *
+	 * !!!
+	 * The actual method call is simple, do it inline.
+	 *
+	 * XXX
+	 * Truly spectacular layering violation.
+	 */
+	if ((ret = __mp_xxx_fh(dbp->mpf, &fhp)) == 0) {
+		if (fhp == NULL) {
+			*fdp = -1;
+			__db_errx(env,
+			    "Database does not have a valid file handle");
+			ret = ENOENT;
+		} else
+			*fdp = fhp->fd;
+	}
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_get_pp --
+ *	DB->get pre/post processing.
+ *
+ * PUBLIC: int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get_pp(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	u_int32_t mode;
+	int handle_check, ignore_lease, ret, t_ret, txn_local;
+
+	env = dbp->env;
+	mode = 0;
+	txn_local = 0;
+
+	STRIP_AUTO_COMMIT(flags);
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get");
+
+	/* Strip DB_IGNORE_LEASE before argument checking; see lease check. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+
+	if ((ret = __db_get_arg(dbp, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		/* Entry failed: skip the rep-exit call at err. */
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * DB_CONSUME(_WAIT) removes the record, so it needs a write lock
+	 * and, for auto-commit handles, a local transaction.
+	 */
+	if (LF_ISSET(DB_READ_UNCOMMITTED))
+		mode = DB_READ_UNCOMMITTED;
+	else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+	    (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT) {
+		mode = DB_WRITELOCK;
+		if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+			if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+				goto err;
+			txn_local = 1;
+		}
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID,
+	    mode == DB_WRITELOCK || LF_ISSET(DB_RMW) ? 0 : 1)) != 0)
+		goto err;
+
+	ret = __db_get(dbp, ip, txn, key, data, flags);
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+err:	if (txn_local &&
+	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __db_get --
+ *	DB->get.
+ *
+ *	Performs the get through a single-use ("transient") cursor;
+ *	argument checking has already been done by __db_get_pp.
+ *
+ * PUBLIC: int __db_get __P((DB *,
+ * PUBLIC:     DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_get(dbp, ip, txn, key, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	u_int32_t mode;
+	int ret, t_ret;
+
+	/*
+	 * The DB_CURSOR_TRANSIENT flag indicates that we're just doing a single
+	 * operation with this cursor, and that in case of error we don't need
+	 * to restore it to its old position.  Thus, we can perform the get
+	 * without duplicating the cursor, saving some cycles in this common
+	 * case.
+	 */
+	mode = DB_CURSOR_TRANSIENT;
+	if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+		mode |= DB_READ_UNCOMMITTED;
+		LF_CLR(DB_READ_UNCOMMITTED);
+	} else if (LF_ISSET(DB_READ_COMMITTED)) {
+		mode |= DB_READ_COMMITTED;
+		LF_CLR(DB_READ_COMMITTED);
+	} else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME ||
+	    (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT)
+		mode |= DB_WRITELOCK;
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+		return (ret);
+
+	DEBUG_LREAD(dbc, txn, "DB->get", key, NULL, flags);
+
+	/*
+	 * The semantics of bulk gets are different for DB->get vs DBC->get.
+	 * Mark the cursor so the low-level bulk get routines know which
+	 * behavior we want.
+	 */
+	F_SET(dbc, DBC_FROM_DB_GET);
+
+	/*
+	 * SET_RET_MEM indicates that if key and/or data have no DBT
+	 * flags set and DB manages the returned-data memory, that memory
+	 * will belong to this handle, not to the underlying cursor.
+	 */
+	SET_RET_MEM(dbc, dbp);
+
+	/* With no operation flag specified, default to a DB_SET lookup. */
+	if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0)
+		LF_SET(DB_SET);
+
+#ifdef HAVE_PARTITION
+	if (F_ISSET(dbc, DBC_PARTITIONED))
+		ret = __partc_get(dbc, key, data, flags);
+	else
+#endif
+		ret = __dbc_get(dbc, key, data, flags);
+
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_get_arg --
+ *	DB->get argument checking, used by both DB->get and DB->pget.
+ *
+ *	Also copies the user's key (and, for DB_GET_BOTH, data) DBTs in
+ *	via __dbt_usercopy; the caller is responsible for freeing them.
+ */
+static int
+__db_get_arg(dbp, key, data, flags)
+	const DB *dbp;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	ENV *env;
+	int dirty, multi, ret;
+
+	env = dbp->env;
+
+	/*
+	 * Check for read-modify-write validity.  DB_RMW doesn't make sense
+	 * with CDB cursors since if you're going to write the cursor, you
+	 * had to create it with DB_WRITECURSOR.  Regardless, we check for
+	 * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+	 * If this changes, confirm that DB does not itself set the DB_RMW
+	 * flag in a path where CDB may have been configured.
+	 */
+	dirty = 0;
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DB->get"));
+		if ((ret = __db_fcchk(env, "DB->get",
+		    flags, DB_READ_UNCOMMITTED, DB_READ_COMMITTED)) != 0)
+			return (ret);
+		if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))
+			dirty = 1;
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	/* DB_MULTIPLE_KEY is cursor-only; only DB_MULTIPLE is legal here. */
+	multi = 0;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		if (LF_ISSET(DB_MULTIPLE_KEY))
+			goto multi_err;
+		multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0;
+		LF_CLR(DB_MULTIPLE);
+	}
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_GET_BOTH:
+		if ((ret = __dbt_usercopy(env, data)) != 0)
+			return (ret);
+		/* FALLTHROUGH */
+	case 0:
+		if ((ret = __dbt_usercopy(env, key)) != 0) {
+			__dbt_userfree(env, key, NULL, data);
+			return (ret);
+		}
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_AM_RECNUM))
+			goto err;
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			return (ret);
+		break;
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		if (dirty) {
+			__db_errx(env,
+    "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT",
+			    LF_ISSET(DB_READ_UNCOMMITTED) ?
+			    "DB_READ_UNCOMMITTED" : "DB_READ_COMMITTED");
+			return (EINVAL);
+		}
+		if (multi)
+multi_err:		return (__db_ferr(env, "DB->get", 1));
+		if (dbp->type == DB_QUEUE)
+			break;
+		/* FALLTHROUGH */
+	default:
+err:		return (__db_ferr(env, "DB->get", 0));
+	}
+
+	/*
+	 * Check for invalid key/data flags.
+	 */
+	if ((ret =
+	    __dbt_ferr(dbp, "key", key, DB_RETURNS_A_KEY(dbp, flags))) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0)
+		return (ret);
+
+	if (multi) {
+		if (!F_ISSET(data, DB_DBT_USERMEM)) {
+			__db_errx(env,
+			    "DB_MULTIPLE requires DB_DBT_USERMEM be set");
+			return (EINVAL);
+		}
+		if (F_ISSET(key, DB_DBT_PARTIAL) ||
+		    F_ISSET(data, DB_DBT_PARTIAL)) {
+			__db_errx(env,
+			    "DB_MULTIPLE does not support DB_DBT_PARTIAL");
+			return (EINVAL);
+		}
+		/* Bulk buffers must be >= 1KB, >= pagesize, multiple of 1KB. */
+		if (data->ulen < 1024 ||
+		    data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+			__db_errx(env, "%s%s",
+			    "DB_MULTIPLE buffers must be ",
+			    "aligned, at least page size and multiples of 1KB");
+			return (EINVAL);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * __db_join_pp --
+ *	DB->join pre/post processing.
+ *
+ * PUBLIC: int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join_pp(primary, curslist, dbcp, flags)
+	DB *primary;
+	DBC **curslist, **dbcp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = primary->env;
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  The transactional flag is taken
+	 * from the first secondary cursor in the list.
+	 */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret =
+	    __db_rep_enter(primary, 1, 0, curslist[0]->txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	if ((ret = __db_join_arg(primary, curslist, flags)) == 0)
+		ret = __db_join(primary, curslist, dbcp, flags);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_join_arg --
+ *	Check DB->join arguments.
+ *
+ *	Requires at least one secondary cursor and that all cursors in
+ *	the NULL-terminated list share the same transaction.
+ */
+static int
+__db_join_arg(primary, curslist, flags)
+	DB *primary;
+	DBC **curslist;
+	u_int32_t flags;
+{
+	DB_TXN *txn;
+	ENV *env;
+	int i;
+
+	env = primary->env;
+
+	switch (flags) {
+	case 0:
+	case DB_JOIN_NOSORT:
+		break;
+	default:
+		return (__db_ferr(env, "DB->join", 0));
+	}
+
+	if (curslist == NULL || curslist[0] == NULL) {
+		__db_errx(env,
+	    "At least one secondary cursor must be specified to DB->join");
+		return (EINVAL);
+	}
+
+	txn = curslist[0]->txn;
+	for (i = 1; curslist[i] != NULL; i++)
+		if (curslist[i]->txn != txn) {
+			__db_errx(env,
+		    "All secondary cursors must share the same transaction");
+			return (EINVAL);
+		}
+
+	return (0);
+}
+
+/*
+ * __db_key_range_pp --
+ *	DB->key_range pre/post processing.
+ *
+ *	Only supported for btree databases; other access methods fail
+ *	the __dbh_am_chk check below.
+ *
+ * PUBLIC: int __db_key_range_pp
+ * PUBLIC:     __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__db_key_range_pp(dbp, txn, key, kr, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	DB_KEY_RANGE *kr;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0)
+		return (__db_ferr(env, "DB->key_range", 0));
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+		goto err;
+
+	/*
+	 * !!!
+	 * The actual method call is simple, do it inline.
+	 */
+	switch (dbp->type) {
+	case DB_BTREE:
+#ifndef HAVE_BREW
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			goto err;
+
+		/* Acquire a cursor. */
+		if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+			break;
+
+		DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+		if (DB_IS_PARTITIONED(dbp))
+			ret = __part_key_range(dbc, key, kr, flags);
+		else
+#endif
+			ret = __bam_key_range(dbc, key, kr, flags);
+
+		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+		__dbt_userfree(env, key, NULL, NULL);
+		break;
+#else
+		/* On BREW, btree key_range is unsupported; fall through. */
+		COMPQUIET(dbc, NULL);
+		COMPQUIET(key, NULL);
+		COMPQUIET(kr, NULL);
+		/* FALLTHROUGH */
+#endif
+	case DB_HASH:
+	case DB_QUEUE:
+	case DB_RECNO:
+		ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_type(env, "DB->key_range", dbp->type);
+		break;
+	}
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_open_pp --
+ *	DB->open pre/post processing.
+ *
+ * PUBLIC: int __db_open_pp __P((DB *, DB_TXN *,
+ * PUBLIC:     const char *, const char *, DBTYPE, u_int32_t, int));
+ */
+int
+__db_open_pp(dbp, txn, fname, dname, type, flags, mode)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	DBTYPE type;
+	u_int32_t flags;
+	int mode;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, nosync, remove_me, ret, t_ret, txn_local;
+
+	env = dbp->env;
+	nosync = 1;
+	handle_check = remove_me = txn_local = 0;
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Save the file and database names and flags.  We do this here
+	 * because we don't pass all of the flags down into the actual
+	 * DB->open method call, we strip DB_AUTO_COMMIT at this layer.
+	 */
+	if ((fname != NULL &&
+	    (ret = __os_strdup(env, fname, &dbp->fname)) != 0))
+		goto err;
+	if ((dname != NULL &&
+	    (ret = __os_strdup(env, dname, &dbp->dname)) != 0))
+		goto err;
+	dbp->open_flags = flags;
+
+	/* Save the current DB handle flags for refresh. */
+	dbp->orig_flags = dbp->flags;
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else if (txn != NULL && !TXN_ON(env) &&
+	    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
+		ret = __db_not_txn_env(env);
+		goto err;
+	}
+	LF_CLR(DB_AUTO_COMMIT);
+
+	/*
+	 * We check arguments after possibly creating a local transaction,
+	 * which is unusual -- the reason is some flags are illegal if any
+	 * kind of transaction is in effect.
+	 */
+	if ((ret = __db_open_arg(dbp, txn, fname, dname, type, flags)) == 0)
+		if ((ret = __db_open(dbp, ip, txn, fname, dname, type,
+		    flags, mode, PGNO_BASE_MD)) != 0)
+			goto txnerr;
+
+	/*
+	 * You can open the database that describes the subdatabases in the
+	 * rest of the file read-only.  The content of each key's data is
+	 * unspecified and applications should never be adding new records
+	 * or updating existing records.  However, during recovery, we need
+	 * to open these databases R/W so we can redo/undo changes in them.
+	 * Likewise, we need to open master databases read/write during
+	 * rename and remove so we can be sure they're fully sync'ed, so
+	 * we provide an override flag for the purpose.
+	 */
+	if (dname == NULL && !IS_RECOVERING(env) && !LF_ISSET(DB_RDONLY) &&
+	    !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) {
+		__db_errx(env,
+    "files containing multiple databases may only be opened read-only");
+		ret = EINVAL;
+		goto txnerr;
+	}
+
+	/*
+	 * Success: file creations have to be synchronous, otherwise we don't
+	 * care.
+	 */
+	if (F_ISSET(dbp, DB_AM_CREATED | DB_AM_CREATED_MSTR))
+		nosync = 0;
+
+	/* Success: don't discard the file on close. */
+	F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR);
+
+	/*
+	 * If not transactional, remove the databases/subdatabases if it is
+	 * persistent.  If we're transactional, the child transaction abort
+	 * cleans up.
+	 */
+txnerr:	if (ret != 0 && !IS_REAL_TXN(txn)) {
+		remove_me = (F_ISSET(dbp, DB_AM_CREATED) &&
+		    (fname != NULL || dname != NULL)) ? 1 : 0;
+		if (F_ISSET(dbp, DB_AM_CREATED_MSTR) ||
+		    (dname == NULL && remove_me))
+			/* Remove file. */
+			(void)__db_remove_int(dbp,
+			    ip, txn, fname, NULL, DB_FORCE);
+		else if (remove_me)
+			/* Remove subdatabase. */
+			(void)__db_remove_int(dbp,
+			    ip, txn, fname, dname, DB_FORCE);
+	}
+
+	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, nosync, ret)) && ret == 0)
+		ret = t_ret;
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_open_arg --
+ *	Check DB->open arguments.
+ *
+ *	Validates the flag set against the requested access-method type,
+ *	the environment's configuration, and the (possibly NULL)
+ *	transaction handle.
+ */
+static int
+__db_open_arg(dbp, txn, fname, dname, type, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	DBTYPE type;
+	u_int32_t flags;
+{
+	ENV *env;
+	u_int32_t ok_flags;
+	int ret;
+
+	env = dbp->env;
+
+	/* Validate arguments. */
+#undef	OKFLAGS
+#define	OKFLAGS								\
+	(DB_AUTO_COMMIT | DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING |	\
+	DB_MULTIVERSION | DB_NOMMAP | DB_NO_AUTO_COMMIT | DB_RDONLY |	\
+	DB_RDWRMASTER | DB_READ_UNCOMMITTED | DB_THREAD | DB_TRUNCATE)
+	if ((ret = __db_fchk(env, "DB->open", flags, OKFLAGS)) != 0)
+		return (ret);
+	if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE))
+		return (__db_ferr(env, "DB->open", 1));
+	if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE))
+		return (__db_ferr(env, "DB->open", 1));
+
+#ifdef	HAVE_VXWORKS
+	if (LF_ISSET(DB_TRUNCATE)) {
+		__db_errx(env, "DB_TRUNCATE not supported on VxWorks");
+		return (DB_OPNOTSUP);
+	}
+#endif
+	switch (type) {
+	case DB_UNKNOWN:
+		if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) {
+			__db_errx(env,
+	    "DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE");
+			return (EINVAL);
+		}
+		ok_flags = 0;
+		break;
+	case DB_BTREE:
+		ok_flags = DB_OK_BTREE;
+		break;
+	case DB_HASH:
+#ifndef HAVE_HASH
+		return (__db_no_hash_am(env));
+#endif
+		ok_flags = DB_OK_HASH;
+		break;
+	case DB_QUEUE:
+#ifndef HAVE_QUEUE
+		return (__db_no_queue_am(env));
+#endif
+		ok_flags = DB_OK_QUEUE;
+		break;
+	case DB_RECNO:
+		ok_flags = DB_OK_RECNO;
+		break;
+	default:
+		__db_errx(env, "unknown type: %lu", (u_long)type);
+		return (EINVAL);
+	}
+	if (ok_flags)
+		DB_ILLEGAL_METHOD(dbp, ok_flags);
+
+	/* The environment may have been created, but never opened. */
+	if (!F_ISSET(env, ENV_DBLOCAL | ENV_OPEN_CALLED)) {
+		__db_errx(env, "database environment not yet opened");
+		return (EINVAL);
+	}
+
+	/*
+	 * Historically, you could pass in an environment that didn't have a
+	 * mpool, and DB would create a private one behind the scenes.  This
+	 * no longer works.
+	 */
+	if (!F_ISSET(env, ENV_DBLOCAL) && !MPOOL_ON(env)) {
+		__db_errx(env, "environment did not include a memory pool");
+		return (EINVAL);
+	}
+
+	/*
+	 * You can't specify threads during DB->open if subsystems in the
+	 * environment weren't configured with them.
+	 */
+	if (LF_ISSET(DB_THREAD) && !F_ISSET(env, ENV_DBLOCAL | ENV_THREAD)) {
+		__db_errx(env, "environment not created using DB_THREAD");
+		return (EINVAL);
+	}
+
+	/* DB_MULTIVERSION requires a database configured for transactions. */
+	if (LF_ISSET(DB_MULTIVERSION) && !IS_REAL_TXN(txn)) {
+		__db_errx(env,
+	    "DB_MULTIVERSION illegal without a transaction specified");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIVERSION) && type == DB_QUEUE) {
+		__db_errx(env,
+	    "DB_MULTIVERSION illegal with queue databases");
+		return (EINVAL);
+	}
+
+	/* DB_TRUNCATE is neither transaction recoverable nor lockable. */
+	if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) {
+		__db_errx(env,
+	    "DB_TRUNCATE illegal with %s specified",
+		    LOCKING_ON(env) ? "locking" : "transactions");
+		return (EINVAL);
+	}
+
+	/* Subdatabase checks. */
+	if (dname != NULL) {
+		/* QAM can only be done on in-memory subdatabases. */
+		if (type == DB_QUEUE && fname != NULL) {
+			__db_errx(
+			    env, "Queue databases must be one-per-file");
+			return (EINVAL);
+		}
+
+		/*
+		 * Named in-memory databases can't support certain flags,
+		 * so check here.
+		 */
+		if (fname == NULL)
+			F_CLR(dbp, DB_AM_CHKSUM | DB_AM_ENCRYPT);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_pget_pp --
+ *	DB->pget pre/post processing.
+ *
+ * PUBLIC: int __db_pget_pp
+ * PUBLIC:     __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget_pp(dbp, txn, skey, pkey, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ignore_lease, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget");
+
+	/* Strip DB_IGNORE_LEASE before argument checking; see lease check. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+
+	/* pget-specific checks first, then the shared DB->get checks. */
+	if ((ret = __db_pget_arg(dbp, pkey, flags)) != 0 ||
+	    (ret = __db_get_arg(dbp, skey, data, flags)) != 0) {
+		__dbt_userfree(env, skey, pkey, data);
+		return (ret);
+	}
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_pget(dbp, ip, txn, skey, pkey, data, flags);
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, skey, pkey, data);
+	return (ret);
+}
+
+/*
+ * __db_pget --
+ *	DB->pget.
+ *
+ *	Looks up through a secondary index, returning the primary key
+ *	and data; implemented on a transient secondary cursor.
+ *
+ * PUBLIC: int __db_pget __P((DB *,
+ * PUBLIC:     DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget(dbp, ip, txn, skey, pkey, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	u_int32_t mode;
+	int ret, t_ret;
+
+	mode = DB_CURSOR_TRANSIENT;
+	if (LF_ISSET(DB_READ_UNCOMMITTED)) {
+		mode |= DB_READ_UNCOMMITTED;
+		LF_CLR(DB_READ_UNCOMMITTED);
+	} else if (LF_ISSET(DB_READ_COMMITTED)) {
+		mode |= DB_READ_COMMITTED;
+		LF_CLR(DB_READ_COMMITTED);
+	}
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0)
+		return (ret);
+
+	SET_RET_MEM(dbc, dbp);
+
+	DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags);
+
+	/*
+	 * !!!
+	 * The actual method call is simple, do it inline.
+	 *
+	 * The underlying cursor pget will fill in a default DBT for null
+	 * pkeys, and use the cursor's returned-key memory internally to
+	 * store any intermediate primary keys.  However, we've just set
+	 * the returned-key memory to the DB handle's key memory, which
+	 * is unsafe to use if the DB handle is threaded.  If the pkey
+	 * argument is NULL, use the DBC-owned returned-key memory
+	 * instead; it'll go away when we close the cursor before we
+	 * return, but in this case that's just fine, as we're not
+	 * returning the primary key.
+	 */
+	if (pkey == NULL)
+		dbc->rkey = &dbc->my_rkey;
+
+	/*
+	 * The cursor is just a perfectly ordinary secondary database cursor.
+	 * Call its c_pget() method to do the dirty work.
+	 */
+	if (flags == 0 || flags == DB_RMW)
+		flags |= DB_SET;
+
+	ret = __dbc_pget(dbc, skey, pkey, data, flags);
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_pget_arg --
+ *	Check DB->pget arguments.
+ *
+ *	Only the pget-specific constraints are checked here; the caller
+ *	also runs __db_get_arg on the secondary key and data DBTs.
+ */
+static int
+__db_pget_arg(dbp, pkey, flags)
+	DB *dbp;
+	DBT *pkey;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "DB->pget may only be used on secondary indices");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		__db_errx(env,
+	"DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices");
+		return (EINVAL);
+	}
+
+	/* DB_CONSUME makes no sense on a secondary index. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	switch (flags) {
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		return (__db_ferr(env, "DB->pget", 0));
+	default:
+		/* __db_get_arg will catch the rest. */
+		break;
+	}
+
+	/*
+	 * We allow the pkey field to be NULL, so that we can make the
+	 * two-DBT get calls into wrappers for the three-DBT ones.
+	 */
+	if (pkey != NULL &&
+	    (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0)
+		return (ret);
+
+	if (flags == DB_GET_BOTH) {
+		/* The pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+		if (pkey == NULL) {
+			__db_errx(env,
+		    "DB_GET_BOTH on a secondary index requires a primary key");
+			return (EINVAL);
+		}
+		if ((ret = __dbt_usercopy(env, pkey)) != 0)
+			return (ret);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_put_pp --
+ *	DB->put pre/post processing.
+ *
+ * PUBLIC: int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_put_pp(dbp, txn, key, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, txn_local, t_ret;
+
+	env = dbp->env;
+	txn_local = 0;
+
+	STRIP_AUTO_COMMIT(flags);
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");
+
+	if ((ret = __db_put_arg(dbp, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		/* Entry failed: skip the rep-exit call at err. */
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Create local transaction as necessary. */
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+			goto err;
+		txn_local = 1;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	ret = __db_put(dbp, ip, txn, key, data, flags);
+
+err:	if (txn_local &&
+	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __db_put_arg --
+ *	Check DB->put arguments.
+ *
+ *	Validates flag combinations and key/data DBT flags, then copies
+ *	the user DBTs in; the caller frees them.
+ */
+static int
+__db_put_arg(dbp, key, data, flags)
+	DB *dbp;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret, returnkey;
+
+	env = dbp->env;
+	returnkey = 0;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DB->put"));
+
+	/* Check for puts on a secondary. */
+	if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env, "DB->put forbidden on secondary indices");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIPLE_KEY | DB_MULTIPLE)) {
+		/* The two bulk flags are mutually exclusive. */
+		if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+			goto err;
+
+		switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+		case 0:
+		case DB_OVERWRITE_DUP:
+			break;
+		default:
+			__db_errx(env,
+	"DB->put: DB_MULTIPLE(_KEY) can only be combined with DB_OVERWRITE_DUP");
+			return (EINVAL);
+		}
+
+		if (!F_ISSET(key, DB_DBT_BULK)) {
+			__db_errx(env,
+		    "DB->put with DB_MULTIPLE(_KEY) requires a bulk key buffer");
+			return (EINVAL);
+		}
+	}
+	if (LF_ISSET(DB_MULTIPLE)) {
+		if (!F_ISSET(data, DB_DBT_BULK)) {
+			__db_errx(env,
+		    "DB->put with DB_MULTIPLE requires a bulk data buffer");
+			return (EINVAL);
+		}
+	}
+
+	/* Check for invalid function flags. */
+	switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+	case 0:
+	case DB_NOOVERWRITE:
+	case DB_OVERWRITE_DUP:
+		break;
+	case DB_APPEND:
+		/* DB_APPEND is record-number access methods only. */
+		if (dbp->type != DB_RECNO && dbp->type != DB_QUEUE)
+			goto err;
+		returnkey = 1;
+		break;
+	case DB_NODUPDATA:
+		if (F_ISSET(dbp, DB_AM_DUPSORT))
+			break;
+		/* FALLTHROUGH */
+	default:
+err:		return (__db_ferr(env, "DB->put", 0));
+	}
+
+	/*
+	 * Check for invalid key/data flags.  The key may reasonably be NULL
+	 * if DB_APPEND is set and the application doesn't care about the
+	 * returned key.
+	 */
+	if (((returnkey && key != NULL) || !returnkey) &&
+	    (ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0)
+		return (ret);
+	if (!LF_ISSET(DB_MULTIPLE_KEY) &&
+	    (ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/*
+	 * The key parameter should not be NULL or have the "partial" flag set
+	 * in a put call unless the user doesn't care about a key value we'd
+	 * return.  The user tells us they don't care about the returned key by
+	 * setting the key parameter to NULL or configuring the key DBT to not
+	 * return any information.  (Returned keys from a put are always record
+	 * numbers, and returning part of a record number doesn't make sense:
+	 * only accept a partial return if the length returned is 0.)
+	 */
+	if ((returnkey &&
+	    key != NULL && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0) ||
+	    (!returnkey && F_ISSET(key, DB_DBT_PARTIAL)))
+		return (__db_ferr(env, "key DBT", 0));
+
+	/* Check for partial puts in the presence of duplicates. */
+	if (data != NULL && F_ISSET(data, DB_DBT_PARTIAL) &&
+	    (F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) {
+		__db_errx(env,
+"a partial put in the presence of duplicates requires a cursor operation");
+		return (EINVAL);
+	}
+
+	if ((flags != DB_APPEND && (ret = __dbt_usercopy(env, key)) != 0) ||
+	    (!LF_ISSET(DB_MULTIPLE_KEY) &&
+	    (ret = __dbt_usercopy(env, data)) != 0))
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __db_compact_pp --
+ *	DB->compact pre/post processing.
+ *
+ *	A NULL c_data is replaced with a zeroed local DB_COMPACT, so the
+ *	compaction routines always receive valid statistics storage.
+ *
+ * PUBLIC: int __db_compact_pp __P((DB *, DB_TXN *,
+ * PUBLIC:       DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__db_compact_pp(dbp, txn, start, stop, c_data, flags, end)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *start, *stop;
+	DB_COMPACT *c_data;
+	u_int32_t flags;
+	DBT *end;
+{
+	DB_COMPACT *dp, l_data;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->compact");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if ((ret = __db_fchk(
+	    env, "DB->compact", flags, DB_FREELIST_ONLY | DB_FREE_SPACE)) != 0)
+		return (ret);
+
+	/* Check for changes to a read-only database. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DB->compact"));
+
+	if (start != NULL && (ret = __dbt_usercopy(env, start)) != 0)
+		return (ret);
+	if (stop != NULL && (ret = __dbt_usercopy(env, stop)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+	    txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	if (c_data == NULL) {
+		dp = &l_data;
+		memset(dp, 0, sizeof(*dp));
+	} else
+		dp = c_data;
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __part_compact(dbp, ip, txn, start, stop, dp, flags, end);
+	else
+#endif
+	switch (dbp->type) {
+	case DB_HASH:
+		/* Hash only supports freeing the free list. */
+		if (!LF_ISSET(DB_FREELIST_ONLY))
+			goto err;
+		/* FALLTHROUGH */
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_compact(dbp, ip, txn, start, stop, dp, flags, end);
+		break;
+
+	default:
+err:		ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+		break;
+	}
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, start, stop, NULL);
+	return (ret);
+}
+
+/*
+ * __db_associate_foreign_pp --
+ *	DB->associate_foreign pre/post processing.
+ *
+ * PUBLIC: int __db_associate_foreign_pp __P((DB *, DB *,
+ * PUBLIC:	int (*)(DB *, const DBT *, DBT *, const DBT *, int *),
+ * PUBLIC:	u_int32_t));
+ */
+int
+__db_associate_foreign_pp(fdbp, dbp, callback, flags)
+	DB *dbp, *fdbp;
+	int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+	u_int32_t flags;
+{
+	/* Most of this is based on the implementation of associate */
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	PANIC_CHECK(env);
+	STRIP_AUTO_COMMIT(flags);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+		/* Entry failed: skip the rep-exit call at err. */
+		handle_check = 0;
+		goto err;
+	}
+
+	if ((ret = __db_associate_foreign_arg(fdbp, dbp, callback, flags)) != 0)
+		goto err;
+
+	ret = __db_associate_foreign(fdbp, dbp, callback, flags);
+
+err:	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_associate_foreign_arg --
+ *	DB->associate_foreign argument checking.
+ *
+ *	The foreign database must not be a secondary, must not allow
+ *	duplicates or renumbering; the associating database must be a
+ *	secondary; a nullify callback is required exactly when
+ *	DB_FOREIGN_NULLIFY is specified.
+ */
+static int
+__db_associate_foreign_arg(fdbp, dbp, callback, flags)
+	DB *dbp, *fdbp;
+	int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *));
+	u_int32_t flags;
+{
+	ENV *env;
+
+	env = fdbp->env;
+
+	if (F_ISSET(fdbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "Secondary indices may not be used as foreign databases");
+		return (EINVAL);
+	}
+	if (F_ISSET(fdbp, DB_AM_DUP)) {
+		__db_errx(env,
+		    "Foreign databases may not be configured with duplicates");
+		return (EINVAL);
+	}
+	if (F_ISSET(fdbp, DB_AM_RENUMBER)) {
+		__db_errx(env,
+	    "Renumbering recno databases may not be used as foreign databases");
+		return (EINVAL);
+	}
+	if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "The associating database must be a secondary index.");
+		return (EINVAL);
+	}
+	if (LF_ISSET(DB_FOREIGN_NULLIFY) && callback == NULL) {
+		__db_errx(env,
+		    "When specifying a delete action of nullify, a callback%s",
+		    " function needs to be configured");
+		return (EINVAL);
+	} else if (!LF_ISSET(DB_FOREIGN_NULLIFY) && callback != NULL) {
+		__db_errx(env,
+		    "When not specifying a delete action of nullify, a%s",
+		    " callback function cannot be configured");
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_sync_pp --
+ *	DB->sync pre/post processing.
+ *
+ * PUBLIC: int __db_sync_pp __P((DB *, u_int32_t));
+ */
+int
+__db_sync_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->sync");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0)
+		return (__db_ferr(env, "DB->sync", 0));
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+		/* Failed enter: skip the matching rep-exit below. */
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_sync(dbp);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* The err label sits after the rep-exit: the failed-enter path
+	 * must not call __env_db_rep_exit (handle_check is 0 anyway). */
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_close_pp --
+ *	DBC->close pre/post processing.
+ *
+ * PUBLIC: int __dbc_close_pp __P((DBC *));
+ */
+int
+__dbc_close_pp(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/*
+	 * If the cursor is already closed we have a serious problem, and we
+	 * assume that the cursor isn't on the active queue.  Don't do any of
+	 * the remaining cursor close processing.
+	 */
+	if (!F_ISSET(dbc, DBC_ACTIVE)) {
+		__db_errx(env, "Closing already-closed cursor");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  Only cursors without a transaction
+	 * hold their own replication reference; transactional cursors are
+	 * covered by the transaction's reference.
+	 */
+	handle_check = dbc->txn == NULL && IS_ENV_REPLICATED(env);
+	ret = __dbc_close(dbc);
+
+	/* Release replication block. */
+	if (handle_check &&
+	    (t_ret = __op_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_cmp_pp --
+ *	DBC->cmp pre/post processing.
+ *
+ * Validates the arguments (zero flags, non-NULL other cursor, both
+ * cursors on the same database) and then calls __dbc_cmp inside an
+ * ENV_ENTER/ENV_LEAVE pair.  Returns 0 on success, EINVAL or the
+ * __db_ferr/__dbc_cmp result on failure.
+ *
+ * PUBLIC: int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t));
+ */
+int
+__dbc_cmp_pp(dbc, other_cursor, result, flags)
+	DBC *dbc, *other_cursor;
+	int *result;
+	u_int32_t flags;
+{
+	DB *dbp, *odbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	if (flags != 0)
+		return (__db_ferr(env, "DBcursor->cmp", 0));
+
+	/*
+	 * Check for NULL before touching other_cursor: the original code
+	 * dereferenced other_cursor->dbp first, which made this check dead
+	 * code and crashed on a NULL argument instead of returning EINVAL.
+	 */
+	if (other_cursor == NULL) {
+		__db_errx(env, "DBcursor->cmp dbc pointer must not be null");
+		return (EINVAL);
+	}
+
+	odbp = other_cursor->dbp;
+
+	/* Comparing positions only makes sense within one database. */
+	if (dbp != odbp) {
+		__db_errx(env,
+"DBcursor->cmp both cursors must refer to the same database.");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+	ret = __dbc_cmp(dbc, other_cursor, result);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_count_pp --
+ *	DBC->count pre/post processing.
+ *
+ * PUBLIC: int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t));
+ */
+int
+__dbc_count_pp(dbc, recnop, flags)
+	DBC *dbc;
+	db_recno_t *recnop;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int rc;
+
+	env = dbc->dbp->env;
+
+	/*
+	 * !!!
+	 * Argument checking is trivial, so perform it here rather than in
+	 * the worker routine.  No flags are accepted, and the cursor must
+	 * already be positioned: an uninitialized cursor is an EINVAL.
+	 */
+	if (flags != 0)
+		return (__db_ferr(env, "DBcursor->count", 0));
+
+	if (!IS_INITIALIZED(dbc))
+		return (__db_curinval(env));
+
+	ENV_ENTER(env, ip);
+	rc = __dbc_count(dbc, recnop);
+	ENV_LEAVE(env, ip);
+	return (rc);
+}
+
+/*
+ * __dbc_del_pp --
+ *	DBC->del pre/post processing.
+ *
+ * PUBLIC: int __dbc_del_pp __P((DBC *, u_int32_t));
+ */
+int
+__dbc_del_pp(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Full argument checking lives in __dbc_del_arg. */
+	if ((ret = __dbc_del_arg(dbc, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->del", NULL, NULL, flags);
+	ret = __dbc_del(dbc, flags);
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_del_arg --
+ *	Check DBC->del arguments.
+ *
+ * Rejects deletes on read-only databases, validates the flag value
+ * (DB_CONSUME is queue-only; DB_UPDATE_SECONDARY is internal-only),
+ * and requires an initialized cursor.  Returns 0 or an error.
+ */
+static int
+__dbc_del_arg(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DBcursor->del"));
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case 0:
+		break;
+	case DB_CONSUME:
+		if (dbp->type != DB_QUEUE)
+			return (__db_ferr(env, "DBC->del", 0));
+		break;
+	case DB_UPDATE_SECONDARY:
+		/* Internal flag: only valid on secondary-index handles. */
+		DB_ASSERT(env, F_ISSET(dbp, DB_AM_SECONDARY));
+		break;
+	default:
+		return (__db_ferr(env, "DBcursor->del", 0));
+	}
+
+	/*
+	 * The cursor must be initialized, return EINVAL for an invalid cursor,
+	 * otherwise 0.
+	 */
+	if (!IS_INITIALIZED(dbc))
+		return (__db_curinval(env));
+
+	return (0);
+}
+
+/*
+ * __dbc_dup_pp --
+ *	DBC->dup pre/post processing.
+ *
+ * PUBLIC: int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t));
+ */
+int
+__dbc_dup_pp(dbc, dbcp, flags)
+	DBC *dbc, **dbcp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int rc;
+
+	env = dbc->dbp->env;
+
+	/*
+	 * !!!
+	 * Argument checking is trivial, so perform it here rather than in
+	 * the worker routine.  The only legal flag values are 0 and
+	 * DB_POSITION.
+	 */
+	switch (flags) {
+	case 0:
+	case DB_POSITION:
+		break;
+	default:
+		return (__db_ferr(env, "DBcursor->dup", 0));
+	}
+
+	ENV_ENTER(env, ip);
+	rc = __dbc_dup(dbc, dbcp, flags);
+	ENV_LEAVE(env, ip);
+	return (rc);
+}
+
+/*
+ * __dbc_get_pp --
+ *	DBC->get pre/post processing.
+ *
+ * PUBLIC: int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_pp(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ignore_lease, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Strip DB_IGNORE_LEASE before argument checking; remember it. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+	if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+	ret = __dbc_get(dbc, key, data, flags);
+
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+	ENV_LEAVE(env, ip);
+	/* Release any DB_DBT_USERCOPY buffers copied in __dbc_get_arg. */
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __dbc_get_arg --
+ *	Common DBC->get argument checking, used by both DBC->get and DBC->pget.
+ * PUBLIC: int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_get_arg(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+	int dirty, multi, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/*
+	 * Typically in checking routines that modify the flags, we have
+	 * to save them and restore them, because the checking routine
+	 * calls the work routine.  However, this is a pure-checking
+	 * routine which returns to a function that calls the work routine,
+	 * so it's OK that we do not save and restore the flags, even though
+	 * we modify them.
+	 *
+	 * Check for read-modify-write validity.  DB_RMW doesn't make sense
+	 * with CDB cursors since if you're going to write the cursor, you
+	 * had to create it with DB_WRITECURSOR.  Regardless, we check for
+	 * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it.
+	 * If this changes, confirm that DB does not itself set the DB_RMW
+	 * flag in a path where CDB may have been configured.
+	 */
+	dirty = 0;
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DBcursor->get"));
+		if (LF_ISSET(DB_READ_UNCOMMITTED))
+			dirty = 1;
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	/* DB_MULTIPLE and DB_MULTIPLE_KEY are mutually exclusive. */
+	multi = 0;
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		multi = 1;
+		if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY))
+			goto multi_err;
+		LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY);
+	}
+
+	/*
+	 * Check for invalid function flags.  Note that the err label in the
+	 * default case also releases any DB_DBT_USERCOPY buffers already
+	 * copied by __dbt_usercopy in the cases above.
+	 */
+	switch (flags) {
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		if (dirty) {
+			__db_errx(env,
+    "DB_READ_UNCOMMITTED is not supported with DB_CONSUME or DB_CONSUME_WAIT");
+			return (EINVAL);
+		}
+		if (dbp->type != DB_QUEUE)
+			goto err;
+		break;
+	case DB_CURRENT:
+	case DB_FIRST:
+	case DB_NEXT:
+	case DB_NEXT_DUP:
+	case DB_NEXT_NODUP:
+		break;
+	case DB_LAST:
+	case DB_PREV:
+	case DB_PREV_DUP:
+	case DB_PREV_NODUP:
+		/* Bulk retrieval only works in the forward direction. */
+		if (multi)
+multi_err:		return (__db_ferr(env, "DBcursor->get", 1));
+		break;
+	case DB_GET_BOTHC:
+		if (dbp->type == DB_QUEUE)
+			goto err;
+		/* FALLTHROUGH */
+	case DB_GET_BOTH:
+	case DB_GET_BOTH_RANGE:
+		if ((ret = __dbt_usercopy(env, data)) != 0)
+			goto err;
+		/* FALLTHROUGH */
+	case DB_SET:
+	case DB_SET_RANGE:
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			goto err;
+		break;
+	case DB_GET_RECNO:
+		/*
+		 * The one situation in which this might be legal with a
+		 * non-RECNUM dbp is if dbp is a secondary and its primary is
+		 * DB_AM_RECNUM.
+		 */
+		if (!F_ISSET(dbp, DB_AM_RECNUM) &&
+		    (!F_ISSET(dbp, DB_AM_SECONDARY) ||
+		    !F_ISSET(dbp->s_primary, DB_AM_RECNUM)))
+			goto err;
+		break;
+	case DB_SET_RECNO:
+		if (!F_ISSET(dbp, DB_AM_RECNUM))
+			goto err;
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			goto err;
+		break;
+	default:
+err:		__dbt_userfree(env, key, NULL, data);
+		return (__db_ferr(env, "DBcursor->get", 0));
+	}
+
+	/* Check for invalid key/data flags. */
+	if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/*
+	 * Bulk buffers must be user-allocated, non-partial, 1KB-aligned
+	 * and at least a page in size.
+	 */
+	if (multi) {
+		if (!F_ISSET(data, DB_DBT_USERMEM)) {
+			__db_errx(env,
+	    "DB_MULTIPLE/DB_MULTIPLE_KEY require DB_DBT_USERMEM be set");
+			return (EINVAL);
+		}
+		if (F_ISSET(key, DB_DBT_PARTIAL) ||
+		    F_ISSET(data, DB_DBT_PARTIAL)) {
+			__db_errx(env,
+	    "DB_MULTIPLE/DB_MULTIPLE_KEY do not support DB_DBT_PARTIAL");
+			return (EINVAL);
+		}
+		if (data->ulen < 1024 ||
+		    data->ulen < dbp->pgsize || data->ulen % 1024 != 0) {
+			__db_errx(env, "%s%s",
+			    "DB_MULTIPLE/DB_MULTIPLE_KEY buffers must be ",
+			    "aligned, at least page size and multiples of 1KB");
+			return (EINVAL);
+		}
+	}
+
+	/*
+	 * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO,
+	 * DB_PREV_DUP and DB_NEXT_DUP.  Return EINVAL for an invalid
+	 * cursor, otherwise 0.
+	 */
+	if (!IS_INITIALIZED(dbc) && (flags == DB_CURRENT ||
+	    flags == DB_GET_RECNO ||
+	    flags == DB_NEXT_DUP || flags == DB_PREV_DUP))
+		return (__db_curinval(env));
+
+	/* Check for consistent transaction usage. */
+	if (LF_ISSET(DB_RMW) &&
+	    (ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __db_secondary_close_pp --
+ *	DB->close for secondaries
+ *
+ * PUBLIC: int __db_secondary_close_pp __P((DB *, u_int32_t));
+ */
+int
+__db_secondary_close_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+	ret = 0;
+
+	/*
+	 * As a DB handle destructor, we can't fail.
+	 *
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0 && flags != DB_NOSYNC)
+		ret = __db_ferr(env, "DB->close", 0);
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Check for replication block.  Because we must close the handle
+	 * even on error, a failed enter records the error (if it is the
+	 * first) but processing continues.
+	 */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) {
+		handle_check = 0;
+		if (ret == 0)
+			ret = t_ret;
+	}
+
+	/* Close regardless of earlier errors; keep the first error code. */
+	if ((t_ret = __db_secondary_close(dbp, flags)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __dbc_pget_pp --
+ *	DBC->pget pre/post processing.
+ *
+ * PUBLIC: int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_pget_pp(dbc, skey, pkey, data, flags)
+	DBC *dbc;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ignore_lease, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Strip DB_IGNORE_LEASE before argument checking; remember it. */
+	ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+	LF_CLR(DB_IGNORE_LEASE);
+	/* pget-specific checks first, then the shared get checks. */
+	if ((ret = __dbc_pget_arg(dbc, pkey, flags)) != 0 ||
+	    (ret = __dbc_get_arg(dbc, skey, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+	ret = __dbc_pget(dbc, skey, pkey, data, flags);
+	/*
+	 * Check for master leases.
+	 */
+	if (ret == 0 &&
+	    IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+		ret = __rep_lease_check(env, 1);
+
+	ENV_LEAVE(env, ip);
+
+	/* Release any DB_DBT_USERCOPY buffers copied during arg checking. */
+	__dbt_userfree(env, skey, pkey, data);
+	return (ret);
+}
+
+/*
+ * __dbc_pget_arg --
+ *	Check DBC->pget arguments.
+ *
+ * Performs only the pget-specific checks (the cursor's database must be
+ * a secondary, no bulk flags, DB_GET_BOTH* needs a primary key); the
+ * caller runs __dbc_get_arg afterwards for the shared checks.
+ */
+static int
+__dbc_pget_arg(dbc, pkey, flags)
+	DBC *dbc;
+	DBT *pkey;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	if (!F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env,
+		    "DBcursor->pget may only be used on secondary indices");
+		return (EINVAL);
+	}
+
+	if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) {
+		__db_errx(env,
+	"DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices");
+		return (EINVAL);
+	}
+
+	switch (LF_ISSET(DB_OPFLAGS_MASK)) {
+	case DB_CONSUME:
+	case DB_CONSUME_WAIT:
+		/* These flags make no sense on a secondary index. */
+		return (__db_ferr(env, "DBcursor->pget", 0));
+	case DB_GET_BOTH:
+	case DB_GET_BOTH_RANGE:
+		/* BOTH is "get both the primary and the secondary". */
+		if (pkey == NULL) {
+			__db_errx(env,
+			    "%s requires both a secondary and a primary key",
+			    LF_ISSET(DB_GET_BOTH) ?
+			    "DB_GET_BOTH" : "DB_GET_BOTH_RANGE");
+			return (EINVAL);
+		}
+		if ((ret = __dbt_usercopy(env, pkey)) != 0)
+			return (ret);
+		break;
+	default:
+		/* __dbc_get_arg will catch the rest. */
+		break;
+	}
+
+	/*
+	 * We allow the pkey field to be NULL, so that we can make the
+	 * two-DBT get calls into wrappers for the three-DBT ones.
+	 */
+	if (pkey != NULL &&
+	    (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0)
+		return (ret);
+
+	/* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */
+	if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) {
+		__db_errx(env,
+		    "DB_GET_BOTH on a secondary index requires a primary key");
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __dbc_put_pp --
+ *	DBC->put pre/post processing.
+ *
+ * PUBLIC: int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__dbc_put_pp(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/* Full argument checking lives in __dbc_put_arg. */
+	if ((ret = __dbc_put_arg(dbc, key, data, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->put",
+	    flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+	    flags == DB_NODUPDATA || flags == DB_UPDATE_SECONDARY ?
+	    key : NULL, data, flags);
+	ret = __dbc_put(dbc, key, data, flags);
+
+	/* Release any DB_DBT_USERCOPY buffers copied in __dbc_put_arg. */
+err:	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, data);
+	return (ret);
+}
+
+/*
+ * __dbc_put_arg --
+ *	Check DBC->put arguments.
+ *
+ * Validates writability, the flag value against the access method, and
+ * the key/data DBT flags.  Also copies DB_DBT_USERCOPY buffers into
+ * process memory (the caller releases them with __dbt_userfree).
+ */
+static int
+__dbc_put_arg(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	ENV *env;
+	int key_flags, ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	/* Set when the key DBT participates in this operation. */
+	key_flags = 0;
+
+	/* Check for changes to a read-only tree. */
+	if (DB_IS_READONLY(dbp))
+		return (__db_rdonly(env, "DBcursor->put"));
+
+	/* Check for puts on a secondary. */
+	if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+		/* Internal updates of a secondary are allowed. */
+		if (flags == DB_UPDATE_SECONDARY)
+			flags = 0;
+		else {
+			__db_errx(env,
+			    "DBcursor->put forbidden on secondary indices");
+			return (EINVAL);
+		}
+	}
+
+	if ((ret = __dbt_usercopy(env, data)) != 0)
+		return (ret);
+
+	/* Check for invalid function flags. */
+	switch (flags) {
+	case DB_AFTER:
+	case DB_BEFORE:
+		switch (dbp->type) {
+		case DB_BTREE:
+		case DB_HASH:		/* Only with unsorted duplicates. */
+			if (!F_ISSET(dbp, DB_AM_DUP))
+				goto err;
+			if (dbp->dup_compare != NULL)
+				goto err;
+			break;
+		case DB_QUEUE:		/* Not permitted. */
+			goto err;
+		case DB_RECNO:		/* Only with mutable record numbers. */
+			if (!F_ISSET(dbp, DB_AM_RENUMBER))
+				goto err;
+			key_flags = key == NULL ? 0 : 1;
+			break;
+		case DB_UNKNOWN:
+		default:
+			goto err;
+		}
+		break;
+	case DB_CURRENT:
+		/*
+		 * If there is a comparison function, doing a DB_CURRENT
+		 * must not change the part of the data item that is used
+		 * for the comparison.
+		 */
+		break;
+	case DB_NODUPDATA:
+		if (!F_ISSET(dbp, DB_AM_DUPSORT))
+			goto err;
+		/* FALLTHROUGH */
+	case DB_KEYFIRST:
+	case DB_KEYLAST:
+	case DB_OVERWRITE_DUP:
+		key_flags = 1;
+		if ((ret = __dbt_usercopy(env, key)) != 0)
+			return (ret);
+		break;
+	default:
+err:		return (__db_ferr(env, "DBcursor->put", 0));
+	}
+
+	/*
+	 * Check for invalid key/data flags.  The key may reasonably be NULL
+	 * if DB_AFTER or DB_BEFORE is set and the application doesn't care
+	 * about the returned key, or if the DB_CURRENT flag is set.
+	 */
+	if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0)
+		return (ret);
+	if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0)
+		return (ret);
+
+	/*
+	 * The key parameter should not be NULL or have the "partial" flag set
+	 * in a put call unless the user doesn't care about a key value we'd
+	 * return.  The user tells us they don't care about the returned key by
+	 * setting the key parameter to NULL or configuring the key DBT to not
+	 * return any information.  (Returned keys from a put are always record
+	 * numbers, and returning part of a record number doesn't make sense:
+	 * only accept a partial return if the length returned is 0.)
+	 */
+	if (key_flags && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0)
+		return (__db_ferr(env, "key DBT", 0));
+
+	/*
+	 * The cursor must be initialized for anything other than DB_KEYFIRST,
+	 * DB_KEYLAST or zero: return EINVAL for an invalid cursor, otherwise 0.
+	 */
+	if (!IS_INITIALIZED(dbc) && flags != 0 && flags != DB_KEYFIRST &&
+	    flags != DB_KEYLAST && flags != DB_NODUPDATA &&
+	    flags != DB_OVERWRITE_DUP)
+		return (__db_curinval(env));
+
+	return (0);
+}
+
+/*
+ * __dbt_ferr --
+ *	Check a DBT for flag errors.
+ *
+ * Verifies that only known DBT flags are set, that at most one memory
+ * management flag is chosen, that bulk and partial are not combined,
+ * and (when check_thread is set) that threaded handles specify a memory
+ * allocation flag.  Returns 0 or an error.
+ */
+static int
+__dbt_ferr(dbp, name, dbt, check_thread)
+	const DB *dbp;
+	const char *name;
+	const DBT *dbt;
+	int check_thread;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	/*
+	 * Check for invalid DBT flags.  We allow any of the flags to be
+	 * specified to any DB or DBcursor call so that applications can
+	 * set DB_DBT_MALLOC when retrieving a data item from a secondary
+	 * database and then specify that same DBT as a key to a primary
+	 * database, without having to clear flags.
+	 */
+	if ((ret = __db_fchk(env, name, dbt->flags, DB_DBT_APPMALLOC |
+	    DB_DBT_BULK | DB_DBT_DUPOK | DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0)
+		return (ret);
+	/* At most one of the memory-management flags may be set. */
+	switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+	case 0:
+	case DB_DBT_MALLOC:
+	case DB_DBT_REALLOC:
+	case DB_DBT_USERCOPY:
+	case DB_DBT_USERMEM:
+		break;
+	default:
+		return (__db_ferr(env, name, 1));
+	}
+
+	if (F_ISSET(dbt, DB_DBT_BULK) && F_ISSET(dbt, DB_DBT_PARTIAL)) {
+		__db_errx(env,
+	    "Bulk and partial operations cannot be combined on %s DBT", name);
+		return (EINVAL);
+	}
+
+	if (check_thread && DB_IS_THREADED(dbp) &&
+	    !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+		__db_errx(env,
+		    "DB_THREAD mandates memory allocation flag on %s DBT",
+		    name);
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __db_curinval
+ *	Report that a cursor is in an invalid state.
+ *
+ * Shared error helper: the caller attempted an operation (DB_CURRENT,
+ * DB_NEXT_DUP, ...) that requires an initialized cursor position.
+ * Always returns EINVAL.
+ */
+static int
+__db_curinval(env)
+	const ENV *env;
+{
+	__db_errx(env,
+	    "Cursor position must be set before performing this operation");
+	return (EINVAL);
+}
+
+/*
+ * __db_txn_auto_init --
+ *	Handle DB_AUTO_COMMIT initialization.
+ *
+ * PUBLIC: int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **));
+ */
+int
+__db_txn_auto_init(env, ip, txnidp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	DB_TXN **txnidp;
+{
+	/*
+	 * Method calls where applications explicitly specify DB_AUTO_COMMIT
+	 * require additional validation: the DB_AUTO_COMMIT flag cannot be
+	 * specified if a transaction cookie is also specified, nor can the
+	 * flag be specified in a non-transactional environment.
+	 */
+	if (*txnidp != NULL) {
+		__db_errx(env,
+    "DB_AUTO_COMMIT may not be specified along with a transaction handle");
+		return (EINVAL);
+	}
+
+	if (!TXN_ON(env)) {
+		__db_errx(env,
+    "DB_AUTO_COMMIT may not be specified in non-transactional environment");
+		return (EINVAL);
+	}
+
+	/*
+	 * Our caller checked to see if replication is making a state change.
+	 * Don't call the user-level API (which would repeat that check).
+	 * On success, *txnidp holds the new implicit transaction.
+	 */
+	return (__txn_begin(env, ip, NULL, txnidp, 0));
+}
+
+/*
+ * __db_txn_auto_resolve --
+ *	Resolve local transactions.
+ *
+ * PUBLIC: int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int));
+ */
+int
+__db_txn_auto_resolve(env, txn, nosync, ret)
+	ENV *env;
+	DB_TXN *txn;
+	int nosync, ret;
+{
+	int t_ret;
+
+	/*
+	 * We're resolving a transaction for the user, and must decrement the
+	 * replication handle count.  Call the user-level API.
+	 *
+	 * The operation succeeded: commit the implicit transaction.
+	 */
+	if (ret == 0)
+		return (__txn_commit(txn, nosync ? DB_TXN_NOSYNC : 0));
+
+	/* The operation failed: abort.  A failed abort is unrecoverable. */
+	t_ret = __txn_abort(txn);
+	if (t_ret != 0)
+		return (__env_panic(env, t_ret));
+
+	/* Report the original operation's error, not the abort's result. */
+	return (ret);
+}
diff --git a/db/db_join.c b/db/db_join.c
new file mode 100644
index 0000000..05c11a4
--- /dev/null
+++ b/db/db_join.c
@@ -0,0 +1,940 @@
+/*
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_join.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+static int __db_join_close_pp __P((DBC *));
+static int __db_join_cmp __P((const void *, const void *));
+static int __db_join_del __P((DBC *, u_int32_t));
+static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+static int __db_join_primget __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t));
+static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t));
+
+/*
+ * Check to see if the Nth secondary cursor of join cursor jc is pointing
+ * to a sorted duplicate set.
+ */
+#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL)
+
+/*
+ * This is the duplicate-assisted join functionality. Right now we're
+ * going to write it such that we return one item at a time, although
+ * I think we may need to optimize it to return them all at once.
+ * It should be easier to get it working this way, and I believe that
+ * changing it should be fairly straightforward.
+ *
+ * We optimize the join by sorting cursors from smallest to largest
+ * cardinality. In most cases, this is indeed optimal. However, if
+ * a cursor with large cardinality has very few data in common with the
+ * first cursor, it is possible that the join will be made faster by
+ * putting it earlier in the cursor list. Since we have no way to detect
+ * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains
+ * the sort order specified by the caller, who may know more about the
+ * structure of the data.
+ *
+ * The first cursor moves sequentially through the duplicate set while
+ * the others search explicitly for the duplicate in question.
+ *
+ */
+
+/*
+ * __db_join --
+ *	This is the interface to the duplicate-assisted join functionality.
+ * In the same way that cursors mark a position in a database, a cursor
+ * can mark a position in a join.  While most cursors are created by the
+ * cursor method of a DB, join cursors are created through an explicit
+ * call to DB->join.
+ *
+ *	The curslist is an array of existing, initialized cursors and primary
+ * is the DB of the primary file.  The data item that joins all the
+ * cursors in the curslist is used as the key into the primary and that
+ * key and data are returned.  When no more items are left in the join
+ * set, the c_next operation off the join cursor will return DB_NOTFOUND.
+ *
+ * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t));
+ */
+int
+__db_join(primary, curslist, dbcp, flags)
+	DB *primary;
+	DBC **curslist, **dbcp;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	size_t ncurs, nslots;
+	u_int32_t i;
+	int ret;
+
+	env = primary->env;
+	dbc = NULL;
+	jc = NULL;
+
+	if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0)
+		goto err;
+
+	if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0)
+		goto err;
+
+	/* Working key buffer; grown on demand by the get routines. */
+	if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0)
+		goto err;
+	jc->j_key.ulen = 256;
+	F_SET(&jc->j_key, DB_DBT_USERMEM);
+
+	F_SET(&jc->j_rdata, DB_DBT_REALLOC);
+
+	/* Walk to the NULL terminator to count the caller's cursors. */
+	for (jc->j_curslist = curslist;
+	    *jc->j_curslist != NULL; jc->j_curslist++)
+		;
+
+	/*
+	 * The number of cursor slots we allocate is one greater than
+	 * the number of cursors involved in the join, because the
+	 * list is NULL-terminated.
+	 */
+	ncurs = (size_t)(jc->j_curslist - curslist);
+	nslots = ncurs + 1;
+
+	/*
+	 * !!! -- A note on the various lists hanging off jc.
+	 *
+	 * j_curslist is the initial NULL-terminated list of cursors passed
+	 * into __db_join.  The original cursors are not modified; pristine
+	 * copies are required because, in databases with unsorted dups, we
+	 * must reset all of the secondary cursors after the first each
+	 * time the first one is incremented, or else we will lose data
+	 * which happen to be sorted differently in two different cursors.
+	 *
+	 * j_workcurs is where we put those copies that we're planning to
+	 * work with.  They're lazily c_dup'ed from j_curslist as we need
+	 * them, and closed when the join cursor is closed or when we need
+	 * to reset them to their original values (in which case we just
+	 * c_dup afresh).
+	 *
+	 * j_fdupcurs is an array of cursors which point to the first
+	 * duplicate in the duplicate set that contains the data value
+	 * we're currently interested in.  We need this to make
+	 * __db_join_get correctly return duplicate duplicates; i.e., if a
+	 * given data value occurs twice in the set belonging to cursor #2,
+	 * and thrice in the set belonging to cursor #3, and once in all
+	 * the other cursors, successive calls to __db_join_get need to
+	 * return that data item six times.  To make this happen, each time
+	 * cursor N is allowed to advance to a new datum, all cursors M
+	 * such that M > N have to be reset to the first duplicate with
+	 * that datum, so __db_join_get will return all the dup-dups again.
+	 * We could just reset them to the original cursor from j_curslist,
+	 * but that would be a bit slower in the unsorted case and a LOT
+	 * slower in the sorted one.
+	 *
+	 * j_exhausted is a list of boolean values which represent
+	 * whether or not their corresponding cursors are "exhausted",
+	 * i.e. whether the datum under the corresponding cursor has
+	 * been found not to exist in any unreturned combinations of
+	 * later secondary cursors, in which case they are ready to be
+	 * incremented.
+	 */
+
+	/* We don't want to free regions whose callocs have failed. */
+	jc->j_curslist = NULL;
+	jc->j_workcurs = NULL;
+	jc->j_fdupcurs = NULL;
+	jc->j_exhausted = NULL;
+
+	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+	    &jc->j_curslist)) != 0)
+		goto err;
+	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+	    &jc->j_workcurs)) != 0)
+		goto err;
+	if ((ret = __os_calloc(env, nslots, sizeof(DBC *),
+	    &jc->j_fdupcurs)) != 0)
+		goto err;
+	if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t),
+	    &jc->j_exhausted)) != 0)
+		goto err;
+	for (i = 0; curslist[i] != NULL; i++) {
+		jc->j_curslist[i] = curslist[i];
+		jc->j_workcurs[i] = NULL;
+		jc->j_fdupcurs[i] = NULL;
+		jc->j_exhausted[i] = 0;
+	}
+	jc->j_ncurs = (u_int32_t)ncurs;
+
+	/*
+	 * If DB_JOIN_NOSORT is not set, optimize secondary cursors by
+	 * sorting in order of increasing cardinality.
+	 */
+	if (!LF_ISSET(DB_JOIN_NOSORT))
+		qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp);
+
+	/*
+	 * We never need to reset the 0th cursor, so there's no
+	 * solid reason to use workcurs[0] rather than curslist[0] in
+	 * join_get.  Nonetheless, it feels cleaner to do it for symmetry,
+	 * and this is the most logical place to copy it.
+	 *
+	 * !!!
+	 * There's no need to close the new cursor if we goto err only
+	 * because this is the last thing that can fail.  Modifier of this
+	 * function beware!
+	 */
+	if ((ret =
+	    __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0)
+		goto err;
+
+	dbc->close = dbc->c_close = __db_join_close_pp;
+	dbc->del = dbc->c_del = __db_join_del;
+	dbc->get = dbc->c_get = __db_join_get_pp;
+	dbc->put = dbc->c_put = __db_join_put;
+	dbc->internal = (DBC_INTERNAL *)jc;
+	dbc->dbp = primary;
+	jc->j_primary = primary;
+
+	/* Stash the first cursor's transaction here for easy access. */
+	dbc->txn = curslist[0]->txn;
+
+	*dbcp = dbc;
+
+	MUTEX_LOCK(env, primary->mutex);
+	TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links);
+	MUTEX_UNLOCK(env, primary->mutex);
+
+	return (0);
+
+	/*
+	 * Error path: release everything allocated so far.  jc was
+	 * zero-filled by __os_calloc, so any field that was never
+	 * allocated is NULL here.  j_key.data is allocated before the
+	 * cursor-list arrays and must be freed too -- omitting it leaked
+	 * 256 bytes whenever a later allocation or the __dbc_dup failed.
+	 */
+err:	if (jc != NULL) {
+		if (jc->j_key.data != NULL)
+			__os_free(env, jc->j_key.data);
+		if (jc->j_curslist != NULL)
+			__os_free(env, jc->j_curslist);
+		if (jc->j_workcurs != NULL) {
+			if (jc->j_workcurs[0] != NULL)
+				(void)__dbc_close(jc->j_workcurs[0]);
+			__os_free(env, jc->j_workcurs);
+		}
+		if (jc->j_fdupcurs != NULL)
+			__os_free(env, jc->j_fdupcurs);
+		if (jc->j_exhausted != NULL)
+			__os_free(env, jc->j_exhausted);
+		__os_free(env, jc);
+	}
+	if (dbc != NULL)
+		__os_free(env, dbc);
+	return (ret);
+}
+
+/*
+ * __db_join_close_pp --
+ *	DBC->close pre/post processing for join cursors.
+ *
+ * Wraps __db_join_close in ENV_ENTER/ENV_LEAVE and the replication
+ * enter/exit pair.
+ */
+static int
+__db_join_close_pp(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, dbc->txn != NULL)) != 0) {
+		/* Failed enter: skip the matching rep-exit below. */
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_join_close(dbc);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+static int
+__db_join_put(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key;
+	DBT *data;
+	u_int32_t flags;
+{
+	/*
+	 * Join cursors are read-only; a put through one is always an
+	 * error.  Quiet the compiler about the unused parameters.
+	 */
+	COMPQUIET(flags, 0);
+	COMPQUIET(data, NULL);
+	COMPQUIET(key, NULL);
+	COMPQUIET(dbc, NULL);
+	return (EINVAL);
+}
+
+static int
+__db_join_del(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	/*
+	 * Join cursors are read-only; a delete through one is always an
+	 * error.  Quiet the compiler about the unused parameters.
+	 */
+	COMPQUIET(flags, 0);
+	COMPQUIET(dbc, NULL);
+	return (EINVAL);
+}
+
+/*
+ * __db_join_get_pp --
+ *	DBjoin->get pre/post processing.
+ *
+ * Validates the flags (only 0 and DB_JOIN_ITEM, plus the isolation/RMW
+ * modifiers, are legal) and rejects partial keys, then calls
+ * __db_join_get under the replication block.
+ */
+static int
+__db_join_get_pp(dbc, key, data, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	u_int32_t handle_check, save_flags;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+
+	/*
+	 * Save the original flags value: the modifiers are stripped only
+	 * for checking here and restored before the real call.
+	 */
+	save_flags = flags;
+
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DBC->get"));
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	switch (flags) {
+	case 0:
+	case DB_JOIN_ITEM:
+		break;
+	default:
+		return (__db_ferr(env, "DBC->get", 0));
+	}
+
+	/*
+	 * A partial get of the key of a join cursor don't make much sense;
+	 * the entire key is necessary to query the primary database
+	 * and find the datum, and so regardless of the size of the key
+	 * it would not be a performance improvement.  Since it would require
+	 * special handling, we simply disallow it.
+	 *
+	 * A partial get of the data, however, potentially makes sense (if
+	 * all possible data are a predictable large structure, for instance)
+	 * and causes us no headaches, so we permit it.
+	 */
+	if (F_ISSET(key, DB_DBT_PARTIAL)) {
+		__db_errx(env,
+		    "DB_DBT_PARTIAL may not be set on key during join_get");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, dbc->txn != NULL)) != 0) {
+		/* Failed enter: skip the matching rep-exit below. */
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Restore the original flags value. */
+	flags = save_flags;
+
+	ret = __db_join_get(dbc, key, data, flags);
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, NULL);
+	return (ret);
+}
+
+/*
+ * __db_join_get --
+ *	The guts of the join: return the next key that appears in all of
+ *	the joined secondary cursors and, unless DB_JOIN_ITEM was given,
+ *	look up and return that key's datum from the primary.
+ *
+ * Fix: the "partial reset" path below failed to capture the return of
+ * __dbc_dup into ret, so a failed cursor dup was reported as the stale
+ * DB_NOTFOUND (i.e. as normal join exhaustion) instead of an error.
+ */
+static int
+__db_join_get(dbc, key_arg, data_arg, flags)
+	DBC *dbc;
+	DBT *key_arg, *data_arg;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DBC *cp;
+	DBT *key_n, key_n_mem;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	int db_manage_data, ret;
+	u_int32_t i, j, operation, opmods;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	jc = (JOIN_CURSOR *)dbc->internal;
+
+	operation = LF_ISSET(DB_OPFLAGS_MASK);
+
+	/* !!!
+	 * If the set of flags here changes, check that __db_join_primget
+	 * is updated to handle them properly.
+	 */
+	opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+
+	/*
+	 * Since we are fetching the key as a datum in the secondary indices,
+	 * we must be careful of caller-specified DB_DBT_* memory
+	 * management flags.  If necessary, use a stack-allocated DBT;
+	 * we'll appropriately copy and/or allocate the data later.
+	 */
+	if (F_ISSET(key_arg,
+	    DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) {
+		/* We just use the default buffer; no need to go malloc. */
+		key_n = &key_n_mem;
+		memset(key_n, 0, sizeof(DBT));
+	} else {
+		/*
+		 * Either DB_DBT_REALLOC or the default buffer will work
+		 * fine if we have to reuse it, as we do.
+		 */
+		key_n = key_arg;
+	}
+	if (F_ISSET(key_arg, DB_DBT_USERCOPY))
+		key_arg->data = NULL;
+
+	/*
+	 * If our last attempt to do a get on the primary key failed,
+	 * short-circuit the join and try again with the same key.
+	 */
+	if (F_ISSET(jc, JOIN_RETRY))
+		goto samekey;
+	F_CLR(jc, JOIN_RETRY);
+
+retry:	ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n,
+	    opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT));
+
+	if (ret == DB_BUFFER_SMALL) {
+		jc->j_key.ulen <<= 1;
+		if ((ret = __os_realloc(env,
+		    jc->j_key.ulen, &jc->j_key.data)) != 0)
+			goto mem_err;
+		goto retry;
+	}
+
+	/*
+	 * If ret == DB_NOTFOUND, we're out of elements of the first
+	 * secondary cursor.  This is how we finally finish the join
+	 * if all goes well.
+	 */
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * If jc->j_exhausted[0] == 1, we've just advanced the first cursor,
+	 * and we're going to want to advance all the cursors that point to
+	 * the first member of a duplicate duplicate set (j_fdupcurs[1..N]).
+	 * Close all the cursors in j_fdupcurs; we'll reopen them the
+	 * first time through the upcoming loop.
+	 */
+	for (i = 1; i < jc->j_ncurs; i++) {
+		if (jc->j_fdupcurs[i] != NULL &&
+		    (ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+			goto err;
+		jc->j_fdupcurs[i] = NULL;
+	}
+
+	/*
+	 * If jc->j_curslist[1] == NULL, we have only one cursor in the join.
+	 * Thus, we can safely increment that one cursor on each call
+	 * to __db_join_get, and we signal this by setting jc->j_exhausted[0]
+	 * right away.
+	 *
+	 * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't
+	 * increment it until we know we're ready to.
+	 */
+	if (jc->j_curslist[1] == NULL)
+		jc->j_exhausted[0] = 1;
+	else
+		jc->j_exhausted[0] = 0;
+
+	/* We have the first element; now look for it in the other cursors. */
+	for (i = 1; i < jc->j_ncurs; i++) {
+		DB_ASSERT(env, jc->j_curslist[i] != NULL);
+		if (jc->j_workcurs[i] == NULL)
+			/* If this is NULL, we need to dup curslist into it. */
+			if ((ret = __dbc_dup(jc->j_curslist[i],
+			    &jc->j_workcurs[i], DB_POSITION)) != 0)
+				goto err;
+
+retry2:		cp = jc->j_workcurs[i];
+
+		if ((ret = __db_join_getnext(cp, &jc->j_key, key_n,
+		    jc->j_exhausted[i], opmods)) == DB_NOTFOUND) {
+			/*
+			 * jc->j_workcurs[i] has no more of the datum we're
+			 * interested in.  Go back one cursor and get
+			 * a new dup.  We can't just move to a new
+			 * element of the outer relation, because that way
+			 * we might miss duplicate duplicates in cursor i-1.
+			 *
+			 * If this takes us back to the first cursor,
+			 * -then- we can move to a new element of the outer
+			 * relation.
+			 */
+			--i;
+			jc->j_exhausted[i] = 1;
+
+			if (i == 0) {
+				for (j = 1; jc->j_workcurs[j] != NULL; j++) {
+					/*
+					 * We're moving to a new element of
+					 * the first secondary cursor.  If
+					 * that cursor is sorted, then any
+					 * other sorted cursors can be safely
+					 * reset to the first duplicate
+					 * duplicate in the current set if we
+					 * have a pointer to it (we can't just
+					 * leave them be, or we'll miss
+					 * duplicate duplicates in the outer
+					 * relation).
+					 *
+					 * If the first cursor is unsorted, or
+					 * if cursor j is unsorted, we can
+					 * make no assumptions about what
+					 * we're looking for next or where it
+					 * will be, so we reset to the very
+					 * beginning (setting workcurs NULL
+					 * will achieve this next go-round).
+					 *
+					 * XXX: This is likely to break
+					 * horribly if any two cursors are
+					 * both sorted, but have different
+					 * specified sort functions.  For,
+					 * now, we dismiss this as pathology
+					 * and let strange things happen--we
+					 * can't make rope childproof.
+					 */
+					if ((ret = __dbc_close(
+					    jc->j_workcurs[j])) != 0)
+						goto err;
+					if (!SORTED_SET(jc, 0) ||
+					    !SORTED_SET(jc, j) ||
+					    jc->j_fdupcurs[j] == NULL)
+						/*
+						 * Unsafe conditions;
+						 * reset fully.
+						 */
+						jc->j_workcurs[j] = NULL;
+					else
+						/*
+						 * Partial reset suffices.
+						 * Capture the return value so
+						 * a failed dup is reported as
+						 * an error, not as the stale
+						 * DB_NOTFOUND from above.
+						 */
+						if ((ret = __dbc_dup(
+						    jc->j_fdupcurs[j],
+						    &jc->j_workcurs[j],
+						    DB_POSITION)) != 0)
+							goto err;
+					jc->j_exhausted[j] = 0;
+				}
+				goto retry;
+				/* NOTREACHED */
+			}
+
+			/*
+			 * We're about to advance the cursor and need to
+			 * reset all of the workcurs[j] where j>i, so that
+			 * we don't miss any duplicate duplicates.
+			 */
+			for (j = i + 1;
+			    jc->j_workcurs[j] != NULL;
+			    j++) {
+				if ((ret =
+				    __dbc_close(jc->j_workcurs[j])) != 0)
+					goto err;
+				jc->j_exhausted[j] = 0;
+				if (jc->j_fdupcurs[j] == NULL)
+					jc->j_workcurs[j] = NULL;
+				else if ((ret = __dbc_dup(jc->j_fdupcurs[j],
+				    &jc->j_workcurs[j], DB_POSITION)) != 0)
+					goto err;
+			}
+			goto retry2;
+			/* NOTREACHED */
+		}
+
+		if (ret == DB_BUFFER_SMALL) {
+			jc->j_key.ulen <<= 1;
+			if ((ret = __os_realloc(env, jc->j_key.ulen,
+			    &jc->j_key.data)) != 0) {
+mem_err:			__db_errx(env,
+				    "Allocation failed for join key, len = %lu",
+				    (u_long)jc->j_key.ulen);
+				goto err;
+			}
+			goto retry2;
+		}
+
+		if (ret != 0)
+			goto err;
+
+		/*
+		 * If we made it this far, we've found a matching
+		 * datum in cursor i.  Mark the current cursor
+		 * unexhausted, so we don't miss any duplicate
+		 * duplicates the next go-round--unless this is the
+		 * very last cursor, in which case there are none to
+		 * miss, and we'll need that exhausted flag to finally
+		 * get a DB_NOTFOUND and move on to the next datum in
+		 * the outermost cursor.
+		 */
+		if (i + 1 != jc->j_ncurs)
+			jc->j_exhausted[i] = 0;
+		else
+			jc->j_exhausted[i] = 1;
+
+		/*
+		 * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are
+		 * sorted, then we're here for the first time since advancing
+		 * cursor 0, and we have a new datum of interest.
+		 * jc->j_workcurs[i] points to the beginning of a set of
+		 * duplicate duplicates;  store this into jc->j_fdupcurs[i].
+		 */
+		if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret =
+		    __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0)
+			goto err;
+	}
+
+err:	if (ret != 0)
+		return (ret);
+
+	if (0) {
+samekey:	/*
+		 * Get the key we tried and failed to return last time;
+		 * it should be the current datum of all the secondary cursors.
+		 */
+		if ((ret = __dbc_get(jc->j_workcurs[0],
+		    &jc->j_key, key_n, DB_CURRENT | opmods)) != 0)
+			return (ret);
+		F_CLR(jc, JOIN_RETRY);
+	}
+
+	/*
+	 * ret == 0;  we have a key to return.
+	 *
+	 * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key
+	 * back into the dbt we were given for the key; call __db_retcopy.
+	 * Otherwise, assert that we do not need to copy anything and proceed.
+	 */
+	DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+	    DB_DBT_USERCOPY) || key_n == key_arg);
+
+	if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC |
+	    DB_DBT_USERCOPY)) &&
+	    (ret = __db_retcopy(env,
+	    key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) {
+		/*
+		 * The retcopy failed, most commonly because we have a user
+		 * buffer for the key which is too small. Set things up to
+		 * retry next time, and return.
+		 */
+		F_SET(jc, JOIN_RETRY);
+		return (ret);
+	}
+
+	/*
+	 * If DB_JOIN_ITEM is set, we return it;  otherwise we do the lookup
+	 * in the primary and then return.
+	 */
+	if (operation == DB_JOIN_ITEM)
+		return (0);
+
+	/*
+	 * If data_arg->flags == 0--that is, if DB is managing the
+	 * data DBT's memory--it's not safe to just pass the DBT
+	 * through to the primary get call, since we don't want that
+	 * memory to belong to the primary DB handle (and if the primary
+	 * is free-threaded, it can't anyway).
+	 *
+	 * Instead, use memory that is managed by the join cursor, in
+	 * jc->j_rdata.
+	 */
+	if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC |
+	    DB_DBT_USERMEM | DB_DBT_USERCOPY))
+		db_manage_data = 1;
+	else
+		db_manage_data = 0;
+	if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info,
+	    jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n,
+	    db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) {
+		if (ret == DB_NOTFOUND) {
+			if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+			    (jc->j_curslist[0]->txn != NULL && F_ISSET(
+			    jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED)))
+				goto retry;
+			/*
+			 * If ret == DB_NOTFOUND, the primary and secondary
+			 * are out of sync;  every item in each secondary
+			 * should correspond to something in the primary,
+			 * or we shouldn't have done the join this way.
+			 * Wail.
+			 */
+			ret = __db_secondary_corrupt(jc->j_primary);
+		} else
+			/*
+			 * The get on the primary failed for some other
+			 * reason, most commonly because we're using a user
+			 * buffer that's not big enough.  Flag our failure
+			 * so we can return the same key next time.
+			 */
+			F_SET(jc, JOIN_RETRY);
+	}
+	if (db_manage_data && ret == 0) {
+		data_arg->data = jc->j_rdata.data;
+		data_arg->size = jc->j_rdata.size;
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_join_close --
+ *	DBC->close for join cursors.
+ *
+ * Unlinks the cursor from the primary's join queue, closes all scratch
+ * cursors, and frees every allocation made when the join was created.
+ *
+ * PUBLIC: int __db_join_close __P((DBC *));
+ */
+int
+__db_join_close(dbc)
+	DBC *dbc;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	int ret, t_ret;
+	u_int32_t i;
+
+	jc = (JOIN_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	env = dbp->env;
+	ret = t_ret = 0;
+
+	/*
+	 * Remove from active list of join cursors.  Note that this
+	 * must happen before any action that can fail and return, or else
+	 * __db_close may loop indefinitely.
+	 */
+	MUTEX_LOCK(env, dbp->mutex);
+	TAILQ_REMOVE(&dbp->join_queue, dbc, links);
+	MUTEX_UNLOCK(env, dbp->mutex);
+
+	ENV_ENTER(env, ip);
+	/*
+	 * Close any open scratch cursors.  In each case, there may
+	 * not be as many outstanding as there are cursors in
+	 * curslist, but we want to close whatever's there.
+	 *
+	 * If any close fails, there's no reason not to close everything else;
+	 * we'll just return the error code of the last one to fail.  There's
+	 * not much the caller can do anyway, since these cursors only exist
+	 * hanging off a db-internal data structure that they shouldn't be
+	 * mucking with.
+	 */
+	for (i = 0; i < jc->j_ncurs; i++) {
+		if (jc->j_workcurs[i] != NULL &&
+		    (t_ret = __dbc_close(jc->j_workcurs[i])) != 0)
+			ret = t_ret;
+		if (jc->j_fdupcurs[i] != NULL &&
+		    (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0)
+			ret = t_ret;
+	}
+	ENV_LEAVE(env, ip);
+
+	/* Release everything the join allocated, then the cursor itself. */
+	__os_free(env, jc->j_exhausted);
+	__os_free(env, jc->j_curslist);
+	__os_free(env, jc->j_workcurs);
+	__os_free(env, jc->j_fdupcurs);
+	__os_free(env, jc->j_key.data);
+	/* j_rdata was allocated by __db_retcopy on the user's behalf. */
+	if (jc->j_rdata.data != NULL)
+		__os_ufree(env, jc->j_rdata.data);
+	__os_free(env, jc);
+	__os_free(env, dbc);
+
+	return (ret);
+}
+
+/*
+ * __db_join_getnext --
+ *	This function replaces the DBC_CONTINUE and DBC_KEYSET
+ *	functionality inside the various cursor get routines.
+ *
+ *	If exhausted == 0, we're not done with the current datum;
+ *	return it if it matches "matching", otherwise search
+ *	using DB_GET_BOTHC (which is faster than iteratively doing
+ *	DB_NEXT_DUP) forward until we find one that does.
+ *
+ *	If exhausted == 1, we are done with the current datum, so just
+ *	leap forward to searching NEXT_DUPs.
+ *
+ *	If no matching datum exists, returns DB_NOTFOUND, else 0.
+ *
+ * Fix: the malloc'ed ldata buffer is now freed even when __db_retcopy
+ * fails; the original code leaked it on that path.
+ */
+static int
+__db_join_getnext(dbc, key, data, exhausted, opmods)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t exhausted, opmods;
+{
+	int ret, cmp;
+	DB *dbp;
+	DBT ldata;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+
+	dbp = dbc->dbp;
+	func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+
+	switch (exhausted) {
+	case 0:
+		/*
+		 * We don't want to step on data->data;  use a new
+		 * DBT and malloc so we don't step on dbc's rdata memory.
+		 */
+		memset(&ldata, 0, sizeof(DBT));
+		F_SET(&ldata, DB_DBT_MALLOC);
+		if ((ret = __dbc_get(dbc,
+		    key, &ldata, opmods | DB_CURRENT)) != 0)
+			break;
+		cmp = func(dbp, data, &ldata);
+		if (cmp == 0) {
+			/*
+			 * We have to return the real data value.  Copy
+			 * it into data, then free the buffer we malloc'ed
+			 * above.  Free it on the error path too, or it
+			 * would leak when __db_retcopy fails.
+			 */
+			ret = __db_retcopy(dbp->env, data, ldata.data,
+			    ldata.size, &data->data, &data->size);
+			__os_ufree(dbp->env, ldata.data);
+			return (ret);
+		}
+
+		/*
+		 * Didn't match--we want to fall through and search future
+		 * dups.  We just forget about ldata and free
+		 * its buffer--data contains the value we're searching for.
+		 */
+		__os_ufree(dbp->env, ldata.data);
+		/* FALLTHROUGH */
+	case 1:
+		ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC);
+		break;
+	default:
+		ret = EINVAL;
+		break;
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_join_cmp --
+ *	Comparison function for sorting DBCs in cardinality order.
+ *
+ * Returns a negative, zero or positive value as cursor a's duplicate
+ * count is less than, equal to or greater than cursor b's.  If either
+ * count cannot be fetched we return "equal" and leave the order alone.
+ *
+ * Fix: compare the counts explicitly rather than returning
+ * (long)counta - (long)countb, which could overflow/truncate when long
+ * is 32 bits and the db_recno_t counts are large.
+ */
+static int
+__db_join_cmp(a, b)
+	const void *a, *b;
+{
+	DBC *dbca, *dbcb;
+	db_recno_t counta, countb;
+
+	dbca = *((DBC * const *)a);
+	dbcb = *((DBC * const *)b);
+
+	if (__dbc_count(dbca, &counta) != 0 ||
+	    __dbc_count(dbcb, &countb) != 0)
+		return (0);
+
+	if (counta < countb)
+		return (-1);
+	if (counta > countb)
+		return (1);
+	return (0);
+}
+
+/*
+ * __db_join_primget --
+ *	Perform a DB->get in the primary, being careful not to use a new
+ * locker ID if we're doing CDB locking.
+ *
+ * Opens a transient cursor under the caller's locker, propagates the
+ * isolation flags onto it, does a DB_SET get, and closes the cursor.
+ */
+static int
+__db_join_primget(dbp, ip, txn, locker, key, data, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DB_LOCKER *locker;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	u_int32_t rmw;
+	int ret, t_ret;
+
+	/* Reuse the join's locker so CDB locking doesn't self-deadlock. */
+	if ((ret = __db_cursor_int(dbp, ip,
+	    txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0)
+		return (ret);
+
+	/*
+	 * The only allowable flags here are the two flags copied into "opmods"
+	 * in __db_join_get, DB_RMW and DB_READ_UNCOMMITTED.  The former is an
+	 * op on the c_get call, the latter on the cursor call.  It's a DB bug
+	 * if we allow any other flags down in here.
+	 */
+	rmw = LF_ISSET(DB_RMW);
+	if (LF_ISSET(DB_READ_UNCOMMITTED) ||
+	    (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED)))
+		F_SET(dbc, DBC_READ_UNCOMMITTED);
+
+	if (LF_ISSET(DB_READ_COMMITTED) ||
+	    (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED)))
+		F_SET(dbc, DBC_READ_COMMITTED);
+
+	/* After clearing the known modifiers nothing may remain. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	DB_ASSERT(dbp->env, flags == 0);
+
+	F_SET(dbc, DBC_TRANSIENT);
+
+	/*
+	 * This shouldn't be necessary, thanks to the fact that join cursors
+	 * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's
+	 * sake, we mirror what __db_get does.
+	 */
+	SET_RET_MEM(dbc, dbp);
+
+	ret = __dbc_get(dbc, key, data, DB_SET | rmw);
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_secondary_corrupt --
+ *	Report primary/secondary inconsistencies.
+ *
+ * Logs a DB_SECONDARY_BAD message naming the database (file and, when
+ * present, subdatabase) and returns DB_SECONDARY_BAD to the caller.
+ *
+ * PUBLIC: int __db_secondary_corrupt __P((DB *));
+ */
+int
+__db_secondary_corrupt(dbp)
+	DB *dbp;
+{
+	const char *dname, *fname, *sep;
+
+	fname = dbp->fname == NULL ? "unnamed" : dbp->fname;
+	if (dbp->dname == NULL) {
+		sep = "";
+		dname = "";
+	} else {
+		sep = "/";
+		dname = dbp->dname;
+	}
+	__db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s", fname, sep, dname);
+	return (DB_SECONDARY_BAD);
+}
diff --git a/db/db_meta.c b/db/db_meta.c
new file mode 100644
index 0000000..ef42e44
--- /dev/null
+++ b/db/db_meta.c
@@ -0,0 +1,1299 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+
+static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+static int __db_pglistcmp __P((const void *, const void *));
+static int __db_truncate_freelist __P((DBC *, DBMETA *,
+ PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __db_init_meta --
+ *	Helper function for __db_new that initializes the important fields in
+ * a meta-data page (used instead of P_INIT).  We need to make sure that we
+ * retain the page number and LSN of the existing page.
+ */
+static void
+__db_init_meta(dbp, p, pgno, pgtype)
+	DB *dbp;
+	void *p;
+	db_pgno_t pgno;
+	u_int32_t pgtype;
+{
+	DBMETA *m;
+	DB_LSN orig_lsn;
+
+	m = (DBMETA *)p;
+
+	/* Clear the page while preserving the LSN already on it. */
+	orig_lsn = m->lsn;
+	memset(m, 0, sizeof(DBMETA));
+	m->lsn = orig_lsn;
+
+	m->pgno = pgno;
+	m->type = (u_int8_t)pgtype;
+	m->pagesize = dbp->pgsize;
+	if (F_ISSET(dbp, DB_AM_CHKSUM))
+		FLD_SET(m->metaflags, DBMETA_CHKSUM);
+}
+
+/*
+ * __db_new --
+ *	Get a new page, preferably from the freelist.
+ *
+ * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+ */
+int
+__db_new(dbc, type, lockp, pagepp)
+	DBC *dbc;
+	u_int32_t type;
+	DB_LOCK *lockp;
+	PAGE **pagepp;
+{
+	DB *dbp;
+	DBMETA *meta;
+	DB_LOCK metalock;
+	DB_LSN lsn;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	db_pgno_t last, *list, pgno, newnext;
+	int extend, hash, ret, t_ret;
+
+	meta = NULL;
+	dbp = dbc->dbp;
+	env = dbp->env;
+	mpf = dbp->mpf;
+	h = NULL;
+	newnext = PGNO_INVALID;
+	if (lockp != NULL)
+		LOCK_INIT(*lockp);
+
+	hash = 0;
+	ret = 0;
+	LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+	/*
+	 * Hash keeps its own reference to the metadata page; reuse it so
+	 * we don't fetch and lock the base metadata page a second time.
+	 */
+	if (dbp->type == DB_HASH) {
+		if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+			goto err;
+		if (meta != NULL)
+			hash = 1;
+	}
+#endif
+	if (meta == NULL) {
+		pgno = PGNO_BASE_MD;
+		if ((ret = __db_lget(dbc,
+		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+			goto err;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+		    DB_MPOOL_DIRTY, &meta)) != 0)
+			goto err;
+	}
+
+	last = meta->last_pgno;
+	if (meta->free == PGNO_INVALID) {
+		/*
+		 * The freelist is empty: we'd have to grow the file.  The
+		 * caller can forbid that with P_DONTEXTEND.
+		 */
+		if (FLD_ISSET(type, P_DONTEXTEND)) {
+			*pagepp = NULL;
+			goto err;
+		}
+		last = pgno = meta->last_pgno + 1;
+		ZERO_LSN(lsn);
+		extend = 1;
+	} else {
+		pgno = meta->free;
+		/*
+		 * Lock the new page.  Do this here because we must do it
+		 * before getting the page and the caller may need the lock
+		 * to keep readers from seeing the page before the transaction
+		 * commits.  We can do this because no one will hold a free
+		 * page locked.
+		 */
+		if (lockp != NULL && (ret =
+		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+			goto err;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+		    DB_MPOOL_DIRTY, &h)) != 0)
+			goto err;
+
+		/*
+		 * We want to take the first page off the free list and
+		 * then set meta->free to the that page's next_pgno, but
+		 * we need to log the change first.
+		 */
+		newnext = h->next_pgno;
+		lsn = h->lsn;
+		extend = 0;
+		/* Diagnostic builds assert; production builds panic below. */
+		DB_ASSERT(env, TYPE(h) == P_INVALID);
+
+		if (TYPE(h) != P_INVALID) {
+			__db_errx(env,
+			    "%s page %lu is on free list with type %lu",
+				dbp->fname, (u_long)PGNO(h), (u_long)TYPE(h));
+			return (__env_panic(env, EINVAL));
+		}
+
+	}
+
+	FLD_CLR(type, P_DONTEXTEND);
+
+	/*
+	 * Log the allocation before fetching the new page.  If we
+	 * don't have room in the log then we don't want to tell
+	 * mpool to extend the file.
+	 */
+	if (DBC_LOGGING(dbc)) {
+		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
+		    &LSN(meta), PGNO_BASE_MD, &lsn,
+		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
+			goto err;
+	} else
+		LSN_NOT_LOGGED(LSN(meta));
+
+	meta->free = newnext;
+
+	if (extend == 1) {
+		/* Grow the file: lock and create the brand-new last page. */
+		if (lockp != NULL && (ret =
+		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+			goto err;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+		    DB_MPOOL_NEW, &h)) != 0)
+			goto err;
+		DB_ASSERT(env, last == pgno);
+		meta->last_pgno = pgno;
+		ZERO_LSN(h->lsn);
+		h->pgno = pgno;
+	}
+	LSN(h) = LSN(meta);
+
+	/* The hash meta page is owned elsewhere; only put our own fget. */
+	if (hash == 0)
+		ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+	meta = NULL;
+	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+		ret = t_ret;
+	if (ret != 0)
+		goto err;
+
+	switch (type) {
+	case P_BTREEMETA:
+	case P_HASHMETA:
+	case P_QAMMETA:
+		__db_init_meta(dbp, h, h->pgno, type);
+		break;
+	default:
+		P_INIT(h, dbp->pgsize,
+		    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+		break;
+	}
+
+	/* Fix up the sorted free list if necessary. */
+#ifdef HAVE_FTRUNCATE
+	if (extend == 0) {
+		u_int32_t nelems = 0;
+
+		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
+			goto err;
+		if (nelems != 0) {
+			/* We allocated the smallest page; drop list[0]. */
+			DB_ASSERT(env, h->pgno == list[0]);
+			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
+			if ((ret = __memp_extend_freelist(
+			    dbp->mpf, nelems - 1, &list)) != 0)
+				goto err;
+		}
+	}
+#else
+	COMPQUIET(list, NULL);
+#endif
+
+	*pagepp = h;
+	return (0);
+
+err:	if (h != NULL)
+		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+	if (meta != NULL && hash == 0)
+		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+	(void)__TLPUT(dbc, metalock);
+	if (lockp != NULL)
+		(void)__LPUT(dbc, *lockp);
+	return (ret);
+}
+
+/*
+ * __db_free --
+ *	Add a page to the head of the freelist.
+ *
+ * With HAVE_FTRUNCATE the in-memory freelist is kept sorted so the file
+ * can be physically truncated when the freed page(s) fall at the end;
+ * otherwise the page is simply linked at the head of the on-disk list.
+ * On return the caller's reference to h has been discarded, even on error.
+ *
+ * PUBLIC: int __db_free __P((DBC *, PAGE *));
+ */
+int
+__db_free(dbc, h)
+	DBC *dbc;
+	PAGE *h;
+{
+	DB *dbp;
+	DBMETA *meta;
+	DBT ddbt, ldbt;
+	DB_LOCK metalock;
+	DB_LSN *lsnp;
+	DB_MPOOLFILE *mpf;
+	PAGE *prev;
+	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
+	u_int32_t lflag;
+	int hash, ret, t_ret;
+#ifdef HAVE_FTRUNCATE
+	db_pgno_t *list, *lp;
+	u_int32_t nelem, position, start;
+	int do_truncate;
+#endif
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	prev_pgno = PGNO_INVALID;
+	meta = NULL;
+	prev = NULL;
+	LOCK_INIT(metalock);
+#ifdef HAVE_FTRUNCATE
+	lp = NULL;
+	nelem = 0;
+	do_truncate = 0;
+#endif
+
+	/*
+	 * Retrieve the metadata page.  If we are not keeping a sorted
+	 * free list put the page at the head of the the free list.
+	 * If we are keeping a sorted free list, for truncation,
+	 * then figure out where this page belongs and either
+	 * link it in or truncate the file as much as possible.
+	 * If either the lock get or page get routines
+	 * fail, then we need to put the page with which we were called
+	 * back because our caller assumes we take care of it.
+	 */
+	hash = 0;
+
+	pgno = PGNO_BASE_MD;
+#ifdef HAVE_HASH
+	/* Hash keeps its own metadata reference; use it when available. */
+	if (dbp->type == DB_HASH) {
+		if ((ret = __ham_return_meta(dbc,
+#ifdef HAVE_FTRUNCATE
+		    0,
+#else
+		    DB_MPOOL_DIRTY,
+#endif
+		    &meta)) != 0)
+			goto err;
+		if (meta != NULL)
+			hash = 1;
+	}
+#endif
+	if (meta == NULL) {
+		if ((ret = __db_lget(dbc,
+		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+			goto err;
+
+		/* If we support truncate, we might not dirty the meta page. */
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+#ifdef HAVE_FTRUNCATE
+		    0,
+#else
+		    DB_MPOOL_DIRTY,
+#endif
+		    &meta)) != 0)
+			goto err1;
+	}
+
+	last_pgno = meta->last_pgno;
+	next_pgno = meta->free;
+	/*
+	 * Assign lsnp here so it always initialized when
+	 * HAVE_FTRUNCATE is not defined.
+	 */
+	lsnp = &LSN(meta);
+
+	DB_ASSERT(dbp->env, h->pgno != next_pgno);
+
+#ifdef HAVE_FTRUNCATE
+	/*
+	 * If we are maintaining a sorted free list see if we either have a
+	 * new truncation point or the page goes somewhere in the middle of
+	 * the list.  If it goes in the middle of the list, we will drop the
+	 * meta page and get the previous page.
+	 */
+	if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+		goto err1;
+	if (list == NULL)
+		goto no_sort;
+
+	if (h->pgno != last_pgno) {
+		/*
+		 * Put the page number in the sorted list.
+		 * Finds its position and the previous page,
+		 * extend the list, make room and insert.
+		 */
+		position = 0;
+		if (nelem != 0) {
+			__db_freelist_pos(h->pgno, list, nelem, &position);
+
+			DB_ASSERT(dbp->env, h->pgno != list[position]);
+
+			/* Get the previous page if this is not the smallest. */
+			if (position != 0 || h->pgno > list[0])
+				prev_pgno = list[position];
+		}
+
+	} else if (nelem != 0) {
+		/* Find the truncation point. */
+		for (lp = &list[nelem - 1]; lp >= list; lp--)
+			if (--last_pgno != *lp)
+				break;
+		if (lp < list || last_pgno < h->pgno - 1)
+			do_truncate = 1;
+		last_pgno = meta->last_pgno;
+	}
+
+no_sort:
+	if (prev_pgno == PGNO_INVALID) {
+		/* Updating the meta page itself; dirty it now. */
+#ifdef HAVE_HASH
+		if (hash) {
+			if ((ret =
+			    __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+				goto err1;
+		} else
+#endif
+		if ((ret = __memp_dirty(mpf,
+		    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			goto err1;
+		lsnp = &LSN(meta);
+	} else {
+		/* Linking after a mid-list page; fetch and log that page. */
+		pgno = prev_pgno;
+		if ((ret = __memp_fget(mpf, &pgno,
+		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
+			goto err1;
+		next_pgno = NEXT_PGNO(prev);
+		lsnp = &LSN(prev);
+	}
+#endif
+
+	/*
+	 * Log the change.
+	 * We are either logging an update to the metapage or to the
+	 * previous page in the sorted list.
+	 */
+	if (DBC_LOGGING(dbc)) {
+		memset(&ldbt, 0, sizeof(ldbt));
+		ldbt.data = h;
+		ldbt.size = P_OVERHEAD(dbp);
+		/*
+		 * If we are truncating the file, we need to make sure
+		 * the logging happens before the truncation.  If we
+		 * are truncating multiple pages we don't need to flush the
+		 * log here as it will be flushed by __db_truncate_freelist.
+		 * If we are zeroing pages rather than truncating we still
+		 * need to flush since they will not have valid LSNs.
+		 */
+		lflag = 0;
+
+		if (h->pgno == last_pgno
+#ifdef HAVE_FTRUNCATE
+		    && do_truncate == 0
+#endif
+		    )
+			lflag = DB_FLUSH;
+		switch (h->type) {
+		case P_HASH:
+		case P_IBTREE:
+		case P_IRECNO:
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			if (h->entries > 0) {
+				ldbt.size += h->entries * sizeof(db_indx_t);
+				ddbt.data = (u_int8_t *)h + HOFFSET(h);
+				ddbt.size = dbp->pgsize - HOFFSET(h);
+				if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
+				     lsnp, lflag,
+				     h->pgno, lsnp, pgno,
+				     &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
+					goto err1;
+				goto logged;
+			}
+			break;
+		case P_HASHMETA:
+			ldbt.size = sizeof(HMETA);
+			break;
+		case P_BTREEMETA:
+			ldbt.size = sizeof(BTMETA);
+			break;
+		case P_OVERFLOW:
+			ldbt.size += OV_LEN(h);
+			break;
+		default:
+			DB_ASSERT(dbp->env, h->type != P_QAMDATA);
+		}
+
+		if ((ret = __db_pg_free_log(dbp,
+		      dbc->txn, lsnp, lflag, h->pgno,
+		      lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
+			goto err1;
+	} else
+		LSN_NOT_LOGGED(*lsnp);
+
+logged:
+#ifdef HAVE_FTRUNCATE
+	if (do_truncate) {
+		/* Give the whole run of trailing free pages back to the OS. */
+		start = (u_int32_t) (lp - list) + 1;
+		meta->last_pgno--;
+		ret = __db_truncate_freelist(
+		    dbc, meta, h, list, start, nelem);
+		h = NULL;
+	} else
+#endif
+	if (h->pgno == last_pgno) {
+		/*
+		 * We are going to throw this page away, but if we are
+		 * using MVCC then this version may stick around and we
+		 * might have to make a copy.
+		 */
+		if (mpf->mfp->multiversion && (ret = __memp_dirty(mpf,
+		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			goto err1;
+		LSN(h) = *lsnp;
+		P_INIT(h, dbp->pgsize,
+		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+		if ((ret = __memp_fput(mpf,
+		    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+			goto err1;
+		h = NULL;
+		/* Give the page back to the OS. */
+		if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+		    last_pgno, 0)) != 0)
+			goto err1;
+		DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
+		meta->last_pgno--;
+		h = NULL;
+	} else {
+#ifdef HAVE_FTRUNCATE
+		if (list != NULL) {
+			/* Put the page number into the list. */
+			if ((ret =
+			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+				goto err1;
+			if (prev_pgno != PGNO_INVALID)
+				lp = &list[position + 1];
+			else
+				lp = list;
+			if (nelem != 0 && position != nelem)
+				memmove(lp + 1, lp, (size_t)
+				    ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
+			*lp = h->pgno;
+		}
+#endif
+		/*
+		 * If we are not truncating the page then we
+		 * reinitialize it and put it at the head of
+		 * the free list.
+		 */
+		if ((ret = __memp_dirty(mpf,
+		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			goto err1;
+		LSN(h) = *lsnp;
+		P_INIT(h, dbp->pgsize,
+		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+#ifdef DIAGNOSTIC
+		memset((u_int8_t *) h + P_OVERHEAD(dbp),
+		    CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
+#endif
+		if (prev_pgno == PGNO_INVALID)
+			meta->free = h->pgno;
+		else
+			NEXT_PGNO(prev) = h->pgno;
+	}
+
+	/* Discard the metadata or previous page. */
+err1:	if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+		ret = t_ret;
+	if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the caller's page reference. */
+err:	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * XXX
+	 * We have to unlock the caller's page in the caller!
+	 */
+	return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_freelist_pos -- find the position of a page in the freelist.
+ * The list is sorted, we do a binary search.
+ *
+ * On return *posp is the index of pgno if it is present in list;
+ * otherwise it is the index of the largest entry smaller than pgno
+ * (0 if pgno sorts before every entry).
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
+ * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *));
+ * PUBLIC: #endif
+ */
+void
+__db_freelist_pos(pgno, list, nelem, posp)
+ db_pgno_t pgno;
+ db_pgno_t *list;
+ u_int32_t nelem;
+ u_int32_t *posp;
+{
+ u_int32_t base, indx, lim;
+
+ indx = 0;
+ /* Classic binary search; an exact hit returns immediately. */
+ for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
+ indx = base + (lim >> 1);
+ if (pgno == list[indx]) {
+ *posp = indx;
+ return;
+ }
+ if (pgno > list[indx]) {
+ base = indx + 1;
+ --lim;
+ }
+ }
+ /* No exact match: step back so we report the predecessor's slot. */
+ if (base != 0)
+ base--;
+ *posp = base;
+ return;
+}
+
+/*
+ * __db_pglistcmp --
+ * qsort(3) comparator: order db_pglist_t entries by ascending
+ * page number.
+ */
+static int
+__db_pglistcmp(a, b)
+ const void *a, *b;
+{
+ db_pglist_t *ap, *bp;
+
+ ap = (db_pglist_t *)a;
+ bp = (db_pglist_t *)b;
+
+ return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
+}
+
+/*
+ * __db_freelist_sort -- sort a list of free pages.
+ * Orders the array by ascending page number (see __db_pglistcmp),
+ * which is the precondition for __db_freelist_pos and
+ * __db_pg_truncate.
+ * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+ */
+void
+__db_freelist_sort(list, nelems)
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
+}
+
+/*
+ * __db_pg_truncate -- find the truncation point in a sorted freelist.
+ *
+ * Given a freelist sorted by page number, determine the run of free
+ * pages that is contiguous with the end of the file, log the (possibly
+ * segmented) list, relink the surviving entries, and truncate the file.
+ * On return *nelemp is the number of entries that remain on the free
+ * list and *last_pgno is the new last page of the file.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
+ * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *,
+ * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_pg_truncate(dbc, txn,
+ list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
+ DBC *dbc;
+ DB_TXN *txn;
+ db_pglist_t *list;
+ DB_COMPACT *c_data;
+ u_int32_t *nelemp;
+ db_pgno_t free_pgno, *last_pgno;
+ DB_LSN *lsnp;
+ int in_recovery;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pglist_t *lp, *slp;
+ db_pgno_t lpgno, pgno;
+ u_int32_t elems, log_size, tpoint;
+ int last, ret;
+
+ ret = 0;
+ h = NULL;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ elems = tpoint = *nelemp;
+
+ /*
+ * Figure out what (if any) pages can be truncated immediately and
+ * record the place from which we can truncate, so we can do the
+ * memp_ftruncate below. We also use this to avoid ever putting
+ * these pages on the freelist, which we are about to relink.
+ */
+ /*
+ * Scan backward from the end of the sorted list: tpoint ends up as
+ * the count of entries that are NOT contiguous with the file end
+ * (i.e. the entries that must stay on the free list).
+ */
+ pgno = *last_pgno;
+ lp = &list[elems - 1];
+ last = 1;
+ while (tpoint != 0) {
+ if (lp->pgno != pgno)
+ break;
+ pgno--;
+ tpoint--;
+ lp--;
+ }
+
+ lp = list;
+ slp = &list[elems];
+ /*
+ * Log the sorted list. We log the whole list so it can be rebuilt.
+ * Don't overflow the log file. A segment is capped at half the log
+ * file size; "last" is clear while more segments remain.
+ */
+again: if (DBC_LOGGING(dbc)) {
+ last = 1;
+ lpgno = *last_pgno;
+ ddbt.size = elems * sizeof(*lp);
+ ddbt.data = lp;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ if (ddbt.size > log_size / 2) {
+ elems = (log_size / 2) / sizeof(*lp);
+ ddbt.size = elems * sizeof(*lp);
+ last = 0;
+ /*
+ * If we stopped after the truncation point
+ * then we need to truncate from here.
+ */
+ if (lp + elems >= &list[tpoint])
+ lpgno = lp[elems - 1].pgno;
+ }
+ /*
+ * If this is not the begining of the list fetch the end
+ * of the previous segment. This page becomes the last_free
+ * page and will link to this segment if it is not truncated.
+ */
+ if (lp != list) {
+ if ((ret = __memp_fget(mpf, &lp[-1].pgno,
+ dbc->thread_info, txn, 0, &h)) != 0)
+ goto err;
+ }
+
+ slp = &lp[elems];
+
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
+ lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
+ h != NULL ? &LSN(h) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (h != NULL) {
+ LSN(h) = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ }
+ h = NULL;
+ } else if (!in_recovery)
+ LSN_NOT_LOGGED(*lsnp);
+
+ /*
+ * Relink the surviving entries of this segment: each page's
+ * next pointer is set to the next list entry (PGNO_INVALID for
+ * the final survivor) and its LSN is updated.
+ */
+ for (; lp < slp && lp < &list[tpoint]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
+ txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
+ /* Page may have been truncated later. */
+ if (in_recovery && ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if (in_recovery) {
+ /* Only redo pages still at the logged LSN. */
+ if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
+ if ((ret = __memp_dirty(mpf, &h,
+ dbc->thread_info,
+ txn, dbp->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority);
+ goto err;
+ }
+ } else
+ goto skip;
+ }
+
+ if (lp == &list[tpoint - 1])
+ NEXT_PGNO(h) = PGNO_INVALID;
+ else
+ NEXT_PGNO(h) = lp[1].pgno;
+ DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);
+
+ LSN(h) = *lsnp;
+skip: if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /*
+ * If we did not log everything try again. We start from slp and
+ * try to go to the end of the list.
+ */
+ if (last == 0) {
+ elems = (u_int32_t)(&list[*nelemp] - slp);
+ lp = slp;
+ goto again;
+ }
+
+ /*
+ * Truncate the file. Its possible that the last page is the
+ * only one that got truncated and that's done in the caller.
+ */
+ if (pgno != *last_pgno) {
+ if (tpoint != *nelemp &&
+ (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
+ goto err;
+ if (c_data)
+ c_data->compact_pages_truncated += *last_pgno - pgno;
+ *last_pgno = pgno;
+ }
+ *nelemp = tpoint;
+
+ if (0) {
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ }
+ return (ret);
+}
+
+/*
+ * __db_free_truncate --
+ * Build a sorted free list and truncate free pages at the end
+ * of the file.
+ *
+ * Walks the on-disk free list from the metadata page, sorts it, calls
+ * __db_pg_truncate to cut the tail, and updates meta->free/last_pgno.
+ * If listp is non-NULL and entries survive, ownership of the sorted
+ * list array is transferred to the caller via *listp/*nelemp (the
+ * caller must free it with __os_free).
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
+ * PUBLIC: db_pgno_t *));
+ * PUBLIC: #endif
+ */
+int
+__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ DB_COMPACT *c_data;
+ db_pglist_t **listp;
+ u_int32_t *nelemp;
+ db_pgno_t *last_pgnop;
+{
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pglist_t *list, *lp;
+ db_pgno_t pgno;
+ u_int32_t nelems;
+ int ret, t_ret;
+ size_t size;
+
+ COMPQUIET(flags, 0);
+ list = NULL;
+ meta = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ nelems = 0;
+ if (listp != NULL) {
+ *listp = NULL;
+ DB_ASSERT(env, nelemp != NULL);
+ *nelemp = 0;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ /* Write-lock and pin the base metadata page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
+ &meta)) != 0)
+ goto err;
+
+ if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+ if ((pgno = meta->free) == PGNO_INVALID)
+ goto done;
+
+ /*
+ * Walk the unsorted free-page chain, recording each page's number,
+ * next pointer and on-page LSN; the array grows by doubling.
+ */
+ size = 128;
+ if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = list;
+
+ do {
+ if (lp == &list[size]) {
+ size *= 2;
+ if ((ret = __os_realloc(env,
+ size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = &list[size / 2];
+ }
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ lp->pgno = pgno;
+ lp->next_pgno = NEXT_PGNO(h);
+ lp->lsn = LSN(h);
+ pgno = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ lp++;
+ } while (pgno != PGNO_INVALID);
+ nelems = (u_int32_t)(lp - list);
+
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Sort the list */
+ __db_freelist_sort(list, nelems);
+
+ if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
+ &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
+ goto err;
+
+ if (nelems == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = list[0].pgno;
+
+done: if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+
+ /*
+ * The truncate point is the number of pages in the free
+ * list back from the last page. The number of pages
+ * in the free list are the number that we can swap in.
+ */
+ if (c_data)
+ c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
+
+ /* Hand the sorted list to the caller, who becomes its owner. */
+ if (nelems != 0 && listp != NULL) {
+ *listp = list;
+ *nelemp = nelems;
+ list = NULL;
+ }
+
+err: if (list != NULL)
+ __os_free(env, list);
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __db_truncate_freelist --
+ * Truncate the tail of a sorted freelist, starting at list[start].
+ * The caller passes the metadata page (meta), the page h that is being
+ * freed (always consumed: it is discarded at low priority on success),
+ * and the freelist page-number array. Entries [start, nelem) are
+ * logged (in log-size-bounded segments), the file is truncated at
+ * list[start], and the in-memory freelist is shrunk to start entries.
+ */
+static int
+__db_truncate_freelist(dbc, meta, h, list, start, nelem)
+ DBC *dbc;
+ DBMETA *meta;
+ PAGE *h;
+ db_pgno_t *list;
+ u_int32_t start, nelem;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *last_free, *pg;
+ db_pgno_t *lp, free_pgno, lpgno;
+ db_pglist_t *plist, *pp, *spp;
+ u_int32_t elem, log_size;
+ int last, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ plist = NULL;
+ last_free = NULL;
+ pg = NULL;
+
+ /*
+ * last_free is the page that will terminate the surviving list
+ * (the entry just before the truncation point), if any.
+ */
+ if (start != 0 &&
+ (ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ /* Snapshot pgno/lsn/next for every page being truncated. */
+ if ((ret = __os_malloc(dbp->env,
+ (nelem - start) * sizeof(*pp), &plist)) != 0)
+ goto err;
+
+ pp = plist;
+ for (lp = &list[start]; lp < &list[nelem]; lp++) {
+ pp->pgno = *lp;
+ if ((ret = __memp_fget(mpf, lp,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ pp->lsn = LSN(pg);
+ pp->next_pgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ pp++;
+ }
+ ZERO_LSN(null_lsn);
+ pp = plist;
+ elem = nelem - start;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ /* Log in segments of at most half the log file size. */
+again: ddbt.data = spp = pp;
+ free_pgno = pp->pgno;
+ lpgno = meta->last_pgno;
+ ddbt.size = elem * sizeof(*pp);
+ if (ddbt.size > log_size / 2) {
+ elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
+ ddbt.size = elem * sizeof(*pp);
+ pp += elem;
+ elem = (nelem - start) - (u_int32_t)(pp - plist);
+ lpgno = pp[-1].pgno;
+ last = 0;
+ } else
+ last = 1;
+ /*
+ * Get the page which will link to this section if we abort.
+ * If this is the first segment then its last_free.
+ */
+ if (spp == plist)
+ pg = last_free;
+ else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ &LSN(meta), last == 1 ? DB_FLUSH : 0,
+ PGNO(meta), &LSN(meta),
+ pg != NULL ? PGNO(pg) : PGNO_INVALID,
+ pg != NULL ? &LSN(pg) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (pg != NULL) {
+ LSN(pg) = LSN(meta);
+ if (pg != last_free && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ }
+ if (last == 0)
+ goto again;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ /* Discard h cheaply, then give the tail pages back to the OS. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ h = NULL;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ list[start], 0)) != 0)
+ goto err;
+ meta->last_pgno = list[start] - 1;
+
+ /* Terminate (or empty) the surviving free list. */
+ if (start == 0)
+ meta->free = PGNO_INVALID;
+ else {
+ NEXT_PGNO(last_free) = PGNO_INVALID;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority)) != 0)
+ goto err;
+ last_free = NULL;
+ }
+
+ /* Shrink the number of elements in the list. */
+ ret = __memp_extend_freelist(mpf, start, &list);
+
+err: if (plist != NULL)
+ __os_free(dbp->env, plist);
+
+ /* We need to put the page on error. */
+ if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (pg != NULL && pg != last_free)
+ (void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
+ if (last_free != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority);
+
+ return (ret);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * __db_lprint --
+ * Print out the list of locks currently held by a cursor.
+ * Debug-only helper: issues a DB_LOCK_DUMP request for the cursor's
+ * locker; a no-op when locking is not configured. Always returns 0.
+ *
+ * PUBLIC: int __db_lprint __P((DBC *));
+ */
+int
+__db_lprint(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_LOCKREQ req;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (LOCKING_ON(env)) {
+ req.op = DB_LOCK_DUMP;
+ (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __db_lget --
+ * The standard lock get call.
+ *
+ * action is one of the LCK_* codes (e.g. LCK_ALWAYS, LCK_COUPLE,
+ * LCK_COUPLE_ALWAYS, LCK_ROLLBACK); *lockp is both the lock to couple
+ * from (when set) and the out-parameter for the newly acquired lock.
+ * DB_LOCK_NOTGRANTED is mapped to DB_LOCK_DEADLOCK unless the
+ * environment is configured with DB_ENV_TIME_NOTGRANTED.
+ *
+ * PUBLIC: int __db_lget __P((DBC *,
+ * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+ */
+int
+__db_lget(dbc, action, pgno, mode, lkflags, lockp)
+ DBC *dbc;
+ int action;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t lkflags;
+ DB_LOCK *lockp;
+{
+ DB *dbp;
+ DB_LOCKREQ couple[3], *reqp;
+ DB_TXN *txn;
+ ENV *env;
+ int has_timeout, i, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+ /*
+ * We do not always check if we're configured for locking before
+ * calling __db_lget to acquire the lock.
+ */
+ /*
+ * Skip locking entirely for: CDB environments, no lock subsystem,
+ * MVCC snapshot reads, DBC_DONTLOCK cursors, recovery (except
+ * rollback on a master), and off-page-duplicate cursors unless
+ * LCK_ALWAYS is requested.
+ */
+ if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
+ (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
+ dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
+ F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
+ (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
+ (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+
+ dbc->lock.pgno = pgno;
+ if (lkflags & DB_LOCK_RECORD)
+ dbc->lock.type = DB_RECORD_LOCK;
+ else
+ dbc->lock.type = DB_PAGE_LOCK;
+ lkflags &= ~DB_LOCK_RECORD;
+
+ /*
+ * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
+ * pass that along to the lock call.
+ */
+ if (DB_NONBLOCK(dbc))
+ lkflags |= DB_LOCK_NOWAIT;
+
+ if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
+ mode = DB_LOCK_READ_UNCOMMITTED;
+
+ has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
+ (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
+
+ /*
+ * Transactional locking.
+ * Hold on to the previous read lock only if we are in full isolation.
+ * COUPLE_ALWAYS indicates we are holding an interior node which need
+ * not be isolated.
+ * Downgrade write locks if we are supporting dirty readers.
+ */
+ if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
+ !LOCK_ISSET(*lockp))
+ action = 0;
+ else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc->dbp,
+ DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else
+ action = 0;
+
+ i = 0;
+ switch (action) {
+ default:
+ /* Plain get; timeouts force the vectored path below. */
+ if (has_timeout)
+ goto do_couple;
+ ret = __lock_get(env,
+ dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
+ break;
+
+ case LCK_DOWNGRADE:
+ /* First request: re-get the old lock as WWRITE. */
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].lock = *lockp;
+ couple[0].mode = DB_LOCK_WWRITE;
+ UMRW_SET(couple[0].timeout);
+ i++;
+ /* FALLTHROUGH */
+ case LCK_COUPLE:
+do_couple: couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
+ couple[i].obj = &dbc->lock_dbt;
+ couple[i].mode = mode;
+ UMRW_SET(couple[i].timeout);
+ i++;
+ /*
+ * NOTE(review): the timeout is always stored in couple[0];
+ * for the LCK_DOWNGRADE fall-through the timed GET sits at
+ * index 1 -- presumably intentional (downgrades don't time
+ * out), but worth confirming against upstream.
+ */
+ if (has_timeout)
+ couple[0].timeout =
+ F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
+ if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
+ couple[i].op = DB_LOCK_PUT;
+ couple[i].lock = *lockp;
+ i++;
+ }
+
+ ret = __lock_vec(env,
+ dbc->locker, lkflags, couple, i, &reqp);
+ /* On success (or failure past the GET), return the new lock. */
+ if (ret == 0 || reqp == &couple[i - 1])
+ *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
+ break;
+ }
+
+ if (txn != NULL && ret == DB_LOCK_DEADLOCK)
+ F_SET(txn, TXN_DEADLOCK);
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
+
+/*
+ * __db_lput --
+ * The standard lock put call.
+ *
+ * Either releases *lockp (LCK_COUPLE), downgrades a write lock to
+ * WWRITE when the handle supports dirty readers (LCK_DOWNGRADE), or
+ * keeps the lock for full isolation (no-op).
+ *
+ * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
+ */
+int
+__db_lput(dbc, lockp)
+ DBC *dbc;
+ DB_LOCK *lockp;
+{
+ DB_LOCKREQ couple[2], *reqp;
+ ENV *env;
+ int action, ret;
+
+ /*
+ * Transactional locking.
+ * Hold on to the read locks only if we are in full isolation.
+ * Downgrade write locks if we are supporting dirty readers.
+ */
+ if (F_ISSET(dbc->dbp,
+ DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else if (dbc->txn == NULL)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else
+ action = 0;
+
+ env = dbc->env;
+ switch (action) {
+ case LCK_COUPLE:
+ ret = __lock_put(env, lockp);
+ break;
+ case LCK_DOWNGRADE:
+ /* Atomically get a WWRITE lock and drop the WRITE lock. */
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].mode = DB_LOCK_WWRITE;
+ couple[0].lock = *lockp;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT;
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
+ if (ret == 0 || reqp == &couple[1])
+ *lockp = couple[0].lock;
+ break;
+ default:
+ /* Full isolation: hold the lock until commit/abort. */
+ ret = 0;
+ break;
+ }
+
+ return (ret);
+}
diff --git a/db/db_method.c b/db/db_method.c
new file mode 100644
index 0000000..1182f97
--- /dev/null
+++ b/db/db_method.c
@@ -0,0 +1,1052 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+#ifdef HAVE_RPC
+#ifdef HAVE_SYSTEM_INCLUDE_FILES
+#include <rpc/rpc.h>
+#endif
+#include "db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
+#endif
+
+static int __db_get_byteswapped __P((DB *, int *));
+static int __db_get_dbname __P((DB *, const char **, const char **));
+static DB_ENV *__db_get_env __P((DB *));
+static void __db_get_msgcall
+ __P((DB *, void (**)(const DB_ENV *, const char *)));
+static DB_MPOOLFILE *__db_get_mpf __P((DB *));
+static int __db_get_multiple __P((DB *));
+static int __db_get_transactional __P((DB *));
+static int __db_get_type __P((DB *, DBTYPE *dbtype));
+static int __db_init __P((DB *, u_int32_t));
+static int __db_get_alloc __P((DB *, void *(**)(size_t),
+ void *(**)(void *, size_t), void (**)(void *)));
+static int __db_set_alloc __P((DB *, void *(*)(size_t),
+ void *(*)(void *, size_t), void (*)(void *)));
+static int __db_get_append_recno __P((DB *,
+ int (**)(DB *, DBT *, db_recno_t)));
+static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t)));
+static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *));
+static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int));
+static int __db_get_create_dir __P((DB *, const char **));
+static int __db_set_create_dir __P((DB *, const char *));
+static int __db_get_dup_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __db_set_dup_compare
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+static int __db_get_encrypt_flags __P((DB *, u_int32_t *));
+static int __db_set_encrypt __P((DB *, const char *, u_int32_t));
+static int __db_get_feedback __P((DB *, void (**)(DB *, int, int)));
+static int __db_set_feedback __P((DB *, void (*)(DB *, int, int)));
+static void __db_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+static int __db_get_pagesize __P((DB *, u_int32_t *));
+static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int)));
+static int __db_set_priority __P((DB *, DB_CACHE_PRIORITY));
+static int __db_get_priority __P((DB *, DB_CACHE_PRIORITY *));
+static void __db_get_errcall __P((DB *,
+ void (**)(const DB_ENV *, const char *, const char *)));
+static void __db_set_errcall
+ __P((DB *, void (*)(const DB_ENV *, const char *, const char *)));
+static void __db_get_errfile __P((DB *, FILE **));
+static void __db_set_errfile __P((DB *, FILE *));
+static void __db_get_errpfx __P((DB *, const char **));
+static void __db_set_errpfx __P((DB *, const char *));
+static void __db_set_msgcall
+ __P((DB *, void (*)(const DB_ENV *, const char *)));
+static void __db_get_msgfile __P((DB *, FILE **));
+static void __db_set_msgfile __P((DB *, FILE *));
+static void __dbh_err __P((DB *, int, const char *, ...));
+static void __dbh_errx __P((DB *, const char *, ...));
+
+/*
+ * db_create --
+ * DB constructor.
+ * Public API entry point: validates flags (none are currently
+ * accepted), brackets the call with ENV_ENTER/ENV_LEAVE when an
+ * environment was supplied, and delegates to __db_create_internal.
+ *
+ * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t));
+ */
+int
+db_create(dbpp, dbenv, flags)
+ DB **dbpp;
+ DB_ENV *dbenv;
+ u_int32_t flags;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ ip = NULL;
+ env = dbenv == NULL ? NULL : dbenv->env;
+
+ /* Check for invalid function flags. */
+ if (flags != 0)
+ return (__db_ferr(env, "db_create", 0));
+
+ if (env != NULL)
+ ENV_ENTER(env, ip);
+ ret = __db_create_internal(dbpp, env, flags);
+ if (env != NULL)
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __db_create_internal --
+ * DB constructor internal routine.
+ * Allocates and initializes a DB handle; creates a private local
+ * environment (flagged ENV_DBLOCAL) when env is NULL. On success the
+ * new handle is returned via *dbpp with type DB_UNKNOWN.
+ *
+ * PUBLIC: int __db_create_internal __P((DB **, ENV *, u_int32_t));
+ */
+int
+__db_create_internal(dbpp, env, flags)
+ DB **dbpp;
+ ENV *env;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_ENV *dbenv;
+ DB_REP *db_rep;
+ int ret;
+
+ *dbpp = NULL;
+
+ /* If we don't have an environment yet, allocate a local one. */
+ if (env == NULL) {
+ if ((ret = db_env_create(&dbenv, 0)) != 0)
+ return (ret);
+ env = dbenv->env;
+ F_SET(env, ENV_DBLOCAL);
+ } else
+ dbenv = env->dbenv;
+
+ /* Allocate and initialize the DB handle. */
+ if ((ret = __os_calloc(env, 1, sizeof(*dbp), &dbp)) != 0)
+ goto err;
+
+ dbp->dbenv = env->dbenv;
+ dbp->env = env;
+ if ((ret = __db_init(dbp, flags)) != 0)
+ goto err;
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ ++env->db_ref;
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ /*
+ * Set the replication timestamp; it's 0 if we're not in a replicated
+ * environment. Don't acquire a lock to read the value, even though
+ * it's opaque: all we check later is value equality, nothing else.
+ */
+ dbp->timestamp = REP_ON(env) ?
+ ((REGENV *)env->reginfo->primary)->rep_timestamp : 0;
+ /*
+ * Set the replication generation number for fid management; valid
+ * replication generations start at 1. Don't acquire a lock to
+ * read the value. All we check later is value equality.
+ */
+ db_rep = env->rep_handle;
+ dbp->fid_gen = REP_ON(env) ? ((REP *)db_rep->region)->gen : 0;
+
+ /* If not RPC, open a backing DB_MPOOLFILE handle in the memory pool. */
+ if (!RPC_ON(dbenv) && (ret = __memp_fcreate(env, &dbp->mpf)) != 0)
+ goto err;
+
+ dbp->type = DB_UNKNOWN;
+
+ *dbpp = dbp;
+ return (0);
+
+err: if (dbp != NULL) {
+ if (dbp->mpf != NULL)
+ (void)__memp_fclose(dbp->mpf, 0);
+ __os_free(env, dbp);
+ }
+
+ /*
+ * NOTE(review): if __os_calloc failed, dbp is uninitialized here,
+ * and on later failures dbp was just freed above, so reading
+ * dbp->dbenv below touches indeterminate/freed memory. Using
+ * env->dbenv instead would be safe -- confirm against upstream.
+ */
+ if (F_ISSET(env, ENV_DBLOCAL))
+ (void)__env_close(dbp->dbenv, 0);
+
+ return (ret);
+}
+
+/*
+ * __db_init --
+ * Initialize a DB structure.
+ * Sets up the handle's lock, queues, the access-method permission
+ * mask (all methods allowed until restricted by use), the public
+ * method table, and the per-access-method private data. With RPC
+ * enabled, the RPC client wrappers replace the local methods last.
+ */
+static int
+__db_init(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ int ret;
+
+ dbp->locker = NULL;
+ LOCK_INIT(dbp->handle_lock);
+
+ TAILQ_INIT(&dbp->free_queue);
+ TAILQ_INIT(&dbp->active_queue);
+ TAILQ_INIT(&dbp->join_queue);
+ LIST_INIT(&dbp->s_secondaries);
+
+ /* Start out permitting every access method; see __dbh_am_chk. */
+ FLD_SET(dbp->am_ok,
+ DB_OK_BTREE | DB_OK_HASH | DB_OK_QUEUE | DB_OK_RECNO);
+
+ /* DB PUBLIC HANDLE LIST BEGIN */
+ dbp->associate = __db_associate_pp;
+ dbp->associate_foreign = __db_associate_foreign_pp;
+ dbp->close = __db_close_pp;
+ dbp->compact = __db_compact_pp;
+ dbp->cursor = __db_cursor_pp;
+ dbp->del = __db_del_pp;
+ dbp->dump = __db_dump_pp;
+ dbp->err = __dbh_err;
+ dbp->errx = __dbh_errx;
+ dbp->exists = __db_exists;
+ dbp->fd = __db_fd_pp;
+ dbp->get = __db_get_pp;
+ dbp->get_alloc = __db_get_alloc;
+ dbp->get_append_recno = __db_get_append_recno;
+ dbp->get_byteswapped = __db_get_byteswapped;
+ dbp->get_cachesize = __db_get_cachesize;
+ dbp->get_create_dir = __db_get_create_dir;
+ dbp->get_dbname = __db_get_dbname;
+ dbp->get_dup_compare = __db_get_dup_compare;
+ dbp->get_encrypt_flags = __db_get_encrypt_flags;
+ dbp->get_env = __db_get_env;
+ dbp->get_errcall = __db_get_errcall;
+ dbp->get_errfile = __db_get_errfile;
+ dbp->get_errpfx = __db_get_errpfx;
+ dbp->get_feedback = __db_get_feedback;
+ dbp->get_flags = __db_get_flags;
+ dbp->get_lorder = __db_get_lorder;
+ dbp->get_mpf = __db_get_mpf;
+ dbp->get_msgcall = __db_get_msgcall;
+ dbp->get_msgfile = __db_get_msgfile;
+ dbp->get_multiple = __db_get_multiple;
+ dbp->get_open_flags = __db_get_open_flags;
+ dbp->get_partition_dirs = __partition_get_dirs;
+ dbp->get_partition_callback = __partition_get_callback;
+ dbp->get_partition_keys = __partition_get_keys;
+ dbp->get_pagesize = __db_get_pagesize;
+ dbp->get_priority = __db_get_priority;
+ dbp->get_transactional = __db_get_transactional;
+ dbp->get_type = __db_get_type;
+ dbp->join = __db_join_pp;
+ dbp->key_range = __db_key_range_pp;
+ dbp->open = __db_open_pp;
+ dbp->pget = __db_pget_pp;
+ dbp->put = __db_put_pp;
+ dbp->remove = __db_remove_pp;
+ dbp->rename = __db_rename_pp;
+ dbp->set_alloc = __db_set_alloc;
+ dbp->set_append_recno = __db_set_append_recno;
+ dbp->set_cachesize = __db_set_cachesize;
+ dbp->set_create_dir = __db_set_create_dir;
+ dbp->set_dup_compare = __db_set_dup_compare;
+ dbp->set_encrypt = __db_set_encrypt;
+ dbp->set_errcall = __db_set_errcall;
+ dbp->set_errfile = __db_set_errfile;
+ dbp->set_errpfx = __db_set_errpfx;
+ dbp->set_feedback = __db_set_feedback;
+ dbp->set_flags = __db_set_flags;
+ dbp->set_lorder = __db_set_lorder;
+ dbp->set_msgcall = __db_set_msgcall;
+ dbp->set_msgfile = __db_set_msgfile;
+ dbp->set_pagesize = __db_set_pagesize;
+ dbp->set_paniccall = __db_set_paniccall;
+ dbp->set_partition = __partition_set;
+ dbp->set_partition_dirs = __partition_set_dirs;
+ dbp->set_priority = __db_set_priority;
+ dbp->sort_multiple = __db_sort_multiple;
+ dbp->stat = __db_stat_pp;
+ dbp->stat_print = __db_stat_print_pp;
+ dbp->sync = __db_sync_pp;
+ dbp->truncate = __db_truncate_pp;
+ dbp->upgrade = __db_upgrade_pp;
+ dbp->verify = __db_verify_pp;
+ /* DB PUBLIC HANDLE LIST END */
+
+ /* Access method specific. */
+ if ((ret = __bam_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __ham_db_create(dbp)) != 0)
+ return (ret);
+ if ((ret = __qam_db_create(dbp)) != 0)
+ return (ret);
+
+#ifdef HAVE_RPC
+ /*
+ * RPC specific: must be last, as we replace methods set by the
+ * access methods.
+ */
+ if (RPC_ON(dbp->dbenv)) {
+ __dbcl_dbp_init(dbp);
+ /*
+ * !!!
+ * We wrap the DB->open method for RPC, and the rpc.src file
+ * can't handle that.
+ */
+ dbp->open = __dbcl_db_open_wrap;
+ if ((ret = __dbcl_db_create(dbp, dbp->dbenv, flags)) != 0)
+ return (ret);
+ }
+#else
+ COMPQUIET(flags, 0);
+#endif
+
+ return (0);
+}
+
+/*
+ * __dbh_am_chk --
+ * Error if an unreasonable method is called.
+ * flags is a DB_OK_* mask of the access methods the caller's method
+ * is compatible with; the handle's am_ok mask is narrowed to that set
+ * on success, so later incompatible calls fail with EINVAL.
+ *
+ * PUBLIC: int __dbh_am_chk __P((DB *, u_int32_t));
+ */
+int
+__dbh_am_chk(dbp, flags)
+ DB *dbp;
+ u_int32_t flags;
+{
+ /*
+ * We start out allowing any access methods to be called, and as the
+ * application calls the methods the options become restricted. The
+ * idea is to quit as soon as an illegal method combination is called.
+ */
+ if ((LF_ISSET(DB_OK_BTREE) && FLD_ISSET(dbp->am_ok, DB_OK_BTREE)) ||
+ (LF_ISSET(DB_OK_HASH) && FLD_ISSET(dbp->am_ok, DB_OK_HASH)) ||
+ (LF_ISSET(DB_OK_QUEUE) && FLD_ISSET(dbp->am_ok, DB_OK_QUEUE)) ||
+ (LF_ISSET(DB_OK_RECNO) && FLD_ISSET(dbp->am_ok, DB_OK_RECNO))) {
+ FLD_CLR(dbp->am_ok, ~flags);
+ return (0);
+ }
+
+ __db_errx(dbp->env,
+ "call implies an access method which is inconsistent with previous calls");
+ return (EINVAL);
+}
+
+/*
+ * __dbh_err --
+ * Db.err method.
+ * Varargs error reporter; forwards to the environment's error
+ * machinery with the system error string appended.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_err(DB *dbp, int error, const char *fmt, ...)
+#else
+__dbh_err(dbp, error, fmt, va_alist)
+ DB *dbp;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message with error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, error, DB_ERROR_SET, 1, fmt);
+}
+
+/*
+ * __dbh_errx --
+ * Db.errx method.
+ * Varargs error reporter; like __dbh_err but without a system
+ * error string.
+ */
+static void
+#ifdef STDC_HEADERS
+__dbh_errx(DB *dbp, const char *fmt, ...)
+#else
+__dbh_errx(dbp, fmt, va_alist)
+ DB *dbp;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ /* Message without error string, to stderr by default. */
+ DB_REAL_ERR(dbp->dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
+}
+
+/*
+ * __db_get_byteswapped --
+ * Return if database requires byte swapping.
+ * Only valid after DB->open; *isswapped is 1 iff DB_AM_SWAP is set.
+ */
+static int
+__db_get_byteswapped(dbp, isswapped)
+ DB *dbp;
+ int *isswapped;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_byteswapped");
+
+ *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0;
+ return (0);
+}
+
+/*
+ * __db_get_dbname --
+ * Get the name of the database as passed to DB->open.
+ * Either out-pointer may be NULL if the caller doesn't want it.
+ */
+static int
+__db_get_dbname(dbp, fnamep, dnamep)
+ DB *dbp;
+ const char **fnamep, **dnamep;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_dbname");
+
+ if (fnamep != NULL)
+ *fnamep = dbp->fname;
+ if (dnamep != NULL)
+ *dnamep = dbp->dname;
+ return (0);
+}
+
+/*
+ * __db_get_env --
+ * Get the DB_ENV handle that was passed to db_create.
+ * Trivial accessor; cannot fail.
+ */
+static DB_ENV *
+__db_get_env(dbp)
+ DB *dbp;
+{
+ return (dbp->dbenv);
+}
+
+/*
+ * __db_get_mpf --
+ * Get the underlying DB_MPOOLFILE handle.
+ * Trivial accessor; cannot fail.
+ */
+static DB_MPOOLFILE *
+__db_get_mpf(dbp)
+ DB *dbp;
+{
+ return (dbp->mpf);
+}
+
+/*
+ * get_multiple --
+ * Return whether this DB handle references a physical file with multiple
+ * databases. Only valid after DB->open.
+ */
+static int
+__db_get_multiple(dbp)
+ DB *dbp;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_multiple");
+
+ /*
+ * Only return TRUE if the handle is for the master database, not for
+ * any subdatabase in the physical file. If it's a Btree, with the
+ * subdatabases flag set, and the meta-data page has the right value,
+ * return TRUE. (We don't need to check it's a Btree, I suppose, but
+ * it doesn't hurt.)
+ */
+ return (dbp->type == DB_BTREE &&
+ F_ISSET(dbp, DB_AM_SUBDB) &&
+ dbp->meta_pgno == PGNO_BASE_MD ? 1 : 0);
+}
+
+/*
+ * get_transactional --
+ * Return whether this database was created in a transaction.
+ * Returns 1 iff DB_AM_TXN is set on the handle.
+ */
+static int
+__db_get_transactional(dbp)
+ DB *dbp;
+{
+ return (F_ISSET(dbp, DB_AM_TXN) ? 1 : 0);
+}
+
+/*
+ * __db_get_type --
+ * Return type of underlying database.
+ * Only valid after DB->open (before open the type is DB_UNKNOWN).
+ */
+static int
+__db_get_type(dbp, dbtype)
+ DB *dbp;
+ DBTYPE *dbtype;
+{
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_type");
+
+ *dbtype = dbp->type;
+ return (0);
+}
+
+/*
+ * __db_get_append_recno --
+ * Get record number append routine.
+ * Queue/Recno only; funcp may be NULL to just validate the call.
+ */
+static int
+__db_get_append_recno(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+ if (funcp)
+ *funcp = dbp->db_append_recno;
+
+ return (0);
+}
+/*
+ * __db_set_append_recno --
+ * Set record number append routine.
+ * Queue/Recno only; must be called before DB->open.
+ */
+static int
+__db_set_append_recno(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, DBT *, db_recno_t));
+{
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_append_recno");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ dbp->db_append_recno = func;
+
+ return (0);
+}
+
+/*
+ * __db_get_cachesize --
+ * Get underlying cache size.
+ * Illegal on handles inside an environment; delegates to the private
+ * (DB-local) environment's memory pool configuration.
+ */
+static int
+__db_get_cachesize(dbp, cache_gbytesp, cache_bytesp, ncachep)
+ DB *dbp;
+ u_int32_t *cache_gbytesp, *cache_bytesp;
+ int *ncachep;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->get_cachesize");
+
+ return (__memp_get_cachesize(dbp->dbenv,
+ cache_gbytesp, cache_bytesp, ncachep));
+}
+
+/*
+ * __db_set_cachesize --
+ * Set underlying cache size.
+ * Illegal on handles inside an environment and after DB->open.
+ */
+static int
+__db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache)
+ DB *dbp;
+ u_int32_t cache_gbytes, cache_bytes;
+ int ncache;
+{
+ DB_ILLEGAL_IN_ENV(dbp, "DB->set_cachesize");
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_cachesize");
+
+ return (__memp_set_cachesize(
+ dbp->dbenv, cache_gbytes, cache_bytes, ncache));
+}
+
+/*
+ * __db_set_create_dir --
+ * Set the directory the database file is created in / looked up in.
+ * dir must be one of the environment's configured data directories;
+ * the handle stores the environment's copy of the string, so no
+ * allocation is done here.
+ */
+static int
+__db_set_create_dir(dbp, dir)
+ DB *dbp;
+ const char *dir;
+{
+ DB_ENV *dbenv;
+ int i;
+
+ dbenv = dbp->dbenv;
+
+ for (i = 0; i < dbenv->data_next; i++)
+ if (strcmp(dir, dbenv->db_data_dir[i]) == 0)
+ break;
+
+ if (i == dbenv->data_next) {
+ __db_errx(dbp->env,
+ "Directory %s not in environment list.", dir);
+ return (EINVAL);
+ }
+
+ dbp->dirname = dbenv->db_data_dir[i];
+ return (0);
+}
+
+/*
+ * __db_get_create_dir --
+ *	Return the data directory configured via __db_set_create_dir
+ *	(NULL if none was set).
+ */
+static int
+__db_get_create_dir(dbp, dirp)
+	DB *dbp;
+	const char **dirp;
+{
+	*dirp = dbp->dirname;
+	return (0);
+}
+
+/*
+ * __db_get_dup_compare --
+ *	Get duplicate comparison routine.
+ *
+ * Legal only for btree/hash access methods.  For compressed btrees the
+ * user's comparator is stored in the BTREE-private structure (the
+ * public slot holds the compression wrapper), so it is fetched from
+ * there instead.
+ */
+static int
+__db_get_dup_compare(dbp, funcp)
+	DB *dbp;
+	int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+
+	DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+	if (funcp != NULL) {
+#ifdef HAVE_COMPRESSION
+		if (DB_IS_COMPRESSED(dbp)) {
+			*funcp =
+			     ((BTREE *)dbp->bt_internal)->compress_dup_compare;
+		} else
+#endif
+			*funcp = dbp->dup_compare;
+	}
+
+	return (0);
+}
+
+/*
+ * __db_set_dup_compare --
+ *	Set duplicate comparison routine.
+ *
+ * Legal only before DB->open and only for btree/hash.  Setting a
+ * duplicate comparator implies DB_DUPSORT, which is turned on here via
+ * __db_set_flags.  For compressed btrees the public slot gets the
+ * compression-aware wrapper and the user's function is stashed in the
+ * BTREE-private structure (mirroring __db_get_dup_compare).
+ */
+static int
+__db_set_dup_compare(dbp, func)
+	DB *dbp;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+{
+	int ret;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_dup_compare");
+	DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+	if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0)
+		return (ret);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(dbp)) {
+		dbp->dup_compare = __bam_compress_dupcmp;
+		((BTREE *)dbp->bt_internal)->compress_dup_compare = func;
+	} else
+#endif
+		dbp->dup_compare = func;
+
+	return (0);
+}
+
+/*
+ * __db_get_encrypt_flags --
+ *	Get the encryption flags; forwards to the private environment.
+ *	Illegal on a handle inside a shared environment.
+ */
+static int
+__db_get_encrypt_flags(dbp, flagsp)
+	DB *dbp;
+	u_int32_t *flagsp;
+{
+	DB_ILLEGAL_IN_ENV(dbp, "DB->get_encrypt_flags");
+
+	return (__env_get_encrypt_flags(dbp->dbenv, flagsp));
+}
+
+/*
+ * __db_set_encrypt --
+ *	Set database passwd.
+ *
+ * Illegal inside a shared environment and illegal after DB->open.
+ * Configures the password on the handle's private environment,
+ * initializes the cipher if needed, and turns on the DB_ENCRYPT flag.
+ */
+static int
+__db_set_encrypt(dbp, passwd, flags)
+	DB *dbp;
+	const char *passwd;
+	u_int32_t flags;
+{
+	DB_CIPHER *db_cipher;
+	int ret;
+
+	DB_ILLEGAL_IN_ENV(dbp, "DB->set_encrypt");
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_encrypt");
+
+	if ((ret = __env_set_encrypt(dbp->dbenv, passwd, flags)) != 0)
+		return (ret);
+
+	/*
+	 * In a real env, this gets initialized with the region.  In a local
+	 * env, we must do it here.
+	 */
+	db_cipher = dbp->env->crypto_handle;
+	if (!F_ISSET(db_cipher, CIPHER_ANY) &&
+	    (ret = db_cipher->init(dbp->env, db_cipher)) != 0)
+		return (ret);
+
+	/* DB_ENCRYPT also implies checksumming; see __db_map_flags. */
+	return (__db_set_flags(dbp, DB_ENCRYPT));
+}
+
+/*
+ * __db_get_errcall --
+ *	Get the error callback; thin forward to the environment handle.
+ */
+static void
+__db_get_errcall(dbp, errcallp)
+	DB *dbp;
+	void (**errcallp) __P((const DB_ENV *, const char *, const char *));
+{
+	__env_get_errcall(dbp->dbenv, errcallp);
+}
+
+/*
+ * __db_set_errcall --
+ *	Set the error callback; thin forward to the environment handle.
+ */
+static void
+__db_set_errcall(dbp, errcall)
+	DB *dbp;
+	void (*errcall) __P((const DB_ENV *, const char *, const char *));
+{
+	__env_set_errcall(dbp->dbenv, errcall);
+}
+
+/*
+ * __db_get_errfile --
+ *	Get the error-message FILE stream from the environment handle.
+ */
+static void
+__db_get_errfile(dbp, errfilep)
+	DB *dbp;
+	FILE **errfilep;
+{
+	__env_get_errfile(dbp->dbenv, errfilep);
+}
+
+/*
+ * __db_set_errfile --
+ *	Set the error-message FILE stream on the environment handle.
+ */
+static void
+__db_set_errfile(dbp, errfile)
+	DB *dbp;
+	FILE *errfile;
+{
+	__env_set_errfile(dbp->dbenv, errfile);
+}
+
+/*
+ * __db_get_errpfx --
+ *	Get the error-message prefix from the environment handle.
+ */
+static void
+__db_get_errpfx(dbp, errpfxp)
+	DB *dbp;
+	const char **errpfxp;
+{
+	__env_get_errpfx(dbp->dbenv, errpfxp);
+}
+
+/*
+ * __db_set_errpfx --
+ *	Set the error-message prefix on the environment handle.
+ */
+static void
+__db_set_errpfx(dbp, errpfx)
+	DB *dbp;
+	const char *errpfx;
+{
+	__env_set_errpfx(dbp->dbenv, errpfx);
+}
+
+/*
+ * __db_get_feedback --
+ *	Get the progress-feedback callback (NULL if none configured).
+ */
+static int
+__db_get_feedback(dbp, feedbackp)
+	DB *dbp;
+	void (**feedbackp) __P((DB *, int, int));
+{
+	if (feedbackp != NULL)
+		*feedbackp = dbp->db_feedback;
+	return (0);
+}
+
+/*
+ * __db_set_feedback --
+ *	Set the progress-feedback callback on the handle.
+ */
+static int
+__db_set_feedback(dbp, feedback)
+	DB *dbp;
+	void (*feedback) __P((DB *, int, int));
+{
+	dbp->db_feedback = feedback;
+	return (0);
+}
+
+/*
+ * __db_map_flags --
+ *	Maps between public and internal flag values.
+ * This function doesn't check for validity, so it can't fail.
+ *
+ * Recognized public flags are CLEARED from *inflagsp and the matching
+ * internal DB_AM_* bits are SET in *outflagsp; unknown flags are left
+ * in *inflagsp for the caller to reject.  Note DB_ENCRYPT implies
+ * DB_AM_CHKSUM as well.
+ */
+static void
+__db_map_flags(dbp, inflagsp, outflagsp)
+	DB *dbp;
+	u_int32_t *inflagsp, *outflagsp;
+{
+	COMPQUIET(dbp, NULL);
+
+	if (FLD_ISSET(*inflagsp, DB_CHKSUM)) {
+		FLD_SET(*outflagsp, DB_AM_CHKSUM);
+		FLD_CLR(*inflagsp, DB_CHKSUM);
+	}
+	if (FLD_ISSET(*inflagsp, DB_ENCRYPT)) {
+		FLD_SET(*outflagsp, DB_AM_ENCRYPT | DB_AM_CHKSUM);
+		FLD_CLR(*inflagsp, DB_ENCRYPT);
+	}
+	if (FLD_ISSET(*inflagsp, DB_TXN_NOT_DURABLE)) {
+		FLD_SET(*outflagsp, DB_AM_NOT_DURABLE);
+		FLD_CLR(*inflagsp, DB_TXN_NOT_DURABLE);
+	}
+}
+
+/*
+ * __db_get_flags --
+ *	The DB->get_flags method.
+ *
+ * Reconstructs the public flag set from the handle's internal DB_AM_*
+ * bits: each candidate public flag is mapped through every access
+ * method's flag mapper, and is reported only if ALL of its mapped
+ * internal bits are set on the handle.
+ *
+ * PUBLIC: int __db_get_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_flags(dbp, flagsp)
+	DB *dbp;
+	u_int32_t *flagsp;
+{
+	static const u_int32_t db_flags[] = {
+		DB_CHKSUM,
+		DB_DUP,
+		DB_DUPSORT,
+		DB_ENCRYPT,
+#ifdef HAVE_QUEUE
+		DB_INORDER,
+#endif
+		DB_RECNUM,
+		DB_RENUMBER,
+		DB_REVSPLITOFF,
+		DB_SNAPSHOT,
+		DB_TXN_NOT_DURABLE,
+		0
+	};
+	u_int32_t f, flags, mapped_flag;
+	int i;
+
+	flags = 0;
+	for (i = 0; (f = db_flags[i]) != 0; i++) {
+		mapped_flag = 0;
+		__db_map_flags(dbp, &f, &mapped_flag);
+		__bam_map_flags(dbp, &f, &mapped_flag);
+		__ram_map_flags(dbp, &f, &mapped_flag);
+#ifdef HAVE_QUEUE
+		__qam_map_flags(dbp, &f, &mapped_flag);
+#endif
+		/* Every candidate flag must be consumed by some mapper. */
+		DB_ASSERT(dbp->env, f == 0);
+		if (F_ISSET(dbp, mapped_flag) == mapped_flag)
+			LF_SET(db_flags[i]);
+	}
+
+	*flagsp = flags;
+	return (0);
+}
+
+/*
+ * __db_set_flags --
+ *	DB->set_flags.
+ *
+ * Validates environment-dependent flags (encryption, durability), then
+ * lets the generic mapper and each access method's flag handler consume
+ * bits out of `flags'.  Any bits left over are unrecognized and produce
+ * an error via __db_ferr.
+ *
+ * PUBLIC: int __db_set_flags __P((DB *, u_int32_t));
+ */
+int
+__db_set_flags(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	if (LF_ISSET(DB_ENCRYPT) && !CRYPTO_ON(env)) {
+		__db_errx(env,
+		    "Database environment not configured for encryption");
+		return (EINVAL);
+	}
+	if (LF_ISSET(DB_TXN_NOT_DURABLE))
+		ENV_REQUIRES_CONFIG(env,
+		    env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN);
+
+	__db_map_flags(dbp, &flags, &dbp->flags);
+
+	if ((ret = __bam_set_flags(dbp, &flags)) != 0)
+		return (ret);
+	if ((ret = __ram_set_flags(dbp, &flags)) != 0)
+		return (ret);
+#ifdef HAVE_QUEUE
+	if ((ret = __qam_set_flags(dbp, &flags)) != 0)
+		return (ret);
+#endif
+
+	/* Anything not consumed above is an illegal flag. */
+	return (flags == 0 ? 0 : __db_ferr(env, "DB->set_flags", 0));
+}
+
+/*
+ * __db_get_lorder --
+ *	Get whether lorder is swapped or not.
+ *
+ * Probes the host's byte order by asking __db_byteorder about
+ * little-endian (1234): if the host is little-endian, a swapped
+ * database is big-endian (4321), and vice versa.
+ *
+ * PUBLIC: int __db_get_lorder __P((DB *, int *));
+ */
+int
+__db_get_lorder(dbp, db_lorderp)
+	DB *dbp;
+	int *db_lorderp;
+{
+	int ret;
+
+	/* Flag if the specified byte order requires swapping. */
+	switch (ret = __db_byteorder(dbp->env, 1234)) {
+	case 0:
+		*db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 4321 : 1234;
+		break;
+	case DB_SWAPBYTES:
+		*db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 1234 : 4321;
+		break;
+	default:
+		return (ret);
+		/* NOTREACHED */
+	}
+
+	return (0);
+}
+
+/*
+ * __db_set_lorder --
+ *	Set whether lorder is swapped or not.
+ *
+ * Illegal after DB->open.  Stores the request as the single DB_AM_SWAP
+ * bit: set when the requested order differs from the host's, clear when
+ * it matches.  __db_byteorder also rejects invalid lorder values.
+ *
+ * PUBLIC: int __db_set_lorder __P((DB *, int));
+ */
+int
+__db_set_lorder(dbp, db_lorder)
+	DB *dbp;
+	int db_lorder;
+{
+	int ret;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lorder");
+
+	/* Flag if the specified byte order requires swapping. */
+	switch (ret = __db_byteorder(dbp->env, db_lorder)) {
+	case 0:
+		F_CLR(dbp, DB_AM_SWAP);
+		break;
+	case DB_SWAPBYTES:
+		F_SET(dbp, DB_AM_SWAP);
+		break;
+	default:
+		return (ret);
+		/* NOTREACHED */
+	}
+	return (0);
+}
+
+/*
+ * __db_get_alloc --
+ *	Get the user-configured allocation functions; forwards to the
+ *	private environment.  Illegal inside a shared environment.
+ */
+static int
+__db_get_alloc(dbp, mal_funcp, real_funcp, free_funcp)
+	DB *dbp;
+	void *(**mal_funcp) __P((size_t));
+	void *(**real_funcp) __P((void *, size_t));
+	void (**free_funcp) __P((void *));
+{
+	DB_ILLEGAL_IN_ENV(dbp, "DB->get_alloc");
+
+	return (__env_get_alloc(dbp->dbenv, mal_funcp,
+	    real_funcp, free_funcp));
+}
+
+/*
+ * __db_set_alloc --
+ *	Set the allocation functions; illegal inside a shared environment
+ *	and illegal after DB->open.
+ */
+static int
+__db_set_alloc(dbp, mal_func, real_func, free_func)
+	DB *dbp;
+	void *(*mal_func) __P((size_t));
+	void *(*real_func) __P((void *, size_t));
+	void (*free_func) __P((void *));
+{
+	DB_ILLEGAL_IN_ENV(dbp, "DB->set_alloc");
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_alloc");
+
+	return (__env_set_alloc(dbp->dbenv, mal_func, real_func, free_func));
+}
+
+/*
+ * __db_get_msgcall --
+ *	Get the message callback; thin forward to the environment handle.
+ */
+static void
+__db_get_msgcall(dbp, msgcallp)
+	DB *dbp;
+	void (**msgcallp) __P((const DB_ENV *, const char *));
+{
+	__env_get_msgcall(dbp->dbenv, msgcallp);
+}
+
+/*
+ * __db_set_msgcall --
+ *	Set the message callback; thin forward to the environment handle.
+ */
+static void
+__db_set_msgcall(dbp, msgcall)
+	DB *dbp;
+	void (*msgcall) __P((const DB_ENV *, const char *));
+{
+	__env_set_msgcall(dbp->dbenv, msgcall);
+}
+
+/*
+ * __db_get_msgfile --
+ *	Get the message FILE stream from the environment handle.
+ */
+static void
+__db_get_msgfile(dbp, msgfilep)
+	DB *dbp;
+	FILE **msgfilep;
+{
+	__env_get_msgfile(dbp->dbenv, msgfilep);
+}
+
+/*
+ * __db_set_msgfile --
+ *	Set the message FILE stream on the environment handle.
+ */
+static void
+__db_set_msgfile(dbp, msgfile)
+	DB *dbp;
+	FILE *msgfile;
+{
+	__env_set_msgfile(dbp->dbenv, msgfile);
+}
+
+/*
+ * __db_get_pagesize --
+ *	Return the handle's configured page size (0 if none has been set
+ *	and the database has not been opened yet).
+ */
+static int
+__db_get_pagesize(dbp, db_pagesizep)
+	DB *dbp;
+	u_int32_t *db_pagesizep;
+{
+	*db_pagesizep = dbp->pgsize;
+	return (0);
+}
+
+/*
+ * __db_set_pagesize --
+ *	DB->set_pagesize
+ *
+ * Illegal after DB->open.  The page size must lie in
+ * [DB_MIN_PGSIZE, DB_MAX_PGSIZE] and be a power of two; EINVAL
+ * otherwise.
+ *
+ * PUBLIC: int __db_set_pagesize __P((DB *, u_int32_t));
+ */
+int
+__db_set_pagesize(dbp, db_pagesize)
+	DB *dbp;
+	u_int32_t db_pagesize;
+{
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_pagesize");
+
+	if (db_pagesize < DB_MIN_PGSIZE) {
+		__db_errx(dbp->env, "page sizes may not be smaller than %lu",
+		    (u_long)DB_MIN_PGSIZE);
+		return (EINVAL);
+	}
+	if (db_pagesize > DB_MAX_PGSIZE) {
+		__db_errx(dbp->env, "page sizes may not be larger than %lu",
+		    (u_long)DB_MAX_PGSIZE);
+		return (EINVAL);
+	}
+
+	/*
+	 * We don't want anything that's not a power-of-2, as we rely on that
+	 * for alignment of various types on the pages.
+	 */
+	if (!POWER_OF_TWO(db_pagesize)) {
+		__db_errx(dbp->env, "page sizes must be a power-of-2");
+		return (EINVAL);
+	}
+
+	/*
+	 * XXX
+	 * Should we be checking for a page size that's not a multiple of 512,
+	 * so that we never try and write less than a disk sector?
+	 */
+	dbp->pgsize = db_pagesize;
+
+	return (0);
+}
+
+/*
+ * __db_set_paniccall --
+ *	Set the panic callback; thin forward to the environment handle.
+ */
+static int
+__db_set_paniccall(dbp, paniccall)
+	DB *dbp;
+	void (*paniccall) __P((DB_ENV *, int));
+{
+	return (__env_set_paniccall(dbp->dbenv, paniccall));
+}
+
+/*
+ * __db_set_priority --
+ *	Set the cache priority used for this database's pages.
+ */
+static int
+__db_set_priority(dbp, priority)
+	DB *dbp;
+	DB_CACHE_PRIORITY priority;
+{
+	dbp->priority = priority;
+	return (0);
+}
+
+/*
+ * __db_get_priority --
+ *	Get the cache priority used for this database's pages.
+ */
+static int
+__db_get_priority(dbp, priority)
+	DB *dbp;
+	DB_CACHE_PRIORITY *priority;
+{
+	*priority = dbp->priority;
+	return (0);
+}
diff --git a/db/db_open.c b/db/db_open.c
new file mode 100644
index 0000000..5c5db09
--- /dev/null
+++ b/db/db_open.c
@@ -0,0 +1,628 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/fop.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * __db_open --
+ *	DB->open method.
+ *
+ * This routine gets called in six different ways:
+ *
+ * 1. It can be called to open a file/database.  In this case, subdb will
+ *    be NULL and meta_pgno will be PGNO_BASE_MD.
+ * 2. It can be called to open a subdatabase during normal operation.  In
+ *    this case, name and subname will both be non-NULL and meta_pgno will
+ *    be PGNO_BASE_MD (also PGNO_INVALID).
+ * 3. It can be called to open an in-memory database (name == NULL;
+ *    subname = name).
+ * 4. It can be called during recovery to open a file/database, in which case
+ *    name will be non-NULL, subname will be NULL, and meta-pgno will be
+ *    PGNO_BASE_MD.
+ * 5. It can be called during recovery to open a subdatabase, in which case
+ *    name will be non-NULL, subname may be NULL and meta-pgno will be
+ *    a valid pgno (i.e., not PGNO_BASE_MD).
+ * 6. It can be called during recovery to open an in-memory database.
+ *
+ * PUBLIC: int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC:     const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t));
+ */
+int
+__db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *fname, *dname;
+	DBTYPE type;
+	u_int32_t flags;
+	int mode;
+	db_pgno_t meta_pgno;
+{
+	DB *tdbp;
+	ENV *env;
+	int ret;
+	u_int32_t id;
+
+	env = dbp->env;
+	id = TXN_INVALID;
+
+	/*
+	 * We must flush any existing pages before truncating the file
+	 * since they could age out of mpool and overwrite new pages.
+	 *
+	 * Open a throwaway handle on the existing file (without
+	 * DB_TRUNCATE/DB_CREATE) just to truncate its mpool pages;
+	 * ENOENT/EINVAL from that open mean there is nothing to flush.
+	 */
+	if (LF_ISSET(DB_TRUNCATE)) {
+		if ((ret = __db_create_internal(&tdbp, dbp->env, 0)) != 0)
+			goto err;
+		ret = __db_open(tdbp, ip, txn, fname, dname, DB_UNKNOWN,
+		    DB_NOERROR | (flags & ~(DB_TRUNCATE|DB_CREATE)),
+		    mode, meta_pgno);
+		if (ret == 0)
+			ret = __memp_ftruncate(tdbp->mpf, txn, ip, 0, 0);
+		(void)__db_close(tdbp, txn, DB_NOSYNC);
+		if (ret != 0 && ret != ENOENT && ret != EINVAL)
+			goto err;
+		ret = 0;
+	}
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, fname);
+
+	/*
+	 * If the environment was configured with threads, the DB handle
+	 * must also be free-threaded, so we force the DB_THREAD flag on.
+	 * (See SR #2033 for why this is a requirement--recovery needs
+	 * to be able to grab a dbp using __db_fileid_to_dbp, and it has
+	 * no way of knowing which dbp goes with which thread, so whichever
+	 * one it finds has to be usable in any of them.)
+	 */
+	if (F_ISSET(env, ENV_THREAD))
+		LF_SET(DB_THREAD);
+
+	/* Convert any DB->open flags. */
+	if (LF_ISSET(DB_RDONLY))
+		F_SET(dbp, DB_AM_RDONLY);
+	if (LF_ISSET(DB_READ_UNCOMMITTED))
+		F_SET(dbp, DB_AM_READ_UNCOMMITTED);
+
+	if (IS_REAL_TXN(txn))
+		F_SET(dbp, DB_AM_TXN);
+
+	/* Fill in the type. */
+	dbp->type = type;
+
+	/*
+	 * If both fname and subname are NULL, it's always a create, so make
+	 * sure that we have both DB_CREATE and a type specified.  It would
+	 * be nice if this checking were done in __db_open where most of the
+	 * interface checking is done, but this interface (__db_dbopen) is
+	 * used by the recovery and limbo system, so we need to safeguard
+	 * this interface as well.
+	 */
+	if (fname == NULL) {
+		if (dbp->p_internal != NULL) {
+			__db_errx(env,
+			    "Partitioned databases may not be in memory.");
+			return (ENOENT);
+		}
+		if (dname == NULL) {
+			if (!LF_ISSET(DB_CREATE)) {
+				__db_errx(env,
+				    "DB_CREATE must be specified to create databases.");
+				return (ENOENT);
+			}
+
+			F_SET(dbp, DB_AM_INMEM);
+			F_SET(dbp, DB_AM_CREATED);
+
+			if (dbp->type == DB_UNKNOWN) {
+				__db_errx(env,
+				    "DBTYPE of unknown without existing file");
+				return (EINVAL);
+			}
+
+			if (dbp->pgsize == 0)
+				dbp->pgsize = DB_DEF_IOSIZE;
+
+			/*
+			 * If the file is a temporary file and we're
+			 * doing locking, then we have to create a
+			 * unique file ID.  We can't use our normal
+			 * dev/inode pair (or whatever this OS uses
+			 * in place of dev/inode pairs) because no
+			 * backing file will be created until the
+			 * mpool cache is filled forcing the buffers
+			 * to disk.  Grab a random locker ID to use
+			 * as a file ID.  The created ID must never
+			 * match a potential real file ID -- we know
+			 * it won't because real file IDs contain a
+			 * time stamp after the dev/inode pair, and
+			 * we're simply storing a 4-byte value.
+			 *
+			 * !!!
+			 * Store the locker in the file id structure
+			 * -- we can get it from there as necessary,
+			 * and it saves having two copies.
+			 */
+			if (LOCKING_ON(env) && (ret = __lock_id(env,
+			    (u_int32_t *)dbp->fileid, NULL)) != 0)
+				return (ret);
+		} else
+			MAKE_INMEM(dbp);
+
+		/*
+		 * Normally we would do handle locking here, however, with
+		 * in-memory files, we cannot do any database manipulation
+		 * until the mpool is open, so it happens later.
+		 */
+	} else if (dname == NULL && meta_pgno == PGNO_BASE_MD) {
+		/* Open/create the underlying file.  Acquire locks. */
+		if ((ret = __fop_file_setup(dbp, ip,
+		    txn, fname, mode, flags, &id)) != 0)
+			return (ret);
+	} else {
+		/* Named subdatabase: set up via the master database. */
+		if (dbp->p_internal != NULL) {
+			__db_errx(env,
+	    "Partitioned databases may not be included with multiple databases.");
+			return (ENOENT);
+		}
+		if ((ret = __fop_subdb_setup(dbp, ip,
+		    txn, fname, dname, mode, flags)) != 0)
+			return (ret);
+		meta_pgno = dbp->meta_pgno;
+	}
+
+	/* Set up the underlying environment. */
+	if ((ret = __env_setup(dbp, txn, fname, dname, id, flags)) != 0)
+		return (ret);
+
+	/* For in-memory databases, we now need to open/create the database. */
+	if (F_ISSET(dbp, DB_AM_INMEM)) {
+		if (dname == NULL)
+			ret = __db_new_file(dbp, ip, txn, NULL, NULL);
+		else {
+			id = TXN_INVALID;
+			if ((ret = __fop_file_setup(dbp, ip,
+			    txn, dname, mode, flags, &id)) == 0 &&
+			    DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER)
+#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
+			    && txn != NULL
+#endif
+#if !defined(DEBUG_ROP)
+			    && !F_ISSET(dbp, DB_AM_RDONLY)
+#endif
+			    )
+				ret = __dbreg_log_id(dbp,
+				    txn, dbp->log_filename->id, 1);
+		}
+		if (ret != 0)
+			goto err;
+	}
+
+	/* Dispatch to the access method's open routine. */
+	switch (dbp->type) {
+		case DB_BTREE:
+			ret = __bam_open(dbp, ip, txn, fname, meta_pgno, flags);
+			break;
+		case DB_HASH:
+			ret = __ham_open(dbp, ip, txn, fname, meta_pgno, flags);
+			break;
+		case DB_RECNO:
+			ret = __ram_open(dbp, ip, txn, fname, meta_pgno, flags);
+			break;
+		case DB_QUEUE:
+			ret = __qam_open(
+			    dbp, ip, txn, fname, meta_pgno, mode, flags);
+			break;
+		case DB_UNKNOWN:
+			return (
+			    __db_unknown_type(env, "__db_dbopen", dbp->type));
+	}
+	if (ret != 0)
+		goto err;
+
+#ifdef HAVE_PARTITION
+	if (dbp->p_internal != NULL && (ret =
+	    __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0)
+		goto err;
+#endif
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, fname);
+
+	/*
+	 * Temporary files don't need handle locks, so we only have to check
+	 * for a handle lock downgrade or lockevent in the case of named
+	 * files.
+	 */
+	if (!F_ISSET(dbp, DB_AM_RECOVER) && (fname != NULL || dname != NULL) &&
+	    LOCK_ISSET(dbp->handle_lock)) {
+		if (IS_REAL_TXN(txn))
+			ret = __txn_lockevent(env,
+			    txn, dbp, &dbp->handle_lock, dbp->locker);
+		else if (LOCKING_ON(env))
+			/* Trade write handle lock for read handle lock. */
+			ret = __lock_downgrade(env,
+			    &dbp->handle_lock, DB_LOCK_READ, 0);
+	}
+DB_TEST_RECOVERY_LABEL
+err:
+	return (ret);
+}
+
+/*
+ * __db_get_open_flags --
+ *	Accessor for flags passed into DB->open call
+ *
+ * Illegal before DB->open; returns the flags recorded on the handle
+ * at open time.
+ *
+ * PUBLIC: int __db_get_open_flags __P((DB *, u_int32_t *));
+ */
+int
+__db_get_open_flags(dbp, flagsp)
+	DB *dbp;
+	u_int32_t *flagsp;
+{
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_open_flags");
+
+	*flagsp = dbp->open_flags;
+	return (0);
+}
+
+/*
+ * __db_new_file --
+ *	Create a new database file.
+ *
+ * Dispatches to the access-method-specific creation routine, then
+ * fsyncs the file handle (when one was supplied) so the file is durable
+ * before being moved into place.  Btree and recno share one creation
+ * path.
+ *
+ * PUBLIC: int __db_new_file __P((DB *,
+ * PUBLIC:      DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
+ */
+int
+__db_new_file(dbp, ip, txn, fhp, name)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DB_FH *fhp;
+	const char *name;
+{
+	int ret;
+
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_new_file(dbp, ip, txn, fhp, name);
+		break;
+	case DB_HASH:
+		ret = __ham_new_file(dbp, ip, txn, fhp, name);
+		break;
+	case DB_QUEUE:
+		ret = __qam_new_file(dbp, ip, txn, fhp, name);
+		break;
+	case DB_UNKNOWN:
+	default:
+		__db_errx(dbp->env,
+		    "%s: Invalid type %d specified", name, dbp->type);
+		ret = EINVAL;
+		break;
+	}
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+	/* Sync the file in preparation for moving it into place. */
+	if (ret == 0 && fhp != NULL)
+		ret = __os_fsync(dbp->env, fhp);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+	return (ret);
+}
+
+/*
+ * __db_init_subdb --
+ *	Initialize the dbp for a subdb.
+ *
+ * If the subdatabase already exists, reads its meta-data page through
+ * the master database's mpool file and initializes the handle from it.
+ * Otherwise creates the subdatabase via the access method's new-subdb
+ * routine (queue subdatabases are not supported and return EINVAL).
+ *
+ * PUBLIC: int __db_init_subdb __P((DB *,
+ * PUBLIC:     DB *, const char *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__db_init_subdb(mdbp, dbp, name, ip, txn)
+	DB *mdbp, *dbp;
+	const char *name;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+{
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	int ret, t_ret;
+
+	ret = 0;
+	if (!F_ISSET(dbp, DB_AM_CREATED)) {
+		/* Subdb exists; read meta-data page and initialize. */
+		mpf = mdbp->mpf;
+		if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+		    ip, txn, 0, &meta)) != 0)
+			goto err;
+		ret = __db_meta_setup(mdbp->env, dbp, name, meta, 0, 0);
+		/* Release the page even on error, preserving the first error. */
+		if ((t_ret = __memp_fput(mpf,
+		    ip, meta, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		/*
+		 * If __db_meta_setup found that the meta-page hadn't
+		 * been written out during recovery, we can just return.
+		 */
+		if (ret == ENOENT)
+			ret = 0;
+		goto err;
+	}
+
+	/* Handle the create case here. */
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_new_subdb(mdbp, dbp, ip, txn);
+		break;
+	case DB_HASH:
+		ret = __ham_new_subdb(mdbp, dbp, ip, txn);
+		break;
+	case DB_QUEUE:
+		/* Queue databases cannot be subdatabases. */
+		ret = EINVAL;
+		break;
+	case DB_UNKNOWN:
+	default:
+		__db_errx(dbp->env,
+		    "Invalid subdatabase type %d specified", dbp->type);
+		return (EINVAL);
+	}
+
+err:	return (ret);
+}
+
+/*
+ * __db_chk_meta --
+ *	Take a buffer containing a meta-data page and check it for a valid LSN,
+ *	checksum (and verify the checksum if necessary) and possibly decrypt it.
+ *
+ * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch.
+ *
+ * The page may come from a database written with the opposite byte
+ * order, so both the checksum and the LSN/magic checks are retried once
+ * with the value byte-swapped before giving up.
+ *
+ * PUBLIC: int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t));
+ */
+int
+__db_chk_meta(env, dbp, meta, flags)
+	ENV *env;
+	DB *dbp;
+	DBMETA *meta;
+	u_int32_t flags;
+{
+	DB_LSN swap_lsn;
+	int is_hmac, ret, swapped;
+	u_int32_t magic, orig_chk;
+	u_int8_t *chksum;
+
+	ret = 0;
+	swapped = 0;
+
+	if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) {
+		if (dbp != NULL)
+			F_SET(dbp, DB_AM_CHKSUM);
+
+		/* Encrypted pages carry an HMAC rather than a plain checksum. */
+		is_hmac = meta->encrypt_alg == 0 ? 0 : 1;
+		chksum = ((BTMETA *)meta)->chksum;
+
+		/*
+		 * If we need to swap, the checksum function overwrites the
+		 * original checksum with 0, so we need to save a copy of the
+		 * original for swapping later.
+		 */
+		orig_chk = *(u_int32_t *)chksum;
+
+		/*
+		 * We cannot add this to __db_metaswap because that gets done
+		 * later after we've verified the checksum or decrypted.
+		 */
+		if (LF_ISSET(DB_CHK_META)) {
+			swapped = 0;
+chk_retry:		if ((ret =
+			    __db_check_chksum(env, NULL, env->crypto_handle,
+			    chksum, meta, DBMETASIZE, is_hmac)) != 0) {
+				/* One retry with a byte-swapped checksum. */
+				if (is_hmac || swapped)
+					return (ret);
+
+				M_32_SWAP(orig_chk);
+				swapped = 1;
+				*(u_int32_t *)chksum = orig_chk;
+				goto chk_retry;
+			}
+		}
+	} else if (dbp != NULL)
+		F_CLR(dbp, DB_AM_CHKSUM);
+
+#ifdef HAVE_CRYPTO
+	ret = __crypto_decrypt_meta(env,
+	    dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META));
+#endif
+
+	/* Now that we're decrypted, we can check LSN. */
+	if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) {
+		/*
+		 * This gets called both before and after swapping, so we
+		 * need to check ourselves.  If we already swapped it above,
+		 * we'll know that here.
+		 */
+
+		swap_lsn = meta->lsn;
+		magic = meta->magic;
+lsn_retry:
+		if (swapped) {
+			M_32_SWAP(swap_lsn.file);
+			M_32_SWAP(swap_lsn.offset);
+			M_32_SWAP(magic);
+		}
+		/* A recognized magic number validates the swap guess. */
+		switch (magic) {
+		case DB_BTREEMAGIC:
+		case DB_HASHMAGIC:
+		case DB_QAMMAGIC:
+		case DB_RENAMEMAGIC:
+			break;
+		default:
+			if (swapped)
+				return (EINVAL);
+			swapped = 1;
+			goto lsn_retry;
+		}
+		if (!IS_REP_CLIENT(env) &&
+		    !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn))
+			/* Need to do check. */
+			ret = __log_check_page_lsn(env, dbp, &swap_lsn);
+	}
+	return (ret);
+}
+
+/*
+ * __db_meta_setup --
+ *
+ * Take a buffer containing a meta-data page and figure out if it's
+ * valid, and if so, initialize the dbp from the meta-data page.
+ *
+ * Determines the access method from the magic number (retrying once
+ * byte-swapped and recording DB_AM_SWAP), verifies checksum/LSN via
+ * __db_chk_meta, then runs the access method's meta-page sanity check
+ * unless the caller passed DB_TRUNCATE in oflags.
+ *
+ * NOTE: the `flags' parameter is forwarded to __db_chk_meta and is then
+ * reused locally to hold the on-disk meta flags in the btree case.
+ *
+ * PUBLIC: int __db_meta_setup __P((ENV *,
+ * PUBLIC:     DB *, const char *, DBMETA *, u_int32_t, u_int32_t));
+ */
+int
+__db_meta_setup(env, dbp, name, meta, oflags, flags)
+	ENV *env;
+	DB *dbp;
+	const char *name;
+	DBMETA *meta;
+	u_int32_t oflags;
+	u_int32_t flags;
+{
+	u_int32_t magic;
+	int ret;
+
+	ret = 0;
+
+	/*
+	 * Figure out what access method we're dealing with, and then
+	 * call access method specific code to check error conditions
+	 * based on conflicts between the found file and application
+	 * arguments.  A found file overrides some user information --
+	 * we don't consider it an error, for example, if the user set
+	 * an expected byte order and the found file doesn't match it.
+	 */
+	F_CLR(dbp, DB_AM_SWAP | DB_AM_IN_RENAME);
+	magic = meta->magic;
+
+swap_retry:
+	switch (magic) {
+	case DB_BTREEMAGIC:
+	case DB_HASHMAGIC:
+	case DB_QAMMAGIC:
+	case DB_RENAMEMAGIC:
+		break;
+	case 0:
+		/*
+		 * The only time this should be 0 is if we're in the
+		 * midst of opening a subdb during recovery and that
+		 * subdatabase had its meta-data page allocated, but
+		 * not yet initialized.
+		 */
+		if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(env) &&
+		    F_ISSET(env->lg_handle, DBLOG_FORCE_OPEN)) ||
+		    meta->pgno != PGNO_INVALID))
+			return (ENOENT);
+
+		goto bad_format;
+	default:
+		/* Unrecognized magic: try once with bytes swapped. */
+		if (F_ISSET(dbp, DB_AM_SWAP))
+			goto bad_format;
+
+		M_32_SWAP(magic);
+		F_SET(dbp, DB_AM_SWAP);
+		goto swap_retry;
+	}
+
+	/*
+	 * We can only check the meta page if we are sure we have a meta page.
+	 * If it is random data, then this check can fail.  So only now can we
+	 * checksum and decrypt.  Don't distinguish between configuration and
+	 * checksum match errors here, because we haven't opened the database
+	 * and even a checksum error isn't a reason to panic the environment.
+	 */
+	if ((ret = __db_chk_meta(env, dbp, meta, flags)) != 0) {
+		if (ret == -1)
+			__db_errx(env,
+			    "%s: metadata page checksum error", name);
+		goto bad_format;
+	}
+
+	switch (magic) {
+	case DB_BTREEMAGIC:
+		if (dbp->type != DB_UNKNOWN &&
+		    dbp->type != DB_RECNO && dbp->type != DB_BTREE)
+			goto bad_format;
+
+		/* BTM_RECNO in the on-disk flags selects recno over btree. */
+		flags = meta->flags;
+		if (F_ISSET(dbp, DB_AM_SWAP))
+			M_32_SWAP(flags);
+		if (LF_ISSET(BTM_RECNO))
+			dbp->type = DB_RECNO;
+		else
+			dbp->type = DB_BTREE;
+		if ((oflags & DB_TRUNCATE) == 0 && (ret =
+		    __bam_metachk(dbp, name, (BTMETA *)meta)) != 0)
+			return (ret);
+		break;
+	case DB_HASHMAGIC:
+		if (dbp->type != DB_UNKNOWN && dbp->type != DB_HASH)
+			goto bad_format;
+
+		dbp->type = DB_HASH;
+		if ((oflags & DB_TRUNCATE) == 0 && (ret =
+		    __ham_metachk(dbp, name, (HMETA *)meta)) != 0)
+			return (ret);
+		break;
+	case DB_QAMMAGIC:
+		if (dbp->type != DB_UNKNOWN && dbp->type != DB_QUEUE)
+			goto bad_format;
+		dbp->type = DB_QUEUE;
+		if ((oflags & DB_TRUNCATE) == 0 && (ret =
+		    __qam_metachk(dbp, name, (QMETA *)meta)) != 0)
+			return (ret);
+		break;
+	case DB_RENAMEMAGIC:
+		F_SET(dbp, DB_AM_IN_RENAME);
+
+		/* Copy the file's ID. */
+		memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+
+		break;
+	default:
+		goto bad_format;
+	}
+
+	if (FLD_ISSET(meta->metaflags,
+	    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK))
+		if ((ret =
+		    __partition_init(dbp, meta->metaflags)) != 0)
+			return (ret);
+	return (0);
+
+bad_format:
+	/* During recovery a missing/garbled page is expected, not an error. */
+	if (F_ISSET(dbp, DB_AM_RECOVER))
+		ret = ENOENT;
+	else
+		__db_errx(env,
+		    "__db_meta_setup: %s: unexpected file type or format",
+		    name);
+	return (ret == 0 ? EINVAL : ret);
+}
diff --git a/db/db_overflow.c b/db/db_overflow.c
new file mode 100644
index 0000000..a718182
--- /dev/null
+++ b/db/db_overflow.c
@@ -0,0 +1,706 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * Big key/data code.
+ *
+ * Big key and data entries are stored on linked lists of pages. The initial
+ * reference is a structure with the total length of the item and the page
+ * number where it begins. Each entry in the linked list contains a pointer
+ * to the next page of data, and so on.
+ */
+
+/*
+ * __db_goff --
+ *	Get an offpage item.
+ *
+ *	Walk the linked chain of overflow pages starting at "pgno" and
+ *	copy the requested bytes of the item (total length "tlen") into
+ *	the DBT, honoring the DBT's DB_DBT_* memory-management flags.
+ *	"bpp"/"bpsz" are an optional caller-owned scratch buffer and its
+ *	current size; the buffer is grown via __os_realloc and reused
+ *	across calls.  Returns 0, DB_BUFFER_SMALL, or an error from the
+ *	memory pool / allocator.
+ *
+ * PUBLIC: int __db_goff __P((DBC *,
+ * PUBLIC:     DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *));
+ */
+int
+__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz)
+	DBC *dbc;
+	DBT *dbt;
+	u_int32_t tlen;
+	db_pgno_t pgno;
+	void **bpp;
+	u_int32_t *bpsz;
+{
+	DB *dbp;
+	DB_MPOOLFILE *mpf;
+	DB_TXN *txn;
+	DBC_INTERNAL *cp;
+	ENV *env;
+	PAGE *h;
+	DB_THREAD_INFO *ip;
+	db_indx_t bytes;
+	u_int32_t curoff, needed, start;
+	u_int8_t *p, *src;
+	int ret;
+
+	dbp = dbc->dbp;
+	cp = dbc->internal;
+	env = dbp->env;
+	ip = dbc->thread_info;
+	mpf = dbp->mpf;
+	txn = dbc->txn;
+
+	/*
+	 * Check if the buffer is big enough; if it is not and we are
+	 * allowed to malloc space, then we'll malloc it.  If we are
+	 * not (DB_DBT_USERMEM), then we'll set the dbt and return
+	 * appropriately.
+	 */
+	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+		/* Clamp the partial request to the bytes actually present. */
+		start = dbt->doff;
+		if (start > tlen)
+			needed = 0;
+		else if (dbt->dlen > tlen - start)
+			needed = tlen - start;
+		else
+			needed = dbt->dlen;
+	} else {
+		start = 0;
+		needed = tlen;
+	}
+
+	/*
+	 * If the caller has not requested any data, return success. This
+	 * "early-out" also avoids setting up the streaming optimization when
+	 * no page would be retrieved. If it were removed, the streaming code
+	 * should only initialize when needed is not 0.
+	 */
+	if (needed == 0) {
+		dbt->size = 0;
+		return (0);
+	}
+
+	/* With a user-copy callback there is no buffer to allocate. */
+	if (F_ISSET(dbt, DB_DBT_USERCOPY))
+		goto skip_alloc;
+
+	/* Allocate any necessary memory. */
+	if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+		if (needed > dbt->ulen) {
+			dbt->size = needed;
+			return (DB_BUFFER_SMALL);
+		}
+	} else if (F_ISSET(dbt, DB_DBT_MALLOC)) {
+		if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0)
+			return (ret);
+	} else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+		if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0)
+			return (ret);
+	} else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) {
+		/* Grow the caller's scratch buffer and record its new size. */
+		if ((ret = __os_realloc(env, needed, bpp)) != 0)
+			return (ret);
+		*bpsz = needed;
+		dbt->data = *bpp;
+	} else if (bpp != NULL)
+		dbt->data = *bpp;
+	else {
+		DB_ASSERT(env,
+		    F_ISSET(dbt,
+		    DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) ||
+		    bpsz != NULL || bpp != NULL);
+		return (DB_BUFFER_SMALL);
+	}
+
+skip_alloc:
+	/*
+	 * Set up a start page in the overflow chain if streaming: if the
+	 * cursor has cached a position in this same chain and the requested
+	 * offset falls at or after it, resume from the cached page instead
+	 * of walking the chain from the beginning.
+	 */
+	if (cp->stream_start_pgno != PGNO_INVALID &&
+	    pgno == cp->stream_start_pgno && start >= cp->stream_off &&
+	    start < cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+		pgno = cp->stream_curr_pgno;
+		curoff = cp->stream_off;
+	} else {
+		cp->stream_start_pgno = cp->stream_curr_pgno = pgno;
+		cp->stream_off = curoff = 0;
+	}
+
+	/*
+	 * Step through the linked list of pages, copying the data on each
+	 * one into the buffer. Never copy more than the total data length.
+	 */
+	dbt->size = needed;
+	for (p = dbt->data; pgno != PGNO_INVALID && needed > 0;) {
+		if ((ret = __memp_fget(mpf,
+		    &pgno, ip, txn, 0, &h)) != 0)
+			return (ret);
+		DB_ASSERT(env, TYPE(h) == P_OVERFLOW);
+
+		/* Check if we need any bytes from this page. */
+		if (curoff + OV_LEN(h) >= start) {
+			bytes = OV_LEN(h);
+			src = (u_int8_t *)h + P_OVERHEAD(dbp);
+			if (start > curoff) {
+				/* Skip the part of the page before "start". */
+				src += start - curoff;
+				bytes -= start - curoff;
+			}
+			if (bytes > needed)
+				bytes = needed;
+			if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+				/*
+				 * The offset into the DBT is the total size
+				 * less the amount of data still needed. Care
+				 * needs to be taken if doing a partial copy
+				 * beginning at an offset other than 0.
+				 */
+				if ((ret = env->dbt_usercopy(
+				    dbt, dbt->size - needed,
+				    src, bytes, DB_USERCOPY_SETDATA)) != 0) {
+					(void)__memp_fput(mpf,
+					    ip, h, dbp->priority);
+					return (ret);
+				}
+			} else
+				memcpy(p, src, bytes);
+			p += bytes;
+			needed -= bytes;
+		}
+		/* Cache this position so a later streaming call can resume. */
+		cp->stream_off = curoff;
+		curoff += OV_LEN(h);
+		cp->stream_curr_pgno = pgno;
+		pgno = h->next_pgno;
+		(void)__memp_fput(mpf, ip, h, dbp->priority);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_poff --
+ *	Put an offpage item.
+ *
+ *	Copy the key/data item in "dbt" onto a chain of newly allocated
+ *	overflow pages, or -- when DB_DBT_STREAMING is set -- append to
+ *	the last page of the chain cached in the cursor.  The first page
+ *	number of the chain is returned through "pgnop".
+ *
+ * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *));
+ */
+int
+__db_poff(dbc, dbt, pgnop)
+	DBC *dbc;
+	const DBT *dbt;
+	db_pgno_t *pgnop;
+{
+	DB *dbp;
+	DBT tmp_dbt;
+	DB_LSN null_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep, *lastp;
+	db_indx_t pagespace;
+	db_pgno_t pgno;
+	u_int32_t space, sz, tlen;
+	u_int8_t *p;
+	int ret, t_ret;
+
+	/*
+	 * Allocate pages and copy the key/data item into them.  Calculate the
+	 * number of bytes we get for pages we fill completely with a single
+	 * item.
+	 */
+	dbp = dbc->dbp;
+	lastp = NULL;
+	mpf = dbp->mpf;
+	pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+	p = dbt->data;
+	sz = dbt->size;
+
+	/*
+	 * Check whether we are streaming at the end of the overflow item.
+	 * If so, the last pgno and offset will be cached in the cursor.
+	 */
+	if (F_ISSET(dbt, DB_DBT_STREAMING)) {
+		/* tlen is the number of bytes already stored on the chain. */
+		tlen = dbt->size - dbt->dlen;
+		pgno = dbc->internal->stream_curr_pgno;
+		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info,
+		    dbc->txn, DB_MPOOL_DIRTY, &lastp)) != 0)
+			return (ret);
+
+		/*
+		 * Calculate how much we can write on the last page of the
+		 * overflow item.
+		 */
+		DB_ASSERT(dbp->env,
+		    OV_LEN(lastp) == (tlen - dbc->internal->stream_off));
+		space = pagespace - OV_LEN(lastp);
+
+		/* Only copy as much data as we have. */
+		if (space > dbt->dlen)
+			space = dbt->dlen;
+
+		if (DBC_LOGGING(dbc)) {
+			/* Log the appended bytes as a DB_APPEND_BIG record. */
+			tmp_dbt.data = dbt->data;
+			tmp_dbt.size = space;
+			ZERO_LSN(null_lsn);
+			if ((ret = __db_big_log(dbp, dbc->txn,
+			    &LSN(lastp), 0, DB_APPEND_BIG, pgno,
+			    PGNO_INVALID, PGNO_INVALID, &tmp_dbt,
+			    &LSN(lastp), &null_lsn, &null_lsn)) != 0)
+				goto err;
+		} else
+			LSN_NOT_LOGGED(LSN(lastp));
+
+		memcpy((u_int8_t *)lastp + P_OVERHEAD(dbp) + OV_LEN(lastp),
+		    dbt->data, space);
+		OV_LEN(lastp) += space;
+		/*
+		 * NOTE(review): the remaining byte count assumes dbt->doff
+		 * is the length already stored on the chain (streaming
+		 * appends at the end) -- confirm against callers setting
+		 * DB_DBT_STREAMING.
+		 */
+		sz -= space + dbt->doff;
+		p += space;
+		*pgnop = dbc->internal->stream_start_pgno;
+	}
+
+	ret = 0;
+	for (; sz > 0; p += pagespace, sz -= pagespace) {
+		/*
+		 * Reduce pagespace so we terminate the loop correctly and
+		 * don't copy too much data.
+		 */
+		if (sz < pagespace)
+			pagespace = sz;
+
+		/*
+		 * Allocate and initialize a new page and copy all or part of
+		 * the item onto the page.  If sz is less than pagespace, we
+		 * have a partial record.
+		 */
+		if ((ret = __db_new(dbc, P_OVERFLOW, NULL, &pagep)) != 0)
+			break;
+		if (DBC_LOGGING(dbc)) {
+			/* Log the new page and its links (DB_ADD_BIG). */
+			tmp_dbt.data = p;
+			tmp_dbt.size = pagespace;
+			ZERO_LSN(null_lsn);
+			if ((ret = __db_big_log(dbp, dbc->txn,
+			    &LSN(pagep), 0, DB_ADD_BIG, PGNO(pagep),
+			    lastp ? PGNO(lastp) : PGNO_INVALID,
+			    PGNO_INVALID, &tmp_dbt, &LSN(pagep),
+			    lastp == NULL ? &null_lsn : &LSN(lastp),
+			    &null_lsn)) != 0) {
+				(void)__memp_fput(mpf, dbc->thread_info,
+				    pagep, dbc->priority);
+				goto err;
+			}
+		} else
+			LSN_NOT_LOGGED(LSN(pagep));
+
+		/* Move LSN onto page. */
+		if (lastp != NULL)
+			LSN(lastp) = LSN(pagep);
+
+		OV_LEN(pagep) = pagespace;
+		OV_REF(pagep) = 1;
+		memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace);
+
+		/*
+		 * If this is the first entry, update the user's info and
+		 * initialize the cursor to allow for streaming of subsequent
+		 * updates.  Otherwise, update the entry on the last page
+		 * filled in and release that page.
+		 */
+		if (lastp == NULL) {
+			*pgnop = PGNO(pagep);
+			dbc->internal->stream_start_pgno =
+			    dbc->internal->stream_curr_pgno = *pgnop;
+			dbc->internal->stream_off = 0;
+		} else {
+			/* Link the new page onto the end of the chain. */
+			lastp->next_pgno = PGNO(pagep);
+			pagep->prev_pgno = PGNO(lastp);
+			if ((ret = __memp_fput(mpf,
+			    dbc->thread_info, lastp, dbc->priority)) != 0) {
+				lastp = NULL;
+				goto err;
+			}
+		}
+		lastp = pagep;
+	}
+err:	if (lastp != NULL) {
+		if (ret == 0) {
+			/* Cache the chain's tail for later streaming. */
+			dbc->internal->stream_curr_pgno = PGNO(lastp);
+			dbc->internal->stream_off = dbt->size - OV_LEN(lastp);
+		}
+
+		if ((t_ret = __memp_fput(mpf, dbc->thread_info, lastp,
+		    dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	return (ret);
+}
+
+/*
+ * __db_ovref --
+ *	Decrement the reference count on an overflow page.
+ *
+ *	The page is fetched dirty, the adjustment (-1) is logged before
+ *	the page is modified, and the page is put back to the pool.
+ *
+ * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t));
+ */
+int
+__db_ovref(dbc, pgno)
+	DBC *dbc;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+
+	if ((ret = __memp_fget(mpf, &pgno,
+	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &h)) != 0)
+		return (ret);
+
+	if (DBC_LOGGING(dbc)) {
+		/* Write-ahead log the -1 adjustment before changing OV_REF. */
+		if ((ret = __db_ovref_log(dbp,
+		    dbc->txn, &LSN(h), 0, h->pgno, -1, &LSN(h))) != 0) {
+			(void)__memp_fput(mpf,
+			    dbc->thread_info, h, dbc->priority);
+			return (ret);
+		}
+	} else
+		LSN_NOT_LOGGED(LSN(h));
+
+	/*
+	 * In BDB releases before 4.5, the overflow reference counts were
+	 * incremented when an overflow item was split onto an internal
+	 * page.  There was a lock race in that code, and rather than fix
+	 * the race, we changed BDB to copy overflow items when splitting
+	 * them onto internal pages.  The code to decrement reference
+	 * counts remains so databases already in the field continue to
+	 * work.
+	 */
+	--OV_REF(h);
+
+	return (__memp_fput(mpf, dbc->thread_info, h, dbc->priority));
+}
+
+/*
+ * __db_doff --
+ *	Delete an offpage chain of overflow pages.
+ *
+ *	Walk the chain starting at "pgno", freeing each page via
+ *	__db_free.  If a page is shared (OV_REF > 1), only its reference
+ *	count is decremented and the traversal stops.
+ *
+ * PUBLIC: int __db_doff __P((DBC *, db_pgno_t));
+ */
+int
+__db_doff(dbc, pgno)
+	DBC *dbc;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DBT tmp_dbt;
+	DB_LSN null_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+
+	do {
+		if ((ret = __memp_fget(mpf, &pgno,
+		    dbc->thread_info, dbc->txn, 0, &pagep)) != 0)
+			return (ret);
+
+		DB_ASSERT(dbp->env, TYPE(pagep) == P_OVERFLOW);
+		/*
+		 * If it's referenced by more than one key/data item,
+		 * decrement the reference count and return.
+		 */
+		if (OV_REF(pagep) > 1) {
+			(void)__memp_fput(mpf,
+			    dbc->thread_info, pagep, dbc->priority);
+			return (__db_ovref(dbc, pgno));
+		}
+
+		/* Mark the page dirty before logging and freeing it. */
+		if ((ret = __memp_dirty(mpf, &pagep,
+		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) {
+			if (pagep != NULL)
+				(void)__memp_fput(mpf,
+				    dbc->thread_info, pagep, dbc->priority);
+			return (ret);
+		}
+
+		if (DBC_LOGGING(dbc)) {
+			/*
+			 * The log record (DB_REM_BIG) carries the page's
+			 * data so the deletion can be undone.
+			 */
+			tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+			tmp_dbt.size = OV_LEN(pagep);
+			ZERO_LSN(null_lsn);
+			if ((ret = __db_big_log(dbp, dbc->txn,
+			    &LSN(pagep), 0, DB_REM_BIG,
+			    PGNO(pagep), PREV_PGNO(pagep),
+			    NEXT_PGNO(pagep), &tmp_dbt,
+			    &LSN(pagep), &null_lsn, &null_lsn)) != 0) {
+				(void)__memp_fput(mpf,
+				    dbc->thread_info, pagep, dbc->priority);
+				return (ret);
+			}
+		} else
+			LSN_NOT_LOGGED(LSN(pagep));
+		/* Save the link before the page is handed to the free list. */
+		pgno = pagep->next_pgno;
+		OV_LEN(pagep) = 0;
+		if ((ret = __db_free(dbc, pagep)) != 0)
+			return (ret);
+	} while (pgno != PGNO_INVALID);
+
+	return (0);
+}
+
+/*
+ * __db_moff --
+ *	Match on overflow pages.
+ *
+ * Given a starting page number and a key, return <0, 0, >0 to indicate if the
+ * key on the page is less than, equal to or greater than the key specified.
+ * We optimize this by doing chunk at a time comparison unless the user has
+ * specified a comparison function.  In this case, we need to materialize
+ * the entire object and call their comparison routine.
+ *
+ * __db_moff and __db_coff are generic functions useful in searching and
+ * ordering off page items. __db_moff matches an overflow DBT with an offpage
+ * item. __db_coff compares two offpage items for lexicographic sort order.
+ *
+ *	dbt:	the in-memory key to compare.
+ *	pgno:	first page of the offpage item's chain.
+ *	tlen:	total length of the offpage item.
+ *	cmpp:	set to the comparison result.
+ *
+ * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t,
+ * PUBLIC:     int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp)
+	DBC *dbc;
+	const DBT *dbt;
+	db_pgno_t pgno;
+	u_int32_t tlen;
+	int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+	DB *dbp;
+	DBT local_dbt;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	PAGE *pagep;
+	void *buf;
+	u_int32_t bufsize, cmp_bytes, key_left;
+	u_int8_t *p1, *p2;
+	int ret;
+
+	dbp = dbc->dbp;
+	ip = dbc->thread_info;
+	mpf = dbp->mpf;
+
+	/*
+	 * If there is a user-specified comparison function, build a
+	 * contiguous copy of the key, and call it.
+	 */
+	if (cmpfunc != NULL) {
+		memset(&local_dbt, 0, sizeof(local_dbt));
+		buf = NULL;
+		bufsize = 0;
+
+		/* Materialize the whole offpage item into "buf". */
+		if ((ret = __db_goff(dbc,
+		    &local_dbt, tlen, pgno, &buf, &bufsize)) != 0)
+			return (ret);
+		/* Pass the key as the first argument */
+		*cmpp = cmpfunc(dbp, dbt, &local_dbt);
+		__os_free(dbp->env, buf);
+		return (0);
+	}
+
+	/* While there are both keys to compare. */
+	for (*cmpp = 0, p1 = dbt->data,
+	    key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) {
+		if ((ret =
+		    __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0)
+			return (ret);
+
+		/* Compare at most one page's worth of bytes per iteration. */
+		cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left;
+		tlen -= cmp_bytes;
+		key_left -= cmp_bytes;
+		for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp);
+		    cmp_bytes-- > 0; ++p1, ++p2)
+			if (*p1 != *p2) {
+				*cmpp = (long)*p1 - (long)*p2;
+				break;
+			}
+		pgno = NEXT_PGNO(pagep);
+		if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0)
+			return (ret);
+		if (*cmpp != 0)
+			return (0);
+	}
+	/* All compared bytes matched: order by length. */
+	if (key_left > 0)		/* DBT is longer than the page key. */
+		*cmpp = 1;
+	else if (tlen > 0)		/* DBT is shorter than the page key. */
+		*cmpp = -1;
+	else
+		*cmpp = 0;
+
+	return (0);
+}
+
+/*
+ * __db_coff --
+ *	Match two offpage dbts.
+ *
+ * The DBTs must both refer to offpage items.
+ * The match happens a chunk (page) at a time unless a user defined comparison
+ * function exists. It is not possible to optimize this comparison away when
+ * a lexicographic sort order is required on mismatch.
+ *
+ * NOTE: For now this function only works for H_OFFPAGE type items. It would
+ * be simple to extend it for use with B_OVERFLOW type items. It would only
+ * require extracting the total length, and page number, dependent on the
+ * DBT type.
+ *
+ * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *,
+ * PUBLIC:     int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__db_coff(dbc, dbt, match, cmpfunc, cmpp)
+	DBC *dbc;
+	const DBT *dbt, *match;
+	int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_MPOOLFILE *mpf;
+	DB_TXN *txn;
+	DBT local_key, local_match;
+	PAGE *dbt_pagep, *match_pagep;
+	db_pgno_t dbt_pgno, match_pgno;
+	u_int32_t cmp_bytes, dbt_bufsz, dbt_len, match_bufsz;
+	u_int32_t match_len, max_data, page_sz;
+	u_int8_t *p1, *p2;
+	int ret;
+	void *dbt_buf, *match_buf;
+
+	dbp = dbc->dbp;
+	ip = dbc->thread_info;
+	txn = dbc->txn;
+	mpf = dbp->mpf;
+	page_sz = dbp->pgsize;
+	*cmpp = 0;
+	dbt_buf = match_buf = NULL;
+
+	DB_ASSERT(dbp->env, HPAGE_PTYPE(dbt->data) == H_OFFPAGE);
+	DB_ASSERT(dbp->env, HPAGE_PTYPE(match->data) == H_OFFPAGE);
+
+	/* Extract potentially unaligned length and pgno fields from DBTs */
+	memcpy(&dbt_len, HOFFPAGE_TLEN(dbt->data), sizeof(u_int32_t));
+	memcpy(&dbt_pgno, HOFFPAGE_PGNO(dbt->data), sizeof(db_pgno_t));
+	memcpy(&match_len, HOFFPAGE_TLEN(match->data), sizeof(u_int32_t));
+	memcpy(&match_pgno, HOFFPAGE_PGNO(match->data), sizeof(db_pgno_t));
+	max_data = (dbt_len < match_len ? dbt_len : match_len);
+
+	/*
+	 * If there is a custom comparator, fully resolve both DBTs.
+	 * Then call the users comparator.
+	 */
+	if (cmpfunc != NULL) {
+		memset(&local_key, 0, sizeof(local_key));
+		memset(&local_match, 0, sizeof(local_match));
+		dbt_buf = match_buf = NULL;
+		dbt_bufsz = match_bufsz = 0;
+
+		if ((ret = __db_goff(dbc, &local_key, dbt_len,
+		    dbt_pgno, &dbt_buf, &dbt_bufsz)) != 0)
+			goto err1;
+		if ((ret = __db_goff(dbc, &local_match, match_len,
+		    match_pgno, &match_buf, &match_bufsz)) != 0)
+			goto err1;
+		/* The key needs to be the first argument for sort order */
+		*cmpp = cmpfunc(dbp, &local_key, &local_match);
+
+err1:		if (dbt_buf != NULL)
+			__os_free(dbp->env, dbt_buf);
+		if (match_buf != NULL)
+			__os_free(dbp->env, match_buf);
+		return (ret);
+	}
+
+	/* Match the offpage DBTs a page at a time. */
+	while (dbt_pgno != PGNO_INVALID && match_pgno != PGNO_INVALID) {
+		/* Pin one page of each chain; release both on any error. */
+		if ((ret =
+		    __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0)
+			return (ret);
+		if ((ret =
+		    __memp_fget(mpf, &match_pgno,
+		    ip, txn, 0, &match_pagep)) != 0) {
+			(void)__memp_fput(
+			    mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED);
+			return (ret);
+		}
+		cmp_bytes = page_sz < max_data ? page_sz : max_data;
+		for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp),
+		    p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp);
+		    cmp_bytes-- > 0; ++p1, ++p2)
+			if (*p1 != *p2) {
+				*cmpp = (long)*p1 - (long)*p2;
+				break;
+			}
+
+		dbt_pgno = NEXT_PGNO(dbt_pagep);
+		match_pgno = NEXT_PGNO(match_pagep);
+		max_data -= page_sz;
+		if ((ret = __memp_fput(mpf,
+		    ip, dbt_pagep, DB_PRIORITY_UNCHANGED)) != 0) {
+			(void)__memp_fput(mpf,
+			    ip, match_pagep, DB_PRIORITY_UNCHANGED);
+			return (ret);
+		}
+		if ((ret = __memp_fput(mpf,
+		    ip, match_pagep, DB_PRIORITY_UNCHANGED)) != 0)
+			return (ret);
+		if (*cmpp != 0)
+			return (0);
+	}
+
+	/*
+	 * If a lexicographic mismatch was found, then the result has already
+	 * been returned.  If the DBTs matched, consider the lengths of the
+	 * items, and return appropriately.
+	 */
+	if (dbt_len > match_len)	/* DBT is longer than the match key. */
+		*cmpp = 1;
+	else if (match_len > dbt_len)	/* DBT is shorter than the match key. */
+		*cmpp = -1;
+	else
+		*cmpp = 0;
+
+	return (0);
+
+}
diff --git a/db/db_ovfl_vrfy.c b/db/db_ovfl_vrfy.c
new file mode 100644
index 0000000..fdd0201
--- /dev/null
+++ b/db/db_ovfl_vrfy.c
@@ -0,0 +1,409 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_vrfy_overflow --
+ *	Verify overflow page.
+ *
+ *	Runs the generic data-page checks, then records the page's
+ *	reference count and data length in the VRFY_PAGEINFO for use
+ *	by the later structure pass (__db_vrfy_ovfl_structure).
+ *
+ * PUBLIC: int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC:     u_int32_t));
+ */
+int
+__db_vrfy_overflow(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	VRFY_PAGEINFO *pip;
+	int isbad, ret, t_ret;
+
+	isbad = 0;
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+		/* DB_VERIFY_BAD is recoverable; keep checking this page. */
+		if (ret == DB_VERIFY_BAD)
+			isbad = 1;
+		else
+			goto err;
+	}
+
+	pip->refcount = OV_REF(h);
+	if (pip->refcount < 1) {
+		EPRINT((dbp->env,
+		    "Page %lu: overflow page has zero reference count",
+		    (u_long)pgno));
+		isbad = 1;
+	}
+
+	/* Just store for now; the structure pass sums these lengths. */
+	pip->olen = HOFFSET(h);
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0)
+		ret = t_ret;
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_ovfl_structure --
+ *	Walk a list of overflow pages, avoiding cycles and marking
+ *	pages seen.
+ *
+ *	pgno:	first page of the overflow chain.
+ *	tlen:	expected total length of the overflow item.
+ *	flags:	DB_ST_OVFL_LEAF indicates the chain is referenced from a
+ *	    leaf (or Hash data) page; DB_SALVAGE suppresses feedback.
+ *
+ * PUBLIC: int __db_vrfy_ovfl_structure
+ * PUBLIC:     __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t tlen;
+	u_int32_t flags;
+{
+	DB *pgset;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t next, prev;
+	int isbad, ret, seen_cnt, t_ret;
+	u_int32_t refcount;
+
+	env = dbp->env;
+	pgset = vdp->pgset;
+	DB_ASSERT(env, pgset != NULL);
+	isbad = 0;
+
+	/* This shouldn't happen, but just to be sure. */
+	if (!IS_VALID_PGNO(pgno))
+		return (DB_VERIFY_BAD);
+
+	/*
+	 * Check the first prev_pgno; it ought to be PGNO_INVALID,
+	 * since there's no prev page.
+	 */
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	/* The refcount is stored on the first overflow page. */
+	refcount = pip->refcount;
+
+	if (pip->type != P_OVERFLOW) {
+		EPRINT((env,
+		    "Page %lu: overflow page of invalid type %lu",
+		    (u_long)pgno, (u_long)pip->type));
+		ret = DB_VERIFY_BAD;
+		goto err;		/* Unsafe to continue. */
+	}
+
+	prev = pip->prev_pgno;
+	if (prev != PGNO_INVALID) {
+		EPRINT((env,
+	    "Page %lu: first page in overflow chain has a prev_pgno %lu",
+		    (u_long)pgno, (u_long)prev));
+		isbad = 1;
+	}
+
+	for (;;) {
+		/*
+		 * We may have seen this page elsewhere, if the overflow entry
+		 * has been promoted to an internal page; we just want to
+		 * make sure that each overflow page is seen exactly as many
+		 * times as its refcount dictates.
+		 *
+		 * Note that this code also serves to keep us from looping
+		 * infinitely if there's a cycle in an overflow chain.
+		 */
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, pgno, &seen_cnt)) != 0)
+			goto err;
+		if ((u_int32_t)seen_cnt > refcount) {
+			EPRINT((env,
+		"Page %lu: encountered too many times in overflow traversal",
+			    (u_long)pgno));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		if ((ret =
+		    __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
+			goto err;
+
+		/*
+		 * Each overflow page can be referenced multiple times,
+		 * because it's possible for overflow Btree keys to get
+		 * promoted to internal pages.  We want to make sure that
+		 * each page is referenced from a Btree leaf (or Hash data
+		 * page, which we consider a "leaf" here) exactly once; if
+		 * the parent was a leaf, set a flag to indicate that we've
+		 * seen this page in a leaf context.
+		 *
+		 * If the parent is not a leaf--in which case it's a Btree
+		 * internal page--we don't need to bother doing any further
+		 * verification, as we'll do it when we hit the leaf (or
+		 * complain that we never saw the leaf).  Only the first
+		 * page in an overflow chain should ever have a refcount
+		 * greater than 1, and the combination of the LEAFSEEN check
+		 * and the fact that we bail after the first page for
+		 * non-leaves should ensure this.
+		 *
+		 * Note that each "child" of a page, such as an overflow page,
+		 * is stored and verified in a structure check exactly once,
+		 * so this code does not need to contend with the fact that
+		 * overflow chains used as Btree duplicate keys may be
+		 * referenced multiply from a single Btree leaf page.
+		 */
+		if (LF_ISSET(DB_ST_OVFL_LEAF)) {
+			if (F_ISSET(pip, VRFY_OVFL_LEAFSEEN)) {
+				EPRINT((env,
+		"Page %lu: overflow page linked twice from leaf or data page",
+				    (u_long)pgno));
+				ret = DB_VERIFY_BAD;
+				goto err;
+			}
+			F_SET(pip, VRFY_OVFL_LEAFSEEN);
+		}
+
+		/*
+		 * We want to verify each overflow chain only once, and
+		 * although no chain should be linked more than once from a
+		 * leaf page, we can't guarantee that it'll be linked that
+		 * once if it's linked from an internal page and the key
+		 * is gone.
+		 *
+		 * seen_cnt is the number of times we'd encountered this page
+		 * before calling this function.
+		 */
+		if (seen_cnt == 0) {
+			/*
+			 * Keep a running tab on how much of the item we've
+			 * seen.
+			 *
+			 * NOTE(review): tlen is unsigned; if the chain holds
+			 * more data than expected this wraps to a huge value
+			 * and the final "tlen > 0" check flags the item --
+			 * but with an "incomplete" message.
+			 */
+			tlen -= pip->olen;
+
+			/* Send the application feedback about our progress. */
+			if (!LF_ISSET(DB_SALVAGE))
+				__db_vrfy_struct_feedback(dbp, vdp);
+		} else
+			goto done;
+
+		next = pip->next_pgno;
+
+		/* Are we there yet? */
+		if (next == PGNO_INVALID)
+			break;
+
+		/*
+		 * We've already checked this when we saved it, but just
+		 * to be sure...
+		 */
+		if (!IS_VALID_PGNO(next)) {
+			EPRINT((env,
+			    "Page %lu: bad next_pgno %lu on overflow page",
+			    (u_long)pgno, (u_long)next));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+
+		/* Swap the current page's info for the next page's. */
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 ||
+		    (ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0)
+			return (ret);
+		if (pip->prev_pgno != pgno) {
+			EPRINT((env,
+		"Page %lu: bad prev_pgno %lu on overflow page (should be %lu)",
+			    (u_long)next, (u_long)pip->prev_pgno,
+			    (u_long)pgno));
+			isbad = 1;
+			/*
+			 * It's safe to continue because we have separate
+			 * cycle detection.
+			 */
+		}
+
+		pgno = next;
+	}
+
+	if (tlen > 0) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: overflow item incomplete", (u_long)pgno));
+	}
+
+done:
+err:	if ((t_ret =
+	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_safe_goff --
+ *	Get an overflow item, very carefully, from an untrusted database,
+ *	in the context of the salvager.
+ *
+ *	"buf" is really a (void **): the address of the caller's buffer
+ *	pointer, passed as void * so __os_realloc can grow the buffer in
+ *	place; "bufsz" is the buffer's current size.  On success (or, with
+ *	DB_AGGRESSIVE, on partial success) "dbt" is pointed at the
+ *	collected bytes.
+ *
+ * PUBLIC: int __db_safe_goff __P((DB *, VRFY_DBINFO *,
+ * PUBLIC:     db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
+ */
+int
+__db_safe_goff(dbp, vdp, pgno, dbt, buf, bufsz, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	DBT *dbt;
+	void *buf;
+	u_int32_t *bufsz;
+	u_int32_t flags;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret, t_ret;
+	u_int32_t bytesgot, bytes;
+	u_int8_t *src, *dest;
+
+	mpf = dbp->mpf;
+	h = NULL;
+	ret = t_ret = 0;
+	bytesgot = bytes = 0;
+
+	DB_ASSERT(dbp->env, bufsz != NULL);
+
+	/*
+	 * Back up to the start of the overflow chain (if necessary) via the
+	 * prev pointer of the overflow page.  This guarantees we transverse
+	 * the longest possible chains of overflow pages and won't be called
+	 * again with a pgno earlier in the chain, stepping on ourselves.
+	 */
+	for (;;) {
+		if ((ret = __memp_fget(
+		    mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+			return (ret);
+
+		if (PREV_PGNO(h) == PGNO_INVALID ||
+		    !IS_VALID_PGNO(PREV_PGNO(h)))
+			break;
+
+		pgno = PREV_PGNO(h);
+
+		/* Release the page before fetching its predecessor. */
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+			return (ret);
+	}
+	if ((ret = __memp_fput(
+	    mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+		return (ret);
+
+	h = NULL;
+
+	/* Walk forward, accumulating each page's data into the buffer. */
+	while ((pgno != PGNO_INVALID) && (IS_VALID_PGNO(pgno))) {
+		/*
+		 * Mark that we're looking at this page; if we've seen it
+		 * already, quit.
+		 */
+		if ((ret = __db_salvage_markdone(vdp, pgno)) != 0)
+			break;
+
+		if ((ret = __memp_fget(mpf, &pgno,
+		    vdp->thread_info, NULL, 0, &h)) != 0)
+			break;
+
+		/*
+		 * Make sure it's really an overflow page, unless we're
+		 * being aggressive, in which case we pretend it is.
+		 */
+		if (!LF_ISSET(DB_AGGRESSIVE) && TYPE(h) != P_OVERFLOW) {
+			ret = DB_VERIFY_BAD;
+			break;
+		}
+
+		src = (u_int8_t *)h + P_OVERHEAD(dbp);
+		bytes = OV_LEN(h);
+
+		/* Don't trust OV_LEN: never read past the end of the page. */
+		if (bytes + P_OVERHEAD(dbp) > dbp->pgsize)
+			bytes = dbp->pgsize - P_OVERHEAD(dbp);
+
+		/*
+		 * Realloc if buf is too small
+		 */
+		if (bytesgot + bytes > *bufsz) {
+			if ((ret =
+			    __os_realloc(dbp->env, bytesgot + bytes, buf)) != 0)
+				break;
+			*bufsz = bytesgot + bytes;
+		}
+
+		dest = *(u_int8_t **)buf + bytesgot;
+		bytesgot += bytes;
+
+		memcpy(dest, src, bytes);
+
+		pgno = NEXT_PGNO(h);
+
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+			break;
+		h = NULL;
+	}
+
+	/*
+	 * If we're being aggressive, salvage a partial datum if there
+	 * was an error somewhere along the way.
+	 */
+	if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) {
+		dbt->size = bytesgot;
+		dbt->data = *(void **)buf;
+	}
+
+	/* If we broke out on error, don't leave pages pinned. */
+	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
diff --git a/db/db_pr.c b/db/db_pr.c
new file mode 100644
index 0000000..69fb7c7
--- /dev/null
+++ b/db/db_pr.c
@@ -0,0 +1,1659 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+#include "dbinc/db_verify.h"
+
+/*
+ * __db_loadme --
+ * A nice place to put a breakpoint.
+ *
+ * PUBLIC: void __db_loadme __P((void));
+ */
+void
+__db_loadme()
+{
+    /*
+     * Fetch our own process ID.  The call has no visible effect; the
+     * function exists only so a debugger has a convenient symbol to
+     * break on.
+     */
+    pid_t my_pid;
+
+    __os_id(NULL, &my_pid, NULL);
+}
+
+#ifdef HAVE_STATISTICS
+static int __db_bmeta __P((DB *, BTMETA *, u_int32_t));
+static int __db_hmeta __P((DB *, HMETA *, u_int32_t));
+static void __db_meta __P((DB *, DBMETA *, FN const *, u_int32_t));
+static const char *__db_pagetype_to_string __P((u_int32_t));
+static void __db_prdb __P((DB *, u_int32_t));
+static void __db_proff __P((ENV *, DB_MSGBUF *, void *));
+static int __db_prtree __P((DB *, DB_TXN *, u_int32_t));
+static int __db_qmeta __P((DB *, QMETA *, u_int32_t));
+
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *, char *, char *));
+ */
+int
+__db_dumptree(dbp, txn, op, name)
+    DB *dbp;        /* open database handle */
+    DB_TXN *txn;    /* transaction for page fetches, may be NULL */
+    char *op, *name;/* op: option characters; name: output file or NULL */
+{
+    ENV *env;
+    FILE *fp, *orig_fp;
+    u_int32_t flags;
+    int ret;
+
+    env = dbp->env;
+
+    /*
+     * Parse the option string: 'a' dumps every page body (DB_PR_PAGE),
+     * 'h' is accepted but does nothing, 'r' enables recovery-test mode
+     * (DB_PR_RECOVERYTEST).  Anything else is an error.
+     */
+    for (flags = 0; *op != '\0'; ++op)
+        switch (*op) {
+        case 'a':
+            LF_SET(DB_PR_PAGE);
+            break;
+        case 'h':
+            break;
+        case 'r':
+            LF_SET(DB_PR_RECOVERYTEST);
+            break;
+        default:
+            return (EINVAL);
+        }
+
+    /*
+     * If a file name was given, temporarily redirect the environment's
+     * message stream to it; the original stream is restored below.
+     */
+    if (name != NULL) {
+        if ((fp = fopen(name, "w")) == NULL)
+            return (__os_get_errno());
+
+        orig_fp = dbp->dbenv->db_msgfile;
+        dbp->dbenv->db_msgfile = fp;
+    } else
+        fp = orig_fp = NULL;
+
+    /* Dump the in-memory DB structure, a separator line, then the tree. */
+    __db_prdb(dbp, flags);
+
+    __db_msg(env, "%s", DB_GLOBAL(db_line));
+
+    ret = __db_prtree(dbp, txn, flags);
+
+    /* Restore the message stream; the fclose result is ignored. */
+    if (fp != NULL) {
+        (void)fclose(fp);
+        env->dbenv->db_msgfile = orig_fp;
+    }
+
+    return (ret);
+}
+
+/*
+ * __db_flags_fn --
+ *	Display names for the DB_AM_* access-method flag bits, used when
+ *	pretty-printing dbp->flags.  The table is terminated by a zero mask.
+ */
+static const FN __db_flags_fn[] = {
+    { DB_AM_CHKSUM, "checksumming" },
+    { DB_AM_COMPENSATE, "created by compensating transaction" },
+    { DB_AM_CREATED, "database created" },
+    { DB_AM_CREATED_MSTR, "encompassing file created" },
+    { DB_AM_DBM_ERROR, "dbm/ndbm error" },
+    { DB_AM_DELIMITER, "variable length" },
+    { DB_AM_DISCARD, "discard cached pages" },
+    { DB_AM_DUP, "duplicates" },
+    { DB_AM_DUPSORT, "sorted duplicates" },
+    { DB_AM_ENCRYPT, "encrypted" },
+    { DB_AM_FIXEDLEN, "fixed-length records" },
+    { DB_AM_INMEM, "in-memory" },
+    { DB_AM_IN_RENAME, "file is being renamed" },
+    { DB_AM_NOT_DURABLE, "changes not logged" },
+    { DB_AM_OPEN_CALLED, "open called" },
+    { DB_AM_PAD, "pad value" },
+    { DB_AM_PGDEF, "default page size" },
+    { DB_AM_RDONLY, "read-only" },
+    { DB_AM_READ_UNCOMMITTED, "read-uncommitted" },
+    { DB_AM_RECNUM, "Btree record numbers" },
+    { DB_AM_RECOVER, "opened for recovery" },
+    { DB_AM_RENUMBER, "renumber" },
+    { DB_AM_REVSPLITOFF, "no reverse splits" },
+    { DB_AM_SECONDARY, "secondary" },
+    { DB_AM_SNAPSHOT, "load on open" },
+    { DB_AM_SUBDB, "subdatabases" },
+    { DB_AM_SWAP, "needswap" },
+    { DB_AM_TXN, "transactional" },
+    { DB_AM_VERIFYING, "verifier" },
+    { 0, NULL }
+};
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+    /* Expose the flag-name table to external callers (the Tcl API). */
+    return (__db_flags_fn);
+}
+
+/*
+ * __db_prdb --
+ * Print out the DB structure information.
+ */
+static void
+__db_prdb(dbp, flags)
+    DB *dbp;
+    u_int32_t flags;
+{
+    BTREE *bt;
+    DB_MSGBUF mb;
+    ENV *env;
+    HASH *h;
+    QUEUE *q;
+
+    env = dbp->env;
+
+    /* Header: access-method type and the dbp->flags word, with names. */
+    DB_MSGBUF_INIT(&mb);
+    __db_msg(env, "In-memory DB structure:");
+    __db_msgadd(env, &mb, "%s: %#lx",
+        __db_dbtype_to_string(dbp->type), (u_long)dbp->flags);
+    __db_prflags(env, &mb, dbp->flags, __db_flags_fn, " (", ")");
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    /*
+     * Per-access-method internals.  Function-pointer values are only
+     * shown outside recovery-test mode (DB_PR_RECOVERYTEST).
+     */
+    switch (dbp->type) {
+    case DB_BTREE:
+    case DB_RECNO:
+        bt = dbp->bt_internal;
+        __db_msg(env, "bt_meta: %lu bt_root: %lu",
+            (u_long)bt->bt_meta, (u_long)bt->bt_root);
+        __db_msg(env, "bt_minkey: %lu", (u_long)bt->bt_minkey);
+        if (!LF_ISSET(DB_PR_RECOVERYTEST))
+            __db_msg(env, "bt_compare: %#lx bt_prefix: %#lx",
+                P_TO_ULONG(bt->bt_compare),
+                P_TO_ULONG(bt->bt_prefix));
+#ifdef HAVE_COMPRESSION
+        if (!LF_ISSET(DB_PR_RECOVERYTEST))
+            __db_msg(env, "bt_compress: %#lx bt_decompress: %#lx",
+                P_TO_ULONG(bt->bt_compress),
+                P_TO_ULONG(bt->bt_decompress));
+#endif
+        __db_msg(env, "bt_lpgno: %lu", (u_long)bt->bt_lpgno);
+        /* Recno-only fields. */
+        if (dbp->type == DB_RECNO) {
+            __db_msg(env,
+    "re_pad: %#lx re_delim: %#lx re_len: %lu re_source: %s",
+                (u_long)bt->re_pad, (u_long)bt->re_delim,
+                (u_long)bt->re_len,
+                bt->re_source == NULL ? "" : bt->re_source);
+            __db_msg(env,
+                "re_modified: %d re_eof: %d re_last: %lu",
+                bt->re_modified, bt->re_eof, (u_long)bt->re_last);
+        }
+        break;
+    case DB_HASH:
+        h = dbp->h_internal;
+        __db_msg(env, "meta_pgno: %lu", (u_long)h->meta_pgno);
+        __db_msg(env, "h_ffactor: %lu", (u_long)h->h_ffactor);
+        __db_msg(env, "h_nelem: %lu", (u_long)h->h_nelem);
+        if (!LF_ISSET(DB_PR_RECOVERYTEST))
+            __db_msg(env, "h_hash: %#lx", P_TO_ULONG(h->h_hash));
+        break;
+    case DB_QUEUE:
+        q = dbp->q_internal;
+        __db_msg(env, "q_meta: %lu", (u_long)q->q_meta);
+        __db_msg(env, "q_root: %lu", (u_long)q->q_root);
+        __db_msg(env, "re_pad: %#lx re_len: %lu",
+            (u_long)q->re_pad, (u_long)q->re_len);
+        __db_msg(env, "rec_page: %lu", (u_long)q->rec_page);
+        __db_msg(env, "page_ext: %lu", (u_long)q->page_ext);
+        break;
+    case DB_UNKNOWN:
+    default:
+        /* Nothing to display for an unknown access method. */
+        break;
+    }
+}
+
+/*
+ * __db_prtree --
+ * Print out the entire tree.
+ */
+static int
+__db_prtree(dbp, txn, flags)
+    DB *dbp;
+    DB_TXN *txn;
+    u_int32_t flags;
+{
+    DB_MPOOLFILE *mpf;
+    PAGE *h;
+    db_pgno_t i, last;
+    int ret;
+
+    mpf = dbp->mpf;
+
+    /* Queue databases are laid out differently; use their own dumper. */
+    if (dbp->type == DB_QUEUE)
+        return (__db_prqueue(dbp, flags));
+
+    /*
+     * Find out the page number of the last page in the database, then
+     * dump each page.
+     */
+    if ((ret = __memp_get_last_pgno(mpf, &last)) != 0)
+        return (ret);
+    for (i = 0; i <= last; ++i) {
+        if ((ret = __memp_fget(mpf, &i, NULL, txn, 0, &h)) != 0)
+            return (ret);
+        /* Per-page print errors are ignored; keep dumping. */
+        (void)__db_prpage(dbp, h, flags);
+        if ((ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0)
+            return (ret);
+    }
+
+    return (0);
+}
+
+/*
+ * __db_meta --
+ * Print out common metadata information.
+ */
+static void
+__db_meta(dbp, dbmeta, fn, flags)
+    DB *dbp;
+    DBMETA *dbmeta;  /* generic meta-data header shared by all AMs */
+    FN const *fn;    /* flag-name table for dbmeta->flags, may be NULL */
+    u_int32_t flags;
+{
+    DB_MPOOLFILE *mpf;
+    DB_MSGBUF mb;
+    ENV *env;
+    PAGE *h;
+    db_pgno_t pgno;
+    u_int8_t *p;
+    int cnt, ret;
+    const char *sep;
+
+    env = dbp->env;
+    mpf = dbp->mpf;
+    DB_MSGBUF_INIT(&mb);
+
+    __db_msg(env, "\tmagic: %#lx", (u_long)dbmeta->magic);
+    __db_msg(env, "\tversion: %lu", (u_long)dbmeta->version);
+    __db_msg(env, "\tpagesize: %lu", (u_long)dbmeta->pagesize);
+    __db_msg(env, "\ttype: %lu", (u_long)dbmeta->type);
+    __db_msg(env, "\tmetaflags %#lx", (u_long)dbmeta->metaflags);
+    __db_msg(env, "\tkeys: %lu\trecords: %lu",
+        (u_long)dbmeta->key_count, (u_long)dbmeta->record_count);
+    /* Partition count is only printed when the database is partitioned. */
+    if (dbmeta->nparts)
+        __db_msg(env, "\tnparts: %lu", (u_long)dbmeta->nparts);
+
+    /*
+     * If we're doing recovery testing, don't display the free list,
+     * it may have changed and that makes the dump diff not work.
+     */
+    if (!LF_ISSET(DB_PR_RECOVERYTEST)) {
+        __db_msgadd(
+            env, &mb, "\tfree list: %lu", (u_long)dbmeta->free);
+        /*
+         * Walk the free-page chain via each page's next_pgno link.
+         * NOTE(review): pgno is advanced to next_pgno *before* it is
+         * printed, so the head page appears only in the header value
+         * and the printed chain ends with 0 (PGNO_INVALID) -- confirm
+         * this matches the intended dump format.
+         */
+        for (pgno = dbmeta->free,
+            cnt = 0, sep = ", "; pgno != PGNO_INVALID;) {
+            if ((ret = __memp_fget(mpf,
+                &pgno, NULL, NULL, 0, &h)) != 0) {
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env,
+            "Unable to retrieve free-list page: %lu: %s",
+                    (u_long)pgno, db_strerror(ret));
+                break;
+            }
+            pgno = h->next_pgno;
+            (void)__memp_fput(mpf, NULL, h, dbp->priority);
+            __db_msgadd(env, &mb, "%s%lu", sep, (u_long)pgno);
+            /* Start a fresh output line every ten entries. */
+            if (++cnt % 10 == 0) {
+                DB_MSGBUF_FLUSH(env, &mb);
+                cnt = 0;
+                sep = "\t";
+            } else
+                sep = ", ";
+        }
+        DB_MSGBUF_FLUSH(env, &mb);
+        __db_msg(env, "\tlast_pgno: %lu", (u_long)dbmeta->last_pgno);
+    }
+
+    /* Access-method-specific flag names, when the caller supplied any. */
+    if (fn != NULL) {
+        DB_MSGBUF_FLUSH(env, &mb);
+        __db_msgadd(env, &mb, "\tflags: %#lx", (u_long)dbmeta->flags);
+        __db_prflags(env, &mb, dbmeta->flags, fn, " (", ")");
+    }
+
+    /* The file's unique ID, byte by byte in hex. */
+    DB_MSGBUF_FLUSH(env, &mb);
+    __db_msgadd(env, &mb, "\tuid: ");
+    for (p = (u_int8_t *)dbmeta->uid,
+        cnt = 0; cnt < DB_FILE_ID_LEN; ++cnt) {
+        __db_msgadd(env, &mb, "%x", *p++);
+        if (cnt < DB_FILE_ID_LEN - 1)
+            __db_msgadd(env, &mb, " ");
+    }
+    DB_MSGBUF_FLUSH(env, &mb);
+}
+
+/*
+ * __db_bmeta --
+ * Print out the btree meta-data page.
+ */
+static int
+__db_bmeta(dbp, h, flags)
+    DB *dbp;
+    BTMETA *h;
+    u_int32_t flags;
+{
+    /* Names for the BTM_* meta-page flag bits. */
+    static const FN fn[] = {
+        { BTM_DUP, "duplicates" },
+        { BTM_RECNO, "recno" },
+        { BTM_RECNUM, "btree:recnum" },
+        { BTM_FIXEDLEN, "recno:fixed-length" },
+        { BTM_RENUMBER, "recno:renumber" },
+        { BTM_SUBDB, "multiple-databases" },
+        { BTM_DUPSORT, "sorted duplicates" },
+        { BTM_COMPRESS, "compressed" },
+        { 0, NULL }
+    };
+    ENV *env;
+
+    env = dbp->env;
+
+    /* Common DBMETA fields first, then the btree-specific ones. */
+    __db_meta(dbp, (DBMETA *)h, fn, flags);
+
+    __db_msg(env, "\tminkey: %lu", (u_long)h->minkey);
+    if (dbp->type == DB_RECNO)
+        __db_msg(env, "\tre_len: %#lx re_pad: %#lx",
+            (u_long)h->re_len, (u_long)h->re_pad);
+    __db_msg(env, "\troot: %lu", (u_long)h->root);
+
+    return (0);
+}
+
+/*
+ * __db_hmeta --
+ * Print out the hash meta-data page.
+ */
+static int
+__db_hmeta(dbp, h, flags)
+    DB *dbp;
+    HMETA *h;
+    u_int32_t flags;
+{
+    /* Names for the DB_HASH_* meta-page flag bits. */
+    static const FN fn[] = {
+        { DB_HASH_DUP, "duplicates" },
+        { DB_HASH_SUBDB, "multiple-databases" },
+        { DB_HASH_DUPSORT, "sorted duplicates" },
+        { 0, NULL }
+    };
+    ENV *env;
+    DB_MSGBUF mb;
+    int i;
+
+    env = dbp->env;
+    DB_MSGBUF_INIT(&mb);
+
+    /* Common DBMETA fields first, then the hash-specific ones. */
+    __db_meta(dbp, (DBMETA *)h, fn, flags);
+
+    __db_msg(env, "\tmax_bucket: %lu", (u_long)h->max_bucket);
+    __db_msg(env, "\thigh_mask: %#lx", (u_long)h->high_mask);
+    __db_msg(env, "\tlow_mask: %#lx", (u_long)h->low_mask);
+    __db_msg(env, "\tffactor: %lu", (u_long)h->ffactor);
+    __db_msg(env, "\tnelem: %lu", (u_long)h->nelem);
+    __db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey);
+    /* The spare-page pointers, all on one line. */
+    __db_msgadd(env, &mb, "\tspare points: ");
+    for (i = 0; i < NCACHED; i++)
+        __db_msgadd(env, &mb, "%lu ", (u_long)h->spares[i]);
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    return (0);
+}
+
+/*
+ * __db_qmeta --
+ * Print out the queue meta-data page.
+ */
+static int
+__db_qmeta(dbp, h, flags)
+    DB *dbp;
+    QMETA *h;
+    u_int32_t flags;
+{
+    ENV *env;
+
+    env = dbp->env;
+
+    /* Common DBMETA fields first; queue has no meta-flag name table. */
+    __db_meta(dbp, (DBMETA *)h, NULL, flags);
+
+    __db_msg(env, "\tfirst_recno: %lu", (u_long)h->first_recno);
+    __db_msg(env, "\tcur_recno: %lu", (u_long)h->cur_recno);
+    /*
+     * Print the record length in decimal and the pad byte in hex,
+     * matching __db_prdb's queue output; the conversions were
+     * previously swapped (length in hex, pad byte in decimal).
+     */
+    __db_msg(env, "\tre_len: %lu re_pad: %#lx",
+        (u_long)h->re_len, (u_long)h->re_pad);
+    __db_msg(env, "\trec_page: %lu", (u_long)h->rec_page);
+    __db_msg(env, "\tpage_ext: %lu", (u_long)h->page_ext);
+
+    return (0);
+}
+
+/*
+ * __db_prnpage
+ * -- Print out a specific page.
+ *
+ * PUBLIC: int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
+ */
+int
+__db_prnpage(dbp, txn, pgno)
+    DB *dbp;
+    DB_TXN *txn;
+    db_pgno_t pgno;
+{
+    DB_MPOOLFILE *mpf;
+    PAGE *page;
+    int put_ret, ret;
+
+    mpf = dbp->mpf;
+
+    /* Pin the requested page, print it in full, then release it. */
+    ret = __memp_fget(mpf, &pgno, NULL, txn, 0, &page);
+    if (ret != 0)
+        return (ret);
+
+    ret = __db_prpage(dbp, page, DB_PR_PAGE);
+
+    /* A put failure is only reported if nothing failed earlier. */
+    put_ret = __memp_fput(mpf, NULL, page, dbp->priority);
+    if (ret == 0 && put_ret != 0)
+        ret = put_ret;
+
+    return (ret);
+}
+
+/*
+ * __db_prpage
+ * -- Print out a page.
+ *
+ * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t));
+ */
+int
+__db_prpage(dbp, h, flags)
+    DB *dbp;
+    PAGE *h;
+    u_int32_t flags;  /* DB_PR_PAGE to dump entries, DB_PR_RECOVERYTEST */
+{
+    BINTERNAL *bi;
+    BKEYDATA *bk;
+    DB_MSGBUF mb;
+    ENV *env;
+    HOFFPAGE a_hkd;
+    QAMDATA *qp, *qep;
+    RINTERNAL *ri;
+    db_indx_t dlen, len, i, *inp;
+    db_pgno_t pgno;
+    db_recno_t recno;
+    u_int32_t pagesize, qlen;
+    u_int8_t *ep, *hk, *p;
+    int deleted, ret;
+    const char *s;
+    void *sp;
+
+    env = dbp->env;
+    DB_MSGBUF_INIT(&mb);
+
+    /*
+     * If we're doing recovery testing and this page is P_INVALID,
+     * assume it's a page that's on the free list, and don't display it.
+     */
+    if (LF_ISSET(DB_PR_RECOVERYTEST) && TYPE(h) == P_INVALID)
+        return (0);
+
+    if ((s = __db_pagetype_to_string(TYPE(h))) == NULL) {
+        __db_msg(env, "ILLEGAL PAGE TYPE: page: %lu type: %lu",
+            (u_long)h->pgno, (u_long)TYPE(h));
+        return (EINVAL);
+    }
+
+    /*
+     * !!!
+     * Find out the page size.  We don't want to do it the "right" way,
+     * by reading the value from the meta-data page, that's going to be
+     * slow.  Reach down into the mpool region.
+     */
+    pagesize = (u_int32_t)dbp->mpf->mfp->stat.st_pagesize;
+
+    /* Page number, page type. */
+    __db_msgadd(env, &mb, "page %lu: %s:", (u_long)h->pgno, s);
+
+    /*
+     * LSNs on a metadata page will be different from the original after an
+     * abort, in some cases.  Don't display them if we're testing recovery.
+     */
+    if (!LF_ISSET(DB_PR_RECOVERYTEST) ||
+        (TYPE(h) != P_BTREEMETA && TYPE(h) != P_HASHMETA &&
+        TYPE(h) != P_QAMMETA && TYPE(h) != P_QAMDATA))
+        __db_msgadd(env, &mb, " LSN [%lu][%lu]:",
+            (u_long)LSN(h).file, (u_long)LSN(h).offset);
+
+    /*
+     * Page level (only applicable for Btree/Recno, but we always display
+     * it, for no particular reason.
+     */
+    __db_msgadd(env, &mb, " level %lu", (u_long)h->level);
+
+    /* Record count (internal pages, and the recno root leaf). */
+    if (TYPE(h) == P_IBTREE ||
+        TYPE(h) == P_IRECNO || (TYPE(h) == P_LRECNO &&
+        h->pgno == ((BTREE *)dbp->bt_internal)->bt_root))
+        __db_msgadd(env, &mb, " records: %lu", (u_long)RE_NREC(h));
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    /* Meta pages and queue data pages have dedicated formats. */
+    switch (TYPE(h)) {
+    case P_BTREEMETA:
+        return (__db_bmeta(dbp, (BTMETA *)h, flags));
+    case P_HASHMETA:
+        return (__db_hmeta(dbp, (HMETA *)h, flags));
+    case P_QAMMETA:
+        return (__db_qmeta(dbp, (QMETA *)h, flags));
+    case P_QAMDATA:        /* Should be meta->start. */
+        if (!LF_ISSET(DB_PR_PAGE))
+            return (0);
+
+        /* Walk the fixed-length queue records on the page. */
+        qlen = ((QUEUE *)dbp->q_internal)->re_len;
+        recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
+        i = 0;
+        qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
+        for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
+            recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
+            if (!F_ISSET(qp, QAM_SET))
+                continue;
+
+            /* " D" marks a record that is set but not valid. */
+            __db_msgadd(env, &mb, "%s",
+                F_ISSET(qp, QAM_VALID) ? "\t" : " D");
+            __db_msgadd(env, &mb, "[%03lu] %4lu ", (u_long)recno,
+                (u_long)((u_int8_t *)qp - (u_int8_t *)h));
+            __db_prbytes(env, &mb, qp->data, qlen);
+        }
+        return (0);
+    default:
+        break;
+    }
+
+    /* Sibling links (leaf/overflow pages only) and entry counts. */
+    s = "\t";
+    if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
+        __db_msgadd(env, &mb, "%sprev: %4lu next: %4lu",
+            s, (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
+        s = " ";
+    }
+    if (TYPE(h) == P_OVERFLOW) {
+        __db_msgadd(env, &mb,
+            "%sref cnt: %4lu ", s, (u_long)OV_REF(h));
+        __db_prbytes(env,
+            &mb, (u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h));
+        return (0);
+    }
+    __db_msgadd(env, &mb, "%sentries: %4lu", s, (u_long)NUM_ENT(h));
+    __db_msgadd(env, &mb, " offset: %4lu", (u_long)HOFFSET(h));
+    DB_MSGBUF_FLUSH(env, &mb);
+
+    if (TYPE(h) == P_INVALID || !LF_ISSET(DB_PR_PAGE))
+        return (0);
+
+    /* Dump every entry, guarding against out-of-range in-page offsets. */
+    ret = 0;
+    inp = P_INP(dbp, h);
+    for (i = 0; i < NUM_ENT(h); i++) {
+        if ((uintptr_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) <
+            (uintptr_t)(P_OVERHEAD(dbp)) ||
+            (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) {
+            __db_msg(env,
+                "ILLEGAL PAGE OFFSET: indx: %lu of %lu",
+                (u_long)i, (u_long)inp[i]);
+            ret = EINVAL;
+            continue;
+        }
+        deleted = 0;
+        /* Locate the entry and decide whether it's marked deleted. */
+        switch (TYPE(h)) {
+        case P_HASH_UNSORTED:
+        case P_HASH:
+        case P_IBTREE:
+        case P_IRECNO:
+            sp = P_ENTRY(dbp, h, i);
+            break;
+        case P_LBTREE:
+            /* On btree leaves, items alternate key/data pairs. */
+            sp = P_ENTRY(dbp, h, i);
+            deleted = i % 2 == 0 &&
+                B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type);
+            break;
+        case P_LDUP:
+        case P_LRECNO:
+            sp = P_ENTRY(dbp, h, i);
+            deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type);
+            break;
+        default:
+            goto type_err;
+        }
+        __db_msgadd(env, &mb, "%s", deleted ? " D" : "\t");
+        __db_msgadd(
+            env, &mb, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]);
+        /* Format the entry according to the page type. */
+        switch (TYPE(h)) {
+        case P_HASH_UNSORTED:
+        case P_HASH:
+            hk = sp;
+            switch (HPAGE_PTYPE(hk)) {
+            case H_OFFDUP:
+                memcpy(&pgno,
+                    HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
+                __db_msgadd(env, &mb,
+                    "%4lu [offpage dups]", (u_long)pgno);
+                DB_MSGBUF_FLUSH(env, &mb);
+                break;
+            case H_DUPLICATE:
+                /*
+                 * If this is the first item on a page, then
+                 * we cannot figure out how long it is, so
+                 * we only print the first one in the duplicate
+                 * set.
+                 */
+                if (i != 0)
+                    len = LEN_HKEYDATA(dbp, h, 0, i);
+                else
+                    len = 1;
+
+                __db_msgadd(env, &mb, "Duplicates:");
+                DB_MSGBUF_FLUSH(env, &mb);
+                /* Each dup is a length-prefixed byte string. */
+                for (p = HKEYDATA_DATA(hk),
+                    ep = p + len; p < ep;) {
+                    memcpy(&dlen, p, sizeof(db_indx_t));
+                    p += sizeof(db_indx_t);
+                    __db_msgadd(env, &mb, "\t\t");
+                    __db_prbytes(env, &mb, p, dlen);
+                    p += sizeof(db_indx_t) + dlen;
+                }
+                break;
+            case H_KEYDATA:
+                __db_prbytes(env, &mb, HKEYDATA_DATA(hk),
+                    LEN_HKEYDATA(dbp, h, i == 0 ?
+                    pagesize : 0, i));
+                break;
+            case H_OFFPAGE:
+                /* Copy out: the item may be unaligned. */
+                memcpy(&a_hkd, hk, HOFFPAGE_SIZE);
+                __db_msgadd(env, &mb,
+                    "overflow: total len: %4lu page: %4lu",
+                    (u_long)a_hkd.tlen, (u_long)a_hkd.pgno);
+                DB_MSGBUF_FLUSH(env, &mb);
+                break;
+            default:
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu",
+                    (u_long)HPAGE_PTYPE(hk));
+                ret = EINVAL;
+                break;
+            }
+            break;
+        case P_IBTREE:
+            bi = sp;
+
+            if (F_ISSET(dbp, DB_AM_RECNUM))
+                __db_msgadd(env, &mb,
+                    "count: %4lu ", (u_long)bi->nrecs);
+            __db_msgadd(env, &mb,
+                "pgno: %4lu type: %lu ",
+                (u_long)bi->pgno, (u_long)bi->type);
+            switch (B_TYPE(bi->type)) {
+            case B_KEYDATA:
+                __db_prbytes(env, &mb, bi->data, bi->len);
+                break;
+            case B_DUPLICATE:
+            case B_OVERFLOW:
+                __db_proff(env, &mb, bi->data);
+                break;
+            default:
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu",
+                    (u_long)B_TYPE(bi->type));
+                ret = EINVAL;
+                break;
+            }
+            break;
+        case P_IRECNO:
+            ri = sp;
+            __db_msgadd(env, &mb, "entries %4lu pgno %4lu",
+                (u_long)ri->nrecs, (u_long)ri->pgno);
+            DB_MSGBUF_FLUSH(env, &mb);
+            break;
+        case P_LBTREE:
+        case P_LDUP:
+        case P_LRECNO:
+            bk = sp;
+            switch (B_TYPE(bk->type)) {
+            case B_KEYDATA:
+                __db_prbytes(env, &mb, bk->data, bk->len);
+                break;
+            case B_DUPLICATE:
+            case B_OVERFLOW:
+                __db_proff(env, &mb, bk);
+                break;
+            default:
+                DB_MSGBUF_FLUSH(env, &mb);
+                __db_msg(env,
+            "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu",
+                    (u_long)B_TYPE(bk->type));
+                ret = EINVAL;
+                break;
+            }
+            break;
+        default:
+type_err:        DB_MSGBUF_FLUSH(env, &mb);
+            __db_msg(env,
+                "ILLEGAL PAGE TYPE: %lu", (u_long)TYPE(h));
+            ret = EINVAL;
+            continue;
+        }
+    }
+    return (ret);
+}
+
+/*
+ * __db_prbytes --
+ * Print out a data element.
+ *
+ * PUBLIC: void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
+ */
+void
+__db_prbytes(env, mbp, bytes, len)
+    ENV *env;
+    DB_MSGBUF *mbp;
+    u_int8_t *bytes;
+    u_int32_t len;
+{
+    u_int8_t *ptr;
+    u_int32_t remaining;
+    int is_text, truncated;
+
+    __db_msgadd(env, mbp, "len: %3lu", (u_long)len);
+    if (len != 0) {
+        __db_msgadd(env, mbp, " data: ");
+
+        /*
+         * Display at most the first 20 bytes.  If every byte in that
+         * window is printable (tabs and newlines count as printable),
+         * show it as text, otherwise show it in hex -- the items we
+         * print here (lock objects, keys, data) can be either text or
+         * binary data.
+         */
+        truncated = 0;
+        if (len > 20) {
+            len = 20;
+            truncated = 1;
+        }
+        is_text = 1;
+        for (ptr = bytes, remaining = len;
+            remaining > 0; --remaining, ++ptr)
+            if (!isprint((int)*ptr) &&
+                *ptr != '\t' && *ptr != '\n') {
+                is_text = 0;
+                break;
+            }
+        for (ptr = bytes, remaining = len;
+            remaining > 0; --remaining, ++ptr)
+            if (is_text)
+                __db_msgadd(env, mbp, "%c", *ptr);
+            else
+                __db_msgadd(env, mbp, "%#.2x", (u_int)*ptr);
+        if (truncated)
+            __db_msgadd(env, mbp, "...");
+    }
+    DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_proff --
+ * Print out an off-page element.
+ */
+static void
+__db_proff(env, mbp, vp)
+    ENV *env;
+    DB_MSGBUF *mbp;
+    void *vp;  /* a BINTERNAL/BKEYDATA item known to be off-page */
+{
+    BOVERFLOW *bop;
+
+    bop = vp;
+    /* Off-page items are either overflow chains or duplicate trees. */
+    if (B_TYPE(bop->type) == B_OVERFLOW)
+        __db_msgadd(env, mbp,
+            "overflow: total len: %4lu page: %4lu",
+            (u_long)bop->tlen, (u_long)bop->pgno);
+    else if (B_TYPE(bop->type) == B_DUPLICATE)
+        __db_msgadd(
+            env, mbp, "duplicate: page: %4lu", (u_long)bop->pgno);
+    /* Any other type is unexpected here; print nothing for it. */
+    DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_prflags --
+ * Print out flags values.
+ *
+ * PUBLIC: void __db_prflags __P((ENV *, DB_MSGBUF *,
+ * PUBLIC: u_int32_t, const FN *, const char *, const char *));
+ */
+void
+__db_prflags(env, mbp, flags, fn, prefix, suffix)
+    ENV *env;
+    DB_MSGBUF *mbp;   /* output buffer; NULL means standalone message */
+    u_int32_t flags;  /* flag word to decode */
+    FN const *fn;     /* zero-mask-terminated name table */
+    const char *prefix, *suffix;
+{
+    DB_MSGBUF mb;
+    const FN *fnp;
+    int found, standalone;
+    const char *sep;
+
+    if (fn == NULL)
+        return;
+
+    /*
+     * If it's a standalone message, output the suffix (which will be the
+     * label), regardless of whether we found anything or not, and flush
+     * the line.
+     */
+    if (mbp == NULL) {
+        standalone = 1;
+        mbp = &mb;
+        DB_MSGBUF_INIT(mbp);
+    } else
+        standalone = 0;
+
+    /* Emit the name of every flag bit that is set, comma-separated. */
+    sep = prefix == NULL ? "" : prefix;
+    for (found = 0, fnp = fn; fnp->mask != 0; ++fnp)
+        if (LF_ISSET(fnp->mask)) {
+            __db_msgadd(env, mbp, "%s%s", sep, fnp->name);
+            sep = ", ";
+            found = 1;
+        }
+
+    if ((standalone || found) && suffix != NULL)
+        __db_msgadd(env, mbp, "%s", suffix);
+    if (standalone)
+        DB_MSGBUF_FLUSH(env, mbp);
+}
+
+/*
+ * __db_lockmode_to_string --
+ * Return the name of the lock mode.
+ *
+ * PUBLIC: const char * __db_lockmode_to_string __P((db_lockmode_t));
+ */
+const char *
+__db_lockmode_to_string(mode)
+    db_lockmode_t mode;
+{
+    /* Table mapping each lock mode to its display name. */
+    static const struct {
+        db_lockmode_t mode;
+        const char *name;
+    } mode_names[] = {
+        { DB_LOCK_NG,               "Not granted" },
+        { DB_LOCK_READ,             "Shared/read" },
+        { DB_LOCK_WRITE,            "Exclusive/write" },
+        { DB_LOCK_WAIT,             "Wait for event" },
+        { DB_LOCK_IWRITE,           "Intent exclusive/write" },
+        { DB_LOCK_IREAD,            "Intent shared/read" },
+        { DB_LOCK_IWR,              "Intent to read/write" },
+        { DB_LOCK_READ_UNCOMMITTED, "Read uncommitted" },
+        { DB_LOCK_WWRITE,           "Was written" },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(mode_names) / sizeof(mode_names[0]); ++i)
+        if (mode_names[i].mode == mode)
+            return (mode_names[i].name);
+    return ("UNKNOWN LOCK MODE");
+}
+
+/*
+ * __db_pagetype_to_string --
+ * Return the name of the specified page type.
+ */
+static const char *
+__db_pagetype_to_string(type)
+    u_int32_t type;
+{
+    /*
+     * Return the display name for a page type, or NULL when the type
+     * is not recognized (the caller reports NULL as an illegal page
+     * type).
+     */
+    switch (type) {
+    case P_BTREEMETA:
+        return ("btree metadata");
+    case P_LDUP:
+        return ("duplicate");
+    case P_HASH_UNSORTED:
+        return ("hash unsorted");
+    case P_HASH:
+        return ("hash");
+    case P_HASHMETA:
+        return ("hash metadata");
+    case P_IBTREE:
+        return ("btree internal");
+    case P_INVALID:
+        return ("invalid");
+    case P_IRECNO:
+        return ("recno internal");
+    case P_LBTREE:
+        return ("btree leaf");
+    case P_LRECNO:
+        return ("recno leaf");
+    case P_OVERFLOW:
+        return ("overflow");
+    case P_QAMMETA:
+        return ("queue metadata");
+    case P_QAMDATA:
+        return ("queue");
+    default:
+        return (NULL);
+    }
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * __db_dumptree --
+ * Dump the tree to a file.
+ *
+ * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *, char *, char *));
+ */
+int
+__db_dumptree(dbp, txn, op, name)
+    DB *dbp;
+    DB_TXN *txn;
+    char *op, *name;
+{
+    /*
+     * Stub compiled when HAVE_STATISTICS is not defined: silence the
+     * unused-parameter warnings and report that statistics support
+     * was not built into this library.
+     */
+    COMPQUIET(txn, NULL);
+    COMPQUIET(op, NULL);
+    COMPQUIET(name, NULL);
+
+    return (__db_stat_not_built(dbp->env));
+}
+
+/*
+ * __db_get_flags_fn --
+ * Return the __db_flags_fn array.
+ *
+ * PUBLIC: const FN * __db_get_flags_fn __P((void));
+ */
+const FN *
+__db_get_flags_fn()
+{
+    /*
+     * !!!
+     * The Tcl API uses this interface, stub it off: without statistics
+     * support there is no flag-name table, so return NULL.
+     */
+    return (NULL);
+}
+#endif
+
+/*
+ * __db_dump_pp --
+ * DB->dump pre/post processing.
+ *
+ * PUBLIC: int __db_dump_pp __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump_pp(dbp, subname, callback, handle, pflag, keyflag)
+    DB *dbp;
+    const char *subname;
+    int (*callback) __P((void *, const void *));
+    void *handle;
+    int pflag, keyflag;
+{
+    DB_THREAD_INFO *ip;
+    ENV *env;
+    int handle_check, ret, t_ret;
+
+    env = dbp->env;
+
+    DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->dump");
+
+    ENV_ENTER(env, ip);
+
+    /* Check for replication block. */
+    handle_check = IS_ENV_REPLICATED(env);
+    if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 1)) != 0) {
+        /* We never entered; don't try to exit in the cleanup path. */
+        handle_check = 0;
+        goto err;
+    }
+
+    /* The real work happens in __db_dump. */
+    ret = __db_dump(dbp, subname, callback, handle, pflag, keyflag);
+
+    /* Release replication block. */
+    if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+        ret = t_ret;
+
+err:    ENV_LEAVE(env, ip);
+    return (ret);
+}
+
+/*
+ * __db_dump --
+ * DB->dump.
+ *
+ * PUBLIC: int __db_dump __P((DB *, const char *,
+ * PUBLIC: int (*)(void *, const void *), void *, int, int));
+ */
+int
+__db_dump(dbp, subname, callback, handle, pflag, keyflag)
+    DB *dbp;
+    const char *subname;  /* subdatabase name, may be NULL */
+    int (*callback) __P((void *, const void *));  /* output sink */
+    void *handle;         /* opaque cookie passed to callback */
+    int pflag, keyflag;   /* printable format; print keys */
+{
+    DBC *dbcp;
+    DBT key, data;
+    DBT keyret, dataret;
+    ENV *env;
+    db_recno_t recno;
+    int is_recno, ret, t_ret;
+    void *pointer;
+
+    env = dbp->env;
+
+    if ((ret = __db_prheader(
+        dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0)
+        return (ret);
+
+    /*
+     * Get a cursor and step through the database, printing out each
+     * key/data pair.
+     */
+    if ((ret = __db_cursor(dbp, NULL, NULL, &dbcp, 0)) != 0)
+        return (ret);
+
+    /* Start with a 1MB user buffer for bulk retrieval; grown on demand. */
+    memset(&key, 0, sizeof(key));
+    memset(&data, 0, sizeof(data));
+    if ((ret = __os_malloc(env, 1024 * 1024, &data.data)) != 0)
+        goto err;
+    data.ulen = 1024 * 1024;
+    data.flags = DB_DBT_USERMEM;
+    /* Record-number access methods always use the recno as the key. */
+    is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE);
+    keyflag = is_recno ? keyflag : 1;
+    if (is_recno) {
+        keyret.data = &recno;
+        keyret.size = sizeof(recno);
+    }
+
+    /* Bulk-read pass; restarted from here after growing the buffer. */
+retry: while ((ret =
+    __dbc_get(dbcp, &key, &data, DB_NEXT | DB_MULTIPLE_KEY)) == 0) {
+        DB_MULTIPLE_INIT(pointer, &data);
+        for (;;) {
+            if (is_recno)
+                DB_MULTIPLE_RECNO_NEXT(pointer, &data,
+                    recno, dataret.data, dataret.size);
+            else
+                DB_MULTIPLE_KEY_NEXT(pointer,
+                    &data, keyret.data,
+                    keyret.size, dataret.data, dataret.size);
+
+            /* NULL data pointer marks the end of the buffer. */
+            if (dataret.data == NULL)
+                break;
+
+            if ((keyflag &&
+                (ret = __db_prdbt(&keyret, pflag, " ",
+                handle, callback, is_recno)) != 0) ||
+                (ret = __db_prdbt(&dataret, pflag, " ",
+                handle, callback, 0)) != 0)
+                goto err;
+        }
+    }
+    /* Buffer too small for some item: grow it and continue the scan. */
+    if (ret == DB_BUFFER_SMALL) {
+        data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
+        if ((ret = __os_realloc(env, data.size, &data.data)) != 0)
+            goto err;
+        data.ulen = data.size;
+        goto retry;
+    }
+    /* Exhausting the cursor is the normal termination. */
+    if (ret == DB_NOTFOUND)
+        ret = 0;
+
+    if ((t_ret = __db_prfooter(handle, callback)) != 0 && ret == 0)
+        ret = t_ret;
+
+err:    if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+        ret = t_ret;
+    if (data.data != NULL)
+        __os_free(env, data.data);
+
+    return (ret);
+}
+
+/*
+ * __db_prdbt --
+ * Print out a DBT data element.
+ *
+ * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC: int (*)(void *, const void *), int));
+ */
+int
+__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno)
+    DBT *dbtp;
+    int checkprint;      /* non-zero: printable format with \-escapes */
+    const char *prefix;  /* optional string written before the item */
+    void *handle;        /* opaque cookie passed to callback */
+    int (*callback) __P((void *, const void *));
+    int is_recno;        /* non-zero: dbtp holds a record number */
+{
+    static const u_char hex[] = "0123456789abcdef";
+    db_recno_t recno;
+    size_t len;
+    int ret;
+#define DBTBUFLEN 100
+    u_int8_t *p, *hp;
+    char buf[DBTBUFLEN], hbuf[DBTBUFLEN];
+
+    /*
+     * !!!
+     * This routine is the routine that dumps out items in the format
+     * used by db_dump(1) and db_load(1).  This means that the format
+     * cannot change.
+     */
+    if (prefix != NULL && (ret = callback(handle, prefix)) != 0)
+        return (ret);
+    if (is_recno) {
+        /*
+         * We're printing a record number, and this has to be done
+         * in a platform-independent way.  So we use the numeral in
+         * straight ASCII.
+         */
+        (void)__ua_memcpy(&recno, dbtp->data, sizeof(recno));
+        snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno);
+
+        /* If we're printing data as hex, print keys as hex too. */
+        if (!checkprint) {
+            for (len = strlen(buf), p = (u_int8_t *)buf,
+                hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
+                *hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
+                *hp++ = hex[*p & 0x0f];
+            }
+            *hp = '\0';
+            ret = callback(handle, hbuf);
+        } else
+            ret = callback(handle, buf);
+
+        if (ret != 0)
+            return (ret);
+    } else if (checkprint) {
+        /*
+         * Printable format: emit printable bytes as-is (doubling
+         * backslashes), everything else as a \xx hex escape.
+         */
+        for (len = dbtp->size, p = dbtp->data; len--; ++p)
+            if (isprint((int)*p)) {
+                if (*p == '\\' &&
+                    (ret = callback(handle, "\\")) != 0)
+                    return (ret);
+                snprintf(buf, DBTBUFLEN, "%c", *p);
+                if ((ret = callback(handle, buf)) != 0)
+                    return (ret);
+            } else {
+                snprintf(buf, DBTBUFLEN, "\\%c%c",
+                    hex[(u_int8_t)(*p & 0xf0) >> 4],
+                    hex[*p & 0x0f]);
+                if ((ret = callback(handle, buf)) != 0)
+                    return (ret);
+            }
+    } else
+        /* Byte-value format: every byte as two hex digits. */
+        for (len = dbtp->size, p = dbtp->data; len--; ++p) {
+            snprintf(buf, DBTBUFLEN, "%c%c",
+                hex[(u_int8_t)(*p & 0xf0) >> 4],
+                hex[*p & 0x0f]);
+            if ((ret = callback(handle, buf)) != 0)
+                return (ret);
+        }
+
+    /* Every item is terminated by a newline. */
+    return (callback(handle, "\n"));
+}
+
+/*
+ * __db_prheader --
+ * Write out header information in the format expected by db_load.
+ *
+ * PUBLIC: int __db_prheader __P((DB *, const char *, int, int, void *,
+ * PUBLIC: int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
+ DB *dbp;
+ const char *subname;
+ int pflag, keyflag;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+{
+ DBT dbt;
+ DBTYPE dbtype;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ u_int32_t flags, tmp_u_int32;
+ size_t buflen;
+ char *buf;
+ int using_vdp, ret, t_ret, tmp_int;
+
+ ret = 0;
+ buf = NULL;
+ COMPQUIET(buflen, 0);
+
+ /*
+ * If dbp is NULL, then pip is guaranteed to be non-NULL; we only ever
+ * call __db_prheader with a NULL dbp from one case inside __db_prdbt,
+ * and this is a special subdatabase for "lost" items. In this case
+ * we have a vdp (from which we'll get a pip). In all other cases, we
+ * will have a non-NULL dbp (and vdp may or may not be NULL depending
+ * on whether we're salvaging).
+ */
+ if (dbp == NULL)
+ env = NULL;
+ else
+ env = dbp->env;
+ DB_ASSERT(env, dbp != NULL || vdp != NULL);
+
+ /*
+ * If we've been passed a verifier statistics object, use that; we're
+ * being called in a context where dbp->stat is unsafe.
+ *
+ * Also, the verifier may set the pflag on a per-salvage basis. If so,
+ * respect that.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+ return (ret);
+
+ if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+ pflag = 1;
+ using_vdp = 1;
+ } else {
+ pip = NULL;
+ using_vdp = 0;
+ }
+
+ /*
+ * If dbp is NULL, make it a btree. Otherwise, set dbtype to whatever
+ * appropriate type for the specified meta page, or the type of the dbp.
+ */
+ if (dbp == NULL)
+ dbtype = DB_BTREE;
+ else if (using_vdp)
+ switch (pip->type) {
+ case P_BTREEMETA:
+ if (F_ISSET(pip, VRFY_IS_RECNO))
+ dbtype = DB_RECNO;
+ else
+ dbtype = DB_BTREE;
+ break;
+ case P_HASHMETA:
+ dbtype = DB_HASH;
+ break;
+ case P_QAMMETA:
+ dbtype = DB_QUEUE;
+ break;
+ default:
+ /*
+ * If the meta page is of a bogus type, it's because
+ * we have a badly corrupt database. (We must be in
+ * the verifier for pip to be non-NULL.) Pretend we're
+ * a Btree and salvage what we can.
+ */
+ DB_ASSERT(env, F_ISSET(dbp, DB_AM_VERIFYING));
+ dbtype = DB_BTREE;
+ break;
+ }
+ else
+ dbtype = dbp->type;
+
+ if ((ret = callback(handle, "VERSION=3\n")) != 0)
+ goto err;
+ if (pflag) {
+ if ((ret = callback(handle, "format=print\n")) != 0)
+ goto err;
+ } else if ((ret = callback(handle, "format=bytevalue\n")) != 0)
+ goto err;
+
+ /*
+ * 64 bytes is long enough, as a minimum bound, for any of the
+ * fields besides subname. Subname uses __db_prdbt and therefore
+ * does not need buffer space here.
+ */
+ buflen = 64;
+ if ((ret = __os_malloc(env, buflen, &buf)) != 0)
+ goto err;
+ if (subname != NULL) {
+ snprintf(buf, buflen, "database=");
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ DB_INIT_DBT(dbt, subname, strlen(subname));
+ if ((ret = __db_prdbt(&dbt, 1, NULL, handle, callback, 0)) != 0)
+ goto err;
+ }
+ switch (dbtype) {
+ case DB_BTREE:
+ if ((ret = callback(handle, "type=btree\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_HAS_RECNUMS) ? 1 : 0;
+ else {
+ if ((ret = __db_get_flags(dbp, &flags)) != 0) {
+ __db_err(env, ret, "DB->get_flags");
+ goto err;
+ }
+ tmp_int = F_ISSET(dbp, DB_AM_RECNUM) ? 1 : 0;
+ }
+ if (tmp_int && (ret = callback(handle, "recnum=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_u_int32 = pip->bt_minkey;
+ else
+ if ((ret =
+ __bam_get_bt_minkey(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_bt_minkey");
+ goto err;
+ }
+ if (tmp_u_int32 != 0 && tmp_u_int32 != DEFMINKEYPAGE) {
+ snprintf(buf, buflen,
+ "bt_minkey=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ if ((ret = callback(handle, "type=hash\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = pip->h_ffactor;
+ else
+ if ((ret =
+ __ham_get_h_ffactor(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_ffactor");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "h_ffactor=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = pip->h_nelem;
+ else
+ if ((ret = __ham_get_h_nelem(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_h_nelem");
+ goto err;
+ }
+ /*
+ * Hash databases have an h_nelem field of 0 or 1, neither
+ * of those values is interesting.
+ */
+ if (tmp_u_int32 > 1) {
+ snprintf(buf, buflen,
+ "h_nelem=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_hash_am(env);
+ goto err;
+#endif
+ case DB_QUEUE:
+#ifdef HAVE_QUEUE
+ if ((ret = callback(handle, "type=queue\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_u_int32 = vdp->re_len;
+ else
+ if ((ret = __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen, "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)vdp->re_pad;
+ else
+ if ((ret = __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf, buflen, "re_pad=%#x\n", tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+
+ if (using_vdp)
+ tmp_u_int32 = vdp->page_ext;
+ else
+ if ((ret =
+ __qam_get_extentsize(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_q_extentsize");
+ goto err;
+ }
+ if (tmp_u_int32 != 0) {
+ snprintf(buf, buflen,
+ "extentsize=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ break;
+#else
+ ret = __db_no_queue_am(env);
+ goto err;
+#endif
+ case DB_RECNO:
+ if ((ret = callback(handle, "type=recno\n")) != 0)
+ goto err;
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_RRECNO) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_RENUMBER) ? 1 : 0;
+ if (tmp_int != 0 &&
+ (ret = callback(handle, "renumber=1\n")) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = F_ISSET(pip, VRFY_IS_FIXEDLEN) ? 1 : 0;
+ else
+ tmp_int = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 1 : 0;
+ if (tmp_int) {
+ if (using_vdp)
+ tmp_u_int32 = pip->re_len;
+ else
+ if ((ret =
+ __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
+ __db_err(env, ret, "DB->get_re_len");
+ goto err;
+ }
+ snprintf(buf, buflen,
+ "re_len=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+
+ if (using_vdp)
+ tmp_int = (int)pip->re_pad;
+ else
+ if ((ret =
+ __ram_get_re_pad(dbp, &tmp_int)) != 0) {
+ __db_err(env, ret, "DB->get_re_pad");
+ goto err;
+ }
+ if (tmp_int != 0 && tmp_int != ' ') {
+ snprintf(buf,
+ buflen, "re_pad=%#x\n", (u_int)tmp_int);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+ break;
+ case DB_UNKNOWN: /* Impossible. */
+ ret = __db_unknown_path(env, "__db_prheader");
+ goto err;
+ }
+
+ if (using_vdp) {
+ if (F_ISSET(pip, VRFY_HAS_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPS))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(pip, VRFY_HAS_COMPRESS))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ /*
+ * !!!
+ * We don't know if the page size was the default if we're
+ * salvaging. It doesn't seem that interesting to have, so
+ * we ignore it for now.
+ */
+ } else {
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ if ((ret = callback(handle, "chksum=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ if ((ret = callback(handle, "duplicates=1\n")) != 0)
+ goto err;
+ if (F_ISSET(dbp, DB_AM_DUPSORT))
+ if ((ret = callback(handle, "dupsort=1\n")) != 0)
+ goto err;
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ if ((ret = callback(handle, "compressed=1\n")) != 0)
+ goto err;
+#endif
+ if (!F_ISSET(dbp, DB_AM_PGDEF)) {
+ snprintf(buf, buflen,
+ "db_pagesize=%lu\n", (u_long)dbp->pgsize);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ }
+ }
+
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp) &&
+ F_ISSET((DB_PARTITION *)dbp->p_internal, PART_RANGE)) {
+ DBT *keys;
+ u_int32_t i;
+
+ if ((ret = __partition_get_keys(dbp, &tmp_u_int32, &keys)) != 0)
+ goto err;
+ if (tmp_u_int32 != 0) {
+ snprintf(buf,
+ buflen, "nparts=%lu\n", (u_long)tmp_u_int32);
+ if ((ret = callback(handle, buf)) != 0)
+ goto err;
+ for (i = 0; i < tmp_u_int32 - 1; i++)
+ if ((ret = __db_prdbt(&keys[i],
+ pflag, " ", handle, callback, 0)) != 0)
+ goto err;
+ }
+ }
+#endif
+
+ if (keyflag && (ret = callback(handle, "keys=1\n")) != 0)
+ goto err;
+
+ ret = callback(handle, "HEADER=END\n");
+
+err: if (using_vdp &&
+ (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (buf != NULL)
+ __os_free(env, buf);
+
+ return (ret);
+}
+
+/*
+ * __db_prfooter --
+ * Print the footer that marks the end of a DB dump. This is trivial,
+ * but for consistency's sake we don't want to put its literal contents
+ * in multiple places.
+ *
+ * PUBLIC: int __db_prfooter __P((void *, int (*)(void *, const void *)));
+ */
+int
+__db_prfooter(handle, callback)
+ void *handle;
+ int (*callback) __P((void *, const void *));
+{
+ return (callback(handle, "DATA=END\n"));
+}
+
+/*
+ * __db_pr_callback --
+ * Callback function for using pr_* functions from C.
+ *
+ * PUBLIC: int __db_pr_callback __P((void *, const void *));
+ */
+int
+__db_pr_callback(handle, str_arg)
+ void *handle;
+ const void *str_arg;
+{
+ char *str;
+ FILE *f;
+
+ str = (char *)str_arg;
+ f = (FILE *)handle;
+
+ if (fprintf(f, "%s", str) != (int)strlen(str))
+ return (EIO);
+
+ return (0);
+}
+
+/*
+ * __db_dbtype_to_string --
+ * Return the name of the database type.
+ *
+ * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE));
+ */
+const char *
+__db_dbtype_to_string(type)
+ DBTYPE type;
+{
+ switch (type) {
+ case DB_BTREE:
+ return ("btree");
+ case DB_HASH:
+ return ("hash");
+ case DB_RECNO:
+ return ("recno");
+ case DB_QUEUE:
+ return ("queue");
+ case DB_UNKNOWN:
+ default:
+ break;
+ }
+ return ("UNKNOWN TYPE");
+}
diff --git a/db/db_rec.c b/db/db_rec.c
new file mode 100644
index 0000000..02fe096
--- /dev/null
+++ b/db/db_rec.c
@@ -0,0 +1,1859 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/hash.h"
+
+static int __db_pg_free_recover_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_args *, DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+static int __db_pg_free_recover_42_int __P((ENV *, DB_THREAD_INFO *,
+ __db_pg_freedata_42_args *,
+ DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int));
+
+/*
+ * PUBLIC: int __db_addrem_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ *
+ * This log message is generated whenever we add or remove a duplicate
+ * to/from a duplicate page. On recover, we just do the opposite.
+ */
+int
+__db_addrem_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_addrem_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_addrem_print);
+	/* REC_INTRO reads the log record into argp and sets up file_dbp,
+	 * dbc and mpf; on failure it branches to the out/done labels. */
+	REC_INTRO(__db_addrem_read, ip, 1);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+	modified = 0;
+
+	/*
+	 * cmp_p == 0: page LSN equals the logged pre-operation LSN, so the
+	 * change has not been applied yet (eligible for redo).
+	 * cmp_n == 0: page LSN equals this record's LSN, so the change is
+	 * on the page (eligible for undo).
+	 */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_DUP) ||
+	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_DUP)) {
+		/* Need to redo an add, or undo a delete. */
+		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+		if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
+		    argp->hdr.size == 0 ? NULL : &argp->hdr,
+		    argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
+			goto out;
+		modified = 1;
+
+	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_DUP) ||
+	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_DUP)) {
+		/* Need to undo an add, or redo a delete. */
+		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+		if ((ret = __db_ditem(dbc,
+		    pagep, argp->indx, argp->nbytes)) != 0)
+			goto out;
+		modified = 1;
+	}
+
+	/* Roll the page LSN forward on redo, back on undo. */
+	if (modified) {
+		if (DB_REDO(op))
+			LSN(pagep) = *lsnp;
+		else
+			LSN(pagep) = argp->pagelsn;
+	}
+
+	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_big_recover --
+ *	Recovery function for big (overflow) item add/remove/append; may
+ *	touch up to three pages: the target, its predecessor and successor.
+ *
+ * PUBLIC: int __db_big_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_big_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_big_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_big_print);
+	REC_INTRO(__db_big_read, ip, 0);
+
+	/* If the target page is gone, fall through to the prev-page fixup. */
+	REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
+	modified = 0;
+
+	/*
+	 * There are three pages we need to check. The one on which we are
+	 * adding data, the previous one whose next_pointer may have
+	 * been updated, and the next one whose prev_pointer may have
+	 * been updated.
+	 */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
+	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
+		/* We are either redo-ing an add, or undoing a delete. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		/* Rebuild the overflow page from the logged data. */
+		P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
+		    argp->next_pgno, 0, P_OVERFLOW);
+		OV_LEN(pagep) = argp->dbt.size;
+		OV_REF(pagep) = 1;
+		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
+		    argp->dbt.size);
+		PREV_PGNO(pagep) = argp->prev_pgno;
+		modified = 1;
+	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
+	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
+		/*
+		 * We are either undo-ing an add or redo-ing a delete.
+		 * The page is about to be reclaimed in either case, so
+		 * there really isn't anything to do here.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		modified = 1;
+	} else if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_APPEND_BIG) {
+		/* We are redoing an append. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+		    OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
+		OV_LEN(pagep) += argp->dbt.size;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_APPEND_BIG) {
+		/* We are undoing an append: shrink the item and zero the
+		 * vacated bytes so the page image stays deterministic. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		OV_LEN(pagep) -= argp->dbt.size;
+		memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
+		    OV_LEN(pagep), 0, argp->dbt.size);
+		modified = 1;
+	}
+	if (modified)
+		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;
+
+	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+	pagep = NULL;
+	if (ret != 0)
+		goto out;
+
+	/*
+	 * We only delete a whole chain of overflow items, and appends only
+	 * apply to a single page. Adding a page is the only case that
+	 * needs to update the chain.
+	 */
+	if (argp->opcode != DB_ADD_BIG)
+		goto done;
+
+	/* Now check the previous page. */
+ppage:	if (argp->prev_pgno != PGNO_INVALID) {
+		REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
+		modified = 0;
+
+		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+		if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
+			/* Redo the add: link prev's next pointer forward. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			NEXT_PGNO(pagep) = argp->pgno;
+			modified = 1;
+		} else if (cmp_n == 0 &&
+		    DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
+			/* Undo the add: restore prev's original next. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			NEXT_PGNO(pagep) = argp->next_pgno;
+			modified = 1;
+		}
+		if (modified)
+			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
+		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+		pagep = NULL;
+		if (ret != 0)
+			goto out;
+	}
+	pagep = NULL;
+
+	/* Now check the next page. Can only be set on a delete. */
+npage:	if (argp->next_pgno != PGNO_INVALID) {
+		REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
+		modified = 0;
+
+		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
+		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
+		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+		if (cmp_p == 0 && DB_REDO(op)) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			PREV_PGNO(pagep) = PGNO_INVALID;
+			modified = 1;
+		} else if (cmp_n == 0 && DB_UNDO(op)) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			PREV_PGNO(pagep) = argp->pgno;
+			modified = 1;
+		}
+		if (modified)
+			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
+		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+		pagep = NULL;
+		if (ret != 0)
+			goto out;
+	}
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_ovref_recover --
+ *	Recovery function for __db_ovref().
+ *
+ * PUBLIC: int __db_ovref_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_ovref_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_ovref_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_ovref_print);
+	REC_INTRO(__db_ovref_read, ip, 0);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+	cmp = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(env, op, cmp, &LSN(pagep), &argp->lsn);
+	if (cmp == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		/* Re-apply the overflow reference-count adjustment. */
+		OV_REF(pagep) += argp->adjust;
+		pagep->lsn = *lsnp;
+	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		OV_REF(pagep) -= argp->adjust;
+		pagep->lsn = argp->lsn;
+	}
+	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+	pagep = NULL;
+	if (ret != 0)
+		goto out;
+	/* NOTE(review): pagep is already NULL here; this second clear is
+	 * redundant but harmless. */
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_debug_recover --
+ *	Recovery function for debug.
+ *
+ * PUBLIC: int __db_debug_recover __P((ENV *,
+ * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_debug_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_debug_args *argp;
+	int ret;
+
+	/* Debug records carry no page changes: nothing to redo or undo. */
+	COMPQUIET(op, DB_TXN_ABORT);
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__db_debug_print);
+	/* REC_NOOP_INTRO reads the record into argp without opening a DB. */
+	REC_NOOP_INTRO(__db_debug_read);
+
+	/* Just walk the backward chain of the transaction's log records. */
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	REC_NOOP_CLOSE;
+}
+
+/*
+ * __db_noop_recover --
+ *	Recovery function for noop.
+ *
+ * PUBLIC: int __db_noop_recover __P((ENV *,
+ * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_noop_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_noop_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_noop_print);
+	REC_INTRO(__db_noop_read, ip, 0);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+	/* A noop record only moves the page LSN; no page data changes. */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		LSN(pagep) = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		LSN(pagep) = argp->prevlsn;
+	}
+	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf,
+		    ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_alloc_recover --
+ *	Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __db_pg_alloc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_alloc_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, created, level, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	meta = NULL;
+	pagep = NULL;
+	created = 0;
+	REC_PRINT(__db_pg_alloc_print);
+	REC_INTRO(__db_pg_alloc_read, ip, 0);
+
+	/*
+	 * Fix up the metadata page. If we're redoing the operation, we have
+	 * to get the metadata page and update its LSN and its free pointer.
+	 * If we're undoing the operation and the page was ever created, we put
+	 * it on the freelist.
+	 */
+	pgno = PGNO_BASE_MD;
+	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+		/* The metadata page must always exist on redo. */
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, pgno, ret);
+			goto out;
+		} else
+			goto done;
+	}
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		LSN(meta) = *lsnp;
+		/* The allocation consumed the head of the free list. */
+		meta->free = argp->next;
+		if (argp->pgno > meta->last_pgno)
+			meta->last_pgno = argp->pgno;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		LSN(meta) = argp->meta_lsn;
+		/*
+		 * If the page has a zero LSN then its newly created and
+		 * will be truncated rather than go on the free list.
+		 */
+		if (!IS_ZERO_LSN(argp->page_lsn))
+			meta->free = argp->pgno;
+		meta->last_pgno = argp->last_pgno;
+	}
+
+#ifdef HAVE_FTRUNCATE
+	/*
+	 * check to see if we are keeping a sorted freelist, if so put
+	 * this back in the in memory list. It must be the first element.
+	 */
+	if (op == DB_TXN_ABORT && !IS_ZERO_LSN(argp->page_lsn)) {
+		db_pgno_t *list;
+		u_int32_t nelem;
+
+		if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+			goto out;
+		if (list != NULL && (nelem == 0 || *list != argp->pgno)) {
+			if ((ret =
+			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+				goto out;
+			/* Shift the list right to prepend this page number. */
+			if (nelem != 0)
+				memmove(list + 1, list, nelem * sizeof(*list));
+			*list = argp->pgno;
+		}
+	}
+#endif
+
+	/*
+	 * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it then don't create it.
+	 * Otherwise if we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist), and update its
+	 * LSN. If we're undoing the operation, we have to reset the page's
+	 * LSN and put it on the free list.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		/*
+		 * We have to be able to identify if a page was newly
+		 * created so we can recover it properly. We cannot simply
+		 * look for an empty header, because hash uses a pgin
+		 * function that will set the header. Instead, we explicitly
+		 * try for the page without CREATE and if that fails, then
+		 * create it.
+		 */
+		if (DB_UNDO(op))
+			goto do_truncate;
+		if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
+		    DB_MPOOL_CREATE, &pagep)) != 0) {
+			/*
+			 * NOTE(review): DB_UNDO already branched to
+			 * do_truncate above, so this ENOSPC check looks
+			 * unreachable -- confirm against upstream history.
+			 */
+			if (DB_UNDO(op) && ret == ENOSPC)
+				goto do_truncate;
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+		created = 1;
+	}
+
+	/* Fix up the allocated page. */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+	/*
+	 * If an initial allocation is aborted and then reallocated during
+	 * an archival restore the log record will have an LSN for the page
+	 * but the page will be empty.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)))
+		cmp_p = 0;
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+	/*
+	 * Another special case we have to handle is if we ended up with a
+	 * page of all 0's which can happen if we abort between allocating a
+	 * page in mpool and initializing it. In that case, even if we're
+	 * undoing, we need to re-initialize the page.
+	 */
+	if (DB_REDO(op) && cmp_p == 0) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		/* Leaf page types start at LEAFLEVEL; others at level 0. */
+		switch (argp->ptype) {
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			level = LEAFLEVEL;
+			break;
+		default:
+			level = 0;
+			break;
+		}
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+		pagep->lsn = *lsnp;
+	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+		/*
+		 * This is where we handle the case of a 0'd page (pagep->pgno
+		 * is equal to PGNO_INVALID).
+		 * Undo the allocation, reinitialize the page and
+		 * link its next pointer to the free list.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+		pagep->lsn = argp->page_lsn;
+	}
+
+do_truncate:
+	/*
+	 * If the page was newly created, give it back.
+	 */
+	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+		/* Discard the page. */
+		if (pagep != NULL) {
+			if ((ret = __memp_fput(mpf, ip,
+			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
+				goto out;
+			pagep = NULL;
+		}
+		/* Give the page back to the OS. */
+		if (meta->last_pgno <= argp->pgno && (ret = __memp_ftruncate(
+		    mpf, NULL, ip, argp->pgno, MP_TRUNC_RECOVER)) != 0)
+			goto out;
+	}
+
+	if (pagep != NULL) {
+		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
+		pagep = NULL;
+		if (ret != 0)
+			goto out;
+	}
+
+	ret = __memp_fput(mpf, ip, meta, file_dbp->priority);
+	meta = NULL;
+	if (ret != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_int --
+ *	Common guts of pg_free/pg_freedata recovery: fix the meta (or
+ *	previous free-list) page, then reinitialize or restore the freed
+ *	page. When "data" is nonzero the log record also carries the page
+ *	data to restore on undo.
+ */
+static int
+__db_pg_free_recover_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	__db_pg_freedata_args *argp;
+	DB *file_dbp;
+	DB_LSN *lsnp;
+	DB_MPOOLFILE *mpf;
+	db_recops op;
+	int data;
+{
+	DBMETA *meta;
+	DB_LSN copy_lsn;
+	PAGE *pagep, *prevp;
+	int cmp_n, cmp_p, is_meta, ret;
+
+	meta = NULL;
+	pagep = prevp = NULL;
+
+	/*
+	 * Get the "metapage". This will either be the metapage
+	 * or the previous page in the free list if we are doing
+	 * sorted allocations. If its a previous page then
+	 * we will not be truncating.
+	 */
+	is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+	/* On fetch failure this jumps to check_meta with ret set. */
+	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+	if (argp->meta_pgno != PGNO_BASE_MD)
+		prevp = (PAGE *)meta;
+
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+
+	/*
+	 * Fix up the metadata page. If we're redoing or undoing the operation
+	 * we get the page and update its LSN, last and free pointer.
+	 */
+	if (cmp_p == 0 && DB_REDO(op)) {
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		/*
+		 * If we are at the end of the file truncate, otherwise
+		 * put on the free list.
+		 */
+		if (argp->pgno == argp->last_pgno)
+			meta->last_pgno = argp->pgno - 1;
+		else if (is_meta)
+			meta->free = argp->pgno;
+		else
+			NEXT_PGNO(prevp) = argp->pgno;
+		LSN(meta) = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		if (is_meta) {
+			if (meta->last_pgno < argp->pgno)
+				meta->last_pgno = argp->pgno;
+			meta->free = argp->next;
+		} else
+			NEXT_PGNO(prevp) = argp->next;
+		LSN(meta) = argp->meta_lsn;
+	}
+
+check_meta:
+	if (ret != 0 && is_meta) {
+		/* The metadata page must always exist. */
+		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+		goto out;
+	}
+
+	/*
+	 * Get the freed page. Don't create the page if we are going to
+	 * free it. If we're redoing the operation we get the page and
+	 * explicitly discard its contents, then update its LSN. If we're
+	 * undoing the operation, we get the page and restore its header.
+	 */
+	if (DB_REDO(op) || (is_meta && meta->last_pgno < argp->pgno)) {
+		if ((ret = __memp_fget(mpf, &argp->pgno,
+		    ip, NULL, 0, &pagep)) != 0) {
+			if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+			if (is_meta &&
+			    DB_REDO(op) && meta->last_pgno <= argp->pgno)
+				goto trunc;
+			goto done;
+		}
+	} else if ((ret = __memp_fget(mpf, &argp->pgno,
+	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+		goto out;
+
+	/* The logged header may be unaligned; copy its LSN out safely. */
+	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+	/*
+	 * This page got extended by a later allocation,
+	 * but its allocation was not in the scope of this
+	 * recovery pass.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)))
+		cmp_p = 0;
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+	if (DB_REDO(op) &&
+	    (cmp_p == 0 ||
+	    (IS_ZERO_LSN(copy_lsn) &&
+	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+		/* Need to redo the deallocation. */
+		/*
+		 * The page can be truncated if it was truncated at runtime
+		 * and the current metapage reflects the truncation.
+		 */
+		if (is_meta && meta->last_pgno <= argp->pgno &&
+		    argp->last_pgno <= argp->pgno) {
+			if ((ret = __memp_fput(mpf, ip,
+			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
+				goto out;
+			pagep = NULL;
+trunc:			if ((ret = __memp_ftruncate(mpf, NULL, ip,
+			    argp->pgno, MP_TRUNC_RECOVER)) != 0)
+				goto out;
+		} else if (argp->last_pgno == argp->pgno) {
+			/* The page was truncated at runtime, zero it out. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			P_INIT(pagep, 0, PGNO_INVALID,
+			    PGNO_INVALID, PGNO_INVALID, 0, P_INVALID);
+			ZERO_LSN(pagep->lsn);
+		} else {
+			/* Ordinary case: re-init as a free-list page. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			P_INIT(pagep, file_dbp->pgsize,
+			    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+			pagep->lsn = *lsnp;
+
+		}
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to reallocate the page. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->header.data, argp->header.size);
+		if (data)
+			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+			    argp->data.data, argp->data.size);
+	}
+	if (pagep != NULL &&
+	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+	pagep = NULL;
+#ifdef HAVE_FTRUNCATE
+	/*
+	 * If we are keeping an in memory free list remove this
+	 * element from the list.
+	 */
+	if (op == DB_TXN_ABORT && argp->pgno != argp->last_pgno) {
+		db_pgno_t *lp;
+		u_int32_t nelem, pos;
+
+		if ((ret = __memp_get_freelist(mpf, &nelem, &lp)) != 0)
+			goto out;
+		if (lp != NULL) {
+			pos = 0;
+			if (!is_meta) {
+				__db_freelist_pos(argp->pgno, lp, nelem, &pos);
+
+				/*
+				 * If we aborted after logging but before
+				 * updating the free list don't do anything.
+				 */
+				if (argp->pgno != lp[pos]) {
+					DB_ASSERT(env,
+					    argp->meta_pgno == lp[pos]);
+					goto done;
+				}
+				DB_ASSERT(env,
+				    argp->meta_pgno == lp[pos - 1]);
+			} else if (nelem != 0 && argp->pgno != lp[pos])
+				goto done;
+
+			if (pos < nelem)
+				memmove(&lp[pos], &lp[pos + 1],
+				    ((nelem - pos) - 1) * sizeof(*lp));
+
+			/* Shrink the list */
+			if ((ret =
+			    __memp_extend_freelist(mpf, nelem - 1, &lp)) != 0)
+				goto out;
+		}
+	}
+#endif
+done:
+	if (meta != NULL &&
+	    (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+	meta = NULL;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+	return (ret);
+}
+
+/*
+ * __db_pg_free_recover --
+ *	Recovery function for pg_free.
+ *
+ * PUBLIC: int __db_pg_free_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_free_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_free_print);
+	REC_INTRO(__db_pg_free_read, ip, 0);
+
+	/*
+	 * __db_pg_free_args is a prefix of __db_pg_freedata_args, so the
+	 * cast is safe; data == 0 tells the helper no page data was logged.
+	 */
+	ret = __db_pg_free_recover_int(env, ip,
+	    (__db_pg_freedata_args *)argp, file_dbp, lsnp, mpf, op, 0);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_recover --
+ *	Recovery function for pg_freedata.
+ *
+ * PUBLIC: int __db_pg_freedata_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_freedata_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_freedata_print);
+	REC_INTRO(__db_pg_freedata_read, ip, 0);
+
+	/* data == 1: the record carries page data to restore on undo. */
+	ret = __db_pg_free_recover_int(env,
+	    ip, argp, file_dbp, lsnp, mpf, op, 1);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_cksum_recover --
+ *	Recovery function for checksum failure log record.
+ *
+ * PUBLIC: int __db_cksum_recover __P((ENV *,
+ * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_cksum_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_cksum_args *argp;
+	int ret;
+
+	COMPQUIET(info, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+
+	REC_PRINT(__db_cksum_print);
+
+	if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0)
+		return (ret);
+
+	/*
+	 * We had a checksum failure -- the only option is to run catastrophic
+	 * recovery.
+	 */
+	if (F_ISSET(env, ENV_RECOVER_FATAL))
+		ret = 0;
+	else {
+		/* Not running catastrophic recovery: panic the environment. */
+		__db_errx(env,
+		    "Checksum failure requires catastrophic recovery");
+		ret = __env_panic(env, DB_RUNRECOVERY);
+	}
+
+	__os_free(env, argp);
+	return (ret);
+}
+
+/*
+ * __db_pg_init_recover --
+ *	Recovery function to reinit pages after truncation.
+ *
+ * PUBLIC: int __db_pg_init_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_init_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_init_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_LSN copy_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, ret, type;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_init_print);
+	REC_INTRO(__db_pg_init_read, ip, 0);
+
+	/* NOTE(review): REC_INTRO appears to set mpf already; this explicit
+	 * reassignment looks redundant but harmless -- confirm. */
+	mpf = file_dbp->mpf;
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_UNDO(op)) {
+			if (ret == DB_PAGE_NOTFOUND)
+				goto done;
+			else {
+				ret = __db_pgerr(file_dbp, argp->pgno, ret);
+				goto out;
+			}
+		}
+
+		/*
+		 * This page was truncated and may simply not have
+		 * had an item written to it yet. This should only
+		 * happen on hash databases, so confirm that.
+		 */
+		DB_ASSERT(env, file_dbp->type == DB_HASH);
+		if ((ret = __memp_fget(mpf, &argp->pgno,
+		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+	}
+
+	/* The logged header may be unaligned; copy its LSN out safely. */
+	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Re-init the page as an empty page of the proper type. */
+		if (TYPE(pagep) == P_HASH)
+			type = P_HASH;
+		else
+			type = file_dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize, PGNO(pagep), PGNO_INVALID,
+		    PGNO_INVALID, TYPE(pagep) == P_HASH ? 0 : 1, type);
+		pagep->lsn = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Put the data back on the page. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->header.data, argp->header.size);
+		if (argp->data.size > 0)
+			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+			    argp->data.data, argp->data.size);
+	}
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_trunc_recover --
+ *	Recovery function for pg_trunc (freeing a sorted run of pages by
+ *	truncating the file).  Redo re-truncates and fixes the freelist
+ *	links/meta page; undo re-creates the pages and relinks them into
+ *	the free list.  Only built when HAVE_FTRUNCATE is defined.
+ *
+ * PUBLIC: int __db_pg_trunc_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_trunc_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+#ifdef HAVE_FTRUNCATE
+	__db_pg_trunc_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pglist_t *pglist, *lp;
+	db_pgno_t last_pgno, *list;
+	u_int32_t felem, nelem, pos;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_trunc_print);
+	REC_INTRO(__db_pg_trunc_read, ip, 1);
+
+	/* The log record carries the sorted list of pages being freed. */
+	pglist = (db_pglist_t *) argp->list.data;
+	nelem = argp->list.size / sizeof(db_pglist_t);
+	if (DB_REDO(op)) {
+		/*
+		 * First call __db_pg_truncate to find the truncation
+		 * point, truncate the file and return the new last_pgno.
+		 */
+		last_pgno = argp->last_pgno;
+		if ((ret = __db_pg_truncate(dbc, NULL, pglist,
+		    NULL, &nelem, argp->next_free, &last_pgno, lsnp, 1)) != 0)
+			goto out;
+
+		if (argp->last_free != PGNO_INVALID) {
+			/*
+			 * Update the next pointer of the last page in
+			 * the freelist. If the truncation point is
+			 * beyond next_free then this is still in the freelist
+			 * otherwise the last_free page is at the end.
+			 */
+			if ((ret = __memp_fget(mpf,
+			    &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta),
+				    &argp->last_lsn) == 0) {
+					REC_DIRTY(mpf,
+					    ip, dbc->priority, &meta);
+					if (pglist->pgno > last_pgno)
+						NEXT_PGNO(meta) = PGNO_INVALID;
+					else
+						NEXT_PGNO(meta) = pglist->pgno;
+					LSN(meta) = *lsnp;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+				meta = NULL;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+		}
+		/* Now fix up the metadata page's free/last_pgno fields. */
+		if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+		    0, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			if (argp->last_free == PGNO_INVALID) {
+				if (nelem == 0)
+					meta->free = PGNO_INVALID;
+				else
+					meta->free = pglist->pgno;
+			}
+			meta->last_pgno = last_pgno;
+			LSN(meta) = *lsnp;
+		}
+	} else {
+		/* Put the free list back in its original order. */
+		for (lp = pglist; lp < &pglist[nelem]; lp++) {
+			if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+			    NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+				goto out;
+			if (IS_ZERO_LSN(LSN(pagep)) ||
+			    LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+				REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+				P_INIT(pagep, file_dbp->pgsize, lp->pgno,
+				    PGNO_INVALID, lp->next_pgno, 0, P_INVALID);
+				LSN(pagep) = lp->lsn;
+			}
+			if ((ret = __memp_fput(mpf,
+			    ip, pagep, file_dbp->priority)) != 0)
+				goto out;
+		}
+		/*
+		 * Link the truncated part back into the free list.
+		 * Its either after the last_free page or direclty
+		 * linked to the metadata page.
+		 */
+		if (argp->last_free != PGNO_INVALID) {
+			if ((ret = __memp_fget(mpf, &argp->last_free,
+			    ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+					NEXT_PGNO(meta) = argp->next_free;
+					LSN(meta) = argp->last_lsn;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+			meta = NULL;
+		}
+		if ((ret = __memp_fget(mpf, &argp->meta,
+		    ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			/*
+			 * If we had to break up the list last_pgno
+			 * may only represent the end of the block.
+			 */
+			if (meta->last_pgno < argp->last_pgno)
+				meta->last_pgno = argp->last_pgno;
+			if (argp->last_free == PGNO_INVALID)
+				meta->free = argp->next_free;
+			LSN(meta) = argp->meta_lsn;
+		}
+	}
+
+	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+
+	if (op == DB_TXN_ABORT) {
+		/*
+		 * Put the pages back on the in memory free list.
+		 * If this is part of a multi-record truncate then
+		 * we need to find this batch, it may not be at the end.
+		 * If we aborted while writing one of the log records
+		 * then this set may still be in the list.
+		 */
+		if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+			goto out;
+		if (list != NULL) {
+			if (felem != 0 && list[felem - 1] > pglist->pgno) {
+				/* Locate the insertion point for this batch. */
+				__db_freelist_pos(
+				    pglist->pgno, list, felem, &pos);
+				DB_ASSERT(env, pos < felem);
+				/* Already present: nothing to re-insert. */
+				if (pglist->pgno == list[pos])
+					goto done;
+				pos++;
+			} else if (felem != 0 &&
+			    list[felem - 1] == pglist->pgno)
+				goto done;
+			else
+				pos = felem;
+			if ((ret = __memp_extend_freelist(
+			    mpf, felem + nelem, &list)) != 0)
+				goto out;
+			/* Shift the tail up and splice this batch in. */
+			if (pos != felem)
+				memmove(&list[nelem + pos], &list[pos],
+				    sizeof(*list) * (felem - pos));
+			for (lp = pglist; lp < &pglist[nelem]; lp++)
+				list[pos++] = lp->pgno;
+		}
+	}
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+#else
+	/*
+	 * If HAVE_FTRUNCATE is not defined, we'll never see pg_trunc records
+	 * to recover.
+	 */
+	COMPQUIET(env, NULL);
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+	COMPQUIET(info, NULL);
+	return (EINVAL);
+#endif
+}
+/*
+ * __db_pg_sort_44_recover --
+ *	Recovery function for pg_sort.
+ *	This is deprecated and kept for replication upgrades: it handles
+ *	log records written by release 4.4, where the page list in the
+ *	record is unsorted and must be sorted before truncating.
+ *
+ * PUBLIC: int __db_pg_sort_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_sort_44_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+#ifdef HAVE_FTRUNCATE
+	__db_pg_sort_44_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pglist_t *pglist, *lp;
+	db_pgno_t pgno, *list;
+	u_int32_t felem, nelem;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_sort_44_print);
+	REC_INTRO(__db_pg_sort_44_read, ip, 1);
+
+	pglist = (db_pglist_t *) argp->list.data;
+	nelem = argp->list.size / sizeof(db_pglist_t);
+	if (DB_REDO(op)) {
+		/* Sort the logged list, then truncate at the right point. */
+		pgno = argp->last_pgno;
+		__db_freelist_sort(pglist, nelem);
+		if ((ret = __db_pg_truncate(dbc, NULL,
+		    pglist, NULL, &nelem, PGNO_INVALID, &pgno, lsnp, 1)) != 0)
+			goto out;
+
+		if (argp->last_free != PGNO_INVALID) {
+			/* Terminate the free list at the old last page. */
+			if ((ret = __memp_fget(mpf,
+			    &argp->last_free, ip, NULL, 0, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta),
+				    &argp->last_lsn) == 0) {
+					REC_DIRTY(mpf,
+					    ip, dbc->priority, &meta);
+					NEXT_PGNO(meta) = PGNO_INVALID;
+					LSN(meta) = *lsnp;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+				meta = NULL;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+		}
+		if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL,
+		    0, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			if (argp->last_free == PGNO_INVALID) {
+				if (nelem == 0)
+					meta->free = PGNO_INVALID;
+				else
+					meta->free = pglist->pgno;
+			}
+			meta->last_pgno = pgno;
+			LSN(meta) = *lsnp;
+		}
+	} else {
+		/* Put the free list back in its original order. */
+		for (lp = pglist; lp < &pglist[nelem]; lp++) {
+			if ((ret = __memp_fget(mpf, &lp->pgno, ip,
+			    NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+				goto out;
+			if (IS_ZERO_LSN(LSN(pagep)) ||
+			    LOG_COMPARE(&LSN(pagep), lsnp) == 0) {
+				REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+				/* Chain each page to its list successor. */
+				if (lp == &pglist[nelem - 1])
+					pgno = PGNO_INVALID;
+				else
+					pgno = lp[1].pgno;
+
+				P_INIT(pagep, file_dbp->pgsize,
+				    lp->pgno, PGNO_INVALID, pgno, 0, P_INVALID);
+				LSN(pagep) = lp->lsn;
+			}
+			if ((ret = __memp_fput(mpf,
+			    ip, pagep, file_dbp->priority)) != 0)
+				goto out;
+		}
+		if (argp->last_free != PGNO_INVALID) {
+			if ((ret = __memp_fget(mpf, &argp->last_free,
+			    ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) {
+				if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+					NEXT_PGNO(meta) = pglist->pgno;
+					LSN(meta) = argp->last_lsn;
+				}
+				if ((ret = __memp_fput(mpf, ip,
+				    meta, file_dbp->priority)) != 0)
+					goto out;
+			} else if (ret != DB_PAGE_NOTFOUND)
+				goto out;
+			meta = NULL;
+		}
+		if ((ret = __memp_fget(mpf, &argp->meta,
+		    ip, NULL, DB_MPOOL_EDIT, &meta)) != 0)
+			goto out;
+		if (LOG_COMPARE(&LSN(meta), lsnp) == 0) {
+			REC_DIRTY(mpf, ip, dbc->priority, &meta);
+			meta->last_pgno = argp->last_pgno;
+			if (argp->last_free == PGNO_INVALID)
+				meta->free = pglist->pgno;
+			LSN(meta) = argp->meta_lsn;
+		}
+	}
+	if (op == DB_TXN_ABORT) {
+		/* Restore this batch to the in-memory free list. */
+		if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0)
+			goto out;
+		if (list != NULL) {
+			DB_ASSERT(env, felem == 0 ||
+			    argp->last_free == list[felem - 1]);
+			if ((ret = __memp_extend_freelist(
+			    mpf, felem + nelem, &list)) != 0)
+				goto out;
+			for (lp = pglist; lp < &pglist[nelem]; lp++)
+				list[felem++] = lp->pgno;
+		}
+	}
+
+	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+#else
+	/*
+	 * If HAVE_FTRUNCATE is not defined, we'll never see pg_sort records
+	 * to recover.
+	 */
+	COMPQUIET(env, NULL);
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(lsnp, NULL);
+	COMPQUIET(op, DB_TXN_ABORT);
+	COMPQUIET(info, NULL);
+	return (EINVAL);
+#endif
+}
+
+/*
+ * __db_pg_alloc_42_recover --
+ *	Recovery function for pg_alloc records written by release 4.2.
+ *	Redo updates the metadata page's free pointer/last_pgno and
+ *	initializes the allocated page; undo of a 4.2 allocation is not
+ *	supported (no limbo processing) and panics.
+ *
+ * PUBLIC: int __db_pg_alloc_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_alloc_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_alloc_42_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, created, level, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	meta = NULL;
+	pagep = NULL;
+	created = 0;
+	REC_PRINT(__db_pg_alloc_42_print);
+	REC_INTRO(__db_pg_alloc_42_read, ip, 0);
+
+	/*
+	 * Fix up the metadata page. If we're redoing the operation, we have
+	 * to get the metadata page and update its LSN and its free pointer.
+	 * If we're undoing the operation and the page was ever created, we put
+	 * it on the freelist.
+	 */
+	pgno = PGNO_BASE_MD;
+	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
+		/* The metadata page must always exist on redo. */
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, pgno, ret);
+			goto out;
+		} else
+			goto done;
+	}
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		LSN(meta) = *lsnp;
+		meta->free = argp->next;
+		if (argp->pgno > meta->last_pgno)
+			meta->last_pgno = argp->pgno;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* 4.2 allocations cannot be rolled back; see no_rollback. */
+		goto no_rollback;
+	}
+
+	/*
+	 * Fix up the allocated page. If the page does not exist
+	 * and we can truncate it then don't create it.
+	 * Otherwise if we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist), and update its
+	 * LSN. If we're undoing the operation, we have to reset the page's
+	 * LSN and put it on the free list, or truncate it.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		/*
+		 * We have to be able to identify if a page was newly
+		 * created so we can recover it properly. We cannot simply
+		 * look for an empty header, because hash uses a pgin
+		 * function that will set the header. Instead, we explicitly
+		 * try for the page without CREATE and if that fails, then
+		 * create it.
+		 */
+		if ((ret = __memp_fget(mpf, &argp->pgno,
+		    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) {
+			if (DB_UNDO(op) && ret == ENOSPC)
+				goto do_truncate;
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+		created = 1;
+	}
+
+	/* Fix up the allocated page. */
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);
+
+	/*
+	 * If an initial allocation is aborted and then reallocated during
+	 * an archival restore the log record will have an LSN for the page
+	 * but the page will be empty.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)) ||
+	    (IS_ZERO_LSN(argp->page_lsn) && IS_INIT_LSN(LSN(pagep))))
+		cmp_p = 0;
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
+	/*
+	 * Another special case we have to handle is if we ended up with a
+	 * page of all 0's which can happen if we abort between allocating a
+	 * page in mpool and initializing it. In that case, even if we're
+	 * undoing, we need to re-initialize the page.
+	 */
+	if (DB_REDO(op) && cmp_p == 0) {
+		/* Need to redo update described. */
+		switch (argp->ptype) {
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			level = LEAFLEVEL;
+			break;
+		default:
+			level = 0;
+			break;
+		}
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+		pagep->lsn = *lsnp;
+	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
+		/*
+		 * This is where we handle the case of a 0'd page (pagep->pgno
+		 * is equal to PGNO_INVALID).
+		 * Undo the allocation, reinitialize the page and
+		 * link its next pointer to the free list.
+		 */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+		pagep->lsn = argp->page_lsn;
+	}
+
+do_truncate:
+	/*
+	 * We cannot undo things from 4.2 land, because we nolonger
+	 * have limbo processing.
+	 */
+	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
+	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+no_rollback:	__db_errx(env,
+"Cannot replicate prepared transactions from master running release 4.2 ");
+		ret = __env_panic(env, EINVAL);
+	}
+
+	if (pagep != NULL &&
+	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+	if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+	meta = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_free_recover_42_int --
+ *	Shared worker for the 4.2-format pg_free/pg_freedata recovery
+ *	functions.  "data" is nonzero when the log record also carries the
+ *	page's data section (pg_freedata); the freedata args layout is a
+ *	superset of the free args layout, so both callers pass it here.
+ */
+static int
+__db_pg_free_recover_42_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	__db_pg_freedata_42_args *argp;
+	DB *file_dbp;
+	DB_LSN *lsnp;
+	DB_MPOOLFILE *mpf;
+	db_recops op;
+	int data;
+{
+	DBMETA *meta;
+	DB_LSN copy_lsn;
+	PAGE *pagep, *prevp;
+	int cmp_n, cmp_p, is_meta, ret;
+
+	meta = NULL;
+	pagep = NULL;
+	prevp = NULL;
+
+	/*
+	 * Get the "metapage". This will either be the metapage
+	 * or the previous page in the free list if we are doing
+	 * sorted allocations. If its a previous page then
+	 * we will not be truncating.
+	 */
+	is_meta = argp->meta_pgno == PGNO_BASE_MD;
+
+	/* REC_FGET jumps to check_meta on a fetch failure. */
+	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);
+
+	if (argp->meta_pgno != PGNO_BASE_MD)
+		prevp = (PAGE *)meta;
+
+	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+
+	/*
+	 * Fix up the metadata page. If we're redoing or undoing the operation
+	 * we get the page and update its LSN, last and free pointer.
+	 */
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		if (prevp == NULL)
+			meta->free = argp->pgno;
+		else
+			NEXT_PGNO(prevp) = argp->pgno;
+		/*
+		 * If this was a compensating transaction and
+		 * we are a replica, then we never executed the
+		 * original allocation which incremented meta->free.
+		 */
+		if (prevp == NULL && meta->last_pgno < meta->free)
+			meta->last_pgno = meta->free;
+		LSN(meta) = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+		if (prevp == NULL)
+			meta->free = argp->next;
+		else
+			NEXT_PGNO(prevp) = argp->next;
+		LSN(meta) = argp->meta_lsn;
+		if (prevp == NULL && meta->last_pgno < argp->pgno)
+			meta->last_pgno = argp->pgno;
+	}
+
+check_meta:
+	if (ret != 0 && is_meta) {
+		/* The metadata page must always exist. */
+		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+		goto out;
+	}
+
+	/*
+	 * Get the freed page. If we support truncate then don't
+	 * create the page if we are going to free it. If we're
+	 * redoing the operation we get the page and explicitly discard
+	 * its contents, then update its LSN. If we're undoing the
+	 * operation, we get the page and restore its header.
+	 * If we don't support truncate, then we must create the page
+	 * and roll it back.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno,
+	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
+		goto out;
+
+	/* copy_lsn is the page LSN saved in the logged page header. */
+	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+	if (DB_REDO(op) &&
+	    (cmp_p == 0 ||
+	    (IS_ZERO_LSN(copy_lsn) &&
+	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+		/* Need to redo the deallocation. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+		pagep->lsn = *lsnp;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to reallocate the page. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		memcpy(pagep, argp->header.data, argp->header.size);
+		if (data)
+			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
+			    argp->data.data, argp->data.size);
+	}
+	if (pagep != NULL &&
+	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+	pagep = NULL;
+	if (meta != NULL &&
+	    (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+		goto out;
+	meta = NULL;
+
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	if (meta != NULL)
+		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+
+	return (ret);
+}
+
+/*
+ * __db_pg_free_42_recover --
+ *	Recovery function for 4.2-format pg_free records.  Thin wrapper
+ *	over __db_pg_free_recover_42_int with the data flag clear; the
+ *	args struct is cast because the freedata layout is a superset of
+ *	the free layout.
+ *
+ * PUBLIC: int __db_pg_free_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_free_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_free_42_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_free_42_print);
+	REC_INTRO(__db_pg_free_42_read, ip, 0);
+
+	ret = __db_pg_free_recover_42_int(env, ip,
+	    (__db_pg_freedata_42_args *)argp, file_dbp, lsnp, mpf, op, 0);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_pg_freedata_42_recover --
+ *	Recovery function for 4.2-format pg_freedata records.  Thin
+ *	wrapper over __db_pg_free_recover_42_int with the data flag set
+ *	(the record includes the page's data section).
+ *
+ * PUBLIC: int __db_pg_freedata_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_pg_freedata_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_pg_freedata_42_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+	int ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__db_pg_freedata_42_print);
+	REC_INTRO(__db_pg_freedata_42_read, ip, 0);
+
+	ret = __db_pg_free_recover_42_int(
+	    env, ip, argp, file_dbp, lsnp, mpf, op, 1);
+
+done:	*lsnp = argp->prev_lsn;
+out:
+	REC_CLOSE;
+}
+
+/*
+ * __db_relink_42_recover --
+ *	Recovery function for 4.2-format relink records: fixes the
+ *	prev/next pointers of up to three pages (the page itself and its
+ *	neighbors) for a page add/remove in a doubly-linked leaf chain.
+ *
+ * PUBLIC: int __db_relink_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__db_relink_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__db_relink_42_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__db_relink_42_print);
+	REC_INTRO(__db_relink_42_read, ip, 0);
+
+	/*
+	 * There are up to three pages we need to check -- the page, and the
+	 * previous and next pages, if they existed. For a page add operation,
+	 * the current page is the result of a split and is being recovered
+	 * elsewhere, so all we need do is recover the next page.
+	 */
+	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, argp->pgno, ret);
+			goto out;
+		}
+		goto next2;
+	}
+	if (argp->opcode == DB_ADD_PAGE_COMPAT)
+		goto next1;
+
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Redo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->lsn = *lsnp;
+	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+		/* Undo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->next_pgno = argp->next;
+		pagep->prev_pgno = argp->prev;
+		pagep->lsn = argp->lsn;
+	}
+next1:	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+	/* Fix the next page's prev pointer. */
+next2:	if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, argp->next, ret);
+			goto out;
+		}
+		goto prev;
+	}
+	modified = 0;
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+	if ((argp->opcode == DB_REM_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op)) ||
+	    (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_n == 0 && DB_UNDO(op))) {
+		/* Redo the remove or undo the add. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->prev_pgno = argp->prev;
+		modified = 1;
+	} else if ((argp->opcode == DB_REM_PAGE_COMPAT &&
+	    cmp_n == 0 && DB_UNDO(op)) ||
+	    (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op))) {
+		/* Undo the remove or redo the add. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->prev_pgno = argp->pgno;
+		modified = 1;
+	}
+	if (modified) {
+		if (DB_UNDO(op))
+			pagep->lsn = argp->lsn_next;
+		else
+			pagep->lsn = *lsnp;
+	}
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+	if (argp->opcode == DB_ADD_PAGE_COMPAT)
+		goto done;
+
+	/* Fix the previous page's next pointer. */
+prev:	if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+		if (DB_REDO(op)) {
+			ret = __db_pgerr(file_dbp, argp->prev, ret);
+			goto out;
+		}
+		goto done;
+	}
+	modified = 0;
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Redo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->next_pgno = argp->next;
+		modified = 1;
+	} else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+		/* Undo the relink. */
+		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+		pagep->next_pgno = argp->pgno;
+		modified = 1;
+	}
+	if (modified) {
+		if (DB_UNDO(op))
+			pagep->lsn = argp->lsn_prev;
+		else
+			pagep->lsn = *lsnp;
+	}
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+	pagep = NULL;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	if (pagep != NULL)
+		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+	REC_CLOSE;
+}
diff --git a/db/db_reclaim.c b/db/db_reclaim.c
new file mode 100644
index 0000000..a44d054
--- /dev/null
+++ b/db/db_reclaim.c
@@ -0,0 +1,246 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+/*
+ * __db_traverse_big
+ *	Traverse a chain of overflow pages and call the callback routine
+ * on each one. The calling convention for the callback is:
+ *	callback(dbc, page, cookie, did_put),
+ * where did_put is a return value indicating if the page in question has
+ * already been returned to the mpool.  If the callback did not put the
+ * page, it is put here after the callback returns.
+ *
+ * PUBLIC: int __db_traverse_big __P((DBC *, db_pgno_t,
+ * PUBLIC:	int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__db_traverse_big(dbc, pgno, callback, cookie)
+	DBC *dbc;
+	db_pgno_t pgno;
+	int (*callback) __P((DBC *, PAGE *, void *, int *));
+	void *cookie;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *p;
+	int did_put, ret;
+
+	mpf = dbc->dbp->mpf;
+
+	do {
+		did_put = 0;
+		if ((ret = __memp_fget(mpf,
+		    &pgno, dbc->thread_info, dbc->txn, 0, &p)) != 0)
+			return (ret);
+		/*
+		 * If we are freeing pages only process the overflow
+		 * chain if the head of the chain has a refcount of 1.
+		 * (Save NEXT_PGNO before the callback may free the page.)
+		 */
+		pgno = NEXT_PGNO(p);
+		if (callback == __db_truncate_callback && OV_REF(p) != 1)
+			pgno = PGNO_INVALID;
+		if ((ret = callback(dbc, p, cookie, &did_put)) == 0 &&
+		    !did_put)
+			ret = __memp_fput(mpf,
+			    dbc->thread_info, p, dbc->priority);
+	} while (ret == 0 && pgno != PGNO_INVALID);
+
+	return (ret);
+}
+
+/*
+ * __db_reclaim_callback
+ *	Page-traversal callback used while deleting a subdatabase: frees
+ *	each visited page.  Btree and hash share the traversal code for
+ *	duplicates and overflow items, so the actual free lives here; hash
+ *	reuses the same traversal for statistics gathering with a
+ *	different callback.
+ *
+ *	The btree/recno root page is deliberately NOT freed here: if the
+ *	transaction aborts, the subdatabase might not be openable to undo
+ *	the free, so the root's free is logged separately.
+ *
+ * PUBLIC: int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_reclaim_callback(dbc, p, cookie, putp)
+	DBC *dbc;
+	PAGE *p;
+	void *cookie;
+	int *putp;
+{
+	DB *dbp;
+	int is_root, ret;
+
+	COMPQUIET(cookie, NULL);
+	dbp = dbc->dbp;
+
+	/* Leave the root of a btree/recno subdatabase alone. */
+	is_root = (dbp->type == DB_BTREE || dbp->type == DB_RECNO) &&
+	    ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p);
+	if (is_root)
+		return (0);
+
+	/* Free the page; only report it as put if the free succeeded. */
+	ret = __db_free(dbc, p);
+	if (ret == 0)
+		*putp = 1;
+	return (ret);
+}
+
+/*
+ * __db_truncate_callback
+ *	This is the callback routine used during a truncate.
+ * we are traversing a btree or hash table and trying to free all the
+ * pages.  The cookie is a u_int32_t record counter that is incremented
+ * for each live item seen; *putp reports whether the page was freed
+ * (or otherwise returned to the mpool) here.
+ *
+ * PUBLIC: int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__db_truncate_callback(dbc, p, cookie, putp)
+	DBC *dbc;
+	PAGE *p;
+	void *cookie;
+	int *putp;
+{
+	DB *dbp;
+	DBT ddbt, ldbt;
+	DB_MPOOLFILE *mpf;
+	db_indx_t indx, len, off, tlen, top;
+	u_int8_t *hk, type;
+	u_int32_t *countp;
+	int ret;
+
+	top = NUM_ENT(p);
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	countp = cookie;
+	*putp = 1;
+
+	switch (TYPE(p)) {
+	case P_LBTREE:
+		/* Skip for off-page duplicates and deleted items. */
+		for (indx = 0; indx < top; indx += P_INDX) {
+			type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type;
+			if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
+				++*countp;
+		}
+		/* FALLTHROUGH */
+	case P_IBTREE:
+	case P_IRECNO:
+	case P_INVALID:
+		/* Reinitialize (don't free) the btree/recno root page. */
+		if (dbp->type != DB_HASH &&
+		    ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+			type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
+			goto reinit;
+		}
+		break;
+	case P_OVERFLOW:
+		/* Drop one reference; free the page only when it hits 0. */
+		if ((ret = __memp_dirty(mpf,
+		    &p, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			return (ret);
+		if (DBC_LOGGING(dbc)) {
+			if ((ret = __db_ovref_log(dbp, dbc->txn,
+			    &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0)
+				return (ret);
+		} else
+			LSN_NOT_LOGGED(LSN(p));
+		if (--OV_REF(p) != 0)
+			*putp = 0;
+		break;
+	case P_LRECNO:
+		for (indx = 0; indx < top; indx += O_INDX) {
+			type = GET_BKEYDATA(dbp, p, indx)->type;
+			if (!B_DISSET(type))
+				++*countp;
+		}
+
+		if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
+			type = P_LRECNO;
+			goto reinit;
+		}
+		break;
+	case P_LDUP:
+		/* Correct for deleted items. */
+		for (indx = 0; indx < top; indx += O_INDX)
+			if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type))
+				++*countp;
+
+		break;
+	case P_HASH:
+		/* Correct for on-page duplicates and deleted items. */
+		for (indx = 0; indx < top; indx += P_INDX) {
+			switch (*H_PAIRDATA(dbp, p, indx)) {
+			case H_OFFDUP:
+				break;
+			case H_OFFPAGE:
+			case H_KEYDATA:
+				++*countp;
+				break;
+			case H_DUPLICATE:
+				/* Count each element of an on-page dup set. */
+				tlen = LEN_HDATA(dbp, p, 0, indx);
+				hk = H_PAIRDATA(dbp, p, indx);
+				for (off = 0; off < tlen;
+				    off += len + 2 * sizeof(db_indx_t)) {
+					++*countp;
+					memcpy(&len,
+					    HKEYDATA_DATA(hk)
+					    + off, sizeof(db_indx_t));
+				}
+				break;
+			default:
+				return (__db_pgfmt(dbp->env, p->pgno));
+			}
+		}
+		/* Don't free the head of the bucket. */
+		if (PREV_PGNO(p) == PGNO_INVALID) {
+			type = P_HASH;
+
+			/*
+			 * reinit: log the old page image (header + data) via
+			 * pg_init, then reset the page to an empty page of
+			 * "type" in place rather than freeing it.
+			 */
+reinit:		if ((ret = __memp_dirty(mpf, &p,
+		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+				return (ret);
+			*putp = 0;
+			if (DBC_LOGGING(dbc)) {
+				memset(&ldbt, 0, sizeof(ldbt));
+				memset(&ddbt, 0, sizeof(ddbt));
+				ldbt.data = p;
+				ldbt.size = P_OVERHEAD(dbp);
+				ldbt.size += p->entries * sizeof(db_indx_t);
+				ddbt.data = (u_int8_t *)p + HOFFSET(p);
+				ddbt.size = dbp->pgsize - HOFFSET(p);
+				if ((ret = __db_pg_init_log(dbp,
+				    dbc->txn, &LSN(p), 0,
+				    p->pgno, &ldbt, &ddbt)) != 0)
+					return (ret);
+			} else
+				LSN_NOT_LOGGED(LSN(p));
+
+			P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID,
+			    PGNO_INVALID, type == P_HASH ? 0 : 1, type);
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp->env, p->pgno));
+	}
+
+	/* Free the page, or just release it if it was kept/reinited. */
+	if (*putp == 1) {
+		if ((ret = __db_free(dbc, p)) != 0)
+			return (ret);
+	} else {
+		if ((ret = __memp_fput(mpf, dbc->thread_info, p,
+		    dbc->priority)) != 0)
+			return (ret);
+		*putp = 1;
+	}
+
+	return (0);
+}
diff --git a/db/db_remove.c b/db/db_remove.c
new file mode 100644
index 0000000..6b59ec3
--- /dev/null
+++ b/db/db_remove.c
@@ -0,0 +1,492 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/fop.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_dbtxn_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+static int __db_subdb_remove __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, const char *, const char *));
+
+/*
+ * __env_dbremove_pp
+ * ENV->dbremove pre/post processing.
+ *
+ * PUBLIC: int __env_dbremove_pp __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbremove_pp(dbenv, txn, name, subdb, flags)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret, txn_local;
+
+	dbp = NULL;
+	env = dbenv->env;
+	txn_local = 0;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbremove");
+
+	/*
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if ((ret = __db_fchk(env,
+	    "DB->remove", flags, DB_AUTO_COMMIT | DB_TXN_NOT_DURABLE)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else
+		if (txn != NULL && !TXN_ON(env) &&
+		    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
+			ret = __db_not_txn_env(env);
+			goto err;
+		}
+	LF_CLR(DB_AUTO_COMMIT);
+
+	/*
+	 * Do the remove through a private DB handle; it is closed below,
+	 * after the transaction (if local) has been resolved.
+	 */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+	if (LF_ISSET(DB_TXN_NOT_DURABLE) &&
+	    (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	LF_CLR(DB_TXN_NOT_DURABLE);
+
+	ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+	if (txn_local) {
+		/*
+		 * We created the DBP here and when we commit/abort, we'll
+		 * release all the transactional locks, including the handle
+		 * lock; mark the handle cleared explicitly.
+		 */
+		LOCK_INIT(dbp->handle_lock);
+		dbp->locker = NULL;
+	} else if (txn != NULL) {
+		/*
+		 * We created this handle locally so we need to close it
+		 * and clean it up.  Unfortunately, it's holding transactional
+		 * locks that need to persist until the end of transaction.
+		 * If we invalidate the locker id (dbp->locker), then the close
+		 * won't free these locks prematurely.
+		 */
+		 dbp->locker = NULL;
+	}
+
+err:	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * We never opened this dbp for real, so don't include a transaction
+	 * handle, and use NOSYNC to avoid calling into mpool.
+	 *
+	 * !!!
+	 * Note we're reversing the order of operations: we started the txn and
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- a DB handle cannot be closed before
+	 * resolving the txn.
+	 */
+	if (dbp != NULL &&
+	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_remove_pp
+ * DB->remove pre/post processing.
+ *
+ * PUBLIC: int __db_remove_pp
+ * PUBLIC: __P((DB *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_pp(dbp, name, subdb, flags)
+	DB *dbp;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	/*
+	 * Validate arguments, continuing to destroy the handle on failure.
+	 *
+	 * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+	 *
+	 * !!!
+	 * We have a serious problem if we're here with a handle used to open
+	 * a database -- we'll destroy the handle, and the application won't
+	 * ever be able to close the database.
+	 */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		return (__db_mi_open(env, "DB->remove", 1));
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(env, "DB->remove", flags, 0)) != 0)
+		return (ret);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Remove the file.  __db_remove() also closes dbp, so the handle
+	 * must not be used after this call.
+	 */
+	ret = __db_remove(dbp, ip, NULL, name, subdb, flags);
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_remove
+ * DB->remove method.
+ *
+ * PUBLIC: int __db_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove(dbp, ip, txn, name, subdb, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	int ret, t_ret;
+
+	/* Do the remove, then close the handle unconditionally. */
+	ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);
+
+	/* The handle is closed even if the remove failed. */
+	if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_remove_int
+ * Worker function for the DB->remove method.
+ *
+ * PUBLIC: int __db_remove_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__db_remove_int(dbp, ip, txn, name, subdb, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+	char *real_name, *tmpname;
+
+	env = dbp->env;
+	real_name = tmpname = NULL;
+
+	/* Unnamed (temporary) databases cannot be removed. */
+	if (name == NULL && subdb == NULL) {
+		__db_errx(env, "Remove on temporary files invalid");
+		ret = EINVAL;
+		goto err;
+	}
+
+	if (name == NULL) {
+		/*
+		 * Named in-memory database: real_name aliases subdb and is
+		 * deliberately not freed at err (guarded by !DB_AM_INMEM).
+		 */
+		MAKE_INMEM(dbp);
+		real_name = (char *)subdb;
+	} else if (subdb != NULL) {
+		/* A subdatabase remove is handled separately. */
+		ret = __db_subdb_remove(dbp, ip, txn, name, subdb);
+		goto err;
+	}
+
+	/* Handle transactional file removes separately. */
+	if (IS_REAL_TXN(txn)) {
+		ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb);
+		goto err;
+	}
+
+	/*
+	 * The remaining case is a non-transactional file remove.
+	 *
+	 * Find the real name of the file.
+	 */
+	if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = __db_appname(env,
+	    DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+		goto err;
+
+	/*
+	 * If this is a file and force is set, remove the temporary file, which
+	 * may have been left around.  Ignore errors because the temporary file
+	 * might not exist.
+	 */
+	if (!F_ISSET(dbp, DB_AM_INMEM) && LF_ISSET(DB_FORCE) &&
+	    (ret = __db_backup_name(env, real_name, NULL, &tmpname)) == 0)
+		(void)__os_unlink(env, tmpname, 0);
+
+	if ((ret = __fop_remove_setup(dbp, NULL, real_name, 0)) != 0)
+		goto err;
+
+	/* Give the access method a chance to remove auxiliary files. */
+	if (dbp->db_am_remove != NULL &&
+	    (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0)
+		goto err;
+
+	ret = F_ISSET(dbp, DB_AM_INMEM) ?
+	    __db_inmem_remove(dbp, NULL, real_name) :
+	    __fop_remove(env,
+	    NULL, dbp->fileid, name, &dbp->dirname, DB_APP_DATA,
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+err:	if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+		__os_free(env, real_name);
+	if (tmpname != NULL)
+		__os_free(env, tmpname);
+
+	return (ret);
+}
+
+/*
+ * __db_inmem_remove --
+ * Removal of a named in-memory database.
+ *
+ * PUBLIC: int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
+ */
+int
+__db_inmem_remove(dbp, txn, name)
+	DB *dbp;
+	DB_TXN *txn;
+	const char *name;
+{
+	DBT fid_dbt, name_dbt;
+	DB_LOCKER *locker;
+	DB_LSN lsn;
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+	locker = NULL;
+
+	DB_ASSERT(env, name != NULL);
+
+	/* This had better exist if we are trying to do a remove. */
+	(void)__memp_set_flags(dbp->mpf, DB_MPOOL_NOFILE, 1);
+	if ((ret = __memp_fopen(dbp->mpf, NULL,
+	    name, &dbp->dirname, 0, 0, 0)) != 0)
+		return (ret);
+	if ((ret = __memp_get_fileid(dbp->mpf, dbp->fileid)) != 0)
+		return (ret);
+	dbp->preserve_fid = 1;
+
+	/* Pick the locker: the transaction's if there is one, else the DB's. */
+	if (LOCKING_ON(env)) {
+		if (dbp->locker == NULL &&
+		    (ret = __lock_id(env, NULL, &dbp->locker)) != 0)
+			return (ret);
+		locker = txn == NULL ? dbp->locker : txn->locker;
+	}
+
+	/*
+	 * In a transactional environment, we'll play the same game we play
+	 * for databases in the file system -- create a temporary database
+	 * and put it in with the current name and then rename this one to
+	 * another name.  We'll then use a commit-time event to remove the
+	 * entry.
+	 */
+	if ((ret =
+	    __fop_lock_handle(env, dbp, locker, DB_LOCK_WRITE, NULL, 0)) != 0)
+		return (ret);
+
+	/* Non-transactional: drop the name mapping immediately. */
+	if (!IS_REAL_TXN(txn))
+		ret = __memp_nameop(env, dbp->fileid, NULL, name, NULL, 1);
+	else if (LOGGING_ON(env)) {
+		/* Transactional: defer the removal to commit time and log it. */
+		if (txn != NULL && (ret =
+		    __txn_remevent(env, txn, name, dbp->fileid, 1)) != 0)
+			return (ret);
+
+		DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
+		DB_INIT_DBT(fid_dbt, dbp->fileid, DB_FILE_ID_LEN);
+		ret = __crdel_inmem_remove_log(
+		    env, txn, &lsn, 0, &name_dbt, &fid_dbt);
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_subdb_remove --
+ * Remove a subdatabase.
+ */
+static int
+__db_subdb_remove(dbp, ip, txn, name, subdb)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+{
+	DB *mdbp, *sdbp;
+	int ret, t_ret;
+
+	mdbp = sdbp = NULL;
+
+	/* Open the subdatabase. */
+	if ((ret = __db_create_internal(&sdbp, dbp->env, 0)) != 0)
+		goto err;
+	/* Propagate the caller's durability setting to the new handle. */
+	if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
+	    (ret = __db_set_flags(sdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	if ((ret = __db_open(sdbp, ip,
+	    txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);
+
+	/* Free up the pages in the subdatabase. */
+	switch (sdbp->type) {
+		case DB_BTREE:
+		case DB_RECNO:
+			if ((ret = __bam_reclaim(sdbp, ip, txn)) != 0)
+				goto err;
+			break;
+		case DB_HASH:
+			if ((ret = __ham_reclaim(sdbp, ip, txn)) != 0)
+				goto err;
+			break;
+		case DB_QUEUE:
+		case DB_UNKNOWN:
+		default:
+			/* Queue doesn't support subdatabases. */
+			ret = __db_unknown_type(
+			    sdbp->env, "__db_subdb_remove", sdbp->type);
+			goto err;
+	}
+
+	/*
+	 * Remove the entry from the main database and free the subdatabase
+	 * metadata page.
+	 */
+	if ((ret = __db_master_open(sdbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+		goto err;
+
+	if ((ret = __db_master_update(mdbp,
+	    sdbp, ip, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+	/* Close the main and subdatabases. */
+	if ((t_ret = __db_close(sdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (mdbp != NULL &&
+	    (t_ret = __db_close(mdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+static int
+__db_dbtxn_remove(dbp, ip, txn, name, subdb)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb;
+{
+	ENV *env;
+	int ret;
+	char *tmpname;
+
+	env = dbp->env;
+	tmpname = NULL;
+
+	/*
+	 * This is a transactional remove, so we have to keep the name
+	 * of the file locked until the transaction commits.  As a result,
+	 * we implement remove by renaming the file to some other name
+	 * (which creates a dummy named file as a placeholder for the
+	 * file being renamed/removed) and then deleting that file as
+	 * a delayed remove at commit.
+	 */
+	if ((ret = __db_backup_name(env,
+	    F_ISSET(dbp, DB_AM_INMEM) ? subdb : name, txn, &tmpname)) != 0)
+		return (ret);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+	if ((ret = __db_rename_int(dbp,
+	    txn->thread_info, txn, name, subdb, tmpname)) != 0)
+		goto err;
+
+	/*
+	 * The internal removes will also translate into delayed removes.
+	 */
+	if (dbp->db_am_remove != NULL &&
+	    (ret = dbp->db_am_remove(dbp, ip, txn, tmpname, NULL, 0)) != 0)
+		goto err;
+
+	/* Remove the renamed placeholder, in memory or on disk. */
+	ret = F_ISSET(dbp, DB_AM_INMEM) ?
+	     __db_inmem_remove(dbp, txn, tmpname) :
+	    __fop_remove(env,
+	    txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA,
+	    F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+err:
+DB_TEST_RECOVERY_LABEL
+	if (tmpname != NULL)
+		__os_free(env, tmpname);
+
+	return (ret);
+}
diff --git a/db/db_rename.c b/db/db_rename.c
new file mode 100644
index 0000000..1fdf721
--- /dev/null
+++ b/db/db_rename.c
@@ -0,0 +1,372 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/fop.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __db_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *));
+static int __db_subdb_rename __P((DB *, DB_THREAD_INFO *,
+ DB_TXN *, const char *, const char *, const char *));
+
+/*
+ * __env_dbrename_pp
+ * ENV->dbrename pre/post processing.
+ *
+ * PUBLIC: int __env_dbrename_pp __P((DB_ENV *, DB_TXN *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__env_dbrename_pp(dbenv, txn, name, subdb, newname, flags)
+	DB_ENV *dbenv;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret, txn_local;
+
+	env = dbenv->env;
+	dbp = NULL;
+	txn_local = 0;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbrename");
+
+	/*
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if ((ret = __db_fchk(env, "DB->rename", flags, DB_AUTO_COMMIT)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
+		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
+			goto err;
+		txn_local = 1;
+	} else
+		if (txn != NULL && !TXN_ON(env) &&
+		    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
+			ret = __db_not_txn_env(env);
+			goto err;
+		}
+
+	LF_CLR(DB_AUTO_COMMIT);
+
+	/* Do the rename through a private DB handle; it is closed below. */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+
+	ret = __db_rename_int(dbp, ip, txn, name, subdb, newname);
+
+	if (txn_local) {
+		/*
+		 * We created the DBP here and when we commit/abort, we'll
+		 * release all the transactional locks, including the handle
+		 * lock; mark the handle cleared explicitly.
+		 */
+		LOCK_INIT(dbp->handle_lock);
+		dbp->locker = NULL;
+	} else if (txn != NULL) {
+		/*
+		 * We created this handle locally so we need to close it and
+		 * clean it up.  Unfortunately, it's holding transactional
+		 * or CDS group locks that need to persist until the end of
+		 * transaction.  If we invalidate the locker (dbp->locker),
+		 * then the close won't free these locks prematurely.
+		 */
+		dbp->locker = NULL;
+	}
+
+err:	if (txn_local && (t_ret =
+	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/*
+	 * We never opened this dbp for real, so don't include a transaction
+	 * handle, and use NOSYNC to avoid calling into mpool.
+	 *
+	 * !!!
+	 * Note we're reversing the order of operations: we started the txn and
+	 * then opened the DB handle; we're resolving the txn and then
+	 * closing the DB handle -- it's safer.
+	 */
+	if (dbp != NULL &&
+	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_rename_pp
+ * DB->rename pre/post processing.
+ *
+ * PUBLIC: int __db_rename_pp __P((DB *,
+ * PUBLIC: const char *, const char *, const char *, u_int32_t));
+ */
+int
+__db_rename_pp(dbp, name, subdb, newname, flags)
+	DB *dbp;
+	const char *name, *subdb, *newname;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+	/* Initialized here so the err path is safe before ENV_ENTER. */
+	handle_check = 0;
+
+	/*
+	 * Validate arguments, continuing to destroy the handle on failure.
+	 *
+	 * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
+	 *
+	 * !!!
+	 * We have a serious problem if we're here with a handle used to open
+	 * a database -- we'll destroy the handle, and the application won't
+	 * ever be able to close the database.
+	 */
+	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
+		return (__db_mi_open(env, "DB->rename", 1));
+
+	/* Validate arguments. */
+	if ((ret = __db_fchk(env, "DB->rename", flags, 0)) != 0)
+		return (ret);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Rename the file.  __db_rename() also closes dbp, so the handle
+	 * must not be used after this call.
+	 */
+	ret = __db_rename(dbp, ip, NULL, name, subdb, newname);
+
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_rename
+ * DB->rename method.
+ *
+ */
+static int
+__db_rename(dbp, ip, txn, name, subdb, newname)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+{
+	int ret, t_ret;
+
+	/* Do the rename, then close the handle unconditionally. */
+	ret = __db_rename_int(dbp, ip, txn, name, subdb, newname);
+
+	/* The handle is closed even if the rename failed. */
+	if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_rename_int
+ * Worker function for DB->rename method; the close of the dbp is
+ * left in the wrapper routine.
+ *
+ * PUBLIC: int __db_rename_int __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__db_rename_int(dbp, ip, txn, name, subdb, newname)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+{
+	ENV *env;
+	int ret;
+	char *old, *real_name;
+
+	env = dbp->env;
+	real_name = NULL;
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);
+
+	/* Unnamed (temporary) databases cannot be renamed. */
+	if (name == NULL && subdb == NULL) {
+		__db_errx(env, "Rename on temporary files invalid");
+		ret = EINVAL;
+		goto err;
+	}
+
+	if (name == NULL)
+		MAKE_INMEM(dbp);
+	else if (subdb != NULL) {
+		/* A subdatabase rename is handled separately. */
+		ret = __db_subdb_rename(dbp, ip, txn, name, subdb, newname);
+		goto err;
+	}
+
+	/*
+	 * From here on down, this pertains to files or in-memory databases.
+	 *
+	 * Find the real name of the file.
+	 */
+	if (F_ISSET(dbp, DB_AM_INMEM)) {
+		/*
+		 * In-memory: real_name aliases subdb and is deliberately
+		 * not freed at err (guarded by !DB_AM_INMEM).
+		 */
+		old = (char *)subdb;
+		real_name = (char *)subdb;
+	} else {
+		if ((ret = __db_appname(env, DB_APP_DATA,
+		    name, &dbp->dirname, &real_name)) != 0)
+			goto err;
+		old = (char *)name;
+	}
+	DB_ASSERT(env, old != NULL);
+
+	if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0)
+		goto err;
+
+	/* Give the access method a chance to rename auxiliary files. */
+	if (dbp->db_am_rename != NULL &&
+	    (ret = dbp->db_am_rename(dbp, ip, txn, name, subdb, newname)) != 0)
+		goto err;
+
+	/*
+	 * The transactional case and non-transactional case are
+	 * quite different.  In the non-transactional case, we simply
+	 * do the rename.  In the transactional case, since we need
+	 * the ability to back out and maintain locking, we have to
+	 * create a temporary object as a placeholder.  This is all
+	 * taken care of in the fop layer.
+	 */
+	if (IS_REAL_TXN(txn)) {
+		if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0)
+			goto err;
+	} else {
+		if ((ret = __fop_dbrename(dbp, old, newname)) != 0)
+			goto err;
+	}
+
+	/*
+	 * I am pretty sure that we haven't gotten a dbreg id, so calling
+	 * dbreg_filelist_update is not necessary.
+	 */
+	DB_ASSERT(env, dbp->log_filename == NULL ||
+	    dbp->log_filename->id == DB_LOGFILEID_INVALID);
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname);
+
+DB_TEST_RECOVERY_LABEL
+err:	if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
+		__os_free(env, real_name);
+
+	return (ret);
+}
+
+/*
+ * __db_subdb_rename --
+ * Rename a subdatabase.
+ */
+static int
+__db_subdb_rename(dbp, ip, txn, name, subdb, newname)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	const char *name, *subdb, *newname;
+{
+	DB *mdbp;
+	ENV *env;
+	PAGE *meta;
+	int ret, t_ret;
+
+	mdbp = NULL;
+	meta = NULL;
+	env = dbp->env;
+
+	/*
+	 * We have not opened this dbp so it isn't marked as a subdb,
+	 * but it ought to be.
+	 */
+	F_SET(dbp, DB_AM_SUBDB);
+
+	/*
+	 * Rename the entry in the main database.  We need to first
+	 * get the meta-data page number (via MU_OPEN) so that we can
+	 * read the meta-data page and obtain a handle lock.  Once we've
+	 * done that, we can proceed to do the rename in the master.
+	 */
+	if ((ret = __db_master_open(dbp, ip, txn, name, 0, 0, &mdbp)) != 0)
+		goto err;
+
+	if ((ret = __db_master_update(mdbp, dbp, ip, txn, subdb, dbp->type,
+	    MU_OPEN, NULL, 0)) != 0)
+		goto err;
+
+	/* Read the subdatabase's file ID off its meta-data page. */
+	if ((ret = __memp_fget(mdbp->mpf, &dbp->meta_pgno,
+	    ip, txn, 0, &meta)) != 0)
+		goto err;
+	memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
+	if ((ret = __fop_lock_handle(env,
+	    dbp, mdbp->locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn))) != 0)
+		goto err;
+
+	/* Release the meta page before updating the master. */
+	ret = __memp_fput(mdbp->mpf, ip, meta, dbp->priority);
+	meta = NULL;
+	if (ret != 0)
+		goto err;
+
+	if ((ret = __db_master_update(mdbp, dbp, ip, txn,
+	    subdb, dbp->type, MU_RENAME, newname, 0)) != 0)
+		goto err;
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);
+
+DB_TEST_RECOVERY_LABEL
+err:
+	if (meta != NULL && (t_ret =
+	    __memp_fput(mdbp->mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (mdbp != NULL &&
+	    (t_ret = __db_close(mdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
diff --git a/db/db_ret.c b/db/db_ret.c
new file mode 100644
index 0000000..5ff60d1
--- /dev/null
+++ b/db/db_ret.c
@@ -0,0 +1,156 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+
+/*
+ * __db_ret --
+ * Build return DBT.
+ *
+ * PUBLIC: int __db_ret __P((DBC *,
+ * PUBLIC: PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
+ */
+int
+__db_ret(dbc, h, indx, dbt, memp, memsize)
+	DBC *dbc;
+	PAGE *h;
+	u_int32_t indx;
+	DBT *dbt;
+	void **memp;
+	u_int32_t *memsize;
+{
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+	DB *dbp;
+	HOFFPAGE ho;
+	u_int32_t len;
+	u_int8_t *hk;
+	void *data;
+
+	dbp = dbc->dbp;
+
+	/* Locate the item's data and length, by page type. */
+	switch (TYPE(h)) {
+	case P_HASH_UNSORTED:
+	case P_HASH:
+		hk = P_ENTRY(dbp, h, indx);
+		/* Off-page items are fetched through the overflow chain. */
+		if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+			memcpy(&ho, hk, sizeof(HOFFPAGE));
+			return (__db_goff(dbc, dbt,
+			    ho.tlen, ho.pgno, memp, memsize));
+		}
+		len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
+		data = HKEYDATA_DATA(hk);
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+	case P_LRECNO:
+		bk = GET_BKEYDATA(dbp, h, indx);
+		/* Overflow items are fetched through the overflow chain. */
+		if (B_TYPE(bk->type) == B_OVERFLOW) {
+			bo = (BOVERFLOW *)bk;
+			return (__db_goff(dbc, dbt,
+			    bo->tlen, bo->pgno, memp, memsize));
+		}
+		len = bk->len;
+		data = bk->data;
+		break;
+	default:
+		return (__db_pgfmt(dbp->env, h->pgno));
+	}
+
+	/* Copy the on-page item out to the user's DBT. */
+	return (__db_retcopy(dbp->env, dbt, data, len, memp, memsize));
+}
+
+/*
+ * __db_retcopy --
+ * Copy the returned data into the user's DBT, handling special flags.
+ *
+ * PUBLIC: int __db_retcopy __P((ENV *, DBT *,
+ * PUBLIC: void *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__db_retcopy(env, dbt, data, len, memp, memsize)
+	ENV *env;
+	DBT *dbt;
+	void *data;
+	u_int32_t len;
+	void **memp;
+	u_int32_t *memsize;
+{
+	int ret;
+
+	ret = 0;
+
+	/*
+	 * If returning a partial record, reset the length.  A doff past the
+	 * end of the record yields a zero-length result.
+	 */
+	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
+		data = (u_int8_t *)data + dbt->doff;
+		if (len > dbt->doff) {
+			len -= dbt->doff;
+			if (len > dbt->dlen)
+				len = dbt->dlen;
+		} else
+			len = 0;
+	}
+
+	/*
+	 * Allocate memory to be owned by the application: DB_DBT_MALLOC,
+	 * DB_DBT_REALLOC.
+	 *
+	 * !!!
+	 * We always allocate memory, even if we're copying out 0 bytes. This
+	 * guarantees consistency, i.e., the application can always free memory
+	 * without concern as to how many bytes of the record were requested.
+	 *
+	 * Use the memory specified by the application: DB_DBT_USERMEM.
+	 *
+	 * !!!
+	 * If the length we're going to copy is 0, the application-supplied
+	 * memory pointer is allowed to be NULL.
+	 */
+	if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
+		/* Hand the bytes to the application's copy callback. */
+		dbt->size = len;
+		return (len == 0 ? 0 : env->dbt_usercopy(dbt, 0, data,
+		    len, DB_USERCOPY_SETDATA));
+
+	} else if (F_ISSET(dbt, DB_DBT_MALLOC))
+		ret = __os_umalloc(env, len, &dbt->data);
+	else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
+		if (dbt->data == NULL || dbt->size == 0 || dbt->size < len)
+			ret = __os_urealloc(env, len, &dbt->data);
+	} else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
+		/* Caller's buffer too small: report DB_BUFFER_SMALL. */
+		if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
+			ret = DB_BUFFER_SMALL;
+	} else if (memp == NULL || memsize == NULL)
+		ret = EINVAL;
+	else {
+		/* Default: use (and grow) the DB-owned scratch buffer. */
+		if (len != 0 && (*memsize == 0 || *memsize < len)) {
+			if ((ret = __os_realloc(env, len, memp)) == 0)
+				*memsize = len;
+			else
+				*memsize = 0;
+		}
+		if (ret == 0)
+			dbt->data = *memp;
+	}
+
+	if (ret == 0 && len != 0)
+		memcpy(dbt->data, data, len);
+
+	/*
+	 * Return the length of the returned record in the DBT size field.
+	 * This satisfies the requirement that if we're using user memory
+	 * and insufficient memory was provided, return the amount necessary
+	 * in the size field.
+	 */
+	dbt->size = len;
+
+	return (ret);
+}
diff --git a/db/db_setid.c b/db/db_setid.c
new file mode 100644
index 0000000..a78977e
--- /dev/null
+++ b/db/db_setid.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+
+/*
+ * __env_fileid_reset_pp --
+ * ENV->fileid_reset pre/post processing.
+ *
+ * PUBLIC: int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_fileid_reset_pp(dbenv, name, flags)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->fileid_reset");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.  DB_ENCRYPT is the only legal flag.
+	 */
+	if (flags != 0 && flags != DB_ENCRYPT)
+		return (__db_ferr(env, "DB_ENV->fileid_reset", 0));
+
+	ENV_ENTER(env, ip);
+	REPLICATION_WRAP(env,
+	    (__env_fileid_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+	    1, ret);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __env_fileid_reset --
+ * Reset the file IDs for every database in the file.
+ * PUBLIC: int __env_fileid_reset
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, int));
+ */
+int
+__env_fileid_reset(env, ip, name, encrypted)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	const char *name;
+	int encrypted;
+{
+	DB *dbp;
+	DBC *dbcp;
+	DBMETA *meta;
+	DBT key, data;
+	DB_FH *fhp;
+	DB_MPOOLFILE *mpf;
+	DB_PGINFO cookie;
+	db_pgno_t pgno;
+	int t_ret, ret;
+	size_t n;
+	char *real_name;
+	u_int8_t fileid[DB_FILE_ID_LEN], mbuf[DBMETASIZE];
+	void *pagep;
+
+	dbp = NULL;
+	dbcp = NULL;
+	fhp = NULL;
+	real_name = NULL;
+
+	/* Get the real backing file name. */
+	if ((ret = __db_appname(env,
+	    DB_APP_DATA, name, NULL, &real_name)) != 0)
+		return (ret);
+
+	/* Get a new file ID. */
+	if ((ret = __os_fileid(env, real_name, 1, fileid)) != 0)
+		goto err;
+
+	/*
+	 * The user may have physically copied a file currently open in the
+	 * cache, which means if we open this file through the cache before
+	 * updating the file ID on page 0, we might connect to the file from
+	 * which the copy was made.  So, rewrite page 0 directly through the
+	 * file system first.
+	 */
+	if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+		__db_err(env, ret, "%s", real_name);
+		goto err;
+	}
+	if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+		goto err;
+
+	/* A file shorter than a meta-data page is not a database. */
+	if (n != sizeof(mbuf)) {
+		ret = EINVAL;
+		__db_errx(env,
+		    "__env_fileid_reset: %s: unexpected file type or format",
+		    real_name);
+		goto err;
+	}
+
+	/*
+	 * Create the DB object.
+	 */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		goto err;
+
+	/* If configured with a password, the databases are encrypted. */
+	if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+		goto err;
+
+	/* Verify/decrypt the meta-data page in mbuf. */
+	if ((ret = __db_meta_setup(env,
+	    dbp, real_name, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0)
+		goto err;
+
+	/* Partitioned databases keep their data in separate files; recurse. */
+	meta = (DBMETA *)mbuf;
+	if (FLD_ISSET(meta->metaflags,
+	    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && (ret =
+	    __part_fileid_reset(env, ip, name, meta->nparts, encrypted)) != 0)
+		goto err;
+
+	/*
+	 * Stamp the new ID and write the page back out.  The DBT carries
+	 * the DB_PGINFO cookie __db_pgout needs to checksum/encrypt.
+	 */
+	memcpy(meta->uid, fileid, DB_FILE_ID_LEN);
+	cookie.db_pagesize = sizeof(mbuf);
+	cookie.flags = dbp->flags;
+	cookie.type = dbp->type;
+	key.data = &cookie;
+
+	if ((ret = __db_pgout(env->dbenv, 0, mbuf, &key)) != 0)
+		goto err;
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+		goto err;
+	if ((ret = __os_write(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+		goto err;
+	if ((ret = __os_fsync(env, fhp)) != 0)
+		goto err;
+
+	/*
+	 * Page 0 of the file has an updated file ID, and we can open it in
+	 * the cache without connecting to a different, existing file.  Open
+	 * the file in the cache, and update the file IDs for subdatabases.
+	 * (No existing code, as far as I know, actually uses the file ID of
+	 * a subdatabase, but it's cleaner to get them all.)
+	 */
+
+	/*
+	 * If the database file doesn't support subdatabases, we only have
+	 * to update a single metadata page.  Otherwise, we have to open a
+	 * cursor and step through the master database, and update all of
+	 * the subdatabases' metadata pages.
+	 */
+	if (meta->type != P_BTREEMETA || !F_ISSET(meta, BTM_SUBDB))
+		goto err;
+
+	/*
+	 * Open the DB file.
+	 *
+	 * !!!
+	 * Note DB_RDWRMASTER flag, we need to open the master database file
+	 * for writing in this case.
+	 */
+	if ((ret = __db_open(dbp, ip, NULL,
+	    name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	mpf = dbp->mpf;
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	if ((ret = __db_cursor(dbp, ip, NULL, &dbcp, 0)) != 0)
+		goto err;
+	while ((ret = __dbc_get(dbcp, &key, &data, DB_NEXT)) == 0) {
+		/*
+		 * XXX
+		 * We're handling actual data, not on-page meta-data, so it
+		 * hasn't been converted to/from opposite endian architectures.
+		 * Do it explicitly, now.
+		 */
+		memcpy(&pgno, data.data, sizeof(db_pgno_t));
+		DB_NTOHL_SWAP(env, &pgno);
+		if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+		    DB_MPOOL_DIRTY, &pagep)) != 0)
+			goto err;
+		memcpy(((DBMETA *)pagep)->uid, fileid, DB_FILE_ID_LEN);
+		if ((ret = __memp_fput(mpf, ip, pagep, dbcp->priority)) != 0)
+			goto err;
+	}
+	/* DB_NOTFOUND just means we walked off the end of the master DB. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+err:	if (dbcp != NULL && (t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
+		ret = t_ret;
+	if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (fhp != NULL &&
+	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+		ret = t_ret;
+	if (real_name != NULL)
+		__os_free(env, real_name);
+
+	return (ret);
+}
diff --git a/db/db_setlsn.c b/db/db_setlsn.c
new file mode 100644
index 0000000..51ee7d3
--- /dev/null
+++ b/db/db_setlsn.c
@@ -0,0 +1,137 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/qam.h"
+
+static int __env_lsn_reset __P((ENV *, DB_THREAD_INFO *, const char *, int));
+
+/*
+ * __env_lsn_reset_pp --
+ *	ENV->lsn_reset pre/post processing.
+ *
+ * "name" is the physical database file whose page LSNs will be cleared;
+ * DB_ENCRYPT is the only legal flag.  The real work happens in the static
+ * __env_lsn_reset, wrapped for replication re-entry handling.
+ *
+ * PUBLIC: int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__env_lsn_reset_pp(dbenv, name, flags)
+	DB_ENV *dbenv;
+	const char *name;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbenv->env;
+
+	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->lsn_reset");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline, outside of
+	 * the replication block.
+	 */
+	if (flags != 0 && flags != DB_ENCRYPT)
+		return (__db_ferr(env, "DB_ENV->lsn_reset", 0));
+
+	ENV_ENTER(env, ip);
+	REPLICATION_WRAP(env,
+	    (__env_lsn_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
+	    1, ret);
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __env_lsn_reset --
+ *	Reset the LSNs for every page in the file.
+ *
+ * Opens "name" as a database of unknown type, walks its mpool pages via
+ * __db_lsn_reset, then does extra per-partition or per-extent passes for
+ * partitioned and queue databases, whose data live in additional files.
+ * "encrypted" is non-zero if the file must be opened with DB_ENCRYPT.
+ */
+static int
+__env_lsn_reset(env, ip, name, encrypted)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	const char *name;
+	int encrypted;
+{
+	DB *dbp;
+	int t_ret, ret;
+
+	/* Create the DB object. */
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		return (ret);
+
+	/* If configured with a password, the databases are encrypted. */
+	if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
+		goto err;
+
+	/*
+	 * Open the DB file.
+	 *
+	 * !!!
+	 * Note DB_RDWRMASTER flag, we need to open the master database file
+	 * for writing in this case.
+	 */
+	if ((ret = __db_open(dbp, ip, NULL,
+	    name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0) {
+		__db_err(env, ret, "%s", name);
+		goto err;
+	}
+
+	ret = __db_lsn_reset(dbp->mpf, ip);
+#ifdef HAVE_PARTITION
+	if (ret == 0 && DB_IS_PARTITIONED(dbp))
+		ret = __part_lsn_reset(dbp, ip);
+	else
+#endif
+	if (ret == 0 && dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+		ret = __qam_lsn_reset(dbp, ip);
+#else
+		ret = __db_no_queue_am(env);
+#endif
+
+	/* Close the handle in all cases; preserve the first error seen. */
+err:	if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_lsn_reset -- reset the lsn for a db mpool handle.
+ *
+ * Fetches every page of the file in order, starting from page 0, marks it
+ * dirty and stamps a "not logged" LSN on it.  Iteration stops when
+ * __memp_fget returns DB_PAGE_NOTFOUND (normal end of file, mapped to
+ * success) or on any other error.
+ *
+ * PUBLIC: int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *));
+ */
+int
+__db_lsn_reset(mpf, ip)
+	DB_MPOOLFILE *mpf;
+	DB_THREAD_INFO *ip;
+{
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int ret;
+
+	/* Reset the LSN on every page of the database file. */
+	for (pgno = 0;
+	    (ret = __memp_fget(mpf,
+	    &pgno, ip, NULL, DB_MPOOL_DIRTY, &pagep)) == 0;
+	    ++pgno) {
+		LSN_NOT_LOGGED(pagep->lsn);
+		if ((ret = __memp_fput(mpf,
+		    ip, pagep, DB_PRIORITY_UNCHANGED)) != 0)
+			break;
+	}
+
+	/* DB_PAGE_NOTFOUND simply means we walked off the end of the file. */
+	if (ret == DB_PAGE_NOTFOUND)
+		ret = 0;
+
+	return (ret);
+}
diff --git a/db/db_sort_multiple.c b/db/db_sort_multiple.c
new file mode 100644
index 0000000..32ae2df
--- /dev/null
+++ b/db/db_sort_multiple.c
@@ -0,0 +1,287 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_quicksort __P((DB *, DBT *, DBT *, u_int32_t *, u_int32_t *,
+ u_int32_t *, u_int32_t *, u_int32_t));
+
+/*
+ * __db_compare_both --
+ *	Use the comparison functions from db to compare akey and bkey, and if
+ *	DB_DUPSORT adata and bdata.
+ *
+ * Returns <0, 0 or >0 per the usual comparator convention.  A NULL data
+ * DBT sorts before a non-NULL one; two NULL data DBTs compare equal.
+ *
+ * PUBLIC: int __db_compare_both __P((DB *, const DBT *, const DBT *,
+ * PUBLIC:	 const DBT *, const DBT *));
+ */
+int
+__db_compare_both(db, akey, adata, bkey, bdata)
+	DB *db;
+	const DBT *akey;
+	const DBT *adata;
+	const DBT *bkey;
+	const DBT *bdata;
+{
+	BTREE *t;
+	int cmp;
+
+	t = (BTREE *)db->bt_internal;
+
+	/* Keys decide first; data only breaks ties for DB_DUPSORT handles. */
+	cmp = t->bt_compare(db, akey, bkey);
+	if (cmp != 0) return cmp;
+	if (!F_ISSET(db, DB_AM_DUPSORT)) return 0;
+
+	if (adata == 0) return bdata == 0 ? 0 : -1;
+	if (bdata == 0) return 1;
+
+#ifdef HAVE_COMPRESSION
+	/* Compressed handles keep their own duplicate comparator. */
+	if (DB_IS_COMPRESSED(db))
+		return t->compress_dup_compare(db, adata, bdata);
+#endif
+	return db->dup_compare(db, adata, bdata);
+}
+
+/*
+ * Helper macros for __db_quicksort.  They operate on the offset/length
+ * pairs of a DB_MULTIPLE(_KEY) result buffer: ptr[0] is a data offset,
+ * ptr[-1] the corresponding length.  NOTE: they deliberately rely on the
+ * variables `tmp', `key' and `data' being in scope at the point of use
+ * (they are locals/parameters of __db_quicksort).
+ */
+
+/* Swap the offset/length pairs at a/b (and ad/bd when data is present). */
+#define DB_SORT_SWAP(a, ad, b, bd) \
+do { \
+	tmp = (a)[0]; (a)[0] = (b)[0]; (b)[0] = tmp; \
+	tmp = (a)[-1]; (a)[-1] = (b)[-1]; (b)[-1] = tmp; \
+	if (data != NULL) { \
+		tmp = (ad)[0]; (ad)[0] = (bd)[0]; (bd)[0] = tmp; \
+		tmp = (ad)[-1]; (ad)[-1] = (bd)[-1]; (bd)[-1] = tmp; \
+	} \
+} while (0)
+
+/* Point DBT a (and ad) at the key (and data) item described by aptr/adptr. */
+#define DB_SORT_LOAD_DBT(a, ad, aptr, adptr) \
+do { \
+	(a).data = (u_int8_t*)key->data + (aptr)[0]; \
+	(a).size = (aptr)[-1]; \
+	if (data != NULL) { \
+		(ad).data = (u_int8_t*)data->data + (adptr)[0]; \
+		(ad).size = (adptr)[-1]; \
+	} \
+} while (0)
+
+/* Compare two items, including data portions only when data is present. */
+#define DB_SORT_COMPARE(a, ad, b, bd) (data != NULL ? \
+	__db_compare_both(db, &(a), &(ad), &(b), &(bd)) : \
+	__db_compare_both(db, &(a), 0, &(b), 0))
+
+/* Initial (stack-allocated) depth of the explicit quicksort stack. */
+#define DB_SORT_STACKSIZE 32
+
+/*
+ * __db_quicksort --
+ *	The quicksort implementation for __db_sort_multiple() and
+ *	__db_sort_multiple_key().
+ *
+ * kstart/kend (and dstart/dend for separate data) delimit the offset/length
+ * index at the END of a DB_MULTIPLE(_KEY) buffer; note the index grows
+ * downward, so "end" is a SMALLER address than "start" and the walk uses
+ * pointer decrements of `size' u_int32_t slots per item (2 for DB_MULTIPLE,
+ * 4 for DB_MULTIPLE_KEY).  Recursion is simulated with an explicit stack
+ * that starts on the C stack and is moved to the heap if it overflows.
+ */
+static int
+__db_quicksort(db, key, data, kstart, kend, dstart, dend, size)
+	DB *db;
+	DBT *key, *data;
+	u_int32_t *kstart, *kend, *dstart, *dend;
+	u_int32_t size;
+{
+	int ret;
+	u_int32_t tmp;
+	u_int32_t *kmiddle, *dmiddle, *kptr, *dptr;
+	DBT a, ad, b, bd, m, md;
+	ENV *env;
+
+	struct DB_SORT_quicksort_stack {
+		u_int32_t *kstart;
+		u_int32_t *kend;
+		u_int32_t *dstart;
+		u_int32_t *dend;
+	} stackbuf[DB_SORT_STACKSIZE], *stack;
+	u_int32_t soff, slen;
+
+	ret = 0;
+	env = db->env;
+
+	memset(&a, 0, sizeof(DBT));
+	memset(&ad, 0, sizeof(DBT));
+	memset(&b, 0, sizeof(DBT));
+	memset(&bd, 0, sizeof(DBT));
+	memset(&m, 0, sizeof(DBT));
+	memset(&md, 0, sizeof(DBT));
+
+	/* NB end is smaller than start */
+
+	stack = stackbuf;
+	soff = 0;
+	slen = DB_SORT_STACKSIZE;
+
+ start:
+	/* Empty range: nothing to sort in this partition. */
+	if (kend >= kstart) goto pop;
+
+	/* If there's only one value, it's already sorted */
+	tmp = (u_int32_t)(kstart - kend) / size;
+	if (tmp == 1) goto pop;
+
+	DB_SORT_LOAD_DBT(a, ad, kstart, dstart);
+	DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+
+	if (tmp == 2) {
+		/* Special case the sorting of two value sequences */
+		if (DB_SORT_COMPARE(a, ad, b, bd) > 0) {
+			DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+		}
+		goto pop;
+	}
+
+	kmiddle = kstart - (tmp / 2) * size;
+	dmiddle = dstart - (tmp / 2) * size;
+	DB_SORT_LOAD_DBT(m, md, kmiddle, dmiddle);
+
+	/*
+	 * Find the median of three: move it to the "end" slot (kend + size)
+	 * so it becomes the partition pivot below.
+	 */
+	if (DB_SORT_COMPARE(a, ad, b, bd) < 0) {
+		if (DB_SORT_COMPARE(m, md, a, ad) < 0) {
+			/* m < a < b */
+			DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+		} else if (DB_SORT_COMPARE(m, md, b, bd) < 0) {
+			/* a < m < b */
+			DB_SORT_SWAP(kmiddle,
+			    dmiddle, kend + size, dend + size);
+		} else {
+			/* a < b < m */
+			/* Do nothing */
+		}
+	} else {
+		if (DB_SORT_COMPARE(a, ad, m, md) < 0) {
+			/* b < a < m */
+			DB_SORT_SWAP(kstart, dstart, kend + size, dend + size);
+		} else if (DB_SORT_COMPARE(b, bd, m, md) < 0) {
+			/* b < m < a */
+			DB_SORT_SWAP(kmiddle,
+			    dmiddle, kend + size, dend + size);
+		} else {
+			/* m < b < a */
+			/* Do nothing */
+		}
+	}
+
+	/* partition */
+	DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size);
+	kmiddle = kstart;
+	dmiddle = dstart;
+	for (kptr = kstart, dptr = dstart; kptr > kend;
+	    kptr -= size, dptr -= size) {
+		DB_SORT_LOAD_DBT(a, ad, kptr, dptr);
+		if (DB_SORT_COMPARE(a, ad, b, bd) < 0) {
+			DB_SORT_SWAP(kmiddle, dmiddle, kptr, dptr);
+			kmiddle -= size;
+			dmiddle -= size;
+		}
+	}
+
+	/* Put the pivot into its final position. */
+	DB_SORT_SWAP(kmiddle, dmiddle, kend + size, dend + size);
+
+	if (soff == slen) {
+		/* Grow the stack */
+		slen = slen * 2;
+		if (stack == stackbuf) {
+			/* First overflow: migrate from C stack to heap. */
+			ret = __os_malloc(env, slen *
+			    sizeof(struct DB_SORT_quicksort_stack), &stack);
+			if (ret != 0) goto error;
+			memcpy(stack, stackbuf, soff *
+			    sizeof(struct DB_SORT_quicksort_stack));
+		} else {
+			ret = __os_realloc(env, slen *
+			    sizeof(struct DB_SORT_quicksort_stack), &stack);
+			if (ret != 0) goto error;
+		}
+	}
+
+	/* divide and conquer */
+	stack[soff].kstart = kmiddle - size;
+	stack[soff].kend = kend;
+	stack[soff].dstart = dmiddle - size;
+	stack[soff].dend = dend;
+	++soff;
+
+	/* Loop on the upper partition; the lower one was pushed above. */
+	kend = kmiddle;
+	dend = dmiddle;
+
+	goto start;
+
+ pop:
+	if (soff != 0) {
+		--soff;
+		kstart = stack[soff].kstart;
+		kend = stack[soff].kend;
+		dstart = stack[soff].dstart;
+		dend = stack[soff].dend;
+		goto start;
+	}
+
+	/* Stack exhausted: fall through with ret == 0 on the success path. */
+ error:
+	if (stack != stackbuf)
+		__os_free(env, stack);
+
+	return ret;
+}
+
+#undef DB_SORT_SWAP
+#undef DB_SORT_LOAD_DBT
+
+/*
+ * __db_sort_multiple --
+ *	If flags == DB_MULTIPLE_KEY, sorts a DB_MULTIPLE_KEY format DBT using
+ *	the BTree comparison function and duplicate comparison function.
+ *
+ *	If flags == DB_MULTIPLE, sorts one or two DB_MULTIPLE format DBTs using
+ *	the BTree comparison function and duplicate comparison function. Will
+ *	assume key and data specifies pairs of key/data to sort together. If
+ *	data is NULL, will just sort key according to the btree comparison
+ *	function.
+ *
+ *	Uses an in-place quicksort algorithm, with median of three for the pivot
+ *	point.
+ *
+ * The offset/length index sits at the end of the buffer (key->ulen bytes
+ * in) and grows downward, terminated by a (u_int32_t)-1 sentinel; items are
+ * 2 u_int32_t wide for DB_MULTIPLE and 4 wide for DB_MULTIPLE_KEY.
+ *
+ * PUBLIC: int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_sort_multiple(db, key, data, flags)
+	DB *db;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	u_int32_t *kstart, *kend, *dstart, *dend;
+
+	/* TODO: sanity checks on the DBTs */
+	/* DB_ILLEGAL_METHOD(db, DB_OK_BTREE); */
+
+	/* Last u_int32_t slot of the buffer: top of the downward index. */
+	kstart = (u_int32_t*)((u_int8_t *)key->data + key->ulen) - 1;
+
+	switch (flags) {
+	case DB_MULTIPLE:
+		if (data != NULL)
+			dstart = (u_int32_t*)((u_int8_t *)data->data +
+			    data->ulen) - 1;
+		else
+			dstart = kstart;
+
+		/* Find the end */
+		for (kend = kstart, dend = dstart;
+		    *kend != (u_int32_t)-1 && *dend != (u_int32_t)-1;
+		    kend -= 2, dend -= 2)
+			;
+
+		return (__db_quicksort(db, key, data, kstart, kend, dstart,
+		    dend, 2));
+	case DB_MULTIPLE_KEY:
+		/* Find the end */
+		for (kend = kstart; *kend != (u_int32_t)-1; kend -= 4)
+			;
+
+		/* Key and data interleave in one buffer, data 2 slots below. */
+		return (__db_quicksort(db, key, key, kstart, kend, kstart - 2,
+		    kend - 2, 4));
+	default:
+		return (__db_ferr(db->env, "DB->sort_multiple", 0));
+	}
+}
diff --git a/db/db_stati.c b/db/db_stati.c
new file mode 100644
index 0000000..b8d3a3f
--- /dev/null
+++ b/db/db_stati.c
@@ -0,0 +1,494 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+static int __db_print_all __P((DB *, u_int32_t));
+static int __db_print_citem __P((DBC *));
+static int __db_print_cursor __P((DB *));
+static int __db_print_stats __P((DB *, DB_THREAD_INFO *, u_int32_t));
+static int __db_stat __P((DB *, DB_THREAD_INFO *, DB_TXN *, void *, u_int32_t));
+static int __db_stat_arg __P((DB *, u_int32_t));
+
+/*
+ * __db_stat_pp --
+ *	DB->stat pre/post processing.
+ *
+ * spp receives a pointer to an access-method-specific statistics structure
+ * allocated by the underlying stat call.
+ *
+ * PUBLIC: int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t));
+ */
+int
+__db_stat_pp(dbp, txn, spp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	void *spp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat");
+
+	if ((ret = __db_stat_arg(dbp, flags)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0,
+	    txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_stat(dbp, ip, txn, spp, flags);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_stat --
+ *	DB->stat.
+ *
+ * Acquires a cursor (passing through the isolation flags), then dispatches
+ * to the partition handler or the access method's stat routine.
+ */
+static int
+__db_stat(dbp, ip, txn, spp, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	void *spp;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	/* Acquire a cursor. */
+	if ((ret = __db_cursor(dbp, ip, txn,
+	    &dbc, LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, NULL, "DB->stat", NULL, NULL, flags);
+	/* Isolation flags were consumed by the cursor; drop them here. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __partition_stat(dbc, spp, flags);
+	else
+#endif
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_stat(dbc, spp, flags);
+		break;
+	case DB_HASH:
+		ret = __ham_stat(dbc, spp, flags);
+		break;
+	case DB_QUEUE:
+		ret = __qam_stat(dbc, spp, flags);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = (__db_unknown_type(env, "DB->stat", dbp->type));
+		break;
+	}
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_stat_arg --
+ *	Check DB->stat arguments.
+ *
+ * Isolation flags are legal and stripped before checking; the remainder
+ * must be 0 or DB_FAST_STAT.
+ */
+static int
+__db_stat_arg(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	ENV *env;
+
+	env = dbp->env;
+
+	/* Check for invalid function flags. */
+	LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED);
+	switch (flags) {
+	case 0:
+	case DB_FAST_STAT:
+		break;
+	default:
+		return (__db_ferr(env, "DB->stat", 0));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_stat_print_pp --
+ *	DB->stat_print pre/post processing.
+ *
+ * Legal flags are DB_FAST_STAT and DB_STAT_ALL; output goes through the
+ * environment's message channel.
+ *
+ * PUBLIC: int __db_stat_print_pp __P((DB *, u_int32_t));
+ */
+int
+__db_stat_print_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat_print");
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline.
+	 */
+	if ((ret = __db_fchk(env,
+	    "DB->stat_print", flags, DB_FAST_STAT | DB_STAT_ALL)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	ret = __db_stat_print(dbp, ip, flags);
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_stat_print --
+ *	DB->stat_print.
+ *
+ * Prints a timestamp, optionally the debugging handle dump (DB_STAT_ALL),
+ * then the access-method statistics.
+ *
+ * PUBLIC: int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t));
+ */
+int
+__db_stat_print(dbp, ip, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	u_int32_t flags;
+{
+	time_t now;
+	int ret;
+	char time_buf[CTIME_BUFLEN];
+
+	(void)time(&now);
+	__db_msg(dbp->env, "%.24s\tLocal time", __os_ctime(&now, time_buf));
+
+	if (LF_ISSET(DB_STAT_ALL) && (ret = __db_print_all(dbp, flags)) != 0)
+		return (ret);
+
+	if ((ret = __db_print_stats(dbp, ip, flags)) != 0)
+		return (ret);
+
+	return (0);
+}
+
+/*
+ * __db_print_stats --
+ *	Display default DB handle statistics.
+ *
+ * Opens a throwaway cursor and dispatches to the access method's
+ * stat_print routine.
+ */
+static int
+__db_print_stats(dbp, ip, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	/* Acquire a cursor. */
+	if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, NULL, "DB->stat_print", NULL, NULL, 0);
+
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_stat_print(dbc, flags);
+		break;
+	case DB_HASH:
+		ret = __ham_stat_print(dbc, flags);
+		break;
+	case DB_QUEUE:
+		ret = __qam_stat_print(dbc, flags);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = (__db_unknown_type(env, "DB->stat_print", dbp->type));
+		break;
+	}
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_print_all --
+ *	Display debugging DB handle statistics.
+ *
+ * Dumps the raw fields of the DB handle (callbacks set, file/database
+ * names, lockers, flags, etc.) followed by the cursor queues.  Intended
+ * for DB_STAT_ALL debugging output, not for end users.
+ */
+static int
+__db_print_all(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	/* Name table used to render dbp->flags symbolically. */
+	static const FN fn[] = {
+		{ DB_AM_CHKSUM,			"DB_AM_CHKSUM" },
+		{ DB_AM_COMPENSATE,		"DB_AM_COMPENSATE" },
+		{ DB_AM_CREATED,		"DB_AM_CREATED" },
+		{ DB_AM_CREATED_MSTR,		"DB_AM_CREATED_MSTR" },
+		{ DB_AM_DBM_ERROR,		"DB_AM_DBM_ERROR" },
+		{ DB_AM_DELIMITER,		"DB_AM_DELIMITER" },
+		{ DB_AM_DISCARD,		"DB_AM_DISCARD" },
+		{ DB_AM_DUP,			"DB_AM_DUP" },
+		{ DB_AM_DUPSORT,		"DB_AM_DUPSORT" },
+		{ DB_AM_ENCRYPT,		"DB_AM_ENCRYPT" },
+		{ DB_AM_FIXEDLEN,		"DB_AM_FIXEDLEN" },
+		{ DB_AM_INMEM,			"DB_AM_INMEM" },
+		{ DB_AM_IN_RENAME,		"DB_AM_IN_RENAME" },
+		{ DB_AM_NOT_DURABLE,		"DB_AM_NOT_DURABLE" },
+		{ DB_AM_OPEN_CALLED,		"DB_AM_OPEN_CALLED" },
+		{ DB_AM_PAD,			"DB_AM_PAD" },
+		{ DB_AM_PGDEF,			"DB_AM_PGDEF" },
+		{ DB_AM_RDONLY,			"DB_AM_RDONLY" },
+		{ DB_AM_READ_UNCOMMITTED,	"DB_AM_READ_UNCOMMITTED" },
+		{ DB_AM_RECNUM,			"DB_AM_RECNUM" },
+		{ DB_AM_RECOVER,		"DB_AM_RECOVER" },
+		{ DB_AM_RENUMBER,		"DB_AM_RENUMBER" },
+		{ DB_AM_REVSPLITOFF,		"DB_AM_REVSPLITOFF" },
+		{ DB_AM_SECONDARY,		"DB_AM_SECONDARY" },
+		{ DB_AM_SNAPSHOT,		"DB_AM_SNAPSHOT" },
+		{ DB_AM_SUBDB,			"DB_AM_SUBDB" },
+		{ DB_AM_SWAP,			"DB_AM_SWAP" },
+		{ DB_AM_TXN,			"DB_AM_TXN" },
+		{ DB_AM_VERIFYING,		"DB_AM_VERIFYING" },
+		{ 0,				NULL }
+	};
+	ENV *env;
+	char time_buf[CTIME_BUFLEN];
+
+	env = dbp->env;
+
+	__db_msg(env, "%s", DB_GLOBAL(db_line));
+	__db_msg(env, "DB handle information:");
+	STAT_ULONG("Page size", dbp->pgsize);
+	STAT_ISSET("Append recno", dbp->db_append_recno);
+	STAT_ISSET("Feedback", dbp->db_feedback);
+	STAT_ISSET("Dup compare", dbp->dup_compare);
+	STAT_ISSET("App private", dbp->app_private);
+	STAT_ISSET("DbEnv", dbp->env);
+	STAT_STRING("Type", __db_dbtype_to_string(dbp->type));
+
+	__mutex_print_debug_single(env, "Thread mutex", dbp->mutex, flags);
+
+	STAT_STRING("File", dbp->fname);
+	STAT_STRING("Database", dbp->dname);
+	STAT_HEX("Open flags", dbp->open_flags);
+
+	__db_print_fileid(env, dbp->fileid, "\tFile ID");
+
+	STAT_ULONG("Cursor adjust ID", dbp->adj_fileid);
+	STAT_ULONG("Meta pgno", dbp->meta_pgno);
+	if (dbp->locker != NULL)
+		STAT_ULONG("Locker ID", dbp->locker->id);
+	if (dbp->cur_locker != NULL)
+		STAT_ULONG("Handle lock", dbp->cur_locker->id);
+	if (dbp->associate_locker != NULL)
+		STAT_ULONG("Associate lock", dbp->associate_locker->id);
+	STAT_ULONG("RPC remote ID", dbp->cl_id);
+
+	__db_msg(env,
+	    "%.24s\tReplication handle timestamp",
+	    dbp->timestamp == 0 ? "0" : __os_ctime(&dbp->timestamp, time_buf));
+
+	STAT_ISSET("Secondary callback", dbp->s_callback);
+	STAT_ISSET("Primary handle", dbp->s_primary);
+
+	STAT_ISSET("api internal", dbp->api_internal);
+	STAT_ISSET("Btree/Recno internal", dbp->bt_internal);
+	STAT_ISSET("Hash internal", dbp->h_internal);
+	STAT_ISSET("Queue internal", dbp->q_internal);
+
+	__db_prflags(env, NULL, dbp->flags, fn, NULL, "\tFlags");
+
+	if (dbp->log_filename == NULL)
+		STAT_ISSET("File naming information", dbp->log_filename);
+	else
+		__dbreg_print_fname(env, dbp->log_filename);
+
+	(void)__db_print_cursor(dbp);
+
+	return (0);
+}
+
+/*
+ * __db_print_cursor --
+ *	Display the cursor active and free queues.
+ *
+ * Walks the handle's active, join and free cursor queues under the handle
+ * mutex, printing each cursor; returns the first per-cursor error seen.
+ */
+static int
+__db_print_cursor(dbp)
+	DB *dbp;
+{
+	DBC *dbc;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	__db_msg(env, "%s", DB_GLOBAL(db_line));
+	__db_msg(env, "DB handle cursors:");
+
+	ret = 0;
+	MUTEX_LOCK(dbp->env, dbp->mutex);
+	__db_msg(env, "Active queue:");
+	TAILQ_FOREACH(dbc, &dbp->active_queue, links)
+		if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	__db_msg(env, "Join queue:");
+	TAILQ_FOREACH(dbc, &dbp->join_queue, links)
+		if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	__db_msg(env, "Free queue:");
+	TAILQ_FOREACH(dbc, &dbp->free_queue, links)
+		if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	MUTEX_UNLOCK(dbp->env, dbp->mutex);
+
+	return (ret);
+}
+
+/*
+ * __db_print_citem --
+ *	Print one cursor's debugging state: identifying pointers, locker
+ *	information, current position and flags, plus any access-method
+ *	specific detail.  Always returns 0.
+ */
+static int
+__db_print_citem(dbc)
+	DBC *dbc;
+{
+	/* Name table used to render dbc->flags symbolically. */
+	static const FN fn[] = {
+		{ DBC_ACTIVE,		"DBC_ACTIVE" },
+		{ DBC_DONTLOCK,		"DBC_DONTLOCK" },
+		{ DBC_MULTIPLE,		"DBC_MULTIPLE" },
+		{ DBC_MULTIPLE_KEY,	"DBC_MULTIPLE_KEY" },
+		{ DBC_OPD,		"DBC_OPD" },
+		{ DBC_OWN_LID,		"DBC_OWN_LID" },
+		{ DBC_READ_COMMITTED,	"DBC_READ_COMMITTED" },
+		{ DBC_READ_UNCOMMITTED,	"DBC_READ_UNCOMMITTED" },
+		{ DBC_RECOVER,		"DBC_RECOVER" },
+		{ DBC_RMW,		"DBC_RMW" },
+		{ DBC_TRANSIENT,	"DBC_TRANSIENT" },
+		{ DBC_WAS_READ_COMMITTED,"DBC_WAS_READ_COMMITTED" },
+		{ DBC_WRITECURSOR,	"DBC_WRITECURSOR" },
+		{ DBC_WRITER,		"DBC_WRITER" },
+		{ 0,			NULL }
+	};
+	DB *dbp;
+	DBC_INTERNAL *cp;
+	ENV *env;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	cp = dbc->internal;
+
+	STAT_POINTER("DBC", dbc);
+	STAT_POINTER("Associated dbp", dbc->dbp);
+	STAT_POINTER("Associated txn", dbc->txn);
+	STAT_POINTER("Internal", cp);
+	STAT_HEX("Default locker ID", dbc->lref == NULL ? 0 : dbc->lref->id);
+	STAT_HEX("Locker", P_TO_ULONG(dbc->locker));
+	STAT_STRING("Type", __db_dbtype_to_string(dbc->dbtype));
+
+	STAT_POINTER("Off-page duplicate cursor", cp->opd);
+	STAT_POINTER("Referenced page", cp->page);
+	STAT_ULONG("Root", cp->root);
+	STAT_ULONG("Page number", cp->pgno);
+	STAT_ULONG("Page index", cp->indx);
+	STAT_STRING("Lock mode", __db_lockmode_to_string(cp->lock_mode));
+	__db_prflags(env, NULL, dbc->flags, fn, NULL, "\tFlags");
+
+	switch (dbc->dbtype) {
+	case DB_BTREE:
+	case DB_RECNO:
+		__bam_print_cursor(dbc);
+		break;
+	case DB_HASH:
+		__ham_print_cursor(dbc);
+		break;
+	case DB_UNKNOWN:
+		DB_ASSERT(env, dbp->type != DB_UNKNOWN);
+		/* FALLTHROUGH */
+	case DB_QUEUE:
+	default:
+		/* Queue cursors have no extra state worth printing. */
+		break;
+	}
+	return (0);
+}
+
+#else /* !HAVE_STATISTICS */
+
+/*
+ * Stub used when the library is built without HAVE_STATISTICS; always
+ * reports that statistics support was not compiled in.
+ */
+int
+__db_stat_pp(dbp, txn, spp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	void *spp;
+	u_int32_t flags;
+{
+	COMPQUIET(spp, NULL);
+	COMPQUIET(txn, NULL);
+	COMPQUIET(flags, 0);
+
+	return (__db_stat_not_built(dbp->env));
+}
+
+/*
+ * Stub used when the library is built without HAVE_STATISTICS; always
+ * reports that statistics support was not compiled in.
+ */
+int
+__db_stat_print_pp(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	COMPQUIET(flags, 0);
+
+	return (__db_stat_not_built(dbp->env));
+}
+#endif
diff --git a/db/db_truncate.c b/db/db_truncate.c
new file mode 100644
index 0000000..66f4180
--- /dev/null
+++ b/db/db_truncate.c
@@ -0,0 +1,225 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+
+static int __db_cursor_check __P((DB *));
+
+/*
+ * __db_truncate_pp
+ *	DB->truncate pre/post processing.
+ *
+ * Validates arguments (no flags allowed; forbidden on secondaries and on
+ * handles with active cursors), enters the environment, handles the
+ * replication block, enforces read-only checks, creates a local
+ * transaction when the handle is auto-commit, and calls __db_truncate.
+ * *countp receives the number of records discarded from the primary.
+ *
+ * PUBLIC: int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t));
+ */
+int
+__db_truncate_pp(dbp, txn, countp, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	u_int32_t *countp, flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int handle_check, ret, t_ret, txn_local;
+
+	env = dbp->env;
+	handle_check = txn_local = 0;
+
+	STRIP_AUTO_COMMIT(flags);
+
+	/* Check for invalid flags. */
+	if (F_ISSET(dbp, DB_AM_SECONDARY)) {
+		__db_errx(env, "DB->truncate forbidden on secondary indices");
+		return (EINVAL);
+	}
+	if ((ret = __db_fchk(env, "DB->truncate", flags, 0)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+
+	/*
+	 * Make sure there are no active cursors on this db.  Since we drop
+	 * pages we cannot really adjust cursors.
+	 */
+	if ((ret = __db_cursor_check(dbp)) != 0) {
+		__db_errx(env,
+		    "DB->truncate not permitted with active cursors");
+		goto err;
+	}
+
+#ifdef CONFIG_TEST
+	if (IS_REP_MASTER(env))
+		DB_TEST_WAIT(env, env->test_check);
+#endif
+	/* Check for replication block. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/*
+	 * Check for changes to a read-only database.  This must be after the
+	 * replication block so that we cannot race master/client state changes.
+	 */
+	if (DB_IS_READONLY(dbp)) {
+		ret = __db_rdonly(env, "DB->truncate");
+		goto err;
+	}
+
+	/*
+	 * Create local transaction as necessary, check for consistent
+	 * transaction usage.
+	 */
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+			goto err;
+		txn_local = 1;
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	ret = __db_truncate(dbp, ip, txn, countp);
+
+	/* Commit or abort the local transaction based on ret. */
+err:	if (txn_local &&
+	    (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0)
+		ret = t_ret;
+
+	/* Release replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_truncate
+ *	DB->truncate.
+ *
+ * Truncates all secondaries first (their counts are discarded into a
+ * scratch variable), then dispatches to the partition handler or the
+ * primary's access-method truncate routine.  *countp receives the
+ * primary's discarded-record count only.
+ *
+ * PUBLIC: int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC:     u_int32_t *));
+ */
+int
+__db_truncate(dbp, ip, txn, countp)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	u_int32_t *countp;
+{
+	DB *sdbp;
+	DBC *dbc;
+	ENV *env;
+	u_int32_t scount;
+	int ret, t_ret;
+
+	env = dbp->env;
+	dbc = NULL;
+	ret = 0;
+
+	/*
+	 * Run through all secondaries and truncate them first.  The count
+	 * returned is the count of the primary only.  QUEUE uses normal
+	 * processing to truncate so it will update the secondaries normally.
+	 */
+	if (dbp->type != DB_QUEUE && DB_IS_PRIMARY(dbp)) {
+		if ((ret = __db_s_first(dbp, &sdbp)) != 0)
+			return (ret);
+		for (; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, txn))
+			if ((ret = __db_truncate(sdbp, ip, txn, &scount)) != 0)
+				break;
+		if (sdbp != NULL)
+			(void)__db_s_done(sdbp, txn);
+		if (ret != 0)
+			return (ret);
+	}
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL);
+
+	/* Acquire a cursor. */
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+		return (ret);
+
+	DEBUG_LWRITE(dbc, txn, "DB->truncate", NULL, NULL, 0);
+#ifdef HAVE_PARTITION
+	if (DB_IS_PARTITIONED(dbp))
+		ret = __part_truncate(dbc, countp);
+	else
+#endif
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		ret = __bam_truncate(dbc, countp);
+		break;
+	case DB_HASH:
+		ret = __ham_truncate(dbc, countp);
+		break;
+	case DB_QUEUE:
+		ret = __qam_truncate(dbc, countp);
+		break;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_type(env, "DB->truncate", dbp->type);
+		break;
+	}
+
+	/* Discard the cursor. */
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL);
+
+DB_TEST_RECOVERY_LABEL
+
+	return (ret);
+}
+
+/*
+ * __db_cursor_check --
+ *	See if there are any active cursors on this db.
+ *
+ * Walks every DB handle in the environment open on the same underlying
+ * file (matched via adj_fileid) and checks each handle's active cursor
+ * queue.  Returns EINVAL if any initialized cursor is found, 0 otherwise.
+ */
+static int
+__db_cursor_check(dbp)
+	DB *dbp;
+{
+	DB *ldbp;
+	DBC *dbc;
+	ENV *env;
+	int found;
+
+	env = dbp->env;
+
+	MUTEX_LOCK(env, env->mtx_dblist);
+	FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+	for (found = 0;
+	    !found && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+		/*
+		 * NOTE(review): this locks dbp->mutex while iterating
+		 * ldbp's queue — presumably intentional (matches upstream),
+		 * but verify ldbp->mutex isn't required here.
+		 */
+		MUTEX_LOCK(env, dbp->mutex);
+		TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
+			if (IS_INITIALIZED(dbc)) {
+				found = 1;
+				break;
+			}
+		MUTEX_UNLOCK(env, dbp->mutex);
+	}
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+
+	return (found ? EINVAL : 0);
+}
diff --git a/db/db_upg.c b/db/db_upg.c
new file mode 100644
index 0000000..5a6db94
--- /dev/null
+++ b/db/db_upg.c
@@ -0,0 +1,510 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/qam.h"
+
+/*
+ * __db_upgrade_pp --
+ * DB->upgrade pre/post processing.
+ *
+ * PUBLIC: int __db_upgrade_pp __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade_pp(dbp, fname, flags)
+	DB *dbp;
+	const char *fname;
+	u_int32_t flags;
+{
+#ifdef HAVE_UPGRADE_SUPPORT
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	/*
+	 * !!!
+	 * The actual argument checking is simple, do it inline.
+	 */
+	if ((ret = __db_fchk(env, "DB->upgrade", flags, DB_DUPSORT)) != 0)
+		return (ret);
+
+	ENV_ENTER(env, ip);
+	ret = __db_upgrade(dbp, fname, flags);
+	ENV_LEAVE(env, ip);
+	return (ret);
+#else
+	/*
+	 * Report the error before the COMPQUIET() macros run:
+	 * COMPQUIET(dbp, NULL) assigns NULL to dbp, so the previous
+	 * ordering dereferenced a null pointer in the __db_errx() call.
+	 */
+	__db_errx(dbp->env, "upgrade not supported");
+
+	COMPQUIET(dbp, NULL);
+	COMPQUIET(fname, NULL);
+	COMPQUIET(flags, 0);
+	return (EINVAL);
+#endif
+}
+
+#ifdef HAVE_UPGRADE_SUPPORT
+/*
+ * Per-page-type conversion functions for the 3.0 -> 3.1 upgrade pass,
+ * indexed by page type; a NULL slot means pages of that type need no
+ * conversion.
+ */
+static int (* const func_31_list[P_PAGETYPE_MAX])
+    __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+	NULL,			/* P_INVALID */
+	NULL,			/* __P_DUPLICATE */
+	__ham_31_hash,		/* P_HASH_UNSORTED */
+	NULL,			/* P_IBTREE */
+	NULL,			/* P_IRECNO */
+	__bam_31_lbtree,	/* P_LBTREE */
+	NULL,			/* P_LRECNO */
+	NULL,			/* P_OVERFLOW */
+	__ham_31_hashmeta,	/* P_HASHMETA */
+	__bam_31_btreemeta,	/* P_BTREEMETA */
+	NULL,			/* P_QAMMETA */
+	NULL,			/* P_QAMDATA */
+	NULL,			/* P_LDUP */
+	NULL,			/* P_HASH */
+};
+
+/*
+ * Per-page-type conversion functions for the 4.6 upgrade pass (unsorted
+ * hash -> sorted hash), indexed by page type; a NULL slot means pages of
+ * that type need no conversion.
+ */
+static int (* const func_46_list[P_PAGETYPE_MAX])
+    __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = {
+	NULL,			/* P_INVALID */
+	NULL,			/* __P_DUPLICATE */
+	__ham_46_hash,		/* P_HASH_UNSORTED */
+	NULL,			/* P_IBTREE */
+	NULL,			/* P_IRECNO */
+	NULL,			/* P_LBTREE */
+	NULL,			/* P_LRECNO */
+	NULL,			/* P_OVERFLOW */
+	__ham_46_hashmeta,	/* P_HASHMETA */
+	NULL,			/* P_BTREEMETA */
+	NULL,			/* P_QAMMETA */
+	NULL,			/* P_QAMDATA */
+	NULL,			/* P_LDUP */
+	NULL,			/* P_HASH */
+};
+
+static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const [])
+ (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *));
+static int __db_set_lastpgno __P((DB *, char *, DB_FH *));
+
+/*
+ * __db_upgrade --
+ * Upgrade an existing database.
+ *
+ * PUBLIC: int __db_upgrade __P((DB *, const char *, u_int32_t));
+ */
+int
+__db_upgrade(dbp, fname, flags)
+	DB *dbp;
+	const char *fname;
+	u_int32_t flags;
+{
+	DBMETA *meta;
+	DB_FH *fhp;
+	ENV *env;
+	size_t n;
+	int ret, t_ret, use_mp_open;
+	u_int8_t mbuf[256], tmpflags;
+	char *real_name;
+
+	use_mp_open = 0;
+	env = dbp->env;
+	fhp = NULL;
+
+	/* Get the real backing file name. */
+	if ((ret = __db_appname(env,
+	    DB_APP_DATA, fname, NULL, &real_name)) != 0)
+		return (ret);
+
+	/*
+	 * Open the file.
+	 *
+	 * Once real_name has been allocated, every failure must go
+	 * through "err" so it is freed -- the previous version returned
+	 * directly from several places below, leaking real_name and, in
+	 * the Queue cases, the open file handle as well.
+	 */
+	if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
+		__db_err(env, ret, "%s", real_name);
+		goto err;
+	}
+
+	/* Initialize the feedback. */
+	if (dbp->db_feedback != NULL)
+		dbp->db_feedback(dbp, DB_UPGRADE, 0);
+
+	/*
+	 * Read the metadata page.  We read 256 bytes, which is larger than
+	 * any access method's metadata page and smaller than any disk sector.
+	 */
+	if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
+		goto err;
+
+	switch (((DBMETA *)mbuf)->magic) {
+	case DB_BTREEMAGIC:
+		switch (((DBMETA *)mbuf)->version) {
+		case 6:
+			/*
+			 * Before V7 not all pages had page types, so we do the
+			 * single meta-data page by hand.
+			 */
+			if ((ret =
+			    __bam_30_btreemeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 7:
+			/*
+			 * We need the page size to do more.  Rip it out of
+			 * the meta-data page.
+			 */
+			memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+			if ((ret = __db_page_pass(
+			    dbp, real_name, flags, func_31_list, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 8:
+			if ((ret =
+			    __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 9:
+			break;
+		default:
+			__db_errx(env, "%s: unsupported btree version: %lu",
+			    real_name, (u_long)((DBMETA *)mbuf)->version);
+			ret = DB_OLD_VERSION;
+			goto err;
+		}
+		break;
+	case DB_HASHMAGIC:
+		switch (((DBMETA *)mbuf)->version) {
+		case 4:
+		case 5:
+			/*
+			 * Before V6 not all pages had page types, so we do the
+			 * single meta-data page by hand.
+			 */
+			if ((ret =
+			    __ham_30_hashmeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+				goto err;
+
+			/*
+			 * Before V6, we created hash pages one by one as they
+			 * were needed, using hashhdr.ovfl_point to reserve
+			 * a block of page numbers for them.  A consequence
+			 * of this was that, if no overflow pages had been
+			 * created, the current doubling might extend past
+			 * the end of the database file.
+			 *
+			 * In DB 3.X, we now create all the hash pages
+			 * belonging to a doubling atomically; it's not
+			 * safe to just save them for later, because when
+			 * we create an overflow page we'll just create
+			 * a new last page (whatever that may be).  Grow
+			 * the database to the end of the current doubling.
+			 */
+			if ((ret =
+			    __ham_30_sizefix(dbp, fhp, real_name, mbuf)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 6:
+			/*
+			 * We need the page size to do more.  Rip it out of
+			 * the meta-data page.
+			 */
+			memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t));
+
+			if ((ret = __db_page_pass(
+			    dbp, real_name, flags, func_31_list, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 7:
+			if ((ret =
+			    __db_set_lastpgno(dbp, real_name, fhp)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 8:
+			/*
+			 * Any upgrade that has proceeded this far has metadata
+			 * pages compatible with hash version 8 metadata pages,
+			 * so casting mbuf to a dbmeta is safe.
+			 * If a newer revision moves the pagesize, checksum or
+			 * encrypt_alg flags in the metadata, then the
+			 * extraction of the fields will need to use hard coded
+			 * offsets.
+			 */
+			meta = (DBMETA*)mbuf;
+			/*
+			 * We need the page size to do more.  Extract it from
+			 * the meta-data page.
+			 */
+			memcpy(&dbp->pgsize, &meta->pagesize,
+			    sizeof(u_int32_t));
+			/*
+			 * Rip out metadata and encrypt_alg fields from the
+			 * metadata page.  So the upgrade can know how big
+			 * the page metadata pre-amble is.  Any upgrade that has
+			 * proceeded this far has metadata pages compatible
+			 * with hash version 8 metadata pages, so extracting
+			 * the fields is safe.
+			 */
+			memcpy(&tmpflags, &meta->metaflags, sizeof(u_int8_t));
+			if (FLD_ISSET(tmpflags, DBMETA_CHKSUM))
+				F_SET(dbp, DB_AM_CHKSUM);
+			memcpy(&tmpflags, &meta->encrypt_alg, sizeof(u_int8_t));
+			if (tmpflags != 0) {
+				if (!CRYPTO_ON(dbp->env)) {
+					__db_errx(env,
+"Attempt to upgrade an encrypted database without providing a password.");
+					ret = EINVAL;
+					goto err;
+				}
+				F_SET(dbp, DB_AM_ENCRYPT);
+			}
+
+			/*
+			 * This is ugly.  It is necessary to have a usable
+			 * mpool in the dbp to upgrade from an unsorted
+			 * to a sorted hash database.  The mpool file is used
+			 * to resolve offpage key items, which are needed to
+			 * determine sort order.  Having mpool open and access
+			 * the file does not affect the page pass, since the
+			 * page pass only updates DB_HASH_UNSORTED pages
+			 * in-place, and the mpool file is only used to read
+			 * OFFPAGE items.
+			 *
+			 * Set use_mp_open before closing the raw handle so
+			 * the "err" path won't close it a second time; from
+			 * here on the database close owns the file handle.
+			 */
+			use_mp_open = 1;
+			if ((ret = __os_closehandle(env, fhp)) != 0)
+				goto err;
+			dbp->type = DB_HASH;
+			if ((ret = __env_mpool(dbp, fname,
+			    DB_AM_NOT_DURABLE | DB_AM_VERIFYING)) != 0)
+				goto err;
+			fhp = dbp->mpf->fhp;
+
+			/* Do the actual conversion pass. */
+			if ((ret = __db_page_pass(
+			    dbp, real_name, flags, func_46_list, fhp)) != 0)
+				goto err;
+
+			/* FALLTHROUGH */
+		case 9:
+			break;
+		default:
+			__db_errx(env, "%s: unsupported hash version: %lu",
+			    real_name, (u_long)((DBMETA *)mbuf)->version);
+			ret = DB_OLD_VERSION;
+			goto err;
+		}
+		break;
+	case DB_QAMMAGIC:
+		switch (((DBMETA *)mbuf)->version) {
+		case 1:
+			/*
+			 * If we're in a Queue database, the only page that
+			 * needs upgrading is the meta-database page, don't
+			 * bother with a full pass.
+			 */
+			if ((ret = __qam_31_qammeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 2:
+			if ((ret = __qam_32_qammeta(dbp, real_name, mbuf)) != 0)
+				goto err;
+			if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
+				goto err;
+			if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0)
+				goto err;
+			/* FALLTHROUGH */
+		case 3:
+		case 4:
+			break;
+		default:
+			__db_errx(env, "%s: unsupported queue version: %lu",
+			    real_name, (u_long)((DBMETA *)mbuf)->version);
+			ret = DB_OLD_VERSION;
+			goto err;
+		}
+		break;
+	default:
+		/* Try the magic number byte-swapped to diagnose the error. */
+		M_32_SWAP(((DBMETA *)mbuf)->magic);
+		switch (((DBMETA *)mbuf)->magic) {
+		case DB_BTREEMAGIC:
+		case DB_HASHMAGIC:
+		case DB_QAMMAGIC:
+			__db_errx(env,
+	"%s: DB->upgrade only supported on native byte-order systems",
+			    real_name);
+			break;
+		default:
+			__db_errx(env,
+			    "%s: unrecognized file type", real_name);
+			break;
+		}
+		ret = EINVAL;
+		goto err;
+	}
+
+	ret = __os_fsync(env, fhp);
+
+	/*
+	 * If mp_open was used, then rely on the database close to clean up
+	 * any file handles.
+	 */
+err:	if (use_mp_open == 0 && fhp != NULL &&
+	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+		ret = t_ret;
+	__os_free(env, real_name);
+
+	/* We're done. */
+	if (dbp->db_feedback != NULL)
+		dbp->db_feedback(dbp, DB_UPGRADE, 100);
+
+	return (ret);
+}
+
+/*
+ * __db_page_pass --
+ * Walk the pages of the database, upgrading whatever needs it.
+ */
+static int
+__db_page_pass(dbp, real_name, flags, fl, fhp)
+	DB *dbp;
+	char *real_name;
+	u_int32_t flags;
+	int (* const fl[P_PAGETYPE_MAX])
+	    __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+	DB_FH *fhp;
+{
+	ENV *env;
+	PAGE *page;
+	db_pgno_t pgno, pgno_last;
+	size_t nr;
+	int modified, ret;
+
+	env = dbp->env;
+
+	/* Find out how many pages the file currently holds. */
+	if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+		return (ret);
+
+	/* Get a scratch buffer large enough for one database page. */
+	if ((ret = __os_malloc(env, dbp->pgsize, &page)) != 0)
+		return (ret);
+
+	/*
+	 * Visit every page, handing each one to the conversion function
+	 * registered for its page type (if any); modified pages are
+	 * rewritten in place.
+	 */
+	for (pgno = 0; pgno < pgno_last; ++pgno) {
+		if (dbp->db_feedback != NULL)
+			dbp->db_feedback(dbp,
+			    DB_UPGRADE, (int)((pgno * 100)/pgno_last));
+		if ((ret = __os_seek(env, fhp, pgno, dbp->pgsize, 0)) != 0)
+			break;
+		if ((ret = __os_read(env, fhp, page, dbp->pgsize, &nr)) != 0)
+			break;
+		modified = 0;
+		/* Pages are always decrypted before conversion. */
+		if ((ret = __db_decrypt_pg(env, dbp, page)) != 0)
+			break;
+		if (fl[TYPE(page)] != NULL && (ret = fl[TYPE(page)](dbp,
+		    real_name, flags, fhp, page, &modified)) != 0)
+			break;
+		if (!modified)
+			continue;
+		/* The converter changed the page: re-protect and rewrite. */
+		if ((ret = __db_encrypt_and_checksum_pg(env, dbp, page)) != 0)
+			break;
+		if ((ret = __os_seek(env, fhp, pgno, dbp->pgsize, 0)) != 0)
+			break;
+		if ((ret = __os_write(env, fhp, page, dbp->pgsize, &nr)) != 0)
+			break;
+	}
+
+	__os_free(env, page);
+	return (ret);
+}
+
+/*
+ * __db_lastpgno --
+ * Return the current last page number of the file.
+ *
+ * PUBLIC: int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *));
+ */
+int
+__db_lastpgno(dbp, real_name, fhp, pgno_lastp)
+	DB *dbp;
+	char *real_name;
+	DB_FH *fhp;
+	db_pgno_t *pgno_lastp;
+{
+	ENV *env;
+	u_int32_t bytes, mbytes;
+	int ret;
+
+	env = dbp->env;
+
+	/* Ask the OS layer for the file size (megabytes plus bytes). */
+	if ((ret = __os_ioinfo(env,
+	    real_name, fhp, &mbytes, &bytes, NULL)) != 0) {
+		__db_err(env, ret, "%s", real_name);
+		return (ret);
+	}
+
+	/*
+	 * Page sizes have to be a power-of-two, so a well-formed file
+	 * must be an exact multiple of the page size.
+	 */
+	if (bytes % dbp->pgsize != 0) {
+		__db_errx(env,
+		    "%s: file size not a multiple of the pagesize", real_name);
+		return (EINVAL);
+	}
+
+	/* Convert the size into a count of pages. */
+	*pgno_lastp = (db_pgno_t)(mbytes * (MEGABYTE / dbp->pgsize) +
+	    bytes / dbp->pgsize);
+	return (0);
+}
+
+/*
+ * __db_set_lastpgno --
+ * Update the meta->last_pgno field.
+ *
+ * Code assumes that we do not have checksums/crypto on the page.
+ */
+static int
+__db_set_lastpgno(dbp, real_name, fhp)
+	DB *dbp;
+	char *real_name;
+	DB_FH *fhp;
+{
+	DBMETA meta;
+	ENV *env;
+	size_t nr;
+	int ret;
+
+	env = dbp->env;
+
+	/* Read the meta page from the front of the file. */
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+	    (ret = __os_read(env, fhp, &meta, sizeof(meta), &nr)) != 0)
+		return (ret);
+
+	/* Recompute last_pgno from the on-disk file size. */
+	dbp->pgsize = meta.pagesize;
+	if ((ret = __db_lastpgno(dbp, real_name, fhp, &meta.last_pgno)) != 0)
+		return (ret);
+
+	/* Rewrite the updated meta page in place. */
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+	    (ret = __os_write(env, fhp, &meta, sizeof(meta), &nr)) != 0)
+		return (ret);
+
+	return (0);
+}
+#endif /* HAVE_UPGRADE_SUPPORT */
diff --git a/db/db_upg_opd.c b/db/db_upg_opd.c
new file mode 100644
index 0000000..ea143cf
--- /dev/null
+++ b/db/db_upg_opd.c
@@ -0,0 +1,343 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *));
+static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t));
+
+/*
+ * GET_PAGE/PUT_PAGE --
+ *	Read/write one page-sized buffer at the given page-number offset.
+ *	Both macros rely on locals "ret" and "n" being in scope in the
+ *	caller and jump to the caller's "err" label on failure.
+ */
+#define	GET_PAGE(dbp, fhp, pgno, page) {				\
+	if ((ret = __os_seek(						\
+	    dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0)		\
+		goto err;						\
+	if ((ret = __os_read(dbp->env,					\
+	    fhp, page, (dbp)->pgsize, &n)) != 0)			\
+		goto err;						\
+}
+#define	PUT_PAGE(dbp, fhp, pgno, page) {				\
+	if ((ret = __os_seek(						\
+	    dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0)		\
+		goto err;						\
+	if ((ret = __os_write(dbp->env,					\
+	    fhp, page, (dbp)->pgsize, &n)) != 0)			\
+		goto err;						\
+}
+
+/*
+ * __db_31_offdup --
+ * Convert 3.0 off-page duplicates to 3.1 off-page duplicates.
+ *
+ * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *));
+ */
+int
+__db_31_offdup(dbp, real_name, fhp, sorted, pgnop)
+	DB *dbp;
+	char *real_name;
+	DB_FH *fhp;
+	int sorted;
+	db_pgno_t *pgnop;
+{
+	PAGE *ipage, *page;
+	db_indx_t indx;
+	db_pgno_t cur_cnt, i, next_cnt, pgno, *pgno_cur, pgno_last;
+	db_pgno_t *pgno_next, pgno_max, *tmp;
+	db_recno_t nrecs;
+	size_t n;
+	int level, nomem, ret;
+
+	ipage = page = NULL;
+	pgno_cur = pgno_next = NULL;
+
+	/* Allocate room to hold a page. */
+	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0)
+		goto err;
+
+	/*
+	 * Walk the chain of 3.0 off-page duplicates.  Each one is converted
+	 * in place to a 3.1 off-page duplicate page.  If the duplicates are
+	 * sorted, they are converted to a Btree leaf page, otherwise to a
+	 * Recno leaf page.
+	 */
+	for (nrecs = 0, cur_cnt = pgno_max = 0,
+	    pgno = *pgnop; pgno != PGNO_INVALID;) {
+		/* Grow the page-number list in batches of 20 entries. */
+		if (pgno_max == cur_cnt) {
+			pgno_max += 20;
+			if ((ret = __os_realloc(dbp->env, pgno_max *
+			    sizeof(db_pgno_t), &pgno_cur)) != 0)
+				goto err;
+		}
+		pgno_cur[cur_cnt++] = pgno;
+
+		GET_PAGE(dbp, fhp, pgno, page);
+		nrecs += NUM_ENT(page);
+		LEVEL(page) = LEAFLEVEL;
+		TYPE(page) = sorted ? P_LDUP : P_LRECNO;
+		/*
+		 * !!!
+		 * DB didn't zero the LSNs on off-page duplicates pages.
+		 */
+		ZERO_LSN(LSN(page));
+		PUT_PAGE(dbp, fhp, pgno, page);
+
+		pgno = NEXT_PGNO(page);
+	}
+
+	/*
+	 * If we only have a single page, it's easy.
+	 *
+	 * NOTE(review): if *pgnop was PGNO_INVALID on entry, cur_cnt is 0
+	 * and pgno_cur is still NULL here, so the dereference at "done"
+	 * would fault -- presumably callers never pass an empty chain;
+	 * confirm against the call sites.
+	 */
+	if (cur_cnt <= 1)
+		goto done;
+
+	/*
+	 * pgno_cur is the list of pages we just converted.  We're
+	 * going to walk that list, but we'll need to create a new
+	 * list while we do so.
+	 */
+	if ((ret = __os_malloc(dbp->env,
+	    cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0)
+		goto err;
+
+	/* Figure out where we can start allocating new pages. */
+	if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0)
+		goto err;
+
+	/* Allocate room for an internal page. */
+	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &ipage)) != 0)
+		goto err;
+	PGNO(ipage) = PGNO_INVALID;
+
+	/*
+	 * Repeatedly walk the list of pages, building internal pages, until
+	 * there's only one page at a level.
+	 */
+	for (level = LEAFLEVEL + 1; cur_cnt > 1; ++level) {
+		for (indx = 0, i = next_cnt = 0; i < cur_cnt;) {
+			/* An empty slot starts a fresh internal page. */
+			if (indx == 0) {
+				P_INIT(ipage, dbp->pgsize, pgno_last,
+				    PGNO_INVALID, PGNO_INVALID,
+				    level, sorted ? P_IBTREE : P_IRECNO);
+				ZERO_LSN(LSN(ipage));
+
+				pgno_next[next_cnt++] = pgno_last++;
+			}
+
+			GET_PAGE(dbp, fhp, pgno_cur[i], page);
+
+			/*
+			 * If the duplicates are sorted, put the first item on
+			 * the lower-level page onto a Btree internal page. If
+			 * the duplicates are not sorted, create an internal
+			 * Recno structure on the page.  If either case doesn't
+			 * fit, push out the current page and start a new one.
+			 */
+			nomem = 0;
+			if (sorted) {
+				if ((ret = __db_build_bi(
+				    dbp, fhp, ipage, page, indx, &nomem)) != 0)
+					goto err;
+			} else
+				if ((ret = __db_build_ri(
+				    dbp, fhp, ipage, page, indx, &nomem)) != 0)
+					goto err;
+			if (nomem) {
+				/* Full: flush and retry child i on a new page. */
+				indx = 0;
+				PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+			} else {
+				++indx;
+				++NUM_ENT(ipage);
+				++i;
+			}
+		}
+
+		/*
+		 * Push out the last internal page.  Set the top-level record
+		 * count if we've reached the top.
+		 */
+		if (next_cnt == 1)
+			RE_NREC_SET(ipage, nrecs);
+		PUT_PAGE(dbp, fhp, PGNO(ipage), ipage);
+
+		/* Swap the current and next page number arrays. */
+		cur_cnt = next_cnt;
+		tmp = pgno_cur;
+		pgno_cur = pgno_next;
+		pgno_next = tmp;
+	}
+
+	/* Hand the (single) root of the new structure back to the caller. */
+done:	*pgnop = pgno_cur[0];
+
+err:	if (pgno_cur != NULL)
+		__os_free(dbp->env, pgno_cur);
+	if (pgno_next != NULL)
+		__os_free(dbp->env, pgno_next);
+	if (ipage != NULL)
+		__os_free(dbp->env, ipage);
+	if (page != NULL)
+		__os_free(dbp->env, page);
+
+	return (ret);
+}
+
+/*
+ * __db_build_bi --
+ * Build a BINTERNAL entry for a parent page.
+ */
+static int
+__db_build_bi(dbp, fhp, ipage, page, indx, nomemp)
+	DB *dbp;
+	DB_FH *fhp;
+	PAGE *ipage, *page;
+	u_int32_t indx;
+	int *nomemp;
+{
+	BINTERNAL bi, *child_bi;
+	BKEYDATA *child_bk;
+	u_int8_t *p;
+	int ret;
+	db_indx_t *inp;
+
+	inp = P_INP(dbp, ipage);
+	switch (TYPE(page)) {
+	case P_IBTREE:
+		/* Copy the child's first internal entry up to the parent. */
+		child_bi = GET_BINTERNAL(dbp, page, 0);
+		if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) {
+			/* Doesn't fit: ask the caller to start a new page. */
+			*nomemp = 1;
+			return (0);
+		}
+		/* Carve space from the end of the page and index it. */
+		inp[indx] =
+		    HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len);
+		p = P_ENTRY(dbp, ipage, indx);
+
+		/* Build the BINTERNAL header, then append the key bytes. */
+		bi.len = child_bi->len;
+		B_TSET(bi.type, child_bi->type);
+		bi.pgno = PGNO(page);
+		bi.nrecs = __bam_total(dbp, page);
+		memcpy(p, &bi, SSZA(BINTERNAL, data));
+		p += SSZA(BINTERNAL, data);
+		memcpy(p, child_bi->data, child_bi->len);
+
+		/* Increment the overflow ref count. */
+		if (B_TYPE(child_bi->type) == B_OVERFLOW)
+			if ((ret = __db_up_ovref(dbp, fhp,
+			    ((BOVERFLOW *)(child_bi->data))->pgno)) != 0)
+				return (ret);
+		break;
+	case P_LDUP:
+		/* Promote the child leaf's first key into the parent. */
+		child_bk = GET_BKEYDATA(dbp, page, 0);
+		switch (B_TYPE(child_bk->type)) {
+		case B_KEYDATA:
+			if (P_FREESPACE(dbp, ipage) <
+			    BINTERNAL_PSIZE(child_bk->len)) {
+				*nomemp = 1;
+				return (0);
+			}
+			inp[indx] =
+			    HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len);
+			p = P_ENTRY(dbp, ipage, indx);
+
+			bi.len = child_bk->len;
+			B_TSET(bi.type, child_bk->type);
+			bi.pgno = PGNO(page);
+			bi.nrecs = __bam_total(dbp, page);
+			memcpy(p, &bi, SSZA(BINTERNAL, data));
+			p += SSZA(BINTERNAL, data);
+			memcpy(p, child_bk->data, child_bk->len);
+			break;
+		case B_OVERFLOW:
+			/* The key lives off-page: copy the BOVERFLOW stub. */
+			if (P_FREESPACE(dbp, ipage) <
+			    BINTERNAL_PSIZE(BOVERFLOW_SIZE)) {
+				*nomemp = 1;
+				return (0);
+			}
+			inp[indx] =
+			    HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE);
+			p = P_ENTRY(dbp, ipage, indx);
+
+			bi.len = BOVERFLOW_SIZE;
+			B_TSET(bi.type, child_bk->type);
+			bi.pgno = PGNO(page);
+			bi.nrecs = __bam_total(dbp, page);
+			memcpy(p, &bi, SSZA(BINTERNAL, data));
+			p += SSZA(BINTERNAL, data);
+			memcpy(p, child_bk, BOVERFLOW_SIZE);
+
+			/* Increment the overflow ref count. */
+			if ((ret = __db_up_ovref(dbp, fhp,
+			    ((BOVERFLOW *)child_bk)->pgno)) != 0)
+				return (ret);
+			break;
+		default:
+			return (__db_pgfmt(dbp->env, PGNO(page)));
+		}
+		break;
+	default:
+		return (__db_pgfmt(dbp->env, PGNO(page)));
+	}
+
+	return (0);
+}
+
+/*
+ * __db_build_ri --
+ * Build a RINTERNAL entry for an internal parent page.
+ */
+static int
+__db_build_ri(dbp, fhp, ipage, page, indx, nomemp)
+	DB *dbp;
+	DB_FH *fhp;
+	PAGE *ipage, *page;
+	u_int32_t indx;
+	int *nomemp;
+{
+	RINTERNAL ri;
+	db_indx_t *entries;
+
+	COMPQUIET(fhp, NULL);
+
+	/* If the entry won't fit, ask the caller to start a new page. */
+	entries = P_INP(dbp, ipage);
+	if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) {
+		*nomemp = 1;
+		return (0);
+	}
+
+	/* Point the entry at the child page and record its record count. */
+	ri.pgno = PGNO(page);
+	ri.nrecs = __bam_total(dbp, page);
+	entries[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE;
+	memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE);
+
+	return (0);
+}
+
+/*
+ * __db_up_ovref --
+ * Increment/decrement the reference count on an overflow page.
+ */
+static int
+__db_up_ovref(dbp, fhp, pgno)
+	DB *dbp;
+	DB_FH *fhp;
+	db_pgno_t pgno;
+{
+	PAGE *pagep;
+	size_t n;			/* Used by GET_PAGE/PUT_PAGE. */
+	int ret;
+
+	/* Grab a scratch buffer to hold the overflow page. */
+	if ((ret = __os_malloc(dbp->env, dbp->pgsize, &pagep)) != 0)
+		return (ret);
+
+	/* Bump the reference count and push the page back out. */
+	GET_PAGE(dbp, fhp, pgno, pagep);
+	++OV_REF(pagep);
+	PUT_PAGE(dbp, fhp, pgno, pagep);
+
+err:	__os_free(dbp->env, pagep);
+
+	return (ret);
+}
diff --git a/db/db_vrfy.c b/db/db_vrfy.c
new file mode 100644
index 0000000..7ea9c62
--- /dev/null
+++ b/db/db_vrfy.c
@@ -0,0 +1,2894 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
+
+/*
+ * This is the code for DB->verify, the DB database consistency checker.
+ * For now, it checks all subdatabases in a database, and verifies
+ * everything it knows how to (i.e. it's all-or-nothing, and one can't
+ * check only for a subset of possible problems).
+ */
+
+static u_int __db_guesspgsize __P((ENV *, DB_FH *));
+static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *));
+static int __db_meta2pgset
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *));
+static int __db_salvage __P((DB *, VRFY_DBINFO *,
+ db_pgno_t, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *,
+ PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+static int __db_salvage_all __P((DB *, VRFY_DBINFO *, void *,
+ int(*)(void *, const void *), u_int32_t, int *));
+static int __db_salvage_unknowns __P((DB *, VRFY_DBINFO *, void *,
+ int (*)(void *, const void *), u_int32_t));
+static int __db_verify_arg __P((DB *, const char *, void *, u_int32_t));
+static int __db_vrfy_freelist
+ __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+static int __db_vrfy_invalid
+ __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+static int __db_vrfy_orderchkonly __P((DB *,
+ VRFY_DBINFO *, const char *, const char *, u_int32_t));
+static int __db_vrfy_pagezero __P((DB *, VRFY_DBINFO *, DB_FH *, u_int32_t));
+static int __db_vrfy_subdbs
+ __P((DB *, VRFY_DBINFO *, const char *, u_int32_t));
+static int __db_vrfy_structure __P((DB *, VRFY_DBINFO *,
+ const char *, db_pgno_t, void *, void *, u_int32_t));
+static int __db_vrfy_walkpages __P((DB *, VRFY_DBINFO *,
+ void *, int (*)(void *, const void *), u_int32_t));
+
+#define VERIFY_FLAGS \
+ (DB_AGGRESSIVE | \
+ DB_NOORDERCHK | DB_ORDERCHKONLY | DB_PRINTABLE | DB_SALVAGE | DB_UNREF)
+
+/*
+ * __db_verify_pp --
+ * DB->verify public interface.
+ *
+ * PUBLIC: int __db_verify_pp
+ * PUBLIC: __P((DB *, const char *, const char *, FILE *, u_int32_t));
+ */
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+	DB *dbp;
+	const char *file, *database;
+	FILE *outfile;
+	u_int32_t flags;
+{
+	/*
+	 * This is a thin shim over __db_verify_internal, which the non-C
+	 * APIs call directly with their own FILE * equivalents -- that is
+	 * also why the usual ENV_ENTER macros live there, not here.
+	 */
+	return (__db_verify_internal(dbp,
+	    file, database, outfile, __db_pr_callback, flags));
+}
+
+/*
+ * __db_verify_internal --
+ *
+ * PUBLIC: int __db_verify_internal __P((DB *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_verify_internal(dbp, fname, dname, handle, callback, flags)
+	DB *dbp;
+	const char *fname, *dname;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	DB_THREAD_INFO *ip;
+	ENV *env;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->verify");
+
+	/* Unless salvaging, also check for unreferenced pages. */
+	if (!LF_ISSET(DB_SALVAGE))
+		LF_SET(DB_UNREF);
+
+	ENV_ENTER(env, ip);
+
+	/* Validate the arguments, then do the real work. */
+	ret = __db_verify_arg(dbp, dname, handle, flags);
+	if (ret == 0)
+		ret = __db_verify(dbp, ip,
+		    fname, dname, handle, callback, NULL, NULL, flags);
+
+	/* Db.verify is a DB handle destructor. */
+	if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	ENV_LEAVE(env, ip);
+	return (ret);
+}
+
+/*
+ * __db_verify_arg --
+ * Check DB->verify arguments.
+ */
+static int
+__db_verify_arg(dbp, dname, handle, flags)
+	DB *dbp;
+	const char *dname;
+	void *handle;
+	u_int32_t flags;
+{
+	ENV *env;
+	int ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_fchk(env, "DB->verify", flags, VERIFY_FLAGS)) != 0)
+		return (ret);
+
+	/*
+	 * DB_SALVAGE is mutually exclusive with the other flags except
+	 * DB_AGGRESSIVE, DB_PRINTABLE.
+	 *
+	 * DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging.
+	 *
+	 * DB_SALVAGE requires an output stream.
+	 */
+	if (LF_ISSET(DB_SALVAGE)) {
+		if (LF_ISSET(~(DB_AGGRESSIVE | DB_PRINTABLE | DB_SALVAGE)))
+			return (__db_ferr(env, "DB->verify", 1));
+		if (handle == NULL) {
+			/* Fixed typo: was "requires a an output handle". */
+			__db_errx(env,
+			    "DB_SALVAGE requires an output handle");
+			return (EINVAL);
+		}
+	} else
+		if (LF_ISSET(DB_AGGRESSIVE | DB_PRINTABLE))
+			return (__db_ferr(env, "DB->verify", 1));
+
+	/*
+	 * DB_ORDERCHKONLY is mutually exclusive with DB_SALVAGE and
+	 * DB_NOORDERCHK, and requires a database name.
+	 */
+	if ((ret = __db_fcchk(env, "DB->verify", flags,
+	    DB_ORDERCHKONLY, DB_SALVAGE | DB_NOORDERCHK)) != 0)
+		return (ret);
+	if (LF_ISSET(DB_ORDERCHKONLY) && dname == NULL) {
+		__db_errx(env, "DB_ORDERCHKONLY requires a database name");
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * __db_verify --
+ * Walk the entire file page-by-page, either verifying with or without
+ * dumping in db_dump -d format, or DB_SALVAGE-ing whatever key/data
+ * pairs can be found and dumping them in standard (db_load-ready)
+ * dump format.
+ *
+ * (Salvaging isn't really a verification operation, but we put it
+ * here anyway because it requires essentially identical top-level
+ * code.)
+ *
+ * flags may be 0, DB_NOORDERCHK, DB_ORDERCHKONLY, or DB_SALVAGE
+ * (and optionally DB_AGGRESSIVE).
+ * PUBLIC: int __db_verify __P((DB *, DB_THREAD_INFO *, const char *,
+ * PUBLIC: const char *, void *, int (*)(void *, const void *),
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__db_verify(dbp, ip, name, subdb, handle, callback, lp, rp, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ const char *name, *subdb;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB_FH *fhp;
+ ENV *env;
+ VRFY_DBINFO *vdp;
+ u_int32_t sflags;
+ int has_subdbs, isbad, ret, t_ret;
+ char *real_name;
+
+ env = dbp->env;
+ fhp = NULL;
+ vdp = NULL;
+ real_name = NULL;
+ has_subdbs = isbad = ret = t_ret = 0;
+
+ F_SET(dbp, DB_AM_VERIFYING);
+
+ /* Initialize any feedback function. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 0);
+
+ /*
+ * We don't know how large the cache is, and if the database
+ * in question uses a small page size--which we don't know
+ * yet!--it may be uncomfortably small for the default page
+ * size [#2143]. However, the things we need temporary
+ * databases for in dbinfo are largely tiny, so using a
+ * 1024-byte pagesize is probably not going to be a big hit,
+ * and will make us fit better into small spaces.
+ */
+ if ((ret = __db_vrfy_dbinfo_create(env, ip, 1024, &vdp)) != 0)
+ goto err;
+
+ /*
+ * Note whether the user has requested that we use printable
+ * chars where possible. We won't get here with this flag if
+ * we're not salvaging.
+ */
+ if (LF_ISSET(DB_PRINTABLE))
+ F_SET(vdp, SALVAGE_PRINTABLE);
+
+ /* Find the real name of the file. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
+ goto err;
+
+ /*
+ * Our first order of business is to verify page 0, which is
+ * the metadata page for the master database of subdatabases
+ * or of the only database in the file. We want to do this by hand
+ * rather than just calling __db_open in case it's corrupt--various
+ * things in __db_open might act funny.
+ *
+ * Once we know the metadata page is healthy, I believe that it's
+ * safe to open the database normally and then use the page swapping
+ * code, which makes life easier.
+ */
+ if ((ret = __os_open(env, real_name, 0, DB_OSO_RDONLY, 0, &fhp)) != 0)
+ goto err;
+
+ /* Verify the metadata page 0; set pagesize and type. */
+ if ((ret = __db_vrfy_pagezero(dbp, vdp, fhp, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * We can assume at this point that dbp->pagesize and dbp->type are
+ * set correctly, or at least as well as they can be, and that
+ * locking, logging, and txns are not in use. Thus we can trust
+ * the memp code not to look at the page, and thus to be safe
+ * enough to use.
+ *
+ * The dbp is not open, but the file is open in the fhp, and we
+ * cannot assume that __db_open is safe. Call __env_setup,
+ * the [safe] part of __db_open that initializes the environment--
+ * and the mpool--manually.
+ */
+ if ((ret = __env_setup(dbp, NULL,
+ name, subdb, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0)
+ goto err;
+
+ /*
+ * Set our name in the Queue subsystem; we may need it later
+ * to deal with extents.
+ */
+ if (dbp->type == DB_QUEUE &&
+ (ret = __qam_set_ext_data(dbp, name)) != 0)
+ goto err;
+
+ /* Mark the dbp as opened, so that we correctly handle its close. */
+ F_SET(dbp, DB_AM_OPEN_CALLED);
+
+ /* Find out the page number of the last page in the database. */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &vdp->last_pgno)) != 0)
+ goto err;
+
+ /*
+ * DB_ORDERCHKONLY is a special case; our file consists of
+ * several subdatabases, which use different hash, bt_compare,
+ * and/or dup_compare functions. Consequently, we couldn't verify
+ * sorting and hashing simply by calling DB->verify() on the file.
+ * DB_ORDERCHKONLY allows us to come back and check those things; it
+ * requires a subdatabase, and assumes that everything but that
+ * database's sorting/hashing is correct.
+ */
+ if (LF_ISSET(DB_ORDERCHKONLY)) {
+ ret = __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags);
+ goto done;
+ }
+
+ sflags = flags;
+ if (dbp->p_internal != NULL)
+ LF_CLR(DB_SALVAGE);
+
+ /*
+ * When salvaging, we use a db to keep track of whether we've seen a
+ * given overflow or dup page in the course of traversing normal data.
+ * If in the end we have not, we assume its key got lost and print it
+ * with key "UNKNOWN".
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_init(vdp)) != 0)
+ goto err;
+
+ /*
+ * If we're not being aggressive, salvage by walking the tree
+ * and only printing the leaves we find. "has_subdbs" will
+ * indicate whether we found subdatabases.
+ */
+ if (!LF_ISSET(DB_AGGRESSIVE) && __db_salvage_all(
+ dbp, vdp, handle, callback, flags, &has_subdbs) != 0)
+ isbad = 1;
+
+ /*
+ * If we have subdatabases, flag if any keys are found that
+ * don't belong to a subdatabase -- they'll need to have an
+ * "__OTHER__" subdatabase header printed first.
+ */
+ if (has_subdbs) {
+ F_SET(vdp, SALVAGE_PRINTHEADER);
+ F_SET(vdp, SALVAGE_HASSUBDBS);
+ }
+ }
+
+ /* Walk all the pages, if a page cannot be read, verify structure. */
+ if ((ret =
+ __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else if (ret != DB_PAGE_NOTFOUND)
+ goto err;
+ }
+
+ /* If we're verifying, verify inter-page structure. */
+ if (!LF_ISSET(DB_SALVAGE) && isbad == 0)
+ if ((t_ret = __db_vrfy_structure(dbp,
+ vdp, name, 0, lp, rp, flags)) != 0) {
+ if (t_ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * If we're salvaging, output with key UNKNOWN any overflow or dup pages
+ * we haven't been able to put in context. Then destroy the salvager's
+ * state-saving database.
+ */
+ if (LF_ISSET(DB_SALVAGE)) {
+ if ((ret = __db_salvage_unknowns(dbp,
+ vdp, handle, callback, flags)) != 0)
+ isbad = 1;
+ }
+
+ flags = sflags;
+
+#ifdef HAVE_PARTITION
+ if (t_ret == 0 && dbp->p_internal != NULL)
+ t_ret = __part_verify(dbp, vdp, name, handle, callback, flags);
+#endif
+
+ if (ret == 0)
+ ret = t_ret;
+
+ /* Don't display a footer for a database holding other databases. */
+ if (LF_ISSET(DB_SALVAGE | DB_VERIFY_PARTITION) == DB_SALVAGE &&
+ (!has_subdbs || F_ISSET(vdp, SALVAGE_PRINTFOOTER)))
+ (void)__db_prfooter(handle, callback);
+
+done: err:
+ /* Send feedback that we're done. */
+ if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL)
+ dbp->db_feedback(dbp, DB_VERIFY, 100);
+
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_destroy(vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (fhp != NULL &&
+ (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (vdp != NULL &&
+ (t_ret = __db_vrfy_dbinfo_destroy(env, vdp)) != 0 && ret == 0)
+ ret = t_ret;
+ if (real_name != NULL)
+ __os_free(env, real_name);
+
+ /*
+ * DB_VERIFY_FATAL is a private error, translate to a public one.
+ *
+ * If we didn't find a page, it's probably a page number was corrupted.
+ * Return the standard corruption error.
+ *
+ * Otherwise, if we found corruption along the way, set the return.
+ */
+ if (ret == DB_VERIFY_FATAL ||
+ ret == DB_PAGE_NOTFOUND || (ret == 0 && isbad == 1))
+ ret = DB_VERIFY_BAD;
+
+ /* Make sure there's a public complaint if we found corruption. */
+ if (ret != 0)
+ __db_err(env, ret, "%s", name);
+
+ return (ret);
+}
+
+/*
+ * __db_vrfy_pagezero --
+ * Verify the master metadata page. Use seek, read, and a local buffer
+ * rather than the DB paging code, for safety.
+ *
+ * Must correctly (or best-guess) set dbp->type and dbp->pagesize.
+ */
+static int
+__db_vrfy_pagezero(dbp, vdp, fhp, flags)
+	DB *dbp;			/* Handle whose type/pgsize we set here. */
+	VRFY_DBINFO *vdp;		/* Verifier state; receives meta_last_pgno. */
+	DB_FH *fhp;			/* Raw file handle -- we bypass mpool. */
+	u_int32_t flags;
+{
+	DBMETA *meta;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t freelist;
+	size_t nr;
+	int isbad, ret, swapped;
+	u_int8_t mbuf[DBMETASIZE];	/* Local buffer for the raw meta page. */
+
+	isbad = ret = swapped = 0;
+	freelist = 0;
+	env = dbp->env;
+	meta = (DBMETA *)mbuf;
+	dbp->type = DB_UNKNOWN;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0)
+		return (ret);
+
+	/*
+	 * Seek to the metadata page.
+	 * Note that if we're just starting a verification, dbp->pgsize
+	 * may be zero; this is okay, as we want page zero anyway and
+	 * 0*0 == 0.
+	 */
+	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 ||
+	    (ret = __os_read(env, fhp, mbuf, DBMETASIZE, &nr)) != 0) {
+		__db_err(env, ret,
+		    "Metadata page %lu cannot be read", (u_long)PGNO_BASE_MD);
+		return (ret);
+	}
+
+	if (nr != DBMETASIZE) {
+		EPRINT((env,
+		    "Page %lu: Incomplete metadata page",
+		    (u_long)PGNO_BASE_MD));
+		return (DB_VERIFY_FATAL);
+	}
+
+	/*
+	 * NOTE(review): __db_chk_meta returning -1 is treated here as
+	 * "corrupt but we can keep going"; any other nonzero return is
+	 * fatal -- confirm against __db_chk_meta's contract.
+	 */
+	if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) {
+		EPRINT((env,
+		    "Page %lu: metadata page corrupted", (u_long)PGNO_BASE_MD));
+		isbad = 1;
+		if (ret != -1) {
+			EPRINT((env,
+			    "Page %lu: could not check metadata page",
+			    (u_long)PGNO_BASE_MD));
+			return (DB_VERIFY_FATAL);
+		}
+	}
+
+	/*
+	 * Check all of the fields that we can.
+	 *
+	 * 08-11: Current page number.  Must == pgno.
+	 * Note that endianness doesn't matter--it's zero.
+	 */
+	if (meta->pgno != PGNO_BASE_MD) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: pgno incorrectly set to %lu",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->pgno));
+	}
+
+	/* 12-15: Magic number.  Must be one of valid set. */
+	if (__db_is_valid_magicno(meta->magic, &dbp->type))
+		swapped = 0;
+	else {
+		/* Retry with the bytes swapped: maybe an other-endian file. */
+		M_32_SWAP(meta->magic);
+		if (__db_is_valid_magicno(meta->magic,
+		    &dbp->type))
+			swapped = 1;
+		else {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: bad magic number %lu",
+			    (u_long)PGNO_BASE_MD, (u_long)meta->magic));
+		}
+	}
+
+	/*
+	 * 16-19: Version.  Must be current; for now, we
+	 * don't support verification of old versions.
+	 */
+	if (swapped)
+		M_32_SWAP(meta->version);
+	if ((dbp->type == DB_BTREE &&
+	    (meta->version > DB_BTREEVERSION ||
+	    meta->version < DB_BTREEOLDVER)) ||
+	    (dbp->type == DB_HASH &&
+	    (meta->version > DB_HASHVERSION ||
+	    meta->version < DB_HASHOLDVER)) ||
+	    (dbp->type == DB_QUEUE &&
+	    (meta->version > DB_QAMVERSION ||
+	    meta->version < DB_QAMOLDVER))) {
+		isbad = 1;
+		EPRINT((env,
+	    "Page %lu: unsupported DB version %lu; extraneous errors may result",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->version));
+	}
+
+	/*
+	 * 20-23: Pagesize.  Must be power of two,
+	 * greater than 512, and less than 64K.
+	 */
+	if (swapped)
+		M_32_SWAP(meta->pagesize);
+	if (IS_VALID_PAGESIZE(meta->pagesize))
+		dbp->pgsize = meta->pagesize;
+	else {
+		isbad = 1;
+		EPRINT((env, "Page %lu: bad page size %lu",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->pagesize));
+
+		/*
+		 * Now try to settle on a pagesize to use.
+		 * If the user-supplied one is reasonable,
+		 * use it; else, guess.
+		 */
+		if (!IS_VALID_PAGESIZE(dbp->pgsize))
+			dbp->pgsize = __db_guesspgsize(env, fhp);
+	}
+
+	/*
+	 * 25: Page type.  Must be correct for dbp->type,
+	 * which is by now set as well as it can be.
+	 */
+	/* Needs no swapping--only one byte! */
+	if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) ||
+	    (dbp->type == DB_HASH && meta->type != P_HASHMETA) ||
+	    (dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: bad page type %lu",
+		    (u_long)PGNO_BASE_MD, (u_long)meta->type));
+	}
+
+	/*
+	 * 26: Meta-flags.
+	 */
+	if (meta->metaflags != 0) {
+		/* Only the checksum and partitioning flags are legal. */
+		if (FLD_ISSET(meta->metaflags,
+		    ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: bad meta-data flags value %#lx",
+			    (u_long)PGNO_BASE_MD, (u_long)meta->metaflags));
+		}
+		if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+			F_SET(pip, VRFY_HAS_CHKSUM);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+			F_SET(pip, VRFY_HAS_PART_RANGE);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+			F_SET(pip, VRFY_HAS_PART_CALLBACK);
+
+		if (FLD_ISSET(meta->metaflags,
+		    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) &&
+		    (ret = __partition_init(dbp, meta->metaflags)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * 28-31: Free list page number.
+	 * 32-35: Last page in database file.
+	 * We'll verify its sensibility when we do inter-page
+	 * verification later; for now, just store it.
+	 */
+	if (swapped)
+	    M_32_SWAP(meta->free);
+	freelist = meta->free;
+	if (swapped)
+	    M_32_SWAP(meta->last_pgno);
+	vdp->meta_last_pgno = meta->last_pgno;
+
+	/*
+	 * Initialize vdp->pages to fit a single pageinfo structure for
+	 * this one page.  We'll realloc later when we know how many
+	 * pages there are.
+	 */
+	pip->pgno = PGNO_BASE_MD;
+	pip->type = meta->type;
+
+	/*
+	 * Signal that we still have to check the info specific to
+	 * a given type of meta page.
+	 */
+	F_SET(pip, VRFY_INCOMPLETE);
+
+	pip->free = freelist;
+
+	if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+		return (ret);
+
+	/* Set up the dbp's fileid.  We don't use the regular open path. */
+	memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN);
+
+	/* Remember the file's byte order for the rest of verification. */
+	if (swapped == 1)
+		F_SET(dbp, DB_AM_SWAP);
+
+	return (isbad ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __db_vrfy_walkpages --
+ * Main loop of the verifier/salvager. Walks through,
+ * page by page, and verifies all pages and/or prints all data pages.
+ */
+static int
+__db_vrfy_walkpages(dbp, vdp, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	void *handle;			/* Opaque handle passed to callback. */
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;		/* DB_SALVAGE selects salvage mode. */
+{
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t i;
+	int ret, t_ret, isbad;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	h = NULL;
+	ret = isbad = t_ret = 0;
+
+	/* Pass 1: visit every page from 0 through the last known page. */
+	for (i = 0; i <= vdp->last_pgno; i++) {
+		/*
+		 * If DB_SALVAGE is set, we inspect our database of completed
+		 * pages, and skip any we've already printed in the subdb pass.
+		 */
+		if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0))
+			continue;
+
+		/*
+		 * An individual page get can fail if:
+		 * * This is a hash database, it is expected to find
+		 * empty buckets, which don't have allocated pages. Create
+		 * a dummy page so the verification can proceed.
+		 * * We are salvaging, flag the error and continue.
+		 */
+		if ((t_ret = __memp_fget(mpf, &i,
+		    vdp->thread_info, NULL, 0, &h)) != 0) {
+			if (dbp->type == DB_HASH) {
+				/* Record a synthetic P_INVALID page. */
+				if ((t_ret =
+				    __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+					goto err1;
+				pip->type = P_INVALID;
+				pip->pgno = i;
+				F_CLR(pip, VRFY_IS_ALLZEROES);
+				if ((t_ret = __db_vrfy_putpageinfo(
+				    env, vdp, pip)) != 0)
+					goto err1;
+				continue;
+			}
+			if (t_ret == DB_PAGE_NOTFOUND) {
+				EPRINT((env,
+	    "Page %lu: beyond the end of the file, metadata page has last page as %lu",
+				    (u_long)i, (u_long)vdp->last_pgno));
+				if (ret == 0)
+					return (t_ret);
+			}
+
+err1:			if (ret == 0)
+				ret = t_ret;
+			if (LF_ISSET(DB_SALVAGE))
+				continue;
+			return (ret);
+		}
+
+		if (LF_ISSET(DB_SALVAGE)) {
+			/*
+			 * We pretty much don't want to quit unless a
+			 * bomb hits.  May as well return that something
+			 * was screwy, however.
+			 */
+			if ((t_ret = __db_salvage_pg(dbp,
+			    vdp, i, h, handle, callback, flags)) != 0) {
+				if (ret == 0)
+					ret = t_ret;
+				isbad = 1;
+			}
+		} else {
+			/*
+			 * If we are not salvaging, and we get any error
+			 * other than DB_VERIFY_BAD, return immediately;
+			 * it may not be safe to proceed.  If we get
+			 * DB_VERIFY_BAD, keep going; listing more errors
+			 * may make it easier to diagnose problems and
+			 * determine the magnitude of the corruption.
+			 *
+			 * Verify info common to all page types.
+			 */
+			if (i != PGNO_BASE_MD) {
+				ret = __db_vrfy_common(dbp, vdp, h, i, flags);
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else if (ret != 0)
+					goto err;
+			}
+
+			/* Dispatch to the per-page-type verifier. */
+			switch (TYPE(h)) {
+			case P_INVALID:
+				ret = __db_vrfy_invalid(dbp, vdp, h, i, flags);
+				break;
+			case __P_DUPLICATE:
+				isbad = 1;
+				EPRINT((env,
+				    "Page %lu: old-style duplicate page",
+				    (u_long)i));
+				break;
+			case P_HASH_UNSORTED:
+			case P_HASH:
+				ret = __ham_vrfy(dbp, vdp, h, i, flags);
+				break;
+			case P_IBTREE:
+			case P_IRECNO:
+			case P_LBTREE:
+			case P_LDUP:
+				ret = __bam_vrfy(dbp, vdp, h, i, flags);
+				break;
+			case P_LRECNO:
+				ret = __ram_vrfy_leaf(dbp, vdp, h, i, flags);
+				break;
+			case P_OVERFLOW:
+				ret = __db_vrfy_overflow(dbp, vdp, h, i, flags);
+				break;
+			case P_HASHMETA:
+				ret = __ham_vrfy_meta(dbp,
+				    vdp, (HMETA *)h, i, flags);
+				break;
+			case P_BTREEMETA:
+				ret = __bam_vrfy_meta(dbp,
+				    vdp, (BTMETA *)h, i, flags);
+				break;
+			case P_QAMMETA:
+				ret = __qam_vrfy_meta(dbp,
+				    vdp, (QMETA *)h, i, flags);
+				break;
+			case P_QAMDATA:
+				ret = __qam_vrfy_data(dbp,
+				    vdp, (QPAGE *)h, i, flags);
+				break;
+			default:
+				EPRINT((env,
+				    "Page %lu: unknown page type %lu",
+				    (u_long)i, (u_long)TYPE(h)));
+				isbad = 1;
+				break;
+			}
+
+			/*
+			 * Set up error return.
+			 */
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else if (ret != 0)
+				goto err;
+
+			/*
+			 * Provide feedback to the application about our
+			 * progress.  The range 0-50% comes from the fact
+			 * that this is the first of two passes through the
+			 * database (front-to-back, then top-to-bottom).
+			 */
+			if (dbp->db_feedback != NULL)
+				dbp->db_feedback(dbp, DB_VERIFY,
+				    (int)((i + 1) * 50 / (vdp->last_pgno + 1)));
+		}
+
+		/*
+		 * Just as with the page get, bail if and only if we're
+		 * not salvaging.
+		 */
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			if (!LF_ISSET(DB_SALVAGE))
+				return (ret);
+		}
+	}
+
+	/*
+	 * If we've seen a Queue metadata page, we may need to walk Queue
+	 * extent pages that won't show up between 0 and vdp->last_pgno.
+	 */
+	if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret =
+	    __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) {
+		if (ret == 0)
+			ret = t_ret;
+		if (t_ret == DB_VERIFY_BAD)
+			isbad = 1;
+		else if (!LF_ISSET(DB_SALVAGE))
+			return (ret);
+	}
+
+	/* Error path: release the pinned page before returning. */
+	if (0) {
+err:		if (h != NULL && (t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0)
+			return (ret == 0 ? t_ret : ret);
+	}
+
+	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_structure--
+ * After a beginning-to-end walk through the database has been
+ * completed, put together the information that has been collected
+ * to verify the overall database structure.
+ *
+ * Should only be called if we want to do a database verification,
+ * i.e. if DB_SALVAGE is not set.
+ */
+static int
+__db_vrfy_structure(dbp, vdp, dbname, meta_pgno, lp, rp, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *dbname;
+	db_pgno_t meta_pgno;
+	void *lp, *rp;			/* Opaque bounds passed to btree check. */
+	u_int32_t flags;
+{
+	DB *pgset;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t i;
+	int ret, isbad, hassubs, p;	/* p: pgset reference count per page. */
+
+	isbad = 0;
+	pip = NULL;
+	env = dbp->env;
+	pgset = vdp->pgset;
+
+	/*
+	 * Providing feedback here is tricky; in most situations,
+	 * we fetch each page one more time, but we do so in a top-down
+	 * order that depends on the access method.  Worse, we do this
+	 * recursively in btree, such that on any call where we're traversing
+	 * a subtree we don't know where that subtree is in the whole database;
+	 * worse still, any given database may be one of several subdbs.
+	 *
+	 * The solution is to decrement a counter vdp->pgs_remaining each time
+	 * we verify (and call feedback on) a page.  We may over- or
+	 * under-count, but the structure feedback function will ensure that we
+	 * never give a percentage under 50 or over 100.  (The first pass
+	 * covered the range 0-50%.)
+	 */
+	if (dbp->db_feedback != NULL)
+		vdp->pgs_remaining = vdp->last_pgno + 1;
+
+	/*
+	 * Call the appropriate function to downwards-traverse the db type.
+	 */
+	switch (dbp->type) {
+	case DB_BTREE:
+	case DB_RECNO:
+		if ((ret =
+		    __bam_vrfy_structure(dbp, vdp, 0, lp, rp, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else
+				goto err;
+		}
+
+		/*
+		 * If we have subdatabases and we know that the database is,
+		 * thus far, sound, it's safe to walk the tree of subdatabases.
+		 * Do so, and verify the structure of the databases within.
+		 */
+		if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0)
+			goto err;
+		hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 1 : 0;
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			goto err;
+		pip = NULL;
+
+		if (isbad == 0 && hassubs)
+			if ((ret =
+			    __db_vrfy_subdbs(dbp, vdp, dbname, flags)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto err;
+			}
+		break;
+	case DB_HASH:
+		if ((ret = __ham_vrfy_structure(dbp, vdp, 0, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else
+				goto err;
+		}
+		break;
+	case DB_QUEUE:
+		if ((ret = __qam_vrfy_structure(dbp, vdp, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+		}
+
+		/*
+		 * Queue pages may be unreferenced and totally zeroed, if
+		 * they're empty; queue doesn't have much structure, so
+		 * this is unlikely to be wrong in any troublesome sense.
+		 * Skip to "err".
+		 */
+		goto err;
+	case DB_UNKNOWN:
+	default:
+		ret = __db_unknown_path(env, "__db_vrfy_structure");
+		goto err;
+	}
+
+	/* Walk free list. */
+	if ((ret =
+	    __db_vrfy_freelist(dbp, vdp, meta_pgno, flags)) == DB_VERIFY_BAD)
+		isbad = 1;
+
+	/*
+	 * If structure checks up until now have failed, it's likely that
+	 * checking what pages have been missed will result in oodles of
+	 * extraneous error messages being EPRINTed.  Skip to the end
+	 * if this is the case; we're going to be printing at least one
+	 * error anyway, and probably all the more salient ones.
+	 */
+	if (ret != 0 || isbad == 1)
+		goto err;
+
+	/*
+	 * Make sure no page has been missed and that no page is still marked
+	 * "all zeroes" (only certain hash pages can be, and they're unmarked
+	 * in __ham_vrfy_structure).
+	 */
+	for (i = 0; i < vdp->last_pgno + 1; i++) {
+		if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0)
+			goto err;
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, i, &p)) != 0)
+			goto err;
+		if (pip->type == P_OVERFLOW) {
+			/* Overflow pages must be referenced refcount times. */
+			if ((u_int32_t)p != pip->refcount) {
+				EPRINT((env,
+		    "Page %lu: overflow refcount %lu, referenced %lu times",
+				    (u_long)i,
+				    (u_long)pip->refcount, (u_long)p));
+				isbad = 1;
+			}
+		} else if (p == 0 &&
+#ifndef HAVE_FTRUNCATE
+		    !(i > vdp->meta_last_pgno &&
+		    (F_ISSET(pip, VRFY_IS_ALLZEROES) || pip->type == P_HASH)) &&
+#endif
+		    !(dbp->type == DB_HASH && pip->type == P_INVALID)) {
+			/*
+			 * It is OK for unreferenced hash buckets to be
+			 * marked invalid and unreferenced.
+			 */
+			EPRINT((env,
+			    "Page %lu: unreferenced page", (u_long)i));
+			isbad = 1;
+		}
+
+		if (F_ISSET(pip, VRFY_IS_ALLZEROES)
+#ifndef HAVE_FTRUNCATE
+		    && i <= vdp->meta_last_pgno
+#endif
+		    ) {
+			EPRINT((env,
+			    "Page %lu: totally zeroed page", (u_long)i));
+			isbad = 1;
+		}
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			goto err;
+		pip = NULL;
+	}
+
+err:	if (pip != NULL)
+		(void)__db_vrfy_putpageinfo(env, vdp, pip);
+
+	return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_is_valid_magicno --
+ *	Map a metadata magic number to its access method.  Returns 1 and
+ *	sets *typep when the magic is recognized; otherwise returns 0
+ *	with *typep set to DB_UNKNOWN.
+ */
+static int
+__db_is_valid_magicno(magic, typep)
+	u_int32_t magic;
+	DBTYPE *typep;
+{
+	if (magic == DB_BTREEMAGIC) {
+		*typep = DB_BTREE;
+		return (1);
+	}
+	if (magic == DB_HASHMAGIC) {
+		*typep = DB_HASH;
+		return (1);
+	}
+	if (magic == DB_QAMMAGIC) {
+		*typep = DB_QUEUE;
+		return (1);
+	}
+	*typep = DB_UNKNOWN;
+	return (0);
+}
+
+/*
+ * __db_vrfy_common --
+ * Verify info common to all page types.
+ *
+ * PUBLIC: int __db_vrfy_common
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_common(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	u_int8_t *cp, *end;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	pip->pgno = pgno;
+	F_CLR(pip, VRFY_IS_ALLZEROES);
+
+	/*
+	 * A non-zero page whose stored page number is zero may be one of
+	 * the pages hash leaves zeroed when it extends the table (they may
+	 * not be all zero if they were used, freed and then reallocated),
+	 * or a hole in a sparse queue file.  Detect the truly all-zero
+	 * case, provisionally type the page as hash, and let the later
+	 * structure pass decide whether that makes sense.
+	 */
+	if (pgno != 0 && PGNO(h) == 0) {
+		F_SET(pip, VRFY_IS_ALLZEROES);
+		end = (u_int8_t *)h + dbp->pgsize;
+		for (cp = (u_int8_t *)h; cp < end; cp++)
+			if (*cp != 0) {
+				F_CLR(pip, VRFY_IS_ALLZEROES);
+				break;
+			}
+		pip->type = P_HASH;
+		ret = 0;
+		goto err;		/* Not really an error. */
+	}
+
+	/* The on-page number must match the page's position in the file. */
+	if (PGNO(h) != pgno) {
+		EPRINT((env, "Page %lu: bad page number %lu",
+		    (u_long)pgno, (u_long)h->pgno));
+		ret = DB_VERIFY_BAD;
+	}
+
+	/* The page type must be one we know about. */
+	switch (h->type) {
+	case P_INVALID:			/* Order matches ordinal value. */
+	case P_HASH_UNSORTED:
+	case P_IBTREE:
+	case P_IRECNO:
+	case P_LBTREE:
+	case P_LRECNO:
+	case P_OVERFLOW:
+	case P_HASHMETA:
+	case P_BTREEMETA:
+	case P_QAMMETA:
+	case P_QAMDATA:
+	case P_LDUP:
+	case P_HASH:
+		break;
+	default:
+		EPRINT((env, "Page %lu: bad page type %lu",
+		    (u_long)pgno, (u_long)h->type));
+		ret = DB_VERIFY_BAD;
+	}
+	pip->type = h->type;
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __db_vrfy_invalid --
+ *	Verify a P_INVALID (free) page.  About all we can check is that
+ *	its next-page link is plausible; record it for the free-list walk.
+ */
+static int
+__db_vrfy_invalid(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t next;
+	int ret, t_ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	/* Start from clean links; fill in next_pgno only if it's sane. */
+	pip->next_pgno = 0;
+	pip->prev_pgno = 0;
+
+	next = NEXT_PGNO(h);
+	if (IS_VALID_PGNO(next))
+		pip->next_pgno = next;
+	else {
+		EPRINT((env, "Page %lu: invalid next_pgno %lu",
+		    (u_long)pgno, (u_long)next));
+		ret = DB_VERIFY_BAD;
+	}
+
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_vrfy_datapage --
+ * Verify elements common to data pages (P_HASH, P_LBTREE,
+ * P_IBTREE, P_IRECNO, P_LRECNO, P_OVERFLOW, P_DUPLICATE)--i.e.,
+ * those defined in the PAGE structure.
+ *
+ * Called from each of the per-page routines, after the
+ * all-page-type-common elements of pip have been verified and filled
+ * in.
+ *
+ * PUBLIC: int __db_vrfy_datapage
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_datapage(dbp, vdp, h, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	u_int32_t smallest_entry;	/* Minimum on-page size of one entry. */
+	int isbad, ret, t_ret;
+
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+	isbad = 0;
+
+	/*
+	 * prev_pgno and next_pgno:  store for inter-page checks,
+	 * verify that they point to actual pages and not to self.
+	 *
+	 * !!!
+	 * Internal btree pages do not maintain these fields (indeed,
+	 * they overload them).  Skip.
+	 */
+	if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
+		if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) {
+			isbad = 1;
+			EPRINT((env, "Page %lu: invalid prev_pgno %lu",
+			    (u_long)pip->pgno, (u_long)PREV_PGNO(h)));
+		}
+		if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) {
+			isbad = 1;
+			EPRINT((env, "Page %lu: invalid next_pgno %lu",
+			    (u_long)pip->pgno, (u_long)NEXT_PGNO(h)));
+		}
+		pip->prev_pgno = PREV_PGNO(h);
+		pip->next_pgno = NEXT_PGNO(h);
+	}
+
+	/*
+	 * Verify the number of entries on the page: there's no good way to
+	 * determine if this is accurate.  The best we can do is verify that
+	 * it's not more than can, in theory, fit on the page.  Then, we make
+	 * sure there are at least this many valid elements in inp[], and
+	 * hope the test catches most cases.
+	 */
+	switch (TYPE(h)) {
+	case P_HASH_UNSORTED:
+	case P_HASH:
+		smallest_entry = HKEYDATA_PSIZE(0);
+		break;
+	case P_IBTREE:
+		smallest_entry = BINTERNAL_PSIZE(0);
+		break;
+	case P_IRECNO:
+		smallest_entry = RINTERNAL_PSIZE;
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+	case P_LRECNO:
+		smallest_entry = BKEYDATA_PSIZE(0);
+		break;
+	default:
+		smallest_entry = 0;
+		break;
+	}
+	/*
+	 * NOTE(review): the /2 presumably allows slack for access methods
+	 * that count two inp[] slots per logical item -- confirm.
+	 */
+	if (smallest_entry * NUM_ENT(h) / 2 > dbp->pgsize) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: too many entries: %lu",
+		    (u_long)pgno, (u_long)NUM_ENT(h)));
+	}
+
+	if (TYPE(h) != P_OVERFLOW)
+		pip->entries = NUM_ENT(h);
+
+	/*
+	 * btree level.  Should be zero unless we're a btree;
+	 * if we are a btree, should be between LEAFLEVEL and MAXBTREELEVEL,
+	 * and we need to save it off.
+	 */
+	switch (TYPE(h)) {
+	case P_IBTREE:
+	case P_IRECNO:
+		if (LEVEL(h) < LEAFLEVEL + 1) {
+			isbad = 1;
+			EPRINT((env, "Page %lu: bad btree level %lu",
+			    (u_long)pgno, (u_long)LEVEL(h)));
+		}
+		pip->bt_level = LEVEL(h);
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+	case P_LRECNO:
+		if (LEVEL(h) != LEAFLEVEL) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: btree leaf page has incorrect level %lu",
+			    (u_long)pgno, (u_long)LEVEL(h)));
+		}
+		break;
+	default:
+		if (LEVEL(h) != 0) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: nonzero level %lu in non-btree database",
+			    (u_long)pgno, (u_long)LEVEL(h)));
+		}
+		break;
+	}
+
+	/*
+	 * Even though inp[] occurs in all PAGEs, we look at it in the
+	 * access-method-specific code, since btree and hash treat
+	 * item lengths very differently, and one of the most important
+	 * things we want to verify is that the data--as specified
+	 * by offset and length--cover the right part of the page
+	 * without overlaps, gaps, or violations of the page boundary.
+	 */
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_meta--
+ * Verify the access-method common parts of a meta page, using
+ * normal mpool routines.
+ *
+ * PUBLIC: int __db_vrfy_meta
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_meta(dbp, vdp, meta, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	DBMETA *meta;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	DBTYPE dbtype, magtype;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int isbad, ret, t_ret;
+
+	isbad = 0;
+	env = dbp->env;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	/* type plausible for a meta page */
+	switch (meta->type) {
+	case P_BTREEMETA:
+		dbtype = DB_BTREE;
+		break;
+	case P_HASHMETA:
+		dbtype = DB_HASH;
+		break;
+	case P_QAMMETA:
+		dbtype = DB_QUEUE;
+		break;
+	default:
+		/* Callers should only hand us recognized meta pages. */
+		ret = __db_unknown_path(env, "__db_vrfy_meta");
+		goto err;
+	}
+
+	/* magic number valid */
+	if (!__db_is_valid_magicno(meta->magic, &magtype)) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: invalid magic number", (u_long)pgno));
+	}
+	if (magtype != dbtype) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: magic number does not match database type",
+		    (u_long)pgno));
+	}
+
+	/* version */
+	if ((dbtype == DB_BTREE &&
+	    (meta->version > DB_BTREEVERSION ||
+	    meta->version < DB_BTREEOLDVER)) ||
+	    (dbtype == DB_HASH &&
+	    (meta->version > DB_HASHVERSION ||
+	    meta->version < DB_HASHOLDVER)) ||
+	    (dbtype == DB_QUEUE &&
+	    (meta->version > DB_QAMVERSION ||
+	    meta->version < DB_QAMOLDVER))) {
+		isbad = 1;
+		EPRINT((env,
+    "Page %lu: unsupported database version %lu; extraneous errors may result",
+		    (u_long)pgno, (u_long)meta->version));
+	}
+
+	/* pagesize */
+	if (meta->pagesize != dbp->pgsize) {
+		isbad = 1;
+		EPRINT((env, "Page %lu: invalid pagesize %lu",
+		    (u_long)pgno, (u_long)meta->pagesize));
+	}
+
+	/* Flags */
+	if (meta->metaflags != 0) {
+		/* Only checksum and partitioning flags are legal. */
+		if (FLD_ISSET(meta->metaflags,
+		    ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) {
+			isbad = 1;
+			EPRINT((env,
+			    "Page %lu: bad meta-data flags value %#lx",
+			    (u_long)PGNO_BASE_MD, (u_long)meta->metaflags));
+		}
+		if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM))
+			F_SET(pip, VRFY_HAS_CHKSUM);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))
+			F_SET(pip, VRFY_HAS_PART_RANGE);
+		if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK))
+			F_SET(pip, VRFY_HAS_PART_CALLBACK);
+	}
+
+	/*
+	 * Free list.
+	 *
+	 * If this is not the main, master-database meta page, it
+	 * should not have a free list.
+	 */
+	if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: nonempty free list on subdatabase metadata page",
+		    (u_long)pgno));
+	}
+
+	/* Can correctly be PGNO_INVALID--that's just the end of the list. */
+	if (meta->free != PGNO_INVALID && IS_VALID_PGNO(meta->free))
+		pip->free = meta->free;
+	else if (!IS_VALID_PGNO(meta->free)) {
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: nonsensical free list pgno %lu",
+		    (u_long)pgno, (u_long)meta->free));
+	}
+
+	/*
+	 * Check that the meta page agrees with what we got from mpool.
+	 * If we don't have FTRUNCATE then mpool could include some
+	 * zeroed pages at the end of the file, we assume the meta page
+	 * is correct.
+	 */
+	if (pgno == PGNO_BASE_MD && meta->last_pgno != vdp->last_pgno) {
+#ifdef HAVE_FTRUNCATE
+		isbad = 1;
+		EPRINT((env,
+		    "Page %lu: last_pgno is not correct: %lu != %lu",
+		    (u_long)pgno,
+		    (u_long)meta->last_pgno, (u_long)vdp->last_pgno));
+#endif
+		vdp->meta_last_pgno = meta->last_pgno;
+	}
+
+	/*
+	 * We have now verified the common fields of the metadata page.
+	 * Clear the flag that told us they had been incompletely checked.
+	 */
+	F_CLR(pip, VRFY_INCOMPLETE);
+
+err:	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_freelist --
+ * Walk free list, checking off pages and verifying absence of
+ * loops.
+ */
+static int
+__db_vrfy_freelist(dbp, vdp, meta, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t meta;			/* Meta page whose free list we walk. */
+	u_int32_t flags;
+{
+	DB *pgset;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t cur_pgno, next_pgno;
+	int p, ret, t_ret;
+
+	env = dbp->env;
+	pgset = vdp->pgset;
+	DB_ASSERT(env, pgset != NULL);
+
+	/*
+	 * Loop invariant: pip holds the info of the page we're standing on;
+	 * each iteration releases it and fetches the next page's info.
+	 */
+	if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0)
+		return (ret);
+	for (next_pgno = pip->free;
+	    next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) {
+		cur_pgno = pip->pgno;
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			return (ret);
+
+		/* This shouldn't happen, but just in case. */
+		if (!IS_VALID_PGNO(next_pgno)) {
+			EPRINT((env,
+			    "Page %lu: invalid next_pgno %lu on free list page",
+			    (u_long)cur_pgno, (u_long)next_pgno));
+			return (DB_VERIFY_BAD);
+		}
+
+		/* Detect cycles. */
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, next_pgno, &p)) != 0)
+			return (ret);
+		if (p != 0) {
+			EPRINT((env,
+		    "Page %lu: page %lu encountered a second time on free list",
+			    (u_long)cur_pgno, (u_long)next_pgno));
+			return (DB_VERIFY_BAD);
+		}
+		if ((ret = __db_vrfy_pgset_inc(pgset,
+		    vdp->thread_info, next_pgno)) != 0)
+			return (ret);
+
+		if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0)
+			return (ret);
+
+		/* Free pages must all be P_INVALID. */
+		if (pip->type != P_INVALID) {
+			EPRINT((env,
+			    "Page %lu: non-invalid page %lu on free list",
+			    (u_long)cur_pgno, (u_long)next_pgno));
+			ret = DB_VERIFY_BAD;	/* unsafe to continue */
+			break;
+		}
+	}
+
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_vrfy_subdbs --
+ * Walk the known-safe master database of subdbs with a cursor,
+ * verifying the structure of each subdatabase we encounter.
+ */
+static int
+__db_vrfy_subdbs(dbp, vdp, dbname, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *dbname;		/* File name of the master database. */
+	u_int32_t flags;
+{
+	DB *mdbp;
+	DBC *dbc;
+	DBT key, data;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t meta_pgno;
+	int ret, t_ret, isbad;
+	u_int8_t type;
+
+	isbad = 0;
+	dbc = NULL;
+	env = dbp->env;
+
+	/* Open the master database read-only and walk its entries. */
+	if ((ret = __db_master_open(dbp,
+	    vdp->thread_info, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0)
+		return (ret);
+
+	if ((ret = __db_cursor_int(mdbp, NULL,
+	    NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+		goto err;
+
+	memset(&key, 0, sizeof(key));
+	memset(&data, 0, sizeof(data));
+	/* Each data item is the subdatabase's meta page number. */
+	while ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) == 0) {
+		if (data.size != sizeof(db_pgno_t)) {
+			EPRINT((env,
+			    "Subdatabase entry not page-number size"));
+			isbad = 1;
+			goto err;
+		}
+		memcpy(&meta_pgno, data.data, data.size);
+		/*
+		 * Subdatabase meta pgnos are stored in network byte
+		 * order for cross-endian compatibility.  Swap if appropriate.
+		 */
+		DB_NTOHL_SWAP(env, &meta_pgno);
+		if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) {
+			EPRINT((env,
+			    "Subdatabase entry references invalid page %lu",
+			    (u_long)meta_pgno));
+			isbad = 1;
+			goto err;
+		}
+		if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
+			goto err;
+		type = pip->type;
+		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+			goto err;
+		/* Verify the subdatabase rooted at meta_pgno by type. */
+		switch (type) {
+		case P_BTREEMETA:
+			if ((ret = __bam_vrfy_structure(
+			    dbp, vdp, meta_pgno, NULL, NULL, flags)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto err;
+			}
+			break;
+		case P_HASHMETA:
+			if ((ret = __ham_vrfy_structure(
+			    dbp, vdp, meta_pgno, flags)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto err;
+			}
+			break;
+		case P_QAMMETA:
+		default:
+			/* Queue databases cannot appear as subdatabases. */
+			EPRINT((env,
+		    "Subdatabase entry references page %lu of invalid type %lu",
+			    (u_long)meta_pgno, (u_long)type));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+	}
+
+	/* DB_NOTFOUND just means we walked off the end of the master db. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+err:	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __db_vrfy_struct_feedback --
+ *	Report progress through the user's feedback callback during the
+ *	top-down database structure traversal.
+ *	(See comment at the beginning of __db_vrfy_structure.)
+ *
+ * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *));
+ */
+void
+__db_vrfy_struct_feedback(dbp, vdp)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+{
+	int pct;
+
+	/* Without a registered callback there is nothing to do. */
+	if (dbp->db_feedback == NULL)
+		return;
+
+	if (vdp->pgs_remaining > 0)
+		--vdp->pgs_remaining;
+
+	/*
+	 * The factor of 50 scales this pass over roughly the top half of
+	 * the progress range.  Never report 100 until we're really done.
+	 */
+	pct = 100 - (int)(vdp->pgs_remaining * 50 / (vdp->last_pgno + 1));
+	if (pct == 100)
+		pct = 99;
+	dbp->db_feedback(dbp, DB_VERIFY, pct);
+}
+
+/*
+ * __db_vrfy_orderchkonly --
+ *	Do an sort-order/hashing check on a known-otherwise-good subdb.
+ *
+ *	name is the master database file, subdb the subdatabase entry to
+ *	check.  Returns 0, DB_VERIFY_BAD, ENOENT (subdb missing), or a
+ *	system error.
+ */
+static int
+__db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *name, *subdb;
+	u_int32_t flags;
+{
+	BTMETA *btmeta;
+	DB *mdbp, *pgset;
+	DBC *pgsc;
+	DBT key, data;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	HASH *h_internal;
+	HMETA *hmeta;
+	PAGE *h, *currpg;
+	db_pgno_t meta_pgno, p, pgno;
+	u_int32_t bucket;
+	int t_ret, ret;
+
+	mdbp = NULL;
+	pgset = NULL;
+	pgsc = NULL;
+	env = dbp->env;
+	mpf = dbp->mpf;
+	currpg = h = NULL;
+
+	/* The whole point of this routine is the order check. */
+	LF_CLR(DB_NOORDERCHK);
+
+	/* Open the master database and get the meta_pgno for the subdb. */
+	if ((ret = __db_master_open(dbp,
+	    vdp->thread_info, NULL, name, DB_RDONLY, 0, &mdbp)) != 0)
+		goto err;
+
+	DB_INIT_DBT(key, subdb, strlen(subdb));
+	memset(&data, 0, sizeof(data));
+	if ((ret = __db_get(mdbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) != 0) {
+		if (ret == DB_NOTFOUND)
+			ret = ENOENT;
+		goto err;
+	}
+
+	if (data.size != sizeof(db_pgno_t)) {
+		EPRINT((env, "Subdatabase entry of invalid size"));
+		ret = DB_VERIFY_BAD;
+		goto err;
+	}
+
+	memcpy(&meta_pgno, data.data, data.size);
+
+	/*
+	 * Subdatabase meta pgnos are stored in network byte
+	 * order for cross-endian compatibility.  Swap if appropriate.
+	 */
+	DB_NTOHL_SWAP(env, &meta_pgno);
+
+	if ((ret = __memp_fget(mpf,
+	    &meta_pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+		goto err;
+
+	if ((ret = __db_vrfy_pgset(env,
+	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+		goto err;
+
+	switch (TYPE(h)) {
+	case P_BTREEMETA:
+		btmeta = (BTMETA *)h;
+		if (F_ISSET(&btmeta->dbmeta, BTM_RECNO)) {
+			/* Recnos have no order to check. */
+			ret = 0;
+			goto err;
+		}
+		if ((ret =
+		    __db_meta2pgset(dbp, vdp, meta_pgno, flags, pgset)) != 0)
+			goto err;
+		if ((ret = __db_cursor_int(pgset, NULL, NULL, dbp->type,
+		    PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+			goto err;
+		/* Check item order on every page belonging to this subdb. */
+		while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+			if ((ret = __memp_fget(mpf, &p,
+			    vdp->thread_info, NULL, 0, &currpg)) != 0)
+				goto err;
+			if ((ret = __bam_vrfy_itemorder(dbp, NULL,
+			    vdp->thread_info, currpg, p, NUM_ENT(currpg), 1,
+			    F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0)
+				goto err;
+			if ((ret = __memp_fput(mpf,
+			    vdp->thread_info, currpg, dbp->priority)) != 0)
+				goto err;
+			currpg = NULL;
+		}
+
+		/*
+		 * The normal exit condition for the loop above is DB_NOTFOUND.
+		 * If we see that, zero it and continue on to cleanup.
+		 * Otherwise, it's a real error and will be returned.
+		 */
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+		break;
+	case P_HASHMETA:
+		hmeta = (HMETA *)h;
+		h_internal = (HASH *)dbp->h_internal;
+		/*
+		 * Make sure h_charkey is right.
+		 */
+		if (h_internal == NULL) {
+			EPRINT((env,
+			    "Page %lu: DB->h_internal field is NULL",
+			    (u_long)meta_pgno));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		if (h_internal->h_hash == NULL)
+			h_internal->h_hash = hmeta->dbmeta.version < 5
+			    ? __ham_func4 : __ham_func5;
+		if (hmeta->h_charkey !=
+		    h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) {
+			EPRINT((env,
+			    "Page %lu: incorrect hash function for database",
+			    (u_long)meta_pgno));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+
+		/*
+		 * Foreach bucket, verify hashing on each page in the
+		 * corresponding chain of pages.
+		 */
+		if ((ret = __db_cursor_int(dbp, NULL, NULL, dbp->type,
+		    PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0)
+			goto err;
+		for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) {
+			pgno = BS_TO_PAGE(bucket, hmeta->spares);
+			while (pgno != PGNO_INVALID) {
+				if ((ret = __memp_fget(mpf, &pgno,
+				    vdp->thread_info, NULL, 0, &currpg)) != 0)
+					goto err;
+				if ((ret = __ham_vrfy_hashing(pgsc,
+				    NUM_ENT(currpg), hmeta, bucket, pgno,
+				    flags, h_internal->h_hash)) != 0)
+					goto err;
+				/* Read the chain link before releasing. */
+				pgno = NEXT_PGNO(currpg);
+				if ((ret = __memp_fput(mpf, vdp->thread_info,
+				    currpg, dbp->priority)) != 0)
+					goto err;
+				currpg = NULL;
+			}
+		}
+		break;
+	default:
+		EPRINT((env, "Page %lu: database metapage of bad type %lu",
+		    (u_long)meta_pgno, (u_long)TYPE(h)));
+		ret = DB_VERIFY_BAD;
+		break;
+	}
+
+	/*
+	 * Cleanup.  Every branch preserves the first error seen (the
+	 * original code let later cleanup failures clobber ret and closed
+	 * mdbp even when __db_master_open had failed).
+	 */
+err:	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pgset != NULL &&
+	    (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (currpg != NULL &&
+	    (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, currpg, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (mdbp != NULL &&
+	    (t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_pg --
+ *	Walk through a page, salvaging all likely or plausible (w/
+ * DB_AGGRESSIVE) key/data pairs and marking seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_pg(dbp, vdp, pgno, h, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	PAGE *h;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int keyflag, ret, t_ret;
+
+	env = dbp->env;
+	DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+	/*
+	 * !!!
+	 * We dump record numbers when salvaging Queue databases, but not for
+	 * immutable Recno databases.  The problem is we can't figure out the
+	 * record number from the database page in the Recno case, while the
+	 * offset in the file is sufficient for Queue.
+	 */
+	keyflag = 0;
+
+	/* If we got this page in the subdb pass, we can safely skip it. */
+	if (__db_salvage_isdone(vdp, pgno))
+		return (0);
+
+	/*
+	 * Dispatch on page type: metadata pages are verified here and fall
+	 * through to print a dump header below; leaf-like pages are dumped
+	 * immediately via __db_salvage_leaf; internal/dup/overflow pages are
+	 * deferred by marking them needed for a later pass.
+	 */
+	switch (TYPE(h)) {
+	case P_BTREEMETA:
+		ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)h, pgno, flags);
+		break;
+	case P_HASH:
+	case P_HASH_UNSORTED:
+	case P_LBTREE:
+	case P_QAMDATA:
+		return (__db_salvage_leaf(dbp,
+		    vdp, pgno, h, handle, callback, flags));
+	case P_HASHMETA:
+		ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)h, pgno, flags);
+		break;
+	case P_IBTREE:
+		/*
+		 * We need to mark any overflow keys on internal pages as seen,
+		 * so we don't print them out in __db_salvage_unknowns.  But if
+		 * we're an upgraded database, a P_LBTREE page may very well
+		 * have a reference to the same overflow pages (this practice
+		 * stopped somewhere around db4.5).  To give P_LBTREEs a chance
+		 * to print out any keys on shared pages, mark the page now and
+		 * deal with it at the end.
+		 */
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_IBTREE));
+	case P_LDUP:
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LDUP));
+	case P_LRECNO:
+		/*
+		 * Recno leaves are tough, because the leaf could be (1) a dup
+		 * page, or it could be (2) a regular database leaf page.
+		 * Fortunately, RECNO databases are not allowed to have
+		 * duplicates.
+		 *
+		 * If there are no subdatabases, dump the page immediately if
+		 * it's a leaf in a RECNO database, otherwise wait and hopefully
+		 * it will be dumped by the leaf page that refers to it,
+		 * otherwise we'll get it with the unknowns.
+		 *
+		 * If there are subdatabases, there might be mixed types and
+		 * dbp->type can't be trusted.  We'll only get here after
+		 * salvaging each database, though, so salvaging this page
+		 * immediately isn't important.  If this page is a dup, it might
+		 * get salvaged later on, otherwise the unknowns pass will pick
+		 * it up.  Note that SALVAGE_HASSUBDBS won't get set if we're
+		 * salvaging aggressively.
+		 *
+		 * If we're salvaging aggressively, we don't know whether or not
+		 * there's subdatabases, so we wait on all recno pages.
+		 */
+		if (!LF_ISSET(DB_AGGRESSIVE) &&
+		    !F_ISSET(vdp, SALVAGE_HASSUBDBS) && dbp->type == DB_RECNO)
+			return (__db_salvage_leaf(dbp,
+			    vdp, pgno, h, handle, callback, flags));
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LRECNODUP));
+	case P_OVERFLOW:
+		return (__db_salvage_markneeded(vdp, pgno, SALVAGE_OVERFLOW));
+	case P_QAMMETA:
+		/* Queue dumps record numbers as keys (see !!! note above). */
+		keyflag = 1;
+		ret = __qam_vrfy_meta(dbp, vdp, (QMETA *)h, pgno, flags);
+		break;
+	case P_INVALID:
+	case P_IRECNO:
+	case __P_DUPLICATE:
+	default:
+		/*
+		 * There's no need to display an error, the page type was
+		 * already checked and reported on.
+		 */
+		return (0);
+	}
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * We have to display the dump header if it's a metadata page.  It's
+	 * our last chance as the page was marked "seen" in the vrfy routine,
+	 * and we won't see the page again.  We don't display headers for
+	 * the first database in a multi-database file, that database simply
+	 * contains a list of subdatabases.
+	 */
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+	if (!F_ISSET(pip, VRFY_HAS_SUBDBS) && !LF_ISSET(DB_VERIFY_PARTITION))
+		ret = __db_prheader(
+		    dbp, NULL, 0, keyflag, handle, callback, vdp, pgno);
+	/* Release the pageinfo; keep the first error seen. */
+	if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_leaf --
+ *	Dump the likely key/data pairs from a single leaf-level page via
+ *	the appropriate access method's salvager, marking seen pages in vdp.
+ *
+ * PUBLIC: int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_leaf(dbp, vdp, pgno, h, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	PAGE *h;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	ENV *env;
+	u_int32_t pgtype;
+
+	env = dbp->env;
+	DB_ASSERT(env, LF_ISSET(DB_SALVAGE));
+
+	/* Pages already handled in the subdb pass need no further work. */
+	if (__db_salvage_isdone(vdp, pgno))
+		return (0);
+
+	pgtype = TYPE(h);
+	if (pgtype == P_HASH_UNSORTED || pgtype == P_HASH)
+		return (__ham_salvage(dbp, vdp,
+		    pgno, h, handle, callback, flags));
+	if (pgtype == P_LBTREE || pgtype == P_LRECNO)
+		return (__bam_salvage(dbp, vdp,
+		    pgno, pgtype, h, handle, callback, NULL, flags));
+	if (pgtype == P_QAMDATA)
+		return (__qam_salvage(dbp, vdp,
+		    pgno, h, handle, callback, flags));
+
+	/*
+	 * Any other type needs no error display here; the page type was
+	 * already checked and reported on.
+	 */
+	return (0);
+}
+
+/*
+ * __db_salvage_unknowns --
+ *	Walk through the salvager database, printing with key "UNKNOWN"
+ *	any pages we haven't dealt with.
+ */
+static int
+__db_salvage_unknowns(dbp, vdp, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT unkdbt, key, *dbt;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t pgtype, ovfl_bufsz, tmp_flags;
+	int ret, t_ret;
+	void *ovflbuf;
+
+	dbc = NULL;
+	env = dbp->env;
+	mpf = dbp->mpf;
+
+	/* Placeholder key for items whose real key cannot be recovered. */
+	DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);
+
+	if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+		return (ret);
+	ovfl_bufsz = dbp->pgsize;
+
+	/*
+	 * We make two passes -- in the first pass, skip SALVAGE_OVERFLOW
+	 * pages, because they may be referenced by the standard database
+	 * pages that we're resolving.
+	 *
+	 * Throughout both passes, ret preserves the first error seen and
+	 * t_ret carries the status of the current call; per-page failures
+	 * do not stop the walk.
+	 */
+	while ((t_ret =
+	    __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 1)) == 0) {
+		if ((t_ret = __memp_fget(mpf,
+		    &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			continue;
+		}
+
+		dbt = NULL;
+		tmp_flags = 0;
+		switch (pgtype) {
+		case SALVAGE_LDUP:
+		case SALVAGE_LRECNODUP:
+			/* Dup pages have no recoverable key; use "UNKNOWN". */
+			dbt = &unkdbt;
+			tmp_flags = DB_SA_UNKNOWNKEY;
+			/* FALLTHROUGH */
+		case SALVAGE_IBTREE:
+		case SALVAGE_LBTREE:
+		case SALVAGE_LRECNO:
+			if ((t_ret = __bam_salvage(
+			    dbp, vdp, pgno, pgtype, h, handle,
+			    callback, dbt, tmp_flags | flags)) != 0 && ret == 0)
+				ret = t_ret;
+			break;
+		case SALVAGE_OVERFLOW:
+			DB_ASSERT(env, 0);	/* Shouldn't ever happen. */
+			break;
+		case SALVAGE_HASH:
+			if ((t_ret = __ham_salvage(dbp, vdp,
+			    pgno, h, handle, callback, flags)) != 0 && ret == 0)
+				ret = t_ret;
+			break;
+		case SALVAGE_INVALID:
+		case SALVAGE_IGNORE:
+		default:
+			/*
+			 * Shouldn't happen, but if it does, just do what the
+			 * nice man says.
+			 */
+			DB_ASSERT(env, 0);
+			break;
+		}
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	/* We should have reached the end of the database. */
+	if (t_ret == DB_NOTFOUND)
+		t_ret = 0;
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Re-open the cursor so we traverse the database again. */
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+	dbc = NULL;
+
+	/* Second pass: deal with any remaining overflow pages. */
+	while ((t_ret =
+	    __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 0)) == 0) {
+		if ((t_ret = __memp_fget(mpf,
+		    &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			continue;
+		}
+
+		switch (pgtype) {
+		case SALVAGE_OVERFLOW:
+			/*
+			 * XXX:
+			 * This may generate multiple "UNKNOWN" keys in
+			 * a database with no dups.  What to do?
+			 */
+			if ((t_ret = __db_safe_goff(dbp, vdp,
+			    pgno, &key, &ovflbuf, &ovfl_bufsz, flags)) != 0 ||
+			    ((vdp->type == DB_BTREE || vdp->type == DB_HASH) &&
+			    (t_ret = __db_vrfy_prdbt(&unkdbt,
+			    0, " ", handle, callback, 0, vdp)) != 0) ||
+			    (t_ret = __db_vrfy_prdbt(
+			    &key, 0, " ", handle, callback, 0, vdp)) != 0)
+				if (ret == 0)
+					ret = t_ret;
+			break;
+		default:
+			DB_ASSERT(env, 0);	/* Shouldn't ever happen. */
+			break;
+		}
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	/* We should have reached the end of the database. */
+	if (t_ret == DB_NOTFOUND)
+		t_ret = 0;
+	if (t_ret != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	__os_free(env, ovflbuf);
+
+	return (ret);
+}
+
+/*
+ * Offset of the ith inp array entry, which we can compare to the offset
+ * the entry stores.  Used by __db_vrfy_inpitem to detect item offsets
+ * that point back into the inp array itself.
+ */
+#define	INP_OFFSET(dbp, h, i) \
+    ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h)))
+
+/*
+ * __db_vrfy_inpitem --
+ *	Verify that a single entry in the inp array is sane, and update
+ *	the high water mark and current item offset.  (The former of these is
+ *	used for state information between calls, and is required; it must
+ *	be initialized to the pagesize before the first call.)
+ *
+ *	Returns DB_VERIFY_FATAL if inp has collided with the data,
+ *	since verification can't continue from there; returns DB_VERIFY_BAD
+ *	if anything else is wrong.
+ *
+ * PUBLIC: int __db_vrfy_inpitem __P((DB *, PAGE *,
+ * PUBLIC:     db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
+ */
+int
+__db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
+	DB *dbp;
+	PAGE *h;
+	db_pgno_t pgno;
+	u_int32_t i;
+	int is_btree;
+	u_int32_t flags, *himarkp, *offsetp;
+{
+	BKEYDATA *bk;
+	ENV *env;
+	db_indx_t *inp, offset, len;
+
+	env = dbp->env;
+
+	DB_ASSERT(env, himarkp != NULL);
+	inp = P_INP(dbp, h);
+
+	/*
+	 * Check that the inp array, which grows from the beginning of the
+	 * page forward, has not collided with the data, which grow from the
+	 * end of the page backward.
+	 */
+	if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
+		/* We've collided with the data.  We need to bail. */
+		EPRINT((env, "Page %lu: entries listing %lu overlaps data",
+		    (u_long)pgno, (u_long)i));
+		return (DB_VERIFY_FATAL);
+	}
+
+	offset = inp[i];
+
+	/*
+	 * Check that the item offset is reasonable: it points somewhere
+	 * after the inp array and before the end of the page.
+	 */
+	if (offset <= INP_OFFSET(dbp, h, i) || offset >= dbp->pgsize) {
+		EPRINT((env, "Page %lu: bad offset %lu at page index %lu",
+		    (u_long)pgno, (u_long)offset, (u_long)i));
+		return (DB_VERIFY_BAD);
+	}
+
+	/* Update the high-water mark (what HOFFSET should be) */
+	if (offset < *himarkp)
+		*himarkp = offset;
+
+	/*
+	 * Only btree-format items carry a BKEYDATA header we can use to
+	 * bounds-check the item length; other formats skip these checks.
+	 */
+	if (is_btree) {
+		/*
+		 * Check alignment; if it's unaligned, it's unsafe to
+		 * manipulate this item.
+		 */
+		if (offset != DB_ALIGN(offset, sizeof(u_int32_t))) {
+			EPRINT((env,
+			    "Page %lu: unaligned offset %lu at page index %lu",
+			    (u_long)pgno, (u_long)offset, (u_long)i));
+			return (DB_VERIFY_BAD);
+		}
+
+		/*
+		 * Check that the item length remains on-page.
+		 */
+		bk = GET_BKEYDATA(dbp, h, i);
+
+		/*
+		 * We need to verify the type of the item here;
+		 * we can't simply assume that it will be one of the
+		 * expected three.  If it's not a recognizable type,
+		 * it can't be considered to have a verifiable
+		 * length, so it's not possible to certify it as safe.
+		 */
+		switch (B_TYPE(bk->type)) {
+		case B_KEYDATA:
+			len = bk->len;
+			break;
+		case B_DUPLICATE:
+		case B_OVERFLOW:
+			/* Both are fixed-size on-page structures. */
+			len = BOVERFLOW_SIZE;
+			break;
+		default:
+			EPRINT((env,
+			    "Page %lu: item %lu of unrecognizable type",
+			    (u_long)pgno, (u_long)i));
+			return (DB_VERIFY_BAD);
+		}
+
+		if ((size_t)(offset + len) > dbp->pgsize) {
+			EPRINT((env,
+			    "Page %lu: item %lu extends past page boundary",
+			    (u_long)pgno, (u_long)i));
+			return (DB_VERIFY_BAD);
+		}
+	}
+
+	if (offsetp != NULL)
+		*offsetp = offset;
+	return (0);
+}
+
+/*
+ * __db_vrfy_duptype--
+ *	Given a page number and a set of flags to __bam_vrfy_subtree,
+ *	verify that the dup tree type is correct--i.e., it's a recno
+ *	if DUPSORT is not set and a btree if it is.
+ *
+ * PUBLIC: int __db_vrfy_duptype
+ * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_vrfy_duptype(dbp, vdp, pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t flags;
+{
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int bad, ret;
+
+	env = dbp->env;
+	bad = 0;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	if (pip->type == P_IBTREE || pip->type == P_LDUP) {
+		/* Btree-format dup pages require DB_ST_DUPSORT. */
+		if (!LF_ISSET(DB_ST_DUPSORT)) {
+			EPRINT((env,
+	    "Page %lu: sorted duplicate set in unsorted-dup database",
+			    (u_long)pgno));
+			bad = 1;
+		}
+	} else if (pip->type == P_IRECNO || pip->type == P_LRECNO) {
+		/* Recno-format dup pages require DB_ST_DUPSORT be clear. */
+		if (LF_ISSET(DB_ST_DUPSORT)) {
+			EPRINT((env,
+	    "Page %lu: unsorted duplicate set in sorted-dup database",
+			    (u_long)pgno));
+			bad = 1;
+		}
+	} else {
+		/*
+		 * If the page is entirely zeroed, its pip->type will be a lie
+		 * (we assumed it was a hash page, as they're allowed to be
+		 * zeroed); handle this case specially.
+		 */
+		if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+			ZEROPG_ERR_PRINT(env, pgno, "duplicate page");
+		else
+			EPRINT((env,
+	    "Page %lu: duplicate page of inappropriate type %lu",
+			    (u_long)pgno, (u_long)pip->type));
+		bad = 1;
+	}
+
+	if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0)
+		return (ret);
+	return (bad ? DB_VERIFY_BAD : 0);
+}
+
+/*
+ * __db_salvage_duptree --
+ *	Attempt to salvage a given duplicate tree, given its alleged root.
+ *
+ *	The key that corresponds to this dup set has been passed to us
+ *	in DBT *key.  Because data items follow keys, though, it has been
+ *	printed once already.
+ *
+ *	The basic idea here is that pgno ought to be a P_LDUP, a P_LRECNO, a
+ *	P_IBTREE, or a P_IRECNO.  If it's an internal page, use the verifier
+ *	functions to make sure it's safe; if it's not, we simply bail and the
+ *	data will have to be printed with no key later on.  if it is safe,
+ *	recurse on each of its children.
+ *
+ *	Whether or not it's safe, if it's a leaf page, __bam_salvage it.
+ *
+ *	At all times, use the DB hanging off vdp to mark and check what we've
+ *	done, so each page gets printed exactly once and we don't get caught
+ *	in any cycles.
+ *
+ * PUBLIC: int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	DBT *key;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret, t_ret;
+
+	mpf = dbp->mpf;
+
+	/* Reject obviously bogus page numbers before touching the mpool. */
+	if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno))
+		return (DB_VERIFY_BAD);
+
+	/* We have a plausible page.  Try it. */
+	if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+		return (ret);
+
+	switch (TYPE(h)) {
+	case P_IBTREE:
+	case P_IRECNO:
+		if ((ret = __db_vrfy_common(dbp, vdp, h, pgno, flags)) != 0)
+			goto err;
+		if ((ret = __bam_vrfy(dbp,
+		    vdp, h, pgno, flags | DB_NOORDERCHK)) != 0 ||
+		    (ret = __db_salvage_markdone(vdp, pgno)) != 0)
+			goto err;
+		/*
+		 * We have a known-healthy internal page.  Walk it.
+		 */
+		if ((ret = __bam_salvage_walkdupint(dbp, vdp, h, key,
+		    handle, callback, flags)) != 0)
+			goto err;
+		break;
+	case P_LRECNO:
+	case P_LDUP:
+		if ((ret = __bam_salvage(dbp,
+		    vdp, pgno, TYPE(h), h, handle, callback, key, flags)) != 0)
+			goto err;
+		break;
+	default:
+		ret = DB_VERIFY_BAD;
+		goto err;
+	}
+
+	/* Release the page on every path; preserve the first error. */
+err:	if ((t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_all --
+ *	Salvage only the leaves we find by walking the tree.  If we have
+ *	subdbs, salvage each of them individually.  *hassubsp is set to 1
+ *	iff the base metadata page says the file contains subdatabases.
+ */
+static int
+__db_salvage_all(dbp, vdp, handle, callback, flags, hassubsp)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+	int *hassubsp;
+{
+	DB *pgset;
+	DBC *pgsc;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	VRFY_PAGEINFO *pip;
+	db_pgno_t p, meta_pgno;
+	int ret, t_ret;
+
+	*hassubsp = 0;
+
+	env = dbp->env;
+	pgset = NULL;
+	pgsc = NULL;
+	mpf = dbp->mpf;
+	h = NULL;
+	pip = NULL;
+	ret = 0;
+
+	/*
+	 * Check to make sure the page is OK and find out if it contains
+	 * subdatabases.  Failures in this chain are deliberately not
+	 * propagated into ret: salvage is best-effort, so we press on even
+	 * if the metadata page is unreadable.
+	 */
+	meta_pgno = PGNO_BASE_MD;
+	if ((t_ret = __memp_fget(mpf,
+	    &meta_pgno, vdp->thread_info, NULL, 0, &h)) == 0 &&
+	    (t_ret = __db_vrfy_common(dbp, vdp, h, PGNO_BASE_MD, flags)) == 0 &&
+	    (t_ret = __db_salvage_pg(
+	    dbp, vdp, PGNO_BASE_MD, h, handle, callback, flags)) == 0 &&
+	    (t_ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) == 0)
+		if (F_ISSET(pip, VRFY_HAS_SUBDBS))
+			*hassubsp = 1;
+	if (pip != NULL &&
+	    (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	if (h != NULL) {
+		if ((t_ret = __memp_fput(mpf,
+		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		h = NULL;
+	}
+	if (ret != 0)
+		return (ret);
+
+	/* Without subdatabases, we can just dump from the meta pgno. */
+	if (*hassubsp == 0)
+		return (__db_salvage(dbp,
+		    vdp, PGNO_BASE_MD, handle, callback, flags));
+
+	/*
+	 * We have subdbs.  Try to crack them.
+	 *
+	 * To do so, get a set of leaf pages in the master database, and then
+	 * walk each of the valid ones, salvaging subdbs as we go.  If any
+	 * prove invalid, just drop them; we'll pick them up on a later pass.
+	 */
+	if ((ret = __db_vrfy_pgset(env,
+	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+		goto err;
+	if ((ret = __db_meta2pgset(dbp, vdp, PGNO_BASE_MD, flags, pgset)) != 0)
+		goto err;
+	if ((ret = __db_cursor(pgset, vdp->thread_info, NULL, &pgsc, 0)) != 0)
+		goto err;
+	while ((t_ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+		if ((t_ret = __memp_fget(mpf,
+		    &p, vdp->thread_info, NULL, 0, &h)) == 0 &&
+		    (t_ret = __db_vrfy_common(dbp, vdp, h, p, flags)) == 0 &&
+		    (t_ret =
+		    __bam_vrfy(dbp, vdp, h, p, flags | DB_NOORDERCHK)) == 0)
+			t_ret = __db_salvage_subdbpg(
+			    dbp, vdp, h, handle, callback, flags);
+		if (t_ret != 0 && ret == 0)
+			ret = t_ret;
+		if (h != NULL) {
+			if ((t_ret = __memp_fput(mpf, vdp->thread_info,
+			    h, dbp->priority)) != 0 && ret == 0)
+				ret = t_ret;
+			h = NULL;
+		}
+	}
+
+	/* DB_NOTFOUND is the expected end-of-cursor return. */
+	if (t_ret != DB_NOTFOUND && ret == 0)
+		ret = t_ret;
+
+err:	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pgset != NULL &&
+	    (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if (h != NULL &&
+	    (t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_salvage_subdbpg --
+ *	Given a known-good leaf page in the master database, salvage all
+ *	leaf pages corresponding to each subdb.
+ */
+static int
+__db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *master;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	BKEYDATA *bkkey, *bkdata;
+	BOVERFLOW *bo;
+	DBT key;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *subpg;
+	db_indx_t i;
+	db_pgno_t meta_pgno;
+	int ret, err_ret, t_ret;
+	char *subdbname;
+	u_int32_t ovfl_bufsz;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	ret = err_ret = 0;
+	subdbname = NULL;
+	ovfl_bufsz = 0;
+
+	/*
+	 * For each key/data pair on the master page, get and salvage the
+	 * set of pages corresponding to that subdatabase entry.  err_ret
+	 * remembers the first per-entry failure while we keep going.
+	 */
+	for (i = 0; i < NUM_ENT(master); i += P_INDX) {
+		bkkey = GET_BKEYDATA(dbp, master, i);
+		bkdata = GET_BKEYDATA(dbp, master, i + O_INDX);
+
+		/* Get the subdatabase name. */
+		if (B_TYPE(bkkey->type) == B_OVERFLOW) {
+			/*
+			 * We can, in principle anyway, have a subdb
+			 * name so long it overflows.  Ick.
+			 */
+			bo = (BOVERFLOW *)bkkey;
+			if ((ret = __db_safe_goff(dbp, vdp, bo->pgno,
+			    &key, &subdbname, &ovfl_bufsz, flags)) != 0) {
+				err_ret = DB_VERIFY_BAD;
+				continue;
+			}
+
+			/* Nul-terminate it. */
+			if (ovfl_bufsz < key.size + 1) {
+				if ((ret = __os_realloc(env,
+				    key.size + 1, &subdbname)) != 0)
+					goto err;
+				ovfl_bufsz = key.size + 1;
+			}
+			subdbname[key.size] = '\0';
+		} else if (B_TYPE(bkkey->type) == B_KEYDATA) {
+			if (ovfl_bufsz < (u_int32_t)bkkey->len + 1) {
+				if ((ret = __os_realloc(env,
+				    bkkey->len + 1, &subdbname)) != 0)
+					goto err;
+				ovfl_bufsz = bkkey->len + 1;
+			}
+			DB_ASSERT(env, subdbname != NULL);
+			memcpy(subdbname, bkkey->data, bkkey->len);
+			subdbname[bkkey->len] = '\0';
+		}
+
+		/* Get the corresponding pgno. */
+		if (bkdata->len != sizeof(db_pgno_t)) {
+			err_ret = DB_VERIFY_BAD;
+			continue;
+		}
+		memcpy(&meta_pgno,
+		    (db_pgno_t *)bkdata->data, sizeof(db_pgno_t));
+
+		/*
+		 * Subdatabase meta pgnos are stored in network byte
+		 * order for cross-endian compatibility.  Swap if appropriate.
+		 */
+		DB_NTOHL_SWAP(env, &meta_pgno);
+
+		/* If we can't get the subdb meta page, just skip the subdb. */
+		if (!IS_VALID_PGNO(meta_pgno) || (ret = __memp_fget(mpf,
+		    &meta_pgno, vdp->thread_info, NULL, 0, &subpg)) != 0) {
+			err_ret = ret;
+			continue;
+		}
+
+		/*
+		 * Verify the subdatabase meta page.  This has two functions.
+		 * First, if it's bad, we have no choice but to skip the subdb
+		 * and let the pages just get printed on a later pass.  Second,
+		 * the access-method-specific meta verification routines record
+		 * the various state info (such as the presence of dups)
+		 * that we need for __db_prheader().
+		 */
+		if ((ret =
+		    __db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) {
+			err_ret = ret;
+			(void)__memp_fput(mpf,
+			    vdp->thread_info, subpg, dbp->priority);
+			continue;
+		}
+		switch (TYPE(subpg)) {
+		case P_BTREEMETA:
+			if ((ret = __bam_vrfy_meta(dbp,
+			    vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) {
+				err_ret = ret;
+				(void)__memp_fput(mpf,
+				    vdp->thread_info, subpg, dbp->priority);
+				continue;
+			}
+			break;
+		case P_HASHMETA:
+			if ((ret = __ham_vrfy_meta(dbp,
+			    vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) {
+				err_ret = ret;
+				(void)__memp_fput(mpf,
+				    vdp->thread_info, subpg, dbp->priority);
+				continue;
+			}
+			break;
+		default:
+			/* This isn't an appropriate page; skip this subdb. */
+			err_ret = DB_VERIFY_BAD;
+			/*
+			 * Bug fix: release the pinned page before moving on.
+			 * The original code skipped this __memp_fput, leaking
+			 * a buffer reference for every entry whose meta page
+			 * had an unexpected type.
+			 */
+			(void)__memp_fput(mpf,
+			    vdp->thread_info, subpg, dbp->priority);
+			continue;
+		}
+
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, subpg, dbp->priority)) != 0) {
+			err_ret = ret;
+			continue;
+		}
+
+		/* Print a subdatabase header. */
+		if ((ret = __db_prheader(dbp,
+		    subdbname, 0, 0, handle, callback, vdp, meta_pgno)) != 0)
+			goto err;
+
+		/* Salvage meta_pgno's tree. */
+		if ((ret = __db_salvage(dbp,
+		    vdp, meta_pgno, handle, callback, flags)) != 0)
+			err_ret = ret;
+
+		/* Print a subdatabase footer. */
+		if ((ret = __db_prfooter(handle, callback)) != 0)
+			goto err;
+	}
+
+err:	if (subdbname != NULL)
+		__os_free(env, subdbname);
+
+	/* Mark the master page itself as handled. */
+	if ((t_ret = __db_salvage_markdone(vdp, PGNO(master))) != 0)
+		return (t_ret);
+
+	return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_salvage --
+ *	Given a meta page number, salvage all data from leaf pages found by
+ *	walking the meta page's tree.
+ */
+static int
+__db_salvage(dbp, vdp, meta_pgno, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t meta_pgno;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+
+{
+	DB *pgset;
+	DBC *dbc, *pgsc;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *subpg;
+	db_pgno_t p;
+	int err_ret, ret, t_ret;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	err_ret = ret = t_ret = 0;
+	pgsc = NULL;
+	pgset = NULL;
+	dbc = NULL;
+
+	if ((ret = __db_vrfy_pgset(env,
+	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
+		goto err;
+
+	/* Get all page numbers referenced from this meta page. */
+	if ((ret = __db_meta2pgset(dbp, vdp, meta_pgno,
+	    flags, pgset)) != 0) {
+		err_ret = ret;
+		goto err;
+	}
+
+	if ((ret = __db_cursor(pgset,
+	    vdp->thread_info, NULL, &pgsc, 0)) != 0)
+		goto err;
+
+	/* Queue page get/put goes through a cursor, so open one up front. */
+	if (dbp->type == DB_QUEUE &&
+	    (ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
+		goto err;
+
+	/*
+	 * Salvage every page in pgset.  err_ret records the first per-page
+	 * failure; the walk itself continues regardless.
+	 */
+	while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
+		if (dbp->type == DB_QUEUE) {
+#ifdef HAVE_QUEUE
+			ret = __qam_fget(dbc, &p, 0, &subpg);
+#else
+			ret = __db_no_queue_am(env);
+#endif
+			/* Don't report an error for pages not found in a queue.
+			 * The pgset is a best guess, it doesn't know about
+			 * deleted extents which leads to this error.
+			 */
+			if (ret == ENOENT || ret == DB_PAGE_NOTFOUND)
+				continue;
+		} else
+			ret = __memp_fget(mpf,
+			    &p, vdp->thread_info, NULL, 0, &subpg);
+		if (ret != 0) {
+			err_ret = ret;
+			continue;
+		}
+
+		if ((ret = __db_salvage_pg(dbp, vdp, p, subpg,
+		    handle, callback, flags)) != 0)
+			err_ret = ret;
+
+		if (dbp->type == DB_QUEUE)
+#ifdef HAVE_QUEUE
+			ret = __qam_fput(dbc, p, subpg, dbp->priority);
+#else
+			ret = __db_no_queue_am(env);
+#endif
+		else
+			ret = __memp_fput(mpf,
+			    vdp->thread_info, subpg, dbp->priority);
+		if (ret != 0)
+			err_ret = ret;
+	}
+
+	/* DB_NOTFOUND is the normal end-of-set indication. */
+	if (ret == DB_NOTFOUND)
+		ret = 0;
+
+	/*
+	 * NOTE(review): these closes overwrite ret unconditionally (no
+	 * "ret == 0" guard as used elsewhere in this file); err_ret still
+	 * carries the first per-page error and takes precedence below.
+	 */
+err:
+	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0)
+		ret = t_ret;
+	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
+		ret = t_ret;
+	if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
+		ret = t_ret;
+
+	return ((err_ret != 0) ? err_ret : ret);
+}
+
+/*
+ * __db_meta2pgset --
+ *	Given a known-safe meta page number, return the set of pages
+ *	corresponding to the database it represents.  Return DB_VERIFY_BAD if
+ *	it's not a suitable meta page or is invalid.
+ */
+static int
+__db_meta2pgset(dbp, vdp, pgno, flags, pgset)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t flags;
+	DB *pgset;
+{
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	int ret, t_ret;
+
+	mpf = dbp->mpf;
+
+	if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+		return (ret);
+
+	switch (TYPE(h)) {
+	case P_BTREEMETA:
+		ret = __bam_meta2pgset(dbp, vdp, (BTMETA *)h, flags, pgset);
+		break;
+	case P_HASHMETA:
+		ret = __ham_meta2pgset(dbp, vdp, (HMETA *)h, flags, pgset);
+		break;
+	case P_QAMMETA:
+#ifdef HAVE_QUEUE
+		ret = __qam_meta2pgset(dbp, vdp, pgset);
+		break;
+#endif
+		/*
+		 * Without queue support compiled in, a queue meta page is
+		 * as unusable as any other unexpected type.
+		 */
+		/* FALLTHROUGH */
+	default:
+		ret = DB_VERIFY_BAD;
+		break;
+	}
+
+	/*
+	 * Release the meta page, keeping the first error seen (the original
+	 * returned the fput error even when ret was already set, losing the
+	 * primary verification result).
+	 */
+	if ((t_ret = __memp_fput(mpf,
+	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_guesspgsize --
+ *	Try to guess what the pagesize is if the one on the meta page
+ *	and the one in the db are invalid.
+ */
+static u_int
+__db_guesspgsize(env, fhp)
+	ENV *env;
+	DB_FH *fhp;
+{
+	db_pgno_t i;
+	size_t nr;
+	u_int32_t guess;
+	u_int8_t type;
+
+	/* Probe candidate sizes from the largest down, halving each time. */
+	for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) {
+		/*
+		 * We try to read three pages ahead after the first one
+		 * and make sure we have plausible types for all of them.
+		 * If the seeks fail, continue with a smaller size;
+		 * we're probably just looking past the end of the database.
+		 * If they succeed and the types are reasonable, also continue
+		 * with a size smaller; we may be looking at pages N,
+		 * 2N, and 3N for some N > 1.
+		 *
+		 * As soon as we hit an invalid type, we stop and return
+		 * our previous guess; that last one was probably the page size.
+		 */
+		for (i = 1; i <= 3; i++) {
+			/* Seek to the type byte of probe page i at "guess". */
+			if (__os_seek(
+			    env, fhp, i, guess, SSZ(DBMETA, type)) != 0)
+				break;
+			if (__os_read(env,
+			    fhp, &type, 1, &nr) != 0 || nr == 0)
+				break;
+			/*
+			 * guess << 1 is the previous (twice as large) guess,
+			 * the last one whose probes all looked plausible.
+			 *
+			 * NOTE(review): if the very first guess
+			 * (DB_MAX_PGSIZE) hits an invalid type, this returns
+			 * DB_MAX_PGSIZE << 1 -- confirm callers tolerate an
+			 * out-of-range result.
+			 */
+			if (type == P_INVALID || type >= P_PAGETYPE_MAX)
+				return (guess << 1);
+		}
+	}
+
+	/*
+	 * If we're just totally confused--the corruption takes up most of the
+	 * beginning pages of the database--go with the default size.
+	 */
+	return (DB_DEF_IOSIZE);
+}
diff --git a/db/db_vrfy_stub.c b/db/db_vrfy_stub.c
new file mode 100644
index 0000000..9ed5acd
--- /dev/null
+++ b/db/db_vrfy_stub.c
@@ -0,0 +1,117 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#ifndef HAVE_VERIFY
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/db_verify.h"
+
+/*
+ * If the library wasn't compiled with the verification support, various
+ * routines aren't available. Stub them here, returning an appropriate
+ * error.
+ */
+
+static int __db_novrfy __P((ENV *));
+
+/*
+ * __db_novrfy --
+ *	Error when a Berkeley DB build doesn't include the access method.
+ *
+ *	Reports the missing-support message on the environment and
+ *	always returns DB_OPNOTSUP.
+ */
+static int
+__db_novrfy(env)
+	ENV *env;
+{
+	__db_errx(env,
+	    "library build did not include support for database verification");
+	return (DB_OPNOTSUP);
+}
+
+/*
+ * __db_verify_pp --
+ *	DB->verify stub used when verification support was not compiled in.
+ *
+ *	Reports DB_OPNOTSUP, but still closes the DB handle first: like
+ *	the real verify method, this call is a destructor for dbp.
+ */
+int
+__db_verify_pp(dbp, file, database, outfile, flags)
+	DB *dbp;
+	const char *file, *database;
+	FILE *outfile;
+	u_int32_t flags;
+{
+	int ret;
+
+	COMPQUIET(file, NULL);
+	COMPQUIET(database, NULL);
+	COMPQUIET(outfile, NULL);
+	COMPQUIET(flags, 0);
+
+	ret = __db_novrfy(dbp->env);
+
+	/* The verify method is a destructor. */
+	(void)__db_close(dbp, NULL, 0);
+
+	return (ret);
+}
+
+/*
+ * __db_verify_internal --
+ *	Stub for the internal verification entry point.
+ *
+ *	NOTE(review): unlike the other stubs this quietly returns 0
+ *	rather than DB_OPNOTSUP -- presumably so internal callers treat
+ *	verification as a no-op; confirm against callers before changing.
+ */
+int
+__db_verify_internal(dbp, name, subdb, handle, callback, flags)
+	DB *dbp;
+	const char *name, *subdb;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	COMPQUIET(dbp, NULL);
+	COMPQUIET(name, NULL);
+	COMPQUIET(subdb, NULL);
+	COMPQUIET(handle, NULL);
+	COMPQUIET(callback, NULL);
+	COMPQUIET(flags, 0);
+	return (0);
+}
+
+/*
+ * __db_vrfy_getpageinfo --
+ *	Stub: always fails with DB_OPNOTSUP via __db_novrfy.
+ */
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	VRFY_PAGEINFO **pipp;
+{
+	COMPQUIET(pgno, 0);
+	COMPQUIET(pipp, NULL);
+	return (__db_novrfy(vdp->pgdbp->env));
+}
+
+/*
+ * __db_vrfy_putpageinfo --
+ *	Stub: always fails with DB_OPNOTSUP via __db_novrfy.
+ */
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+	ENV *env;
+	VRFY_DBINFO *vdp;
+	VRFY_PAGEINFO *pip;
+{
+	COMPQUIET(vdp, NULL);
+	COMPQUIET(pip, NULL);
+	return (__db_novrfy(env));
+}
+
+/*
+ * __db_vrfy_prdbt --
+ *	Stub: always fails with DB_OPNOTSUP via __db_novrfy.
+ */
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
+	DBT *dbtp;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	int is_recno;
+	VRFY_DBINFO *vdp;
+{
+	COMPQUIET(dbtp, NULL);
+	COMPQUIET(checkprint, 0);
+	COMPQUIET(prefix, NULL);
+	COMPQUIET(handle, NULL);
+	COMPQUIET(callback, NULL);
+	COMPQUIET(is_recno, 0);
+	return (__db_novrfy(vdp->pgdbp->env));
+}
+#endif /* !HAVE_VERIFY */
diff --git a/db/db_vrfyutil.c b/db/db_vrfyutil.c
new file mode 100644
index 0000000..04d73d9
--- /dev/null
+++ b/db/db_vrfyutil.c
@@ -0,0 +1,916 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2000-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/db_am.h"
+
+static int __db_vrfy_childinc __P((DBC *, VRFY_CHILDINFO *));
+static int __db_vrfy_pageinfo_create __P((ENV *, VRFY_PAGEINFO **));
+
+/*
+ * __db_vrfy_dbinfo_create --
+ *	Allocate and initialize a VRFY_DBINFO structure.
+ *
+ *	Creates two working btrees -- cdbp (child info, with duplicates)
+ *	and pgdbp (per-page info) -- plus a pgset database, all opened
+ *	non-durable when transactions are on.  On error every handle
+ *	created so far is closed and the error is returned.
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_create
+ * PUBLIC: __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **));
+ */
+int
+__db_vrfy_dbinfo_create(env, ip, pgsize, vdpp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t pgsize;
+	VRFY_DBINFO **vdpp;
+{
+	DB *cdbp, *pgdbp, *pgset;
+	VRFY_DBINFO *vdp;
+	int ret;
+
+	vdp = NULL;
+	cdbp = pgdbp = pgset = NULL;
+
+	/*
+	 * NOTE(review): allocated with a NULL env but freed below with
+	 * env -- presumably equivalent here; confirm.
+	 */
+	if ((ret = __os_calloc(NULL, 1, sizeof(VRFY_DBINFO), &vdp)) != 0)
+		goto err;
+
+	/* Child-info database: one parent page may have many children. */
+	if ((ret = __db_create_internal(&cdbp, env, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0)
+		goto err;
+
+	if ((ret = __db_set_pagesize(cdbp, pgsize)) != 0)
+		goto err;
+
+	/* If transactional, make sure we don't log. */
+	if (TXN_ON(env) &&
+	    (ret = __db_set_flags(cdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	if ((ret = __db_open(cdbp, ip,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	/* Per-page info database. */
+	if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0)
+		goto err;
+
+	/* If transactional, make sure we don't log. */
+	if (TXN_ON(env) &&
+	    (ret = __db_set_flags(pgdbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+
+	if ((ret = __db_open(pgdbp, ip,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	/* pgset is created last; nothing after it can fail, so the error
+	 * path below never has to close it. */
+	if ((ret = __db_vrfy_pgset(env, ip, pgsize, &pgset)) != 0)
+		goto err;
+
+	LIST_INIT(&vdp->subdbs);
+	LIST_INIT(&vdp->activepips);
+
+	vdp->cdbp = cdbp;
+	vdp->pgdbp = pgdbp;
+	vdp->pgset = pgset;
+	vdp->thread_info = ip;
+	*vdpp = vdp;
+	return (0);
+
+err:	if (cdbp != NULL)
+		(void)__db_close(cdbp, NULL, 0);
+	if (pgdbp != NULL)
+		(void)__db_close(pgdbp, NULL, 0);
+	if (vdp != NULL)
+		__os_free(env, vdp);
+	return (ret);
+}
+
+/*
+ * __db_vrfy_dbinfo_destroy --
+ *	Destructor for VRFY_DBINFO.  Destroys VRFY_PAGEINFOs and deallocates
+ *	structure.
+ *
+ *	Always frees vdp itself; returns the FIRST error encountered
+ *	while releasing page structures and closing the working
+ *	databases (a later failure no longer overwrites an earlier one).
+ *
+ * PUBLIC: int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_dbinfo_destroy(env, vdp)
+	ENV *env;
+	VRFY_DBINFO *vdp;
+{
+	VRFY_CHILDINFO *c;
+	int t_ret, ret;
+
+	ret = 0;
+
+	/*
+	 * Discard active page structures.  Ideally there wouldn't be any,
+	 * but in some error cases we may not have cleared them all out.
+	 */
+	while (LIST_FIRST(&vdp->activepips) != NULL)
+		if ((t_ret = __db_vrfy_putpageinfo(
+		    env, vdp, LIST_FIRST(&vdp->activepips))) != 0) {
+			if (ret == 0)
+				ret = t_ret;
+			break;
+		}
+
+	/* Discard subdatabase list structures. */
+	while ((c = LIST_FIRST(&vdp->subdbs)) != NULL) {
+		LIST_REMOVE(c, links);
+		__os_free(NULL, c);
+	}
+
+	/*
+	 * Close all three working databases; the "&& ret == 0" guard
+	 * keeps the first error (the pgdbp close previously clobbered
+	 * any earlier error code).
+	 */
+	if ((t_ret = __db_close(vdp->pgdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __db_close(vdp->cdbp, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if ((t_ret = __db_close(vdp->pgset, NULL, 0)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (vdp->extents != NULL)
+		__os_free(env, vdp->extents);
+	__os_free(env, vdp);
+	return (ret);
+}
+
+/*
+ * __db_vrfy_getpageinfo --
+ *	Get a PAGEINFO structure for a given page, creating it if necessary.
+ *
+ *	On success the returned structure has its reference count
+ *	bumped; release it with __db_vrfy_putpageinfo.
+ *
+ * PUBLIC: int __db_vrfy_getpageinfo
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **));
+ */
+int
+__db_vrfy_getpageinfo(vdp, pgno, pipp)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	VRFY_PAGEINFO **pipp;
+{
+	DB *pgdbp;
+	DBT key, data;
+	ENV *env;
+	VRFY_PAGEINFO *pip;
+	int ret;
+
+	/*
+	 * We want a page info struct.  There are three places to get it from,
+	 * in decreasing order of preference:
+	 *
+	 * 1. vdp->activepips.  If it's already "checked out", we're
+	 *    already using it, we return the same exact structure with a
+	 *    bumped refcount.  This is necessary because this code is
+	 *    replacing array accesses, and it's common for f() to make some
+	 *    changes to a pip, and then call g() and h() which each make
+	 *    changes to the same pip.  vdps are never shared between threads
+	 *    (they're never returned to the application), so this is safe.
+	 * 2. The pgdbp.  It's not in memory, but it's in the database, so
+	 *    get it, give it a refcount of 1, and stick it on activepips.
+	 * 3. malloc.  It doesn't exist yet;  create it, then stick it on
+	 *    activepips.  We'll put it in the database when we putpageinfo
+	 *    later.
+	 */
+
+	/* Case 1. */
+	LIST_FOREACH(pip, &vdp->activepips, links)
+		if (pip->pgno == pgno)
+			goto found;
+
+	/* Case 2. */
+	pgdbp = vdp->pgdbp;
+	env = pgdbp->env;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+	/* DB allocates the returned record; freed later via __os_ufree. */
+	F_SET(&data, DB_DBT_MALLOC);
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	if ((ret = __db_get(pgdbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) == 0) {
+		/* Found it. */
+		DB_ASSERT(env, data.size == sizeof(VRFY_PAGEINFO));
+		pip = data.data;
+		LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+		goto found;
+	} else if (ret != DB_NOTFOUND)	/* Something nasty happened. */
+		return (ret);
+
+	/* Case 3 */
+	if ((ret = __db_vrfy_pageinfo_create(env, &pip)) != 0)
+		return (ret);
+
+	LIST_INSERT_HEAD(&vdp->activepips, pip, links);
+found:	pip->pi_refcount++;
+
+	*pipp = pip;
+	return (0);
+}
+
+/*
+ * __db_vrfy_putpageinfo --
+ *	Put back a VRFY_PAGEINFO that we're done with.
+ *
+ *	Drops a reference; when the count reaches zero the structure is
+ *	written back to the pgdbp database, unlinked from activepips,
+ *	and freed.
+ *
+ * PUBLIC: int __db_vrfy_putpageinfo __P((ENV *,
+ * PUBLIC:     VRFY_DBINFO *, VRFY_PAGEINFO *));
+ */
+int
+__db_vrfy_putpageinfo(env, vdp, pip)
+	ENV *env;
+	VRFY_DBINFO *vdp;
+	VRFY_PAGEINFO *pip;
+{
+	DB *pgdbp;
+	DBT key, data;
+	VRFY_PAGEINFO *p;
+	int ret;
+
+	/* Still referenced elsewhere: nothing more to do. */
+	if (--pip->pi_refcount > 0)
+		return (0);
+
+	pgdbp = vdp->pgdbp;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.data = &pip->pgno;
+	key.size = sizeof(db_pgno_t);
+	data.data = pip;
+	data.size = sizeof(VRFY_PAGEINFO);
+
+	/* Persist the struct so a later getpageinfo finds it. */
+	if ((ret = __db_put(pgdbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) != 0)
+		return (ret);
+
+	LIST_FOREACH(p, &vdp->activepips, links)
+		if (p == pip)
+			break;
+	if (p != NULL)
+		LIST_REMOVE(p, links);
+
+	/*
+	 * Free pip itself: when found on the list p == pip, and when it
+	 * was somehow not on the list, freeing p (NULL) would have
+	 * silently leaked pip.
+	 */
+	__os_ufree(env, pip);
+	return (0);
+}
+
+/*
+ * __db_vrfy_pgset --
+ *	Create a temporary database for the storing of sets of page numbers.
+ *	(A mapping from page number to int, used by the *_meta2pgset functions,
+ *	as well as for keeping track of which pages the verifier has seen.)
+ *
+ *	On success *dbpp holds the open handle; on failure the handle is
+ *	closed and the error returned.
+ *
+ * PUBLIC: int __db_vrfy_pgset __P((ENV *,
+ * PUBLIC:     DB_THREAD_INFO *, u_int32_t, DB **));
+ */
+int
+__db_vrfy_pgset(env, ip, pgsize, dbpp)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t pgsize;
+	DB **dbpp;
+{
+	DB *dbp;
+	int ret;
+
+	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
+		return (ret);
+	if ((ret = __db_set_pagesize(dbp, pgsize)) != 0)
+		goto err;
+
+	/* If transactional, make sure we don't log. */
+	if (TXN_ON(env) &&
+	    (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
+		goto err;
+	/*
+	 * Note the label in the else arm below: the close runs only on
+	 * failure paths (either via goto err or an __db_open error).
+	 */
+	if ((ret = __db_open(dbp, ip,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) == 0)
+		*dbpp = dbp;
+	else
+err:		(void)__db_close(dbp, NULL, 0);
+
+	return (ret);
+}
+
+/*
+ * __db_vrfy_pgset_get --
+ *	Look up the count stored for a page number in a page-set
+ *	database.  A page we have never recorded reads back as 0,
+ *	and the call still succeeds.
+ *
+ * PUBLIC: int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, db_pgno_t,
+ * PUBLIC: int *));
+ */
+int
+__db_vrfy_pgset_get(dbp, ip, pgno, valp)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	db_pgno_t pgno;
+	int *valp;
+{
+	DBT key, data;
+	int count, ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.size = sizeof(db_pgno_t);
+	key.data = &pgno;
+	data.ulen = sizeof(int);
+	data.data = &count;
+	F_SET(&data, DB_DBT_USERMEM);
+
+	ret = __db_get(dbp, ip, NULL, &key, &data, 0);
+	if (ret == DB_NOTFOUND)
+		count = 0;		/* Unseen page: report zero. */
+	else if (ret != 0)
+		return (ret);
+	else
+		DB_ASSERT(dbp->env, data.size == sizeof(int));
+
+	*valp = count;
+	return (0);
+}
+
+/*
+ * __db_vrfy_pgset_inc --
+ *	Increment the value associated with a pgno by 1.
+ *
+ *	A page never seen before starts from 0 and is stored as 1.
+ *
+ * PUBLIC: int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, db_pgno_t));
+ */
+int
+__db_vrfy_pgset_inc(dbp, ip, pgno)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	db_pgno_t pgno;
+{
+	DBT key, data;
+	int ret;
+	int val;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Starting count for a page with no entry yet. */
+	val = 0;
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+	data.data = &val;
+	data.ulen = sizeof(int);
+	F_SET(&data, DB_DBT_USERMEM);
+
+	/* Fetch the current count into val; DB_NOTFOUND leaves it at 0. */
+	if ((ret = __db_get(dbp, ip, NULL, &key, &data, 0)) == 0) {
+		DB_ASSERT(dbp->env, data.size == sizeof(int));
+	} else if (ret != DB_NOTFOUND)
+		return (ret);
+
+	/* Set data.size explicitly: the get may not have filled it in. */
+	data.size = sizeof(int);
+	++val;
+
+	return (__db_put(dbp, ip, NULL, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_pgset_next --
+ *	Advance a cursor open in a pgset database and return the next
+ *	page number in the set.
+ *
+ * PUBLIC: int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *));
+ */
+int
+__db_vrfy_pgset_next(dbc, pgnop)
+	DBC *dbc;
+	db_pgno_t *pgnop;
+{
+	DBT key, data;
+	db_pgno_t next_pgno;
+	int ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Only the keys matter; request no data bytes at all. */
+	F_SET(&key, DB_DBT_USERMEM);
+	F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+	key.ulen = sizeof(db_pgno_t);
+	key.data = &next_pgno;
+
+	ret = __dbc_get(dbc, &key, &data, DB_NEXT);
+	if (ret != 0)
+		return (ret);
+
+	DB_ASSERT(dbc->env, key.size == sizeof(db_pgno_t));
+	*pgnop = next_pgno;
+	return (0);
+}
+
+/*
+ * __db_vrfy_childcursor --
+ *	Open a cursor on the child-info database so the caller can walk
+ *	a page's child list.
+ *
+ * PUBLIC: int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **));
+ */
+int
+__db_vrfy_childcursor(vdp, dbcp)
+	VRFY_DBINFO *vdp;
+	DBC **dbcp;
+{
+	DBC *new_dbc;
+	int ret;
+
+	ret = __db_cursor(vdp->cdbp, vdp->thread_info, NULL, &new_dbc, 0);
+	if (ret == 0)
+		*dbcp = new_dbc;
+	return (ret);
+}
+
+/*
+ * __db_vrfy_childput --
+ *	Add a child structure to the set for a given page.
+ *
+ *	A child already present under this parent is not added again;
+ *	its refcnt is bumped instead, which preserves the on-page
+ *	reference order of the duplicate set.
+ *
+ * PUBLIC: int __db_vrfy_childput
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *));
+ */
+int
+__db_vrfy_childput(vdp, pgno, cip)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	VRFY_CHILDINFO *cip;
+{
+	DB *cdbp;
+	DBC *cc;
+	DBT key, data;
+	VRFY_CHILDINFO *oldcip;
+	int ret;
+
+	cdbp = vdp->cdbp;
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	/*
+	 * We want to avoid adding multiple entries for a single child page;
+	 * we only need to verify each child once, even if a child (such
+	 * as an overflow key) is multiply referenced.
+	 *
+	 * However, we also need to make sure that when walking the list
+	 * of children, we encounter them in the order they're referenced
+	 * on a page.  (This permits us, for example, to verify the
+	 * prev_pgno/next_pgno chain of Btree leaf pages.)
+	 *
+	 * Check the child database to make sure that this page isn't
+	 * already a child of the specified page number.  If it's not,
+	 * put it at the end of the duplicate set.
+	 */
+	if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+		return (ret);
+	/* Scan the existing duplicate set for a matching child pgno. */
+	for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0;
+	    ret = __db_vrfy_ccnext(cc, &oldcip))
+		if (oldcip->pgno == cip->pgno) {
+			/*
+			 * Found a matching child.  Increment its reference
+			 * count--we've run into it again--but don't put it
+			 * again.
+			 */
+			if ((ret = __db_vrfy_childinc(cc, oldcip)) != 0 ||
+			    (ret = __db_vrfy_ccclose(cc)) != 0)
+				return (ret);
+			return (0);
+		}
+	/* DB_NOTFOUND just means the scan is exhausted; anything else fails. */
+	if (ret != DB_NOTFOUND) {
+		(void)__db_vrfy_ccclose(cc);
+		return (ret);
+	}
+	if ((ret = __db_vrfy_ccclose(cc)) != 0)
+		return (ret);
+
+	/* First sighting of this child: store it with a refcount of 1. */
+	cip->refcnt = 1;
+	data.data = cip;
+	data.size = sizeof(VRFY_CHILDINFO);
+
+	return (__db_put(cdbp, vdp->thread_info, NULL, &key, &data, 0));
+}
+
+/*
+ * __db_vrfy_childinc --
+ *	Bump the refcount of the VRFY_CHILDINFO record under the child
+ *	cursor.  (The caller has just fetched this struct and passes it
+ *	in as cip to save us a get.)
+ */
+static int
+__db_vrfy_childinc(dbc, cip)
+	DBC *dbc;
+	VRFY_CHILDINFO *cip;
+{
+	DBT key, data;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Increment in place, then overwrite the current record. */
+	++cip->refcnt;
+	data.size = sizeof(VRFY_CHILDINFO);
+	data.data = cip;
+	return (__dbc_put(dbc, &key, &data, DB_CURRENT));
+}
+
+/*
+ * __db_vrfy_ccset --
+ *	Position a cursor created with __db_vrfy_childcursor on the
+ *	first child of the given pgno and return that child through the
+ *	third argument.
+ *
+ * PUBLIC: int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccset(dbc, pgno, cipp)
+	DBC *dbc;
+	db_pgno_t pgno;
+	VRFY_CHILDINFO **cipp;
+{
+	DBT key, data;
+	int ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.size = sizeof(db_pgno_t);
+	key.data = &pgno;
+
+	ret = __dbc_get(dbc, &key, &data, DB_SET);
+	if (ret != 0)
+		return (ret);
+
+	DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+	*cipp = (VRFY_CHILDINFO *)data.data;
+	return (0);
+}
+
+/*
+ * __db_vrfy_ccnext --
+ *	Advance a cursor created with __db_vrfy_childcursor to the next
+ *	duplicate (i.e. the next child of the same parent) and return it
+ *	through the second argument.
+ *
+ * PUBLIC: int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **));
+ */
+int
+__db_vrfy_ccnext(dbc, cipp)
+	DBC *dbc;
+	VRFY_CHILDINFO **cipp;
+{
+	DBT key, data;
+	int ret;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP);
+	if (ret != 0)
+		return (ret);
+
+	DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO));
+	*cipp = (VRFY_CHILDINFO *)data.data;
+	return (0);
+}
+
+/*
+ * __db_vrfy_ccclose --
+ *	Close a cursor created with __db_vrfy_childcursor.
+ *
+ *	Currently a plain cursor close; kept as a separate entry point
+ *	so the internal database usage can change without touching the
+ *	callers.
+ *
+ * PUBLIC: int __db_vrfy_ccclose __P((DBC *));
+ */
+int
+__db_vrfy_ccclose(dbc)
+	DBC *dbc;
+{
+	return (__dbc_close(dbc));
+}
+
+/*
+ * __db_vrfy_pageinfo_create --
+ *	Constructor for VRFY_PAGEINFO; allocates and zero-fills.
+ */
+static int
+__db_vrfy_pageinfo_create(env, pipp)
+	ENV *env;
+	VRFY_PAGEINFO **pipp;
+{
+	VRFY_PAGEINFO *pip;
+	int ret;
+
+	/*
+	 * Some pageinfo structs come back from __db_get with
+	 * DB_DBT_MALLOC and some are created here; the destructor can't
+	 * tell the two apart, so this one also uses the user-memory
+	 * allocator (__os_umalloc) so that everything can be released
+	 * uniformly with __os_ufree.
+	 */
+	ret = __os_umalloc(env, sizeof(VRFY_PAGEINFO), &pip);
+	if (ret != 0)
+		return (ret);
+	memset(pip, 0, sizeof(VRFY_PAGEINFO));
+
+	*pipp = pip;
+	return (0);
+}
+
+/*
+ * __db_salvage_init --
+ *	Set up salvager database.
+ *
+ *	Creates a small (1KB-page) working btree used to track pages
+ *	that still need printing and stores it in vdp->salvage_pages.
+ *	On failure the handle is closed and the error returned.
+ *
+ * PUBLIC: int __db_salvage_init __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_init(vdp)
+	VRFY_DBINFO *vdp;
+{
+	DB *dbp;
+	int ret;
+
+	if ((ret = __db_create_internal(&dbp, NULL, 0)) != 0)
+		return (ret);
+
+	if ((ret = __db_set_pagesize(dbp, 1024)) != 0)
+		goto err;
+
+	if ((ret = __db_open(dbp, vdp->thread_info,
+	    NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0, PGNO_BASE_MD)) != 0)
+		goto err;
+
+	vdp->salvage_pages = dbp;
+	return (0);
+
+err:	(void)__db_close(dbp, NULL, 0);
+	return (ret);
+}
+
+/*
+ * __db_salvage_destroy --
+ *	Close the salvager database, if one was created.
+ * PUBLIC: int __db_salvage_destroy __P((VRFY_DBINFO *));
+ */
+int
+__db_salvage_destroy(vdp)
+	VRFY_DBINFO *vdp;
+{
+	if (vdp->salvage_pages == NULL)
+		return (0);
+	return (__db_close(vdp->salvage_pages, NULL, 0));
+}
+
+/*
+ * __db_salvage_getnext --
+ *	Get the next (first) unprinted page in the database of pages we need to
+ *	print still.  Delete entries for any already-printed pages we encounter
+ *	in this search, as well as the page we're returning.
+ *
+ *	With skip_overflow set, SALVAGE_OVERFLOW entries are left in
+ *	place (not deleted) for a later pass.  Returns DB_NOTFOUND when
+ *	the set is exhausted.
+ *
+ * PUBLIC: int __db_salvage_getnext
+ * PUBLIC: __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int));
+ */
+int
+__db_salvage_getnext(vdp, dbcp, pgnop, pgtypep, skip_overflow)
+	VRFY_DBINFO *vdp;
+	DBC **dbcp;
+	db_pgno_t *pgnop;
+	u_int32_t *pgtypep;
+	int skip_overflow;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+	u_int32_t pgtype;
+
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Open the cursor on first use; the caller keeps it across calls. */
+	if (*dbcp == NULL &&
+	    (ret = __db_cursor(dbp, vdp->thread_info, NULL, dbcp, 0)) != 0)
+		return (ret);
+
+	while ((ret = __dbc_get(*dbcp, &key, &data, DB_NEXT)) == 0) {
+		DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+		memcpy(&pgtype, data.data, sizeof(pgtype));
+
+		if (skip_overflow && pgtype == SALVAGE_OVERFLOW)
+			continue;
+
+		/* Consume the entry whether it is returned or ignored. */
+		if ((ret = __dbc_del(*dbcp, 0)) != 0)
+			return (ret);
+		if (pgtype != SALVAGE_IGNORE) {
+			DB_ASSERT(dbp->env, key.size == sizeof(db_pgno_t));
+			DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t));
+
+			*pgnop = *(db_pgno_t *)key.data;
+			*pgtypep = *(u_int32_t *)data.data;
+			break;
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_salvage_isdone --
+ *	Return whether or not the given pgno is already marked
+ *	SALVAGE_IGNORE (meaning that we don't need to print it again).
+ *
+ *	Returns DB_KEYEXIST if it is marked, 0 if not, or another error on
+ *	error.
+ *
+ * PUBLIC: int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_isdone(vdp, pgno)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+	u_int32_t currtype;
+
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/* Read the page's current marking, if any, into currtype. */
+	currtype = SALVAGE_INVALID;
+	data.data = &currtype;
+	data.ulen = sizeof(u_int32_t);
+	data.flags = DB_DBT_USERMEM;
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	/*
+	 * Pure lookup: nothing is written here.  If an entry exists and
+	 * is marked SALVAGE_IGNORE the page has already been printed;
+	 * any other marking, or no entry at all, means it has not.
+	 */
+	if ((ret = __db_get(dbp,
+	    vdp->thread_info, NULL, &key, &data, 0)) == 0) {
+		/*
+		 * The key's already here.  Check and see if it's already
+		 * marked done.  If it is, return DB_KEYEXIST.  If it's not,
+		 * return 0.
+		 */
+		if (currtype == SALVAGE_IGNORE)
+			return (DB_KEYEXIST);
+		else
+			return (0);
+	} else if (ret != DB_NOTFOUND)
+		return (ret);
+
+	/* The pgno is not yet marked anything; return 0. */
+	return (0);
+}
+
+/*
+ * __db_salvage_markdone --
+ *	Mark as done a given page.
+ *
+ *	Returns DB_VERIFY_BAD if the page was already marked done (a
+ *	probable sign of a multiply-linked page), or an error from the
+ *	underlying get/put.
+ *
+ * PUBLIC: int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t));
+ */
+int
+__db_salvage_markdone(vdp, pgno)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+	u_int32_t currtype, pgtype;
+
+	/*
+	 * The record is written with data.size == sizeof(u_int32_t), so
+	 * pgtype is a u_int32_t (it was previously declared int, which
+	 * only matched by accident of int's typical width).
+	 */
+	pgtype = SALVAGE_IGNORE;
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	/*
+	 * NOTE(review): this USERMEM setup looks left over from a direct
+	 * get; the put below only uses data.data/data.size.  Kept as-is.
+	 */
+	currtype = SALVAGE_INVALID;
+	data.data = &currtype;
+	data.ulen = sizeof(u_int32_t);
+	data.flags = DB_DBT_USERMEM;
+
+	key.data = &pgno;
+	key.size = sizeof(db_pgno_t);
+
+	/*
+	 * Put an entry for this page, with pgno as key and type as data,
+	 * unless it's already there and is marked done.
+	 * If it's there and is marked anything else, that's fine--we
+	 * want to mark it done, but db_salvage_isdone only lets
+	 * us know if it's marked IGNORE.
+	 *
+	 * We don't want to return DB_KEYEXIST, though;  this will
+	 * likely get passed up all the way and make no sense to the
+	 * application.  Instead, use DB_VERIFY_BAD to indicate that
+	 * we've seen this page already--it probably indicates a
+	 * multiply-linked page.
+	 */
+	if ((ret = __db_salvage_isdone(vdp, pgno)) != 0)
+		return (ret == DB_KEYEXIST ? DB_VERIFY_BAD : ret);
+
+	data.size = sizeof(u_int32_t);
+	data.data = &pgtype;
+
+	return (__db_put(dbp, vdp->thread_info, NULL, &key, &data, 0));
+}
+
+/*
+ * __db_salvage_markneeded --
+ *	If it has not yet been printed, make note of the fact that a page
+ *	must be dealt with later.
+ *
+ * PUBLIC: int __db_salvage_markneeded
+ * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, u_int32_t));
+ */
+int
+__db_salvage_markneeded(vdp, pgno, pgtype)
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t pgtype;
+{
+	DB *dbp;
+	DBT key, data;
+	int ret;
+
+	dbp = vdp->salvage_pages;
+
+	memset(&key, 0, sizeof(DBT));
+	memset(&data, 0, sizeof(DBT));
+
+	key.size = sizeof(db_pgno_t);
+	key.data = &pgno;
+	data.size = sizeof(u_int32_t);
+	data.data = &pgtype;
+
+	/*
+	 * DB_NOOVERWRITE: an existing entry means the page is already
+	 * tracked (or was presumably already marked done), which is not
+	 * an error here.
+	 */
+	ret = __db_put(dbp,
+	    vdp->thread_info, NULL, &key, &data, DB_NOOVERWRITE);
+	return (ret == DB_KEYEXIST ? 0 : ret);
+}
+
+/*
+ * __db_vrfy_prdbt --
+ *	Print out a DBT data element from a verification routine.
+ *
+ *	Thin wrapper over __db_prdbt that, when a VRFY_DBINFO is
+ *	supplied, first emits the "__OTHER__" subdatabase header if
+ *	pending and honors a salvage-wide printable setting.
+ *
+ * PUBLIC: int __db_vrfy_prdbt __P((DBT *, int, const char *, void *,
+ * PUBLIC:     int (*)(void *, const void *), int, VRFY_DBINFO *));
+ */
+int
+__db_vrfy_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp)
+	DBT *dbtp;
+	int checkprint;
+	const char *prefix;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	int is_recno;
+	VRFY_DBINFO *vdp;
+{
+	if (vdp != NULL) {
+		/*
+		 * If vdp is non-NULL, we might be the first key in the
+		 * "fake" subdatabase used for key/data pairs we can't
+		 * associate with a known subdb.
+		 *
+		 * Check and clear the SALVAGE_PRINTHEADER flag;  if
+		 * it was set, print a subdatabase header.
+		 */
+		if (F_ISSET(vdp, SALVAGE_PRINTHEADER)) {
+			(void)__db_prheader(
+			    NULL, "__OTHER__", 0, 0, handle, callback, vdp, 0);
+			F_CLR(vdp, SALVAGE_PRINTHEADER);
+			F_SET(vdp, SALVAGE_PRINTFOOTER);
+		}
+
+		/*
+		 * Even if the printable flag wasn't set by our immediate
+		 * caller, it may be set on a salvage-wide basis.
+		 */
+		if (F_ISSET(vdp, SALVAGE_PRINTABLE))
+			checkprint = 1;
+	}
+	return (
+	    __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno));
+}
diff --git a/db/partition.c b/db/partition.c
new file mode 100644
index 0000000..4e89ede
--- /dev/null
+++ b/db/partition.c
@@ -0,0 +1,2048 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#ifdef HAVE_HASH
+#include "dbinc/hash.h"
+#endif
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/txn.h"
+#ifdef HAVE_PARTITION
+
+static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ const char *, const char *, const char *, u_int32_t));
+static int __partc_close __P((DBC *, db_pgno_t, int *));
+static int __partc_del __P((DBC*, u_int32_t));
+static int __partc_destroy __P((DBC*));
+static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t));
+static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __partc_writelock __P((DBC*));
+static int __partition_chk_meta __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, u_int32_t));
+static int __partition_setup_keys __P((DBC *,
+ DB_PARTITION *, DBMETA *, u_int32_t));
+static int __part_key_cmp __P((const void *, const void *));
+static inline void __part_search __P((DB *,
+ DB_PARTITION *, DBT *, u_int32_t *));
+
+static char *Alloc_err = "Partition open failed to allocate %d bytes";
+
+/*
+ * Allocate a partition cursor and copy flags to the partition cursor.
+ * Not passed:
+ * DBC_PARTITIONED -- the subcursors are not.
+ * DBC_OWN_LID -- the arg dbc owns the lock id.
+ * DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on
+ * the whole DB, not the partition.
+ */
+#define GET_PART_CURSOR(dbc, new_dbc, part_id) do { \
+ DB *__part_dbp; \
+ __part_dbp = part->handles[part_id]; \
+ if ((ret = __db_cursor_int(__part_dbp, \
+ (dbc)->thread_info, (dbc)->txn, __part_dbp->type, \
+ PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0) \
+ goto err; \
+ (new_dbc)->flags = (dbc)->flags & \
+ ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER); \
+} while (0)
+
+/*
+ * Search for the correct partition.
+ *
+ *	Binary-searches part->keys using the btree comparison function.
+ *	An exact match selects that slot; otherwise the slot just below
+ *	the insertion point is chosen, bounded below by partition 0.
+ */
+static inline void __part_search(dbp, part, key, part_idp)
+	DB *dbp;
+	DB_PARTITION *part;
+	DBT *key;
+	u_int32_t *part_idp;
+{
+	db_indx_t base, indx, limit;
+	int cmp;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+
+	DB_ASSERT(dbp->env, part->nparts != 0);
+	COMPQUIET(cmp, 0);
+	COMPQUIET(indx, 0);
+
+	/* Use the same comparator the underlying btree uses. */
+	func = ((BTREE *)dbp->bt_internal)->bt_compare;
+	DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
+		DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
+		cmp = func(dbp, key, &part->keys[indx]);
+		if (cmp == 0)
+			break;
+		if (cmp > 0)
+			DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
+	}
+	if (cmp == 0)
+		*part_idp = indx;
+	else if ((*part_idp = base) != 0)
+		(*part_idp)--;
+}
+
+/*
+ * __partition_init --
+ *	Initialize the partition structure.
+ *	Called when the meta data page is read in during database open or
+ *	when partition keys or a callback are set.
+ *
+ *	Range keys and a partition callback are mutually exclusive;
+ *	requesting both returns EINVAL.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	DB_PARTITION *part;
+	int ret;
+
+	/* Reuse an existing structure, checking for a conflicting mode. */
+	if ((part = dbp->p_internal) != NULL) {
+		if ((LF_ISSET(DBMETA_PART_RANGE) &&
+		    F_ISSET(part, PART_CALLBACK)) ||
+		    (LF_ISSET(DBMETA_PART_CALLBACK) &&
+		    F_ISSET(part, PART_RANGE))) {
+			__db_errx(dbp->env,
+			    "Cannot specify callback and range keys.");
+			return (EINVAL);
+		}
+	} else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0)
+		return (ret);
+
+	if (LF_ISSET(DBMETA_PART_RANGE))
+		F_SET(part, PART_RANGE);
+	if (LF_ISSET(DBMETA_PART_CALLBACK))
+		F_SET(part, PART_CALLBACK);
+	dbp->p_internal = part;
+	/* Set up AM-specific methods that do not require an open. */
+	dbp->db_am_rename = __part_rename;
+	dbp->db_am_remove = __part_remove;
+	return (0);
+}
+/*
+ * __partition_set --
+ *	Set the partitioning keys or callback function.
+ *	This routine must be called prior to creating the database.
+ *
+ *	Exactly one of keys/callback must be given; parts must be >= 2.
+ *
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC:	 u_int32_t (*callback)(DB *, DBT *key)));
+ */
+
+int
+__partition_set(dbp, parts, keys, callback)
+	DB *dbp;
+	u_int32_t parts;
+	DBT *keys;
+	u_int32_t (*callback)(DB *, DBT *key);
+{
+	DB_PARTITION *part;
+	ENV *env;
+	int ret;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition");
+	env = dbp->dbenv->env;
+
+	if (parts < 2) {
+		__db_errx(env, "Must specify at least 2 partitions.");
+		return (EINVAL);
+	}
+
+	if (keys == NULL && callback == NULL) {
+		__db_errx(env, "Must specify either keys or a callback.");
+		return (EINVAL);
+	}
+	if (keys != NULL && callback != NULL) {
+		/* "bad" is also jumped to from the conflict check below. */
+bad:		__db_errx(env, "May not specify both keys and a callback.");
+		return (EINVAL);
+	}
+
+	if ((part = dbp->p_internal) == NULL) {
+		if ((ret = __partition_init(dbp,
+		    keys != NULL ?
+		    DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0)
+			return (ret);
+		part = dbp->p_internal;
+	} else if ((part->keys != NULL && callback != NULL) ||
+	    (part->callback != NULL && keys != NULL))
+		goto bad;
+
+	part->nparts = parts;
+	part->keys = keys;
+	part->callback = callback;
+
+	return (0);
+}
+
+/*
+ * __partition_set_dirs --
+ *	Set the directories for creating the partition databases.
+ *	They must be in the environment.
+ *
+ *	Builds a NULL-terminated vector of directory pointers.  For
+ *	DBLOCAL environments the strings are copied into the same
+ *	allocation; otherwise the vector aliases the environment's own
+ *	data-directory strings.
+ *
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+	DB *dbp;
+	const char **dirp;
+{
+	DB_ENV *dbenv;
+	DB_PARTITION *part;
+	ENV *env;
+	u_int32_t ndirs, slen;
+	int i, ret;
+	const char **dir;
+	char *cp, **part_dirs, **pd;
+
+	DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs");
+	dbenv = dbp->dbenv;
+	env = dbp->env;
+
+	/* Count directories; ndirs starts at 1 for the NULL terminator. */
+	ndirs = 1;
+	slen = 0;
+	for (dir = dirp; *dir != NULL; dir++) {
+		if (F_ISSET(env, ENV_DBLOCAL))
+			slen += (u_int32_t)strlen(*dir) + 1;
+		ndirs++;
+	}
+
+	slen += sizeof(char *) * ndirs;
+	if ((ret = __os_malloc(env, slen, &part_dirs)) != 0)
+		return (ret);	/* Was EINVAL; report the allocator's error. */
+	memset(part_dirs, 0, slen);
+
+	/* String storage begins right after the pointer vector. */
+	cp = (char *) part_dirs + (sizeof(char *) * ndirs);
+	pd = part_dirs;
+	for (dir = dirp; *dir != NULL; dir++, pd++) {
+		if (F_ISSET(env, ENV_DBLOCAL)) {
+			(void)strcpy(cp, *dir);
+			*pd = cp;
+			cp += strlen(*dir) + 1;
+			continue;
+		}
+		/* Each directory must be in the environment's list. */
+		for (i = 0; i < dbenv->data_next; i++)
+			if (strcmp(*dir, dbenv->db_data_dir[i]) == 0)
+				break;
+		if (i == dbenv->data_next) {
+			__db_errx(dbp->env,
+			    "Directory not in environment list %s", *dir);
+			__os_free(env, part_dirs);
+			return (EINVAL);
+		}
+		*pd = dbenv->db_data_dir[i];
+	}
+
+	if ((part = dbp->p_internal) == NULL) {
+		if ((ret = __partition_init(dbp, 0)) != 0) {
+			/* Don't leak the vector on failure. */
+			__os_free(env, part_dirs);
+			return (ret);
+		}
+		part = dbp->p_internal;
+	}
+
+	part->dirs = (const char **)part_dirs;
+
+	return (0);
+}
+
+/*
+ * __partition_open --
+ *	Open/create a partitioned database.
+ * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int));
+ */
+int
+__partition_open(dbp, ip, txn, fname, type, flags, mode, do_open)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *fname;
+ DBTYPE type;
+ u_int32_t flags;
+ int mode, do_open;
+{
+ DB *part_db;
+ DB_PARTITION *part;
+ DBC *dbc;
+ ENV *env;
+ u_int32_t part_id;
+ int ret;
+ char *name, *sp;
+ const char **dirp, *np;
+
+ part = dbp->p_internal;
+ env = dbp->dbenv->env;
+ name = NULL;
+
+ /*
+ * A metadata mismatch is fatal only when we are really opening;
+ * callers with do_open == 0 (e.g. verify paths) press on.
+ */
+ if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open)
+ goto err;
+
+ if ((ret = __os_calloc(env,
+ part->nparts, sizeof(*part->handles), &part->handles)) != 0) {
+ __db_errx(env,
+ Alloc_err, part->nparts * sizeof(*part->handles));
+ goto err;
+ }
+
+ /*
+ * Build a reusable filename buffer: the directory prefix of fname
+ * (if any) followed by per-partition names formatted with PART_NAME.
+ */
+ DB_ASSERT(env, fname != NULL);
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ goto err;
+ }
+
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ /* Recovery does not open the partition handles here. */
+ if (F_ISSET(dbp, DB_AM_RECOVER))
+ goto done;
+ dirp = part->dirs;
+ for (part_id = 0; part_id < part->nparts; part_id++) {
+ if ((ret = __db_create_internal(
+ &part->handles[part_id], dbp->env, 0)) != 0)
+ goto err;
+
+ /* Clone the master handle's configuration into each part. */
+ part_db = part->handles[part_id];
+ part_db->flags = F_ISSET(dbp,
+ ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED));
+ part_db->adj_fileid = dbp->adj_fileid;
+ part_db->pgsize = dbp->pgsize;
+ part_db->priority = dbp->priority;
+ part_db->db_append_recno = dbp->db_append_recno;
+ part_db->db_feedback = dbp->db_feedback;
+ part_db->dup_compare = dbp->dup_compare;
+ part_db->app_private = dbp->app_private;
+ part_db->api_internal = dbp->api_internal;
+
+ if (dbp->type == DB_BTREE)
+ __bam_copy_config(dbp, part_db, part->nparts);
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH)
+ __ham_copy_config(dbp, part_db, part->nparts);
+#endif
+
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ if ((ret = __os_strdup(env, name, &part_db->fname)) != 0)
+ goto err;
+ if (do_open) {
+ /*
+ * Cycle through the directory names passed in,
+ * if any, wrapping back to the first when exhausted.
+ */
+ if (dirp != NULL &&
+ (part_db->dirname = *dirp++) == NULL)
+ part_db->dirname = *(dirp = part->dirs);
+ if ((ret = __db_open(part_db, ip, txn,
+ name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0)
+ goto err;
+ }
+ }
+
+ /*
+ * Get rid of the free cursors used to open the database --
+ * they are the wrong type for partitioned access.
+ */
+done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
+ if ((ret = __dbc_destroy(dbc)) != 0)
+ break;
+
+ if (0) {
+err: (void)__partition_close(dbp, txn, 0);
+ }
+ if (name != NULL)
+ __os_free(env, name);
+ return (ret);
+}
+
+/*
+ * __partition_chk_meta --
+ *	Check for a consistent meta data page and parameters when opening a
+ * partitioned database.  Validates the magic number, the partition
+ * metaflags against the handle's PART_RANGE/PART_CALLBACK settings and
+ * the partition count, then loads/creates the range keys.
+ */
+static int
+__partition_chk_meta(dbp, ip, txn, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DBMETA *meta;
+ DB_PARTITION *part;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ db_pgno_t base_pgno;
+ int ret, t_ret;
+
+ dbc = NULL;
+ meta = NULL;
+ LOCK_INIT(metalock);
+ part = dbp->p_internal;
+ mpf = dbp->mpf;
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Get a cursor on the main db.  Hide the partition info first so
+ * the cursor operates on the master database, not the partitions.
+ * It is restored before returning.
+ */
+ dbp->p_internal = NULL;
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ /* Get the metadata page. */
+ base_pgno = PGNO_BASE_MD;
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (meta->magic != DB_HASHMAGIC &&
+ (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) {
+ __db_errx(env,
+ "Partitioning may only specified on BTREE and HASH databases.");
+ ret = EINVAL;
+ goto err;
+ }
+ if (!FLD_ISSET(meta->metaflags,
+ DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) {
+ __db_errx(env,
+ "Partitioning specified on a non-partitioned database.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* The on-disk partition style must match what the handle was told. */
+ if ((F_ISSET(part, PART_RANGE) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) ||
+ (F_ISSET(part, PART_CALLBACK) &&
+ FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) {
+ __db_errx(env, "Incompatible partitioning specified.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) &&
+ part->callback == NULL && !IS_RECOVERING(env) &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, "Partition callback not specified.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env,
+ "Record numbers are not supported in partitioned databases.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ /* Reconcile the requested partition count with the stored one. */
+ if (part->nparts == 0) {
+ if (LF_ISSET(DB_CREATE) && meta->nparts == 0) {
+ __db_errx(env, "Zero paritions specified.");
+ ret = EINVAL;
+ goto err;
+ } else
+ part->nparts = meta->nparts;
+ } else if (meta->nparts != 0 && part->nparts != meta->nparts) {
+ __db_errx(env, "Number of partitions does not match.");
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (meta->magic == DB_HASHMAGIC) {
+ if (!F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env,
+ "Hash database must specify a partition callback.");
+ ret = EINVAL;
+ }
+ } else if (meta->magic != DB_BTREEMAGIC) {
+ __db_errx(env,
+ "Partitioning only supported on BTREE nad HASH.");
+ ret = EINVAL;
+ } else
+ ret = __partition_setup_keys(dbc, part, meta, flags);
+
+err: /*
+ * Put the metadata page back.  meta is only non-NULL once dbc
+ * exists, so the dbc->priority dereference below is safe.
+ */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Restore the partition info hidden above. */
+ dbp->p_internal = part;
+ return (ret);
+}
+
+/*
+ * Support for sorting keys.  Keys must be sorted using the btree
+ * compare function so if we call qsort in __partition_setup_keys
+ * we use this structure to pass the DBP and compare function.
+ */
+struct key_sort {
+ DB *dbp; /* Database whose comparator is used. */
+ DBT *key; /* Partition boundary key. */
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+};
+
+/*
+ * __part_key_cmp --
+ *	qsort comparator: order two partition boundary keys with the
+ * tree's btree comparison function carried in struct key_sort.
+ */
+static int __part_key_cmp(a, b)
+ const void *a, *b;
+{
+ const struct key_sort *ka, *kb;
+
+ ka = a;
+ kb = b;
+ return (ka->compare(ka->dbp, ka->key, kb->key));
+}
+/*
+ * __partition_setup_keys --
+ *	Get the partition keys into memory, or put them to disk if we
+ * are creating a partitioned database.
+ */
+static int
+__partition_setup_keys(dbc, part, meta, flags)
+ DBC *dbc;
+ DB_PARTITION *part;
+ DBMETA *meta;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, key, *keys, *kp;
+ ENV *env;
+ u_int32_t ds, i, j;
+ u_int8_t *dd;
+ struct key_sort *ks;
+ int have_keys, ret;
+ int (*compare) __P((DB *, const DBT *, const DBT *));
+ void *dp;
+
+ COMPQUIET(dd, NULL);
+ COMPQUIET(ds, 0);
+ memset(&data, 0, sizeof(data));
+ memset(&key, 0, sizeof(key));
+ ks = NULL;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ /* Need to just read the main database. */
+ dbp->p_internal = NULL;
+ have_keys = 0;
+
+ /* First verify that things are what we expect. */
+ if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) {
+ if (ret != DB_NOTFOUND)
+ goto err;
+ /* An empty master is fine for callback partitioning. */
+ if (F_ISSET(part, PART_CALLBACK)) {
+ ret = 0;
+ goto done;
+ }
+ if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) &&
+ !LF_ISSET(DB_RDWRMASTER)) {
+ __db_errx(env, "No range keys found.");
+ ret = EINVAL;
+ goto err;
+ }
+ } else {
+ if (F_ISSET(part, PART_CALLBACK)) {
+ __db_errx(env, "Keys found and callback set.");
+ ret = EINVAL;
+ goto err;
+ }
+ /* The stored key 0 must be the implicit empty key. */
+ if (key.size != 0) {
+ __db_errx(env, "Partition key 0 is not empty.");
+ ret = EINVAL;
+ goto err;
+ }
+ have_keys = 1;
+ }
+
+ if (LF_ISSET(DB_CREATE) && have_keys == 0) {
+ /* Insert the keys into the master database. */
+ for (i = 0; i < part->nparts - 1; i++) {
+ if ((ret = __db_put(dbp, dbc->thread_info,
+ dbc->txn, &part->keys[i], &data, 0)) != 0)
+ goto err;
+ }
+
+ /*
+ * Insert the "0" pointer. All records less than the first
+ * given key go into this partition. We must use the default
+ * compare to insert this key, otherwise it might not be first.
+ */
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ t->bt_compare = __bam_defcmp;
+ memset(&key, 0, sizeof(key));
+ ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0);
+ t->bt_compare = compare;
+ if (ret != 0)
+ goto err;
+ }
+done: if (F_ISSET(part, PART_RANGE)) {
+ /*
+ * Allocate one page to hold the keys plus space at the
+ * end of the buffer to put an array of DBTs. If there
+ * is not enough space __dbc_get will return how much
+ * is needed and we realloc.
+ */
+ if ((ret = __os_malloc(env,
+ meta->pagesize + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0) {
+ __db_errx(env, Alloc_err, meta->pagesize);
+ goto err;
+ }
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ data.data = part->data;
+ data.ulen = meta->pagesize;
+ data.flags = DB_DBT_USERMEM;
+again: if ((ret = __dbc_get(dbc, &key, &data,
+ DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) {
+ if ((ret = __os_realloc(env,
+ data.size + (sizeof(DBT) * part->nparts),
+ &part->data)) != 0)
+ goto err;
+ data.data = part->data;
+ data.ulen = data.size;
+ goto again;
+ }
+ if (ret == 0) {
+ /*
+ * They passed in keys, they must match.
+ * Sort the caller's copy with the tree comparator
+ * so it lines up with the on-disk order.
+ */
+ keys = NULL;
+ compare = NULL;
+ if (have_keys == 1 && (keys = part->keys) != NULL) {
+ t = dbc->dbp->bt_internal;
+ compare = t->bt_compare;
+ if ((ret = __os_malloc(env, (part->nparts - 1)
+ * sizeof(struct key_sort), &ks)) != 0)
+ goto err;
+ for (j = 0; j < part->nparts - 1; j++) {
+ ks[j].dbp = dbc->dbp;
+ ks[j].compare = compare;
+ ks[j].key = &keys[j];
+ }
+
+ qsort(ks, (size_t)part->nparts - 1,
+ sizeof(struct key_sort), __part_key_cmp);
+ }
+ /*
+ * Walk the bulk buffer; the DBT array lives at the
+ * tail of part->data and its entries point into the
+ * buffer itself, so both share one allocation.
+ */
+ DB_MULTIPLE_INIT(dp, &data);
+ part->keys = (DBT *)
+ ((u_int8_t *)part->data + data.size);
+ j = 0;
+ for (kp = part->keys;
+ kp < &part->keys[part->nparts]; kp++, j++) {
+ DB_MULTIPLE_KEY_NEXT(dp,
+ &data, kp->data, kp->size, dd, ds);
+ if (dp == NULL) {
+ ret = DB_NOTFOUND;
+ break;
+ }
+ if (keys != NULL && j != 0 &&
+ compare(dbc->dbp, ks[j - 1].key, kp) != 0) {
+ if (kp->data == NULL &&
+ F_ISSET(dbp, DB_AM_RECOVER))
+ goto err;
+ __db_errx(env,
+ "Partition key %d does not match", j);
+ ret = EINVAL;
+ goto err;
+ }
+ }
+ }
+ }
+ if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER))
+ ret = 0;
+
+err: dbp->p_internal = part;
+ if (ks != NULL)
+ __os_free(env, ks);
+ return (ret);
+}
+
+/*
+ * __partition_get_callback --
+ *	Get the partition callback function.  Returns zero/NULL unless
+ * the database is partitioned by callback.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC:	 u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+ DB *dbp;
+ u_int32_t *parts;
+ u_int32_t (**callback)(DB *, DBT *key);
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using callbacks. */
+ if (part != NULL && !F_ISSET(part, PART_CALLBACK))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ if (callback != NULL)
+ *callback = (part != NULL ? part->callback : NULL);
+
+ return (0);
+}
+
+/*
+ * __partition_get_keys --
+ *	Get partition keys.  Returns zero/NULL unless the database is
+ * partitioned by range.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+ DB *dbp;
+ u_int32_t *parts;
+ DBT **keys;
+{
+ DB_PARTITION *part;
+
+ part = dbp->p_internal;
+ /* Only return populated results if partitioned using ranges. */
+ if (part != NULL && !F_ISSET(part, PART_RANGE))
+ part = NULL;
+ if (parts != NULL)
+ *parts = (part != NULL ? part->nparts : 0);
+ /* Skip keys[0]: it is the implicit empty key for partition 0. */
+ if (keys != NULL)
+ *keys = (part != NULL ? &part->keys[1] : NULL);
+
+ return (0);
+}
+
+/*
+ * __partition_get_dirs --
+ *	Get partition dirs.  Before open, returns whatever the caller
+ * configured; after open, builds (once) a NULL-terminated list from
+ * the per-partition handles.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+ DB *dbp;
+ const char ***dirpp;
+{
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ env = dbp->env;
+ if ((part = dbp->p_internal) == NULL) {
+ *dirpp = NULL;
+ return (0);
+ }
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+ *dirpp = part->dirs;
+ return (0);
+ }
+
+ /*
+ * We build a list once when asked. The original directory list,
+ * if any, was discarded at open time.
+ */
+ if ((*dirpp = part->dirs) != NULL)
+ return (0);
+
+ if ((ret = __os_calloc(env,
+ sizeof(char *), part->nparts + 1, (char **)&part->dirs)) != 0)
+ return (ret);
+
+ for (i = 0; i < part->nparts; i++)
+ part->dirs[i] = part->handles[i]->dirname;
+
+ *dirpp = part->dirs;
+ return (0);
+}
+
+/*
+ * __partc_init --
+ *	Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __partc_init __P((DBC *));
+ */
+int
+__partc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ int ret;
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL && (ret =
+ __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+ /*
+ * Initialize methods.  Only get/put/del need partition-aware
+ * handlers; the rest route through the sub-cursor generically.
+ */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __partc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = NULL;
+ dbc->am_close = __partc_close;
+ dbc->am_del = __partc_del;
+ dbc->am_destroy = __partc_destroy;
+ dbc->am_get = NULL;
+ dbc->am_put = __partc_put;
+ dbc->am_writelock = __partc_writelock;
+
+ /* We avoid swapping partition cursors since we swap the sub cursors */
+ F_SET(dbc, DBC_PARTITIONED);
+
+ return (0);
+}
+/*
+ * __partc_get_pp --
+ *	cursor get operation on a partitioned database.
+ * Pre-processing wrapper: argument checking, env entry and
+ * replication lease checking around __partc_get.
+ */
+static int
+__partc_get_pp(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ignore_lease, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0;
+ LF_CLR(DB_IGNORE_LEASE);
+ if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0)
+ return (ret);
+
+ ENV_ENTER(env, ip);
+
+ DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get",
+ flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+ ret = __partc_get(dbc, key, data, flags);
+ /*
+ * Check for master leases.
+ */
+ if (ret == 0 &&
+ IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease)
+ ret = __rep_lease_check(env, 1);
+
+ ENV_LEAVE(env, ip);
+ __dbt_userfree(env, key, NULL, data);
+ return (ret);
+}
+/*
+ * __partc_get --
+ *	cursor get operation on a partitioned database.  Picks the
+ * partition for the request, walks to neighboring partitions when a
+ * directional get exhausts one, and swaps the sub-cursor on success.
+ *
+ * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t));
+ */
+int
+__partc_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBC *orig_dbc, *new_dbc;
+ DB_PARTITION *part;
+ PART_CURSOR *cp;
+ u_int32_t multi, part_id;
+ int ret, retry, search;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ orig_dbc = cp->sub_cursor;
+ part = dbp->p_internal;
+
+ new_dbc = NULL;
+ retry = search = 0;
+ part_id = cp->part_id;
+ /* Preserve DB_MULTIPLE* modifiers across partition switches. */
+ multi = flags & ~DB_OPFLAGS_MASK;
+
+ /*
+ * retry: may continue into the next/previous partition on
+ * DB_NOTFOUND.  search: must locate the starting partition by key.
+ */
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ break;
+ case DB_FIRST:
+ part_id = 0;
+ retry = 1;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ search = 1;
+ break;
+ case DB_SET_RANGE:
+ search = 1;
+ retry = 1;
+ break;
+ case DB_LAST:
+ part_id = part->nparts - 1;
+ retry = 1;
+ break;
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ if (orig_dbc == NULL)
+ part_id = 0;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_NEXT_DUP:
+ break;
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (orig_dbc == NULL)
+ part_id = part->nparts - 1;
+ else
+ part_id = cp->part_id;
+ retry = 1;
+ break;
+ case DB_PREV_DUP:
+ break;
+ case DB_SET:
+ search = 1;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__partc_get", flags));
+ }
+
+ /*
+ * If we need to find the partition to start on, then
+ * do a binary search of the in memory partition table.
+ */
+ if (search == 1 && F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbp, key) % part->nparts;
+ else if (search == 1)
+ __part_search(dbp, part, key, &part_id);
+
+ /* Get a new cursor if necessary */
+ if (orig_dbc == NULL || cp->part_id != part_id) {
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ } else
+ new_dbc = orig_dbc;
+
+ /* On DB_NOTFOUND, step to the adjacent partition and restart. */
+ while ((ret = __dbc_get(new_dbc,
+ key, data, flags)) == DB_NOTFOUND && retry == 1) {
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ case DB_SET_RANGE:
+ if (++part_id < part->nparts) {
+ flags = DB_FIRST | multi;
+ break;
+ }
+ goto err;
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_NODUP:
+ if (part_id-- > 0) {
+ flags = DB_LAST | multi;
+ break;
+ }
+ goto err;
+ default:
+ goto err;
+ }
+
+ if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+ }
+
+ if (ret != 0)
+ goto err;
+
+ /* Success: swap original and new cursors. */
+ if (new_dbc != orig_dbc) {
+ if (orig_dbc != NULL) {
+ cp->sub_cursor = NULL;
+ if ((ret = __dbc_close(orig_dbc)) != 0)
+ goto err;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && new_dbc != orig_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_put --
+ *	cursor put operation on a partitioned cursor.  For key-based
+ * put flags the target partition is chosen from the key; otherwise
+ * the current sub-cursor's partition is used.
+ */
+static int
+__partc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ PART_CURSOR *cp;
+ u_int32_t part_id;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (PART_CURSOR*)dbc->internal;
+ part_id = cp->part_id;
+ part = dbp->p_internal;
+ *pgnop = PGNO_INVALID;
+
+ switch (flags) {
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ /* Key-based put: route to the partition owning the key. */
+ if (F_ISSET(part, PART_CALLBACK)) {
+ part_id = part->callback(dbp, key) % part->nparts;
+ break;
+ }
+ __part_search(dbp, part, key, &part_id);
+ break;
+ default:
+ break;
+ }
+
+ /* Open a cursor on the target partition if we don't have one. */
+ if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) {
+ if ((ret = __db_cursor_int(part->handles[part_id],
+ dbc->thread_info, dbc->txn, part->handles[part_id]->type,
+ PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ }
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(new_dbc, DBC_WRITER);
+ if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0)
+ goto err;
+
+ /* Success: adopt the new sub-cursor, closing the old one. */
+ if (new_dbc != cp->sub_cursor) {
+ if (cp->sub_cursor != NULL) {
+ if ((ret = __dbc_close(cp->sub_cursor)) != 0)
+ goto err;
+ cp->sub_cursor = NULL;
+ }
+ cp->sub_cursor = new_dbc;
+ cp->part_id = part_id;
+ }
+
+ return (0);
+
+err: if (new_dbc != NULL && cp->sub_cursor != new_dbc)
+ (void)__dbc_close(new_dbc);
+ return (ret);
+}
+
+/*
+ * __partc_del
+ *	Delete interface to partitioned cursors.  Forwards to the
+ * current sub-cursor, propagating writer status.
+ */
+static int
+__partc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR))
+ F_SET(cp->sub_cursor, DBC_WRITER);
+ return (__dbc_del(cp->sub_cursor, flags));
+}
+
+/*
+ * __partc_writelock
+ *	Writelock interface to partitioned cursors.  Forwards to the
+ * current sub-cursor's access-method handler.
+ */
+static int
+__partc_writelock(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ cp = (PART_CURSOR*)dbc->internal;
+
+ return (cp->sub_cursor->am_writelock(cp->sub_cursor));
+}
+
+/*
+ * __partc_close
+ *	Close interface to partitioned cursors.  Closes the sub-cursor
+ * if one is open; root_pgno/rmroot are unused here.
+ */
+static int
+__partc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ PART_CURSOR *cp;
+ int ret;
+
+ COMPQUIET(root_pgno, 0);
+ COMPQUIET(rmroot, NULL);
+
+ cp = (PART_CURSOR*)dbc->internal;
+
+ if (cp->sub_cursor == NULL)
+ return (0);
+ ret = __dbc_close(cp->sub_cursor);
+ cp->sub_cursor = NULL;
+ return (ret);
+}
+
+/*
+ * __partc_destroy --
+ *	Destroy a single cursor.  Frees only the PART_CURSOR struct;
+ * the sub-cursor is assumed already closed by __partc_close.
+ */
+static int
+__partc_destroy(dbc)
+ DBC *dbc;
+{
+ PART_CURSOR *cp;
+ ENV *env;
+
+ cp = (PART_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structure. Don't recurse. */
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __partition_close
+ *	Close a partitioned database: close every partition handle,
+ * then free the handle array, directory list, key buffer and the
+ * DB_PARTITION itself.  Returns the first error seen.
+ *
+ * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t));
+ */
+int
+__partition_close(dbp, txn, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ u_int32_t flags;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+
+ if ((part = dbp->p_internal) == NULL)
+ return (0);
+
+ env = dbp->env;
+ ret = 0;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL && (t_ret =
+ __db_close(*pdbp, txn, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ __os_free(env, part->handles);
+ }
+ if (part->dirs != NULL)
+ __os_free(env, (char **)part->dirs);
+ if (part->data != NULL)
+ __os_free(env, (char **)part->data);
+ __os_free(env, part);
+ dbp->p_internal = NULL;
+
+ return (ret);
+}
+
+/*
+ * __partition_sync
+ *	Sync a partitioned database: flush each opened partition's
+ * mpool file, then the master's.  Returns the first error seen.
+ *
+ * PUBLIC: int __partition_sync __P((DB *));
+ */
+int
+__partition_sync(dbp)
+ DB *dbp;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret, t_ret;
+
+ ret = 0;
+ part = dbp->p_internal;
+
+ if ((pdbp = part->handles) != NULL) {
+ for (i = 0; i < part->nparts; i++, pdbp++)
+ if (*pdbp != NULL &&
+ F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret =
+ __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __partition_stat
+ *	Stat a partitioned database: gather per-partition statistics
+ * and aggregate them into a single stat structure returned in *spp
+ * (the first partition's buffer is reused as the accumulator).
+ *
+ * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__partition_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ DB *dbp, **pdbp;
+ DB_BTREE_STAT *fsp, *bsp;
+#ifdef HAVE_HASH
+ DB_HASH_STAT *hfsp, *hsp;
+#endif
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ ENV *env;
+ u_int32_t i;
+ int ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ env = dbp->env;
+ fsp = NULL;
+#ifdef HAVE_HASH
+ hfsp = NULL;
+#endif
+
+ pdbp = part->handles;
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ goto err;
+ switch (new_dbc->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0)
+ goto err;
+ if (fsp == NULL) {
+ /* First partition: its stats seed the total. */
+ fsp = bsp;
+ *(DB_BTREE_STAT **)spp = fsp;
+ } else {
+ /* Counts sum; tree height takes the maximum. */
+ fsp->bt_nkeys += bsp->bt_nkeys;
+ fsp->bt_ndata += bsp->bt_ndata;
+ fsp->bt_pagecnt += bsp->bt_pagecnt;
+ if (fsp->bt_levels < bsp->bt_levels)
+ fsp->bt_levels = bsp->bt_levels;
+ fsp->bt_int_pg += bsp->bt_int_pg;
+ fsp->bt_leaf_pg += bsp->bt_leaf_pg;
+ fsp->bt_dup_pg += bsp->bt_dup_pg;
+ fsp->bt_over_pg += bsp->bt_over_pg;
+ fsp->bt_free += bsp->bt_free;
+ fsp->bt_int_pgfree += bsp->bt_int_pgfree;
+ fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree;
+ fsp->bt_dup_pgfree += bsp->bt_dup_pgfree;
+ fsp->bt_over_pgfree += bsp->bt_over_pgfree;
+ __os_ufree(env, bsp);
+ }
+ break;
+#ifdef HAVE_HASH
+ case DB_HASH:
+ if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0)
+ goto err;
+ if (hfsp == NULL) {
+ hfsp = hsp;
+ *(DB_HASH_STAT **)spp = hfsp;
+ } else {
+ hfsp->hash_nkeys += hsp->hash_nkeys;
+ hfsp->hash_ndata += hsp->hash_ndata;
+ hfsp->hash_pagecnt += hsp->hash_pagecnt;
+ hfsp->hash_ffactor += hsp->hash_ffactor;
+ hfsp->hash_buckets += hsp->hash_buckets;
+ hfsp->hash_free += hsp->hash_free;
+ hfsp->hash_bfree += hsp->hash_bfree;
+ hfsp->hash_bigpages += hsp->hash_bigpages;
+ hfsp->hash_big_bfree += hsp->hash_big_bfree;
+ hfsp->hash_overflows += hsp->hash_overflows;
+ hfsp->hash_ovfl_free += hsp->hash_ovfl_free;
+ hfsp->hash_dup += hsp->hash_dup;
+ hfsp->hash_dup_free += hsp->hash_dup_free;
+ __os_ufree(env, hsp);
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ }
+ return (0);
+
+err:
+ if (fsp != NULL)
+ __os_ufree(env, fsp);
+ *(DB_BTREE_STAT **)spp = NULL;
+ return (ret);
+}
+
+/*
+ * __part_truncate --
+ *	Truncate a database.  Truncates each partition in turn and
+ * accumulates the per-partition record counts into *countp.
+ *
+ * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__part_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ DB *dbp, **pdbp;
+ DB_PARTITION *part;
+ DBC *new_dbc;
+ u_int32_t count, i;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ if (countp != NULL)
+ *countp = 0;
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn,
+ (*pdbp)->type, PGNO_INVALID,
+ 0, dbc->locker, &new_dbc)) != 0)
+ break;
+ switch (dbp->type) {
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_truncate(new_dbc, &count);
+ break;
+ case DB_HASH:
+#ifdef HAVE_HASH
+ ret = __ham_truncate(new_dbc, &count);
+ break;
+#endif
+ /* Without HAVE_HASH, DB_HASH falls into the error case. */
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(dbp->env,
+ "DB->truncate", dbp->type);
+ count = 0;
+ break;
+ }
+ if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (countp != NULL)
+ *countp += count;
+ }
+
+ return (ret);
+}
+/*
+ * __part_compact -- compact a partitioned database.
+ *	Runs the btree compaction over every partition handle; hash
+ * partitions are only eligible for DB_FREELIST_ONLY.
+ *
+ * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC:     DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
+int
+__part_compact(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) {
+ switch (dbp->type) {
+ case DB_HASH:
+ if (!LF_ISSET(DB_FREELIST_ONLY))
+ goto err;
+ /* FALLTHROUGH */
+ case DB_BTREE:
+ case DB_RECNO:
+ ret = __bam_compact(*pdbp,
+ ip, txn, start, stop, c_data, flags, end);
+ break;
+
+ default:
+ /* Report the access-method mismatch as the error. */
+ err: ret = __dbh_am_chk(dbp, DB_OK_BTREE);
+ break;
+ }
+ }
+ return (ret);
+}
+
+/*
+ * __part_lsn_reset --
+ *	reset the lsns on each partition.  Stops at the first failure.
+ *
+ * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *));
+ */
+int
+__part_lsn_reset(dbp, ip)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+{
+ DB **pdbp;
+ DB_PARTITION *part;
+ u_int32_t i;
+ int ret;
+
+ part = dbp->p_internal;
+ pdbp = part->handles;
+ ret = 0;
+
+ for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++)
+ ret = __db_lsn_reset((*pdbp)->mpf, ip);
+
+ return (ret);
+}
+
+/*
+ * __part_fileid_reset --
+ *	reset the fileid on each partition.  Rebuilds each partition's
+ * file name from fname via PART_NAME and resets its id in turn.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC:	 __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ const char *fname;
+ u_int32_t nparts;
+ int encrypted;
+{
+ int ret;
+ u_int32_t part_id;
+ char *name, *sp;
+ const char *np;
+
+ if ((ret = __os_malloc(env,
+ strlen(fname) + PART_LEN + 1, &name)) != 0) {
+ __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1);
+ return (ret);
+ }
+
+ /* Split fname into directory prefix (kept) and base name (np). */
+ sp = name;
+ np = __db_rpath(fname);
+ if (np == NULL)
+ np = fname;
+ else {
+ np++;
+ (void)strncpy(name, fname, (size_t)(np - fname));
+ sp = name + (np - fname);
+ }
+
+ for (part_id = 0; ret == 0 && part_id < nparts; part_id++) {
+ (void)sprintf(sp, PART_NAME, np, part_id);
+ ret = __env_fileid_reset(env, ip, sp, encrypted);
+ }
+
+ __os_free(env, name);
+ return (ret);
+}
+#ifndef HAVE_BREW
+/*
+ * __part_key_range --
+ *	Return proportion of keys relative to given key.  Computes the
+ * exact range within the partition holding the key, then scales the
+ * less/equal/greater fractions using root-page entry counts and tree
+ * heights of the other partitions as a population estimate.
+ *
+ * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__part_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *new_dbc;
+ DB_PARTITION *part;
+ PAGE *h;
+ u_int32_t id, part_id;
+ u_int32_t elems, empty, less_elems, my_elems, greater_elems;
+ u_int32_t levels, max_levels, my_levels;
+ int ret;
+ double total_elems;
+
+ COMPQUIET(flags, 0);
+
+ part = dbc->dbp->p_internal;
+
+ /*
+ * First we find the key range for the partition that contains the
+ * key. Then we scale based on estimates of the other partitions.
+ */
+ if (F_ISSET(part, PART_CALLBACK))
+ part_id = part->callback(dbc->dbp, dbt) % part->nparts;
+ else
+ __part_search(dbc->dbp, part, dbt, &part_id);
+ GET_PART_CURSOR(dbc, new_dbc, part_id);
+
+ if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0)
+ goto err;
+
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+
+ /* Sample this partition's root: entry count and tree height. */
+ if ((ret = __memp_fget(new_dbc->dbp->mpf,
+ &cp->root, new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ my_elems = NUM_ENT(h);
+ my_levels = LEVEL(h);
+ max_levels = my_levels;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+ /*
+ * We have the range within one subtree. Now estimate
+ * what part of the whole range that subtree is. Figure
+ * out how many levels each part has and how many entries
+ * in the level below the root.
+ */
+ empty = less_elems = greater_elems = 0;
+ for (id = 0; id < part->nparts; id++) {
+ if (id == part_id) {
+ empty = 0;
+ continue;
+ }
+ GET_PART_CURSOR(dbc, new_dbc, id);
+ cp = (BTREE_CURSOR *)new_dbc->internal;
+ if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root,
+ new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0)
+ goto c_err;
+
+ elems = NUM_ENT(h);
+ levels = LEVEL(h);
+ /* A leaf root holds key/data pairs: halve to count keys. */
+ if (levels == 1)
+ elems /= 2;
+
+ if ((ret = __memp_fput(new_dbc->dbp->mpf,
+ new_dbc->thread_info, h, new_dbc->priority)) != 0)
+ goto c_err;
+
+ if ((ret = __dbc_close(new_dbc)) != 0)
+ goto err;
+
+ /* If the tree is empty, ignore it. */
+ if (elems == 0) {
+ empty++;
+ continue;
+ }
+
+ /*
+ * If a tree has fewer levels than the max just count
+ * it as a single element in the higher level.
+ */
+ if (id < part_id) {
+ if (levels > max_levels) {
+ max_levels = levels;
+ less_elems = id + elems - empty;
+ } else if (levels < max_levels)
+ less_elems++;
+ else
+ less_elems += elems;
+ } else {
+ if (levels > max_levels) {
+ max_levels = levels;
+ greater_elems = (id - part_id) + elems - empty;
+ } else if (levels < max_levels)
+ greater_elems++;
+ else
+ greater_elems += elems;
+ }
+
+ }
+
+ if (my_levels < max_levels) {
+ /*
+ * The subtree containing the key is not the tallest one.
+ * Reduce its share by the number of records at the highest
+ * level. Scale the greater and lesser components up
+ * by the number of records on either side of this
+ * subtree.
+ */
+ total_elems = 1 + greater_elems + less_elems;
+ kp->equal /= total_elems;
+ kp->less /= total_elems;
+ kp->less += less_elems/total_elems;
+ kp->greater /= total_elems;
+ kp->greater += greater_elems/total_elems;
+ } else if (my_levels == max_levels) {
+ /*
+ * The key is in one of the tallest subtrees. We will
+ * scale the values by the ratio of the records at the
+ * top of this subtree to the number of records at the
+ * highest level.
+ */
+ total_elems = greater_elems + less_elems;
+ if (total_elems != 0) {
+ /*
+ * First scale down by the fraction of elements
+ * in this subtree.
+ */
+ total_elems += my_elems;
+ kp->equal *= my_elems;
+ kp->equal /= total_elems;
+ kp->less *= my_elems;
+ kp->less /= total_elems;
+ kp->greater *= my_elems;
+ kp->greater /= total_elems;
+ /*
+ * Proportionally add weight from the subtrees to the
+ * left and right of this one.
+ */
+ kp->less += less_elems / total_elems;
+ kp->greater += greater_elems / total_elems;
+ }
+ }
+
+ if (0) {
+c_err: (void)__dbc_close(new_dbc);
+ }
+
+err: return (ret);
+}
+#endif
+
+/*
+ * __part_remove --
+ *	Remove method for a partitioned database.  Thin wrapper over
+ * the shared remove/rename worker (newname == NULL selects remove).
+ *
+ * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC:      DB_TXN *, const char *, const char *, u_int32_t));
+ */
+int
+__part_remove(dbp, ip, txn, name, subdb, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb;
+ u_int32_t flags;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags));
+}
+
+/*
+ * __part_rename --
+ *	Rename method for a partitioned database.  Thin wrapper over
+ * the shared remove/rename worker (non-NULL newname selects rename).
+ *
+ * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC:      DB_TXN *, const char *, const char *, const char *));
+ */
+int
+__part_rename(dbp, ip, txn, name, subdb, newname)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+{
+ return (__part_rr(dbp, ip, txn, name, subdb, newname, 0));
+}
+
+/*
+ * __part_rr --
+ *	Remove/Rename method for a partitioned database.  Opens a
+ * temporary master handle to learn the partition names, then removes
+ * (newname == NULL) or renames each partition file.
+ */
+static int
+__part_rr(dbp, ip, txn, name, subdb, newname, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name, *subdb, *newname;
+ u_int32_t flags;
+{
+ DB **pdbp, *ptmpdbp, *tmpdbp;
+ DB_PARTITION *part;
+ ENV *env;
+ u_int32_t i;
+ int ret, t_ret;
+ char *np;
+
+ env = dbp->env;
+ ret = 0;
+
+ if (subdb != NULL && name != NULL) {
+ __db_errx(env,
+ "A partitioned database can not be in a multiple databases file");
+ return (EINVAL);
+ }
+ ENV_GET_THREAD_INFO(env, ip);
+
+ /*
+ * Since rename no longer opens the database, we have
+ * to do it here.
+ */
+ if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
+ return (ret);
+
+ /*
+ * We need to make sure we don't self-deadlock, so give
+ * this dbp the same locker as the incoming one.
+ */
+ tmpdbp->locker = dbp->locker;
+ if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type,
+ DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0)
+ goto err;
+
+ part = tmpdbp->p_internal;
+ pdbp = part->handles;
+ COMPQUIET(np, NULL);
+ if (newname != NULL && (ret = __os_malloc(env,
+ strlen(newname) + PART_LEN + 1, &np)) != 0) {
+ __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1);
+ goto err;
+ }
+ /* Remove/rename each partition file with a short-lived handle. */
+ for (i = 0; i < part->nparts; i++, pdbp++) {
+ if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0)
+ break;
+ ptmpdbp->locker = (*pdbp)->locker;
+ if (newname == NULL)
+ ret = __db_remove_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, flags);
+ else {
+ DB_ASSERT(env, np != NULL);
+ (void)sprintf(np, PART_NAME, newname, i);
+ ret = __db_rename_int(ptmpdbp,
+ ip, txn, (*pdbp)->fname, NULL, np);
+ }
+ ptmpdbp->locker = NULL;
+ (void)__db_close(ptmpdbp, NULL, DB_NOSYNC);
+ if (ret != 0)
+ break;
+ }
+
+ if (newname != NULL)
+ __os_free(env, np);
+
+ /*
+ * NOTE(review): when DB_AM_OPEN_CALLED is set on the caller's dbp,
+ * tmpdbp is not closed on this path -- looks like a handle leak;
+ * confirm against upstream before changing.
+ */
+ if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) {
+err: /*
+ * Since we copied the locker ID from the dbp, we'd better not
+ * free it here.
+ */
+ tmpdbp->locker = NULL;
+
+ /* We need to remove the lock event we associated with this. */
+ if (txn != NULL)
+ __txn_remlock(env,
+ txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID);
+
+ if ((t_ret = __db_close(tmpdbp,
+ txn, DB_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ return (ret);
+}
+#ifdef HAVE_VERIFY
+/*
+ * __part_verify --
+ *	Verify a partitioned database.
+ *
+ *	Verifies the master database and then each partition in turn.
+ *	For range-partitioned databases each child verify pass is handed
+ *	a pair of synthetic BINTERNAL boundary keys (lp, rp) bounding the
+ *	keys that partition may legally contain.
+ *
+ * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *,
+ * PUBLIC:     void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__part_verify(dbp, vdp, fname, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	const char *fname;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	BINTERNAL *lp, *rp;
+	DB **pdbp;
+	DB_PARTITION *part;
+	DBC *dbc;
+	DBT *key;
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	u_int32_t i;
+	int ret, t_ret;
+
+	env = dbp->env;
+	/* lp/rp are the heap-allocated range-boundary keys, built below. */
+	lp = rp = NULL;
+	dbc = NULL;
+	ip = vdp->thread_info;
+
+	/* Open the master database's access-method information. */
+	if (dbp->type == DB_BTREE) {
+		if ((ret = __bam_open(dbp, ip,
+		    NULL, fname, PGNO_BASE_MD, flags)) != 0)
+			goto err;
+	}
+#ifdef HAVE_HASH
+	else if ((ret = __ham_open(dbp, ip,
+	    NULL, fname, PGNO_BASE_MD, flags)) != 0)
+		goto err;
+#endif
+
+	/*
+	 * Initialize partition db handles and get the names. Set DB_RDWRMASTER
+	 * because we may not have the partition callback, but we can still
+	 * look at the structure of the tree.
+	 */
+	if ((ret = __partition_open(dbp,
+	    ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0)
+		goto err;
+	part = dbp->p_internal;
+
+	if (LF_ISSET(DB_SALVAGE)) {
+		/* If we are being aggressive we don't want to dump the keys. */
+		if (LF_ISSET(DB_AGGRESSIVE))
+			dbp->p_internal = NULL;
+		ret = __db_prheader(dbp,
+		    NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD);
+		dbp->p_internal = part;
+		if (ret != 0)
+			goto err;
+	}
+
+	/*
+	 * NOTE(review): dbc is opened but never explicitly closed in this
+	 * function -- presumably released when dbp is closed; confirm.
+	 */
+	if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+		goto err;
+
+	/*
+	 * Verify each partition.  lp inherits the previous iteration's
+	 * right boundary; rp is rebuilt from the next partition key.
+	 */
+	pdbp = part->handles;
+	for (i = 0; i < part->nparts; i++, pdbp++) {
+		if (!F_ISSET(part, PART_RANGE) || part->keys == NULL)
+			goto vrfy;
+		if (lp != NULL)
+			__os_free(env, lp);
+		lp = rp;
+		rp = NULL;
+		if (i + 1 < part->nparts) {
+			key = &part->keys[i + 1];
+			if ((ret = __os_malloc(env,
+			    BINTERNAL_SIZE(key->size), &rp)) != 0)
+				goto err;
+			rp->len = key->size;
+			memcpy(rp->data, key->data, key->size);
+			B_TSET(rp->type, B_KEYDATA);
+		}
+vrfy:		if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname,
+		    NULL, handle, callback,
+		    lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+err:	if (lp != NULL)
+		__os_free(env, lp);
+	if (rp != NULL)
+		__os_free(env, rp);
+	return (ret);
+}
+#endif
+
+#ifdef CONFIG_TEST
+/*
+ * __part_testdocopy -- copy all partitions for testing purposes.
+ *
+ * PUBLIC: int __part_testdocopy __P((DB *, const char *));
+ */
+int
+__part_testdocopy(dbp, name)
+	DB *dbp;
+	const char *name;
+{
+	DB_PARTITION *part;
+	DB **pdbp;
+	u_int32_t i;
+	int ret;
+
+	/* Copy the master database file first. */
+	if ((ret = __db_testdocopy(dbp->env, name)) != 0)
+		return (ret);
+
+	/* Then copy each partition's backing file. */
+	part = dbp->p_internal;
+	for (i = 0, pdbp = part->handles; i < part->nparts; i++, pdbp++)
+		if ((ret =
+		    __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0)
+			return (ret);
+
+	return (0);
+}
+#endif
+#else
+/*
+ * __db_no_partition --
+ *	Error when a Berkeley DB build doesn't include partitioning.
+ *
+ *	This #else section supplies stubs for builds configured without
+ *	partition support; every partition entry point reports
+ *	DB_OPNOTSUP through this function.
+ *
+ * PUBLIC: int __db_no_partition __P((ENV *));
+ */
+int
+__db_no_partition(env)
+	ENV *env;
+{
+	__db_errx(env,
+    "library build did not include support for the database partitioning");
+	return (DB_OPNOTSUP);
+}
+/*
+ * __partition_set --
+ *	Set the partitioning keys or callback function.
+ *	This routine must be called prior to creating the database.
+ * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *,
+ * PUBLIC:	u_int32_t (*callback)(DB *, DBT *key)));
+ */
+int
+__partition_set(dbp, parts, keys, callback)
+	DB *dbp;
+	u_int32_t parts;
+	DBT *keys;
+	u_int32_t (*callback)(DB *, DBT *key);
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(callback, NULL);
+	COMPQUIET(keys, NULL);
+	COMPQUIET(parts, 0);
+
+	return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_callback --
+ *	Set the partition callback function. This routine must be called
+ *	prior to opening a partition database that requires a function.
+ * PUBLIC: int __partition_get_callback __P((DB *,
+ * PUBLIC:	u_int32_t *, u_int32_t (**callback)(DB *, DBT *key)));
+ */
+int
+__partition_get_callback(dbp, parts, callback)
+	DB *dbp;
+	u_int32_t *parts;
+	u_int32_t (**callback)(DB *, DBT *key);
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(callback, NULL);
+	COMPQUIET(parts, NULL);
+
+	return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_dirs --
+ *	Get partition dirs.
+ *	Stub used when partition support is not compiled in; always
+ *	fails via __db_no_partition.
+ * PUBLIC: int __partition_get_dirs __P((DB *, const char ***));
+ */
+int
+__partition_get_dirs(dbp, dirpp)
+	DB *dbp;
+	const char ***dirpp;
+{
+	/* Quiet the unused parameter. */
+	COMPQUIET(dirpp, NULL);
+	return (__db_no_partition(dbp->env));
+}
+
+/*
+ * __partition_get_keys --
+ *	Get partition keys.
+ * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **));
+ */
+int
+__partition_get_keys(dbp, parts, keys)
+	DB *dbp;
+	u_int32_t *parts;
+	DBT **keys;
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(keys, NULL);
+	COMPQUIET(parts, NULL);
+
+	return (__db_no_partition(dbp->env));
+}
+/*
+ * __partition_init --
+ *	Initialize the partition structure.
+ *	Called when the meta data page is read in during database open or
+ *	when partition keys or a callback are set.
+ *	Stub used when partition support is not compiled in; always
+ *	fails via __db_no_partition.
+ *
+ * PUBLIC: int __partition_init __P((DB *, u_int32_t));
+ */
+int
+__partition_init(dbp, flags)
+	DB *dbp;
+	u_int32_t flags;
+{
+	/* Quiet the unused parameter. */
+	COMPQUIET(flags, 0);
+
+	return (__db_no_partition(dbp->env));
+}
+/*
+ * __part_fileid_reset --
+ *	reset the fileid on each partition.
+ *
+ * PUBLIC: int __part_fileid_reset
+ * PUBLIC:	__P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int));
+ */
+int
+__part_fileid_reset(env, ip, fname, nparts, encrypted)
+	ENV *env;
+	DB_THREAD_INFO *ip;
+	const char *fname;
+	u_int32_t nparts;
+	int encrypted;
+{
+	/* Stub: quiet the unused parameters, then fail. */
+	COMPQUIET(encrypted, 0);
+	COMPQUIET(nparts, 0);
+	COMPQUIET(fname, NULL);
+	COMPQUIET(ip, NULL);
+
+	return (__db_no_partition(env));
+}
+/*
+ * __partition_set_dirs --
+ *	Set the directories for creating the partition databases.
+ *	They must be in the environment.
+ *	Stub used when partition support is not compiled in; always
+ *	fails via __db_no_partition.
+ * PUBLIC: int __partition_set_dirs __P((DB *, const char **));
+ */
+int
+__partition_set_dirs(dbp, dirp)
+	DB *dbp;
+	const char **dirp;
+{
+	/* Quiet the unused parameter. */
+	COMPQUIET(dirp, NULL);
+
+	return (__db_no_partition(dbp->env));
+}
+#endif