Diffstat (limited to 'db/btree')
-rw-r--r--  db/btree/bt_compare.c    211
-rw-r--r--  db/btree/bt_conv.c        98
-rw-r--r--  db/btree/bt_curadj.c     573
-rw-r--r--  db/btree/bt_cursor.c    2131
-rw-r--r--  db/btree/bt_delete.c     530
-rw-r--r--  db/btree/bt_method.c     387
-rw-r--r--  db/btree/bt_open.c       468
-rw-r--r--  db/btree/bt_put.c        859
-rw-r--r--  db/btree/bt_rec.c       1219
-rw-r--r--  db/btree/bt_reclaim.c     53
-rw-r--r--  db/btree/bt_recno.c     1369
-rw-r--r--  db/btree/bt_rsearch.c    429
-rw-r--r--  db/btree/bt_search.c     471
-rw-r--r--  db/btree/bt_split.c     1126
-rw-r--r--  db/btree/bt_stat.c       480
-rw-r--r--  db/btree/bt_upgrade.c    164
-rw-r--r--  db/btree/bt_verify.c    2237
-rw-r--r--  db/btree/btree.src       296
-rw-r--r--  db/btree/btree_auto.c   2284
19 files changed, 15385 insertions, 0 deletions
diff --git a/db/btree/bt_compare.c b/db/btree/bt_compare.c
new file mode 100644
index 000000000..91481c313
--- /dev/null
+++ b/db/btree/bt_compare.c
@@ -0,0 +1,211 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_compare.c,v 11.12 2000/10/26 19:00:28 krinsky Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+/*
+ * __bam_cmp --
+ * Compare a key to a given record.
+ *
+ * PUBLIC: int __bam_cmp __P((DB *, const DBT *, PAGE *,
+ * PUBLIC: u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__bam_cmp(dbp, dbt, h, indx, func, cmpp)
+ DB *dbp;
+ const DBT *dbt;
+ PAGE *h;
+ u_int32_t indx;
+ int (*func)__P((DB *, const DBT *, const DBT *));
+ int *cmpp;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBT pg_dbt;
+
+ /*
+ * Returns:
+ * < 0 if dbt is < page record
+ * = 0 if dbt is = page record
+ * > 0 if dbt is > page record
+ *
+ * !!!
+ * We do not clear the pg_dbt DBT even though it's likely to contain
+ * random bits. That should be okay, because the app's comparison
+ * routine had better not be looking at fields other than data/size.
+ * We don't clear it because we go through this path a lot and it's
+ * expensive.
+ */
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)bk;
+ else {
+ pg_dbt.data = bk->data;
+ pg_dbt.size = bk->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ case P_IBTREE:
+ /*
+ * The following code guarantees that the left-most key on an
+ * internal page at any place in the tree sorts less than any
+ * user-specified key. The reason is that if we have reached
+ * this internal page, we know the user key must sort greater
+ * than the key we're storing for this page in any internal
+ * pages at levels above us in the tree. It then follows that
+ * any user-specified key cannot sort less than the first page
+ * which we reference, and so there's no reason to call the
+ * comparison routine. While this may save us a comparison
+ * routine call or two, the real reason for this is because
+ * we don't maintain a copy of the smallest key in the tree,
+ * so that we don't have to update all the levels of the tree
+ * should the application store a new smallest key. And, so,
+ * we may not have a key to compare, which makes doing the
+ * comparison difficult and error prone.
+ */
+ if (indx == 0) {
+ *cmpp = 1;
+ return (0);
+ }
+
+ bi = GET_BINTERNAL(h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)(bi->data);
+ else {
+ pg_dbt.data = bi->data;
+ pg_dbt.size = bi->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp, PGNO(h)));
+ }
+
+ /*
+ * Overflow.
+ */
+ return (__db_moff(dbp, dbt,
+ bo->pgno, bo->tlen, func == __bam_defcmp ? NULL : func, cmpp));
+}
+
+/*
+ * __bam_defcmp --
+ * Default comparison routine.
+ *
+ * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_defcmp(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+ * Returns:
+ * < 0 if a is < b
+ * = 0 if a is = b
+ * > 0 if a is > b
+ *
+ * XXX
+ * If a size_t doesn't fit into a long, or if the difference between
+ * any two characters doesn't fit into an int, this routine can lose.
+ * What we need is a signed integral type that's guaranteed to be at
+ * least as large as a size_t, and there is no such thing.
+ */
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+ if (*p1 != *p2)
+ return ((long)*p1 - (long)*p2);
+ return ((long)a->size - (long)b->size);
+}
+
+/*
+ * __bam_defpfx --
+ * Default prefix routine.
+ *
+ * PUBLIC: size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
+ */
+size_t
+__bam_defpfx(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t cnt, len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ cnt = 1;
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt)
+ if (*p1 != *p2)
+ return (cnt);
+
+ /*
+ * We know that a->size must be <= b->size, or they wouldn't be
+ * in this order.
+ */
+ return (a->size < b->size ? a->size + 1 : a->size);
+}
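
Aside: __bam_cmp dispatches either to the application's comparison function or
to __bam_defcmp above, which compares byte-by-byte over the shorter key and
breaks ties by key length. As a hedged illustration (not part of this commit),
an application can supply its own ordering through the documented
DB->set_bt_compare() interface; the sketch below, using the hypothetical name
compare_u32, orders keys as native-order u_int32_t values:

#include <string.h>
#include <db.h>

/*
 * compare_u32 --
 *	Illustrative comparator with the same contract as __bam_defcmp:
 *	return < 0, 0, or > 0 as a sorts before, equal to, or after b.
 *	Assumes every key is a native-order u_int32_t; memcpy avoids
 *	unaligned access through the DBT data pointer.
 */
static int
compare_u32(DB *dbp, const DBT *a, const DBT *b)
{
	u_int32_t ai, bi;

	(void)dbp;		/* Unused, as in __bam_defcmp. */
	memcpy(&ai, a->data, sizeof(u_int32_t));
	memcpy(&bi, b->data, sizeof(u_int32_t));
	return (ai < bi ? -1 : (ai > bi ? 1 : 0));
}

Installed before DB->open, e.g. dbp->set_bt_compare(dbp, compare_u32), after
which the tree sorts by integer value rather than by raw byte order.
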
diff --git a/db/btree/bt_conv.c b/db/btree/bt_conv.c
new file mode 100644
index 000000000..fd30f375f
--- /dev/null
+++ b/db/btree/bt_conv.c
@@ -0,0 +1,98 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_conv.c,v 11.6 2000/03/31 00:30:26 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_swap.h"
+#include "btree.h"
+
+/*
+ * __bam_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgin(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!pginfo->needswap)
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(pp) :
+ __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 1));
+}
+
+/*
+ * __bam_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgout(dbenv, pg, pp, cookie)
+ DB_ENV *dbenv;
+ db_pgno_t pg;
+ void *pp;
+ DBT *cookie;
+{
+ DB_PGINFO *pginfo;
+ PAGE *h;
+
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!pginfo->needswap)
+ return (0);
+
+ h = pp;
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(pp) :
+ __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 0));
+}
+
+/*
+ * __bam_mswap --
+ * Swap the bytes on the btree metadata page.
+ *
+ * PUBLIC: int __bam_mswap __P((PAGE *));
+ */
+int
+__bam_mswap(pg)
+ PAGE *pg;
+{
+ u_int8_t *p;
+
+ __db_metaswap(pg);
+
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ SWAP32(p); /* maxkey */
+ SWAP32(p); /* minkey */
+ SWAP32(p); /* re_len */
+ SWAP32(p); /* re_pad */
+ SWAP32(p); /* root */
+
+ return (0);
+}
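
Aside: the five SWAP32 calls above walk a byte pointer across the fixed-width
fields that follow the generic DBMETA header. As a minimal sketch (an
assumption about what the db_swap.h macro amounts to, not a copy of it), an
in-place 32-bit swap-and-advance can be written as:

/*
 * MY_SWAP32 --
 *	Hypothetical stand-in for SWAP32: reverse the four bytes at p in
 *	place, then advance p past the field, so consecutive calls visit
 *	consecutive u_int32_t fields of the metadata page.
 */
#define	MY_SWAP32(p) do {						\
	u_int8_t __tmp;							\
	__tmp = (p)[0]; (p)[0] = (p)[3]; (p)[3] = __tmp;		\
	__tmp = (p)[1]; (p)[1] = (p)[2]; (p)[2] = __tmp;		\
	(p) += sizeof(u_int32_t);					\
} while (0)
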
diff --git a/db/btree/bt_curadj.c b/db/btree/bt_curadj.c
new file mode 100644
index 000000000..011acd2f4
--- /dev/null
+++ b/db/btree/bt_curadj.c
@@ -0,0 +1,573 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_curadj.c,v 11.20 2001/01/17 16:15:49 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "txn.h"
+
+static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t));
+
+#ifdef DEBUG
+/*
+ * __bam_cprint --
+ * Display the current internal cursor.
+ *
+ * PUBLIC: void __bam_cprint __P((DBC *));
+ */
+void
+__bam_cprint(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ fprintf(stderr, "\tinternal: ovflsize: %lu", (u_long)cp->ovflsize);
+ if (dbc->dbtype == DB_RECNO)
+ fprintf(stderr, " recno: %lu", (u_long)cp->recno);
+ if (F_ISSET(cp, C_DELETED))
+ fprintf(stderr, " (deleted)");
+ fprintf(stderr, "\n");
+}
+#endif
+
+/*
+ * Cursor adjustments are logged if they are for subtransactions. This is
+ * because it's possible for a subtransaction to adjust cursors which will
+ * still be active after the subtransaction aborts, and so which must be
+ * restored to their previous locations. Cursors that can be both affected
+ * by our cursor adjustments and active after our transaction aborts can
+ * only be found in our parent transaction -- cursors in other transactions,
+ * including other child transactions of our parent, must have conflicting
+ * locker IDs, and so cannot be affected by adjustments in this transaction.
+ */
+
+/*
+ * __bam_ca_delete --
+ * Update the cursors when items are deleted and when already deleted
+ * items are overwritten. Return the number of relevant cursors found.
+ *
+ * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_delete(dbp, pgno, indx, delete)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int delete;
+{
+ BTREE_CURSOR *cp;
+ DB *ldbp;
+ DB_ENV *dbenv;
+ DBC *dbc;
+ int count; /* !!!: Has to contain max number of cursors. */
+
+ dbenv = dbp->dbenv;
+
+ /*
+ * Adjust the cursors. We have the page write locked, so the
+ * only other cursors that can be pointing at a page are
+ * those in the same thread of control. Unfortunately, we don't
+ * know that they're using the same DB handle, so traverse
+ * all matching DB handles in the same DB_ENV, then all cursors
+ * on each matching DB handle.
+ *
+ * Each cursor is single-threaded, so we only need to lock the
+ * list of DBs and then the list of cursors in each DB.
+ */
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (count = 0, ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp->pgno == pgno && cp->indx == indx) {
+ if (delete)
+ F_SET(cp, C_DELETED);
+ else
+ F_CLR(cp, C_DELETED);
+ ++count;
+ }
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ return (count);
+}
+
+/*
+ * __ram_ca_delete --
+ * Return the number of relevant cursors.
+ *
+ * PUBLIC: int __ram_ca_delete __P((DB *, db_pgno_t));
+ */
+int
+__ram_ca_delete(dbp, root_pgno)
+ DB *dbp;
+ db_pgno_t root_pgno;
+{
+ DB *ldbp;
+ DBC *dbc;
+ DB_ENV *dbenv;
+ int found;
+
+ found = 0;
+ dbenv = dbp->dbenv;
+
+ /*
+ * Review the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ found == 0 && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ found == 0 && dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
+ if (dbc->internal->root == root_pgno)
+ found = 1;
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+ return (found);
+}
+
+/*
+ * __bam_ca_di --
+ * Adjust the cursors during a delete or insert.
+ *
+ * PUBLIC: int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_di(my_dbc, pgno, indx, adjust)
+ DBC *my_dbc;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int adjust;
+{
+ DB *dbp, *ldbp;
+ DB_ENV *dbenv;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ dbenv = dbp->dbenv;
+
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ found = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ cp = dbc->internal;
+ if (cp->pgno == pgno && cp->indx >= indx) {
+ /* Cursor indices should never be negative. */
+ DB_ASSERT(cp->indx != 0 || adjust > 0);
+
+ cp->indx += adjust;
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ }
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ if (found != 0 && DB_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbenv,
+ my_dbc->txn, &lsn, 0, dbp->log_fileid,
+ DB_CA_DI, pgno, 0, 0, adjust, indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_opd_cursor -- create a new opd cursor.
+ */
+static int
+__bam_opd_cursor(dbp, dbc, first, tpgno, ti)
+ DB *dbp;
+ DBC *dbc;
+ db_pgno_t tpgno;
+ u_int32_t first, ti;
+{
+ BTREE_CURSOR *cp, *orig_cp;
+ DBC *dbc_nopd;
+ int ret;
+
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ dbc_nopd = NULL;
+
+ /*
+ * Allocate a new cursor and create the stack. If duplicates
+ * are sorted, we've just created an off-page duplicate Btree.
+ * If duplicates aren't sorted, we've just created a Recno tree.
+ */
+ if ((ret = __db_c_newopd(dbc, tpgno, &dbc_nopd)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc_nopd->internal;
+ cp->pgno = tpgno;
+ cp->indx = ti;
+
+ if (dbp->dup_compare == NULL) {
+ /*
+ * Converting to off-page Recno trees is tricky. The
+ * record number for the cursor is the index + 1 (to
+ * convert to 1-based record numbers).
+ */
+ cp->recno = ti + 1;
+ }
+
+ /*
+ * Transfer the deleted flag from the top-level cursor to the
+ * created one.
+ */
+ if (F_ISSET(orig_cp, C_DELETED)) {
+ F_SET(cp, C_DELETED);
+ F_CLR(orig_cp, C_DELETED);
+ }
+
+ /* Stack the cursors and reset the initial cursor's index. */
+ orig_cp->opd = dbc_nopd;
+ orig_cp->indx = first;
+ return (0);
+}
+
+/*
+ * __bam_ca_dup --
+ * Adjust the cursors when moving items from a leaf page to a duplicates
+ * page.
+ *
+ * PUBLIC: int __bam_ca_dup __P((DBC *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_dup(my_dbc, first, fpgno, fi, tpgno, ti)
+ DBC *my_dbc;
+ db_pgno_t fpgno, tpgno;
+ u_int32_t first, fi, ti;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DB_ENV *dbenv;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ dbenv = dbp->dbenv;
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ found = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+loop: MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ /* Find cursors pointing to this record. */
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ if (orig_cp->pgno != fpgno || orig_cp->indx != fi)
+ continue;
+
+ /*
+ * Since we rescan the list, see if this cursor has already
+ * been converted.
+ */
+ if (orig_cp->opd != NULL)
+ continue;
+
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ if ((ret = __bam_opd_cursor(dbp,
+ dbc, first, tpgno, ti)) != 0)
+ return (ret);
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ /* We released the MUTEX to get a cursor, start over. */
+ goto loop;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ if (found != 0 && DB_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbenv,
+ my_dbc->txn, &lsn, 0, dbp->log_fileid,
+ DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_undodup --
+ * Adjust the cursors when returning items to a leaf page
+ * from a duplicate page.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undodup __P((DB *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__bam_ca_undodup(dbp, first, fpgno, fi, ti)
+ DB *dbp;
+ db_pgno_t fpgno;
+ u_int32_t first, fi, ti;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *ldbp;
+ DBC *dbc;
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = dbp->dbenv;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+loop: MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (orig_cp->pgno != fpgno ||
+ orig_cp->indx != first ||
+ ((BTREE_CURSOR *)orig_cp->opd->internal)->indx
+ != ti)
+ continue;
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ if ((ret = orig_cp->opd->c_close(orig_cp->opd)) != 0)
+ return (ret);
+ orig_cp->opd = NULL;
+ orig_cp->indx = fi;
+ /*
+ * We released the MUTEX to free a cursor,
+ * start over.
+ */
+ goto loop;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ return (0);
+}
+
+/*
+ * __bam_ca_rsplit --
+ * Adjust the cursors when doing reverse splits.
+ *
+ * PUBLIC: int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t));
+ */
+int
+__bam_ca_rsplit(my_dbc, fpgno, tpgno)
+ DBC *my_dbc;
+ db_pgno_t fpgno, tpgno;
+{
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DB_ENV *dbenv;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ dbenv = dbp->dbenv;
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ found = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ if (dbc->internal->pgno == fpgno) {
+ dbc->internal->pgno = tpgno;
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ }
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ if (found != 0 && DB_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbenv,
+ my_dbc->txn, &lsn, 0, dbp->log_fileid,
+ DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_split --
+ * Adjust the cursors when splitting a page.
+ *
+ * PUBLIC: int __bam_ca_split __P((DBC *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft)
+ DBC *my_dbc;
+ db_pgno_t ppgno, lpgno, rpgno;
+ u_int32_t split_indx;
+ int cleft;
+{
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ DB_ENV *dbenv;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ dbenv = dbp->dbenv;
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * If splitting the page that a cursor was on, the cursor has to be
+ * adjusted to point to the same record as before the split. Most
+ * of the time we don't adjust pointers to the left page, because
+ * we're going to copy its contents back over the original page. If
+ * the cursor is on the right page, it is decremented by the number of
+ * records split to the left page.
+ */
+ found = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ cp = dbc->internal;
+ if (cp->pgno == ppgno) {
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ if (cp->indx < split_indx) {
+ if (cleft)
+ cp->pgno = lpgno;
+ } else {
+ cp->pgno = rpgno;
+ cp->indx -= split_indx;
+ }
+ }
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ if (found != 0 && DB_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbenv, my_dbc->txn,
+ &lsn, 0, dbp->log_fileid, DB_CA_SPLIT, ppgno, rpgno,
+ cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_ca_undosplit --
+ * Adjust the cursors when undoing a split of a page.
+ * If we grew a level we will execute this for both the
+ * left and the right pages.
+ * Called only during undo processing.
+ *
+ * PUBLIC: void __bam_ca_undosplit __P((DB *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t));
+ */
+void
+__bam_ca_undosplit(dbp, frompgno, topgno, lpgno, split_indx)
+ DB *dbp;
+ db_pgno_t frompgno, topgno, lpgno;
+ u_int32_t split_indx;
+{
+ DB *ldbp;
+ DBC *dbc;
+ DB_ENV *dbenv;
+ DBC_INTERNAL *cp;
+
+ dbenv = dbp->dbenv;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * When backing out a split, we move the cursor back
+ * to the original offset and bump it by the split_indx.
+ */
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ cp = dbc->internal;
+ if (cp->pgno == topgno) {
+ cp->pgno = frompgno;
+ cp->indx += split_indx;
+ } else if (cp->pgno == lpgno)
+ cp->pgno = frompgno;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+}
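
Aside: each __bam_ca_* routine in this file repeats one traversal pattern:
under dblist_mutexp, walk every DB handle in the environment that shares this
file's adj_fileid, and under the handle mutex, walk that handle's active
cursor queue. Distilled from the code above (same names, nothing new), the
skeleton is:

	MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
	for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
	    ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
	    ldbp = LIST_NEXT(ldbp, dblistlinks)) {
		MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
		for (dbc = TAILQ_FIRST(&ldbp->active_queue);
		    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
			/* Examine or adjust dbc->internal here. */
		}
		MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
	}
	MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);

Routines that must drop the handle mutex mid-scan to open or close a cursor
(__bam_ca_dup, __bam_ca_undodup) restart the inner loop from their "loop"
label afterward, since the queue may have changed while unlocked.
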
diff --git a/db/btree/bt_cursor.c b/db/btree/bt_cursor.c
new file mode 100644
index 000000000..84ab7c807
--- /dev/null
+++ b/db/btree/bt_cursor.c
@@ -0,0 +1,2131 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_cursor.c,v 11.88 2001/01/11 18:19:49 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "btree.h"
+#include "lock.h"
+#include "qam.h"
+#include "common_ext.h"
+
+static int __bam_c_close __P((DBC *, db_pgno_t, int *));
+static int __bam_c_del __P((DBC *));
+static int __bam_c_destroy __P((DBC *));
+static int __bam_c_first __P((DBC *));
+static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bam_c_getstack __P((DBC *));
+static int __bam_c_last __P((DBC *));
+static int __bam_c_next __P((DBC *, int));
+static int __bam_c_physdel __P((DBC *));
+static int __bam_c_prev __P((DBC *));
+static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static void __bam_c_reset __P((BTREE_CURSOR *));
+static int __bam_c_search __P((DBC *, const DBT *, u_int32_t, int *));
+static int __bam_c_writelock __P((DBC *));
+static int __bam_getboth_finddatum __P((DBC *, DBT *));
+static int __bam_getbothc __P((DBC *, DBT *));
+static int __bam_isopd __P((DBC *, db_pgno_t *));
+
+/*
+ * Acquire a new page/lock. If we hold a page/lock, discard the page, and
+ * lock-couple the lock.
+ *
+ * !!!
+ * We have to handle both where we have a lock to lock-couple and where we
+ * don't -- we don't duplicate locks when we duplicate cursors if we are
+ * running in a transaction environment as there's no point if locks are
+ * never discarded. This means that the cursor may or may not hold a lock.
+ */
+#undef ACQUIRE
+#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, ret) {\
+ if ((pagep) != NULL) { \
+ ret = memp_fput((dbc)->dbp->mpf, pagep, 0); \
+ pagep = NULL; \
+ } else \
+ ret = 0; \
+ if ((ret) == 0 && STD_LOCKING(dbc)) \
+ ret = __db_lget(dbc, \
+ (lock).off == LOCK_INVALID ? 0 : LCK_COUPLE, \
+ lpgno, mode, 0, &lock); \
+ else \
+ (lock).off = LOCK_INVALID; \
+ if ((ret) == 0) \
+ ret = memp_fget((dbc)->dbp->mpf, &(fpgno), 0, &(pagep));\
+}
+
+/* Acquire a new page/lock for a cursor. */
+#undef ACQUIRE_CUR
+#define ACQUIRE_CUR(dbc, mode, ret) { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ ACQUIRE(dbc, mode, \
+ __cp->pgno, __cp->lock, __cp->pgno, __cp->page, ret); \
+ if ((ret) == 0) \
+ __cp->lock_mode = (mode); \
+}
+
+/*
+ * Acquire a new page/lock for a cursor, and move the cursor on success.
+ * This is a separate macro because we don't want to set the pgno/indx
+ * fields in the cursor until we actually have the lock; otherwise, the
+ * cursor adjust routines will adjust the cursor even though
+ * we're not really on the page.
+ */
+#undef ACQUIRE_CUR_SET
+#define ACQUIRE_CUR_SET(dbc, mode, p, ret) { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, ret); \
+ if ((ret) == 0) { \
+ __cp->pgno = p; \
+ __cp->indx = 0; \
+ __cp->lock_mode = (mode); \
+ } \
+}
+
+/*
+ * Acquire a write lock if we don't already have one.
+ *
+ * !!!
+ * See ACQUIRE macro on why we handle cursors that don't have locks.
+ */
+#undef ACQUIRE_WRITE_LOCK
+#define ACQUIRE_WRITE_LOCK(dbc, ret) { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ ret = 0; \
+ if (STD_LOCKING(dbc) && \
+ __cp->lock_mode != DB_LOCK_WRITE && \
+ ((ret) = __db_lget(dbc, \
+ __cp->lock.off == LOCK_INVALID ? 0 : LCK_COUPLE, \
+ __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) == 0) \
+ __cp->lock_mode = DB_LOCK_WRITE; \
+}
+
+/* Discard the current page/lock. */
+#undef DISCARD
+#define DISCARD(dbc, ldiscard, lock, pagep, ret) { \
+ int __t_ret; \
+ if ((pagep) != NULL) { \
+ ret = memp_fput((dbc)->dbp->mpf, pagep, 0); \
+ pagep = NULL; \
+ } else \
+ ret = 0; \
+ if ((lock).off != LOCK_INVALID) { \
+ __t_ret = ldiscard ? \
+ __LPUT((dbc), lock): __TLPUT((dbc), lock); \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ (lock).off = LOCK_INVALID; \
+ } \
+}
+
+/* Discard the current page/lock for a cursor. */
+#undef DISCARD_CUR
+#define DISCARD_CUR(dbc, ret) { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DISCARD(dbc, 0, __cp->lock, __cp->page, ret); \
+ if ((ret) == 0) \
+ __cp->lock_mode = DB_LOCK_NG; \
+}
+
+/* If on-page item is a deleted record. */
+#undef IS_DELETED
+#define IS_DELETED(page, indx) \
+ B_DISSET(GET_BKEYDATA(page, \
+ (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type)
+#undef IS_CUR_DELETED
+#define IS_CUR_DELETED(dbc) \
+ IS_DELETED((dbc)->internal->page, (dbc)->internal->indx)
+
+/*
+ * Test to see if two cursors could point to duplicates of the same key.
+ * In the case of off-page duplicates they are the same, as the cursors
+ * will be in the same off-page duplicate tree. In the case of on-page
+ * duplicates, the key index offsets must be the same. For the last test,
+ * as the original cursor may not have a valid page pointer, we use the
+ * current cursor's.
+ */
+#undef IS_DUPLICATE
+#define IS_DUPLICATE(dbc, i1, i2) \
+ (((PAGE *)(dbc)->internal->page)->inp[i1] == \
+ ((PAGE *)(dbc)->internal->page)->inp[i2])
+#undef IS_CUR_DUPLICATE
+#define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \
+ (F_ISSET(dbc, DBC_OPD) || \
+ (orig_pgno == (dbc)->internal->pgno && \
+ IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx)))
+
+/*
+ * __bam_c_reset --
+ * Initialize internal cursor structure.
+ */
+static void
+__bam_c_reset(cp)
+ BTREE_CURSOR *cp;
+{
+ cp->csp = cp->sp;
+ cp->lock.off = LOCK_INVALID;
+ cp->lock_mode = DB_LOCK_NG;
+ cp->recno = RECNO_OOB;
+ cp->order = INVALID_ORDER;
+ cp->flags = 0;
+}
+
+/*
+ * __bam_c_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __bam_c_init __P((DBC *, DBTYPE));
+ */
+int
+__bam_c_init(dbc, dbtype)
+ DBC *dbc;
+ DBTYPE dbtype;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int ret;
+ u_int32_t minkey;
+
+ dbp = dbc->dbp;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL) {
+ if ((ret = __os_malloc(dbp->dbenv,
+ sizeof(BTREE_CURSOR), NULL, &cp)) != 0)
+ return (ret);
+ dbc->internal = (DBC_INTERNAL *)cp;
+
+ cp->sp = cp->csp = cp->stack;
+ cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
+ } else
+ cp = (BTREE_CURSOR *)dbc->internal;
+ __bam_c_reset(cp);
+
+ /* Initialize methods. */
+ dbc->c_close = __db_c_close;
+ dbc->c_count = __db_c_count;
+ dbc->c_del = __db_c_del;
+ dbc->c_dup = __db_c_dup;
+ dbc->c_get = __db_c_get;
+ dbc->c_put = __db_c_put;
+ if (dbtype == DB_BTREE) {
+ dbc->c_am_close = __bam_c_close;
+ dbc->c_am_del = __bam_c_del;
+ dbc->c_am_destroy = __bam_c_destroy;
+ dbc->c_am_get = __bam_c_get;
+ dbc->c_am_put = __bam_c_put;
+ dbc->c_am_writelock = __bam_c_writelock;
+ } else {
+ dbc->c_am_close = __bam_c_close;
+ dbc->c_am_del = __ram_c_del;
+ dbc->c_am_destroy = __bam_c_destroy;
+ dbc->c_am_get = __ram_c_get;
+ dbc->c_am_put = __ram_c_put;
+ dbc->c_am_writelock = __bam_c_writelock;
+ }
+
+ /*
+ * The btree leaf page data structures require that two key/data pairs
+ * (or four items) fit on a page, but other than that there's no fixed
+ * requirement. The btree off-page duplicates only require two items,
+ * to be exact, but requiring four for them as well seems reasonable.
+ *
+ * Recno uses the btree bt_ovflsize value -- it's close enough.
+ */
+ t = dbp->bt_internal;
+ minkey = F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey;
+ cp->ovflsize = B_MINKEY_TO_OVFLSIZE(minkey, dbp->pgsize);
+
+ return (0);
+}
+
+/*
+ * __bam_c_refresh
+ * Set things up properly for cursor re-use.
+ *
+ * PUBLIC: int __bam_c_refresh __P((DBC *));
+ */
+int
+__bam_c_refresh(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ __bam_c_reset(cp);
+
+ /*
+ * If our caller set the root page number, it's because the root was
+ * known. This is always the case for off-page dup cursors. Else,
+ * pull it out of our internal information.
+ */
+ if (cp->root == PGNO_INVALID)
+ cp->root = ((BTREE *)dbp->bt_internal)->bt_root;
+
+ /* Initialize for record numbers. */
+ if (F_ISSET(dbc, DBC_OPD) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) {
+ F_SET(cp, C_RECNUM);
+
+ /*
+ * All btrees that support record numbers, standard recno trees
+ * configured to renumber records, and all off-page duplicate
+ * recno trees have mutable record numbers.
+ */
+ if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) ||
+ F_ISSET(dbp, DB_BT_RECNUM | DB_RE_RENUMBER))
+ F_SET(cp, C_RENUMBER);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_c_close --
+ * Close down the cursor.
+ */
+static int
+__bam_c_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ BTREE_CURSOR *cp, *cp_opd, *cp_c;
+ DB *dbp;
+ DBC *dbc_opd, *dbc_c;
+ PAGE *h;
+ u_int32_t num;
+ int cdb_lock, ret, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp_opd = (dbc_opd = cp->opd) == NULL ?
+ NULL : (BTREE_CURSOR *)dbc_opd->internal;
+ cdb_lock = ret = 0;
+
+ /*
+ * There are 3 ways this function is called:
+ *
+ * 1. Closing a primary cursor: we get called with a pointer to a
+ * primary cursor that has a NULL opd field. This happens when
+ * closing a btree/recno database cursor without an associated
+ * off-page duplicate tree.
+ *
+ * 2. Closing a primary and an off-page duplicate cursor stack: we
+ * get called with a pointer to the primary cursor which has a
+ * non-NULL opd field. This happens when closing a btree cursor
+ * into a database with an associated off-page btree/recno duplicate
+ * tree. (It can't be a primary recno database, recno databases
+ * don't support duplicates.)
+ *
+ * 3. Closing an off-page duplicate cursor stack: we get called with
+ * a pointer to the off-page duplicate cursor. This happens when
+ * closing a non-btree database that has an associated off-page
+ * btree/recno duplicate tree or for a btree database when the
+ * opd tree is not empty (root_pgno == PGNO_INVALID).
+ *
+ * If either the primary or off-page duplicate cursor deleted a btree
+ * key/data pair, check to see if the item is still referenced by a
+ * different cursor. If it is, confirm that cursor's delete flag is
+ * set and leave it to that cursor to do the delete.
+ *
+ * NB: The test for == 0 below is correct. Our caller already removed
+ * our cursor argument from the active queue, we won't find it when we
+ * search the queue in __bam_ca_delete().
+ * NB: It can't be true that both the primary and off-page duplicate
+ * cursors have deleted a btree key/data pair. Either the primary
+ * cursor may have deleted an item and there's no off-page duplicate
+ * cursor, or there's an off-page duplicate cursor and it may have
+ * deleted an item.
+ *
+ * Primary recno databases aren't an issue here. Recno keys are either
+ * deleted immediately or never deleted, and do not have to be handled
+ * here.
+ *
+ * Off-page duplicate recno databases are an issue here, cases #2 and
+ * #3 above can both be off-page recno databases. The problem is the
+ * same as the final problem for off-page duplicate btree databases.
+ * If we no longer need the off-page duplicate tree, we want to remove
+ * it. For off-page duplicate btrees, we are done with the tree when
+ * we delete the last item it contains, i.e., there can be no further
+ * references to it when it's empty. For off-page duplicate recnos,
+ * we remove items from the tree as the application calls the remove
+ * function, so we are done with the tree when we close the last cursor
+ * that references it.
+ *
+ * We optionally take the root page number from our caller. If the
+ * primary database is a btree, we can get it ourselves because dbc
+ * is the primary cursor. If the primary database is not a btree,
+ * the problem is that we may be dealing with a stack of pages. The
+ * cursor we're using to do the delete points at the bottom of that
+ * stack and we need the top of the stack.
+ */
+ if (F_ISSET(cp, C_DELETED)) {
+ dbc_c = dbc;
+ switch (dbc->dbtype) {
+ case DB_BTREE: /* Case #1, #3. */
+ if (__bam_ca_delete(dbp, cp->pgno, cp->indx, 1) == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if (!F_ISSET(dbc, DBC_OPD)) /* Case #1. */
+ goto done;
+ /* Case #3. */
+ if (__ram_ca_delete(dbp, cp->root) == 0)
+ goto lock;
+ goto done;
+ default:
+ return (__db_unknown_type(dbp->dbenv,
+ "__bam_c_close", dbc->dbtype));
+ }
+ }
+
+ if (dbc_opd == NULL)
+ goto done;
+
+ if (F_ISSET(cp_opd, C_DELETED)) { /* Case #2. */
+ /*
+ * We will not have been provided a root page number. Acquire
+ * one from the primary database.
+ */
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
+ goto err;
+ root_pgno = GET_BOVERFLOW(h, cp->indx + O_INDX)->pgno;
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ goto err;
+
+ dbc_c = dbc_opd;
+ switch (dbc_opd->dbtype) {
+ case DB_BTREE:
+ if (__bam_ca_delete(
+ dbp, cp_opd->pgno, cp_opd->indx, 1) == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if (__ram_ca_delete(dbp, cp_opd->root) == 0)
+ goto lock;
+ goto done;
+ default:
+ return (__db_unknown_type(dbp->dbenv,
+ "__bam_c_close", dbc->dbtype));
+ }
+ }
+ goto done;
+
+lock: cp_c = (BTREE_CURSOR *)dbc_c->internal;
+
+ /*
+ * If this is CDB, upgrade the lock if necessary. While we acquired
+ * the write lock to logically delete the record, we released it when
+ * we returned from that call, and so may not be holding a write lock
+ * at the moment. NB: to get here in CDB we must either be holding a
+ * write lock or be the only cursor that is permitted to acquire write
+ * locks. The reason is that there can never be more than a single CDB
+ * write cursor (that cursor cannot be dup'd), and so that cursor must
+ * be closed and the item therefore deleted before any other cursor
+ * could acquire a reference to this item.
+ *
+ * Note that dbc may be an off-page dup cursor; this is the sole
+ * instance in which an OPD cursor does any locking, but it's necessary
+ * because we may be closed by ourselves without a parent cursor
+ * handy, and we have to do a lock upgrade on behalf of somebody.
+ * If this is the case, the OPD has been given the parent's locking
+ * info in __db_c_get--the OPD is also a WRITEDUP.
+ */
+ if (CDB_LOCKING(dbp->dbenv)) {
+ DB_ASSERT(!F_ISSET(dbc, DBC_OPD) || F_ISSET(dbc, DBC_WRITEDUP));
+ if (!F_ISSET(dbc, DBC_WRITER)) {
+ if ((ret =
+ lock_get(dbp->dbenv, dbc->locker, DB_LOCK_UPGRADE,
+ &dbc->lock_dbt, DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ cdb_lock = 1;
+ }
+
+ cp_c->lock.off = LOCK_INVALID;
+ if ((ret =
+ memp_fget(dbp->mpf, &cp_c->pgno, 0, &cp_c->page)) != 0)
+ goto err;
+
+ goto delete;
+ }
+
+ /*
+ * The variable dbc_c has been initialized to reference the cursor in
+ * which we're going to do the delete. Initialize the cursor's page
+ * and lock structures as necessary.
+ *
+ * First, we may not need to acquire any locks. If we're in case #3,
+ * that is, the primary database isn't a btree database, our caller
+ * is responsible for acquiring any necessary locks before calling us.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp_c->lock.off = LOCK_INVALID;
+ if ((ret =
+ memp_fget(dbp->mpf, &cp_c->pgno, 0, &cp_c->page)) != 0)
+ goto err;
+ goto delete;
+ }
+
+ /*
+ * Otherwise, acquire a write lock. If the cursor that did the initial
+ * logical deletion (and which had a write lock) is not the same as the
+ * cursor doing the physical deletion (which may have only ever had a
+ * read lock on the item), we need to upgrade. The confusion comes as
+ * follows:
+ *
+ * C1 created, acquires item read lock
+ * C2 dup C1, create C2, also has item read lock.
+ * C1 acquire write lock, delete item
+ * C1 close
+ * C2 close, needs a write lock to physically delete item.
+ *
+ * If we're in a TXN, we know that C2 will be able to acquire the write
+ * lock, because no locker other than the one shared by C1 and C2 can
+ * acquire a write lock -- the original write lock C1 acquired was never
+ * discarded.
+ *
+ * If we're not in a TXN, it's nastier. Other cursors might acquire
+ * read locks on the item after C1 closed, discarding its write lock,
+ * and such locks would prevent C2 from acquiring a write lock. That's
+ * OK, though, we'll simply wait until we can acquire a write lock, or
+ * we'll deadlock. (Which better not happen, since we're not in a TXN.)
+ *
+ * Lock the primary database page, regardless of whether we're deleting
+ * an item on a primary database page or an off-page duplicates page.
+ */
+ ACQUIRE(dbc, DB_LOCK_WRITE,
+ cp->pgno, cp_c->lock, cp_c->pgno, cp_c->page, ret);
+ if (ret != 0)
+ goto err;
+
+delete: /*
+ * If the delete occurred in a btree, delete the on-page physical item
+ * referenced by the cursor.
+ */
+ if (dbc_c->dbtype == DB_BTREE && (ret = __bam_c_physdel(dbc_c)) != 0)
+ goto err;
+
+ /*
+ * If we're not working in an off-page duplicate tree, then we're
+ * done.
+ */
+ if (!F_ISSET(dbc_c, DBC_OPD) || root_pgno == PGNO_INVALID)
+ goto done;
+
+ /*
+ * We may have just deleted the last element in the off-page duplicate
+ * tree, and closed the last cursor in the tree. For an off-page btree
+ * there are no other cursors in the tree by definition, if the tree is
+ * empty. For an off-page recno we know we have closed the last cursor
+ * in the tree because the __ram_ca_delete call above returned 0 only
+ * in that case. So, if the off-page duplicate tree is empty at this
+ * point, we want to remove it.
+ */
+ if ((ret = memp_fget(dbp->mpf, &root_pgno, 0, &h)) != 0)
+ goto err;
+ if ((num = NUM_ENT(h)) == 0) {
+ if ((ret = __db_free(dbc, h)) != 0)
+ goto err;
+ } else {
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * When removing the tree, we have to do one of two things. If this is
+ * case #2, that is, the primary tree is a btree, delete the key that's
+ * associated with the tree from the btree leaf page. We know we are
+ * the only reference to it and we already have the correct lock. We
+ * detect this case because the cursor that was passed to us references
+ * an off-page duplicate cursor.
+ *
+ * If this is case #3, that is, the primary tree isn't a btree, pass
+ * the information back to our caller, it's their job to do cleanup on
+ * the primary page.
+ */
+ if (dbc_opd != NULL) {
+ cp->lock.off = LOCK_INVALID;
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
+ goto err;
+ if ((ret = __bam_c_physdel(dbc)) != 0)
+ goto err;
+ } else
+ *rmroot = 1;
+err:
+done: /*
+ * Discard the page references and locks, and confirm that the stack
+ * has been emptied.
+ */
+ if (dbc_opd != NULL) {
+ DISCARD_CUR(dbc_opd, t_ret);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+ }
+ DISCARD_CUR(dbc, t_ret);
+ if (t_ret != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Downgrade any CDB lock we acquired. */
+ if (cdb_lock)
+ (void)__lock_downgrade(
+ dbp->dbenv, &dbc->mylock, DB_LOCK_IWRITE, 0);
+
+ return (ret);
+}
+
+/*
+ * __bam_c_destroy --
+ * Close a single cursor -- internal version.
+ */
+static int
+__bam_c_destroy(dbc)
+ DBC *dbc;
+{
+ /* Discard the structures. */
+ __os_free(dbc->internal, sizeof(BTREE_CURSOR));
+
+ return (0);
+}
+
+/*
+ * __bam_c_count --
+ * Return a count of on and off-page duplicates.
+ *
+ * PUBLIC: int __bam_c_count __P((DBC *, db_recno_t *));
+ */
+int
+__bam_c_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t indx, top;
+ db_recno_t recno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Called with the top-level cursor that may reference an off-page
+ * duplicates page. If it's a set of on-page duplicates, get the
+ * page and count. Otherwise, get the root page of the off-page
+ * duplicate tree, and use the count. We don't have to acquire any
+ * new locks, we have to have a read lock to even get here.
+ */
+ if (cp->opd == NULL) {
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * Move back to the beginning of the set of duplicates and
+ * then count forward.
+ */
+ for (indx = cp->indx;; indx -= P_INDX)
+ if (indx == 0 ||
+ !IS_DUPLICATE(dbc, indx, indx - P_INDX))
+ break;
+ for (recno = 1, top = NUM_ENT(cp->page) - P_INDX;
+ indx < top; ++recno, indx += P_INDX)
+ if (!IS_DUPLICATE(dbc, indx, indx + P_INDX))
+ break;
+ *recnop = recno;
+ } else {
+ if ((ret = memp_fget(dbp->mpf,
+ &cp->opd->internal->root, 0, &cp->page)) != 0)
+ return (ret);
+
+ *recnop = RE_NREC(cp->page);
+ }
+
+ ret = memp_fput(dbp->mpf, cp->page, 0);
+ cp->page = NULL;
+
+ return (ret);
+}
+
+/*
+ * __bam_c_del --
+ * Delete using a cursor.
+ */
+static int
+__bam_c_del(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /* If the item was already deleted, return failure. */
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_KEYEMPTY);
+
+ /*
+ * This code is always called with a page lock but no page.
+ */
+ DB_ASSERT(cp->page == NULL);
+
+ /*
+ * We don't physically delete the record until the cursor moves, so
+ * we have to have a long-lived write lock on the page instead of
+ * a long-lived read lock. Note, we have to have a read lock to even
+ * get here.
+ *
+ * If we're maintaining record numbers, we lock the entire tree, else
+ * we lock the single page.
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ if ((ret = __bam_c_getstack(dbc)) != 0)
+ goto err;
+ cp->page = cp->csp->page;
+ } else {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, ret);
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Log the change. */
+ if (DB_LOGGING(dbc) &&
+ (ret = __bam_cdel_log(dbp->dbenv, dbc->txn, &LSN(cp->page), 0,
+ dbp->log_fileid, PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0)
+ goto err;
+
+ /* Set the intent-to-delete flag on the page. */
+ if (TYPE(cp->page) == P_LBTREE)
+ B_DSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type);
+ else
+ B_DSET(GET_BKEYDATA(cp->page, cp->indx)->type);
+
+ /* Mark the page dirty. */
+ ret = memp_fset(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
+
+err: /*
+ * If we've been successful so far and the tree has record numbers,
+ * adjust the record counts. Either way, release acquired page(s).
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ if (ret == 0)
+ ret = __bam_adjust(dbc, -1);
+ (void)__bam_stkrel(dbc, 0);
+ } else
+ if (cp->page != NULL &&
+ (t_ret = memp_fput(dbp->mpf, cp->page, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+
+ /* Update the cursors last, after all chance of failure is past. */
+ if (ret == 0)
+ (void)__bam_ca_delete(dbp, cp->pgno, cp->indx, 1);
+
+ return (ret);
+}
+
+/*
+ * __bam_c_dup --
+ * Duplicate a btree cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __bam_c_dup __P((DBC *, DBC *));
+ */
+int
+__bam_c_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ BTREE_CURSOR *orig, *new;
+ int ret;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ /*
+ * If we're holding a lock we need to acquire a copy of it, unless
+ * we're in a transaction. We don't need to copy any lock we're
+ * holding inside a transaction because all the locks are retained
+ * until the transaction commits or aborts.
+ */
+ if (orig->lock.off != LOCK_INVALID && orig_dbc->txn == NULL) {
+ if ((ret = __db_lget(new_dbc,
+ 0, new->pgno, new->lock_mode, 0, &new->lock)) != 0)
+ return (ret);
+ }
+ new->ovflsize = orig->ovflsize;
+ new->recno = orig->recno;
+ new->flags = orig->flags;
+
+ return (0);
+}
+
+/*
+ * __bam_c_get --
+ * Get using a cursor (btree).
+ */
+static int
+__bam_c_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_pgno_t orig_pgno;
+ db_indx_t orig_indx;
+ int exact, newopd, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ orig_pgno = cp->pgno;
+ orig_indx = cp->indx;
+
+ newopd = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ /* It's not possible to return a deleted record. */
+ if (F_ISSET(cp, C_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+ * write lock, but upgrading to a write lock has no better
+ * chance of succeeding now than later, so don't try.
+ */
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_FIRST:
+ newopd = 1;
+ if ((ret = __bam_c_first(dbc)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH:
+ /*
+ * There are two ways to get here based on DBcursor->c_get
+ * with the DB_GET_BOTH flag set:
+ *
+ * 1. Searching a sorted off-page duplicate tree: do a tree
+ * search.
+ *
+ * 2. Searching btree: do a tree search. If it returns a
+ * reference to off-page duplicate tree, return immediately
+ * and let our caller deal with it. If the search doesn't
+ * return a reference to off-page duplicate tree, start an
+ * on-page search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bam_c_search(
+ dbc, data, DB_GET_BOTH, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ } else {
+ if ((ret = __bam_c_search(
+ dbc, key, DB_GET_BOTH, &exact)) != 0)
+ return (ret);
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop)) {
+ newopd = 1;
+ break;
+ }
+ if ((ret = __bam_getboth_finddatum(dbc, data)) != 0)
+ goto err;
+ }
+ break;
+ case DB_GET_BOTHC:
+ if ((ret = __bam_getbothc(dbc, data)) != 0)
+ goto err;
+ break;
+ case DB_LAST:
+ newopd = 1;
+ if ((ret = __bam_c_last(dbc)) != 0)
+ goto err;
+ break;
+ case DB_NEXT:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bam_c_first(dbc)) != 0)
+ goto err;
+ } else
+ if ((ret = __bam_c_next(dbc, 1)) != 0)
+ goto err;
+ break;
+ case DB_NEXT_DUP:
+ if ((ret = __bam_c_next(dbc, 1)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_NEXT_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bam_c_first(dbc)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bam_c_next(dbc, 1)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_PREV:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bam_c_last(dbc)) != 0)
+ goto err;
+ } else
+ if ((ret = __bam_c_prev(dbc)) != 0)
+ goto err;
+ break;
+ case DB_PREV_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bam_c_last(dbc)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bam_c_prev(dbc)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_SET:
+ case DB_SET_RECNO:
+ newopd = 1;
+ if ((ret = __bam_c_search(dbc, key, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_SET_RANGE:
+ newopd = 1;
+ if ((ret = __bam_c_search(dbc, key, flags, &exact)) != 0)
+ goto err;
+
+ /*
+ * As we didn't require an exact match, the search function
+ * may have returned an entry past the end of the page. Or,
+ * we may be referencing a deleted record. If so, move to
+ * the next entry.
+ */
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc))
+ if ((ret = __bam_c_next(dbc, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->dbenv, "__bam_c_get", flags);
+ goto err;
+ }
+
+ /*
+ * We may have moved to an off-page duplicate tree. Return that
+ * information to our caller.
+ */
+ if (newopd && pgnop != NULL)
+ (void)__bam_isopd(dbc, pgnop);
+
+ /* Don't return the key, it was passed to us. */
+ if (flags == DB_SET)
+ F_SET(key, DB_DBT_ISSET);
+
+err: /*
+ * Regardless of whether we were successful or not, if the cursor
+ * moved, clear the delete flag; DBcursor->c_get never references
+ * a deleted key, if it moved at all.
+ */
+ if (F_ISSET(cp, C_DELETED) &&
+ (cp->pgno != orig_pgno || cp->indx != orig_indx))
+ F_CLR(cp, C_DELETED);
+
+ return (ret);
+}
+
+/*
+ * __bam_getbothc --
+ * Search for a matching data item on a join.
+ */
+static int
+__bam_getbothc(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int cmp, exact, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+ * write lock, but upgrading to a write lock has no better
+ * chance of succeeding now than later, so don't try.
+ */
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * An off-page duplicate cursor. Search the remaining duplicates
+ * for one which matches (do a normal btree search, then verify
+ * that the retrieved record is greater than the original one).
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * Check to make sure the desired item comes strictly after
+ * the current position; if it doesn't, return DB_NOTFOUND.
+ */
+ if ((ret = __bam_cmp(dbp, data, cp->page, cp->indx,
+ dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare,
+ &cmp)) != 0)
+ return (ret);
+
+ if (cmp <= 0)
+ return (DB_NOTFOUND);
+
+ /* Discard the current page, we're going to do a full search. */
+ if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0)
+ return (ret);
+ cp->page = NULL;
+
+ return (__bam_c_search(dbc, data, DB_GET_BOTH, &exact));
+ }
+
+ /*
+ * We're doing a DBC->c_get(DB_GET_BOTHC) and we're already searching
+ * a set of on-page duplicates (either sorted or unsorted). Continue
+ * a linear search from after the current position.
+ *
+ * (Note that we could have just finished a "set" of one duplicate,
+ * i.e. not a duplicate at all, but the following check will always
+ * return DB_NOTFOUND in this case, which is the desired behavior.)
+ */
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ return (DB_NOTFOUND);
+ cp->indx += P_INDX;
+
+ return (__bam_getboth_finddatum(dbc, data));
+}
+
+/*
+ * __bam_getboth_finddatum --
+ * Find a matching on-page data item.
+ */
+static int
+__bam_getboth_finddatum(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t base, lim, top;
+ int cmp, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Called (sometimes indirectly) from DBC->get to search on-page data
+ * item(s) for a matching value. If the original flag was DB_GET_BOTH,
+ * the cursor argument is set to the first data item for the key. If
+ * the original flag was DB_GET_BOTHC, the cursor argument is set to
+ * the first data item that we can potentially return. In both cases,
+ * there may or may not be additional duplicate data items to search.
+ *
+ * If the duplicates are not sorted, do a linear search.
+ *
+ * If the duplicates are sorted, do a binary search. The reason for
+ * this is that large pages and small key/data pairs result in large
+ * numbers of on-page duplicates before they get pushed off-page.
+ */
+ if (dbp->dup_compare == NULL) {
+ for (;; cp->indx += P_INDX) {
+ if (!IS_CUR_DELETED(dbc) &&
+ (ret = __bam_cmp(dbp, data, cp->page,
+ cp->indx + O_INDX, __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ return (0);
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ break;
+ }
+ } else {
+ /*
+ * Find the top and bottom of the duplicate set. Binary search
+ * requires at least two items, don't loop if there's only one.
+ */
+ for (base = top = cp->indx;
+ top < NUM_ENT(cp->page); top += P_INDX)
+ if (!IS_DUPLICATE(dbc, cp->indx, top))
+ break;
+ if (base == (top - P_INDX)) {
+ if ((ret = __bam_cmp(dbp, data,
+ cp->page, cp->indx + O_INDX,
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ return (cmp == 0 ? 0 : DB_NOTFOUND);
+ }
+
+ for (lim =
+ (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) {
+ cp->indx = base + ((lim >> 1) * P_INDX);
+ if ((ret = __bam_cmp(dbp, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0) {
+ if (!IS_CUR_DELETED(dbc))
+ return (0);
+ break;
+ }
+ if (cmp > 0) {
+ base = cp->indx + P_INDX;
+ --lim;
+ }
+ }
+ }
+ return (DB_NOTFOUND);
+}
+
+/*
+ * __bam_c_put --
+ * Put using a cursor.
+ */
+static int
+__bam_c_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ u_int32_t iiop;
+ int cmp, exact, needkey, ret, stack;
+ void *arg;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+split: needkey = ret = stack = 0;
+ switch (flags) {
+ case DB_AFTER:
+ case DB_BEFORE:
+ case DB_CURRENT:
+ needkey = 1;
+ iiop = flags;
+
+ /*
+ * If the Btree has record numbers (and we're not replacing an
+ * existing record), we need a complete stack so that we can
+ * adjust the record counts. The check for flags == DB_CURRENT
+ * is superfluous but left in for clarity. (If C_RECNUM is set
+ * we know that flags must be DB_CURRENT, as DB_AFTER/DB_BEFORE
+ * are illegal in a Btree unless it's configured for duplicates
+ * and you cannot configure a Btree for both record renumbering
+ * and duplicates.)
+ */
+ if (flags == DB_CURRENT &&
+ F_ISSET(cp, C_RECNUM) && F_ISSET(cp, C_DELETED)) {
+ if ((ret = __bam_c_getstack(dbc)) != 0)
+ goto err;
+ /*
+ * Initialize the cursor from the stack. Don't take
+ * the page number or page index, they should already
+ * be set.
+ */
+ cp->page = cp->csp->page;
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ stack = 1;
+ break;
+ }
+
+ /* Acquire the current page with a write lock. */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ if (ret != 0)
+ goto err;
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ /*
+ * Searching off-page, sorted duplicate tree: do a tree search
+ * for the correct item; __bam_c_search returns the smallest
+		 * slot greater than the key, so use it.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret =
+ __bam_c_search(dbc, data, flags, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (exact) {
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Searching a btree. */
+ if ((ret = __bam_c_search(dbc, key,
+ flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
+ DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /*
+ * If we don't have an exact match, __bam_c_search returned
+		 * the smallest slot greater than the key, so use it.
+ */
+ if (!exact) {
+ iiop = DB_KEYFIRST;
+ break;
+ }
+
+ /*
+ * If duplicates aren't supported, replace the current item.
+ * (If implementing the DB->put function, our caller already
+ * checked the DB_NOOVERWRITE flag.)
+ */
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+
+ /*
+ * If we find a matching entry, it may be an off-page duplicate
+ * tree. Return the page number to our caller, we need a new
+ * cursor.
+ */
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ goto done;
+
+ /* If the duplicates aren't sorted, move to the right slot. */
+ if (dbp->dup_compare == NULL) {
+ if (flags == DB_KEYFIRST)
+ iiop = DB_BEFORE;
+ else
+ for (;; cp->indx += P_INDX)
+ if (cp->indx + P_INDX >=
+ NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx,
+ cp->indx + P_INDX)) {
+ iiop = DB_AFTER;
+ break;
+ }
+ break;
+ }
+
+ /*
+ * We know that we're looking at the first of a set of sorted
+ * on-page duplicates. Walk the list to find the right slot.
+ */
+ for (;; cp->indx += P_INDX) {
+ if ((ret = __bam_cmp(dbp, data, cp->page,
+			cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp < 0) {
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (cmp == 0) {
+ if (IS_DELETED(cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+
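+			/*
+			 * As in IS_DUPLICATE: duplicate keys on a leaf
+			 * page share a single in-page key item, so equal
+			 * inp[] offsets mark the extent of the dup set.
+			 */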
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ ((PAGE *)cp->page)->inp[cp->indx] !=
+ ((PAGE *)cp->page)->inp[cp->indx + P_INDX]) {
+ iiop = DB_AFTER;
+ break;
+ }
+ }
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->dbenv, "__bam_c_put", flags);
+ goto err;
+ }
+
+ switch (ret = __bam_iitem(dbc, key, data, iiop, 0)) {
+ case 0:
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * To split, we need a key for the page. Either use the key
+ * argument or get a copy of the key from the page.
+ */
+ if (flags == DB_AFTER ||
+ flags == DB_BEFORE || flags == DB_CURRENT) {
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbp, cp->page, 0, &dbt,
+ &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
+ goto err;
+ arg = &dbt;
+ } else
+ arg = F_ISSET(dbc, DBC_OPD) ? data : key;
+
+ /*
+ * Discard any locks and pinned pages (the locks are discarded
+ * even if we're running with transactions, as they lock pages
+ * that we're sorry we ever acquired). If stack is set and the
+ * cursor entries are valid, they point to the same entries as
+ * the stack, don't free them twice.
+ */
+ if (stack)
+ ret = __bam_stkrel(dbc, STK_CLRDBC | STK_NOLOCK);
+ else
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Split the tree. */
+ if ((ret = __bam_split(dbc, arg)) != 0)
+ return (ret);
+
+ goto split;
+ default:
+ goto err;
+ }
+
+err:
+done: /*
+ * Discard any pages pinned in the tree and their locks, except for
+ * the leaf page. Note, the leaf page participated in any stack we
+ * acquired, and so we have to adjust the stack as necessary. If
+ * there was only a single page on the stack, we don't have to free
+ * further stack pages.
+ */
+ if (stack && BT_STK_POP(cp) != NULL)
+ (void)__bam_stkrel(dbc, 0);
+
+ /*
+ * Regardless of whether we were successful or not, clear the delete
+ * flag. If we're successful, we either moved the cursor or the item
+ * is no longer deleted. If we're not successful, then we're just a
+ * copy, no need to have the flag set.
+ */
+ F_CLR(cp, C_DELETED);
+
+ return (ret);
+}
+
+/*
+ * __bam_c_rget --
+ * Return the record number for a cursor.
+ *
+ * PUBLIC: int __bam_c_rget __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__bam_c_rget(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ db_recno_t recno;
+ int exact, ret;
+
+ COMPQUIET(flags, 0);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it.
+ * Get a copy of the key.
+ * Release the page, making sure we don't release it twice.
+ */
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0)
+ return (ret);
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbp, cp->page,
+ cp->indx, &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
+ goto err;
+ ret = memp_fput(dbp->mpf, cp->page, 0);
+ cp->page = NULL;
+ if (ret != 0)
+ return (ret);
+
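+	/*
+	 * Search the tree for the key; given a non-NULL recno argument,
+	 * __bam_search counts the records preceding the key as it
+	 * descends, which is the record number we want. (This path
+	 * requires a tree that maintains record counts.)
+	 */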
+ if ((ret = __bam_search(dbc, &dbt,
+ F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
+ 1, &recno, &exact)) != 0)
+ goto err;
+
+ ret = __db_retcopy(dbp, data,
+ &recno, sizeof(recno), &dbc->rdata.data, &dbc->rdata.ulen);
+
+ /* Release the stack. */
+err: __bam_stkrel(dbc, 0);
+
+ return (ret);
+}
+
+/*
+ * __bam_c_writelock --
+ * Upgrade the cursor to a write lock.
+ */
+static int
+__bam_c_writelock(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (cp->lock_mode == DB_LOCK_WRITE)
+ return (0);
+
+ /*
+ * When writing to an off-page duplicate tree, we need to have the
+ * appropriate page in the primary tree locked. The general DBC
+ * code calls us first with the primary cursor so we can acquire the
+ * appropriate lock.
+ */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ return (ret);
+}
+
+/*
+ * __bam_c_first --
+ * Return the first record.
+ */
+static int
+__bam_c_first(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /* Walk down the left-hand side of the tree. */
+ for (pgno = cp->root;;) {
+ ACQUIRE_CUR_SET(dbc, DB_LOCK_READ, pgno, ret);
+ if (ret != 0)
+ return (ret);
+
+ /* If we find a leaf page, we're done. */
+ if (ISLEAF(cp->page))
+ break;
+
+ pgno = GET_BINTERNAL(cp->page, 0)->pgno;
+ }
+
+ /* If we want a write lock instead of a read lock, get it now. */
+ if (F_ISSET(dbc, DBC_RMW)) {
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ /* If on an empty page or a deleted record, move to the next one. */
+ if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc))
+ if ((ret = __bam_c_next(dbc, 0)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_c_last --
+ * Return the last record.
+ */
+static int
+__bam_c_last(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /* Walk down the right-hand side of the tree. */
+ for (pgno = cp->root;;) {
+ ACQUIRE_CUR_SET(dbc, DB_LOCK_READ, pgno, ret);
+ if (ret != 0)
+ return (ret);
+
+ /* If we find a leaf page, we're done. */
+ if (ISLEAF(cp->page))
+ break;
+
+ pgno =
+ GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno;
+ }
+
+ /* If we want a write lock instead of a read lock, get it now. */
+ if (F_ISSET(dbc, DBC_RMW)) {
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
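+	/*
+	 * Leaf btree pages hold key/data pairs, so the last key is two
+	 * slots from the end; duplicate and recno leaf pages use a single
+	 * slot per item.
+	 */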
+ cp->indx = NUM_ENT(cp->page) == 0 ? 0 :
+ NUM_ENT(cp->page) -
+ (TYPE(cp->page) == P_LBTREE ? P_INDX : O_INDX);
+
+ /* If on an empty page or a deleted record, move to the previous one. */
+ if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc))
+ if ((ret = __bam_c_prev(dbc)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_c_next --
+ * Move to the next record.
+ */
+static int
+__bam_c_next(dbc, initial_move)
+ DBC *dbc;
+ int initial_move;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
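+	/*
+	 * An off-page duplicate tree is protected by the lock already
+	 * held on the associated primary-tree page, so an OPD cursor
+	 * acquires no page locks of its own (DB_LOCK_NG).
+	 */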
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (initial_move)
+ cp->indx += adjust;
+
+ for (;;) {
+ /*
+ * If at the end of the page, move to a subsequent page.
+ *
+ * !!!
+ * Check for >= NUM_ENT. If the original search landed us on
+ * NUM_ENT, we may have incremented indx before the test.
+ */
+ if (cp->indx >= NUM_ENT(cp->page)) {
+			if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR_SET(dbc, lock_mode, pgno, ret);
+ if (ret != 0)
+ return (ret);
+ continue;
+ }
+ if (IS_CUR_DELETED(dbc)) {
+ cp->indx += adjust;
+ continue;
+ }
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bam_c_prev --
+ * Move to the previous record.
+ */
+static int
+__bam_c_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ for (;;) {
+ /* If at the beginning of the page, move to a previous one. */
+ if (cp->indx == 0) {
+			if ((pgno = PREV_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR_SET(dbc, lock_mode, pgno, ret);
+ if (ret != 0)
+ return (ret);
+
+ if ((cp->indx = NUM_ENT(cp->page)) == 0)
+ continue;
+ }
+
+ /* Ignore deleted records. */
+ cp->indx -= adjust;
+ if (IS_CUR_DELETED(dbc))
+ continue;
+
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bam_c_search --
+ * Move to a specified record.
+ */
+static int
+__bam_c_search(dbc, key, flags, exactp)
+ DBC *dbc;
+ const DBT *key;
+ u_int32_t flags;
+ int *exactp;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ PAGE *h;
+ db_indx_t indx;
+ db_pgno_t bt_lpgno;
+ db_recno_t recno;
+ u_int32_t sflags;
+ int cmp, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ret = 0;
+
+ /*
+ * Find an entry in the database. Discard any lock we currently hold,
+ * we're going to search the tree.
+ */
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ return (ret);
+
+ switch (flags) {
+ case DB_SET_RECNO:
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ return (ret);
+ sflags = (F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND) | S_EXACT;
+ if ((ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp)) != 0)
+ return (ret);
+ break;
+ case DB_SET:
+ case DB_GET_BOTH:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND) | S_EXACT;
+ goto search;
+ case DB_SET_RANGE:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? S_WRITE : S_READ) | S_DUPFIRST;
+ goto search;
+ case DB_KEYFIRST:
+ sflags = S_KEYFIRST;
+ goto fast_search;
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ sflags = S_KEYLAST;
+fast_search: /*
+ * If the application has a history of inserting into the first
+ * or last pages of the database, we check those pages first to
+ * avoid doing a full search.
+ *
+ * If the tree has record numbers, we need a complete stack so
+ * that we can adjust the record counts, so fast_search isn't
+ * possible.
+ */
+ if (F_ISSET(cp, C_RECNUM))
+ goto search;
+
+ /*
+ * !!!
+ * We do not mutex protect the t->bt_lpgno field, which means
+	 * that it can only be used in an advisory manner. If we find a
+	 * page we can use, great. If we don't, we don't care; we do it
+	 * the slow way instead. Regardless, copy it into a local
+ * variable, otherwise we might acquire a lock for a page and
+ * then read a different page because it changed underfoot.
+ */
+ bt_lpgno = t->bt_lpgno;
+
+ /*
+ * If the tree has no history of insertion, do it the slow way.
+ */
+ if (bt_lpgno == PGNO_INVALID)
+ goto search;
+
+ /* Lock and retrieve the page on which we last inserted. */
+ h = NULL;
+ ACQUIRE(dbc,
+ DB_LOCK_WRITE, bt_lpgno, cp->lock, bt_lpgno, h, ret);
+ if (ret != 0)
+ goto fast_miss;
+
+ /*
+ * It's okay if the page type isn't right or it's empty, it
+ * just means that the world changed.
+ */
+ if (TYPE(h) != P_LBTREE || NUM_ENT(h) == 0)
+ goto fast_miss;
+
+ /*
+ * What we do here is test to see if we're at the beginning or
+ * end of the tree and if the new item sorts before/after the
+ * first/last page entry. We don't try and catch inserts into
+ * the middle of the tree (although we could, as long as there
+ * were two keys on the page and we saved both the index and
+ * the page number of the last insert).
+ */
+ if (h->next_pgno == PGNO_INVALID) {
+ indx = NUM_ENT(h) - P_INDX;
+ if ((ret = __bam_cmp(dbp,
+ key, h, indx, t->bt_compare, &cmp)) != 0)
+ return (ret);
+
+ if (cmp < 0)
+ goto try_begin;
+ if (cmp > 0) {
+ indx += P_INDX;
+ goto fast_hit;
+ }
+
+ /*
+ * Found a duplicate. If doing DB_KEYLAST, we're at
+ * the correct position, otherwise, move to the first
+ * of the duplicates. If we're looking at off-page
+ * duplicates, duplicate duplicates aren't permitted,
+ * so we're done.
+ */
+ if (flags == DB_KEYLAST)
+ goto fast_hit;
+ for (;
+ indx > 0 && h->inp[indx - P_INDX] == h->inp[indx];
+ indx -= P_INDX)
+ ;
+ goto fast_hit;
+ }
+try_begin: if (h->prev_pgno == PGNO_INVALID) {
+ indx = 0;
+ if ((ret = __bam_cmp(dbp,
+ key, h, indx, t->bt_compare, &cmp)) != 0)
+ return (ret);
+
+ if (cmp > 0)
+ goto fast_miss;
+ if (cmp < 0)
+ goto fast_hit;
+
+ /*
+ * Found a duplicate. If doing DB_KEYFIRST, we're at
+ * the correct position, otherwise, move to the last
+ * of the duplicates. If we're looking at off-page
+ * duplicates, duplicate duplicates aren't permitted,
+ * so we're done.
+ */
+ if (flags == DB_KEYFIRST)
+ goto fast_hit;
+ for (;
+ indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ h->inp[indx] == h->inp[indx + P_INDX];
+ indx += P_INDX)
+ ;
+ goto fast_hit;
+ }
+ goto fast_miss;
+
+fast_hit: /* Set the exact match flag, we may have found a duplicate. */
+ *exactp = cmp == 0;
+
+ /*
+ * Insert the entry in the stack. (Our caller is likely to
+ * call __bam_stkrel() after our return.)
+ */
+ BT_STK_CLR(cp);
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, indx, cp->lock, cp->lock_mode, ret);
+ if (ret != 0)
+ return (ret);
+ break;
+
+fast_miss: /*
+ * This was not the right page, so we do not need to retain
+ * the lock even in the presence of transactions.
+ */
+ DISCARD(dbc, 1, cp->lock, h, ret);
+ if (ret != 0)
+ return (ret);
+
+search: if ((ret =
+ __bam_search(dbc, key, sflags, 1, NULL, exactp)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_unknown_flag(dbp->dbenv, "__bam_c_search", flags));
+ }
+
+ /* Initialize the cursor from the stack. */
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ /*
+ * If we inserted a key into the first or last slot of the tree,
+ * remember where it was so we can do it more quickly next time.
+ */
+ if (TYPE(cp->page) == P_LBTREE &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST))
+ t->bt_lpgno =
+ (NEXT_PGNO(cp->page) == PGNO_INVALID &&
+ cp->indx >= NUM_ENT(cp->page)) ||
+ (PREV_PGNO(cp->page) == PGNO_INVALID &&
+ cp->indx == 0) ? cp->pgno : PGNO_INVALID;
+ return (0);
+}
+
+/*
+ * __bam_c_physdel --
+ * Physically remove an item from the page.
+ */
+static int
+__bam_c_physdel(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT key;
+ DB_LOCK lock;
+ PAGE *h;
+ db_pgno_t pgno;
+ int delete_page, empty_page, exact, level, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ delete_page = empty_page = ret = 0;
+
+ /* If the page is going to be emptied, consider deleting it. */
+ delete_page = empty_page =
+ NUM_ENT(cp->page) == (TYPE(cp->page) == P_LBTREE ? 2 : 1);
+
+ /*
+ * Check if the application turned off reverse splits. Applications
+	 * can't turn off reverse splits in off-page duplicate trees; that
+ * space will never be reused unless the exact same key is specified.
+ */
+ if (delete_page &&
+ !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_BT_REVSPLIT))
+ delete_page = 0;
+
+ /*
+ * We never delete the last leaf page. (Not really true -- we delete
+ * the last leaf page of off-page duplicate trees, but that's handled
+ * by our caller, not down here.)
+ */
+ if (delete_page && cp->pgno == cp->root)
+ delete_page = 0;
+
+ /*
+ * To delete a leaf page other than an empty root page, we need a
+ * copy of a key from the page. Use the 0th page index since it's
+ * the last key the page held.
+ */
+ if (delete_page) {
+ memset(&key, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbp, cp->page,
+ 0, &key, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
+ return (ret);
+ }
+
+ /*
+	 * Delete the items. If the page isn't empty, we adjust the cursors.
+ *
+ * !!!
+ * The following operations to delete a page may deadlock. The easy
+ * scenario is if we're deleting an item because we're closing cursors
+ * because we've already deadlocked and want to call txn_abort(). If
+ * we fail due to deadlock, we'll leave a locked, possibly empty page
+ * in the tree, which won't be empty long because we'll undo the delete
+ * when we undo the transaction's modifications.
+ *
+ * !!!
+ * Delete the key item first, otherwise the on-page duplicate checks
+ * in __bam_ditem() won't work!
+ */
+ if (TYPE(cp->page) == P_LBTREE) {
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ return (ret);
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc,
+ PGNO(cp->page), cp->indx, -1)) != 0)
+ return (ret);
+ }
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ return (ret);
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc, PGNO(cp->page), cp->indx, -1)) != 0)
+ return (ret);
+
+ /* If we're not going to try and delete the page, we're done. */
+ if (!delete_page)
+ return (0);
+
+ /*
+ * Call __bam_search to reacquire the empty leaf page, but this time
+	 * get both the leaf page and its parent, locked. Jump back up the
+	 * tree until we have the top pair of pages that we want to delete.
+ * Once we have the top page that we want to delete locked, lock the
+ * underlying pages and check to make sure they're still empty. If
+ * they are, delete them.
+ */
+ for (level = LEAFLEVEL;; ++level) {
+ /* Acquire a page and its parent, locked. */
+ if ((ret = __bam_search(
+ dbc, &key, S_WRPAIR, level, NULL, &exact)) != 0)
+ return (ret);
+
+ /*
+ * If we reach the root or the parent page isn't going to be
+ * empty when we delete one record, stop.
+ */
+ h = cp->csp[-1].page;
+ if (h->pgno == cp->root || NUM_ENT(h) != 1)
+ break;
+
+ /* Discard the stack, retaining no locks. */
+ (void)__bam_stkrel(dbc, STK_NOLOCK);
+ }
+
+ /*
+ * Move the stack pointer one after the last entry, we may be about
+ * to push more items onto the page stack.
+ */
+ ++cp->csp;
+
+ /*
+ * cp->csp[-2].page is now the parent page, which we may or may not be
+ * going to delete, and cp->csp[-1].page is the first page we know we
+ * are going to delete. Walk down the chain of pages, acquiring pages
+ * until we've acquired a leaf page. Generally, this shouldn't happen;
+ * we should only see a single internal page with one item and a single
+ * leaf page with no items. The scenario where we could see something
+ * else is if reverse splits were turned off for awhile and then turned
+ * back on. That could result in all sorts of strangeness, e.g., empty
+ * pages in the tree, trees that looked like linked lists, and so on.
+ *
+ * !!!
+ * Sheer paranoia: if we find any pages that aren't going to be emptied
+ * by the delete, someone else added an item while we were walking the
+ * tree, and we discontinue the delete. Shouldn't be possible, but we
+ * check regardless.
+ */
+ for (h = cp->csp[-1].page;;) {
+		/* A leaf page ends the walk, empty or not. */
+		if (ISLEAF(h))
+			break;
+		if (NUM_ENT(h) != 1)
+			break;
+
+ /*
+ * Get the next page, write lock it and push it onto the stack.
+ * We know it's index 0, because it can only have one element.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ pgno = GET_BINTERNAL(h, 0)->pgno;
+ break;
+ case P_IRECNO:
+ pgno = GET_RINTERNAL(h, 0)->pgno;
+ break;
+ default:
+ return (__db_pgfmt(dbp, PGNO(h)));
+ }
+
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+ break;
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ break;
+ BT_STK_PUSH(dbp->dbenv, cp, h, 0, lock, DB_LOCK_WRITE, ret);
+ if (ret != 0)
+ break;
+ }
+
+ /* Adjust the cursor stack to reference the last page on the stack. */
+ BT_STK_POP(cp);
+
+ /*
+ * If everything worked, delete the stack, otherwise, release the
+ * stack and page locks without further damage.
+ */
+ if (ret == 0)
+ ret = __bam_dpages(dbc, cp->sp);
+ else
+ (void)__bam_stkrel(dbc, 0);
+
+ return (ret);
+}
+
+/*
+ * __bam_c_getstack --
+ * Acquire a full stack for a cursor.
+ */
+static int
+__bam_c_getstack(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ PAGE *h;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it. The caller of this
+ * routine has to already hold a read lock on the page, so there
+ * is no additional lock to acquire.
+ */
+ if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0)
+ return (ret);
+
+ /* Get a copy of a key from the page. */
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbp,
+ h, 0, &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
+ goto err;
+
+ /* Get a write-locked stack for the page. */
+ exact = 0;
+ ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact);
+
+err: /* Discard the key and the page. */
+ if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_isopd --
+ * Return if the cursor references an off-page duplicate tree via its
+ * page number.
+ */
+static int
+__bam_isopd(dbc, pgnop)
+ DBC *dbc;
+ db_pgno_t *pgnop;
+{
+ BOVERFLOW *bo;
+
+ if (TYPE(dbc->internal->page) != P_LBTREE)
+ return (0);
+
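+	/*
+	 * The index references the key of a key/data pair; the item at
+	 * indx + O_INDX is its data, which is a B_DUPLICATE entry when an
+	 * off-page duplicate tree hangs off this key.
+	 */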
+ bo = GET_BOVERFLOW(dbc->internal->page, dbc->internal->indx + O_INDX);
+ if (B_TYPE(bo->type) == B_DUPLICATE) {
+ *pgnop = bo->pgno;
+ return (1);
+ }
+ return (0);
+}
diff --git a/db/btree/bt_delete.c b/db/btree/bt_delete.c
new file mode 100644
index 000000000..972588788
--- /dev/null
+++ b/db/btree/bt_delete.c
@@ -0,0 +1,530 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_delete.c,v 11.31 2001/01/17 18:48:46 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "btree.h"
+#include "lock.h"
+
+/*
+ * __bam_delete --
+ * Delete the items referenced by a key.
+ *
+ * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__bam_delete(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ DBT lkey;
+ DBT data;
+ u_int32_t f_init, f_next;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
+ DB_CHECK_TXN(dbp, txn);
+
+ /* Check for invalid flags. */
+ if ((ret =
+ __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
+ return (ret);
+
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags);
+
+ /*
+ * Walk a cursor through the key/data pairs, deleting as we go. Set
+ * the DB_DBT_USERMEM flag, as this might be a threaded application
+ * and the flags checking will catch us. We don't actually want the
+ * keys or data, so request a partial of length 0.
+ */
+ memset(&lkey, 0, sizeof(lkey));
+ F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+ /*
+ * If locking (and we haven't already acquired CDB locks), set the
+ * read-modify-write flag.
+ */
+ f_init = DB_SET;
+ f_next = DB_NEXT_DUP;
+ if (STD_LOCKING(dbc)) {
+ f_init |= DB_RMW;
+ f_next |= DB_RMW;
+ }
+
+ /* Walk through the set of key/data pairs, deleting as we go. */
+ if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0)
+ goto err;
+ for (;;) {
+ if ((ret = dbc->c_del(dbc, 0)) != 0)
+ goto err;
+ if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ break;
+ }
+ goto err;
+ }
+ }
+
+err: /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
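+
+/*
+ * For reference, a sketch of the application call that lands here (the
+ * key contents are illustrative):
+ *
+ *	DBT key;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = "fruit";
+ *	key.size = 5;
+ *	if ((ret = dbp->del(dbp, txn, &key, 0)) == DB_NOTFOUND)
+ *		... no item with that key ...
+ *
+ * A single call removes the key and every duplicate data item stored
+ * under it.
+ */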
+
+/*
+ * __bam_ditem --
+ * Delete one or more entries from a page.
+ *
+ * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
+ */
+int
+__bam_ditem(dbc, h, indx)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ u_int32_t nbytes;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(h, indx);
+ switch (B_TYPE(bi->type)) {
+ case B_DUPLICATE:
+ case B_KEYDATA:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ if ((ret =
+ __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp, PGNO(h)));
+ }
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ case P_LBTREE:
+ /*
+ * If it's a duplicate key, discard the index and don't touch
+ * the actual page item.
+ *
+ * !!!
+ * This works because no data item can have an index matching
+ * any other index so even if the data item is in a key "slot",
+ * it won't match any other index.
+ */
+ if ((indx % 2) == 0) {
+ /*
+ * Check for a duplicate after us on the page. NOTE:
+ * we have to delete the key item before deleting the
+ * data item, otherwise the "indx + P_INDX" calculation
+ * won't work!
+ */
+ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
+ h->inp[indx] == h->inp[indx + P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx + O_INDX, 0));
+ /*
+ * Check for a duplicate before us on the page. It
+ * doesn't matter if we delete the key item before or
+ * after the data item for the purposes of this one.
+ */
+ if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx - P_INDX, 0));
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(h, indx);
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case B_OVERFLOW:
+ nbytes = BOVERFLOW_SIZE;
+ if ((ret = __db_doff(
+ dbc, (GET_BOVERFLOW(h, indx))->pgno)) != 0)
+ return (ret);
+ break;
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(bk->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp, PGNO(h)));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp, PGNO(h)));
+ }
+
+ /* Delete the item and mark the page dirty. */
+ if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
+ return (ret);
+ if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_adjindx --
+ * Adjust an index on the page.
+ *
+ * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+ */
+int
+__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, indx_copy;
+ int is_insert;
+{
+ DB *dbp;
+ db_indx_t copy;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ /* Log the change. */
+ if (DB_LOGGING(dbc) &&
+ (ret = __bam_adj_log(dbp->dbenv, dbc->txn, &LSN(h),
+ 0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy,
+ (u_int32_t)is_insert)) != 0)
+ return (ret);
+
+ /* Shuffle the indices and mark the page dirty. */
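+	/*
+	 * Only the in-page index array (inp[]) is rearranged; the key/data
+	 * items themselves never move on the page.
+	 */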
+ if (is_insert) {
+ copy = h->inp[indx_copy];
+ if (indx != NUM_ENT(h))
+ memmove(&h->inp[indx + O_INDX], &h->inp[indx],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ h->inp[indx] = copy;
+ ++NUM_ENT(h);
+ } else {
+ --NUM_ENT(h);
+ if (indx != NUM_ENT(h))
+ memmove(&h->inp[indx], &h->inp[indx + O_INDX],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ }
+ if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
+ return (ret);
+
+ return (0);
+}
+
+/*
+ * __bam_dpages --
+ * Delete a set of locked pages.
+ *
+ * PUBLIC: int __bam_dpages __P((DBC *, EPG *));
+ */
+int
+__bam_dpages(dbc, stack_epg)
+ DBC *dbc;
+ EPG *stack_epg;
+{
+ BTREE_CURSOR *cp;
+ BINTERNAL *bi;
+ DB *dbp;
+ DBT a, b;
+ DB_LOCK c_lock, p_lock;
+ EPG *epg;
+ PAGE *child, *parent;
+ db_indx_t nitems;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int done, ret, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * We have the entire stack of deletable pages locked.
+ *
+ * Btree calls us with a pointer to the beginning of a stack, where
+ * the first page in the stack is to have a single item deleted, and
+ * the rest of the pages are to be removed.
+ *
+ * Recno calls us with a pointer into the middle of the stack, where
+ * the referenced page is to have a single item deleted, and pages
+ * after the stack reference are to be removed.
+ *
+ * First, discard any pages that we don't care about.
+ */
+ ret = 0;
+ for (epg = cp->sp; epg < stack_epg; ++epg) {
+ if ((t_ret =
+ memp_fput(dbp->mpf, epg->page, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ (void)__TLPUT(dbc, epg->lock);
+ }
+ if (ret != 0)
+ goto err;
+
+ /*
+ * !!!
+ * There is an interesting deadlock situation here. We have to relink
+ * the leaf page chain around the leaf page being deleted. Consider
+ * a cursor walking through the leaf pages, that has the previous page
+ * read-locked and is waiting on a lock for the page we're deleting.
+ * It will deadlock here. Before we unlink the subtree, we relink the
+ * leaf page chain.
+ */
+ if ((ret = __db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0)
+ goto err;
+
+ /*
+ * Delete the last item that references the underlying pages that are
+ * to be deleted, and adjust cursors that reference that page. Then,
+ * save that page's page number and item count and release it. If
+ * the application isn't retaining locks because it's running without
+ * transactions, this lets the rest of the tree get back to business
+ * immediately.
+ */
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto err;
+ if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
+ goto err;
+
+ pgno = PGNO(epg->page);
+ nitems = NUM_ENT(epg->page);
+
+ if ((ret = memp_fput(dbp->mpf, epg->page, 0)) != 0)
+ goto err_inc;
+ (void)__TLPUT(dbc, epg->lock);
+
+ /* Free the rest of the pages in the stack. */
+ while (++epg <= cp->csp) {
+ /*
+ * Delete page entries so they will be restored as part of
+ * recovery. We don't need to do cursor adjustment here as
+ * the pages are being emptied by definition and so cannot
+ * be referenced by a cursor.
+ */
+ if (NUM_ENT(epg->page) != 0) {
+ DB_ASSERT(NUM_ENT(epg->page) == 1);
+
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_free(dbc, epg->page)) != 0) {
+ epg->page = NULL;
+ goto err_inc;
+ }
+ (void)__TLPUT(dbc, epg->lock);
+ }
+
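+	/*
+	 * The if (0) block below is entered only through the err and
+	 * err_inc labels; successful control flow jumps past it.
+	 */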
+ if (0) {
+err_inc: ++epg;
+err: for (; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL)
+ (void)memp_fput(dbp->mpf, epg->page, 0);
+ (void)__TLPUT(dbc, epg->lock);
+ }
+ BT_STK_CLR(cp);
+ return (ret);
+ }
+ BT_STK_CLR(cp);
+
+ /*
+ * If we just deleted the next-to-last item from the root page, the
+ * tree can collapse one or more levels. While there remains only a
+ * single item on the root page, write lock the last page referenced
+ * by the root page and copy it over the root page.
+ */
+ root_pgno = cp->root;
+ if (pgno != root_pgno || nitems != 1)
+ return (0);
+
+ for (done = 0; !done;) {
+ /* Initialize. */
+ parent = child = NULL;
+ p_lock.off = c_lock.off = LOCK_INVALID;
+
+ /* Lock the root. */
+ pgno = root_pgno;
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0)
+ goto stop;
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0)
+ goto stop;
+
+ if (NUM_ENT(parent) != 1)
+ goto stop;
+
+ switch (TYPE(parent)) {
+ case P_IBTREE:
+ /*
+ * If this is overflow, then try to delete it.
+ * The child may or may not still point at it.
+ */
+ bi = GET_BINTERNAL(parent, 0);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ if ((ret = __db_doff(dbc,
+ ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ goto stop;
+ pgno = bi->pgno;
+ break;
+ case P_IRECNO:
+ pgno = GET_RINTERNAL(parent, 0)->pgno;
+ break;
+ default:
+ goto stop;
+ }
+
+ /* Lock the child page. */
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
+ goto stop;
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &child)) != 0)
+ goto stop;
+
+ /* Log the change. */
+ if (DB_LOGGING(dbc)) {
+ memset(&a, 0, sizeof(a));
+ a.data = child;
+ a.size = dbp->pgsize;
+ memset(&b, 0, sizeof(b));
+ b.data = P_ENTRY(parent, 0);
+ b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
+ BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
+ if ((ret =
+ __bam_rsplit_log(dbp->dbenv, dbc->txn, &child->lsn,
+ 0, dbp->log_fileid, PGNO(child), &a, PGNO(parent),
+ RE_NREC(parent), &b, &parent->lsn)) != 0)
+ goto stop;
+ }
+
+ /*
+ * Make the switch.
+ *
+ * One fixup -- internal pages below the top level do not store
+ * a record count, so we have to preserve it if we're not
+ * converting to a leaf page. Note also that we are about to
+ * overwrite the parent page, including its LSN. This is OK
+ * because the log message we wrote describing this update
+ * stores its LSN on the child page. When the child is copied
+ * onto the parent, the correct LSN is copied into place.
+ */
+ COMPQUIET(rcnt, 0);
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ rcnt = RE_NREC(parent);
+ memcpy(parent, child, dbp->pgsize);
+ PGNO(parent) = root_pgno;
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ RE_NREC_SET(parent, rcnt);
+
+ /* Mark the pages dirty. */
+ if ((ret = memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY)) != 0)
+ goto stop;
+ if ((ret = memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY)) != 0)
+ goto stop;
+
+ /* Adjust the cursors. */
+ if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
+ goto stop;
+
+ /*
+ * Free the page copied onto the root page and discard its
+ * lock. (The call to __db_free() discards our reference
+ * to the page.)
+ */
+ if ((ret = __db_free(dbc, child)) != 0) {
+ child = NULL;
+ goto stop;
+ }
+ child = NULL;
+
+ if (0) {
+stop: done = 1;
+ }
+ if (p_lock.off != LOCK_INVALID)
+ (void)__TLPUT(dbc, p_lock);
+ if (parent != NULL &&
+ (t_ret = memp_fput(dbp->mpf, parent, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (c_lock.off != LOCK_INVALID)
+ (void)__TLPUT(dbc, c_lock);
+ if (child != NULL &&
+ (t_ret = memp_fput(dbp->mpf, child, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
diff --git a/db/btree/bt_method.c b/db/btree/bt_method.c
new file mode 100644
index 000000000..5e3af27d0
--- /dev/null
+++ b/db/btree/bt_method.c
@@ -0,0 +1,387 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_method.c,v 11.20 2000/11/30 00:58:28 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "qam.h"
+
+static int __bam_set_bt_compare
+ __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+static int __bam_set_bt_maxkey __P((DB *, u_int32_t));
+static int __bam_set_bt_minkey __P((DB *, u_int32_t));
+static int __bam_set_bt_prefix
+ __P((DB *, size_t(*)(DB *, const DBT *, const DBT *)));
+static int __ram_set_re_delim __P((DB *, int));
+static int __ram_set_re_len __P((DB *, u_int32_t));
+static int __ram_set_re_pad __P((DB *, int));
+static int __ram_set_re_source __P((DB *, const char *));
+
+/*
+ * __bam_db_create --
+ * Btree specific initialization of the DB structure.
+ *
+ * PUBLIC: int __bam_db_create __P((DB *));
+ */
+int
+__bam_db_create(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ int ret;
+
+ /* Allocate and initialize the private btree structure. */
+ if ((ret = __os_calloc(dbp->dbenv, 1, sizeof(BTREE), &t)) != 0)
+ return (ret);
+ dbp->bt_internal = t;
+
+ t->bt_minkey = DEFMINKEYPAGE; /* Btree */
+ t->bt_compare = __bam_defcmp;
+ t->bt_prefix = __bam_defpfx;
+
+ dbp->set_bt_compare = __bam_set_bt_compare;
+ dbp->set_bt_maxkey = __bam_set_bt_maxkey;
+ dbp->set_bt_minkey = __bam_set_bt_minkey;
+ dbp->set_bt_prefix = __bam_set_bt_prefix;
+
+ t->re_pad = ' '; /* Recno */
+ t->re_delim = '\n';
+ t->re_eof = 1;
+
+ dbp->set_re_delim = __ram_set_re_delim;
+ dbp->set_re_len = __ram_set_re_len;
+ dbp->set_re_pad = __ram_set_re_pad;
+ dbp->set_re_source = __ram_set_re_source;
+
+ return (0);
+}
+
+/*
+ * __bam_db_close --
+ * Btree specific discard of the DB structure.
+ *
+ * PUBLIC: int __bam_db_close __P((DB *));
+ */
+int
+__bam_db_close(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+
+ t = dbp->bt_internal;
+ /* Recno */
+ /* Close any backing source file descriptor. */
+ if (t->re_fp != NULL)
+ (void)fclose(t->re_fp);
+
+ /* Free any backing source file name. */
+ if (t->re_source != NULL)
+ __os_freestr(t->re_source);
+
+ __os_free(t, sizeof(BTREE));
+ dbp->bt_internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_set_flags --
+ * Set Btree specific flags.
+ *
+ * PUBLIC: int __bam_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__bam_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ u_int32_t flags;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF)) {
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+
+ /*
+ * The DB_DUP and DB_DUPSORT flags are shared by the Hash
+ * and Btree access methods.
+ */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT))
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF))
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ if (LF_ISSET(DB_DUP | DB_DUPSORT)) {
+ /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */
+ if (F_ISSET(dbp, DB_BT_RECNUM))
+ goto incompat;
+
+ if (LF_ISSET(DB_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ F_SET(dbp, DB_AM_DUPSORT);
+ }
+
+ F_SET(dbp, DB_AM_DUP);
+ LF_CLR(DB_DUP | DB_DUPSORT);
+ }
+
+ if (LF_ISSET(DB_RECNUM)) {
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (F_ISSET(dbp, DB_AM_DUP))
+ goto incompat;
+
+ F_SET(dbp, DB_BT_RECNUM);
+ LF_CLR(DB_RECNUM);
+ }
+
+ if (LF_ISSET(DB_REVSPLITOFF)) {
+ F_SET(dbp, DB_BT_REVSPLIT);
+ LF_CLR(DB_REVSPLITOFF);
+ }
+
+ *flagsp = flags;
+ }
+ return (0);
+
+incompat:
+ return (__db_ferr(dbp->dbenv, "DB->set_flags", 1));
+}
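+
+/*
+ * For reference, a sketch of the application-level call that reaches
+ * this function (the flag combination is illustrative):
+ *
+ *	if ((ret = dbp->set_flags(dbp, DB_DUPSORT | DB_REVSPLITOFF)) != 0)
+ *		return (ret);
+ *
+ * Both flags are consumed here; any bits left in *flagsp are handled
+ * by the generic DB->set_flags code.
+ */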
+
+/*
+ * __bam_set_bt_compare --
+ * Set the comparison function.
+ */
+static int
+__bam_set_bt_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /*
+ * Can't default the prefix routine if the user supplies a comparison
+ * routine; shortening the keys can break their comparison algorithm.
+ */
+ t->bt_compare = func;
+ if (t->bt_prefix == __bam_defpfx)
+ t->bt_prefix = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_set_bt_maxkey --
+ * Set the maximum keys per page.
+ */
+static int
+__bam_set_bt_maxkey(dbp, bt_maxkey)
+ DB *dbp;
+ u_int32_t bt_maxkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_maxkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_maxkey < 1) {
+ __db_err(dbp->dbenv, "minimum bt_maxkey value is 1");
+ return (EINVAL);
+ }
+
+ t->bt_maxkey = bt_maxkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_minkey --
+ * Set the minimum keys per page.
+ */
+static int
+__bam_set_bt_minkey(dbp, bt_minkey)
+ DB *dbp;
+ u_int32_t bt_minkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_minkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_minkey < 2) {
+ __db_err(dbp->dbenv, "minimum bt_minkey value is 2");
+ return (EINVAL);
+ }
+
+ t->bt_minkey = bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_prefix --
+ * Set the prefix function.
+ */
+static int
+__bam_set_bt_prefix(dbp, func)
+ DB *dbp;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_prefix");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ t->bt_prefix = func;
+ return (0);
+}
+
+/*
+ * __ram_set_flags --
+ * Set Recno specific flags.
+ *
+ * PUBLIC: int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__ram_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ u_int32_t flags;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) {
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ if (LF_ISSET(DB_RENUMBER)) {
+ F_SET(dbp, DB_RE_RENUMBER);
+ LF_CLR(DB_RENUMBER);
+ }
+
+ if (LF_ISSET(DB_SNAPSHOT)) {
+ F_SET(dbp, DB_RE_SNAPSHOT);
+ LF_CLR(DB_SNAPSHOT);
+ }
+
+ *flagsp = flags;
+ }
+ return (0);
+}
+
+/*
+ * __ram_set_re_delim --
+ * Set the variable-length input record delimiter.
+ */
+static int
+__ram_set_re_delim(dbp, re_delim)
+ DB *dbp;
+ int re_delim;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_delim");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ t->re_delim = re_delim;
+ F_SET(dbp, DB_RE_DELIMITER);
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_len --
+ *	Set the fixed-length input record length.
+ */
+static int
+__ram_set_re_len(dbp, re_len)
+ DB *dbp;
+ u_int32_t re_len;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_len");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_len = re_len;
+
+ q = dbp->q_internal;
+ q->re_len = re_len;
+
+ F_SET(dbp, DB_RE_FIXEDLEN);
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_pad --
+ * Set the fixed-length record pad character.
+ */
+static int
+__ram_set_re_pad(dbp, re_pad)
+ DB *dbp;
+ int re_pad;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_pad");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_pad = re_pad;
+
+ q = dbp->q_internal;
+ q->re_pad = re_pad;
+
+ F_SET(dbp, DB_RE_PAD);
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_source --
+ * Set the backing source file name.
+ */
+static int
+__ram_set_re_source(dbp, re_source)
+ DB *dbp;
+ const char *re_source;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_source");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ return (__os_strdup(dbp->dbenv, re_source, &t->re_source));
+}
diff --git a/db/btree/bt_open.c b/db/btree/bt_open.c
new file mode 100644
index 000000000..405c1880f
--- /dev/null
+++ b/db/btree/bt_open.c
@@ -0,0 +1,468 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_open.c,v 11.42 2000/11/30 00:58:28 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_swap.h"
+#include "btree.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "log.h"
+#include "mp.h"
+
+/*
+ * __bam_open --
+ * Open a btree.
+ *
+ * PUBLIC: int __bam_open __P((DB *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_open(dbp, name, base_pgno, flags)
+ DB *dbp;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+
+ t = dbp->bt_internal;
+
+ /* Initialize the remaining fields/methods of the DB. */
+ dbp->del = __bam_delete;
+ dbp->key_range = __bam_key_range;
+ dbp->stat = __bam_stat;
+
+ /*
+ * We don't permit the user to specify a prefix routine if they didn't
+ * also specify a comparison routine, they can't know enough about our
+ * comparison routine to get it right.
+ */
+ if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) {
+ __db_err(dbp->dbenv,
+"prefix comparison may not be specified for default comparison routine");
+ return (EINVAL);
+ }
+
+ /*
+ * Verify that the bt_minkey value specified won't cause the
+ * calculation of ovflsize to underflow [#2406] for this pagesize.
+ */
+ if (B_MINKEY_TO_OVFLSIZE(t->bt_minkey, dbp->pgsize) >
+ B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) {
+ __db_err(dbp->dbenv,
+ "bt_minkey value of %lu too high for page size of %lu",
+ (u_long)t->bt_minkey, (u_long)dbp->pgsize);
+ return (EINVAL);
+ }
+
+ /* Start up the tree. */
+ return (__bam_read_root(dbp, name, base_pgno, flags));
+}
+
+/*
+ * __bam_metachk --
+ *
+ * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *));
+ */
+int
+__bam_metachk(dbp, name, btm)
+ DB *dbp;
+ const char *name;
+ BTMETA *btm;
+{
+ DB_ENV *dbenv;
+ u_int32_t vers;
+ int ret;
+
+ dbenv = dbp->dbenv;
+
+ /*
+ * At this point, all we know is that the magic number is for a Btree.
+ * Check the version, the database may be out of date.
+ */
+ vers = btm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 6:
+ case 7:
+ __db_err(dbenv,
+ "%s: btree version %lu requires a version upgrade",
+ name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 8:
+ break;
+ default:
+ __db_err(dbenv,
+ "%s: unsupported btree version: %lu", name, (u_long)vers);
+ return (EINVAL);
+ }
+
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) && (ret = __bam_mswap((PAGE *)btm)) != 0)
+ return (ret);
+
+ /*
+ * Check application info against metadata info, and set info, flags,
+ * and type based on metadata info.
+ */
+ if ((ret =
+ __db_fchk(dbenv, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0)
+ return (ret);
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNO)) {
+ if (dbp->type == DB_BTREE)
+ goto wrong_type;
+ dbp->type = DB_RECNO;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ } else {
+ if (dbp->type == DB_RECNO)
+ goto wrong_type;
+ dbp->type = DB_BTREE;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ else
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_err(dbenv,
+ "%s: DB_DUP specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) {
+ if (dbp->type != DB_BTREE)
+ goto wrong_type;
+ F_SET(dbp, DB_BT_RECNUM);
+
+ if ((ret = __db_fcchk(dbenv,
+ "DB->open", dbp->flags, DB_AM_DUP, DB_BT_RECNUM)) != 0)
+ return (ret);
+ } else
+ if (F_ISSET(dbp, DB_BT_RECNUM)) {
+ __db_err(dbenv,
+ "%s: DB_RECNUM specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_RE_FIXEDLEN);
+ } else
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ __db_err(dbenv,
+ "%s: DB_FIXEDLEN specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_RE_RENUMBER);
+ } else
+ if (F_ISSET(dbp, DB_RE_RENUMBER)) {
+ __db_err(dbenv,
+ "%s: DB_RENUMBER specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ else
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_err(dbenv,
+ "%s: multiple databases specified but not supported by file",
+ name);
+ return (EINVAL);
+ }
+
+ if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ F_SET(dbp, DB_AM_DUPSORT);
+ } else
+ if (dbp->dup_compare != NULL) {
+ __db_err(dbenv,
+ "%s: duplicate sort specified but not supported in database",
+ name);
+ return (EINVAL);
+ }
+
+ /* Set the page size. */
+ dbp->pgsize = btm->dbmeta.pagesize;
+
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
+
+ return (0);
+
+wrong_type:
+ if (dbp->type == DB_BTREE)
+ __db_err(dbenv,
+ "open method type is Btree, database type is Recno");
+ else
+ __db_err(dbenv,
+ "open method type is Recno, database type is Btree");
+ return (EINVAL);
+}
+
+/*
+ * __bam_read_root --
+ * Check (and optionally create) a tree.
+ *
+ * PUBLIC: int __bam_read_root __P((DB *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_read_root(dbp, name, base_pgno, flags)
+ DB *dbp;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ DBC *dbc;
+ DB_LSN orig_lsn;
+ DB_LOCK metalock;
+ PAGE *root;
+ int locked, ret, t_ret;
+
+ ret = 0;
+ t = dbp->bt_internal;
+ meta = NULL;
+ root = NULL;
+ locked = 0;
+
+ /*
+ * Get a cursor. If DB_CREATE is specified, we may be creating
+ * the root page, and to do that safely in CDB we need a write
+ * cursor. In STD_LOCKING mode, we'll synchronize using the
+ * meta page lock instead.
+ */
+ if ((ret = dbp->cursor(dbp, dbp->open_txn,
+ &dbc, LF_ISSET(DB_CREATE) && CDB_LOCKING(dbp->dbenv) ?
+ DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+ /* Get, and optionally create the metadata page. */
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = memp_fget(
+ dbp->mpf, &base_pgno, DB_MPOOL_CREATE, (PAGE **)&meta)) != 0)
+ goto err;
+
+ /*
+ * If the magic number is correct, we're not creating the tree.
+ * Correct any fields that may not be right. Note, all of the
+ * local flags were set by DB->open.
+ */
+again: if (meta->dbmeta.magic != 0) {
+ t->bt_maxkey = meta->maxkey;
+ t->bt_minkey = meta->minkey;
+ t->re_pad = meta->re_pad;
+ t->re_len = meta->re_len;
+
+ t->bt_meta = base_pgno;
+ t->bt_root = meta->root;
+
+ (void)memp_fput(dbp->mpf, meta, 0);
+ meta = NULL;
+ goto done;
+ }
+
+	/* In recovery, if it's not there, it will be created elsewhere. */
+ if (IS_RECOVERING(dbp->dbenv))
+ goto done;
+
+	/* If we're doing CDB, we now have to get the write lock. */
+ if (CDB_LOCKING(dbp->dbenv)) {
+ /*
+ * We'd better have DB_CREATE set if we're actually doing
+ * the create.
+ */
+ DB_ASSERT(LF_ISSET(DB_CREATE));
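+		/*
+		 * DB_LOCK_UPGRADE converts the write cursor's existing
+		 * CDB lock in place rather than acquiring a second lock.
+		 */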
+ if ((ret = lock_get(dbp->dbenv, dbc->locker, DB_LOCK_UPGRADE,
+ &dbc->lock_dbt, DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ }
+
+	/*
+	 * If we are doing locking, release the read lock and get a write
+	 * lock; dropping the read lock first avoids deadlocking against
+	 * another opener doing the same thing. Once we hold the write
+	 * lock we loop back to re-check the magic number, since another
+	 * thread may have created the tree in the interim.
+	 */
+ if (locked == 0 && STD_LOCKING(dbc)) {
+ if ((ret = __LPUT(dbc, metalock)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, base_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ locked = 1;
+ goto again;
+ }
+
+ /* Initialize the tree structure metadata information. */
+ orig_lsn = meta->dbmeta.lsn;
+ memset(meta, 0, sizeof(BTMETA));
+ meta->dbmeta.lsn = orig_lsn;
+ meta->dbmeta.pgno = base_pgno;
+ meta->dbmeta.magic = DB_BTREEMAGIC;
+ meta->dbmeta.version = DB_BTREEVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ meta->dbmeta.type = P_BTREEMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, BTM_DUP);
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN))
+ F_SET(&meta->dbmeta, BTM_FIXEDLEN);
+ if (F_ISSET(dbp, DB_BT_RECNUM))
+ F_SET(&meta->dbmeta, BTM_RECNUM);
+ if (F_ISSET(dbp, DB_RE_RENUMBER))
+ F_SET(&meta->dbmeta, BTM_RENUMBER);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, BTM_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, BTM_DUPSORT);
+ if (dbp->type == DB_RECNO)
+ F_SET(&meta->dbmeta, BTM_RECNO);
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ meta->maxkey = t->bt_maxkey;
+ meta->minkey = t->bt_minkey;
+ meta->re_len = t->re_len;
+ meta->re_pad = t->re_pad;
+
+ /* If necessary, log the meta-data and root page creates. */
+ if ((ret = __db_log_page(dbp,
+ name, &orig_lsn, base_pgno, (PAGE *)meta)) != 0)
+ goto err;
+
+ /* Create and initialize a root page. */
+ if ((ret = __db_new(dbc,
+ dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, &root)) != 0)
+ goto err;
+ root->level = LEAFLEVEL;
+
+ if (dbp->open_txn != NULL && (ret = __bam_root_log(dbp->dbenv,
+ dbp->open_txn, &meta->dbmeta.lsn, 0, dbp->log_fileid,
+ meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0)
+ goto err;
+
+ meta->root = root->pgno;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+ if ((ret = __db_log_page(dbp,
+ name, &root->lsn, root->pgno, root)) != 0)
+ goto err;
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, name);
+
+ t->bt_meta = base_pgno;
+ t->bt_root = root->pgno;
+
+ /* Release the metadata and root pages. */
+ if ((ret = memp_fput(dbp->mpf, meta, DB_MPOOL_DIRTY)) != 0)
+ goto err;
+ meta = NULL;
+ if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0)
+ goto err;
+ root = NULL;
+
+ /*
+ * Flush the metadata and root pages to disk.
+ *
+ * !!!
+ * It's not useful to return not-yet-flushed here -- convert it to
+ * an error.
+ */
+ if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) {
+ __db_err(dbp->dbenv, "Metapage flush failed");
+ ret = EINVAL;
+ }
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name);
+
+done: /*
+ * !!!
+ * We already did an insert and so the last-page-inserted has been
+ * set. I'm not sure where the *right* place to clear this value
+	 * is; it's not intuitively obvious that it belongs here.
+ */
+ t->bt_lpgno = PGNO_INVALID;
+
+err:
+DB_TEST_RECOVERY_LABEL
+ /* Put any remaining pages back. */
+ if (meta != NULL)
+ if ((t_ret = memp_fput(dbp->mpf, meta, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (root != NULL)
+ if ((t_ret = memp_fput(dbp->mpf, root, 0)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+
+ /* We can release the metapage lock when we are done. */
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
diff --git a/db/btree/bt_put.c b/db/btree/bt_put.c
new file mode 100644
index 000000000..19a04526d
--- /dev/null
+++ b/db/btree/bt_put.c
@@ -0,0 +1,859 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_put.c,v 11.46 2001/01/17 18:48:46 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t));
+static int __bam_ovput
+ __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
+
+/*
+ * __bam_iitem --
+ * Insert an item into the tree.
+ *
+ * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+ */
+int
+__bam_iitem(dbc, key, data, op, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t op, flags;
+{
+ BKEYDATA *bk, bk_tmp;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT bk_hdr, tdbt;
+ PAGE *h;
+ db_indx_t indx;
+ u_int32_t data_size, have_bytes, need_bytes, needed;
+ int cmp, bigkey, bigdata, dupadjust, padrec, replace, ret, was_deleted;
+
+ COMPQUIET(bk, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ h = cp->page;
+ indx = cp->indx;
+ dupadjust = replace = was_deleted = 0;
+
+ /*
+ * Fixed-length records with partial puts: it's an error to specify
+	 * anything other than a simple overwrite.
+ */
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN) &&
+ F_ISSET(data, DB_DBT_PARTIAL) && data->dlen != data->size) {
+ data_size = data->size;
+ goto len_err;
+ }
+
+ /*
+ * Figure out how much space the data will take, including if it's a
+ * partial record.
+ *
+ * Fixed-length records: it's an error to specify a record that's
+ * longer than the fixed-length, and we never require less than
+ * the fixed-length record size.
+ */
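+	/*
+	 * For example (illustrative): with re_len = 100, an 80-byte put
+	 * is padded with re_pad bytes out to 100, while a 120-byte put
+	 * fails with EINVAL.
+	 */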
+ data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+ __bam_partsize(op, data, h, indx) : data->size;
+ padrec = 0;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ if (data_size > t->re_len) {
+len_err: __db_err(dbp->dbenv,
+ "Length improper for fixed length record %lu",
+ (u_long)data_size);
+ return (EINVAL);
+ }
+ if (data_size < t->re_len) {
+ padrec = 1;
+ data_size = t->re_len;
+ }
+ }
+
+ /*
+ * Handle partial puts or short fixed-length records: build the
+ * real record.
+ */
+ if (padrec || F_ISSET(data, DB_DBT_PARTIAL)) {
+ tdbt = *data;
+ if ((ret =
+ __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
+ return (ret);
+ data = &tdbt;
+ }
+
+ /*
+ * If the user has specified a duplicate comparison function, return
+ * an error if DB_CURRENT was specified and the replacement data
+ * doesn't compare equal to the current data. This stops apps from
+ * screwing up the duplicate sort order. We have to do this after
+ * we build the real record so that we're comparing the real items.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+ if ((ret = __bam_cmp(dbp, data, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp != 0) {
+ __db_err(dbp->dbenv,
+ "Current data differs from put data");
+ return (EINVAL);
+ }
+ }
+
+ /*
+ * If the key or data item won't fit on a page, we'll have to store
+ * them on overflow pages.
+ */
+ needed = 0;
+ bigdata = data_size > cp->ovflsize;
+ switch (op) {
+ case DB_KEYFIRST:
+ /* We're adding a new key and data pair. */
+ bigkey = key->size > cp->ovflsize;
+ if (bigkey)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(key->size);
+ if (bigdata)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(data_size);
+ break;
+ case DB_AFTER:
+ case DB_BEFORE:
+ case DB_CURRENT:
+ /*
+ * We're either overwriting the data item of a key/data pair
+ * or we're creating a new on-page duplicate and only adding
+ * a data item.
+ *
+ * !!!
+ * We're not currently correcting for space reclaimed from
+ * already deleted items, but I don't think it's worth the
+ * complexity.
+ */
+ bigkey = 0;
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (B_TYPE(bk->type) == B_KEYDATA)
+ have_bytes = BKEYDATA_PSIZE(bk->len);
+ else
+ have_bytes = BOVERFLOW_PSIZE;
+ need_bytes = 0;
+ } else {
+ have_bytes = 0;
+ need_bytes = sizeof(db_indx_t);
+ }
+ if (bigdata)
+ need_bytes += BOVERFLOW_PSIZE;
+ else
+ need_bytes += BKEYDATA_PSIZE(data_size);
+
+ if (have_bytes < need_bytes)
+ needed += need_bytes - have_bytes;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op));
+ }
+
+ /*
+ * If there's not enough room, or the user has put a ceiling on the
+ * number of keys permitted in the page, split the page.
+ *
+ * XXX
+ * The t->bt_maxkey test here may be insufficient -- do we have to
+ * check in the btree split code, so we don't undo it there!?!?
+ */
+ if (P_FREESPACE(h) < needed ||
+ (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey))
+ return (DB_NEEDSPLIT);
+
+ /*
+ * The code breaks it up into five cases:
+ *
+ * 1. Insert a new key/data pair.
+ * 2. Append a new data item (a new duplicate).
+ * 3. Insert a new data item (a new duplicate).
+ * 4. Delete and re-add the data item (overflow item).
+ * 5. Overwrite the data item.
+ */
+ switch (op) {
+ case DB_KEYFIRST: /* 1. Insert a new key/data pair. */
+ if (bigkey) {
+ if ((ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0)
+ return (ret);
+ } else
+ if ((ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(key->size), NULL, key)) != 0)
+ return (ret);
+
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ ++indx;
+ break;
+ case DB_AFTER: /* 2. Append a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret =
+ __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0)
+ return (ret);
+ if ((ret =
+ __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0)
+ return (ret);
+
+ indx += 3;
+ dupadjust = 1;
+
+ cp->indx += 2;
+ } else {
+ ++indx;
+ cp->indx += 1;
+ }
+ break;
+ case DB_BEFORE: /* 3. Insert a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0)
+ return (ret);
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+
+ ++indx;
+ dupadjust = 1;
+ }
+ break;
+ case DB_CURRENT:
+ /*
+ * Clear the cursor's deleted flag. The problem is that if
+ * we deadlock or fail while deleting the overflow item or
+ * replacing the non-overflow item, a subsequent cursor close
+		 * will try to remove the item because the cursor's delete
+		 * flag is set.
+ */
+ (void)__bam_ca_delete(dbp, PGNO(h), indx, 0);
+
+ if (TYPE(h) == P_LBTREE) {
+ ++indx;
+ dupadjust = 1;
+
+ /*
+ * In a Btree deleted records aren't counted (deleted
+ * records are counted in a Recno because all accesses
+ * are based on record number). If it's a Btree and
+ * it's a DB_CURRENT operation overwriting a previously
+ * deleted record, increment the record count.
+ */
+ was_deleted = B_DISSET(bk->type);
+ }
+
+ /*
+ * 4. Delete and re-add the data item.
+ *
+ * If we're changing the type of the on-page structure, or we
+ * are referencing offpage items, we have to delete and then
+ * re-add the item. We do not do any cursor adjustments here
+ * because we're going to immediately re-add the item into the
+ * same slot.
+ */
+ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+ if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+ return (ret);
+ break;
+ }
+
+ /* 5. Overwrite the data item. */
+ replace = 1;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op));
+ }
+
+ /* Add the data. */
+ if (bigdata) {
+ if ((ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, data)) != 0)
+ return (ret);
+ } else {
+ if (LF_ISSET(BI_DELETED)) {
+ B_TSET(bk_tmp.type, B_KEYDATA, 1);
+ bk_tmp.len = data->size;
+ bk_hdr.data = &bk_tmp;
+ bk_hdr.size = SSZA(BKEYDATA, data);
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), &bk_hdr, data);
+ } else if (replace)
+ ret = __bam_ritem(dbc, h, indx, data);
+ else
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, data);
+ if (ret != 0)
+ return (ret);
+ }
+ if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
+ return (ret);
+
+ /*
+ * Re-position the cursors if necessary and reset the current cursor
+ * to point to the new item.
+ */
+ if (op != DB_CURRENT) {
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
+ }
+
+ /*
+	 * If we've changed the record count, update the tree.  The count
+	 * only needs adjusting if the operation wasn't performed on the
+	 * current record, or if the current record was previously deleted.
+ */
+ if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
+ if ((ret = __bam_adjust(dbc, 1)) != 0)
+ return (ret);
+
+ /*
+ * If a Btree leaf page is at least 50% full and we may have added or
+ * modified a duplicate data item, see if the set of duplicates takes
+ * up at least 25% of the space on the page. If it does, move it onto
+ * its own page.
+ */
+ if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
+ if ((ret = __bam_dup_convert(dbc, h, indx - O_INDX)) != 0)
+ return (ret);
+ }
+
+ /* If we've modified a recno file, set the flag. */
+ if (dbc->dbtype == DB_RECNO)
+ t->re_modified = 1;
+
+ return (ret);
+}
+
+/*
+ * __bam_partsize --
+ * Figure out how much space a partial data item is in total.
+ *
+ * PUBLIC: u_int32_t __bam_partsize __P((u_int32_t, DBT *, PAGE *, u_int32_t));
+ */
+u_int32_t
+__bam_partsize(op, data, h, indx)
+ u_int32_t op, indx;
+ DBT *data;
+ PAGE *h;
+{
+ BKEYDATA *bk;
+ u_int32_t nbytes;
+
+ /*
+ * If the record doesn't already exist, it's simply the data we're
+ * provided.
+ */
+ if (op != DB_CURRENT)
+ return (data->doff + data->size);
+
+ /*
+ * Otherwise, it's the data provided plus any already existing data
+ * that we're not replacing.
+ */
+ bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ nbytes =
+ B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+
+ /*
+ * There are really two cases here:
+ *
+ * Case 1: We are replacing some bytes that do not exist (i.e., they
+ * are past the end of the record). In this case the number of bytes
+ * we are replacing is irrelevant and all we care about is how many
+ * bytes we are going to add from offset. So, the new record length
+ * is going to be the size of the new bytes (size) plus wherever those
+ * new bytes begin (doff).
+ *
+ * Case 2: All the bytes we are replacing exist. Therefore, the new
+ * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
+ * plus the bytes we are adding (size).
+ */
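+	/*
+	 * Worked example (illustrative), assuming an existing 100-byte
+	 * record (nbytes == 100):
+	 *
+	 *	doff = 120, dlen = 10, size = 15: Case 1,
+	 *	    new length = 120 + 15 = 135.
+	 *	doff = 20, dlen = 10, size = 15: Case 2,
+	 *	    new length = 100 - 10 + 15 = 105.
+	 */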
+ if (nbytes < data->doff + data->dlen) /* Case 1 */
+ return (data->doff + data->size);
+
+ return (nbytes + data->size - data->dlen); /* Case 2 */
+}
+
+/*
+ * __bam_build --
+ * Build the real record for a partial put, or short fixed-length record.
+ *
+ * PUBLIC: int __bam_build __P((DBC *, u_int32_t,
+ * PUBLIC: DBT *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_build(dbc, op, dbt, h, indx, nbytes)
+ DBC *dbc;
+ u_int32_t op, indx, nbytes;
+ DBT *dbt;
+ PAGE *h;
+{
+ BKEYDATA *bk, tbk;
+ BOVERFLOW *bo;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT copy;
+ u_int32_t len, tlen;
+ u_int8_t *p;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *) dbc->internal;
+ t = dbp->bt_internal;
+
+	/* We use the record data return memory; it's only a short-term use. */
+ if (dbc->rdata.ulen < nbytes) {
+ if ((ret = __os_realloc(dbp->dbenv,
+ nbytes, NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ }
+ dbc->rdata.ulen = nbytes;
+ }
+
+ /*
+ * We use nul or pad bytes for any part of the record that isn't
+ * specified; get it over with.
+ */
+ memset(dbc->rdata.data,
+ F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_pad : 0, nbytes);
+
+ /*
+ * In the next clauses, we need to do three things: a) set p to point
+ * to the place at which to copy the user's data, b) set tlen to the
+ * total length of the record, not including the bytes contributed by
+ * the user, and c) copy any valid data from an existing record. If
+ * it's not a partial put (this code is called for both partial puts
+ * and fixed-length record padding) or it's a new key, we can cut to
+ * the chase.
+ */
+ if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
+ p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+ tlen = dbt->doff;
+ goto user_copy;
+ }
+
+ /* Find the current record. */
+ if (indx < NUM_ENT(h)) {
+ bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ bo = (BOVERFLOW *)bk;
+ } else {
+ bk = &tbk;
+ B_TSET(bk->type, B_KEYDATA, 0);
+ bk->len = 0;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ /*
+ * In the case of an overflow record, we shift things around
+ * in the current record rather than allocate a separate copy.
+ */
+ memset(&copy, 0, sizeof(copy));
+ if ((ret = __db_goff(dbp, &copy, bo->tlen,
+ bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
+ return (ret);
+
+ /* Skip any leading data from the original record. */
+ tlen = dbt->doff;
+ p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+
+ /*
+ * Copy in any trailing data from the original record.
+ *
+ * If the original record was larger than the original offset
+ * plus the bytes being deleted, there is trailing data in the
+ * original record we need to preserve. If we aren't deleting
+ * the same number of bytes as we're inserting, copy it up or
+ * down, into place.
+ *
+ * Use memmove(), the regions may overlap.
+ */
+ if (bo->tlen > dbt->doff + dbt->dlen) {
+ len = bo->tlen - (dbt->doff + dbt->dlen);
+ if (dbt->dlen != dbt->size)
+ memmove(p + dbt->size, p + dbt->dlen, len);
+ tlen += len;
+ }
+ } else {
+ /* Copy in any leading data from the original record. */
+ memcpy(dbc->rdata.data,
+ bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
+ tlen = dbt->doff;
+ p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+
+ /* Copy in any trailing data from the original record. */
+ len = dbt->doff + dbt->dlen;
+ if (bk->len > len) {
+ memcpy(p + dbt->size, bk->data + len, bk->len - len);
+ tlen += bk->len - len;
+ }
+ }
+
+user_copy:
+ /*
+ * Copy in the application provided data -- p and tlen must have been
+ * initialized above.
+ */
+ memcpy(p, dbt->data, dbt->size);
+ tlen += dbt->size;
+
+ /* Set the DBT to reference our new record. */
+ dbc->rdata.size = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : tlen;
+ dbc->rdata.dlen = 0;
+ dbc->rdata.doff = 0;
+ dbc->rdata.flags = 0;
+ *dbt = dbc->rdata;
+ return (0);
+}
+
+/*
+ * __bam_ritem --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *));
+ */
+int
+__bam_ritem(dbc, h, indx, data)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *data;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT orig, repl;
+ db_indx_t cnt, lo, ln, min, off, prefix, suffix;
+ int32_t nbytes;
+ int ret;
+ u_int8_t *p, *t;
+
+ dbp = dbc->dbp;
+
+ /*
+	 * Replace a single item on a page.  The logic for figuring out where
+ * to insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling.
+ */
+ bk = GET_BKEYDATA(h, indx);
+
+ /* Log the change. */
+ if (DB_LOGGING(dbc)) {
+ /*
+ * We might as well check to see if the two data items share
+ * a common prefix and suffix -- it can save us a lot of log
+		 * space if the items are large.
+ */
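+		/*
+		 * For example (illustrative): replacing "aaaXXbbb" with
+		 * "aaaYbbb" gives prefix 3 and suffix 3, so only the
+		 * 2-byte "XX" -> 1-byte "Y" difference is logged.
+		 */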
+ min = data->size < bk->len ? data->size : bk->len;
+ for (prefix = 0,
+ p = bk->data, t = data->data;
+ prefix < min && *p == *t; ++prefix, ++p, ++t)
+ ;
+
+ min -= prefix;
+ for (suffix = 0,
+ p = (u_int8_t *)bk->data + bk->len - 1,
+ t = (u_int8_t *)data->data + data->size - 1;
+ suffix < min && *p == *t; ++suffix, --p, --t)
+ ;
+
+ /* We only log the parts of the keys that have changed. */
+ orig.data = (u_int8_t *)bk->data + prefix;
+ orig.size = bk->len - (prefix + suffix);
+ repl.data = (u_int8_t *)data->data + prefix;
+ repl.size = data->size - (prefix + suffix);
+ if ((ret = __bam_repl_log(dbp->dbenv, dbc->txn,
+ &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h),
+ (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
+ &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Set references to the first in-use byte on the page and the
+ * first byte of the item being replaced.
+ */
+ p = (u_int8_t *)h + HOFFSET(h);
+ t = (u_int8_t *)bk;
+
+ /*
+ * If the entry is growing in size, shift the beginning of the data
+ * part of the page down. If the entry is shrinking in size, shift
+ * the beginning of the data part of the page up. Use memmove(3),
+ * the regions overlap.
+ */
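+	/*
+	 * Illustrative: replacing a 20-byte on-page item (lo) with a
+	 * 28-byte one (ln) gives nbytes == -8, so HOFFSET moves down by
+	 * 8 bytes and every item at or below the old offset shifts with it.
+	 */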
+ lo = BKEYDATA_SIZE(bk->len);
+ ln = BKEYDATA_SIZE(data->size);
+ if (lo != ln) {
+ nbytes = lo - ln; /* Signed difference. */
+ if (p == t) /* First index is fast. */
+ h->inp[indx] += nbytes;
+ else { /* Else, shift the page. */
+ memmove(p + nbytes, p, t - p);
+
+ /* Adjust the indices' offsets. */
+ off = h->inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
+ if (h->inp[cnt] <= off)
+ h->inp[cnt] += nbytes;
+ }
+
+ /* Clean up the page and adjust the item's reference. */
+ HOFFSET(h) += nbytes;
+ t += nbytes;
+ }
+
+ /* Copy the new item onto the page. */
+ bk = (BKEYDATA *)t;
+ B_TSET(bk->type, B_KEYDATA, 0);
+ bk->len = data->size;
+ memcpy(bk->data, data->data, data->size);
+
+ return (0);
+}
+
+/*
+ * __bam_dup_convert --
+ * Check to see if the duplicate set at indx should have its own page.
+ * If it should, create it.
+ */
+static int
+__bam_dup_convert(dbc, h, indx)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+{
+ BTREE_CURSOR *cp;
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT hdr;
+ PAGE *dp;
+ db_indx_t cnt, cpindx, dindx, first, sz;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Count the duplicate records and calculate how much room they're
+ * using on the page.
+ */
+ while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
+ indx -= P_INDX;
+ for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
+ if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx])
+ break;
+ bk = GET_BKEYDATA(h, indx);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+ bk = GET_BKEYDATA(h, indx + O_INDX);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+ }
+
+ /*
+ * We have to do these checks when the user is replacing the cursor's
+ * data item -- if the application replaces a duplicate item with a
+ * larger data item, it can increase the amount of space used by the
+ * duplicates, requiring this check. But that means we may have done
+ * this check when it wasn't a duplicate item after all.
+ */
+ if (cnt == 1)
+ return (0);
+
+ /*
+ * If this set of duplicates is using more than 25% of the page, move
+ * them off. The choice of 25% is a WAG, but the value must be small
+ * enough that we can always split a page without putting duplicates
+ * on two different pages.
+ */
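+	/*
+	 * Illustrative: with 8KB pages, any duplicate set consuming 2KB
+	 * or more of on-page space is moved onto its own page.
+	 */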
+ if (sz < dbp->pgsize / 4)
+ return (0);
+
+ /* Get a new page. */
+ if ((ret = __db_new(dbc,
+ dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &dp)) != 0)
+ return (ret);
+ P_INIT(dp, dbp->pgsize, dp->pgno,
+ PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+ /*
+	 * Move this set of duplicates off the page.  "first" points to the
+	 * first key of the first duplicate key/data pair; cnt is the number
+	 * of pairs
+ * we're dealing with.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ dindx = first;
+ indx = first;
+ cpindx = 0;
+ do {
+ /* Move cursors referencing the old entry to the new entry. */
+ if ((ret = __bam_ca_dup(dbc, first,
+ PGNO(h), indx, PGNO(dp), cpindx)) != 0)
+ goto err;
+
+ /*
+		 * Copy the entry to the new page.  If the off-duplicate
+		 * page is a Btree page (i.e. dup_compare is non-NULL; we
+		 * use Btree pages for sorted dups and Recno pages for
+		 * unsorted dups), move all entries
+ * normally, even deleted ones. If it's a Recno page,
+ * deleted entries are discarded (if the deleted entry is
+ * overflow, then free up those pages).
+ */
+ bk = GET_BKEYDATA(h, dindx + 1);
+ hdr.data = bk;
+ hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+ if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
+ /*
+			 * Unsorted dups (i.e., a Recno page) and a deleted
+			 * entry: don't move it, but if it was an overflow
+			 * entry, we need to free its pages.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc,
+ (GET_BOVERFLOW(h, dindx + 1))->pgno)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_pitem(
+ dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
+ goto err;
+ ++cpindx;
+ }
+ /* Delete all but the last reference to the key. */
+ if (cnt != 1) {
+ if ((ret = __bam_adjindx(dbc,
+ h, dindx, first + 1, 0)) != 0)
+ goto err;
+ } else
+ dindx++;
+
+ /* Delete the data item. */
+ if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
+ goto err;
+ indx += P_INDX;
+ } while (--cnt);
+
+ /* Put in a new data item that points to the duplicates page. */
+ if ((ret = __bam_ovput(dbc,
+ B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
+ goto err;
+
+	/* Adjust cursors for all the above movements. */
+ if ((ret = __bam_ca_di(dbc,
+ PGNO(h), first + P_INDX, first + P_INDX - indx)) != 0)
+ goto err;
+
+ return (memp_fput(dbp->mpf, dp, DB_MPOOL_DIRTY));
+
+err: (void)__db_free(dbc, dp);
+ return (ret);
+}
+
+/*
+ * __bam_ovput --
+ * Build an item for an off-page duplicates page or overflow page and
+ * insert it on the page.
+ */
+static int
+__bam_ovput(dbc, type, pgno, h, indx, item)
+ DBC *dbc;
+ u_int32_t type, indx;
+ db_pgno_t pgno;
+ PAGE *h;
+ DBT *item;
+{
+ BOVERFLOW bo;
+ DBT hdr;
+ int ret;
+
+ UMRW_SET(bo.unused1);
+ B_TSET(bo.type, type, 0);
+ UMRW_SET(bo.unused2);
+
+ /*
+ * If we're creating an overflow item, do so and acquire the page
+ * number for it. If we're creating an off-page duplicates tree,
+	 * we are given the page number as an argument.
+ */
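+	/*
+	 * The two call shapes used in this file (illustrative):
+	 *
+	 *	__bam_ovput(dbc, B_OVERFLOW, PGNO_INVALID, h, indx, data);
+	 *	__bam_ovput(dbc, B_DUPLICATE, dp->pgno, h, indx, NULL);
+	 */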
+ if (type == B_OVERFLOW) {
+ if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0)
+ return (ret);
+ bo.tlen = item->size;
+ } else {
+ bo.pgno = pgno;
+ bo.tlen = 0;
+ }
+
+ /* Store the new record on the page. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bo;
+ hdr.size = BOVERFLOW_SIZE;
+ return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
+}
diff --git a/db/btree/bt_rec.c b/db/btree/bt_rec.c
new file mode 100644
index 000000000..24dc9bc6a
--- /dev/null
+++ b/db/btree/bt_rec.c
@@ -0,0 +1,1219 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_rec.c,v 11.35 2001/01/10 16:24:47 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "hash.h"
+#include "btree.h"
+#include "log.h"
+
+#define IS_BTREE_PAGE(pagep) \
+ (TYPE(pagep) == P_IBTREE || \
+ TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP)
+
+/*
+ * __bam_pg_alloc_recover --
+ * Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __bam_pg_alloc_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_pg_alloc_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_pg_alloc_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, level, modified, ret;
+
+ REC_PRINT(__bam_pg_alloc_print);
+ REC_INTRO(__bam_pg_alloc_read, 0);
+
+ /*
+ * Fix up the allocated page. If we're redoing the operation, we have
+ * to get the page (creating it if it doesn't exist), and update its
+ * LSN. If we're undoing the operation, we have to reset the page's
+ * LSN and put it on the free list.
+ *
+ * Fix up the metadata page. If we're redoing the operation, we have
+ * to get the metadata page and update its LSN and its free pointer.
+ * If we're undoing the operation and the page was ever created, we put
+ * it on the freelist.
+ */
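+	/*
+	 * A sketch of the LSN-comparison idiom used by most of the
+	 * recovery functions in this file (illustrative, mirroring the
+	 * code below rather than adding logic):
+	 *
+	 *	cmp_p = log_compare(&LSN(page), &argp->lsn);
+	 *	if (cmp_p == 0 && DB_REDO(op)) {
+	 *		...redo the change...;  LSN(page) = *lsnp;
+	 *	} else if (log_compare(lsnp, &LSN(page)) == 0 && DB_UNDO(op)) {
+	 *		...undo the change...;  LSN(page) = argp->lsn;
+	 *	}
+	 */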
+ pgno = PGNO_BASE_MD;
+ meta = NULL;
+ if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ (void)__db_pgerr(file_dbp, pgno);
+ goto out;
+ } else
+ goto done;
+ }
+ if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) {
+ /*
+ * We specify creation and check for it later, because this
+ * operation was supposed to create the page, and even in
+ * the undo case it's going to get linked onto the freelist
+ * which we're also fixing up.
+ */
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto err;
+ }
+
+ /* Fix up the allocated page. */
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->page_lsn);
+
+ /*
+	 * If an initial allocation is aborted and then reallocated
+ * during an archival restore the log record will have
+ * an LSN for the page but the page will be empty.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)))
+ cmp_p = 0;
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->page_lsn);
+ /*
+	 * If we rolled back this allocation previously during an
+ * archive restore, the page may have the LSN of the meta page
+ * at the point of the roll back. This will be no more
+ * than the LSN of the metadata page at the time of this allocation.
+ */
+ if (DB_REDO(op) &&
+ (cmp_p == 0 ||
+ (IS_ZERO_LSN(argp->page_lsn) &&
+ log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo update described. */
+ switch (argp->ptype) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ level = LEAFLEVEL;
+ break;
+ default:
+ level = 0;
+ break;
+ }
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+ pagep->lsn = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Undo the allocation, reinitialize the page and
+ * link its next pointer to the free list.
+ */
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+ pagep->lsn = argp->page_lsn;
+ modified = 1;
+ }
+
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+ goto err;
+ }
+
+ /*
+ * If the page was newly created, put it on the limbo list.
+ */
+ if (IS_ZERO_LSN(LSN(pagep)) &&
+ IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+ /* Put the page in limbo.*/
+ if ((ret = __db_add_limbo(dbenv,
+ info, argp->fileid, argp->pgno, 1)) != 0)
+ goto err;
+ }
+
+ /* Fix up the metadata page. */
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ LSN(meta) = *lsnp;
+ meta->free = argp->next;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ LSN(meta) = argp->meta_lsn;
+
+ /*
+		 * If the page has a zero LSN then it's newly created
+ * and will go into limbo rather than directly on the
+ * free list.
+ */
+ if (!IS_ZERO_LSN(argp->page_lsn))
+ meta->free = argp->pgno;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+ /*
+	 * This could be the metapage from a subdb, which is read from disk
+ * to recover its creation.
+ */
+ if (F_ISSET(file_dbp, DB_AM_SUBDB))
+ switch (argp->type) {
+ case P_BTREEMETA:
+ case P_HASHMETA:
+ case P_QAMMETA:
+ file_dbp->sync(file_dbp, 0);
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err:
+ if (meta != NULL)
+ (void)memp_fput(mpf, meta, 0);
+ }
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_pg_free_recover --
+ * Recovery function for pg_free.
+ *
+ * PUBLIC: int __bam_pg_free_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_pg_free_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_pg_free_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_pg_free_print);
+ REC_INTRO(__bam_pg_free_read, 1);
+
+ /*
+ * Fix up the freed page. If we're redoing the operation we get the
+ * page and explicitly discard its contents, then update its LSN. If
+ * we're undoing the operation, we get the page and restore its header.
+	 * Create the page if necessary; we may be freeing an aborted
+ * create.
+ */
+ if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto out;
+ modified = 0;
+ __ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn);
+ if (DB_REDO(op) &&
+ (cmp_p == 0 ||
+ (IS_ZERO_LSN(copy_lsn) &&
+ log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+ /* Need to redo update described. */
+ P_INIT(pagep, file_dbp->pgsize,
+ argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+ pagep->lsn = *lsnp;
+
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ memcpy(pagep, argp->header.data, argp->header.size);
+
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+ /*
+ * Fix up the metadata page. If we're redoing or undoing the operation
+ * we get the page and update its LSN and free pointer.
+ */
+ pgno = PGNO_BASE_MD;
+ if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) {
+ /* The metadata page must always exist. */
+ (void)__db_pgerr(file_dbp, pgno);
+ goto out;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo the deallocation. */
+ meta->free = argp->pgno;
+ LSN(meta) = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo the deallocation. */
+ meta->free = argp->next;
+ LSN(meta) = argp->meta_lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_split_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, root_pgno;
+ u_int32_t ptype;
+ int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_split_print);
+
+ mpf = NULL;
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_read, 1);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
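+	/*
+	 * Illustrative picture of the two cases:
+	 *
+	 *	Root split:	root (leaf) becomes root (internal)
+	 *			with new children left and right.
+	 *	Non-root split:	page becomes page + new right sibling;
+	 *			the parent-page insert is logged and
+	 *			recovered separately.
+	 */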
+ if ((ret = __os_malloc(dbenv, argp->pg.size, NULL, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ root_pgno = argp->root_pgno;
+ rootsplit = pgno == root_pgno;
+ if (memp_fget(mpf, &argp->left, 0, &lp) != 0)
+ lp = NULL;
+ if (memp_fget(mpf, &argp->right, 0, &rp) != 0)
+ rp = NULL;
+
+ if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist, it's
+ * the page we're splitting and it gets modified. If this is
+ * not a root split, then the left page has to exist, for the
+ * same reason.
+ */
+ if (rootsplit) {
+ if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) {
+ (void)__db_pgerr(file_dbp, pgno);
+ pp = NULL;
+ goto out;
+ }
+ cmp = log_compare(&LSN(pp), &LSN(argp->pg.data));
+ CHECK_LSN(op, cmp, &LSN(pp), &LSN(argp->pg.data));
+ p_update = cmp == 0;
+ } else if (lp == NULL) {
+ (void)__db_pgerr(file_dbp, argp->left);
+ goto out;
+ }
+
+ if (lp != NULL) {
+ cmp = log_compare(&LSN(lp), &argp->llsn);
+ CHECK_LSN(op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ } else
+ l_update = 1;
+
+ if (rp != NULL) {
+ cmp = log_compare(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ } else
+ r_update = 1;
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret =
+ __os_malloc(dbenv, file_dbp->pgsize, NULL, &_lp)) != 0
+ || (ret =
+ __os_malloc(dbenv, file_dbp->pgsize, NULL, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ /* If the left child is wrong, update it. */
+ if (lp == NULL && (ret =
+ memp_fget(mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) {
+ (void)__db_pgerr(file_dbp, argp->left);
+ lp = NULL;
+ goto out;
+ }
+ if (l_update) {
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ lp = NULL;
+ }
+
+ /* If the right child is wrong, update it. */
+ if (rp == NULL && (ret = memp_fget(mpf,
+ &argp->right, DB_MPOOL_CREATE, &rp)) != 0) {
+ (void)__db_pgerr(file_dbp, argp->right);
+ rp = NULL;
+ goto out;
+ }
+ if (r_update) {
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ rp = NULL;
+ }
+
+ /*
+ * If the parent page is wrong, update it. This is of interest
+ * only if it was a root split, since root splits create parent
+ * pages. All other splits modify a parent page, but those are
+ * separately logged and recovered.
+ */
+ if (rootsplit && p_update) {
+ if (IS_BTREE_PAGE(sp)) {
+ ptype = P_IBTREE;
+ rc = argp->opflags & SPL_NRECS ? 1 : 0;
+ } else {
+ ptype = P_IRECNO;
+ rc = 1;
+ }
+
+ P_INIT(pp, file_dbp->pgsize, root_pgno,
+ PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype);
+ RE_NREC_SET(pp,
+ rc ? __bam_total(_lp) + __bam_total(_rp) : 0);
+
+ pp->lsn = *lsnp;
+ if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) {
+ if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) {
+ (void)__db_pgerr(file_dbp, argp->npgno);
+ np = NULL;
+ goto out;
+ }
+ cmp = log_compare(&LSN(np), &argp->nlsn);
+ CHECK_LSN(op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ if ((ret =
+ memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ } else {
+ /*
+ * If the split page is wrong, replace its contents with the
+ * logged page contents. If the page doesn't exist, it means
+ * that the create of the page never happened, nor did any of
+ * the adds onto the page that caused the split, and there's
+		 * really no undoing to be done.
+ */
+ if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) {
+ pp = NULL;
+ goto lrundo;
+ }
+ if (log_compare(lsnp, &LSN(pp)) == 0) {
+ memcpy(pp, argp->pg.data, argp->pg.size);
+ if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+ /*
+ * If it's a root split and the left child ever existed, update
+ * its LSN. (If it's not a root split, we've updated the left
+ * page already -- it's the same as the split page.) If the
+ * right child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+lrundo: if ((rootsplit && lp != NULL) || rp != NULL) {
+ if (rootsplit && lp != NULL &&
+ log_compare(lsnp, &LSN(lp)) == 0) {
+ lp->lsn = argp->llsn;
+ if ((ret =
+ memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ if (rp != NULL &&
+ log_compare(lsnp, &LSN(rp)) == 0) {
+ rp->lsn = argp->rlsn;
+ if ((ret =
+ memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ rp = NULL;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+		 * previous-page pointer updated to our new page.  It's
+		 * possible that the next page never existed, in which case
+		 * there's simply nothing to undo.
+ */
+ if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) {
+ if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (log_compare(lsnp, &LSN(np)) == 0) {
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+				if ((ret =
+				    memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that weren't dirtied. */
+ if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = memp_fput(mpf, lp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = memp_fput(mpf, np, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = memp_fput(mpf, rp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(_lp, file_dbp->pgsize);
+ if (_rp != NULL)
+ __os_free(_rp, file_dbp->pgsize);
+ if (sp != NULL)
+ __os_free(sp, argp->pg.size);
+
+ REC_CLOSE;
+}
+
+/*
+ * __bam_rsplit_recover --
+ * Recovery function for a reverse split.
+ *
+ * PUBLIC: int __bam_rsplit_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rsplit_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rsplit_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno, root_pgno;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_rsplit_print);
+ REC_INTRO(__bam_rsplit_read, 1);
+
+ /* Fix the root page. */
+ pgno = root_pgno = argp->root_pgno;
+ if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) {
+ /* The root page must always exist if we are going forward. */
+ if (DB_REDO(op)) {
+ __db_pgerr(file_dbp, pgno);
+ goto out;
+ }
+ /* This must be the root of an OPD tree. */
+ DB_ASSERT(root_pgno !=
+ ((BTREE *)file_dbp->bt_internal)->bt_root);
+ ret = 0;
+ goto done;
+ }
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->rootlsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->rootlsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ pagep->pgno = root_pgno;
+ pagep->lsn = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ P_INIT(pagep, file_dbp->pgsize, root_pgno,
+ argp->nrec, PGNO_INVALID, pagep->level + 1,
+ IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO);
+ if ((ret = __db_pitem(dbc, pagep, 0,
+ argp->rootent.size, &argp->rootent, NULL)) != 0)
+ goto out;
+ pagep->lsn = argp->rootlsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+ /*
+ * Fix the page copied over the root page. It's possible that the
+	 * page never made it to disk, so if we're undoing and the page
+ * doesn't exist, it's okay and there's nothing further to do.
+ */
+ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if (DB_UNDO(op))
+ goto done;
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto out;
+ }
+ modified = 0;
+ __ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN));
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ pagep->lsn = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_adj_recover --
+ * Recovery function for adj.
+ *
+ * PUBLIC: int __bam_adj_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_adj_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_adj_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_adj_print);
+ REC_INTRO(__bam_adj_read, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if (DB_UNDO(op))
+ goto done;
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto out;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
+ goto err;
+
+ LSN(pagep) = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
+ goto err;
+
+ LSN(pagep) = argp->lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)memp_fput(mpf, pagep, 0);
+ }
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_cadjust_recover --
+ * Recovery function for the adjust of a count change in an internal
+ * page.
+ *
+ * PUBLIC: int __bam_cadjust_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cadjust_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cadjust_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_cadjust_print);
+ REC_INTRO(__bam_cadjust_read, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if (DB_UNDO(op))
+ goto done;
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto out;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(pagep, argp->indx)->nrecs += argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ } else {
+ GET_RINTERNAL(pagep, argp->indx)->nrecs += argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ }
+
+ LSN(pagep) = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ } else {
+ GET_RINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ }
+ LSN(pagep) = argp->lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_cdel_recover --
+ * Recovery function for the intent-to-delete of a cursor record.
+ *
+ * PUBLIC: int __bam_cdel_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cdel_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cdel_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ u_int32_t indx;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_cdel_print);
+ REC_INTRO(__bam_cdel_read, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if (DB_UNDO(op))
+ goto done;
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto out;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DSET(GET_BKEYDATA(pagep, indx)->type);
+
+ LSN(pagep) = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DCLR(GET_BKEYDATA(pagep, indx)->type);
+
+ (void)__bam_ca_delete(file_dbp, argp->pgno, argp->indx, 0);
+
+ LSN(pagep) = argp->lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_repl_recover --
+ * Recovery function for page item replacement.
+ *
+ * PUBLIC: int __bam_repl_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_repl_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_repl_args *argp;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+ u_int8_t *p;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_repl_print);
+ REC_INTRO(__bam_repl_read, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+ if (DB_UNDO(op))
+ goto done;
+ (void)__db_pgerr(file_dbp, argp->pgno);
+ goto out;
+ }
+ bk = GET_BKEYDATA(pagep, argp->indx);
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(pagep));
+ cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Need to redo update described.
+ *
+ * Re-build the replacement item.
+ */
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->repl.size;
+ if ((ret = __os_malloc(dbenv, dbt.size, NULL, &dbt.data)) != 0)
+ goto err;
+ p = dbt.data;
+ memcpy(p, bk->data, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->repl.data, argp->repl.size);
+ p += argp->repl.size;
+ memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc, pagep, argp->indx, &dbt);
+ __os_free(dbt.data, dbt.size);
+ if (ret != 0)
+ goto err;
+
+ LSN(pagep) = *lsnp;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Need to undo update described.
+ *
+ * Re-build the original item.
+ */
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->orig.size;
+ if ((ret = __os_malloc(dbenv, dbt.size, NULL, &dbt.data)) != 0)
+ goto err;
+ p = dbt.data;
+ memcpy(p, bk->data, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->orig.data, argp->orig.size);
+ p += argp->orig.size;
+ memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc, pagep, argp->indx, &dbt);
+ __os_free(dbt.data, dbt.size);
+ if (ret != 0)
+ goto err;
+
+ /* Reset the deleted flag, if necessary. */
+ if (argp->isdeleted)
+ B_DSET(GET_BKEYDATA(pagep, argp->indx)->type);
+
+ LSN(pagep) = argp->lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+ if (0) {
+err: (void)memp_fput(mpf, pagep, 0);
+ }
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_root_recover --
+ * Recovery function for setting the root page on the meta-data page.
+ *
+ * PUBLIC: int __bam_root_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_root_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_root_args *argp;
+ BTMETA *meta;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int cmp_n, cmp_p, modified, ret;
+
+ COMPQUIET(info, NULL);
+ REC_PRINT(__bam_root_print);
+ REC_INTRO(__bam_root_read, 0);
+
+ if ((ret = memp_fget(mpf, &argp->meta_pgno, 0, &meta)) != 0) {
+ /* The metadata page must always exist on redo. */
+ if (DB_REDO(op)) {
+ (void)__db_pgerr(file_dbp, argp->meta_pgno);
+ goto out;
+ } else
+ goto done;
+ }
+
+ modified = 0;
+ cmp_n = log_compare(lsnp, &LSN(meta));
+ cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ meta->root = argp->root_pgno;
+ meta->dbmeta.lsn = *lsnp;
+ ((BTREE *)file_dbp->bt_internal)->bt_root = meta->root;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Nothing to undo except lsn. */
+ meta->dbmeta.lsn = argp->meta_lsn;
+ modified = 1;
+ }
+ if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+ goto out;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_curadj_recover --
+ * Transaction abort function to undo cursor adjustments.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_curadj_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_curadj_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_curadj_args *argp;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int ret;
+
+ COMPQUIET(info, NULL);
+
+ REC_PRINT(__bam_curadj_print);
+ REC_INTRO(__bam_curadj_read, 0);
+
+ ret = 0;
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+	switch (argp->mode) {
+ case DB_CA_DI:
+ if ((ret = __bam_ca_di(dbc, argp->from_pgno,
+ argp->from_indx, -(int)argp->first_indx)) != 0)
+ goto out;
+ break;
+ case DB_CA_DUP:
+ if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
+ argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_RSPLIT:
+ if ((ret =
+ __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_SPLIT:
+ __bam_ca_undosplit(file_dbp, argp->from_pgno,
+ argp->to_pgno, argp->left_pgno, argp->from_indx);
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_rcuradj_recover --
+ * Transaction abort function to undo cursor adjustments in rrecno.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_rcuradj_recover
+ * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rcuradj_recover(dbenv, dbtp, lsnp, op, info)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rcuradj_args *argp;
+ BTREE_CURSOR *cp;
+ DB *file_dbp;
+ DBC *dbc, *rdbc;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(info, NULL);
+ rdbc = NULL;
+
+ REC_PRINT(__bam_rcuradj_print);
+ REC_INTRO(__bam_rcuradj_read, 0);
+
+ ret = t_ret = 0;
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ /*
+ * We don't know whether we're in an offpage dup set, and
+ * thus don't know whether the dbc REC_INTRO has handed us is
+ * of a reasonable type. It's certainly unset, so if this is
+ * an offpage dup set, we don't have an OPD cursor. The
+ * simplest solution is just to allocate a whole new cursor
+	 * for our use; we're only really using it to pass some
+ * state into __ram_ca, and this way we don't need to make
+ * this function know anything about how offpage dups work.
+ */
+ if ((ret =
+ __db_icursor(file_dbp, NULL, DB_RECNO, argp->root, 0, &rdbc)) != 0)
+ goto out;
+
+ cp = (BTREE_CURSOR *)rdbc->internal;
+ F_SET(cp, C_RENUMBER);
+ cp->recno = argp->recno;
+
+	switch (argp->mode) {
+ case CA_DELETE:
+ /*
+ * The way to undo a delete is with an insert. Since
+ * we're undoing it, the delete flag must be set.
+ */
+ F_SET(cp, C_DELETED);
+ F_SET(cp, C_RENUMBER); /* Just in case. */
+ cp->order = argp->order;
+ __ram_ca(rdbc, CA_ICURRENT);
+ break;
+ case CA_IAFTER:
+ case CA_IBEFORE:
+ case CA_ICURRENT:
+ /*
+ * The way to undo an insert is with a delete. The delete
+ * flag is unset to start with.
+ */
+ F_CLR(cp, C_DELETED);
+ cp->order = INVALID_ORDER;
+ __ram_ca(rdbc, CA_DELETE);
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: if (rdbc != NULL && (t_ret = rdbc->c_close(rdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ REC_CLOSE;
+}
diff --git a/db/btree/bt_reclaim.c b/db/btree/bt_reclaim.c
new file mode 100644
index 000000000..538d837c2
--- /dev/null
+++ b/db/btree/bt_reclaim.c
@@ -0,0 +1,53 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_reclaim.c,v 11.5 2000/03/22 04:21:01 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "btree.h"
+
+/*
+ * __bam_reclaim --
+ * Free a database.
+ *
+ * PUBLIC: int __bam_reclaim __P((DB *, DB_TXN *));
+ */
+int
+__bam_reclaim(dbp, txn)
+ DB *dbp;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ int ret, t_ret;
+
+ /* Acquire a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc,
+ DB_LOCK_WRITE, dbc->internal->root, __db_reclaim_callback, dbc);
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/db/btree/bt_recno.c b/db/btree/bt_recno.c
new file mode 100644
index 000000000..6ac0cac35
--- /dev/null
+++ b/db/btree/bt_recno.c
@@ -0,0 +1,1369 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_recno.c,v 11.65 2001/01/18 14:33:22 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "db_ext.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
+#include "qam.h"
+#include "txn.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're
+ * re-numbering records, it means the record was implicitly created. We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested. If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag. This is different from the B+tree code.
+ * It also maintains whether the cursor references a deleted record in the
+ * cursor, and it doesn't always check the on-page value.
+ */
+#define CD_SET(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) \
+ F_SET(cp, C_DELETED); \
+}
+#define CD_CLR(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) { \
+ F_CLR(cp, C_DELETED); \
+ cp->order = INVALID_ORDER; \
+ } \
+}
+#define CD_ISSET(cp) \
+ (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED))
+
+/*
+ * Macros for comparing the ordering of two cursors.
+ * cp1 comes before cp2 iff one of the following holds:
+ * cp1's recno is less than cp2's recno
+ * recnos are equal, both deleted, and cp1's order is less than cp2's
+ * recnos are equal, cp1 deleted, and cp2 not deleted
+ */
+#define C_LESSTHAN(cp1, cp2) \
+ (((cp1)->recno < (cp2)->recno) || \
+ (((cp1)->recno == (cp2)->recno) && \
+ ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
+ (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
+
+/*
+ * cp1 is equal to cp2 iff their recnos and delete flags are identical,
+ * and if the delete flag is set their orders are also identical.
+ */
+#define C_EQUAL(cp1, cp2) \
+ ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
+ (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
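+
+/*
+ * For illustration (values here are made up): given three cursors on a
+ * renumbering recno,
+ *
+ *	cp1: recno 5, deleted, order 1
+ *	cp2: recno 5, deleted, order 2
+ *	cp3: recno 5, not deleted
+ *
+ * C_LESSTHAN(cp1, cp2) holds (equal recnos, both deleted, lower order),
+ * C_LESSTHAN(cp1, cp3) holds (equal recnos, only cp1 deleted), and
+ * C_EQUAL is true only for two cursors that agree on recno, delete flag
+ * and, when deleted, order.
+ */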
+
+/*
+ * Do we need to log the current cursor adjustment?
+ */
+#define CURADJ_LOG(dbc) \
+ (DB_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
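+
+/*
+ * Cursor adjustments only need logging when a subtransaction abort may
+ * have to undo them (see __bam_rcuradj_recover), hence the test for a
+ * parent transaction.
+ */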
+
+/*
+ * __ram_open --
+ * Recno open function.
+ *
+ * PUBLIC: int __ram_open __P((DB *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__ram_open(dbp, name, base_pgno, flags)
+ DB *dbp;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DBC *dbc;
+ int ret, t_ret;
+
+ t = dbp->bt_internal;
+
+ /* Initialize the remaining fields/methods of the DB. */
+ dbp->del = __ram_delete;
+ dbp->put = __ram_put;
+ dbp->stat = __bam_stat;
+
+ /* Start up the tree. */
+ if ((ret = __bam_read_root(dbp, name, base_pgno, flags)) != 0)
+ return (ret);
+
+ /*
+ * If the user specified a source tree, open it and map it in.
+ *
+ * !!!
+ * We don't complain if the user specified transactions or threads.
+ * It's possible to make it work, but you'd better know what you're
+ * doing!
+ */
+ if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
+ return (ret);
+
+ /* If we're snapshotting an underlying source file, do it now. */
+ if (F_ISSET(dbp, DB_RE_SNAPSHOT)) {
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Do the snapshot. */
+		if ((ret = __ram_update(dbc,
+		    DB_MAX_RECORDS, 0)) == DB_NOTFOUND)
+ ret = 0;
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+	return (ret);
+}
+
+/*
+ * __ram_delete --
+ * Recno db->del function.
+ */
+static int
+__ram_delete(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+
+ /* Check for invalid flags. */
+ if ((ret = __db_delchk(dbp,
+ key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
+ return (ret);
+
+ /* Acquire a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags);
+
+ /* Check the user's record number and fill in as necessary. */
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ goto err;
+
+ /* Do the delete. */
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp->recno = recno;
+
+ ret = __ram_c_del(dbc);
+
+ /* Release the cursor. */
+err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ram_put --
+ * Recno db->put function.
+ */
+static int
+__ram_put(dbp, txn, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+
+ /* Check for invalid flags. */
+ if ((ret = __db_putchk(dbp,
+ key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0)
+ return (ret);
+
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags);
+
+ /*
+ * If we're appending to the tree, make sure we've read in all of
+ * the backing source file. Otherwise, check the user's record
+ * number and fill in as necessary. If we found the record or it
+ * simply didn't exist, add the user's record.
+ */
+ if (flags == DB_APPEND)
+ ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
+ else
+ ret = __ram_getno(dbc, key, &recno, 1);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &recno, data, flags, 0);
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Return the record number if we're appending to the tree. */
+ if (ret == 0 && flags == DB_APPEND)
+ ret = __db_retcopy(dbp, key, &recno, sizeof(recno),
+ &dbc->rkey.data, &dbc->rkey.ulen);
+
+ return (ret);
+}
+
+/*
+ * __ram_c_del --
+ * Recno cursor->c_del function.
+ *
+ * PUBLIC: int __ram_c_del __P((DBC *));
+ */
+int
+__ram_c_del(dbc)
+ DBC *dbc;
+{
+ BKEYDATA bk;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ DBT hdr, data;
+ EPG *epg;
+ int exact, ret, stack;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ stack = 0;
+
+ /*
+ * The semantics of cursors during delete are as follows: in
+ * non-renumbering recnos, records are replaced with a marker
+ * containing a delete flag. If the record referenced by this cursor
+ * has already been deleted, we will detect that as part of the delete
+ * operation, and fail.
+ *
+ * In renumbering recnos, cursors which represent deleted items
+ * are flagged with the C_DELETED flag, and it is an error to
+ * call c_del a second time without an intervening cursor motion.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+
+ /* Search the tree for the key; delete only deletes exact matches. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ stack = 1;
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+
+ /*
+ * If re-numbering records, the on-page deleted flag can only mean
+ * that this record was implicitly created. Applications aren't
+ * permitted to delete records they never created, return an error.
+ *
+ * If not re-numbering records, the on-page deleted flag means that
+ * this record was implicitly created, or, was deleted at some time.
+ * The former is an error because applications aren't permitted to
+ * delete records they never created, the latter is an error because
+ * if the record was "deleted", we could never have found it.
+ */
+ if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (F_ISSET(cp, C_RENUMBER)) {
+ /* Delete the item, adjust the counts, adjust the cursors. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+		if ((ret = __bam_adjust(dbc, -1)) != 0)
+			goto err;
+ if (__ram_ca(dbc, CA_DELETE) > 0 &&
+ CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_DELETE,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+
+ /*
+ * If the page is empty, delete it.
+ *
+ * We never delete a root page. First, root pages of primary
+ * databases never go away, recno or otherwise. However, if
+ * it's the root page of an off-page duplicates database, then
+ * it can be deleted. We don't delete it here because we have
+ * no way of telling the primary database page holder (e.g.,
+		 * the hash access method) that its page element should be
+		 * cleaned up because the underlying tree is gone.  So, we
+		 * keep the page around until the last cursor referencing
+		 * the empty tree is closed, and then clean it up.
+ */
+ if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
+ /*
+ * We already have a locked stack of pages. However,
+ * there are likely entries in the stack that aren't
+ * going to be emptied by removing the single reference
+ * to the emptied page (or one of its parents).
+ */
+ for (epg = cp->sp; epg <= cp->csp; ++epg)
+ if (NUM_ENT(epg->page) <= 1)
+ break;
+
+ /*
+ * We want to delete a single item out of the last page
+ * that we're not deleting, back up to that page.
+ */
+ ret = __bam_dpages(dbc, --epg);
+
+ /*
+ * Regardless of the return from __bam_dpages, it will
+ * discard our stack and pinned page.
+ */
+ stack = 0;
+ cp->page = NULL;
+ }
+ } else {
+ /* Use a delete/put pair to replace the record with a marker. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ B_TSET(bk.type, B_KEYDATA, 1);
+ bk.len = 0;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bk;
+ hdr.size = SSZA(BKEYDATA, data);
+ memset(&data, 0, sizeof(data));
+ data.data = (void *)"";
+ data.size = 0;
+ if ((ret = __db_pitem(dbc,
+ cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+ goto err;
+ }
+
+ t->re_modified = 1;
+
+err: if (stack)
+ __bam_stkrel(dbc, STK_CLRDBC);
+
+ return (ret);
+}
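+
+/*
+ * Usage sketch (illustrative): in a renumbering recno, calling c_del
+ * twice through the same cursor without an intervening motion fails,
+ * per the semantics above:
+ *
+ *	ret = dbc->c_del(dbc, 0);	(returns 0, sets C_DELETED)
+ *	ret = dbc->c_del(dbc, 0);	(returns DB_KEYEMPTY)
+ */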
+
+/*
+ * __ram_c_get --
+ * Recno cursor->c_get function.
+ *
+ * PUBLIC: int __ram_c_get
+ * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ram_c_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int cmp, exact, ret;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: switch (flags) {
+ case DB_CURRENT:
+ /*
+ * If we're using mutable records and the deleted flag is
+ * set, the cursor is pointing at a nonexistent record;
+ * return an error.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+ break;
+ case DB_NEXT_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * next duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the next item assuredly is
+ * a dup, so we set flags to DB_NEXT and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_NEXT_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_NEXT
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ flags = DB_NEXT;
+ /*
+ * If record numbers are mutable: if we just deleted a record,
+ * we have to avoid incrementing the record number so that we
+ * return the right record by virtue of renumbering the tree.
+ */
+ if (CD_ISSET(cp))
+ break;
+
+ if (cp->recno != RECNO_OOB) {
+ ++cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ cp->recno = 1;
+ break;
+ case DB_PREV_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_PREV
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_PREV:
+ flags = DB_PREV;
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ --cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ flags = DB_PREV;
+ if (((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+ if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
+ goto err;
+ if (cp->recno == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_GET_BOTHC:
+ /*
+ * If we're doing a join and these are offpage dups,
+ * we want to keep searching forward from after the
+ * current cursor position. Increment the recno by 1,
+ * then proceed as for a DB_SET.
+ *
+		 * Otherwise, we know there is no additional matching
+		 * data, as recnos don't have dups; return DB_NOTFOUND.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno++;
+ break;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_GET_BOTH:
+ /*
+ * If we're searching a set of off-page dups, we start
+ * a new linear search from the first record. Otherwise,
+ * we compare the single data item associated with the
+ * requested record for a match.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->dbenv, "__ram_c_get", flags);
+ goto err;
+ }
+
+ /*
+ * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
+ * called __ram_update() to make sure sufficient records have been
+ * read from the backing source file. Do it now for DB_CURRENT (if
+ * the current record was deleted we may need more records from the
+ * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
+ */
+ if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
+ __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+
+ for (;; ++cp->recno) {
+ /* Search the tree for the record. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno,
+ F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
+ 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /*
+ * Copy the page into the cursor, discarding any lock we
+ * are currently holding.
+ */
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+ (void)__TLPUT(dbc, cp->lock);
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this
+ * record was implicitly created. If not re-numbering records,
+ * the on-page deleted flag means this record was implicitly
+ * created, or, it was deleted at some time. Regardless, we
+ * skip such records if doing cursor next/prev operations or
+ * walking through off-page duplicates, and fail if they were
+ * requested explicitly by the application.
+ */
+ if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type))
+ switch (flags) {
+ case DB_NEXT:
+ case DB_PREV:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ goto retry;
+ case DB_GET_BOTH:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ continue;
+ default:
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC) {
+ if ((ret = __bam_cmp(dbp, data,
+ cp->page, cp->indx, __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ break;
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ } else
+ break;
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ if (flags != DB_SET && flags != DB_SET_RANGE)
+ ret = __db_retcopy(dbp,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey.data, &dbc->rkey.ulen);
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
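+
+/*
+ * For example (illustrative): with records {1, 2, 3} where record 2 is
+ * marked deleted on-page, a DB_SET on record 2 returns DB_KEYEMPTY,
+ * while a DB_NEXT from record 1 skips record 2 and returns record 3,
+ * via the retry loop above.
+ */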
+
+/*
+ * __ram_c_put --
+ * Recno cursor->c_put function.
+ *
+ * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ram_c_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ int exact, nc, ret, t_ret;
+ u_int32_t iiflags;
+ void *arg;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * DB_KEYFIRST and DB_KEYLAST will only be set if we're dealing with
+ * an off-page duplicate tree, they can't be specified at user level.
+ * Translate them into something else.
+ */
+ switch (flags) {
+ case DB_KEYFIRST:
+ cp->recno = 1;
+ flags = DB_BEFORE;
+ break;
+ case DB_KEYLAST:
+ if ((ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0)) != 0)
+ return (ret);
+ if (CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_ICURRENT,
+ cp->root, cp->recno, cp->order)))
+ return (ret);
+ return (0);
+ }
+
+ /*
+ * If we're putting with a cursor that's marked C_DELETED, we need to
+ * take special care; the cursor doesn't "really" reference the item
+ * corresponding to its current recno, but instead is "between" that
+ * record and the current one. Translate the actual insert into
+ * DB_BEFORE, and let the __ram_ca work out the gory details of what
+ * should wind up pointing where.
+ */
+ if (CD_ISSET(cp))
+ iiflags = DB_BEFORE;
+ else
+ iiflags = flags;
+
+split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
+ goto err;
+ /*
+ * An inexact match is okay; it just means we're one record past the
+ * end, which is reasonable if we're marked deleted.
+ */
+ DB_ASSERT(exact || CD_ISSET(cp));
+
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+
+ ret = __bam_iitem(dbc, key, data, iiflags, 0);
+ t_ret = __bam_stkrel(dbc, STK_CLRDBC);
+
+ if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
+ ret = t_ret;
+ else if (ret == DB_NEEDSPLIT) {
+ arg = &cp->recno;
+ if ((ret = __bam_split(dbc, arg)) != 0)
+ goto err;
+ goto split;
+ }
+ if (ret != 0)
+ goto err;
+
+ switch (flags) { /* Adjust the cursors. */
+ case DB_AFTER:
+ nc = __ram_ca(dbc, CA_IAFTER);
+
+ /*
+ * We only need to adjust this cursor forward if we truly added
+ * the item after the current recno, rather than remapping it
+ * to DB_BEFORE.
+ */
+ if (iiflags == DB_AFTER)
+ ++cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_IAFTER,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_BEFORE:
+ nc = __ram_ca(dbc, CA_IBEFORE);
+ --cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_IBEFORE,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_CURRENT:
+ /*
+ * We only need to do an adjustment if we actually
+ * added an item, which we only would have done if the
+ * cursor was marked deleted.
+ *
+ * Only log if __ram_ca found any relevant cursors.
+ */
+ if (CD_ISSET(cp) && __ram_ca(dbc, CA_ICURRENT) > 0 &&
+ CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(
+ dbp->dbenv, dbc->txn, &lsn, 0, dbp->log_fileid,
+ CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ }
+
+ /* Return the key if we've created a new record. */
+ if (!F_ISSET(dbc, DBC_OPD) && (flags == DB_AFTER || flags == DB_BEFORE))
+ ret = __db_retcopy(dbp, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey.data, &dbc->rkey.ulen);
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ram_ca --
+ * Adjust cursors. Returns the number of relevant cursors.
+ *
+ * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg));
+ */
+int
+__ram_ca(dbc_arg, op)
+ DBC *dbc_arg;
+ ca_recno_arg op;
+{
+ BTREE_CURSOR *cp, *cp_arg;
+ DB *dbp, *ldbp;
+ DB_ENV *dbenv;
+ DBC *dbc;
+ db_recno_t recno;
+ int adjusted, found;
+ u_int32_t order;
+
+ dbp = dbc_arg->dbp;
+ dbenv = dbp->dbenv;
+ cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
+ recno = cp_arg->recno;
+
+ found = 0;
+
+ /*
+ * It only makes sense to adjust cursors if we're a renumbering
+ * recno; we should only be called if this is one.
+ */
+ DB_ASSERT(F_ISSET(cp_arg, C_RENUMBER));
+
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ /*
+ * If we're doing a delete, we need to find the highest
+ * order of any cursor currently pointing at this item,
+ * so we can assign a higher order to the newly deleted
+ * cursor. Unfortunately, this requires a second pass through
+ * the cursor list.
+ */
+ if (op == CA_DELETE) {
+ order = 1;
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp_arg->root == cp->root &&
+ recno == cp->recno && CD_ISSET(cp) &&
+ order <= cp->order)
+ order = cp->order + 1;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ } else
+ order = INVALID_ORDER;
+
+ /* Now go through and do the actual adjustments. */
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp_arg->root != cp->root)
+ continue;
+ ++found;
+ adjusted = 0;
+ switch (op) {
+ case CA_DELETE:
+ if (recno < cp->recno) {
+ --cp->recno;
+ /*
+ * If the adjustment made them equal,
+ * we have to merge the orders.
+ */
+ if (recno == cp->recno && CD_ISSET(cp))
+ cp->order += order;
+ } else if (recno == cp->recno &&
+ !CD_ISSET(cp)) {
+ CD_SET(cp);
+ cp->order = order;
+ }
+ break;
+ case CA_IBEFORE:
+ /*
+ * IBEFORE is just like IAFTER, except that we
+ * adjust cursors on the current record too.
+ */
+ if (C_EQUAL(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ goto iafter;
+ case CA_ICURRENT:
+
+ /*
+ * If the original cursor wasn't deleted, we
+ * just did a replacement and so there's no
+ * need to adjust anything--we shouldn't have
+ * gotten this far. Otherwise, we behave
+ * much like an IAFTER, except that all
+ * cursors pointing to the current item get
+ * marked undeleted and point to the new
+ * item.
+ */
+ DB_ASSERT(CD_ISSET(cp_arg));
+ if (C_EQUAL(cp_arg, cp)) {
+ CD_CLR(cp);
+ break;
+ }
+ /* FALLTHROUGH */
+ case CA_IAFTER:
+iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ if (recno == cp->recno && adjusted)
+ /*
+ * If we've moved this cursor's recno,
+ * split its order number--i.e.,
+ * decrement it by enough so that
+ * the lowest cursor moved has order 1.
+ * cp_arg->order is the split point,
+ * so decrement by one less than that.
+ */
+ cp->order -= (cp_arg->order - 1);
+ break;
+ }
+ }
+		MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ return (found);
+}
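+
+/*
+ * A worked CA_DELETE pass (values are illustrative): if cursors A and B
+ * both reference recno 3 and A is already marked deleted with order 1,
+ * deleting through B first finds the highest order among deleted
+ * cursors on recno 3 (here 1), then marks B deleted with order 2, so A
+ * still sorts before B under C_LESSTHAN; any cursor on a recno greater
+ * than 3 simply has its recno decremented.
+ */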
+
+/*
+ * __ram_getno --
+ * Check the user's record number, and make sure we've seen it.
+ *
+ * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+ */
+int
+__ram_getno(dbc, key, rep, can_create)
+ DBC *dbc;
+ const DBT *key;
+ db_recno_t *rep;
+ int can_create;
+{
+ DB *dbp;
+ db_recno_t recno;
+
+ dbp = dbc->dbp;
+
+ /* Check the user's record number. */
+ if ((recno = *(db_recno_t *)key->data) == 0) {
+ __db_err(dbp->dbenv, "illegal record number of 0");
+ return (EINVAL);
+ }
+ if (rep != NULL)
+ *rep = recno;
+
+ /*
+ * Btree can neither create records nor read them in. Recno can
+ * do both, see if we can find the record.
+ */
+ return (dbc->dbtype == DB_RECNO ?
+ __ram_update(dbc, recno, can_create) : 0);
+}
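+
+/*
+ * Callers hand record numbers through the key DBT; a minimal sketch of
+ * setting up a lookup of record 42:
+ *
+ *	db_recno_t recno;
+ *	DBT key;
+ *
+ *	recno = 42;
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = &recno;
+ *	key.size = sizeof(recno);
+ *
+ * A record number of 0 is rejected with EINVAL, as above.
+ */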
+
+/*
+ * __ram_update --
+ * Ensure the tree has records up to and including the specified one.
+ */
+static int
+__ram_update(dbc, recno, can_create)
+ DBC *dbc;
+ db_recno_t recno;
+ int can_create;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_recno_t nrecs;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+
+ /*
+ * If we can't create records and we've read the entire backing input
+ * file, we're done.
+ */
+ if (!can_create && t->re_eof)
+ return (0);
+
+ /*
+ * If we haven't seen this record yet, try to get it from the original
+ * file.
+ */
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ if (!t->re_eof && recno > nrecs) {
+ if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If we can create records, create empty ones up to the requested
+ * record.
+ */
+ if (!can_create || recno <= nrecs + 1)
+ return (0);
+
+ dbc->rdata.dlen = 0;
+ dbc->rdata.doff = 0;
+ dbc->rdata.flags = 0;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ if (dbc->rdata.ulen < t->re_len) {
+ if ((ret = __os_realloc(dbp->dbenv,
+ t->re_len, NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ }
+ dbc->rdata.ulen = t->re_len;
+ }
+ dbc->rdata.size = t->re_len;
+ memset(dbc->rdata.data, t->re_pad, t->re_len);
+ } else
+ dbc->rdata.size = 0;
+
+ while (recno > ++nrecs)
+ if ((ret = __ram_add(dbc,
+ &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0)
+ return (ret);
+ return (0);
+}
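+
+/*
+ * For example (illustrative): storing record 5 into an empty tree with
+ * can_create set first adds records 1 through 4 as empty items flagged
+ * BI_DELETED (padded to re_len for fixed-length databases), so the tree
+ * always holds records 1..N without gaps.
+ */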
+
+/*
+ * __ram_source --
+ * Load information about the backing file.
+ */
+static int
+__ram_source(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ char *source;
+ int ret;
+
+ t = dbp->bt_internal;
+
+ /* Find the real name, and swap out the one we had before. */
+ if ((ret = __db_appname(dbp->dbenv,
+ DB_APP_DATA, NULL, t->re_source, 0, NULL, &source)) != 0)
+ return (ret);
+ __os_freestr(t->re_source);
+ t->re_source = source;
+
+ /*
+ * !!!
+ * It's possible that the backing source file is read-only. We don't
+ * much care other than we'll complain if there are any modifications
+ * when it comes time to write the database back to the source.
+ */
+ if ((t->re_fp = fopen(t->re_source, "r")) == NULL) {
+ ret = errno;
+ __db_err(dbp->dbenv, "%s: %s", t->re_source, db_strerror(ret));
+ return (ret);
+ }
+
+ t->re_eof = 0;
+ return (0);
+}
+
+/*
+ * __ram_writeback --
+ * Rewrite the backing file.
+ *
+ * PUBLIC: int __ram_writeback __P((DB *));
+ */
+int
+__ram_writeback(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ DB_ENV *dbenv;
+ DBC *dbc;
+ DBT key, data;
+ FILE *fp;
+ db_recno_t keyno;
+ int ret, t_ret;
+ u_int8_t delim, *pad;
+
+ t = dbp->bt_internal;
+ dbenv = dbp->dbenv;
+ fp = NULL;
+
+ /* If the file wasn't modified, we're done. */
+ if (!t->re_modified)
+ return (0);
+
+ /* If there's no backing source file, we're done. */
+ if (t->re_source == NULL) {
+ t->re_modified = 0;
+ return (0);
+ }
+
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Read any remaining records into the tree.
+ *
+ * !!!
+ * This is why we can't support transactions when applications specify
+ * backing (re_source) files. At this point we have to read in the
+ * rest of the records from the file so that we can write all of the
+ * records back out again, which could modify a page for which we'd
+ * have to log changes and which we don't have locked. This could be
+ * partially fixed by taking a snapshot of the entire file during the
+ * DB->open as DB->open is transaction protected. But, if a checkpoint
+ * occurs then, the part of the log holding the copy of the file could
+ * be discarded, and that would make it impossible to recover in the
+ * face of disaster. This could all probably be fixed, but it would
+ * require transaction protecting the backing source file.
+ *
+ * XXX
+ * This could be made to work now that we have transactions protecting
+ * file operations. Margo has specifically asked for the privilege of
+ * doing this work.
+ */
+ if ((ret =
+ __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+
+ /*
+ * Close any existing file handle and re-open the file, truncating it.
+ */
+ if (t->re_fp != NULL) {
+ if (fclose(t->re_fp) != 0) {
+ ret = errno;
+ goto err;
+ }
+ t->re_fp = NULL;
+ }
+ if ((fp = fopen(t->re_source, "w")) == NULL) {
+ ret = errno;
+ __db_err(dbenv, "%s: %s", t->re_source, db_strerror(ret));
+ goto err;
+ }
+
+ /*
+ * We step through the records, writing each one out. Use the record
+ * number and the dbp->get() function, instead of a cursor, so we find
+ * and write out "deleted" or non-existent records.
+ */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ key.size = sizeof(db_recno_t);
+ key.data = &keyno;
+
+ /*
+ * We'll need the delimiter if we're doing variable-length records,
+ * and the pad character if we're doing fixed-length records.
+ */
+ delim = t->re_delim;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ if ((ret = __os_malloc(dbenv, t->re_len, NULL, &pad)) != 0)
+ goto err;
+ memset(pad, t->re_pad, t->re_len);
+ } else
+ COMPQUIET(pad, NULL);
+ for (keyno = 1;; ++keyno) {
+ switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
+ case 0:
+ if (fwrite(data.data, 1, data.size, fp) != data.size)
+ goto write_err;
+ break;
+ case DB_KEYEMPTY:
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN) &&
+ fwrite(pad, 1, t->re_len, fp) != t->re_len)
+ goto write_err;
+ break;
+ case DB_NOTFOUND:
+ ret = 0;
+ goto done;
+ }
+ if (!F_ISSET(dbp, DB_RE_FIXEDLEN) &&
+ fwrite(&delim, 1, 1, fp) != 1) {
+write_err: ret = errno;
+		__db_err(dbenv,
+		    "%s: write failed to backing file: %s",
+		    t->re_source, db_strerror(ret));
+ goto err;
+ }
+ }
+
+err:
+done: /* Close the file descriptor. */
+ if (fp != NULL && fclose(fp) != 0) {
+ if (ret == 0)
+ ret = errno;
+ __db_err(dbenv, "%s: %s", t->re_source, db_strerror(errno));
+ }
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0)
+ t->re_modified = 0;
+
+ return (ret);
+}
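+
+/*
+ * The resulting file format, for illustration: variable-length records
+ * "ab" and "c" with an re_delim of '\n' are written back as "ab\nc\n";
+ * fixed-length databases instead write exactly re_len bytes per record,
+ * emitting re_pad bytes for deleted or never-existing records.
+ */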
+
+/*
+ * __ram_sread --
+ * Read records from a source file.
+ */
+static int
+__ram_sread(dbc, top)
+ DBC *dbc;
+ db_recno_t top;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data;
+ db_recno_t recno;
+ size_t len;
+ int ch, ret, was_modified;
+
+ t = dbc->dbp->bt_internal;
+ dbp = dbc->dbp;
+ was_modified = t->re_modified;
+
+ if ((ret = __bam_nrecs(dbc, &recno)) != 0)
+ return (ret);
+
+ /* Use the record data return memory, it's only a short-term use. */
+ len = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : 256;
+ if (dbc->rdata.ulen < len) {
+ if ((ret = __os_realloc(
+ dbp->dbenv, len, NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ }
+ dbc->rdata.ulen = len;
+ }
+
+ memset(&data, 0, sizeof(data));
+ while (recno < top) {
+ data.data = dbc->rdata.data;
+ data.size = 0;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN))
+ for (len = t->re_len; len > 0; --len) {
+ if ((ch = getc(t->re_fp)) == EOF)
+ goto eof;
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ }
+ else
+ for (;;) {
+ if ((ch = getc(t->re_fp)) == EOF)
+ goto eof;
+ if (ch == t->re_delim)
+ break;
+
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ if (data.size == dbc->rdata.ulen) {
+ if ((ret = __os_realloc(dbp->dbenv,
+ dbc->rdata.ulen *= 2,
+ NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ } else
+ data.data = dbc->rdata.data;
+ }
+ }
+
+ /*
+ * Another process may have read this record from the input
+ * file and stored it into the database already, in which
+		 * case we don't need to repeat that operation.  We only add
+		 * the record just read if the number of records previously
+		 * read from the file is greater than or equal to the number
+		 * of records already in the database.
+ */
+ if (t->re_last >= recno) {
+ ++recno;
+ if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
+ goto err;
+ }
+ ++t->re_last;
+ }
+
+ if (0) {
+eof: t->re_eof = 1;
+ ret = DB_NOTFOUND;
+ }
+err: if (!was_modified)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_add --
+ * Add records into the tree.
+ */
+static int
+__ram_add(dbc, recnop, data, flags, bi_flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ DBT *data;
+ u_int32_t flags, bi_flags;
+{
+ BKEYDATA *bk;
+ BTREE_CURSOR *cp;
+ int exact, ret, stack;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: /* Find the slot for insertion. */
+ if ((ret = __bam_rsearch(dbc, recnop,
+ S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
+ return (ret);
+ stack = 1;
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+
+ /*
+ * The application may modify the data based on the selected record
+ * number.
+ */
+ if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
+ (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this record
+ * was implicitly created. If not re-numbering records, the on-page
+ * deleted flag means this record was implicitly created, or, it was
+ * deleted at some time.
+ *
+ * If DB_NOOVERWRITE is set and the item already exists in the tree,
+ * return an error unless the item was either marked for deletion or
+ * only implicitly created.
+ */
+ if (exact) {
+ bk = GET_BKEYDATA(cp->page, cp->indx);
+ if (!B_DISSET(bk->type) && flags == DB_NOOVERWRITE) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
+ }
+
+ /*
+ * Select the arguments for __bam_iitem() and do the insert. If the
+ * key is an exact match, or we're replacing the data item with a
+ * new data item, replace the current item. If the key isn't an exact
+ * match, we're inserting a new key/data pair, before the search
+ * location.
+ */
+ switch (ret = __bam_iitem(dbc,
+ NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
+ case 0:
+ /*
+ * Don't adjust anything.
+ *
+ * If we inserted a record, no cursors need adjusting because
+ * the only new record it's possible to insert is at the very
+ * end of the tree. The necessary adjustments to the internal
+ * page counts were made by __bam_iitem().
+ *
+ * If we overwrote a record, no cursors need adjusting because
+ * future DBcursor->get calls will simply return the underlying
+ * record (there's no adjustment made for the DB_CURRENT flag
+ * when a cursor get operation immediately follows a cursor
+ * delete operation, and the normal adjustment for the DB_NEXT
+ * flag is still correct).
+ */
+ break;
+ case DB_NEEDSPLIT:
+ /* Discard the stack of pages and split the page. */
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ stack = 0;
+
+ if ((ret = __bam_split(dbc, recnop)) != 0)
+ goto err;
+
+ goto retry;
+ /* NOTREACHED */
+ default:
+ goto err;
+ }
+
+err: if (stack)
+ __bam_stkrel(dbc, STK_CLRDBC);
+
+ return (ret);
+}
diff --git a/db/btree/bt_rsearch.c b/db/btree/bt_rsearch.c
new file mode 100644
index 000000000..7102cd715
--- /dev/null
+++ b/db/btree/bt_rsearch.c
@@ -0,0 +1,429 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_rsearch.c,v 11.21 2000/03/28 21:50:04 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "db_shash.h"
+#include "lock.h"
+
+/*
+ * __bam_rsearch --
+ * Search a btree for a record number.
+ *
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+ */
+int
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+ int stop, *exactp;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t adjust, deloffset, indx, top;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg;
+ db_recno_t recno, t_recno, total;
+ int ret, stack;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ BT_STK_CLR(cp);
+
+ /*
+	 * There are several ways we search a btree. The flags argument
+ * specifies if we're acquiring read or write locks and if we are
+ * locking pairs of pages. In addition, if we're adding or deleting
+ * an item, we have to lock the entire tree, regardless. See btree.h
+ * for more details.
+ *
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+ pg = cp->root;
+ stack = LF_ISSET(S_STACK);
+ lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+
+ /*
+ * Decide if we need to save this page; if we do, write lock it.
+ * We deliberately don't lock-couple on this call. If the tree
+ * is tiny, i.e., one page, and two threads are busily updating
+ * the root page, we're almost guaranteed deadlocks galore, as
+ * each one gets a read lock and then blocks the other's attempt
+ * for a write lock.
+ */
+ if (!stack &&
+ ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
+ (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__LPUT(dbc, lock);
+ lock_mode = DB_LOCK_WRITE;
+ if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ stack = 1;
+ }
+
+ /*
+ * If appending to the tree, set the record number now -- we have the
+ * root page locked.
+ *
+ * Delete only deletes exact matches, read only returns exact matches.
+ * Note, this is different from __bam_search(), which returns non-exact
+ * matches for read.
+ *
+ * The record may not exist. We can only return the correct location
+ * for the record immediately after the last record in the tree, so do
+ * a fast check now.
+ */
+ total = RE_NREC(h);
+ if (LF_ISSET(S_APPEND)) {
+ *exactp = 0;
+ *recnop = recno = total + 1;
+ } else {
+ recno = *recnop;
+ if (recno <= total)
+ *exactp = 1;
+ else {
+ *exactp = 0;
+ if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
+ /*
+ * Keep the page locked for serializability.
+ *
+ * XXX
+ * This leaves the root page locked, which will
+ * eliminate any concurrency. A possible fix
+ * would be to lock the last leaf page instead.
+ */
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__TLPUT(dbc, lock);
+ return (DB_NOTFOUND);
+ }
+ }
+ }
+
+ /*
+ * !!!
+ * Record numbers in the tree are 0-based, but the recno is
+ * 1-based. All of the calculations below have to take this
+ * into account.
+ */
+ for (total = 0;;) {
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ case P_LDUP:
+ recno -= total;
+ /*
+ * There may be logically deleted records on the page.
+ * If there are enough, the record may not exist.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ adjust = P_INDX;
+ deloffset = O_INDX;
+ } else {
+ adjust = O_INDX;
+ deloffset = 0;
+ }
+ for (t_recno = 0, indx = 0;; indx += adjust) {
+ if (indx >= NUM_ENT(h)) {
+ *exactp = 0;
+ if (!LF_ISSET(S_PAST_EOF) ||
+ recno > t_recno + 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+ if (!B_DISSET(
+ GET_BKEYDATA(h, indx + deloffset)->type) &&
+ ++t_recno == recno)
+ break;
+ }
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ case P_IBTREE:
+ for (indx = 0, top = NUM_ENT(h);;) {
+ bi = GET_BINTERNAL(h, indx);
+ if (++indx == top || total + bi->nrecs >= recno)
+ break;
+ total += bi->nrecs;
+ }
+ pg = bi->pgno;
+ break;
+ case P_LRECNO:
+ recno -= total;
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ --recno;
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, recno, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ case P_IRECNO:
+ for (indx = 0, top = NUM_ENT(h);;) {
+ ri = GET_RINTERNAL(h, indx);
+ if (++indx == top || total + ri->nrecs >= recno)
+ break;
+ total += ri->nrecs;
+ }
+ pg = ri->pgno;
+ break;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+ --indx;
+
+ if (stack) {
+ /* Return if this is the lowest page wanted. */
+ if (LF_ISSET(S_PARENT) && stop == h->level) {
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ }
+ BT_STK_PUSH(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+
+ lock_mode = DB_LOCK_WRITE;
+ if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else {
+ /*
+ * Decide if we want to return a pointer to the next
+ * page in the stack. If we do, write lock it and
+ * never unlock it.
+ */
+ if ((LF_ISSET(S_PARENT) &&
+ (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
+ (h->level - 1) == LEAFLEVEL)
+ stack = 1;
+
+ (void)memp_fput(dbp->mpf, h, 0);
+
+ lock_mode = stack &&
+ LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * If we fail, discard the lock we held. This
+ * is OK because this only happens when we are
+ * descending the tree holding read-locks.
+ */
+ __LPUT(dbc, lock);
+ goto err;
+ }
+ }
+
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
+ goto err;
+ }
+ /* NOTREACHED */
+
+err: BT_STK_POP(cp);
+ __bam_stkrel(dbc, 0);
+ return (ret);
+}
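+
+/*
+ * A worked descent (illustrative): searching for record 5 below an
+ * IRECNO page whose children hold {3, 4, 2} records scans entries until
+ * total + nrecs >= 5, descending into the second child with total == 3;
+ * at the P_LRECNO leaf, recno -= total leaves 2 and the final --recno
+ * converts it to the 0-based page index 1.
+ */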
+
+/*
+ * __bam_adjust --
+ * Adjust the tree after adding or deleting a record.
+ *
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
+ */
+int
+__bam_adjust(dbc, adjust)
+ DBC *dbc;
+ int32_t adjust;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ EPG *epg;
+ PAGE *h;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+ /* Update the record counts for the tree. */
+ for (epg = cp->sp; epg <= cp->csp; ++epg) {
+ h = epg->page;
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
+ if (DB_LOGGING(dbc) &&
+ (ret = __bam_cadjust_log(dbp->dbenv,
+ dbc->txn, &LSN(h), 0, dbp->log_fileid,
+ PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust,
+ PGNO(h) == root_pgno ? CAD_UPDATEROOT : 0)) != 0)
+ return (ret);
+
+ if (TYPE(h) == P_IBTREE)
+ GET_BINTERNAL(h, epg->indx)->nrecs += adjust;
+ else
+ GET_RINTERNAL(h, epg->indx)->nrecs += adjust;
+
+ if (PGNO(h) == root_pgno)
+ RE_NREC_ADJ(h, adjust);
+
+ if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
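+
+/*
+ * For example (illustrative): after a single delete the caller passes
+ * adjust == -1, so every internal page on the search stack has the
+ * nrecs count of the traversed entry decremented, and the root's total
+ * record count drops by one via RE_NREC_ADJ.
+ */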
+
+/*
+ * __bam_nrecs --
+ * Return the number of records in the tree.
+ *
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
+ */
+int
+__bam_nrecs(dbc, rep)
+ DBC *dbc;
+ db_recno_t *rep;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ pgno = dbc->internal->root;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ return (ret);
+
+ *rep = RE_NREC(h);
+
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__TLPUT(dbc, lock);
+
+ return (0);
+}
+
+/*
+ * __bam_total --
+ * Return the number of records below a page.
+ *
+ * PUBLIC: db_recno_t __bam_total __P((PAGE *));
+ */
+db_recno_t
+__bam_total(h)
+ PAGE *h;
+{
+ db_recno_t nrecs;
+ db_indx_t indx, top;
+
+ nrecs = 0;
+ top = NUM_ENT(h);
+
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += P_INDX)
+ if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
+ ++nrecs;
+ break;
+ case P_LDUP:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(h, indx)->type))
+ ++nrecs;
+ break;
+ case P_IBTREE:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_BINTERNAL(h, indx)->nrecs;
+ break;
+ case P_LRECNO:
+ nrecs = NUM_ENT(h);
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_RINTERNAL(h, indx)->nrecs;
+ break;
+ }
+
+ return (nrecs);
+}
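+
+/*
+ * For example (illustrative): a P_LBTREE leaf holding three key/data
+ * pairs, one of whose data items has the deleted bit set, contributes
+ * two records; internal pages simply sum the nrecs fields of their
+ * children.
+ */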
diff --git a/db/btree/bt_search.c b/db/btree/bt_search.c
new file mode 100644
index 000000000..d822198f2
--- /dev/null
+++ b/db/btree/bt_search.c
@@ -0,0 +1,471 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_search.c,v 11.32 2001/01/17 20:19:46 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "btree.h"
+#include "lock.h"
+
+/*
+ * __bam_search --
+ * Search a btree for a key.
+ *
+ * PUBLIC: int __bam_search __P((DBC *,
+ * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
+ */
+int
+__bam_search(dbc, key, flags, stop, recnop, exactp)
+ DBC *dbc;
+ const DBT *key;
+ u_int32_t flags;
+ int stop, *exactp;
+ db_recno_t *recnop;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ db_indx_t base, i, indx, lim;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg;
+ db_recno_t recno;
+ int adjust, cmp, deloffset, ret, stack;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ recno = 0;
+
+ BT_STK_CLR(cp);
+
+ /*
+	 * There are several ways we search a btree. The flags argument
+ * specifies if we're acquiring read or write locks, if we position
+ * to the first or last item in a set of duplicates, if we return
+ * deleted items, and if we are locking pairs of pages. In addition,
+ * if we're modifying record numbers, we have to lock the entire tree
+ * regardless. See btree.h for more details.
+ *
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+try_again:
+ pg = cp->root;
+ stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM);
+ lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+
+ /*
+ * Decide if we need to save this page; if we do, write lock it.
+ * We deliberately don't lock-couple on this call. If the tree
+ * is tiny, i.e., one page, and two threads are busily updating
+ * the root page, we're almost guaranteed deadlocks galore, as
+ * each one gets a read lock and then blocks the other's attempt
+ * for a write lock.
+ */
+ if (!stack &&
+ ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
+ (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__LPUT(dbc, lock);
+ lock_mode = DB_LOCK_WRITE;
+ if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+		if (!((LF_ISSET(S_PARENT) &&
+		    (u_int8_t)(stop + 1) >= h->level) ||
+ (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+ /* Someone else split the root, start over. */
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__LPUT(dbc, lock);
+ goto try_again;
+ }
+ stack = 1;
+ }
+
+ /* Choose a comparison function. */
+ func = F_ISSET(dbc, DBC_OPD) ?
+ (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
+ t->bt_compare;
+
+ for (;;) {
+ /*
+ * Do a binary search on the current page. If we're searching
+ * a Btree leaf page, we have to walk the indices in groups of
+ * two. If we're searching an internal page or a off-page dup
+ * page, they're an index per page item. If we find an exact
+ * match on a leaf page, we're done.
+ */
+ adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
+ for (base = 0,
+ lim = NUM_ENT(h) / (db_indx_t)adjust; lim != 0; lim >>= 1) {
+ indx = base + ((lim >> 1) * adjust);
+ if ((ret =
+ __bam_cmp(dbp, key, h, indx, func, &cmp)) != 0)
+ goto err;
+ if (cmp == 0) {
+ if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
+ goto found;
+ goto next;
+ }
+ if (cmp > 0) {
+ base = indx + adjust;
+ --lim;
+ }
+ }
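+
+		/*
+		 * For illustration: on a P_LBTREE page keys occupy the
+		 * even indices, so a page with NUM_ENT(h) == 8 holds 4
+		 * keys; the search starts with lim == 4 and probes index
+		 * 4 first, then 2 or 6, moving in steps of P_INDX.
+		 */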
+
+ /*
+ * No match found. Base is the smallest index greater than
+ * key and may be zero or a last + O_INDX index.
+ *
+ * If it's a leaf page, return base as the "found" value.
+ * Delete only deletes exact matches.
+ */
+ if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) {
+ *exactp = 0;
+
+ if (LF_ISSET(S_EXACT))
+ goto notfound;
+
+ if (LF_ISSET(S_STK_ONLY)) {
+ BT_STK_NUM(dbp->dbenv, cp, h, base, ret);
+ __LPUT(dbc, lock);
+ (void)memp_fput(dbp->mpf, h, 0);
+ return (ret);
+ }
+
+ /*
+ * !!!
+ * Possibly returning a deleted record -- DB_SET_RANGE,
+ * DB_KEYFIRST and DB_KEYLAST don't require an exact
+ * match, and we don't want to walk multiple pages here
+ * to find an undeleted record. This is handled by the
+ * calling routine.
+ */
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, base, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ }
+
+ /*
+ * If it's not a leaf page, record the internal page (which is
+ * a parent page for the key). Decrement the base by 1 if it's
+ * non-zero so that if a split later occurs, the inserted page
+ * will be to the right of the saved page.
+ */
+ indx = base > 0 ? base - O_INDX : base;
+
+ /*
+ * If we're trying to calculate the record number, sum up
+ * all the record numbers on this page up to the indx point.
+ */
+next: if (recnop != NULL)
+ for (i = 0; i < indx; ++i)
+ recno += GET_BINTERNAL(h, i)->nrecs;
+
+ pg = GET_BINTERNAL(h, indx)->pgno;
+
+ if (LF_ISSET(S_STK_ONLY)) {
+ if (stop == h->level) {
+ BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
+ __LPUT(dbc, lock);
+ (void)memp_fput(dbp->mpf, h, 0);
+ return (ret);
+ }
+ BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret);
+ (void)memp_fput(dbp->mpf, h, 0);
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * Discard our lock and return on failure. This
+ * is OK because it only happens when descending
+ * the tree holding read-locks.
+ */
+ __LPUT(dbc, lock);
+ return (ret);
+ }
+ } else if (stack) {
+ /* Return if this is the lowest page wanted. */
+ if (LF_ISSET(S_PARENT) && stop == h->level) {
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ }
+ BT_STK_PUSH(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+
+ lock_mode = DB_LOCK_WRITE;
+ if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else {
+ /*
+ * Decide if we want to return a reference to the next
+ * page in the return stack. If so, lock it and never
+ * unlock it.
+ */
+ if ((LF_ISSET(S_PARENT) &&
+ (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
+ (h->level - 1) == LEAFLEVEL)
+ stack = 1;
+
+ (void)memp_fput(dbp->mpf, h, 0);
+
+ lock_mode = stack &&
+ LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * If we fail, discard the lock we held. This
+ * is OK because this only happens when we are
+ * descending the tree holding read-locks.
+ */
+ __LPUT(dbc, lock);
+ goto err;
+ }
+ }
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
+ goto err;
+ }
+ /* NOTREACHED */
+
+found: *exactp = 1;
+
+ /*
+ * If we're trying to calculate the record number, add in the
+ * offset on this page and correct for the fact that records
+ * in the tree are 0-based.
+ */
+ if (recnop != NULL)
+ *recnop = recno + (indx / P_INDX) + 1;
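+
+	/*
+	 * For example, if the internal pages we descended through counted
+	 * 10 records to the left of our search path and the match is the
+	 * third key/data pair on the leaf page (indx == 4), the returned
+	 * record number is 10 + 2 + 1 == 13.
+	 */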
+
+ /*
+ * If we got here, we know that we have a Btree leaf or off-page
+ * duplicates page. If it's a Btree leaf page, we have to handle
+ * on-page duplicates.
+ *
+ * If there are duplicates, go to the first/last one. This is
+ * safe because we know that we're not going to leave the page,
+ * all duplicate sets that are not on overflow pages exist on a
+ * single leaf page.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ if (LF_ISSET(S_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ h->inp[indx] == h->inp[indx + P_INDX])
+ indx += P_INDX;
+ else
+ while (indx > 0 &&
+ h->inp[indx] == h->inp[indx - P_INDX])
+ indx -= P_INDX;
+ }
+
+ /*
+ * Now check if we are allowed to return deleted items; if not, then
+ * find the next (or previous) non-deleted duplicate entry. (We do
+ * not move from the original found key on the basis of the S_DELNO
+ * flag.)
+ */
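+	/*
+	 * On a P_LBTREE page the deleted flag is carried on the data item,
+	 * one index past its key, hence the O_INDX offset below; on a
+	 * P_LDUP page each entry is its own item and needs no offset.
+	 */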
+ if (LF_ISSET(S_DELNO)) {
+ deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
+ if (LF_ISSET(S_DUPLAST))
+ while (B_DISSET(GET_BKEYDATA(
+ h, indx + deloffset)->type) && indx > 0 &&
+ h->inp[indx] == h->inp[indx - adjust])
+ indx -= adjust;
+ else
+ while (B_DISSET(GET_BKEYDATA(
+ h, indx + deloffset)->type) &&
+ indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
+ h->inp[indx] == h->inp[indx + adjust])
+ indx += adjust;
+
+ /*
+ * If we weren't able to find a non-deleted duplicate, return
+ * DB_NOTFOUND.
+ */
+ if (B_DISSET(GET_BKEYDATA(h, indx + deloffset)->type))
+ goto notfound;
+ }
+
+ if (LF_ISSET(S_STK_ONLY)) {
+ BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
+ __LPUT(dbc, lock);
+ (void)memp_fput(dbp->mpf, h, 0);
+ } else {
+ BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ }
+ return (0);
+
+notfound:
+ /* Keep the page locked for serializability. */
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__TLPUT(dbc, lock);
+ ret = DB_NOTFOUND;
+
+err: BT_STK_POP(cp);
+	(void)__bam_stkrel(dbc, 0);
+ return (ret);
+}
+
+/*
+ * __bam_stkrel --
+ * Release all pages currently held in the stack.
+ *
+ * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
+ */
+int
+__bam_stkrel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ EPG *epg;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Release inner pages first.
+ *
+	 * The caller must be sure that setting STK_NOLOCK will not affect
+ * either serializability or recoverability.
+ */
+ for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
+ cp->page = NULL;
+ cp->lock.off = LOCK_INVALID;
+ }
+ if ((t_ret = memp_fput(
+ dbp->mpf, epg->page, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ /*
+ * XXX
+ * Temporary fix for #3243 -- under certain deadlock
+ * conditions we call here again and re-free the page.
+ * The correct fix is to never release a stack that
+ * doesn't hold items.
+ */
+ epg->page = NULL;
+ }
+ if (epg->lock.off != LOCK_INVALID) {
+ if (LF_ISSET(STK_NOLOCK))
+ (void)__LPUT(dbc, epg->lock);
+ else
+ (void)__TLPUT(dbc, epg->lock);
+ }
+ }
+
+ /* Clear the stack, all pages have been released. */
+ BT_STK_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __bam_stkgrow --
+ * Grow the stack.
+ *
+ * PUBLIC: int __bam_stkgrow __P((DB_ENV *, BTREE_CURSOR *));
+ */
+int
+__bam_stkgrow(dbenv, cp)
+ DB_ENV *dbenv;
+ BTREE_CURSOR *cp;
+{
+ EPG *p;
+ size_t entries;
+ int ret;
+
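+	/*
+	 * Double the stack: allocate twice the current number of slots,
+	 * copy the old entries across, and discard the old allocation
+	 * unless it is the static array built into the cursor.
+	 */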
+ entries = cp->esp - cp->sp;
+
+ if ((ret = __os_calloc(dbenv, entries * 2, sizeof(EPG), &p)) != 0)
+ return (ret);
+ memcpy(p, cp->sp, entries * sizeof(EPG));
+ if (cp->sp != cp->stack)
+ __os_free(cp->sp, entries * sizeof(EPG));
+ cp->sp = p;
+ cp->csp = p + entries;
+ cp->esp = p + entries * 2;
+ return (0);
+}
diff --git a/db/btree/bt_split.c b/db/btree/bt_split.c
new file mode 100644
index 000000000..f76337b19
--- /dev/null
+++ b/db/btree/bt_split.c
@@ -0,0 +1,1126 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_split.c,v 11.31 2000/12/22 19:08:27 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "btree.h"
+
+static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *));
+static int __bam_page __P((DBC *, EPG *, EPG *));
+static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *, int));
+static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
+static int __bam_root __P((DBC *, EPG *));
+static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+
+/*
+ * __bam_split --
+ * Split a page.
+ *
+ * PUBLIC: int __bam_split __P((DBC *, void *));
+ */
+int
+__bam_split(dbc, arg)
+ DBC *dbc;
+ void *arg;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ enum { UP, DOWN } dir;
+ db_pgno_t root_pgno;
+ int exact, level, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+ /*
+	 * The locking protocol we use to avoid deadlock is to acquire locks by
+ * walking down the tree, but we do it as lazily as possible, locking
+ * the root only as a last resort. We expect all stack pages to have
+ * been discarded before we're called; we discard all short-term locks.
+ *
+ * When __bam_split is first called, we know that a leaf page was too
+ * full for an insert. We don't know what leaf page it was, but we
+ * have the key/recno that caused the problem. We call XX_search to
+ * reacquire the leaf page, but this time get both the leaf page and
+ * its parent, locked. We then split the leaf page and see if the new
+ * internal key will fit into the parent page. If it will, we're done.
+ *
+ * If it won't, we discard our current locks and repeat the process,
+ * only this time acquiring the parent page and its parent, locked.
+ * This process repeats until we succeed in the split, splitting the
+ * root page as the final resort. The entire process then repeats,
+ * as necessary, until we split a leaf page.
+ *
+ * XXX
+ * A traditional method of speeding this up is to maintain a stack of
+ * the pages traversed in the original search. You can detect if the
+ * stack is correct by storing the page's LSN when it was searched and
+ * comparing that LSN with the current one when it's locked during the
+ * split. This would be an easy change for this code, but I have no
+ * numbers that indicate it's worthwhile.
+ */
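+	/*
+	 * Concretely: the first pass locks the leaf and its parent; if the
+	 * parent can't absorb the new key, the next pass locks one level
+	 * higher, and so on, up to the root if need be. Once some level
+	 * splits successfully, we walk back down a level at a time until
+	 * the leaf itself has been split.
+	 */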
+ t = dbp->bt_internal;
+ for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
+ /*
+ * Acquire a page and its parent, locked.
+ */
+ if ((ret = (dbc->dbtype == DB_BTREE ?
+ __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) :
+ __bam_rsearch(dbc,
+ (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0)
+ return (ret);
+
+ /*
+ * Split the page if it still needs it (it's possible another
+ * thread of control has already split the page). If we are
+ * guaranteed that two items will fit on the page, the split
+ * is no longer necessary.
+ */
+ if (2 * B_MAXSIZEONPAGE(cp->ovflsize)
+ <= (db_indx_t)P_FREESPACE(cp->csp[0].page)) {
+ __bam_stkrel(dbc, STK_NOLOCK);
+ return (0);
+ }
+ ret = cp->csp[0].page->pgno == root_pgno ?
+ __bam_root(dbc, &cp->csp[0]) :
+ __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
+ BT_STK_CLR(cp);
+
+ switch (ret) {
+ case 0:
+ /* Once we've split the leaf page, we're done. */
+ if (level == LEAFLEVEL)
+ return (0);
+
+ /* Switch directions. */
+ if (dir == UP)
+ dir = DOWN;
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * It's possible to fail to split repeatedly, as other
+ * threads may be modifying the tree, or the page usage
+ * is sufficiently bad that we don't get enough space
+ * the first time.
+ */
+ if (dir == DOWN)
+ dir = UP;
+ break;
+ default:
+ return (ret);
+ }
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * __bam_root --
+ * Split the root page of a btree.
+ */
+static int
+__bam_root(dbc, cp)
+ DBC *dbc;
+ EPG *cp;
+{
+ DB *dbp;
+ DBT log_dbt;
+ DB_LSN log_lsn;
+ PAGE *lp, *rp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ /* Yeah, right. */
+ if (cp->page->level >= MAXBTREELEVEL) {
+ __db_err(dbp->dbenv,
+ "Too many btree levels: %d", cp->page->level);
+ ret = ENOSPC;
+ goto err;
+ }
+
+ /* Create new left and right pages for the split. */
+ lp = rp = NULL;
+ if ((ret = __db_new(dbc, TYPE(cp->page), &lp)) != 0 ||
+ (ret = __db_new(dbc, TYPE(cp->page), &rp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, lp->pgno,
+ PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+ cp->page->level, TYPE(cp->page));
+ P_INIT(rp, dbp->pgsize, rp->pgno,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
+ cp->page->level, TYPE(cp->page));
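+	/*
+	 * Note that internal pages do not maintain prev/next sibling links,
+	 * which is why PGNO_INVALID is passed for them above when the old
+	 * root was an internal page.
+	 */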
+
+ /* Split the page. */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ /* Log the change. */
+ if (DB_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ log_dbt.data = cp->page;
+ log_dbt.size = dbp->pgsize;
+ ZERO_LSN(log_lsn);
+ opflags = F_ISSET(
+ (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0;
+ if ((ret = __bam_split_log(dbp->dbenv, dbc->txn,
+ &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp),
+ PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &log_lsn,
+ dbc->internal->root, &log_dbt, opflags)) != 0)
+ goto err;
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+ }
+
+ /* Clean up the new root page. */
+ if ((ret = (dbc->dbtype == DB_RECNO ?
+ __ram_root(dbc, cp->page, lp, rp) :
+ __bam_broot(dbc, cp->page, lp, rp))) != 0)
+ goto err;
+
+ /* Adjust any cursors. */
+ if ((ret = __bam_ca_split(dbc,
+ cp->page->pgno, lp->pgno, rp->pgno, split, 1)) != 0)
+ goto err;
+
+ /* Success -- write the real pages back to the store. */
+ (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY);
+ (void)__TLPUT(dbc, cp->lock);
+ (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY);
+ (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY);
+
+ return (0);
+
+err: if (lp != NULL)
+ (void)__db_free(dbc, lp);
+ if (rp != NULL)
+ (void)__db_free(dbc, rp);
+ (void)memp_fput(dbp->mpf, cp->page, 0);
+ (void)__TLPUT(dbc, cp->lock);
+ return (ret);
+}
+
+/*
+ * __bam_page --
+ * Split the non-root page of a btree.
+ */
+static int
+__bam_page(dbc, pp, cp)
+ DBC *dbc;
+ EPG *pp, *cp;
+{
+ BTREE_CURSOR *bc;
+ DBT log_dbt;
+ DB_LSN log_lsn;
+ DB *dbp;
+ DB_LOCK tplock;
+ DB_LSN save_lsn;
+ PAGE *lp, *rp, *alloc_rp, *tp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ alloc_rp = lp = rp = tp = NULL;
+ tplock.off = LOCK_INVALID;
+ ret = -1;
+
+ /*
+ * Create a new right page for the split, and fill in everything
+ * except its LSN and page number.
+ *
+ * We malloc space for both the left and right pages, so we don't get
+ * a new page from the underlying buffer pool until we know the split
+ * is going to succeed. The reason is that we can't release locks
+ * acquired during the get-a-new-page process because metadata page
+ * locks can't be discarded on failure since we may have modified the
+ * free list. So, if you assume that we're holding a write lock on the
+ * leaf page which ran out of space and started this split (e.g., we
+ * have already written records to the page, or we retrieved a record
+ * from it with the DB_RMW flag set), failing in a split with both a
+ * leaf page locked and the metadata page locked can potentially lock
+ * up the tree badly, because we've violated the rule of always locking
+ * down the tree, and never up.
+ */
+ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &rp)) != 0)
+ goto err;
+ P_INIT(rp, dbp->pgsize, 0,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page),
+ cp->page->level, TYPE(cp->page));
+
+ /*
+ * Create new left page for the split, and fill in everything
+ * except its LSN and next-page page number.
+ */
+ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &lp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : 0,
+ cp->page->level, TYPE(cp->page));
+
+ /*
+ * Split right.
+ *
+ * Only the indices are sorted on the page, i.e., the key/data pairs
+ * aren't, so it's simpler to copy the data from the split page onto
+ * two new pages instead of copying half the data to a new right page
+ * and compacting the left page in place. Since the left page can't
+ * change, we swap the original and the allocated left page after the
+ * split.
+ */
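+	/*
+	 * That is, the inp[] index array is sorted but the items it points
+	 * to are scattered through the page, so compacting the left half in
+	 * place would mean rewriting item offsets; copying both halves into
+	 * fresh pages and swapping the left copy back is simpler.
+	 */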
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ /*
+ * Test to see if we are going to be able to insert the new pages into
+ * the parent page. The interesting failure here is that the parent
+ * page can't hold the new keys, and has to be split in turn, in which
+ * case we want to release all the locks we can.
+ */
+ if ((ret = __bam_pinsert(dbc, pp, lp, rp, 1)) != 0)
+ goto err;
+
+ /*
+ * Fix up the previous pointer of any leaf page following the split
+ * page.
+ *
+ * There's interesting deadlock situations here as we try to write-lock
+ * a page that's not in our direct ancestry. Consider a cursor walking
+ * backward through the leaf pages, that has our following page locked,
+ * and is waiting on a lock for the page we're splitting. In that case
+	 * we're going to deadlock here. It's probably OK; stepping backward
+	 * through the tree isn't a common operation.
+ */
+ if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID) {
+ if ((ret = __db_lget(dbc,
+ 0, NEXT_PGNO(cp->page), DB_LOCK_WRITE, 0, &tplock)) != 0)
+ goto err;
+ if ((ret =
+ memp_fget(dbp->mpf, &NEXT_PGNO(cp->page), 0, &tp)) != 0)
+ goto err;
+ }
+
+ /*
+ * We've got everything locked down we need, and we know the split
+ * is going to succeed. Go and get the additional page we'll need.
+ */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &alloc_rp)) != 0)
+ goto err;
+
+ /*
+ * Fix up the page numbers we didn't have before. We have to do this
+ * before calling __bam_pinsert because it may copy a page number onto
+ * the parent page and it takes the page number from its page argument.
+ */
+ PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp);
+
+ /* Actually update the parent page. */
+ if ((ret = __bam_pinsert(dbc, pp, lp, rp, 0)) != 0)
+ goto err;
+
+ bc = (BTREE_CURSOR *)dbc->internal;
+ /* Log the change. */
+ if (DB_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ log_dbt.data = cp->page;
+ log_dbt.size = dbp->pgsize;
+ if (tp == NULL)
+ ZERO_LSN(log_lsn);
+ opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0;
+ if ((ret = __bam_split_log(dbp->dbenv, dbc->txn,
+ &LSN(cp->page), 0, dbp->log_fileid, PGNO(cp->page),
+ &LSN(cp->page), PGNO(alloc_rp), &LSN(alloc_rp),
+ (u_int32_t)NUM_ENT(lp),
+ tp == NULL ? 0 : PGNO(tp),
+ tp == NULL ? &log_lsn : &LSN(tp),
+ bc->root, &log_dbt, opflags)) != 0)
+ goto err;
+
+ /* Update the LSNs for all involved pages. */
+ LSN(alloc_rp) = LSN(cp->page);
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+ if (tp != NULL)
+ LSN(tp) = LSN(cp->page);
+ }
+
+ /*
+	 * Copy the left and right pages into place. There are two paths
+	 * through here: if we are logging, the LSNs were set above in the
+	 * logging path; if not, lp and rp do not have valid LSNs. The
+	 * correct LSNs to use are the
+ * ones on the page we got from __db_new or the one that was
+ * originally on cp->page. In both cases, we save the LSN from the
+ * real database page (not a malloc'd one) and reapply it after we
+ * do the copy.
+ */
+ save_lsn = alloc_rp->lsn;
+ memcpy(alloc_rp, rp, LOFFSET(rp));
+ memcpy((u_int8_t *)alloc_rp + HOFFSET(rp),
+ (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp));
+ alloc_rp->lsn = save_lsn;
+
+ save_lsn = cp->page->lsn;
+ memcpy(cp->page, lp, LOFFSET(lp));
+ memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+ (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+ cp->page->lsn = save_lsn;
+
+ /* Fix up the next-page link. */
+ if (tp != NULL)
+ PREV_PGNO(tp) = PGNO(rp);
+
+ /* Adjust any cursors. */
+ if ((ret = __bam_ca_split(dbc,
+ PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0)
+ goto err;
+
+ __os_free(lp, dbp->pgsize);
+ __os_free(rp, dbp->pgsize);
+
+ /*
+ * Success -- write the real pages back to the store. As we never
+ * acquired any sort of lock on the new page, we release it before
+ * releasing locks on the pages that reference it. We're finished
+ * modifying the page so it's not really necessary, but it's neater.
+ */
+ if ((t_ret =
+ memp_fput(dbp->mpf, alloc_rp, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret =
+ memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ (void)__TLPUT(dbc, pp->lock);
+ if ((t_ret =
+ memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ (void)__TLPUT(dbc, cp->lock);
+ if (tp != NULL) {
+ if ((t_ret =
+ memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ (void)__TLPUT(dbc, tplock);
+ }
+ return (ret);
+
+err: if (lp != NULL)
+ __os_free(lp, dbp->pgsize);
+ if (rp != NULL)
+ __os_free(rp, dbp->pgsize);
+ if (alloc_rp != NULL)
+ (void)__db_free(dbc, alloc_rp);
+
+ if (tp != NULL)
+ (void)memp_fput(dbp->mpf, tp, 0);
+ if (tplock.off != LOCK_INVALID)
+ /* We never updated the next page, we can release it. */
+ (void)__LPUT(dbc, tplock);
+
+ (void)memp_fput(dbp->mpf, pp->page, 0);
+ if (ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, pp->lock);
+ else
+ (void)__TLPUT(dbc, pp->lock);
+
+ (void)memp_fput(dbp->mpf, cp->page, 0);
+ if (ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, cp->lock);
+ else
+ (void)__TLPUT(dbc, cp->lock);
+
+ return (ret);
+}
+
+/*
+ * __bam_broot --
+ * Fix up the btree root page after it has been split.
+ */
+static int
+__bam_broot(dbc, rootp, lp, rp)
+ DBC *dbc;
+ PAGE *rootp, *lp, *rp;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, data;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If the root page was a leaf page, change it into an internal page.
+ * We copy the key we split on (but not the key's data, in the case of
+ * a leaf page) to the new root page.
+ */
+ root_pgno = cp->root;
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
+
+ memset(&data, 0, sizeof(data));
+ memset(&hdr, 0, sizeof(hdr));
+
+ /*
+ * The btree comparison code guarantees that the left-most key on any
+ * internal btree page is never used, so it doesn't need to be filled
+ * in. Set the record count if necessary.
+ */
+ memset(&bi, 0, sizeof(bi));
+ bi.len = 0;
+ B_TSET(bi.type, B_KEYDATA, 0);
+ bi.pgno = lp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi.nrecs = __bam_total(lp);
+ RE_NREC_SET(rootp, bi.nrecs);
+ }
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ if ((ret =
+ __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0)
+ return (ret);
+
+ switch (TYPE(rp)) {
+ case P_IBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bi = GET_BINTERNAL(rp, 0);
+
+ bi.len = child_bi->len;
+ B_TSET(bi.type, child_bi->type, 0);
+ bi.pgno = rp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi.nrecs = __bam_total(rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
+ }
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ data.data = child_bi->data;
+ data.size = child_bi->len;
+ if ((ret = __db_pitem(dbc, rootp, 1,
+ BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
+ return (ret);
+
+ /* Increment the overflow ref count. */
+ if (B_TYPE(child_bi->type) == B_OVERFLOW)
+ if ((ret = __db_ovref(dbc,
+ ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
+ return (ret);
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bk = GET_BKEYDATA(rp, 0);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ bi.len = child_bk->len;
+ B_TSET(bi.type, child_bk->type, 0);
+ bi.pgno = rp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi.nrecs = __bam_total(rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
+ }
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ data.data = child_bk->data;
+ data.size = child_bk->len;
+ if ((ret = __db_pitem(dbc, rootp, 1,
+ BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0)
+ return (ret);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, child_bk->type, 0);
+ bi.pgno = rp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi.nrecs = __bam_total(rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
+ }
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ data.data = child_bk;
+ data.size = BOVERFLOW_SIZE;
+ if ((ret = __db_pitem(dbc, rootp, 1,
+ BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
+ return (ret);
+
+ /* Increment the overflow ref count. */
+ if (B_TYPE(child_bk->type) == B_OVERFLOW)
+ if ((ret = __db_ovref(dbc,
+ ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp, rp->pgno));
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp, rp->pgno));
+ }
+ return (0);
+}
+
+/*
+ * __ram_root --
+ * Fix up the recno root page after it has been split.
+ */
+static int
+__ram_root(dbc, rootp, lp, rp)
+ DBC *dbc;
+ PAGE *rootp, *lp, *rp;
+{
+ DB *dbp;
+ DBT hdr;
+ RINTERNAL ri;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ root_pgno = dbc->internal->root;
+
+ /* Initialize the page. */
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
+
+ /* Initialize the header. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &ri;
+ hdr.size = RINTERNAL_SIZE;
+
+ /* Insert the left and right keys, set the header information. */
+ ri.pgno = lp->pgno;
+ ri.nrecs = __bam_total(lp);
+ if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_SET(rootp, ri.nrecs);
+ ri.pgno = rp->pgno;
+ ri.nrecs = __bam_total(rp);
+ if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_ADJ(rootp, ri.nrecs);
+ return (0);
+}
+
+/*
+ * __bam_pinsert --
+ * Insert a new key into a parent page, completing the split.
+ */
+static int
+__bam_pinsert(dbc, parent, lchild, rchild, space_check)
+ DBC *dbc;
+ EPG *parent;
+ PAGE *lchild, *rchild;
+ int space_check;
+{
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk, *tmp_bk;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b, hdr, data;
+ PAGE *ppage;
+ RINTERNAL ri;
+ db_indx_t off;
+ db_recno_t nrecs;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t n, nbytes, nksize;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ppage = parent->page;
+
+ /* If handling record numbers, count records split to the right page. */
+ nrecs = F_ISSET(cp, C_RECNUM) && !space_check ? __bam_total(rchild) : 0;
+
+ /*
+ * Now we insert the new page's first key into the parent page, which
+ * completes the split. The parent points to a PAGE and a page index
+ * offset, where the new key goes ONE AFTER the index, because we split
+ * to the right.
+ *
+ * XXX
+ * Some btree algorithms replace the key for the old page as well as
+ * the new page. We don't, as there's no reason to believe that the
+ * first key on the old page is any better than the key we have, and,
+ * in the case of a key being placed at index 0 causing the split, the
+ * key is unavailable.
+ */
+ off = parent->indx + O_INDX;
+
+ /*
+ * Calculate the space needed on the parent page.
+ *
+ * Prefix trees: space hack used when inserting into BINTERNAL pages.
+ * Retain only what's needed to distinguish between the new entry and
+ * the LAST entry on the page to its left. If the keys compare equal,
+ * retain the entire key. We ignore overflow keys, and the entire key
+ * must be retained for the next-to-leftmost key on the leftmost page
+ * of each level, or the search will fail. Applicable ONLY to internal
+ * pages that have leaf pages as children. Further reduction of the
+ * key between pairs of internal pages loses too much information.
+ */
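+	/*
+	 * For example, if the last key on the left leaf page is "apple"
+	 * and the first key on the right leaf page is "azure", only the
+	 * two-byte prefix "az" need be promoted to the parent to separate
+	 * the two pages.
+	 */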
+ switch (TYPE(rchild)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(rchild, 0);
+ nbytes = BINTERNAL_PSIZE(child_bi->len);
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+ if (space_check)
+ return (0);
+
+ /* Add a new record for the right page. */
+ memset(&bi, 0, sizeof(bi));
+ bi.len = child_bi->len;
+ B_TSET(bi.type, child_bi->type, 0);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ memset(&data, 0, sizeof(data));
+ data.data = child_bi->data;
+ data.size = child_bi->len;
+ if ((ret = __db_pitem(dbc, ppage, off,
+ BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0)
+ return (ret);
+
+ /* Increment the overflow ref count. */
+ if (B_TYPE(child_bi->type) == B_OVERFLOW)
+ if ((ret = __db_ovref(dbc,
+ ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0)
+ return (ret);
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ child_bk = GET_BKEYDATA(rchild, 0);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ /*
+ * We set t->bt_prefix to NULL if we have a comparison
+ * callback but no prefix compression callback. But,
+ * if we're splitting in an off-page duplicates tree,
+ * we still have to do some checking. If using the
+ * default off-page duplicates comparison routine we
+ * can use the default prefix compression callback. If
+ * not using the default off-page duplicates comparison
+ * routine, we can't do any kind of prefix compression
+ * as there's no way for an application to specify a
+ * prefix compression callback that corresponds to its
+ * comparison callback.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+
+ nbytes = BINTERNAL_PSIZE(child_bk->len);
+ nksize = child_bk->len;
+ if (func == NULL)
+ goto noprefix;
+ if (ppage->prev_pgno == PGNO_INVALID && off <= 1)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) -
+ (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX));
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ memset(&a, 0, sizeof(a));
+ a.size = tmp_bk->len;
+ a.data = tmp_bk->data;
+ memset(&b, 0, sizeof(b));
+ b.size = child_bk->len;
+ b.data = child_bk->data;
+ nksize = func(dbp, &a, &b);
+ if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
+ nbytes = n;
+ else
+noprefix: nksize = child_bk->len;
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+ if (space_check)
+ return (0);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = nksize;
+ B_TSET(bi.type, child_bk->type, 0);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ memset(&data, 0, sizeof(data));
+ data.data = child_bk->data;
+ data.size = nksize;
+ if ((ret = __db_pitem(dbc, ppage, off,
+ BINTERNAL_SIZE(nksize), &hdr, &data)) != 0)
+ return (ret);
+ break;
+ case B_DUPLICATE:
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+ if (space_check)
+ return (0);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, child_bk->type, 0);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ memset(&data, 0, sizeof(data));
+ data.data = child_bk;
+ data.size = BOVERFLOW_SIZE;
+ if ((ret = __db_pitem(dbc, ppage, off,
+ BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0)
+ return (ret);
+
+ /* Increment the overflow ref count. */
+ if (B_TYPE(child_bk->type) == B_OVERFLOW)
+ if ((ret = __db_ovref(dbc,
+ ((BOVERFLOW *)child_bk)->pgno, 1)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp, rchild->pgno));
+ }
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ nbytes = RINTERNAL_PSIZE;
+
+ if (P_FREESPACE(ppage) < nbytes)
+ return (DB_NEEDSPLIT);
+ if (space_check)
+ return (0);
+
+ /* Add a new record for the right page. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &ri;
+ hdr.size = RINTERNAL_SIZE;
+ ri.pgno = rchild->pgno;
+ ri.nrecs = nrecs;
+ if ((ret = __db_pitem(dbc,
+ ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp, rchild->pgno));
+ }
+
+ /*
+	 * If this is a Recno tree, a Btree maintaining record numbers, or
+	 * an off-page duplicates tree, adjust the parent page's left-page
+	 * record count.
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ /* Log the change. */
+ if (DB_LOGGING(dbc) &&
+ (ret = __bam_cadjust_log(dbp->dbenv, dbc->txn,
+ &LSN(ppage), 0, dbp->log_fileid, PGNO(ppage),
+ &LSN(ppage), parent->indx, -(int32_t)nrecs, 0)) != 0)
+ return (ret);
+
+ /* Update the left page count. */
+ if (dbc->dbtype == DB_RECNO)
+ GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
+ else
+ GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs;
+ }
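+
+	/*
+	 * For example, if the original child held 100 records and nrecs ==
+	 * 40 of them moved to the new right page, the parent's entry for
+	 * the left child drops to 60 while the entry inserted above for
+	 * the right child carries the other 40.
+	 */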
+
+ return (0);
+}
+
+/*
+ * __bam_psplit --
+ * Do the real work of splitting the page.
+ */
+static int
+__bam_psplit(dbc, cp, lp, rp, splitret)
+ DBC *dbc;
+ EPG *cp;
+ PAGE *lp, *rp;
+ db_indx_t *splitret;
+{
+ DB *dbp;
+ PAGE *pp;
+ db_indx_t half, nbytes, off, splitp, top;
+ int adjust, cnt, iflag, isbigkey, ret;
+
+ dbp = dbc->dbp;
+ pp = cp->page;
+ adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
+
+ /*
+ * If we're splitting the first (last) page on a level because we're
+ * inserting (appending) a key to it, it's likely that the data is
+ * sorted. Moving a single item to the new page is less work and can
+ * push the fill factor higher than normal. If we're wrong it's not
+ * a big deal, we'll just do the split the right way next time.
+ */
+ off = 0;
+ if (NEXT_PGNO(pp) == PGNO_INVALID &&
+ ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
+ (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
+ off = NUM_ENT(cp->page) - adjust;
+ else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+ off = adjust;
+
+ if (off != 0)
+ goto sort;
+
+ /*
+ * Split the data to the left and right pages. Try not to split on
+ * an overflow key. (Overflow keys on internal pages will slow down
+ * searches.) Refuse to split in the middle of a set of duplicates.
+ *
+ * First, find the optimum place to split.
+ *
+ * It's possible to try and split past the last record on the page if
+ * there's a very large record at the end of the page. Make sure this
+ * doesn't happen by bounding the check at the next-to-last entry on
+ * the page.
+ *
+ * Note, we try and split half the data present on the page. This is
+ * because another process may have already split the page and left
+ * it half empty. We don't try and skip the split -- we don't know
+ * how much space we're going to need on the page, and we may need up
+ * to half the page for a big item, so there's no easy test to decide
+ * if we need to split or not. Besides, if two threads are inserting
+ * data into the same place in the database, we're probably going to
+ * need more space soon anyway.
+ */
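+	/*
+	 * For example, if a previous split left this page only half full,
+	 * half is computed from the bytes actually in use, so the loop
+	 * stops after accumulating about a quarter of a page's worth of
+	 * items rather than at the midpoint of a full page.
+	 */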
+ top = NUM_ENT(pp) - adjust;
+ half = (dbp->pgsize - HOFFSET(pp)) / 2;
+ for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA)
+ nbytes +=
+ BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
+ else
+ nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
+ nbytes +=
+ BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+
+ ++off;
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
+ nbytes +=
+ BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes += RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp, pp->pgno));
+ }
+sort: splitp = off;
+
+ /*
+ * Splitp is either at or just past the optimum split point. If the
+ * tree type is such that we're going to promote a key to an internal
+ * page, and our current choice is an overflow key, look for something
+ * close by that's smaller.
+ */
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ iflag = 1;
+ isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ iflag = 0;
+ isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA;
+ break;
+ default:
+ iflag = isbigkey = 0;
+ }
+ if (isbigkey)
+ for (cnt = 1; cnt <= 3; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < (db_indx_t)NUM_ENT(pp) &&
+ ((iflag &&
+ B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) ||
+ B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (iflag ?
+ B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA :
+ B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) {
+ splitp = off;
+ break;
+ }
+ }
+
+ /*
+	 * We can't split in the middle of a set of duplicates. We know that
+ * no duplicate set can take up more than about 25% of the page,
+ * because that's the point where we push it off onto a duplicate
+ * page set. So, this loop can't be unbounded.
+ */
+ if (TYPE(pp) == P_LBTREE &&
+ pp->inp[splitp] == pp->inp[splitp - adjust])
+ for (cnt = 1;; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < NUM_ENT(pp) &&
+ pp->inp[splitp] != pp->inp[off]) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (pp->inp[splitp] != pp->inp[off]) {
+ splitp = off + adjust;
+ break;
+ }
+ }
+
+ /* We're going to split at splitp. */
+ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
+ return (ret);
+ if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
+ return (ret);
+
+ *splitret = splitp;
+ return (0);
+}
+
+/*
+ * __bam_copy --
+ * Copy a set of records from one page to another.
+ *
+ * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_copy(dbp, pp, cp, nxt, stop)
+ DB *dbp;
+ PAGE *pp, *cp;
+ u_int32_t nxt, stop;
+{
+ db_indx_t nbytes, off;
+
+ /*
+	 * Copy a range of items to the target page. Nxt is the next entry
+	 * to copy from the source page; off is the next index to fill on
+	 * the target page.
+ */
+ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA)
+ nbytes =
+ BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len);
+ else
+ nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ /*
+ * If we're on a key and it's a duplicate, just copy
+ * the offset.
+ */
+ if (off != 0 && (nxt % P_INDX) == 0 &&
+ pp->inp[nxt] == pp->inp[nxt - P_INDX]) {
+ cp->inp[off] = cp->inp[off - P_INDX];
+ continue;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA)
+ nbytes =
+ BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len);
+ else
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp, pp->pgno));
+ }
+ cp->inp[off] = HOFFSET(cp) -= nbytes;
+ memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes);
+ }
+ return (0);
+}
diff --git a/db/btree/bt_stat.c b/db/btree/bt_stat.c
new file mode 100644
index 000000000..349bb40cf
--- /dev/null
+++ b/db/btree/bt_stat.c
@@ -0,0 +1,480 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_stat.c,v 11.29 2000/11/28 21:42:27 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "btree.h"
+
+/*
+ * __bam_stat --
+ * Gather/print the btree statistics
+ *
+ * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t));
+ */
+int
+__bam_stat(dbp, spp, db_malloc, flags)
+ DB *dbp;
+ void *spp;
+ void *(*db_malloc) __P((size_t));
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ DB_BTREE_STAT *sp;
+ DB_LOCK lock, metalock;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat");
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ sp = NULL;
+ metalock.off = lock.off = LOCK_INVALID;
+ h = NULL;
+ ret = 0;
+
+ /* Check for invalid flags. */
+ if ((ret = __db_statchk(dbp, flags)) != 0)
+ return (ret);
+
+ /* Acquire a cursor. */
+ if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+ return (ret);
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ DEBUG_LWRITE(dbc, NULL, "bam_stat", NULL, NULL, flags);
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_malloc(dbp->dbenv, sizeof(*sp), db_malloc, &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ /* If the app just wants the record count, make it fast. */
+ if (flags == DB_RECORDCOUNT) {
+ if ((ret = __db_lget(dbc, 0,
+ cp->root, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret = memp_fget(dbp->mpf,
+ &cp->root, 0, (PAGE **)&h)) != 0)
+ goto err;
+
+ sp->bt_nkeys = RE_NREC(h);
+
+ goto done;
+ }
+ if (flags == DB_CACHED_COUNTS) {
+ if ((ret = __db_lget(dbc,
+ 0, t->bt_meta, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret =
+ memp_fget(dbp->mpf, &t->bt_meta, 0, (PAGE **)&meta)) != 0)
+ goto err;
+ sp->bt_nkeys = meta->dbmeta.key_count;
+ sp->bt_ndata = meta->dbmeta.record_count;
+
+ goto done;
+ }
+
+ /* Get the metadata page for the entire database. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0)
+ goto err;
+
+ /* Walk the metadata free list, counting pages. */
+ for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) {
+ ++sp->bt_free;
+
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+
+ pgno = h->next_pgno;
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /* Get the root page. */
+ pgno = cp->root;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+
+ /* Get the levels from the root page. */
+ sp->bt_levels = h->level;
+
+ /* Discard the root page. */
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ goto err;
+ h = NULL;
+ __LPUT(dbc, lock);
+
+ /* Walk the tree. */
+ if ((ret = __bam_traverse(dbc,
+ DB_LOCK_READ, cp->root, __bam_stat_callback, sp)) != 0)
+ goto err;
+
+ /*
+	 * Get the subdatabase metadata page if it's not the same as the one
+	 * we already have, or if we need it write-locked so we can update
+	 * the cached key and record counts.
+ */
+ if (t->bt_meta != PGNO_BASE_MD || !F_ISSET(dbp, DB_AM_RDONLY)) {
+ if ((ret = memp_fput(dbp->mpf, meta, 0)) != 0)
+ goto err;
+ meta = NULL;
+ __LPUT(dbc, metalock);
+
+ if ((ret = __db_lget(dbc,
+ 0, t->bt_meta, F_ISSET(dbp, DB_AM_RDONLY) ?
+ DB_LOCK_READ : DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret =
+ memp_fget(dbp->mpf, &t->bt_meta, 0, (PAGE **)&meta)) != 0)
+ goto err;
+ }
+
+ /* Get metadata page statistics. */
+ sp->bt_metaflags = meta->dbmeta.flags;
+ sp->bt_maxkey = meta->maxkey;
+ sp->bt_minkey = meta->minkey;
+ sp->bt_re_len = meta->re_len;
+ sp->bt_re_pad = meta->re_pad;
+ sp->bt_pagesize = meta->dbmeta.pagesize;
+ sp->bt_magic = meta->dbmeta.magic;
+ sp->bt_version = meta->dbmeta.version;
+ if (!F_ISSET(dbp, DB_AM_RDONLY)) {
+ meta->dbmeta.key_count = sp->bt_nkeys;
+ meta->dbmeta.record_count = sp->bt_ndata;
+ }
+
+ /* Discard the metadata page. */
+ if ((ret = memp_fput(dbp->mpf,
+ meta, F_ISSET(dbp, DB_AM_RDONLY) ? 0 : DB_MPOOL_DIRTY)) != 0)
+ goto err;
+ meta = NULL;
+ __LPUT(dbc, metalock);
+
+done: *(DB_BTREE_STAT **)spp = sp;
+
+ if (0) {
+err: if (sp != NULL)
+ __os_free(sp, sizeof(*sp));
+ }
+
+ if (h != NULL &&
+ (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (meta != NULL &&
+ (t_ret = memp_fput(dbp->mpf, meta, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (lock.off != LOCK_INVALID)
+ __LPUT(dbc, lock);
+
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_traverse --
+ * Walk a Btree database.
+ *
+ * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t,
+ * PUBLIC: db_pgno_t, int (*)(DB *, PAGE *, void *, int *), void *));
+ */
+int
+__bam_traverse(dbc, mode, root_pgno, callback, cookie)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t root_pgno;
+ int (*callback)__P((DB *, PAGE *, void *, int *));
+ void *cookie;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t indx;
+ int already_put, ret, t_ret;
+
+	dbp = dbc->dbp;
+	h = NULL;
+	already_put = 0;
+
+ if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &root_pgno, 0, &h)) != 0)
+ goto err;
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bi = GET_BINTERNAL(h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbp,
+ ((BOVERFLOW *)bi->data)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if ((ret = __bam_traverse(
+ dbc, mode, bi->pgno, callback, cookie)) != 0)
+ break;
+ }
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ ri = GET_RINTERNAL(h, indx);
+ if ((ret = __bam_traverse(
+ dbc, mode, ri->pgno, callback, cookie)) != 0)
+ break;
+ }
+ break;
+ case P_LBTREE:
+ for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbp,
+ GET_BOVERFLOW(h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ bk = GET_BKEYDATA(h, indx + O_INDX);
+ if (B_TYPE(bk->type) == B_DUPLICATE &&
+ (ret = __bam_traverse(dbc, mode,
+ GET_BOVERFLOW(h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbp,
+ GET_BOVERFLOW(h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LDUP:
+ case P_LRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bk = GET_BKEYDATA(h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbp,
+ GET_BOVERFLOW(h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ }
+
+ already_put = 0;
+ if ((ret = callback(dbp, h, cookie, &already_put)) != 0)
+ goto err;
+
+err:	if (h != NULL && !already_put &&
+	    (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ __LPUT(dbc, lock);
+
+ return (ret);
+}
+
+/*
+ * __bam_stat_callback --
+ * Statistics callback.
+ *
+ * PUBLIC: int __bam_stat_callback __P((DB *, PAGE *, void *, int *));
+ */
+int
+__bam_stat_callback(dbp, h, cookie, putp)
+ DB *dbp;
+ PAGE *h;
+ void *cookie;
+ int *putp;
+{
+ DB_BTREE_STAT *sp;
+ db_indx_t indx, top;
+ u_int8_t type;
+
+ sp = cookie;
+ *putp = 0;
+ top = NUM_ENT(h);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ ++sp->bt_int_pg;
+ sp->bt_int_pgfree += P_FREESPACE(h);
+ break;
+ case P_LBTREE:
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ if (indx + P_INDX >= top ||
+ h->inp[indx] != h->inp[indx + P_INDX])
+ ++sp->bt_nkeys;
+
+ type = GET_BKEYDATA(h, indx + O_INDX)->type;
+ if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
+ ++sp->bt_ndata;
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(h);
+ break;
+ case P_LRECNO:
+ /*
+ * If walking a recno tree, then each of these items is a key.
+ * Otherwise, we're walking an off-page duplicate set.
+ */
+ if (dbp->type == DB_RECNO) {
+ sp->bt_nkeys += top;
+
+ /*
+ * Correct for deleted items in non-renumbering
+ * Recno databases.
+ */
+ if (F_ISSET(dbp, DB_RE_RENUMBER))
+ sp->bt_ndata += top;
+ else
+ for (indx = 0; indx < top; indx += O_INDX) {
+ type = GET_BKEYDATA(h, indx)->type;
+ if (!B_DISSET(type))
+ ++sp->bt_ndata;
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(h);
+ } else {
+ sp->bt_ndata += top;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(h);
+ }
+ break;
+ case P_LDUP:
+ /* Correct for deleted items. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(h, indx)->type))
+ ++sp->bt_ndata;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(h);
+ break;
+ case P_OVERFLOW:
+ ++sp->bt_over_pg;
+ sp->bt_over_pgfree += P_OVFLSPACE(dbp->pgsize, h);
+ break;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+ return (0);
+}
+
+/*
+ * __bam_key_range --
+ * Return proportion of keys relative to given key. The numbers are
+ * slightly skewed due to on-page duplicates.
+ *
+ * PUBLIC: int __bam_key_range __P((DB *,
+ * PUBLIC: DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__bam_key_range(dbp, txn, dbt, kp, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ EPG *sp;
+ double factor;
+ int exact, ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+ DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range");
+
+ if (flags != 0)
+ return (__db_ferr(dbp->dbenv, "DB->key_range", 0));
+
+ /* Acquire a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0);
+
+ if ((ret = __bam_search(dbc, dbt, S_STK_ONLY, 1, NULL, &exact)) != 0)
+ goto err;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ kp->less = kp->greater = 0.0;
+
+ factor = 1.0;
+	/*
+	 * Correct the leaf page: Btree leaf pages carry two indices per
+	 * key/data pair, so halve entries and indx to work in keys.
+	 */
+ cp->csp->entries /= 2;
+ cp->csp->indx /= 2;
+ for (sp = cp->sp; sp <= cp->csp; ++sp) {
+ /*
+ * At each level we know that pages greater than indx contain
+ * keys greater than what we are looking for and those less
+ * than indx are less than. The one pointed to by indx may
+ * have some less, some greater or even equal. If indx is
+ * equal to the number of entries, then the key is out of range
+ * and everything is less.
+ */
+ if (sp->indx == 0)
+ kp->greater += factor * (sp->entries - 1)/sp->entries;
+ else if (sp->indx == sp->entries)
+ kp->less += factor;
+ else {
+ kp->less += factor * sp->indx / sp->entries;
+ kp->greater += factor *
+ (sp->entries - sp->indx - 1) / sp->entries;
+ }
+ factor *= 1.0/sp->entries;
+ }
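+
+	/*
+	 * For example (made-up numbers): if the root has 10 entries and
+	 * indx == 3, the first pass adds 3/10 to "less" and 6/10 to
+	 * "greater", leaving factor == 1/10. If the leaf then has 20 keys
+	 * with indx == 5, the second pass adds (1/10)(5/20) to "less" and
+	 * (1/10)(14/20) to "greater"; the remaining 1/200 is assigned to
+	 * "equal" below on an exact match.
+	 */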
+
+ /*
+ * If there was an exact match then assign 1 n'th to the key itself.
+ * Otherwise that factor belongs to those greater than the key, unless
+ * the key was out of range.
+ */
+ if (exact)
+ kp->equal = factor;
+ else {
+ if (kp->less != 1)
+ kp->greater += factor;
+ kp->equal = 0;
+ }
+
+ BT_STK_CLR(cp);
+
+err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/db/btree/bt_upgrade.c b/db/btree/bt_upgrade.c
new file mode 100644
index 000000000..4032dba3b
--- /dev/null
+++ b/db/btree/bt_upgrade.c
@@ -0,0 +1,164 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_upgrade.c,v 11.19 2000/11/30 00:58:29 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_swap.h"
+#include "btree.h"
+#include "db_am.h"
+#include "db_upgrade.h"
+
+/*
+ * __bam_30_btreemeta --
+ * Upgrade the metadata pages from version 6 to version 7.
+ *
+ * PUBLIC: int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__bam_30_btreemeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ BTMETA30 *newmeta;
+ BTMETA2X *oldmeta;
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = dbp->dbenv;
+
+ newmeta = (BTMETA30 *)buf;
+ oldmeta = (BTMETA2X *)buf;
+
+ /*
+	 * Move fields from the end of the structure up, so we do not
+	 * overwrite anything. We are going to create a new uid anyway, so
+	 * the fields at the end of the structure can be moved first, over
+	 * the old uid.
+ */
+
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ newmeta->dbmeta.free = oldmeta->free;
+ newmeta->dbmeta.flags = oldmeta->flags;
+ newmeta->dbmeta.type = P_BTREEMETA;
+
+ newmeta->dbmeta.version = 7;
+ /* Replace the unique ID. */
+ if ((ret = __os_fileid(dbenv, real_name, 1, buf + 36)) != 0)
+ return (ret);
+
+ newmeta->root = 1;
+
+ return (0);
+}
+
+/*
+ * __bam_31_btreemeta --
+ * Upgrade the database from version 7 to version 8.
+ *
+ * PUBLIC: int __bam_31_btreemeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BTMETA31 *newmeta;
+ BTMETA30 *oldmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (BTMETA31 *)h;
+ oldmeta = (BTMETA30 *)h;
+
+ /*
+	 * Copy the affected fields down the page.
+ * The fields may overlap each other so we
+ * start at the bottom and use memmove.
+ */
+ newmeta->root = oldmeta->root;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Set the version number. */
+ newmeta->dbmeta.version = 8;
+
+ /* Upgrade the flags. */
+ if (LF_ISSET(DB_DUPSORT))
+ F_SET(&newmeta->dbmeta, BTM_DUPSORT);
+
+ *dirtyp = 1;
+ return (0);
+}
+
+/*
+ * __bam_31_lbtree --
+ * Upgrade the database btree leaf pages.
+ *
+ * PUBLIC: int __bam_31_lbtree
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BKEYDATA *bk;
+ db_pgno_t pgno;
+ db_indx_t indx;
+ int ret;
+
+ ret = 0;
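+	/*
+	 * Walk the data items (the odd indices on a leaf Btree page) and
+	 * convert any off-page duplicate trees to the current format;
+	 * __db_31_offdup may move the root of the duplicate tree, in which
+	 * case the on-page reference must be rewritten.
+	 */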
+ for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(h, indx);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ pgno = GET_BOVERFLOW(h, indx)->pgno;
+ if ((ret = __db_31_offdup(dbp, real_name, fhp,
+ LF_ISSET(DB_DUPSORT) ? 1 : 0, &pgno)) != 0)
+ break;
+ if (pgno != GET_BOVERFLOW(h, indx)->pgno) {
+ *dirtyp = 1;
+ GET_BOVERFLOW(h, indx)->pgno = pgno;
+ }
+ }
+ }
+
+ return (ret);
+}
diff --git a/db/btree/bt_verify.c b/db/btree/bt_verify.c
new file mode 100644
index 000000000..9f8647e7e
--- /dev/null
+++ b/db/btree/bt_verify.c
@@ -0,0 +1,2237 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ *
+ * $Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_verify.h"
+#include "btree.h"
+
+static int __bam_safe_getdata __P((DB *, PAGE *, u_int32_t, int, DBT *, int *));
+static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+static int __bam_vrfy_treeorder __P((DB *, db_pgno_t, PAGE *, BINTERNAL *,
+ BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), u_int32_t));
+static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+
+#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_SALVAGE)
+
+/*
+ * __bam_vrfy_meta --
+ * Verify the btree-specific part of a metadata page.
+ *
+ * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ int isbad, t_ret, ret;
+ db_indx_t ovflsize;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ isbad = 0;
+
+ /*
+	 * If VRFY_INCOMPLETE is not set, then we didn't come through
+	 * __db_vrfy_pagezero, so we haven't checked this page at all.
+ * Thus we need to call __db_vrfy_meta and check the common fields.
+ *
+ * If VRFY_INCOMPLETE is set, we've already done all the same work
+ * in __db_vrfy_pagezero, so skip the check.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
+ (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* bt_minkey: must be >= 2; must produce sensible ovflsize */
+
+ /* avoid division by zero */
+ ovflsize = meta->minkey > 0 ?
+ B_MINKEY_TO_OVFLSIZE(meta->minkey, dbp->pgsize) : 0;
+
+ if (meta->minkey < 2 ||
+ ovflsize > B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) {
+ pip->bt_minkey = 0;
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Nonsensical bt_minkey value %lu on metadata page %lu",
+ (u_long)meta->minkey, (u_long)pgno));
+ } else
+ pip->bt_minkey = meta->minkey;
+
+ /* bt_maxkey: no constraints (XXX: right?) */
+ pip->bt_maxkey = meta->maxkey;
+
+ /* re_len: no constraints on this (may be zero or huge--we make rope) */
+ pip->re_len = meta->re_len;
+
+ /*
+ * The root must not be current page or 0 and it must be within
+ * database. If this metadata page is the master meta data page
+ * of the file, then the root page had better be page 1.
+ */
+ pip->root = 0;
+ if (meta->root == PGNO_INVALID
+ || meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
+ (pgno == PGNO_BASE_MD && meta->root != 1)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Nonsensical root page %lu on metadata page %lu",
+ (u_long)meta->root, (u_long)pgno));
+ } else
+ pip->root = meta->root;
+
+ /* Flags. */
+ if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
+ F_SET(pip, VRFY_IS_RRECNO);
+
+ if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
+ /*
+ * If this is a master db meta page, it had better not have
+ * duplicates.
+ */
+ if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Btree metadata page %lu has both duplicates and multiple databases",
+ (u_long)pgno));
+ }
+ F_SET(pip, VRFY_HAS_SUBDBS);
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
+ F_SET(pip, VRFY_HAS_RECNUMS);
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((dbp->dbenv,
+ "Btree metadata page %lu illegally has both recnums and dups",
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
+ F_SET(pip, VRFY_IS_RECNO);
+ dbp->type = DB_RECNO;
+ } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Metadata page %lu has renumber flag set but is not recno",
+ (u_long)pgno));
+ }
+
+ if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((dbp->dbenv,
+ "Recno metadata page %lu specifies duplicates",
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
+ F_SET(pip, VRFY_IS_FIXEDLEN);
+ else if (pip->re_len > 0) {
+ /*
+ * It's wrong to have an re_len if it's not a fixed-length
+ * database
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "re_len of %lu in non-fixed-length database",
+ (u_long)pip->re_len));
+ }
+
+ /*
+ * We do not check that the rest of the page is 0, because it may
+ * not be and may still be correct.
+ */
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_leaf --
+ * Verify a recno leaf page.
+ *
+ * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t re_len_guess, len;
+
+ isbad = 0;
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if ((ret = __db_fchk(dbp->dbenv,
+ "__ram_vrfy_leaf", flags, OKFLAGS)) != 0)
+ goto err;
+
+ if (TYPE(h) != P_LRECNO) {
+ /* We should not have been called. */
+ TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_leaf", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD;
+ * further checks are dangerous.
+ */
+ if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((dbp->dbenv,
+ "Recno database has dups on page %lu", (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Walk through inp and see if the lengths of all the records are the
+ * same--if so, this may be a fixed-length database, and we want to
+ * save off this value. We know inp to be safe if we've gotten this
+ * far.
+ */
+ re_len_guess = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ bk = GET_BKEYDATA(h, i);
+ /* KEYEMPTY. Go on. */
+ if (B_DISSET(bk->type))
+ continue;
+ if (bk->type == B_OVERFLOW)
+ len = ((BOVERFLOW *)bk)->tlen;
+ else if (bk->type == B_KEYDATA)
+ len = bk->len;
+ else {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Nonsensical type for item %lu, page %lu",
+ (u_long)i, (u_long)pgno));
+ continue;
+ }
+ if (re_len_guess == 0)
+ re_len_guess = len;
+
+ /*
+ * Is this item's len the same as the last one's? If not,
+ * reset to 0 and break--we don't have a single re_len.
+ * Otherwise, go on to the next item.
+ */
+ if (re_len_guess != len) {
+ re_len_guess = 0;
+ break;
+ }
+ }
+ pip->re_len = re_len_guess;
+
+ /* Save off record count. */
+ pip->rec_cnt = NUM_ENT(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy --
+ * Verify a btree leaf or internal page.
+ *
+ * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+
+ isbad = 0;
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ break;
+ default:
+ TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * The record count is, on internal pages, stored in an overloaded
+ * next_pgno field. Save it off; we'll verify it when we check
+ * overall database structure. We could overload the field
+ * in VRFY_PAGEINFO, too, but this seems gross, and space
+ * is not at such a premium.
+ */
+ pip->rec_cnt = RE_NREC(h);
+
+ /*
+ * Verify inp[].
+ */
+ if (TYPE(h) == P_IRECNO) {
+ if ((ret = __ram_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+ } else if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ EPRINT((dbp->dbenv,
+ "item order check on page %lu unsafe: skipping",
+ (u_long)pgno));
+ } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
+ __bam_vrfy_itemorder(dbp, vdp, h, pgno, 0, 0, 0, flags)) != 0) {
+ /*
+ * We know that the elements of inp are reasonable.
+ *
+ * Check that elements fall in the proper order.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_inp --
+ * Verify that all entries in a P_IRECNO inp[] array are reasonable,
+ * and count them. Note that P_LRECNO uses __bam_vrfy_inp;
+ * P_IRECNOs are a special, and simpler, case, since they have
+ * RINTERNALs rather than BKEYDATA/BINTERNALs.
+ */
+static int
+__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ RINTERNAL *ri;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+ u_int32_t himark, i, offset, nentries;
+ u_int8_t *pagelayout, *p;
+
+ isbad = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ nentries = 0;
+ pagelayout = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_IRECNO) {
+ TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_inp", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ himark = dbp->pgsize;
+ if ((ret =
+ __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if ((u_int8_t *)(h->inp + i) >= (u_int8_t *)h + himark) {
+ EPRINT((dbp->dbenv,
+ "Page %lu entries listing %lu overlaps data",
+ (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ offset = h->inp[i];
+ /*
+ * Check that the item offset is reasonable: it points
+ * somewhere after the inp array and before the end of the
+ * page.
+ */
+ if (offset <= (u_int32_t)((u_int8_t *)(h->inp + i) -
+ (u_int8_t *)h) ||
+ offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Bad offset %lu at page %lu index %lu",
+ (u_long)offset, (u_long)pgno, (u_long)i));
+ continue;
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < himark)
+ himark = offset;
+
+ nentries++;
+
+ /* Make sure this RINTERNAL is not multiply referenced. */
+ ri = GET_RINTERNAL(h, i);
+ if (pagelayout[offset] == 0) {
+ pagelayout[offset] = 1;
+ child.pgno = ri->pgno;
+ child.type = V_RECNO;
+ child.nrecs = ri->nrecs;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ } else {
+ EPRINT((dbp->dbenv,
+ "RINTERNAL structure at offset %lu, page %lu referenced twice",
+ (u_long)offset, (u_long)pgno));
+ isbad = 1;
+ }
+ }
+
+ for (p = pagelayout + himark;
+ p < pagelayout + dbp->pgsize;
+ p += RINTERNAL_SIZE)
+ if (*p != 1) {
+ EPRINT((dbp->dbenv,
+ "Gap between items at offset %lu, page %lu",
+ (u_long)(p - pagelayout), (u_long)pgno));
+ isbad = 1;
+ }
+
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu",
+ (u_long)(HOFFSET(h)), (u_long)himark));
+ isbad = 1;
+ }
+
+ *nentriesp = nentries;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pagelayout != NULL)
+ __os_free(pagelayout, dbp->pgsize);
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
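+
+#if 0
+/*
+ * A minimal standalone sketch of the offset-sanity check above, with no
+ * db internals: every inp-style offset must point past the used portion
+ * of the index array and leave room for a fixed-size record before the
+ * end of the page; the smallest good offset becomes the high-water mark
+ * (HOFFSET). "RECSIZE" and the index array sitting at the start of the
+ * page are simplifying assumptions.
+ */
+#include <stddef.h>
+
+#define RECSIZE 12 /* Hypothetical fixed record size. */
+
+static int
+check_offsets(const unsigned short *inp, size_t nent, size_t pgsize,
+ size_t *himarkp)
+{
+ size_t i, himark;
+ int bad;
+
+ bad = 0;
+ himark = pgsize;
+ for (i = 0; i < nent; i++) {
+ /* Offset must fall between the index array and page end. */
+ if (inp[i] <= (i + 1) * sizeof(inp[0]) ||
+ inp[i] > pgsize - RECSIZE) {
+ bad = 1;
+ continue;
+ }
+ if (inp[i] < himark)
+ himark = inp[i];
+ }
+ *himarkp = himark;
+ return (bad);
+}
+#endif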
+
+/*
+ * __bam_vrfy_inp --
+ * Verify that all entries in inp[] array are reasonable;
+ * count them.
+ */
+static int
+__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int isbad, initem, isdupitem, ret, t_ret;
+ u_int32_t himark, offset; /* These would be db_indx_ts but for alignment. */
+ u_int32_t i, endoff, nentries;
+ u_int8_t *pagelayout;
+
+ isbad = isdupitem = 0;
+ nentries = 0;
+ pagelayout = NULL;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ break;
+ default:
+ /*
+ * In the salvager, we might call this from a page which
+ * we merely suspect is a btree page. Otherwise, it
+ * shouldn't get called--if it is, that's a verifier bug.
+ */
+ if (LF_ISSET(DB_SALVAGE))
+ break;
+ TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_inp", pgno, TYPE(h));
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * Loop through inp[], the array of items, until we either
+ * run out of entries or collide with the data. Keep track
+ * of h_offset in himark.
+ *
+ * For each element in inp[i], make sure it references a region
+ * that starts after the end of the inp array (as defined by
+ * NUM_ENT(h)), ends before the beginning of the page, doesn't
+ * overlap any other regions, and doesn't have a gap between
+ * it and the region immediately after it.
+ */
+ himark = dbp->pgsize;
+ if ((ret = __os_malloc(dbp->dbenv,
+ dbp->pgsize, NULL, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ for (i = 0; i < NUM_ENT(h); i++) {
+
+ ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, &offset);
+ if (ret == DB_VERIFY_BAD) {
+ isbad = 1;
+ continue;
+ } else if (ret == DB_VERIFY_FATAL) {
+ isbad = 1;
+ goto err;
+ } else if (ret != 0)
+ DB_ASSERT(0);
+
+ /*
+ * We now have a plausible beginning for the item, and we know
+ * its length is safe.
+ *
+ * Mark the beginning and end in pagelayout so we can make sure
+ * items have no overlaps or gaps.
+ */
+ bk = GET_BKEYDATA(h, i);
+#define ITEM_BEGIN 1
+#define ITEM_END 2
+ if (pagelayout[offset] == 0)
+ pagelayout[offset] = ITEM_BEGIN;
+ else if (pagelayout[offset] == ITEM_BEGIN) {
+ /*
+ * Having two inp entries that point at the same patch
+ * of page is legal if and only if the page is
+ * a btree leaf and they're onpage duplicate keys--
+ * that is, if (i % P_INDX) == 0.
+ */
+ if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
+ /* Flag for later. */
+ F_SET(pip, VRFY_HAS_DUPS);
+
+ /* Bump up nentries so we don't undercount. */
+ nentries++;
+
+ /*
+ * We'll check to make sure the end is
+ * equal, too.
+ */
+ isdupitem = 1;
+ } else {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicated item %lu on page %lu",
+ (u_long)i, (u_long)pgno));
+ }
+ }
+
+ /*
+ * Mark the end. Its location varies with the page type
+ * and the item type.
+ *
+ * If the end already has a sign other than 0, do nothing--
+ * it's an overlap that we'll catch later.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ if (TYPE(h) == P_IBTREE)
+ /* It's a BINTERNAL. */
+ endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
+ else
+ endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
+ break;
+ case B_DUPLICATE:
+ /*
+ * Flag that we have dups; we'll check whether
+ * that's okay during the structure check.
+ */
+ F_SET(pip, VRFY_HAS_DUPS);
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ /*
+ * Overflow entries on internal pages are stored
+ * as the _data_ of a BINTERNAL; overflow entries
+ * on leaf pages are stored as the entire entry.
+ */
+ endoff = offset +
+ ((TYPE(h) == P_IBTREE) ?
+ BINTERNAL_SIZE(BOVERFLOW_SIZE) :
+ BOVERFLOW_SIZE) - 1;
+ break;
+ default:
+ /*
+ * We'll complain later; for now, just mark
+ * a minimum.
+ */
+ endoff = offset + BKEYDATA_SIZE(0) - 1;
+ break;
+ }
+
+ /*
+ * If this is an onpage duplicate key we've seen before,
+ * the end had better coincide too.
+ */
+ if (isdupitem && pagelayout[endoff] != ITEM_END) {
+ EPRINT((dbp->dbenv,
+ "Duplicated item %lu on page %lu",
+ (u_long)i, (u_long)pgno));
+ isbad = 1;
+ } else if (pagelayout[endoff] == 0)
+ pagelayout[endoff] = ITEM_END;
+ isdupitem = 0;
+
+ /*
+ * There should be no deleted items in a quiescent tree,
+ * except in recno.
+ */
+ if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Item %lu on page %lu marked deleted",
+ (u_long)i, (u_long)pgno));
+ }
+
+ /*
+ * Check the type and such of bk--make sure it's reasonable
+ * for the pagetype.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ /*
+ * This is a normal, non-overflow BKEYDATA or BINTERNAL.
+ * The only thing to check is the len, and that's
+ * already been done.
+ */
+ break;
+ case B_DUPLICATE:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicate page referenced by internal btree page %lu at item %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicate page referenced by recno page %lu at item %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ bo = (TYPE(h) == P_IBTREE) ?
+ (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
+ (BOVERFLOW *)bk;
+
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ /* Make sure tlen is reasonable. */
+ if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Impossible tlen %lu, item %lu, page %lu",
+ (u_long)bo->tlen, (u_long)i,
+ (u_long)pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
+ bo->pgno == PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Offpage item %lu, page %lu has bad pgno",
+ (u_long)i, (u_long)pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ child.pgno = bo->pgno;
+ child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
+ V_OVERFLOW : V_DUPLICATE);
+ child.tlen = bo->tlen;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ break;
+ default:
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Item %lu on page %lu of invalid type %lu",
+ (u_long)i, (u_long)pgno, (u_long)B_TYPE(bk->type)));
+ break;
+ }
+ }
+
+ /*
+ * Now, loop through and make sure the items are contiguous and
+ * non-overlapping.
+ */
+ initem = 0;
+ for (i = himark; i < dbp->pgsize; i++)
+ if (initem == 0)
+ switch (pagelayout[i]) {
+ case 0:
+ /* May be just for alignment. */
+ if (i != ALIGN(i, sizeof(u_int32_t)))
+ continue;
+
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Gap between items, page %lu offset %lu",
+ (u_long)pgno, (u_long)i));
+ /* Find the end of the gap */
+ for ( ; (size_t)(i + 1) < dbp->pgsize &&
+ pagelayout[i + 1] == 0; i++)
+ ;
+ break;
+ case ITEM_BEGIN:
+ /* We've found an item. Check its alignment. */
+ if (i != ALIGN(i, sizeof(u_int32_t))) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Offset %lu page %lu unaligned",
+ (u_long)i, (u_long)pgno));
+ }
+ initem = 1;
+ nentries++;
+ break;
+ case ITEM_END:
+ /*
+ * We've hit the end of an item even though
+ * we don't think we're in one; must
+ * be an overlap.
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Overlapping items, page %lu offset %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ default:
+ /* Should be impossible. */
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+ else
+ switch (pagelayout[i]) {
+ case 0:
+ /* In the middle of an item somewhere. Okay. */
+ break;
+ case ITEM_END:
+ /* End of an item; switch to out-of-item mode.*/
+ initem = 0;
+ break;
+ case ITEM_BEGIN:
+ /*
+ * Hit a second item beginning without an
+ * end. Overlap.
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Overlapping items, page %lu offset %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+
+ /* Verify HOFFSET. */
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu",
+ (u_long)HOFFSET(h), (u_long)himark));
+ isbad = 1;
+ }
+
+err: if (nentriesp != NULL)
+ *nentriesp = nentries;
+
+ if (pagelayout != NULL)
+ __os_free(pagelayout, dbp->pgsize);
+
+ if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
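+
+#if 0
+/*
+ * A minimal standalone sketch of the ITEM_BEGIN/ITEM_END map scan above,
+ * independent of db internals: after every item has marked its first and
+ * last byte in a per-page map, one pass from the high-water mark detects
+ * gaps (a 0 byte while outside any item) and overlaps (a second BEGIN
+ * inside an item, or an END outside one). Unlike the real check, this
+ * sketch does not forgive alignment padding.
+ */
+#include <stddef.h>
+
+#define MAP_BEGIN 1
+#define MAP_END 2
+
+static int
+scan_layout(const unsigned char *map, size_t himark, size_t pgsize)
+{
+ size_t i;
+ int initem, bad;
+
+ bad = initem = 0;
+ for (i = himark; i < pgsize; i++)
+ if (initem == 0)
+ switch (map[i]) {
+ case 0:
+ bad = 1; /* Gap between items. */
+ break;
+ case MAP_BEGIN:
+ initem = 1; /* Entered an item. */
+ break;
+ case MAP_END:
+ bad = 1; /* End with no begin: overlap. */
+ break;
+ }
+ else
+ switch (map[i]) {
+ case 0:
+ break; /* Inside an item: fine. */
+ case MAP_END:
+ initem = 0; /* Item closed cleanly. */
+ break;
+ case MAP_BEGIN:
+ bad = 1; /* Begin inside an item: overlap. */
+ break;
+ }
+ return (bad);
+}
+#endif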
+
+/*
+ * __bam_vrfy_itemorder --
+ * Make sure the items on a page sort correctly.
+ *
+ * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
+ * reasonable; be sure that __bam_vrfy_inp has been called first.
+ *
+ * If ovflok is set, it also assumes that overflow page chains
+ * hanging off the current page have been sanity-checked, and so we
+ * can use __bam_cmp to verify their ordering. If it is not set,
+ * and we run into an overflow page, carp and return DB_VERIFY_BAD;
+ * we shouldn't be called if any exist.
+ *
+ * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: db_pgno_t, u_int32_t, int, int, u_int32_t));
+ */
+int
+__bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t nentries;
+ int ovflok, hasdups;
+ u_int32_t flags;
+{
+ DBT dbta, dbtb, dup1, dup2, *p1, *p2, *tmp;
+ BTREE *bt;
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int cmp, freedup1, freedup2, isbad, ret, t_ret;
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ void *buf1, *buf2, *tmpbuf;
+
+ /*
+ * We need to work in the ORDERCHKONLY environment where we might
+ * not have a pip, but we also may need to work in contexts where
+ * NUM_ENT isn't safe.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ nentries = pip->entries;
+ } else
+ pip = NULL;
+
+ ret = isbad = 0;
+ bo = NULL; /* Shut up compiler. */
+
+ memset(&dbta, 0, sizeof(DBT));
+ F_SET(&dbta, DB_DBT_REALLOC);
+
+ memset(&dbtb, 0, sizeof(DBT));
+ F_SET(&dbtb, DB_DBT_REALLOC);
+
+ buf1 = buf2 = NULL;
+
+ DB_ASSERT(!LF_ISSET(DB_NOORDERCHK));
+
+ dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+ if (TYPE(h) == P_LDUP)
+ func = dupfunc;
+ else {
+ func = __bam_defcmp;
+ if (dbp->bt_internal != NULL) {
+ bt = (BTREE *)dbp->bt_internal;
+ if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ }
+ }
+
+ /*
+ * We alternate our use of dbta and dbtb so that we can walk
+ * through the page key-by-key without copying a dbt twice.
+ * p1 is always the dbt for index i - 1, and p2 for index i.
+ */
+ p1 = &dbta;
+ p2 = &dbtb;
+
+ /*
+ * Loop through the entries. nentries ought to contain the
+ * actual count, and so is a safe way to terminate the loop; whether
+ * we inc. by one or two depends on whether we're a leaf page--
+ * on a leaf page, we care only about keys. On internal pages
+ * and LDUP pages, we want to check the order of all entries.
+ *
+ * Note that on IBTREE pages, we start with item 1, since item
+ * 0 doesn't get looked at by __bam_cmp.
+ */
+ for (i = (TYPE(h) == P_IBTREE) ? 1 : 0; i < nentries;
+ i += (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX) {
+ /*
+ * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
+ */
+ tmp = p1;
+ p1 = p2;
+ p2 = tmp;
+ tmpbuf = buf1;
+ buf1 = buf2;
+ buf2 = tmpbuf;
+
+ /*
+ * Get key i into p2.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(h, i);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ goto overflow;
+ } else {
+ p2->data = bi->data;
+ p2->size = bi->len;
+ }
+
+ /*
+ * The leftmost key on an internal page must be
+ * len 0, since it's just a placeholder and
+ * automatically sorts less than all keys.
+ *
+ * XXX
+ * This criterion does not currently hold!
+ * See todo list item #1686. Meanwhile, it's harmless
+ * to just not check for it.
+ */
+#if 0
+ if (i == 0 && bi->len != 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Lowest key on internal page %lu of nonzero length",
+ (u_long)pgno));
+ }
+#endif
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ bk = GET_BKEYDATA(h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ goto overflow;
+ } else {
+ p2->data = bk->data;
+ p2->size = bk->len;
+ }
+ break;
+ default:
+ /*
+ * This means our caller screwed up and sent us
+ * an inappropriate page.
+ */
+ TYPE_ERR_PRINT(dbp->dbenv,
+ "__bam_vrfy_itemorder", pgno, TYPE(h))
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ if (0) {
+ /*
+ * If ovflok != 1, we can't safely go chasing
+ * overflow pages with the normal routines now;
+ * they might be unsafe or nonexistent. Mark this
+ * page as incomplete and return.
+ *
+ * Note that we don't need to worry about freeing
+ * buffers, since they can't have been allocated
+ * if overflow items are unsafe.
+ */
+overflow: if (!ovflok) {
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * Overflow items are safe to chase. Do so.
+ * Fetch the overflow item into p2->data,
+ * NULLing it or reallocing it as appropriate.
+ *
+ * (We set p2->data to buf2 before the call
+ * so we're sure to realloc if we can and if p2
+ * was just pointing at a non-overflow item.)
+ */
+ p2->data = buf2;
+ if ((ret = __db_goff(dbp,
+ p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Error %lu in fetching overflow item %lu, page %lu",
+ (u_long)ret, (u_long)i, (u_long)pgno));
+ }
+ /* In case it got realloc'ed and thus changed. */
+ buf2 = p2->data;
+ }
+
+ /* Compare with the last key. */
+ if (p1->data != NULL && p2->data != NULL) {
+ cmp = func(dbp, p1, p2);
+
+ /* comparison succeeded */
+ if (cmp > 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Out-of-order key, page %lu item %lu",
+ (u_long)pgno, (u_long)i));
+ /* proceed */
+ } else if (cmp == 0) {
+ /*
+ * If they compared equally, this
+ * had better be a (sub)database with dups.
+ * Mark it so we can check during the
+ * structure check.
+ */
+ if (pip != NULL)
+ F_SET(pip, VRFY_HAS_DUPS);
+ else if (hasdups == 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Database with no duplicates has duplicated keys on page %lu",
+ (u_long)pgno));
+ }
+
+ /*
+ * If we're a btree leaf, check to see
+ * if the data items of these on-page dups are
+ * in sorted order. If not, flag this, so
+ * that we can make sure during the
+ * structure checks that the DUPSORT flag
+ * is unset.
+ *
+ * At this point i points to a duplicate key.
+ * Compare the datum before it (same key)
+ * to the datum after it, i.e. i-1 to i+1.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Unsafe; continue and we'll pick
+ * up the bogus nentries later.
+ */
+ if (i + 1 >= (db_indx_t)nentries)
+ continue;
+
+ /*
+ * We don't bother with clever memory
+ * management with on-page dups,
+ * as it's only really a big win
+ * in the overflow case, and overflow
+ * dups are probably (?) rare.
+ */
+ if (((ret = __bam_safe_getdata(dbp,
+ h, i - 1, ovflok, &dup1,
+ &freedup1)) != 0) ||
+ ((ret = __bam_safe_getdata(dbp,
+ h, i + 1, ovflok, &dup2,
+ &freedup2)) != 0))
+ goto err;
+
+ /*
+ * If either of the data are NULL,
+ * it's because they're overflows and
+ * it's not safe to chase them now.
+ * Mark an incomplete and return.
+ */
+ if (dup1.data == NULL ||
+ dup2.data == NULL) {
+ DB_ASSERT(!ovflok);
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * If the dups are out of order,
+ * flag this. It's not an error
+ * until we do the structure check
+ * and see whether DUPSORT is set.
+ */
+ if (dupfunc(dbp, &dup1, &dup2) > 0)
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+
+ if (freedup1)
+ __os_free(dup1.data, 0);
+ if (freedup2)
+ __os_free(dup2.data, 0);
+ }
+ }
+ }
+ }
+
+err: if (pip != NULL &&
+ ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+
+ if (buf1 != NULL)
+ __os_free(buf1, 0);
+ if (buf2 != NULL)
+ __os_free(buf2, 0);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
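+
+#if 0
+/*
+ * A minimal standalone sketch of the p1/p2 swap above: walk an array of
+ * keys comparing each neighboring pair while materializing every key
+ * only once, by swapping two slot pointers instead of copying. The int
+ * keys and the assignment "fetch" are hypothetical stand-ins for the
+ * DBTs and the __db_goff overflow fetch.
+ */
+static int
+check_sorted(const int *keys, int nkeys)
+{
+ int slota, slotb;
+ int *p1, *p2, *tmp, i;
+
+ if (nkeys < 2)
+ return (0);
+ p1 = &slota;
+ p2 = &slotb;
+ *p2 = keys[0]; /* "Fetch" the first key. */
+ for (i = 1; i < nkeys; i++) {
+ tmp = p1; /* The previous key becomes p1... */
+ p1 = p2;
+ p2 = tmp;
+ *p2 = keys[i]; /* ...and only the new key is fetched. */
+ if (*p1 > *p2)
+ return (1); /* Out of order. */
+ }
+ return (0);
+}
+#endif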
+
+/*
+ * __bam_vrfy_structure --
+ * Verify the tree structure of a btree database (including the master
+ * database containing subdbs).
+ *
+ * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy_structure(dbp, vdp, meta_pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ u_int32_t flags;
+{
+ DB *pgset;
+ VRFY_PAGEINFO *mip, *rip;
+ db_pgno_t root, p;
+ int t_ret, ret;
+ u_int32_t nrecs, level, relen, stflags;
+
+ mip = rip = NULL;
+ pgset = vdp->pgset;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_pgset_get(pgset, meta_pgno, (int *)&p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((dbp->dbenv,
+ "Btree metadata page number %lu observed twice",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset, meta_pgno)) != 0)
+ goto err;
+
+ root = mip->root;
+
+ if (root == 0) {
+ EPRINT((dbp->dbenv,
+ "Btree metadata page %lu has no root", (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
+ goto err;
+
+ switch (rip->type) {
+ case P_IBTREE:
+ case P_LBTREE:
+ stflags = flags | ST_TOPLEVEL;
+ if (F_ISSET(mip, VRFY_HAS_DUPS))
+ stflags |= ST_DUPOK;
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT))
+ stflags |= ST_DUPSORT;
+ if (F_ISSET(mip, VRFY_HAS_RECNUMS))
+ stflags |= ST_RECNUM;
+ ret = __bam_vrfy_subtree(dbp,
+ vdp, root, NULL, NULL, stflags, NULL, NULL, NULL);
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ stflags = flags | ST_RECNUM | ST_IS_RECNO | ST_TOPLEVEL;
+ if (mip->re_len > 0)
+ stflags |= ST_RELEN;
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
+ goto err;
+ /*
+ * Even if mip->re_len > 0, re_len may come back zero if the
+ * tree is empty. It should be okay to just skip the check in
+ * this case, as if there are any non-deleted keys at all,
+ * that should never happen.
+ */
+ if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
+ EPRINT((dbp->dbenv,
+ "Recno database with meta page %lu has bad re_len %lu",
+ (u_long)meta_pgno, (u_long)relen));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ ret = 0;
+ break;
+ case P_LDUP:
+ EPRINT((dbp->dbenv,
+ "Duplicate tree referenced from metadata page %lu",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ default:
+ EPRINT((dbp->dbenv,
+ "Btree root of incorrect type %lu on meta page %lu",
+ (u_long)rip->type, (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (mip != NULL &&
+ ((t_ret = __db_vrfy_putpageinfo(vdp, mip)) != 0) && ret == 0)
+ ret = t_ret;
+ if (rip != NULL &&
+ ((t_ret = __db_vrfy_putpageinfo(vdp, rip)) != 0) && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bam_vrfy_subtree--
+ * Verify a subtree (or entire) btree with specified root.
+ *
+ * Note that this is public because it must be called to verify
+ * offpage dup trees, including from hash.
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp,
+ vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ void *l, *r;
+ u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+ BINTERNAL *li, *ri, *lp, *rp;
+ DB *pgset;
+ DBC *cc;
+ PAGE *h;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *pip;
+ db_recno_t nrecs, child_nrecs;
+ db_indx_t i;
+ int ret, t_ret, isbad, toplevel, p;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t level, child_level, stflags, child_relen, relen;
+
+ ret = isbad = 0;
+ nrecs = 0;
+ h = NULL;
+ relen = 0;
+ rp = (BINTERNAL *)r;
+ lp = (BINTERNAL *)l;
+
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ cc = NULL;
+ level = pip->bt_level;
+
+ toplevel = LF_ISSET(ST_TOPLEVEL);
+ LF_CLR(ST_TOPLEVEL);
+
+ /*
+ * We are recursively descending a btree, starting from the root
+ * and working our way out to the leaves.
+ *
+ * There are five cases we need to deal with:
+ * 1. pgno is a recno leaf page. Any children are overflows.
+ * 2. pgno is a duplicate leaf page. Any children
+ * are overflow pages; traverse them, and then return
+ * level and nrecs.
+ * 3. pgno is an ordinary leaf page. Check whether dups are
+ * allowed, and if so, traverse any off-page dups or
+ * overflows. Then return nrecs and level.
+ * 4. pgno is a recno internal page. Recursively check any
+ * child pages, making sure their levels are one lower
+ * and their nrecs sum to ours.
+ * 5. pgno is a btree internal page. Same as #4, plus we
+ * must verify that for each pair of BINTERNAL entries
+ * N and N+1, the leftmost item on N's child sorts
+ * greater than N, and the rightmost item on N's child
+ * sorts less than N+1.
+ *
+ * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+ * we need to verify the internal sort order is correct if,
+ * due to overflow items, we were not able to do so earlier.
+ */
+ switch (pip->type) {
+ case P_LRECNO:
+ case P_LDUP:
+ case P_LBTREE:
+ /*
+ * Cases 1, 2 and 3 (overflow pages are common to all three);
+ * traverse child list, looking for overflows.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* Case 1 */
+ if (pip->type == P_LRECNO) {
+ if (!LF_ISSET(ST_IS_RECNO) &&
+ !(LF_ISSET(ST_DUPOK) && !LF_ISSET(ST_DUPSORT))) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Recno leaf page %lu in non-recno tree",
+ (u_long)pgno));
+ goto done;
+ }
+ goto leaf;
+ } else if (LF_ISSET(ST_IS_RECNO)) {
+ /*
+ * It's a non-recno leaf. Had better not be a recno
+ * subtree.
+ */
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Non-recno leaf page %lu in recno tree",
+ (u_long)pgno));
+ goto done;
+ }
+
+ /* Case 2--no more work. */
+ if (pip->type == P_LDUP)
+ goto leaf;
+
+ /* Case 3 */
+
+ /* Check if we have any dups. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ /* If dups aren't allowed in this btree, trouble. */
+ if (!LF_ISSET(ST_DUPOK)) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Duplicates on page %lu in non-dup btree",
+ (u_long)pgno));
+ } else {
+ /*
+ * We correctly have dups. If any are off-page,
+ * traverse those btrees recursively.
+ */
+ if ((ret =
+ __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child);
+ ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child)) {
+ stflags = flags | ST_RECNUM | ST_DUPSET;
+ /* Skip any overflow entries. */
+ if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(
+ dbp, vdp, child->pgno,
+ stflags)) != 0) {
+ isbad = 1;
+ /* Next child. */
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(
+ dbp, vdp, child->pgno, NULL,
+ NULL, stflags, NULL, NULL,
+ NULL)) != 0) {
+ if (ret !=
+ DB_VERIFY_BAD)
+ goto err;
+ else
+ isbad = 1;
+ }
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /*
+ * If VRFY_DUPS_UNSORTED is set,
+ * ST_DUPSORT had better not be.
+ */
+ if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
+ LF_ISSET(ST_DUPSORT)) {
+ EPRINT((dbp->dbenv,
+ "Unsorted duplicate set at page %lu in sorted-dup database",
+ (u_long)pgno));
+ isbad = 1;
+ }
+ }
+ }
+ goto leaf;
+ case P_IBTREE:
+ case P_IRECNO:
+ /* We handle these below. */
+ break;
+ default:
+ /*
+ * If a P_IBTREE or P_IRECNO contains a reference to an
+ * invalid page, we'll wind up here; handle it gracefully.
+ * Note that the code at the "done" label assumes that the
+ * current page is a btree/recno one of some sort; this
+ * is not the case here, so we goto err.
+ */
+ EPRINT((dbp->dbenv,
+ "Page %lu is of inappropriate type %lu",
+ (u_long)pgno, (u_long)pip->type));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Cases 4 & 5: This is a btree or recno internal page. For each child,
+ * recurse, keeping a running count of nrecs and making sure the level
+ * is always reasonable.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_RECNO) {
+ if (pip->type != P_IRECNO) {
+ TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_subtree",
+ pgno, pip->type);
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
+ NULL, NULL, flags, &child_level, &child_nrecs,
+ &child_relen)) != 0) {
+ if (ret != DB_VERIFY_BAD)
+ goto done;
+ else
+ isbad = 1;
+ }
+
+ if (LF_ISSET(ST_RELEN)) {
+ if (relen == 0)
+ relen = child_relen;
+ /*
+ * child_relen may be zero if the child subtree
+ * is empty.
+ */
+ else if (child_relen > 0 &&
+ relen != child_relen) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Recno page %lu returned bad re_len",
+ (u_long)child->pgno));
+ }
+ if (relenp)
+ *relenp = relen;
+ }
+ if (LF_ISSET(ST_RECNUM))
+ nrecs += child_nrecs;
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu",
+ "Recno level incorrect on page ",
+ (u_long)child->pgno, ": got ",
+ (u_long)child_level, ", expected ",
+ (u_long)(level - 1)));
+ }
+ } else if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* We're done with case 4. */
+ if (pip->type == P_IRECNO)
+ goto done;
+
+ /*
+ * Case 5. Btree internal pages.
+ * As described above, we need to iterate through all the
+ * items on the page and make sure that our children sort appropriately
+ * with respect to them.
+ *
+ * For each entry, li will be the "left-hand" key for the entry
+ * itself, which must sort lower than all entries on its child;
+ * ri will be the key to its right, which must sort greater.
+ */
+ if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+ for (i = 0; i < pip->entries; i += O_INDX) {
+ li = GET_BINTERNAL(h, i);
+ ri = (i + O_INDX < pip->entries) ?
+ GET_BINTERNAL(h, i + O_INDX) : NULL;
+
+ /*
+ * The leftmost key is forcibly sorted less than all entries,
+ * so don't bother passing it.
+ */
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
+ i == 0 ? NULL : li, ri, flags, &child_level,
+ &child_nrecs, NULL)) != 0) {
+ if (ret != DB_VERIFY_BAD)
+ goto done;
+ else
+ isbad = 1;
+ }
+
+ if (LF_ISSET(ST_RECNUM)) {
+ /*
+ * Keep a running tally on the actual record count so
+ * we can return it to our parent (if we have one) or
+ * compare it to the NRECS field if we're a root page.
+ */
+ nrecs += child_nrecs;
+
+ /*
+ * Make sure the actual record count of the child
+ * is equal to the value in the BINTERNAL structure.
+ */
+ if (li->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Item %lu page %lu has incorrect record count of %lu, should be %lu",
+ (u_long)i, (u_long)pgno, (u_long)li->nrecs,
+ (u_long)child_nrecs));
+ }
+ }
+
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu",
+ "Btree level incorrect on page ", (u_long)li->pgno,
+ ": got ", (u_long)child_level, ", expected ",
+ (u_long)(level - 1)));
+ }
+ }
+
+ if (0) {
+leaf: level = LEAFLEVEL;
+ if (LF_ISSET(ST_RECNUM))
+ nrecs = pip->rec_cnt;
+
+ /* XXX
+ * We should verify that the record count on a leaf page
+ * is the sum of the number of keys and the number of
+ * records in its off-page dups. This requires looking
+ * at the page again, however, and it may all be changing
+ * soon, so for now we don't bother.
+ */
+
+ if (LF_ISSET(ST_RELEN) && relenp)
+ *relenp = pip->re_len;
+ }
+done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
+ /*
+ * During the page-by-page pass, item order verification was
+ * not finished due to the presence of overflow items. If
+ * isbad == 0, though, it's now safe to do so, as we've
+ * traversed any child overflow pages. Do it.
+ */
+ if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp,
+ vdp, h, pgno, 0, 1, 0, flags)) != 0)
+ goto err;
+ F_CLR(pip, VRFY_INCOMPLETE);
+ }
+
+ /*
+ * Our parent has sent us BINTERNAL pointers to parent records
+ * so that we can verify our place with respect to them. If it's
+ * appropriate--we have a default sort function--verify this.
+ */
+ if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && lp != NULL) {
+ if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ goto err;
+
+ /*
+ * __bam_vrfy_treeorder needs to know what comparison function
+ * to use. If ST_DUPSET is set, we're in a duplicate tree
+ * and we use the duplicate comparison function; otherwise,
+ * use the btree one. If unset, use the default, of course.
+ */
+ func = LF_ISSET(ST_DUPSET) ? dbp->dup_compare :
+ ((BTREE *)dbp->bt_internal)->bt_compare;
+ if (func == NULL)
+ func = __bam_defcmp;
+
+ if ((ret = __bam_vrfy_treeorder(
+ dbp, pgno, h, lp, rp, func, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+
+ /*
+ * This is guaranteed to succeed for leaf pages, but no harm done.
+ *
+ * Internal pages below the top level do not store their own
+ * record numbers, so we skip them.
+ */
+ if (LF_ISSET(ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
+ isbad = 1;
+ EPRINT((dbp->dbenv,
+ "Bad record count on page %lu: got %lu, expected %lu",
+ (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt));
+ }
+
+ if (levelp)
+ *levelp = level;
+ if (nrecsp)
+ *nrecsp = nrecs;
+
+ pgset = vdp->pgset;
+ if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ isbad = 1;
+ EPRINT((dbp->dbenv, "Page %lu linked twice", (u_long)pgno));
+ } else if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0)
+ goto err;
+
+err: if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
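+
+#if 0
+/*
+ * A minimal standalone sketch of the structural invariants cases 4 and
+ * 5 above enforce, on a toy tree: every child must sit exactly one
+ * level below its parent, and an internal node's record count must
+ * equal the sum of its children's. The "node" struct is a hypothetical
+ * stand-in for the real page/VRFY_PAGEINFO machinery.
+ */
+struct node {
+ int level; /* 1 at the leaves. */
+ unsigned nrecs; /* Records at or below this node. */
+ int nchildren;
+ struct node **child;
+};
+
+static int
+check_subtree(const struct node *n)
+{
+ unsigned sum;
+ int i;
+
+ if (n->nchildren == 0)
+ return (0); /* Leaf: nothing to cross-check. */
+ sum = 0;
+ for (i = 0; i < n->nchildren; i++) {
+ if (n->child[i]->level != n->level - 1)
+ return (1); /* Level incorrect. */
+ if (check_subtree(n->child[i]))
+ return (1);
+ sum += n->child[i]->nrecs;
+ }
+ return (sum == n->nrecs ? 0 : 1); /* Bad record count. */
+}
+#endif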
+
+/*
+ * __bam_vrfy_treeorder --
+ * Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ * If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys. No need to check it.
+ *
+ * If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags)
+ DB *dbp;
+ db_pgno_t pgno;
+ PAGE *h;
+ BINTERNAL *lp, *rp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t flags;
+{
+ BOVERFLOW *bo;
+ DBT dbt;
+ db_indx_t last;
+ int ret, cmp;
+
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_MALLOC);
+ ret = 0;
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LDUP:
+ last = NUM_ENT(h) - O_INDX;
+ break;
+ case P_LBTREE:
+ last = NUM_ENT(h) - P_INDX;
+ break;
+ default:
+ TYPE_ERR_PRINT(dbp->dbenv,
+ "__bam_vrfy_treeorder", pgno, TYPE(h));
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+
+ /*
+ * The key on page h, the child page, is more likely to be
+ * an overflow page, so we pass its offset, rather than lp/rp's,
+ * into __bam_cmp. This will take advantage of __db_moff.
+ */
+
+ /*
+ * Skip first-item check if we're an internal page--the first
+ * entry on an internal page is treated specially by __bam_cmp,
+ * so what's on the page shouldn't matter. (Plus, since we're passing
+ * our page and item 0 as to __bam_cmp, we'll sort before our
+ * parent and falsely report a failure.)
+ */
+ if (lp != NULL && TYPE(h) != P_IBTREE) {
+ if (lp->type == B_KEYDATA) {
+ dbt.data = lp->data;
+ dbt.size = lp->len;
+ } else if (lp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)lp->data;
+ if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno,
+ NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ DB_ASSERT(0);
+ EPRINT((dbp->dbenv,
+ "Unknown type for internal record"));
+ return (EINVAL);
+ }
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbp, &dbt, h, 0, func, &cmp)) == 0) {
+ if (cmp > 0) {
+ EPRINT((dbp->dbenv,
+ "First item on page %lu sorted greater than parent entry",
+ (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((dbp->dbenv,
+ "First item on page %lu had comparison error",
+ (u_long)PGNO(h)));
+
+ if (dbt.data != lp->data)
+ __os_free(dbt.data, 0);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (rp != NULL) {
+ if (rp->type == B_KEYDATA) {
+ dbt.data = rp->data;
+ dbt.size = rp->len;
+ } else if (rp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)rp->data;
+ if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno,
+ NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ DB_ASSERT(0);
+ EPRINT((dbp->dbenv,
+ "Unknown type for internal record"));
+ return (EINVAL);
+ }
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbp, &dbt, h, last, func, &cmp)) == 0) {
+ if (cmp < 0) {
+ EPRINT((dbp->dbenv,
+ "Last item on page %lu sorted greater than parent entry",
+ (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((dbp->dbenv,
+ "Last item on page %lu had comparison error",
+ (u_long)PGNO(h)));
+
+ if (dbt.data != rp->data)
+ __os_free(dbt.data, 0);
+ }
+
+ return (ret);
+}
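+
+#if 0
+/*
+ * A minimal standalone sketch of the separator invariant checked above,
+ * on plain ints: every key on a child page must sort at or above the
+ * parent entry pointing to it (lp) and at or below the entry to its
+ * right (rp). Since the page is already known to be sorted, checking
+ * the first and last keys is enough.
+ */
+static int
+check_bounds(const int *keys, int nkeys, const int *lp, const int *rp)
+{
+ if (nkeys == 0)
+ return (0);
+ /* lp == NULL: leftmost child, with no lower bound. */
+ if (lp != NULL && keys[0] < *lp)
+ return (1);
+ /* rp == NULL: rightmost child, with no upper bound. */
+ if (rp != NULL && keys[nkeys - 1] > *rp)
+ return (1);
+ return (0);
+}
+#endif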
+
+/*
+ * __bam_salvage --
+ * Safely dump out anything that looks like a key on an alleged
+ * btree leaf page.
+ *
+ * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t,
+ * PUBLIC: PAGE *, void *, int (*)(void *, const void *), DBT *,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key;
+ u_int32_t flags;
+{
+ DBT dbt, unkdbt;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ db_indx_t i, beg, end;
+ u_int32_t himark;
+ u_int8_t *pgmap;
+ void *ovflbuf;
+ int t_ret, ret, err_ret;
+
+ /* Shut up lint. */
+ COMPQUIET(end, 0);
+
+ ovflbuf = pgmap = NULL;
+ err_ret = ret = 0;
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+
+ memset(&unkdbt, 0, sizeof(DBT));
+ unkdbt.size = strlen("UNKNOWN") + 1;
+ unkdbt.data = "UNKNOWN";
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &ovflbuf)) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ if ((ret =
+ __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pgmap)) != 0)
+ goto err;
+ memset(pgmap, 0, dbp->pgsize);
+ }
+
+ /*
+ * Loop through the inp array, spitting out key/data pairs.
+ *
+ * If we're salvaging normally, loop from 0 through NUM_ENT(h).
+ * If we're being aggressive, loop until we hit the end of the page--
+ * NUM_ENT() may be bogus.
+ */
+ himark = dbp->pgsize;
+ for (i = 0;; i += O_INDX) {
+ /* If we're not aggressive, break when we hit NUM_ENT(h). */
+ if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
+ break;
+
+ /* Verify the current item. */
+ ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, NULL);
+ /* If this returned a fatality, it's time to break. */
+ if (ret == DB_VERIFY_FATAL) {
+ /*
+ * Don't return DB_VERIFY_FATAL; it's private
+ * and means only that we can't go on with this
+ * page, not with the whole database. It's
+ * not even an error if we've run into it
+ * after NUM_ENT(h).
+ */
+ ret = (i < NUM_ENT(h)) ? DB_VERIFY_BAD : 0;
+ break;
+ }
+
+ /*
+ * If this returned 0, it's safe to print or (carefully)
+ * try to fetch.
+ */
+ if (ret == 0) {
+ /*
+ * We only want to print deleted items if
+ * DB_AGGRESSIVE is set.
+ */
+ bk = GET_BKEYDATA(h, i);
+ if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
+ continue;
+
+ /*
+ * We're going to go try to print the next item. If
+ * key is non-NULL, we're a dup page, so we've got to
+ * print the key first, unless SA_SKIPFIRSTKEY is set
+ * and we're on the first entry.
+ */
+ if (key != NULL &&
+ (i != 0 || !LF_ISSET(SA_SKIPFIRSTKEY)))
+ if ((ret = __db_prdbt(key,
+ 0, " ", handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+
+ beg = h->inp[i];
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ end = beg + BOVERFLOW_SIZE - 1;
+ /*
+ * If we're not on a normal btree leaf page,
+ * there shouldn't be off-page
+ * dup sets. Something's confused; just
+ * drop it, and the code to pick up unlinked
+ * offpage dup sets will print it out
+ * with key "UNKNOWN" later.
+ */
+ if (pgtype != P_LBTREE)
+ break;
+
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * If the page number is unreasonable, or
+ * if this is supposed to be a key item,
+ * just spit out "UNKNOWN"--the best we
+ * can do is run into the data items in the
+ * unlinked offpage dup pass.
+ */
+ if (!IS_VALID_PGNO(bo->pgno) ||
+ (i % P_INDX == 0)) {
+ /* Not much to do on failure. */
+ if ((ret = __db_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_salvage_duptree(dbp,
+ vdp, bo->pgno, &dbt, handle, callback,
+ flags | SA_SKIPFIRSTKEY)) != 0)
+ err_ret = ret;
+
+ break;
+ case B_KEYDATA:
+ end = ALIGN(beg + bk->len, sizeof(u_int32_t)) - 1;
+ dbt.data = bk->data;
+ dbt.size = bk->len;
+ if ((ret = __db_prdbt(&dbt,
+ 0, " ", handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+ break;
+ case B_OVERFLOW:
+ end = beg + BOVERFLOW_SIZE - 1;
+ bo = (BOVERFLOW *)bk;
+ if ((ret = __db_safe_goff(dbp, vdp,
+ bo->pgno, &dbt, &ovflbuf, flags)) != 0) {
+ err_ret = ret;
+ /* We care about err_ret more. */
+ (void)__db_prdbt(&unkdbt, 0, " ",
+ handle, callback, 0, NULL);
+ break;
+ }
+ if ((ret = __db_prdbt(&dbt,
+ 0, " ", handle, callback, 0, NULL)) != 0)
+ err_ret = ret;
+ break;
+ default:
+ /*
+ * We should never get here; __db_vrfy_inpitem
+ * should not be returning 0 if bk->type
+ * is unrecognizable.
+ */
+ DB_ASSERT(0);
+ ret = EINVAL;
+ goto err;
+ }
+
+ /*
+ * If we're being aggressive, mark the beginning
+ * and end of the item; we'll come back and print
+ * whatever "junk" is in the gaps in case we had
+ * any bogus inp elements and thereby missed stuff.
+ */
+ if (LF_ISSET(DB_AGGRESSIVE)) {
+ pgmap[beg] = ITEM_BEGIN;
+ pgmap[end] = ITEM_END;
+ }
+ }
+ }
+
+ /*
+ * If i is odd and this is a btree leaf, we've printed out a key but not
+ * a datum; fix this imbalance by printing an "UNKNOWN".
+ */
+ if (pgtype == P_LBTREE && (i % P_INDX == 1) && ((ret =
+ __db_prdbt(&unkdbt, 0, " ", handle, callback, 0, NULL)) != 0))
+ err_ret = ret;
+
+err: if (pgmap != NULL)
+ __os_free(pgmap, 0);
+ __os_free(ovflbuf, 0);
+
+ /* Mark this page as done. */
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0)
+ return (t_ret);
+
+ return ((err_ret != 0) ? err_ret : ret);
+}
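+
+#if 0
+/*
+ * A minimal standalone sketch of the salvage loop's error philosophy
+ * above, with no db internals: emit everything we can recover,
+ * substitute a fixed "UNKNOWN" marker when an item cannot be fetched,
+ * and remember the first failure instead of stopping at it. "get_item"
+ * and "emit" are hypothetical stand-ins.
+ */
+static int
+salvage_items(int nitems, int (*get_item)(int, const char **),
+ int (*emit)(const char *))
+{
+ const char *s;
+ int i, err_ret, ret;
+
+ err_ret = 0;
+ for (i = 0; i < nitems; i++) {
+ /* On fetch failure, print a marker and keep going. */
+ if (get_item(i, &s) != 0) {
+ (void)emit("UNKNOWN");
+ continue;
+ }
+ if ((ret = emit(s)) != 0 && err_ret == 0)
+ err_ret = ret; /* Remember the first failure. */
+ }
+ return (err_ret);
+}
+#endif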
+
+/*
+ * __bam_salvage_walkdupint --
+ * Walk a known-good btree or recno internal page which is part of
+ * a dup tree, calling __db_salvage_duptree on each child page.
+ *
+ * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ RINTERNAL *ri;
+ BINTERNAL *bi;
+ int ret, t_ret;
+ db_indx_t i;
+
+ ret = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bi->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ case P_IRECNO:
+ ri = GET_RINTERNAL(h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, ri->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ default:
+ __db_err(dbp->dbenv,
+ "__bam_salvage_walkdupint called on non-int. page");
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+ /* Pass SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
+ flags &= ~LF_ISSET(SA_SKIPFIRSTKEY);
+ }
+
+ return (ret);
+}
+
+/*
+ * __bam_meta2pgset --
+ * Given a known-good meta page, return in pgsetp a 0-terminated list of
+ * db_pgno_t's corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly: we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones), then we
+ * consolidate our scratch array into a nice list, and return. This
+ * avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *btmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ BINTERNAL *bi;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_pgno_t current, p;
+ int err_ret, ret;
+
+ h = NULL;
+ ret = err_ret = 0;
+ DB_ASSERT(pgset != NULL);
+ for (current = btmeta->root;;) {
+ if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = memp_fget(dbp->mpf, &current, 0, &h)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(h, 0);
+ current = bi->pgno;
+ } else { /* P_IRECNO */
+ ri = GET_RINTERNAL(h, 0);
+ current = ri->pgno;
+ }
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ goto traverse;
+ default:
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+ /*
+ * At this point, current is the pgno of leaf page h, the 0th in the
+ * tree we're concerned with.
+ */
+traverse:
+ while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+ if (h == NULL &&
+ (ret = memp_fget(dbp->mpf, &current, 0, &h)) != 0) {
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_vrfy_pgset_get(pgset, current, (int *)&p)) != 0)
+ goto err;
+
+ if (p != 0) {
+ /*
+ * We've found a cycle. Return success anyway--
+ * our caller may as well use however much of
+ * the pgset we've come up with.
+ */
+ break;
+ }
+ if ((ret = __db_vrfy_pgset_inc(pgset, current)) != 0)
+ goto err;
+
+ current = NEXT_PGNO(h);
+ if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+err: if (h != NULL)
+ (void)memp_fput(dbp->mpf, h, 0);
+
+ return (ret == 0 ? err_ret : ret);
+}
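+
+#if 0
+/*
+ * A minimal standalone sketch of the leaf-chain walk above: follow
+ * next-page links from the first leaf, marking each page as seen, and
+ * stop quietly at the first page seen twice rather than looping
+ * forever. The "next" array stands in for NEXT_PGNO, 0 for
+ * PGNO_INVALID, and the caller supplies a "seen" buffer of npages
+ * bytes.
+ */
+#include <string.h>
+
+static void
+walk_chain(const unsigned *next, unsigned npages, unsigned first,
+ unsigned char *seen)
+{
+ unsigned cur;
+
+ memset(seen, 0, npages);
+ for (cur = first; cur != 0 && cur < npages; cur = next[cur]) {
+ if (seen[cur])
+ break; /* Cycle: keep what we have so far. */
+ seen[cur] = 1;
+ }
+}
+#endif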
+
+/*
+ * __bam_safe_getdata --
+ *
+ * Utility function for __bam_vrfy_itemorder. Safely gets the datum at
+ * index i, page h, and sticks it in DBT dbt. If ovflok is 1 and i's an
+ * overflow item, we fetch it with __db_goff and signal that we need
+ * to free dbt->data; if ovflok is 0, we leave the DBT zeroed.
+ */
+static int
+__bam_safe_getdata(dbp, h, i, ovflok, dbt, freedbtp)
+ DB *dbp;
+ PAGE *h;
+ u_int32_t i;
+ int ovflok;
+ DBT *dbt;
+ int *freedbtp;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+
+ memset(dbt, 0, sizeof(DBT));
+ *freedbtp = 0;
+
+ bk = GET_BKEYDATA(h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (!ovflok)
+ return (0);
+
+ bo = (BOVERFLOW *)bk;
+ F_SET(dbt, DB_DBT_MALLOC);
+
+ *freedbtp = 1;
+ return (__db_goff(dbp, dbt, bo->tlen, bo->pgno, NULL, NULL));
+ } else {
+ dbt->data = bk->data;
+ dbt->size = bk->len;
+ }
+
+ return (0);
+}
diff --git a/db/btree/btree.src b/db/btree/btree.src
new file mode 100644
index 000000000..a1eba7d7f
--- /dev/null
+++ b/db/btree/btree.src
@@ -0,0 +1,296 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ *
+ * $Id: btree.src,v 10.26 2000/12/12 17:40:23 bostic Exp $
+ */
+
+PREFIX bam
+
+INCLUDE #include "db_config.h"
+INCLUDE
+INCLUDE #ifndef NO_SYSTEM_INCLUDES
+INCLUDE #include <sys/types.h>
+INCLUDE
+INCLUDE #include <ctype.h>
+INCLUDE #include <errno.h>
+INCLUDE #include <string.h>
+INCLUDE #endif
+INCLUDE
+INCLUDE #include "db_int.h"
+INCLUDE #include "db_page.h"
+INCLUDE #include "db_dispatch.h"
+INCLUDE #include "db_am.h"
+INCLUDE #include "btree.h"
+INCLUDE #include "txn.h"
+INCLUDE
+
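+/*
+ * Each BEGIN .. END block below describes one log record type. The
+ * distribution's record-generation script expands each block into the
+ * corresponding __bam_<name>_log, _read and _print routines in
+ * btree_auto.c, with the logging function taking the listed fields in
+ * order. Roughly (a sketch, not the generated code verbatim), the
+ * pg_alloc record below yields a logging call of this shape:
+ *
+ * ret = __bam_pg_alloc_log(dbenv, txnid, &ret_lsn, flags,
+ * fileid, &meta_lsn, &page_lsn, pgno, ptype, next);
+ */
+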
+/*
+ * BTREE-pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the meta-data page's original lsn.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * next: the next page on the free list.
+ */
+BEGIN pg_alloc 51
+ARG fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+DEPRECATED pg_alloc1 60
+ARG fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+POINTER alloc_lsn DB_LSN * lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+/*
+ * BTREE-pg_free: used to record freeing a page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ */
+BEGIN pg_free 52
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+DBT header DBT s
+ARG next db_pgno_t lu
+END
+
+DEPRECATED pg_free1 61
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+POINTER alloc_lsn DB_LSN * lu
+DBT header DBT s
+ARG next db_pgno_t lu
+END
+
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * pg: the split page's contents before the split.
+ */
+DEPRECATED split1 53
+ARG fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+DBT pg DBT s
+END
+
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number.
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * root_pgno: the root page number
+ * pg: the split page's contents before the split.
+ * opflags: SPL_NRECS if splitting a tree that maintains a record count.
+ */
+BEGIN split 62
+ARG fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG root_pgno db_pgno_t lu
+DBT pg DBT s
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse split.
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied on the root page.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+DEPRECATED rsplit1 54
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+DBT pgdbt DBT s
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse split.
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied on the root page.
+ * root_pgno: the root page number.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+BEGIN rsplit 63
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+DBT pgdbt DBT s
+ARG root_pgno db_pgno_t lu
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-adj: used to log the adjustment of an index.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index adjusted.
+ * indx_copy: the index to copy if inserting.
+ * is_insert: 0 if a delete, 1 if an insert.
+ */
+BEGIN adj 55
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG indx_copy u_int32_t lu
+ARG is_insert u_int32_t lu
+END
+
+/*
+ * BTREE-cadjust: used to adjust the count change in an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be adjusted.
+ * adjust: the signed adjustment.
+ * opflags: CAD_UPDATEROOT: if root page count was adjusted.
+ */
+BEGIN cadjust 56
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG adjust int32_t ld
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-cdel: used to log the intent-to-delete of a cursor record.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be deleted.
+ */
+BEGIN cdel 57
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+END
+
+/*
+ * BTREE-repl: used to log the replacement of an item.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index of the replaced item.
+ * isdeleted: set if the original item was marked deleted.
+ * orig: the original data.
+ * repl: the replacement data.
+ * prefix: the length of the prefix the original and replacement share.
+ * suffix: the length of the suffix the original and replacement share.
+ */
+BEGIN repl 58
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG isdeleted u_int32_t lu
+DBT orig DBT s
+DBT repl DBT s
+ARG prefix u_int32_t lu
+ARG suffix u_int32_t lu
+END
+
+/*
+ * BTREE-root: used to log the assignment of a root btree page.
+ *
+ * meta_pgno: the btree meta-data page number.
+ * root_pgno: the page number of the new root.
+ * meta_lsn: the meta-data page's original lsn.
+ */
+BEGIN root 59
+ARG fileid int32_t ld
+ARG meta_pgno db_pgno_t lu
+ARG root_pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+END
+
+/*
+ * BTREE-curadj: undo cursor adjustments on txn abort.
+ * Should only be processed during DB_TXN_ABORT.
+ * NOTE: the first_indx field is also used to hold a
+ * signed index adjustment in one case; care should be
+ * taken if its size is changed.
+ */
+BEGIN curadj 64
+/* Fileid of db affected. */
+ARG fileid int32_t ld
+/* Which adjustment. */
+ARG mode db_ca_mode ld
+/* Page entry is from. */
+ARG from_pgno db_pgno_t lu
+/* Page entry went to. */
+ARG to_pgno db_pgno_t lu
+/* Left page of root split. */
+ARG left_pgno db_pgno_t lu
+/* First index of dup set. Also used as adjustment. */
+ARG first_indx u_int32_t lu
+/* Index entry is from. */
+ARG from_indx u_int32_t lu
+/* Index where entry went. */
+ARG to_indx u_int32_t lu
+END
+
+/*
+ * BTREE-rcuradj: undo cursor adjustments on txn abort in
+ * renumbering recno trees.
+ * Should only be processed during DB_TXN_ABORT.
+ */
+BEGIN rcuradj 65
+/* Fileid of db affected. */
+ARG fileid int32_t ld
+/* Which adjustment. */
+ARG mode ca_recno_arg ld
+/* Root page number. */
+ARG root db_pgno_t ld
+/* Recno of the adjustment. */
+ARG recno db_recno_t ld
+/* Order number of the adjustment. */
+ARG order u_int32_t ld
+END
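+
+/*
+ * The numeric argument to BEGIN and DEPRECATED (51 through 65 above) is
+ * the on-disk record type, referenced in the generated code through the
+ * DB_bam_* constants.  A dispatcher keyed on those types might look
+ * roughly like the sketch below; it is illustrative only, as the
+ * library's actual dispatch is table-driven:
+ *
+ *	switch (rectype) {
+ *	case DB_bam_pg_alloc:
+ *		ret = __bam_pg_alloc_recover(dbenv, rec, lsnp, op, info);
+ *		break;
+ *	case DB_bam_split:
+ *		ret = __bam_split_recover(dbenv, rec, lsnp, op, info);
+ *		break;
+ *	default:
+ *		ret = EINVAL;
+ *		break;
+ *	}
+ */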
diff --git a/db/btree/btree_auto.c b/db/btree/btree_auto.c
new file mode 100644
index 000000000..fdb27b7d2
--- /dev/null
+++ b/db/btree/btree_auto.c
@@ -0,0 +1,2284 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+#include "db_config.h"
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_dispatch.h"
+#include "db_am.h"
+#include "btree.h"
+#include "txn.h"
+
+int
+__bam_pg_alloc_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, meta_lsn, page_lsn, pgno, ptype, next)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ DB_LSN * meta_lsn;
+ DB_LSN * page_lsn;
+ db_pgno_t pgno;
+ u_int32_t ptype;
+ db_pgno_t next;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_pg_alloc;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(*meta_lsn)
+ + sizeof(*page_lsn)
+ + sizeof(pgno)
+ + sizeof(ptype)
+ + sizeof(next);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ if (meta_lsn != NULL)
+ memcpy(bp, meta_lsn, sizeof(*meta_lsn));
+ else
+ memset(bp, 0, sizeof(*meta_lsn));
+ bp += sizeof(*meta_lsn);
+ if (page_lsn != NULL)
+ memcpy(bp, page_lsn, sizeof(*page_lsn));
+ else
+ memset(bp, 0, sizeof(*page_lsn));
+ bp += sizeof(*page_lsn);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ memcpy(bp, &ptype, sizeof(ptype));
+ bp += sizeof(ptype);
+ memcpy(bp, &next, sizeof(next));
+ bp += sizeof(next);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
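+
+/*
+ * For reference, the byte layout the routine above serializes; every
+ * record in this file begins with the same three header fields:
+ *
+ *	rectype		u_int32_t	DB_bam_pg_alloc
+ *	txn_num		u_int32_t	0 if non-transactional
+ *	prev_lsn	DB_LSN		txn's previous record, or zeroed
+ *	fileid		int32_t
+ *	meta_lsn	DB_LSN		zeroed if the pointer was NULL
+ *	page_lsn	DB_LSN		zeroed if the pointer was NULL
+ *	pgno		db_pgno_t
+ *	ptype		u_int32_t
+ *	next		db_pgno_t
+ */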
+
+int
+__bam_pg_alloc_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_pg_alloc_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_pg_alloc_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_pg_alloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ printf("\tpage_lsn: [%lu][%lu]\n",
+ (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tptype: %lu\n", (u_long)argp->ptype);
+ printf("\tnext: %lu\n", (u_long)argp->next);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_pg_alloc_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_pg_alloc_args **argpp;
+{
+ __bam_pg_alloc_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_pg_alloc_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn));
+ bp += sizeof(argp->meta_lsn);
+ memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn));
+ bp += sizeof(argp->page_lsn);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->ptype, bp, sizeof(argp->ptype));
+ bp += sizeof(argp->ptype);
+ memcpy(&argp->next, bp, sizeof(argp->next));
+ bp += sizeof(argp->next);
+ *argpp = argp;
+ return (0);
+}
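+
+/*
+ * Note that the read routine above allocates the args structure and a
+ * DB_TXN shell with a single __os_malloc, pointing argp->txnid just past
+ * the struct, so one __os_free releases both.  A hedged sketch of the
+ * expected consumption pattern follows; the wrapper name and the
+ * recovery work elided in the comment are illustrative:
+ */
+static int
+__bam_pg_alloc_example(dbenv, rec)
+	DB_ENV *dbenv;
+	DBT *rec;
+{
+	__bam_pg_alloc_args *argp;
+	int ret;
+
+	if ((ret = __bam_pg_alloc_read(dbenv, rec->data, &argp)) != 0)
+		return (ret);
+
+	/* ... redo or undo the allocation using argp->pgno et al. ... */
+
+	__os_free(argp, 0);
+	return (0);
+}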
+
+int
+__bam_pg_alloc1_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_pg_alloc1_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_pg_alloc1_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_pg_alloc1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ printf("\talloc_lsn: [%lu][%lu]\n",
+ (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset);
+ printf("\tpage_lsn: [%lu][%lu]\n",
+ (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tptype: %lu\n", (u_long)argp->ptype);
+ printf("\tnext: %lu\n", (u_long)argp->next);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_pg_alloc1_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_pg_alloc1_args **argpp;
+{
+ __bam_pg_alloc1_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_pg_alloc1_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn));
+ bp += sizeof(argp->meta_lsn);
+ memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn));
+ bp += sizeof(argp->alloc_lsn);
+ memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn));
+ bp += sizeof(argp->page_lsn);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->ptype, bp, sizeof(argp->ptype));
+ bp += sizeof(argp->ptype);
+ memcpy(&argp->next, bp, sizeof(argp->next));
+ bp += sizeof(argp->next);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_pg_free_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, pgno, meta_lsn, header, next)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN * meta_lsn;
+ const DBT *header;
+ db_pgno_t next;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t zero;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_pg_free;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(pgno)
+ + sizeof(*meta_lsn)
+ + sizeof(u_int32_t) + (header == NULL ? 0 : header->size)
+ + sizeof(next);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ if (meta_lsn != NULL)
+ memcpy(bp, meta_lsn, sizeof(*meta_lsn));
+ else
+ memset(bp, 0, sizeof(*meta_lsn));
+ bp += sizeof(*meta_lsn);
+ if (header == NULL) {
+ zero = 0;
+ memcpy(bp, &zero, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ memcpy(bp, &header->size, sizeof(header->size));
+ bp += sizeof(header->size);
+ memcpy(bp, header->data, header->size);
+ bp += header->size;
+ }
+ memcpy(bp, &next, sizeof(next));
+ bp += sizeof(next);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_pg_free_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_pg_free_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_pg_free_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_pg_free: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\tnext: %lu\n", (u_long)argp->next);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_pg_free_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_pg_free_args **argpp;
+{
+ __bam_pg_free_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_pg_free_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn));
+ bp += sizeof(argp->meta_lsn);
+ memset(&argp->header, 0, sizeof(argp->header));
+ memcpy(&argp->header.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->header.data = bp;
+ bp += argp->header.size;
+ memcpy(&argp->next, bp, sizeof(argp->next));
+ bp += sizeof(argp->next);
+ *argpp = argp;
+ return (0);
+}
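+
+/*
+ * Note that the DBT field above is decoded without copying: header.data
+ * is left pointing into the record buffer, so recbuf must stay live for
+ * as long as argp is in use.  A minimal sketch of copying the header out
+ * when it must outlive the record; the function name is illustrative:
+ */
+static int
+__bam_pg_free_hdr_copy(dbenv, rec, copyp)
+	DB_ENV *dbenv;
+	DBT *rec;
+	void **copyp;
+{
+	__bam_pg_free_args *argp;
+	int ret;
+
+	if ((ret = __bam_pg_free_read(dbenv, rec->data, &argp)) != 0)
+		return (ret);
+
+	/* argp->header.data aliases rec->data; copy before rec is reused. */
+	if ((ret = __os_malloc(dbenv,
+	    argp->header.size, NULL, copyp)) == 0)
+		memcpy(*copyp, argp->header.data, argp->header.size);
+
+	__os_free(argp, 0);
+	return (ret);
+}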
+
+int
+__bam_pg_free1_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_pg_free1_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_pg_free1_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_pg_free1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ printf("\talloc_lsn: [%lu][%lu]\n",
+ (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset);
+ printf("\theader: ");
+ for (i = 0; i < argp->header.size; i++) {
+ ch = ((u_int8_t *)argp->header.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\tnext: %lu\n", (u_long)argp->next);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_pg_free1_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_pg_free1_args **argpp;
+{
+ __bam_pg_free1_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_pg_free1_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn));
+ bp += sizeof(argp->meta_lsn);
+ memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn));
+ bp += sizeof(argp->alloc_lsn);
+ memset(&argp->header, 0, sizeof(argp->header));
+ memcpy(&argp->header.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->header.data = bp;
+ bp += argp->header.size;
+ memcpy(&argp->next, bp, sizeof(argp->next));
+ bp += sizeof(argp->next);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_split1_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_split1_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_split1_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_split1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tleft: %lu\n", (u_long)argp->left);
+ printf("\tllsn: [%lu][%lu]\n",
+ (u_long)argp->llsn.file, (u_long)argp->llsn.offset);
+ printf("\tright: %lu\n", (u_long)argp->right);
+ printf("\trlsn: [%lu][%lu]\n",
+ (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset);
+ printf("\tindx: %lu\n", (u_long)argp->indx);
+ printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ printf("\tnlsn: [%lu][%lu]\n",
+ (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+ printf("\tpg: ");
+ for (i = 0; i < argp->pg.size; i++) {
+ ch = ((u_int8_t *)argp->pg.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_split1_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_split1_args **argpp;
+{
+ __bam_split1_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_split1_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->left, bp, sizeof(argp->left));
+ bp += sizeof(argp->left);
+ memcpy(&argp->llsn, bp, sizeof(argp->llsn));
+ bp += sizeof(argp->llsn);
+ memcpy(&argp->right, bp, sizeof(argp->right));
+ bp += sizeof(argp->right);
+ memcpy(&argp->rlsn, bp, sizeof(argp->rlsn));
+ bp += sizeof(argp->rlsn);
+ memcpy(&argp->indx, bp, sizeof(argp->indx));
+ bp += sizeof(argp->indx);
+ memcpy(&argp->npgno, bp, sizeof(argp->npgno));
+ bp += sizeof(argp->npgno);
+ memcpy(&argp->nlsn, bp, sizeof(argp->nlsn));
+ bp += sizeof(argp->nlsn);
+ memset(&argp->pg, 0, sizeof(argp->pg));
+ memcpy(&argp->pg.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->pg.data = bp;
+ bp += argp->pg.size;
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_split_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, left, llsn, right, rlsn, indx,
+ npgno, nlsn, root_pgno, pg, opflags)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t left;
+ DB_LSN * llsn;
+ db_pgno_t right;
+ DB_LSN * rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN * nlsn;
+ db_pgno_t root_pgno;
+ const DBT *pg;
+ u_int32_t opflags;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t zero;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_split;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(left)
+ + sizeof(*llsn)
+ + sizeof(right)
+ + sizeof(*rlsn)
+ + sizeof(indx)
+ + sizeof(npgno)
+ + sizeof(*nlsn)
+ + sizeof(root_pgno)
+ + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size)
+ + sizeof(opflags);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &left, sizeof(left));
+ bp += sizeof(left);
+ if (llsn != NULL)
+ memcpy(bp, llsn, sizeof(*llsn));
+ else
+ memset(bp, 0, sizeof(*llsn));
+ bp += sizeof(*llsn);
+ memcpy(bp, &right, sizeof(right));
+ bp += sizeof(right);
+ if (rlsn != NULL)
+ memcpy(bp, rlsn, sizeof(*rlsn));
+ else
+ memset(bp, 0, sizeof(*rlsn));
+ bp += sizeof(*rlsn);
+ memcpy(bp, &indx, sizeof(indx));
+ bp += sizeof(indx);
+ memcpy(bp, &npgno, sizeof(npgno));
+ bp += sizeof(npgno);
+ if (nlsn != NULL)
+ memcpy(bp, nlsn, sizeof(*nlsn));
+ else
+ memset(bp, 0, sizeof(*nlsn));
+ bp += sizeof(*nlsn);
+ memcpy(bp, &root_pgno, sizeof(root_pgno));
+ bp += sizeof(root_pgno);
+ if (pg == NULL) {
+ zero = 0;
+ memcpy(bp, &zero, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ memcpy(bp, &pg->size, sizeof(pg->size));
+ bp += sizeof(pg->size);
+ memcpy(bp, pg->data, pg->size);
+ bp += pg->size;
+ }
+ memcpy(bp, &opflags, sizeof(opflags));
+ bp += sizeof(opflags);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_split_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_split_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_split_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tleft: %lu\n", (u_long)argp->left);
+ printf("\tllsn: [%lu][%lu]\n",
+ (u_long)argp->llsn.file, (u_long)argp->llsn.offset);
+ printf("\tright: %lu\n", (u_long)argp->right);
+ printf("\trlsn: [%lu][%lu]\n",
+ (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset);
+ printf("\tindx: %lu\n", (u_long)argp->indx);
+ printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ printf("\tnlsn: [%lu][%lu]\n",
+ (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+ printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno);
+ printf("\tpg: ");
+ for (i = 0; i < argp->pg.size; i++) {
+ ch = ((u_int8_t *)argp->pg.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\topflags: %lu\n", (u_long)argp->opflags);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_split_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_split_args **argpp;
+{
+ __bam_split_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_split_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->left, bp, sizeof(argp->left));
+ bp += sizeof(argp->left);
+ memcpy(&argp->llsn, bp, sizeof(argp->llsn));
+ bp += sizeof(argp->llsn);
+ memcpy(&argp->right, bp, sizeof(argp->right));
+ bp += sizeof(argp->right);
+ memcpy(&argp->rlsn, bp, sizeof(argp->rlsn));
+ bp += sizeof(argp->rlsn);
+ memcpy(&argp->indx, bp, sizeof(argp->indx));
+ bp += sizeof(argp->indx);
+ memcpy(&argp->npgno, bp, sizeof(argp->npgno));
+ bp += sizeof(argp->npgno);
+ memcpy(&argp->nlsn, bp, sizeof(argp->nlsn));
+ bp += sizeof(argp->nlsn);
+ memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno));
+ bp += sizeof(argp->root_pgno);
+ memset(&argp->pg, 0, sizeof(argp->pg));
+ memcpy(&argp->pg.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->pg.data = bp;
+ bp += argp->pg.size;
+ memcpy(&argp->opflags, bp, sizeof(argp->opflags));
+ bp += sizeof(argp->opflags);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_rsplit1_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_rsplit1_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_rsplit1_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_rsplit1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tpgdbt: ");
+ for (i = 0; i < argp->pgdbt.size; i++) {
+ ch = ((u_int8_t *)argp->pgdbt.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\tnrec: %lu\n", (u_long)argp->nrec);
+ printf("\trootent: ");
+ for (i = 0; i < argp->rootent.size; i++) {
+ ch = ((u_int8_t *)argp->rootent.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\trootlsn: [%lu][%lu]\n",
+ (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_rsplit1_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_rsplit1_args **argpp;
+{
+ __bam_rsplit1_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_rsplit1_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memset(&argp->pgdbt, 0, sizeof(argp->pgdbt));
+ memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->pgdbt.data = bp;
+ bp += argp->pgdbt.size;
+ memcpy(&argp->nrec, bp, sizeof(argp->nrec));
+ bp += sizeof(argp->nrec);
+ memset(&argp->rootent, 0, sizeof(argp->rootent));
+ memcpy(&argp->rootent.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->rootent.data = bp;
+ bp += argp->rootent.size;
+ memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn));
+ bp += sizeof(argp->rootlsn);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_rsplit_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, pgno, pgdbt, root_pgno, nrec, rootent,
+ rootlsn)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t pgno;
+ const DBT *pgdbt;
+ db_pgno_t root_pgno;
+ db_pgno_t nrec;
+ const DBT *rootent;
+ DB_LSN * rootlsn;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t zero;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_rsplit;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(pgno)
+ + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size)
+ + sizeof(root_pgno)
+ + sizeof(nrec)
+ + sizeof(u_int32_t) + (rootent == NULL ? 0 : rootent->size)
+ + sizeof(*rootlsn);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ if (pgdbt == NULL) {
+ zero = 0;
+ memcpy(bp, &zero, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ memcpy(bp, &pgdbt->size, sizeof(pgdbt->size));
+ bp += sizeof(pgdbt->size);
+ memcpy(bp, pgdbt->data, pgdbt->size);
+ bp += pgdbt->size;
+ }
+ memcpy(bp, &root_pgno, sizeof(root_pgno));
+ bp += sizeof(root_pgno);
+ memcpy(bp, &nrec, sizeof(nrec));
+ bp += sizeof(nrec);
+ if (rootent == NULL) {
+ zero = 0;
+ memcpy(bp, &zero, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ memcpy(bp, &rootent->size, sizeof(rootent->size));
+ bp += sizeof(rootent->size);
+ memcpy(bp, rootent->data, rootent->size);
+ bp += rootent->size;
+ }
+ if (rootlsn != NULL)
+ memcpy(bp, rootlsn, sizeof(*rootlsn));
+ else
+ memset(bp, 0, sizeof(*rootlsn));
+ bp += sizeof(*rootlsn);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_rsplit_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_rsplit_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_rsplit_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_rsplit: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tpgdbt: ");
+ for (i = 0; i < argp->pgdbt.size; i++) {
+ ch = ((u_int8_t *)argp->pgdbt.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno);
+ printf("\tnrec: %lu\n", (u_long)argp->nrec);
+ printf("\trootent: ");
+ for (i = 0; i < argp->rootent.size; i++) {
+ ch = ((u_int8_t *)argp->rootent.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\trootlsn: [%lu][%lu]\n",
+ (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_rsplit_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_rsplit_args **argpp;
+{
+ __bam_rsplit_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_rsplit_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memset(&argp->pgdbt, 0, sizeof(argp->pgdbt));
+ memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->pgdbt.data = bp;
+ bp += argp->pgdbt.size;
+ memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno));
+ bp += sizeof(argp->root_pgno);
+ memcpy(&argp->nrec, bp, sizeof(argp->nrec));
+ bp += sizeof(argp->nrec);
+ memset(&argp->rootent, 0, sizeof(argp->rootent));
+ memcpy(&argp->rootent.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->rootent.data = bp;
+ bp += argp->rootent.size;
+ memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn));
+ bp += sizeof(argp->rootlsn);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_adj_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, pgno, lsn, indx, indx_copy, is_insert)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+ u_int32_t indx_copy;
+ u_int32_t is_insert;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_adj;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(pgno)
+ + sizeof(*lsn)
+ + sizeof(indx)
+ + sizeof(indx_copy)
+ + sizeof(is_insert);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ if (lsn != NULL)
+ memcpy(bp, lsn, sizeof(*lsn));
+ else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+ memcpy(bp, &indx, sizeof(indx));
+ bp += sizeof(indx);
+ memcpy(bp, &indx_copy, sizeof(indx_copy));
+ bp += sizeof(indx_copy);
+ memcpy(bp, &is_insert, sizeof(is_insert));
+ bp += sizeof(is_insert);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_adj_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_adj_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_adj_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_adj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ printf("\tindx: %lu\n", (u_long)argp->indx);
+ printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy);
+ printf("\tis_insert: %lu\n", (u_long)argp->is_insert);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_adj_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_adj_args **argpp;
+{
+ __bam_adj_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_adj_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->lsn, bp, sizeof(argp->lsn));
+ bp += sizeof(argp->lsn);
+ memcpy(&argp->indx, bp, sizeof(argp->indx));
+ bp += sizeof(argp->indx);
+ memcpy(&argp->indx_copy, bp, sizeof(argp->indx_copy));
+ bp += sizeof(argp->indx_copy);
+ memcpy(&argp->is_insert, bp, sizeof(argp->is_insert));
+ bp += sizeof(argp->is_insert);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_cadjust_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, pgno, lsn, indx, adjust, opflags)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+ int32_t adjust;
+ u_int32_t opflags;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_cadjust;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(pgno)
+ + sizeof(*lsn)
+ + sizeof(indx)
+ + sizeof(adjust)
+ + sizeof(opflags);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ if (lsn != NULL)
+ memcpy(bp, lsn, sizeof(*lsn));
+ else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+ memcpy(bp, &indx, sizeof(indx));
+ bp += sizeof(indx);
+ memcpy(bp, &adjust, sizeof(adjust));
+ bp += sizeof(adjust);
+ memcpy(bp, &opflags, sizeof(opflags));
+ bp += sizeof(opflags);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_cadjust_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_cadjust_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_cadjust_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_cadjust: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ printf("\tindx: %lu\n", (u_long)argp->indx);
+ printf("\tadjust: %ld\n", (long)argp->adjust);
+ printf("\topflags: %lu\n", (u_long)argp->opflags);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_cadjust_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_cadjust_args **argpp;
+{
+ __bam_cadjust_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_cadjust_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->lsn, bp, sizeof(argp->lsn));
+ bp += sizeof(argp->lsn);
+ memcpy(&argp->indx, bp, sizeof(argp->indx));
+ bp += sizeof(argp->indx);
+ memcpy(&argp->adjust, bp, sizeof(argp->adjust));
+ bp += sizeof(argp->adjust);
+ memcpy(&argp->opflags, bp, sizeof(argp->opflags));
+ bp += sizeof(argp->opflags);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_cdel_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, pgno, lsn, indx)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_cdel;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(pgno)
+ + sizeof(*lsn)
+ + sizeof(indx);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ if (lsn != NULL)
+ memcpy(bp, lsn, sizeof(*lsn));
+ else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+ memcpy(bp, &indx, sizeof(indx));
+ bp += sizeof(indx);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_cdel_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_cdel_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_cdel_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_cdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ printf("\tindx: %lu\n", (u_long)argp->indx);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_cdel_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_cdel_args **argpp;
+{
+ __bam_cdel_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_cdel_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->lsn, bp, sizeof(argp->lsn));
+ bp += sizeof(argp->lsn);
+ memcpy(&argp->indx, bp, sizeof(argp->indx));
+ bp += sizeof(argp->indx);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_repl_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, pgno, lsn, indx, isdeleted, orig,
+ repl, prefix, suffix)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+ u_int32_t isdeleted;
+ const DBT *orig;
+ const DBT *repl;
+ u_int32_t prefix;
+ u_int32_t suffix;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t zero;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_repl;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(pgno)
+ + sizeof(*lsn)
+ + sizeof(indx)
+ + sizeof(isdeleted)
+ + sizeof(u_int32_t) + (orig == NULL ? 0 : orig->size)
+ + sizeof(u_int32_t) + (repl == NULL ? 0 : repl->size)
+ + sizeof(prefix)
+ + sizeof(suffix);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &pgno, sizeof(pgno));
+ bp += sizeof(pgno);
+ if (lsn != NULL)
+ memcpy(bp, lsn, sizeof(*lsn));
+ else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+ memcpy(bp, &indx, sizeof(indx));
+ bp += sizeof(indx);
+ memcpy(bp, &isdeleted, sizeof(isdeleted));
+ bp += sizeof(isdeleted);
+ if (orig == NULL) {
+ zero = 0;
+ memcpy(bp, &zero, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ memcpy(bp, &orig->size, sizeof(orig->size));
+ bp += sizeof(orig->size);
+ memcpy(bp, orig->data, orig->size);
+ bp += orig->size;
+ }
+ if (repl == NULL) {
+ zero = 0;
+ memcpy(bp, &zero, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ } else {
+ memcpy(bp, &repl->size, sizeof(repl->size));
+ bp += sizeof(repl->size);
+ memcpy(bp, repl->data, repl->size);
+ bp += repl->size;
+ }
+ memcpy(bp, &prefix, sizeof(prefix));
+ bp += sizeof(prefix);
+ memcpy(bp, &suffix, sizeof(suffix));
+ bp += sizeof(suffix);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_repl_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_repl_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_repl_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_repl: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ printf("\tindx: %lu\n", (u_long)argp->indx);
+ printf("\tisdeleted: %lu\n", (u_long)argp->isdeleted);
+ printf("\torig: ");
+ for (i = 0; i < argp->orig.size; i++) {
+ ch = ((u_int8_t *)argp->orig.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\trepl: ");
+ for (i = 0; i < argp->repl.size; i++) {
+ ch = ((u_int8_t *)argp->repl.data)[i];
+ if (isprint(ch) || ch == 0xa)
+ putchar(ch);
+ else
+ printf("%#x ", ch);
+ }
+ printf("\n");
+ printf("\tprefix: %lu\n", (u_long)argp->prefix);
+ printf("\tsuffix: %lu\n", (u_long)argp->suffix);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_repl_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_repl_args **argpp;
+{
+ __bam_repl_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_repl_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->pgno, bp, sizeof(argp->pgno));
+ bp += sizeof(argp->pgno);
+ memcpy(&argp->lsn, bp, sizeof(argp->lsn));
+ bp += sizeof(argp->lsn);
+ memcpy(&argp->indx, bp, sizeof(argp->indx));
+ bp += sizeof(argp->indx);
+ memcpy(&argp->isdeleted, bp, sizeof(argp->isdeleted));
+ bp += sizeof(argp->isdeleted);
+ memset(&argp->orig, 0, sizeof(argp->orig));
+ memcpy(&argp->orig.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->orig.data = bp;
+ bp += argp->orig.size;
+ memset(&argp->repl, 0, sizeof(argp->repl));
+ memcpy(&argp->repl.size, bp, sizeof(u_int32_t));
+ bp += sizeof(u_int32_t);
+ argp->repl.data = bp;
+ bp += argp->repl.size;
+ memcpy(&argp->prefix, bp, sizeof(argp->prefix));
+ bp += sizeof(argp->prefix);
+ memcpy(&argp->suffix, bp, sizeof(argp->suffix));
+ bp += sizeof(argp->suffix);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_root_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, meta_pgno, root_pgno, meta_lsn)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_pgno_t meta_pgno;
+ db_pgno_t root_pgno;
+ DB_LSN * meta_lsn;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_root;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(meta_pgno)
+ + sizeof(root_pgno)
+ + sizeof(*meta_lsn);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &meta_pgno, sizeof(meta_pgno));
+ bp += sizeof(meta_pgno);
+ memcpy(bp, &root_pgno, sizeof(root_pgno));
+ bp += sizeof(root_pgno);
+ if (meta_lsn != NULL)
+ memcpy(bp, meta_lsn, sizeof(*meta_lsn));
+ else
+ memset(bp, 0, sizeof(*meta_lsn));
+ bp += sizeof(*meta_lsn);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+ ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+ if (txnid != NULL)
+ txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
+int
+__bam_root_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_root_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+ i = 0;
+ ch = 0;
+ notused2 = DB_TXN_ABORT;
+ notused3 = NULL;
+
+ if ((ret = __bam_root_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_root: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno);
+ printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
+int
+__bam_root_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_root_args **argpp;
+{
+ __bam_root_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_root_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->meta_pgno, bp, sizeof(argp->meta_pgno));
+ bp += sizeof(argp->meta_pgno);
+ memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno));
+ bp += sizeof(argp->root_pgno);
+ memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn));
+ bp += sizeof(argp->meta_lsn);
+ *argpp = argp;
+ return (0);
+}
+
+int
+__bam_curadj_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, mode, from_pgno, to_pgno, left_pgno, first_indx,
+ from_indx, to_indx)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ db_ca_mode mode;
+ db_pgno_t from_pgno;
+ db_pgno_t to_pgno;
+ db_pgno_t left_pgno;
+ u_int32_t first_indx;
+ u_int32_t from_indx;
+ u_int32_t to_indx;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_curadj;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(mode)
+ + sizeof(from_pgno)
+ + sizeof(to_pgno)
+ + sizeof(left_pgno)
+ + sizeof(first_indx)
+ + sizeof(from_indx)
+ + sizeof(to_indx);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &mode, sizeof(mode));
+ bp += sizeof(mode);
+ memcpy(bp, &from_pgno, sizeof(from_pgno));
+ bp += sizeof(from_pgno);
+ memcpy(bp, &to_pgno, sizeof(to_pgno));
+ bp += sizeof(to_pgno);
+ memcpy(bp, &left_pgno, sizeof(left_pgno));
+ bp += sizeof(left_pgno);
+ memcpy(bp, &first_indx, sizeof(first_indx));
+ bp += sizeof(first_indx);
+ memcpy(bp, &from_indx, sizeof(from_indx));
+ bp += sizeof(from_indx);
+ memcpy(bp, &to_indx, sizeof(to_indx));
+ bp += sizeof(to_indx);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+	ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+	/* Record the new LSN in the transaction only if the put succeeded. */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
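+/*
+ * __bam_curadj_print --
+ *	Display the contents of a bam_curadj log record in human-readable
+ *	form.
+ */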
+int
+__bam_curadj_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_curadj_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+	/* Quiet the compiler about the unused variables and parameters. */
+	i = 0;
+	ch = 0;
+	notused2 = DB_TXN_ABORT;
+	notused3 = NULL;
+
+ if ((ret = __bam_curadj_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_curadj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tmode: %ld\n", (long)argp->mode);
+ printf("\tfrom_pgno: %lu\n", (u_long)argp->from_pgno);
+ printf("\tto_pgno: %lu\n", (u_long)argp->to_pgno);
+ printf("\tleft_pgno: %lu\n", (u_long)argp->left_pgno);
+ printf("\tfirst_indx: %lu\n", (u_long)argp->first_indx);
+ printf("\tfrom_indx: %lu\n", (u_long)argp->from_indx);
+ printf("\tto_indx: %lu\n", (u_long)argp->to_indx);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
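+/*
+ * __bam_curadj_read --
+ *	Unmarshal a bam_curadj log record into a freshly allocated
+ *	__bam_curadj_args structure.
+ */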
+int
+__bam_curadj_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_curadj_args **argpp;
+{
+ __bam_curadj_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_curadj_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->mode, bp, sizeof(argp->mode));
+ bp += sizeof(argp->mode);
+ memcpy(&argp->from_pgno, bp, sizeof(argp->from_pgno));
+ bp += sizeof(argp->from_pgno);
+ memcpy(&argp->to_pgno, bp, sizeof(argp->to_pgno));
+ bp += sizeof(argp->to_pgno);
+ memcpy(&argp->left_pgno, bp, sizeof(argp->left_pgno));
+ bp += sizeof(argp->left_pgno);
+ memcpy(&argp->first_indx, bp, sizeof(argp->first_indx));
+ bp += sizeof(argp->first_indx);
+ memcpy(&argp->from_indx, bp, sizeof(argp->from_indx));
+ bp += sizeof(argp->from_indx);
+ memcpy(&argp->to_indx, bp, sizeof(argp->to_indx));
+ bp += sizeof(argp->to_indx);
+ *argpp = argp;
+ return (0);
+}
+
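+/*
+ * __bam_rcuradj_log --
+ *	Construct and write a bam_rcuradj log record describing a cursor
+ *	adjustment in a recno tree.
+ */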
+int
+__bam_rcuradj_log(dbenv, txnid, ret_lsnp, flags,
+ fileid, mode, root, recno, order)
+ DB_ENV *dbenv;
+ DB_TXN *txnid;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ int32_t fileid;
+ ca_recno_arg mode;
+ db_pgno_t root;
+ db_recno_t recno;
+ u_int32_t order;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn;
+ u_int32_t rectype, txn_num;
+ int ret;
+ u_int8_t *bp;
+
+ rectype = DB_bam_rcuradj;
+ if (txnid != NULL &&
+ TAILQ_FIRST(&txnid->kids) != NULL &&
+ (ret = __txn_activekids(dbenv, rectype, txnid)) != 0)
+ return (ret);
+ txn_num = txnid == NULL ? 0 : txnid->txnid;
+ if (txnid == NULL) {
+ ZERO_LSN(null_lsn);
+ lsnp = &null_lsn;
+ } else
+ lsnp = &txnid->last_lsn;
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(fileid)
+ + sizeof(mode)
+ + sizeof(root)
+ + sizeof(recno)
+ + sizeof(order);
+ if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0)
+ return (ret);
+
+ bp = logrec.data;
+ memcpy(bp, &rectype, sizeof(rectype));
+ bp += sizeof(rectype);
+ memcpy(bp, &txn_num, sizeof(txn_num));
+ bp += sizeof(txn_num);
+ memcpy(bp, lsnp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(bp, &fileid, sizeof(fileid));
+ bp += sizeof(fileid);
+ memcpy(bp, &mode, sizeof(mode));
+ bp += sizeof(mode);
+ memcpy(bp, &root, sizeof(root));
+ bp += sizeof(root);
+ memcpy(bp, &recno, sizeof(recno));
+ bp += sizeof(recno);
+ memcpy(bp, &order, sizeof(order));
+ bp += sizeof(order);
+ DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size);
+	ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags);
+	/* Record the new LSN in the transaction only if the put succeeded. */
+	if (ret == 0 && txnid != NULL)
+		txnid->last_lsn = *ret_lsnp;
+ __os_free(logrec.data, logrec.size);
+ return (ret);
+}
+
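+/*
+ * __bam_rcuradj_print --
+ *	Display the contents of a bam_rcuradj log record in human-readable
+ *	form.
+ */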
+int
+__bam_rcuradj_print(dbenv, dbtp, lsnp, notused2, notused3)
+ DB_ENV *dbenv;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_rcuradj_args *argp;
+ u_int32_t i;
+ u_int ch;
+ int ret;
+
+	/* Quiet the compiler about the unused variables and parameters. */
+	i = 0;
+	ch = 0;
+	notused2 = DB_TXN_ABORT;
+	notused3 = NULL;
+
+ if ((ret = __bam_rcuradj_read(dbenv, dbtp->data, &argp)) != 0)
+ return (ret);
+ printf("[%lu][%lu]bam_rcuradj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file,
+ (u_long)lsnp->offset,
+ (u_long)argp->type,
+ (u_long)argp->txnid->txnid,
+ (u_long)argp->prev_lsn.file,
+ (u_long)argp->prev_lsn.offset);
+ printf("\tfileid: %ld\n", (long)argp->fileid);
+ printf("\tmode: %ld\n", (long)argp->mode);
+ printf("\troot: %ld\n", (long)argp->root);
+ printf("\trecno: %ld\n", (long)argp->recno);
+ printf("\torder: %ld\n", (long)argp->order);
+ printf("\n");
+ __os_free(argp, 0);
+ return (0);
+}
+
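+/*
+ * __bam_rcuradj_read --
+ *	Unmarshal a bam_rcuradj log record into a freshly allocated
+ *	__bam_rcuradj_args structure.
+ */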
+int
+__bam_rcuradj_read(dbenv, recbuf, argpp)
+ DB_ENV *dbenv;
+ void *recbuf;
+ __bam_rcuradj_args **argpp;
+{
+ __bam_rcuradj_args *argp;
+ u_int8_t *bp;
+ int ret;
+
+ ret = __os_malloc(dbenv, sizeof(__bam_rcuradj_args) +
+ sizeof(DB_TXN), NULL, &argp);
+ if (ret != 0)
+ return (ret);
+ argp->txnid = (DB_TXN *)&argp[1];
+ bp = recbuf;
+ memcpy(&argp->type, bp, sizeof(argp->type));
+ bp += sizeof(argp->type);
+ memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid));
+ bp += sizeof(argp->txnid->txnid);
+ memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN));
+ bp += sizeof(DB_LSN);
+ memcpy(&argp->fileid, bp, sizeof(argp->fileid));
+ bp += sizeof(argp->fileid);
+ memcpy(&argp->mode, bp, sizeof(argp->mode));
+ bp += sizeof(argp->mode);
+ memcpy(&argp->root, bp, sizeof(argp->root));
+ bp += sizeof(argp->root);
+ memcpy(&argp->recno, bp, sizeof(argp->recno));
+ bp += sizeof(argp->recno);
+ memcpy(&argp->order, bp, sizeof(argp->order));
+ bp += sizeof(argp->order);
+ *argpp = argp;
+ return (0);
+}
+
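+/*
+ * __bam_init_print --
+ *	Register a print routine for each btree log-record type, so the
+ *	records can be dumped in human-readable form.
+ */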
+int
+__bam_init_print(dbenv)
+ DB_ENV *dbenv;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_pg_alloc_print, DB_bam_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_pg_alloc1_print, DB_bam_pg_alloc1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_pg_free_print, DB_bam_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_pg_free1_print, DB_bam_pg_free1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_split1_print, DB_bam_split1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_split_print, DB_bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_rsplit1_print, DB_bam_rsplit1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_rsplit_print, DB_bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_adj_print, DB_bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_cadjust_print, DB_bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_cdel_print, DB_bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_repl_print, DB_bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_root_print, DB_bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_curadj_print, DB_bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_rcuradj_print, DB_bam_rcuradj)) != 0)
+ return (ret);
+ return (0);
+}
+
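+/*
+ * __bam_init_recover --
+ *	Register a recovery routine for each btree log-record type.
+ *	Record formats that are no longer written (pg_alloc1, pg_free1,
+ *	split1, rsplit1) are mapped to __deprecated_recover.
+ */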
+int
+__bam_init_recover(dbenv)
+ DB_ENV *dbenv;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_pg_alloc_recover, DB_bam_pg_alloc)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __deprecated_recover, DB_bam_pg_alloc1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_pg_free_recover, DB_bam_pg_free)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __deprecated_recover, DB_bam_pg_free1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __deprecated_recover, DB_bam_split1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_split_recover, DB_bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __deprecated_recover, DB_bam_rsplit1)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_rsplit_recover, DB_bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_adj_recover, DB_bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_cadjust_recover, DB_bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_cdel_recover, DB_bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_repl_recover, DB_bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_root_recover, DB_bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_curadj_recover, DB_bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery(dbenv,
+ __bam_rcuradj_recover, DB_bam_rcuradj)) != 0)
+ return (ret);
+ return (0);
+}
+