Diffstat (limited to 'btree')
-rw-r--r--  btree/Makefile.inc     7
-rw-r--r--  btree/bt_close.c     182
-rw-r--r--  btree/bt_compact.c  3018
-rw-r--r--  btree/bt_compare.c   213
-rw-r--r--  btree/bt_compress.c 3024
-rw-r--r--  btree/bt_conv.c      250
-rw-r--r--  btree/bt_curadj.c    620
-rw-r--r--  btree/bt_cursor.c   3055
-rw-r--r--  btree/bt_debug.c     329
-rw-r--r--  btree/bt_delete.c   1084
-rw-r--r--  btree/bt_get.c       105
-rw-r--r--  btree/bt_method.c    734
-rw-r--r--  btree/bt_open.c      933
-rw-r--r--  btree/bt_overflow.c  228
-rw-r--r--  btree/bt_page.c       98
-rw-r--r--  btree/bt_put.c      1201
-rw-r--r--  btree/bt_rec.c      2035
-rw-r--r--  btree/bt_reclaim.c    97
-rw-r--r--  btree/bt_recno.c    1385
-rw-r--r--  btree/bt_rsearch.c   502
-rw-r--r--  btree/bt_search.c   1028
-rw-r--r--  btree/bt_seq.c       460
-rw-r--r--  btree/bt_split.c    1839
-rw-r--r--  btree/bt_stat.c      669
-rw-r--r--  btree/bt_upgrade.c   153
-rw-r--r--  btree/bt_utils.c     260
-rw-r--r--  btree/bt_verify.c   2746
-rw-r--r--  btree/btree.h        383
-rw-r--r--  btree/btree.src      291
-rw-r--r--  btree/btree_auto.c  3547
-rw-r--r--  btree/btree_autop.c  766
-rw-r--r--  btree/extern.h        70
l---------  btree/tags             1
33 files changed, 27059 insertions, 4254 deletions
diff --git a/btree/Makefile.inc b/btree/Makefile.inc
deleted file mode 100644
index 8ed7649..0000000
--- a/btree/Makefile.inc
+++ /dev/null
@@ -1,7 +0,0 @@
-# @(#)Makefile.inc 8.2 (Berkeley) 7/14/94
-
-.PATH: ${.CURDIR}/db/btree
-
-SRCS+= bt_close.c bt_conv.c bt_debug.c bt_delete.c bt_get.c bt_open.c \
- bt_overflow.c bt_page.c bt_put.c bt_search.c bt_seq.c bt_split.c \
- bt_utils.c
diff --git a/btree/bt_close.c b/btree/bt_close.c
deleted file mode 100644
index 27f9ab6..0000000
--- a/btree/bt_close.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_close.c 8.7 (Berkeley) 8/17/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/param.h>
-
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <db.h>
-#include "btree.h"
-
-static int bt_meta __P((BTREE *));
-
-/*
- * BT_CLOSE -- Close a btree.
- *
- * Parameters:
- * dbp: pointer to access method
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- */
-int
-__bt_close(dbp)
- DB *dbp;
-{
- BTREE *t;
- int fd;
-
- t = dbp->internal;
-
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
- }
-
- /* Sync the tree. */
- if (__bt_sync(dbp, 0) == RET_ERROR)
- return (RET_ERROR);
-
- /* Close the memory pool. */
- if (mpool_close(t->bt_mp) == RET_ERROR)
- return (RET_ERROR);
-
- /* Free random memory. */
- if (t->bt_cursor.key.data != NULL) {
- free(t->bt_cursor.key.data);
- t->bt_cursor.key.size = 0;
- t->bt_cursor.key.data = NULL;
- }
- if (t->bt_rkey.data) {
- free(t->bt_rkey.data);
- t->bt_rkey.size = 0;
- t->bt_rkey.data = NULL;
- }
- if (t->bt_rdata.data) {
- free(t->bt_rdata.data);
- t->bt_rdata.size = 0;
- t->bt_rdata.data = NULL;
- }
-
- fd = t->bt_fd;
- free(t);
- free(dbp);
- return (close(fd) ? RET_ERROR : RET_SUCCESS);
-}
-
-/*
- * BT_SYNC -- sync the btree to disk.
- *
- * Parameters:
- * dbp: pointer to access method
- *
- * Returns:
- * RET_SUCCESS, RET_ERROR.
- */
-int
-__bt_sync(dbp, flags)
- const DB *dbp;
- u_int flags;
-{
- BTREE *t;
- int status;
-
- t = dbp->internal;
-
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
- }
-
- /* Sync doesn't currently take any flags. */
- if (flags != 0) {
- errno = EINVAL;
- return (RET_ERROR);
- }
-
- if (F_ISSET(t, B_INMEM | B_RDONLY) || !F_ISSET(t, B_MODIFIED))
- return (RET_SUCCESS);
-
- if (F_ISSET(t, B_METADIRTY) && bt_meta(t) == RET_ERROR)
- return (RET_ERROR);
-
- if ((status = mpool_sync(t->bt_mp)) == RET_SUCCESS)
- F_CLR(t, B_MODIFIED);
-
- return (status);
-}
-
-/*
- * BT_META -- write the tree meta data to disk.
- *
- * Parameters:
- * t: tree
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- */
-static int
-bt_meta(t)
- BTREE *t;
-{
- BTMETA m;
- void *p;
-
- if ((p = mpool_get(t->bt_mp, P_META, 0)) == NULL)
- return (RET_ERROR);
-
- /* Fill in metadata. */
- m.magic = BTREEMAGIC;
- m.version = BTREEVERSION;
- m.psize = t->bt_psize;
- m.free = t->bt_free;
- m.nrecs = t->bt_nrecs;
- m.flags = F_ISSET(t, SAVEMETA);
-
- memmove(p, &m, sizeof(BTMETA));
- mpool_put(t->bt_mp, p, MPOOL_DIRTY);
- return (RET_SUCCESS);
-}
diff --git a/btree/bt_compact.c b/btree/bt_compact.c
new file mode 100644
index 0000000..6b22e04
--- /dev/null
+++ b/btree/bt_compact.c
@@ -0,0 +1,3018 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __bam_compact_dups __P((DBC *,
+ PAGE **, u_int32_t, int, DB_COMPACT *, int *));
+static int __bam_compact_int __P((DBC *,
+ DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *));
+static int __bam_compact_isdone __P((DBC *, DBT *, PAGE *, int *));
+static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int));
+static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t));
+static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_merge __P((DBC *,
+ DBC *, u_int32_t, DBT *, DB_COMPACT *,int *));
+static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *));
+static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *));
+static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *));
+static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *));
+static int __bam_truncate_overflow __P((DBC *,
+ db_pgno_t, PAGE **, DB_COMPACT *));
+static int __bam_truncate_page __P((DBC *, PAGE **, PAGE *, int));
+static int __bam_truncate_root_page __P((DBC *,
+ PAGE *, u_int32_t, DB_COMPACT *));
+
+#ifdef HAVE_FTRUNCATE
+static int __bam_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *));
+static int __bam_savekey __P((DBC *, int, DBT *));
+static int __bam_setup_freelist __P((DB *, db_pglist_t *, u_int32_t));
+static int __bam_truncate_internal __P((DB *,
+ DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+#endif
+
+#define SAVE_START \
+ do { \
+ save_data = *c_data; \
+ ret = __db_retcopy(env, \
+ &save_start, current.data, current.size, \
+ &save_start.data, &save_start.ulen); \
+ } while (0)
+
+/*
+ * Only restore those things that are negated by aborting the
+ * transaction. We don't restore the number of deadlocks, for example.
+ */
+
+#define RESTORE_START \
+ do { \
+ c_data->compact_pages_free = \
+ save_data.compact_pages_free; \
+ c_data->compact_levels = save_data.compact_levels; \
+ c_data->compact_truncate = save_data.compact_truncate; \
+ ret = __db_retcopy(env, &current, \
+ save_start.data, save_start.size, \
+ &current.data, &current.ulen); \
+ } while (0)
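+
+/*
+ * Both macros are wrapped in do { ... } while (0) so that each expands
+ * to a single statement; this is the standard C idiom that keeps a
+ * multi-statement macro safe after an unbraced if, for example:
+ *
+ *	if (deadlock)
+ *		RESTORE_START;
+ */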
+
+/*
+ * __bam_compact -- compact a btree.
+ *
+ * PUBLIC: int __bam_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
+ */
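+/*
+ * A minimal sketch of how an application reaches this code through the
+ * public DB->compact() method (illustrative only, not from this file):
+ *
+ *	DB_COMPACT c_data;
+ *
+ *	memset(&c_data, 0, sizeof(c_data));
+ *	c_data.compact_fillpercent = 80;
+ *	ret = dbp->compact(dbp, txn, NULL, NULL, &c_data, DB_FREE_SPACE, NULL);
+ */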
+int
+__bam_compact(dbp, ip, txn, start, stop, c_data, flags, end)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DBT *start, *stop;
+ DB_COMPACT *c_data;
+ u_int32_t flags;
+ DBT *end;
+{
+ DBC *dbc;
+ DBT current, save_start;
+ DB_COMPACT save_data;
+ ENV *env;
+ u_int32_t factor, retry;
+ int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local;
+
+#ifdef HAVE_FTRUNCATE
+ db_pglist_t *list;
+ db_pgno_t last_pgno;
+ u_int32_t nelems, truncated;
+#endif
+
+ env = dbp->env;
+
+ memset(&current, 0, sizeof(current));
+ memset(&save_start, 0, sizeof(save_start));
+ dbc = NULL;
+ factor = 0;
+ have_freelist = deadlock = isdone = ret = span = 0;
+ ret = retry = 0;
+
+#ifdef HAVE_FTRUNCATE
+ list = NULL;
+ last_pgno = 0;
+ nelems = truncated = 0;
+#endif
+
+ /*
+ * We pass "current" to the internal routine, indicating where that
+ * routine should begin its work and expecting that it will return to
+ * us the last key that it processed.
+ */
+ if (start != NULL && (ret = __db_retcopy(env,
+ &current, start->data, start->size,
+ &current.data, &current.ulen)) != 0)
+ return (ret);
+
+ if (IS_DB_AUTO_COMMIT(dbp, txn))
+ txn_local = 1;
+ else
+ txn_local = 0;
+ if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
+ goto no_free;
+ if (LF_ISSET(DB_FREELIST_ONLY))
+ LF_SET(DB_FREE_SPACE);
+
+#ifdef HAVE_FTRUNCATE
+ /* Sort the freelist and set up the in-memory list representation. */
+ if (txn_local && (ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_free_truncate(dbp, ip,
+ txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
+ LF_CLR(DB_FREE_SPACE);
+ goto terr;
+ }
+
+ /* If the freelist is empty and we are not filling, get out. */
+ if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
+ ret = 0;
+ LF_CLR(DB_FREE_SPACE);
+ goto terr;
+ }
+ if ((ret = __bam_setup_freelist(dbp, list, nelems)) != 0) {
+ /* Someone else owns the free list. */
+ if (ret == EBUSY)
+ ret = 0;
+ }
+ if (ret == 0)
+ have_freelist = 1;
+
+ /* Commit the txn and release the meta page lock. */
+terr: if (txn_local) {
+ if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
+ ret = t_ret;
+ txn = NULL;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Save the number truncated so far; we will add what we get below. */
+ truncated = c_data->compact_pages_truncated;
+ if (LF_ISSET(DB_FREELIST_ONLY))
+ goto done;
+#endif
+
+ /*
+ * We want factor to be the target number of free bytes on each page,
+ * so we know when to stop adding items to a page. Make sure to
+ * subtract the page overhead when computing this target. This can
+ * result in a 1-2% error on the smallest page.
+ * First figure out how many bytes we should use:
+ */
+no_free:
+ factor = dbp->pgsize - SIZEOF_PAGE;
+ if (c_data->compact_fillpercent != 0) {
+ factor *= c_data->compact_fillpercent;
+ factor /= 100;
+ }
+ /* Now convert to the number of free bytes to target. */
+ factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
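+ /*
+ * Worked example, assuming the standard 26-byte page header
+ * (SIZEOF_PAGE == 26): with a 4096-byte page and a
+ * compact_fillpercent of 90, usable space is 4070 bytes, the fill
+ * target is 4070 * 90 / 100 == 3663 bytes, and factor becomes
+ * 4070 - 3663 == 407 free bytes per page.
+ */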
+
+ if (c_data->compact_pages == 0)
+ c_data->compact_pages = DB_MAX_PAGES;
+
+ do {
+ deadlock = 0;
+
+ SAVE_START;
+ if (ret != 0)
+ break;
+
+ if (txn_local) {
+ if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0)
+ break;
+
+ if (c_data->compact_timeout != 0 &&
+ (ret = __txn_set_timeout(txn,
+ c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
+ goto err;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+
+ if ((ret = __bam_compact_int(dbc, &current, stop, factor,
+ &span, c_data, &isdone)) ==
+ DB_LOCK_DEADLOCK && txn_local) {
+ /*
+ * We retry on deadlock. Cancel the statistics
+ * and reset the start point to before this
+ * iteration.
+ */
+ deadlock = 1;
+ c_data->compact_deadlock++;
+ RESTORE_START;
+ }
+ /*
+ * If we could not get a lock while holding an internal
+ * node latched, commit the current local transaction otherwise
+ * report a deadlock.
+ */
+ if (ret == DB_LOCK_NOTGRANTED) {
+ if (txn_local || retry++ < 5)
+ ret = 0;
+ else
+ ret = DB_LOCK_DEADLOCK;
+ } else
+ retry = 0;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+err: if (txn_local && txn != NULL) {
+ if (ret == 0 && deadlock == 0)
+ ret = __txn_commit(txn, DB_TXN_NOSYNC);
+ else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ txn = NULL;
+ }
+ } while (ret == 0 && !isdone);
+
+ if (ret == 0 && end != NULL)
+ ret = __db_retcopy(env, end, current.data, current.size,
+ &end->data, &end->ulen);
+ if (current.data != NULL)
+ __os_free(env, current.data);
+ if (save_start.data != NULL)
+ __os_free(env, save_start.data);
+
+#ifdef HAVE_FTRUNCATE
+ /*
+ * Finish up truncation work. If there are pages left in the free
+ * list then search the internal nodes of the tree as we may have
+ * missed some while walking the leaf nodes. Then calculate how
+ * many pages we have truncated and release the in-memory free list.
+ */
+done: if (LF_ISSET(DB_FREE_SPACE)) {
+ DBMETA *meta;
+ db_pgno_t pgno;
+
+ pgno = PGNO_BASE_MD;
+ isdone = 1;
+ if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) && (t_ret =
+ __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
+ isdone = meta->free == PGNO_INVALID;
+ ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority);
+ }
+
+ if (!isdone)
+ ret = __bam_truncate_internal(dbp, ip, txn, c_data);
+
+ /* Clean up the free list. */
+ if (list != NULL)
+ __os_free(env, list);
+
+ if ((t_ret =
+ __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
+ c_data->compact_pages_truncated =
+ truncated + last_pgno - meta->last_pgno;
+ if ((t_ret = __memp_fput(dbp->mpf, ip,
+ meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ } else if (ret == 0)
+ ret = t_ret;
+
+ if (have_freelist && (t_ret =
+ __bam_free_freelist(dbp, ip, txn)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+#endif
+
+ return (ret);
+}
+
+/*
+ * __bam_csearch -- isolate search code for bam_compact.
+ * This routine hides the differences between searching
+ * a BTREE and a RECNO from the rest of the code.
+ */
+#define CS_READ 0 /* We are just reading. */
+#define CS_PARENT 1 /* We want the parent too, write lock. */
+#define CS_NEXT 2 /* Get the next page. */
+#define CS_NEXT_WRITE 3 /* Get the next page and write lock. */
+#define CS_DEL 4 /* Get a stack to delete a page. */
+#define CS_START 5 /* Starting level for stack, write lock. */
+#define CS_NEXT_BOTH 6 /* Get this page and the next, write lock. */
+#define CS_GETRECNO 0x80 /* Extract record number from start. */
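+
+/*
+ * For example, the initial descent in __bam_compact_int() below calls
+ * __bam_csearch(dbc, start, CS_READ | CS_GETRECNO, LEAFLEVEL) to
+ * read-lock the first leaf; page deletes use CS_DEL and merges that
+ * span parents use CS_NEXT_BOTH.
+ */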
+
+static int
+__bam_csearch(dbc, start, sflag, level)
+ DBC *dbc;
+ DBT *start;
+ u_int32_t sflag;
+ int level;
+{
+ BTREE_CURSOR *cp;
+ int not_used, ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (dbc->dbtype == DB_RECNO) {
+ /* If GETRECNO is not set the cp->recno is what we want. */
+ if (FLD_ISSET(sflag, CS_GETRECNO)) {
+ if (start == NULL || start->size == 0)
+ cp->recno = 1;
+ else if ((ret =
+ __ram_getno(dbc, start, &cp->recno, 0)) != 0)
+ return (ret);
+ FLD_CLR(sflag, CS_GETRECNO);
+ }
+ switch (sflag) {
+ case CS_READ:
+ sflag = SR_READ;
+ break;
+ case CS_NEXT:
+ sflag = SR_PARENT | SR_READ;
+ break;
+ case CS_START:
+ level = LEAFLEVEL;
+ /* FALLTHROUGH */
+ case CS_DEL:
+ case CS_NEXT_WRITE:
+ sflag = SR_STACK;
+ break;
+ case CS_NEXT_BOTH:
+ sflag = SR_BOTH | SR_NEXT | SR_WRITE;
+ break;
+ case CS_PARENT:
+ sflag = SR_PARENT | SR_WRITE;
+ break;
+ default:
+ return (__env_panic(dbc->env, EINVAL));
+ }
+ if ((ret = __bam_rsearch(dbc,
+ &cp->recno, sflag, level, &not_used)) != 0)
+ return (ret);
+ /* Reset the cursor's recno to the beginning of the page. */
+ cp->recno -= cp->csp->indx;
+ } else {
+ FLD_CLR(sflag, CS_GETRECNO);
+ switch (sflag) {
+ case CS_READ:
+ sflag = SR_READ | SR_DUPFIRST;
+ break;
+ case CS_DEL:
+ sflag = SR_DEL;
+ break;
+ case CS_NEXT:
+ sflag = SR_NEXT;
+ break;
+ case CS_NEXT_WRITE:
+ sflag = SR_NEXT | SR_WRITE;
+ break;
+ case CS_NEXT_BOTH:
+ sflag = SR_BOTH | SR_NEXT | SR_WRITE;
+ break;
+ case CS_START:
+ sflag = SR_START | SR_WRITE;
+ break;
+ case CS_PARENT:
+ sflag = SR_PARENT | SR_WRITE;
+ break;
+ default:
+ return (__env_panic(dbc->env, EINVAL));
+ }
+ if (start == NULL || start->size == 0)
+ FLD_SET(sflag, SR_MIN);
+
+ if ((ret = __bam_search(dbc,
+ cp->root, start, sflag, level, NULL, &not_used)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_compact_int -- internal compaction routine.
+ * Called either with a cursor on the main database
+ * or a cursor initialized to the root of an off page duplicate
+ * tree.
+ */
+static int
+__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep)
+ DBC *dbc;
+ DBT *start, *stop;
+ u_int32_t factor;
+ int *spanp;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBC *ndbc;
+ DB_LOCK metalock, next_lock, nnext_lock, prev_lock, saved_lock;
+ DB_MPOOLFILE *dbmp;
+ ENV *env;
+ EPG *epg;
+ PAGE *pg, *ppg, *npg;
+ db_pgno_t metapgno, npgno, nnext_pgno;
+ db_pgno_t pgno, prev_pgno, ppgno, saved_pgno;
+ db_recno_t next_recno;
+ u_int32_t sflag, pgs_free;
+ int check_dups, check_trunc, clear_root, isdone;
+ int merged, nentry, next_p, pgs_done, ret, t_ret, tdone;
+
+#ifdef DEBUG
+#define CTRACE(dbc, location, t, start, f) do { \
+ DBT __trace; \
+ DB_SET_DBT(__trace, t, strlen(t)); \
+ DEBUG_LWRITE( \
+ dbc, (dbc)->txn, location, &__trace, start, f) \
+ } while (0)
+#define PTRACE(dbc, location, p, start, f) do { \
+ char __buf[32]; \
+ (void)snprintf(__buf, \
+ sizeof(__buf), "pgno: %lu", (u_long)p); \
+ CTRACE(dbc, location, __buf, start, f); \
+ } while (0)
+#else
+#define CTRACE(dbc, location, t, start, f)
+#define PTRACE(dbc, location, p, start, f)
+#endif
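+
+/*
+ * With DEBUG undefined the trace macros expand to nothing, so the
+ * CTRACE()/PTRACE() calls below cost nothing in production builds.
+ */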
+
+ ndbc = NULL;
+ pg = NULL;
+ npg = NULL;
+
+ isdone = 0;
+ tdone = 0;
+ pgs_done = 0;
+ next_recno = 0;
+ next_p = 0;
+ clear_root = 0;
+ metapgno = PGNO_BASE_MD;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(nnext_lock);
+ LOCK_INIT(saved_lock);
+ LOCK_INIT(metalock);
+ LOCK_INIT(prev_lock);
+ check_trunc = c_data->compact_truncate != PGNO_INVALID;
+ check_dups = (!F_ISSET(dbc, DBC_OPD) &&
+ F_ISSET(dbc->dbp, DB_AM_DUP)) || check_trunc;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pgs_free = c_data->compact_pages_free;
+
+ /* Search down the tree for the starting point. */
+ if ((ret = __bam_csearch(dbc,
+ start, CS_READ | CS_GETRECNO, LEAFLEVEL)) != 0) {
+ /* Its not an error to compact an empty db. */
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ isdone = 1;
+ goto err;
+ }
+
+ /*
+ * Get the first leaf page. The loop below will change pg, so
+ * clear the stack reference so we don't put a page twice.
+ */
+ pg = cp->csp->page;
+ cp->csp->page = NULL;
+ next_recno = cp->recno;
+next: /*
+ * This is the start of the main compaction loop. There are 3
+ * parts to the process:
+ * 1) Walk the leaf pages of the tree looking for a page to
+ * process. We do this with read locks. Save the
+ * key from the page and release it.
+ * 2) Set up a cursor stack which will write lock the page
+ * and enough of its ancestors to get the job done.
+ * This could go to the root if we might delete a subtree
+ * or we have record numbers to update.
+ * 3) Loop fetching pages after the above page and move enough
+ * data to fill it.
+ * We exit the loop if we are at the end of the leaf pages, are
+ * about to lock a new subtree (we span) or on error.
+ */
+
+ /* Walk the pages looking for something to fill up. */
+ while ((npgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
+ c_data->compact_pages_examine++;
+ PTRACE(dbc, "Next", PGNO(pg), start, 0);
+
+ /* If we have fetched the next page, get the new key. */
+ if (next_p == 1 &&
+ dbc->dbtype != DB_RECNO && NUM_ENT(pg) != 0) {
+ if ((ret = __db_ret(dbc, pg, 0, start,
+ &start->data, &start->ulen)) != 0)
+ goto err;
+ }
+ next_recno += NUM_ENT(pg);
+ if (P_FREESPACE(dbp, pg) > factor ||
+ (check_trunc && PGNO(pg) > c_data->compact_truncate))
+ break;
+ if (stop != NULL && stop->size > 0) {
+ if ((ret = __bam_compact_isdone(dbc,
+ stop, pg, &isdone)) != 0)
+ goto err;
+ if (isdone)
+ goto done;
+ }
+
+ /*
+ * The page does not need more data or to be swapped;
+ * check to see if we want to look at possible duplicate
+ * trees or overflow records and then move on to the next page.
+ */
+ cp->recno += NUM_ENT(pg);
+ next_p = 1;
+ tdone = pgs_done;
+ PTRACE(dbc, "Dups", PGNO(pg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(
+ dbc, &pg, factor, 0, c_data, &pgs_done)) != 0)
+ goto err;
+ npgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, pg, dbc->priority)) != 0)
+ goto err;
+ pg = NULL;
+ /*
+ * If we don't do anything we don't need to hold
+ * the lock on the previous page, so couple always.
+ */
+ if ((ret = __db_lget(dbc,
+ tdone == pgs_done ? LCK_COUPLE_ALWAYS : LCK_COUPLE,
+ npgno, DB_LOCK_READ, 0, &cp->csp->lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ }
+
+ /*
+ * When we get here we have 3 cases:
+ * 1) We've reached the end of the leaf linked list and are done.
+ * 2) A page whose freespace exceeds our target and therefore needs
+ * to have data added to it.
+ * 3) A page that doesn't have too much free space but needs to be
+ * checked for truncation.
+ * In both cases 2 and 3, we need that page's first key or record
+ * number. We may already have it, if not get it here.
+ */
+ if ((nentry = NUM_ENT(pg)) != 0) {
+ next_p = 0;
+ /* Get a copy of the first recno on the page. */
+ if (dbc->dbtype == DB_RECNO) {
+ if ((ret = __db_retcopy(dbp->env, start,
+ &cp->recno, sizeof(cp->recno),
+ &start->data, &start->ulen)) != 0)
+ goto err;
+ } else if (start->size == 0 && (ret = __db_ret(dbc,
+ pg, 0, start, &start->data, &start->ulen)) != 0)
+ goto err;
+
+ if (npgno == PGNO_INVALID) {
+ /* End of the tree, check its duplicates and exit. */
+ PTRACE(dbc, "GoDone", PGNO(pg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(dbc,
+ &pg, factor, 0, c_data, &pgs_done)) != 0)
+ goto err;
+ c_data->compact_pages_examine++;
+ isdone = 1;
+ goto done;
+ }
+ }
+
+ /* Release the page so we don't deadlock getting its parent. */
+ if ((ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority)) != 0)
+ goto err;
+ if ((ret = __LPUT(dbc, cp->csp->lock)) != 0)
+ goto err;
+ BT_STK_CLR(cp);
+ pg = NULL;
+ saved_pgno = PGNO_INVALID;
+ prev_pgno = PGNO_INVALID;
+ nnext_pgno = PGNO_INVALID;
+
+ /*
+ * We must lock the metadata page first because we cannot block
+ * while holding interior nodes of the tree pinned.
+ */
+
+ if (!LOCK_ISSET(metalock) && pgs_free == c_data->compact_pages_free &&
+ (ret = __db_lget(dbc,
+ LCK_ALWAYS, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+ /*
+ * Setup the cursor stack. There are 3 cases:
+ * 1) the page is empty and will be deleted: nentry == 0.
+ * 2) the next page has the same parent: *spanp == 0.
+ * 3) the next page has a different parent: *spanp == 1.
+ *
+ * We now need to search the tree again, getting a write lock
+ * on the page we are going to merge or delete. We do this by
+ * searching down the tree and locking as much of the subtree
+ * above the page as needed. In the case of a delete we will
+ * find the maximal subtree that can be deleted. In the case
+ * of a merge, if the current page and the next page are siblings
+ * with the same parent, then we only need to lock the parent.
+ * Otherwise *span will be set and we need to search to find the
+ * lowest common ancestor. Dbc will be set to contain the subtree
+ * containing the page to be merged or deleted. Ndbc will contain
+ * the minimal subtree containing that page and its next sibling.
+ * In all cases for DB_RECNO we simplify things and get the whole
+ * tree if we need more than a single parent.
+ * The tree can collapse while we don't have it locked, so the
+ * page we are looking for may be gone. If so we are at
+ * the right most end of the leaf pages and are done.
+ */
+
+retry: pg = NULL;
+ if (npg != NULL && (ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (clear_root == 1) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((ret = __bam_stkrel(ndbc, 0)) != 0)
+ goto err;
+ }
+ clear_root = 0;
+ /* Case 1 -- page is empty. */
+ if (nentry == 0) {
+ CTRACE(dbc, "Empty", "", start, 0);
+ if (next_p == 1)
+ sflag = CS_NEXT_WRITE;
+ else
+ sflag = CS_DEL;
+ if ((ret = __bam_csearch(dbc, start, sflag, LEAFLEVEL)) != 0) {
+ isdone = 1;
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ goto err;
+ }
+
+ pg = cp->csp->page;
+ /* Check to see if the page is still empty. */
+ if (NUM_ENT(pg) != 0)
+ npgno = PGNO(pg);
+ else {
+ npgno = NEXT_PGNO(pg);
+ /* If this is now the root, we are very done. */
+ if (PGNO(pg) == cp->root)
+ isdone = 1;
+ else {
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno, saved_pgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret =
+ __bam_dpages(dbc, 0, BTD_RELINK)) != 0)
+ goto err;
+ c_data->compact_pages_free++;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+ LOCK_INIT(prev_lock);
+ if ((ret = __TLPUT(dbc, next_lock)) != 0)
+ goto err;
+ LOCK_INIT(next_lock);
+ goto next_no_release;
+ }
+ }
+ goto next_page;
+ }
+
+ /* case 3 -- different parents. */
+ if (*spanp) {
+ CTRACE(dbc, "Span", "", start, 0);
+ /*
+ * Search the tree looking for the page containing the
+ * current key and the next page after it.
+ * The stack will be rooted at the page that spans
+ * the current and next pages. The two subtrees
+ * are returned below that. For BTREE the current
+ * page subtree will be first, while for RECNO the
+ * next page subtree will be first.
+ */
+ if (ndbc == NULL && (ret = __dbc_dup(dbc, &ndbc, 0)) != 0)
+ goto err;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+
+ ncp->recno = cp->recno;
+ cp->recno = next_recno;
+
+ if ((ret = __bam_csearch(dbc, start, CS_NEXT_BOTH, 0)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ isdone = 1;
+ ret = 0;
+ }
+ goto err;
+ }
+
+ /*
+ * Find the top of the stack for the second subtree.
+ */
+ for (epg = cp->csp - 1; epg > cp->sp; epg--)
+ if (LEVEL(epg->page) == LEAFLEVEL)
+ break;
+ DB_ASSERT(env, epg != cp->sp);
+
+ /*
+ * Copy the root. We will have two instances of the
+ * same page, be careful not to free both.
+ */
+ BT_STK_PUSH(env, ncp, cp->sp->page, cp->sp->indx,
+ cp->sp->lock, cp->sp->lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ clear_root = 1;
+
+ /* Copy the stack containing the next page. */
+ for (epg++; epg <= cp->csp; epg++) {
+ BT_STK_PUSH(env, ncp, epg->page, epg->indx,
+ epg->lock, epg->lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ }
+ /* adjust the stack pointer to remove these items. */
+ ncp->csp--;
+ cp->csp -= ncp->csp - ncp->sp;
+
+ /*
+ * If this is RECNO then we want to swap the stacks.
+ */
+ if (dbp->type == DB_RECNO) {
+ ndbc->internal = (DBC_INTERNAL *)cp;
+ dbc->internal = (DBC_INTERNAL *)ncp;
+ cp = ncp;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ cp->sp->indx--;
+ } else
+ ncp->sp->indx++;
+
+ DB_ASSERT(env,
+ NEXT_PGNO(cp->csp->page) == PGNO(ncp->csp->page));
+ pg = cp->csp->page;
+
+ /*
+ * The page may have emptied while we waited for the
+ * lock or the record we are looking for may have
+ * moved.
+ * Reset npgno so we re-get this page when we go back
+ * to the top.
+ */
+ if (NUM_ENT(pg) == 0 ||
+ (dbc->dbtype == DB_RECNO &&
+ NEXT_PGNO(cp->csp->page) != PGNO(ncp->csp->page))) {
+ npgno = PGNO(pg);
+ *spanp = 0;
+ goto next_page;
+ }
+
+ if (check_trunc && PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK2(dbc, ndbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgs_done++;
+ /* Get a fresh low numbered page. */
+ if ((ret = __bam_truncate_page(dbc,
+ &pg, ncp->csp->page, 1)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+ LOCK_INIT(prev_lock);
+ }
+ *spanp = 0;
+ PTRACE(dbc, "SDups", PGNO(ncp->csp->page), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(ndbc,
+ &ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ /* Check to see if the tree collapsed. */
+ if (PGNO(ncp->csp->page) == ncp->root)
+ goto done;
+
+ pg = cp->csp->page;
+ npgno = NEXT_PGNO(pg);
+ PTRACE(dbc, "SDups", PGNO(pg), start, 0);
+ if (check_dups && (ret =
+ __bam_compact_dups(dbc, &cp->csp->page,
+ factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ /*
+ * We may have dropped our locks, check again
+ * to see if we still need to fill this page and
+ * we are in a spanning situation.
+ */
+
+ if (P_FREESPACE(dbp, pg) <= factor ||
+ cp->csp[-1].indx != NUM_ENT(cp->csp[-1].page) - 1)
+ goto next_page;
+
+ /*
+ * Try to move things into a single parent.
+ */
+ merged = 0;
+ for (epg = cp->sp; epg != cp->csp; epg++) {
+ PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0);
+ if ((ret = __bam_merge_internal(dbc,
+ ndbc, LEVEL(epg->page), c_data, &merged)) != 0)
+ break;
+ if (merged)
+ break;
+ }
+
+ if (ret != 0 && ret != DB_LOCK_NOTGRANTED)
+ goto err1;
+ /*
+ * If we merged the parent, then we no longer span.
+ * Otherwise, if we tried to merge the parent but would
+ * block on one of the other leaf pages, try again.
+ * If we did not merge any records of the parent,
+ * exit to commit any local transactions and try again.
+ */
+ if (merged || ret == DB_LOCK_NOTGRANTED) {
+ if (merged)
+ pgs_done++;
+ else
+ goto done;
+ if (cp->csp->page == NULL)
+ goto deleted;
+ npgno = PGNO(pg);
+ next_recno = cp->recno;
+ goto next_page;
+ }
+ PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0);
+
+ /* If we remove the next page, then we need its next page locked. */
+ npgno = NEXT_PGNO(ncp->csp->page);
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK2(dbc, ndbc, npgno,
+ nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ if ((ret = __bam_merge(dbc,
+ ndbc, factor, stop, c_data, &isdone)) != 0)
+ goto err1;
+ pgs_done++;
+ /*
+ * __bam_merge could have freed our stack if it
+ * deleted a page possibly collapsing the tree.
+ */
+ if (cp->csp->page == NULL)
+ goto deleted;
+ cp->recno += NUM_ENT(pg);
+
+ if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
+ goto err1;
+ LOCK_INIT(nnext_lock);
+
+ /* If we did not bump to the next page something did not fit. */
+ if (npgno != NEXT_PGNO(pg)) {
+ npgno = NEXT_PGNO(pg);
+ goto next_page;
+ }
+ } else {
+ /* Case 2 -- same parents. */
+ CTRACE(dbc, "Sib", "", start, 0);
+ if ((ret =
+ __bam_csearch(dbc, start, CS_PARENT, LEAFLEVEL)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ isdone = 1;
+ ret = 0;
+ }
+ goto err;
+ }
+
+ pg = cp->csp->page;
+ DB_ASSERT(env, IS_DIRTY(pg));
+ DB_ASSERT(env,
+ PGNO(pg) == cp->root || IS_DIRTY(cp->csp[-1].page));
+
+ /* We now have a write lock, recheck the page. */
+ if ((nentry = NUM_ENT(pg)) == 0) {
+ npgno = PGNO(pg);
+ goto next_page;
+ }
+
+ /* Check duplicate trees, we have a write lock on the page. */
+ PTRACE(dbc, "SibDup", PGNO(pg), start, 0);
+ if (check_dups && (ret =
+ __bam_compact_dups(dbc, &cp->csp->page,
+ factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+ pg = cp->csp->page;
+ npgno = NEXT_PGNO(pg);
+
+ /* Check to see if the tree collapsed. */
+ if (PGNO(pg) == cp->root)
+ goto err1;
+ DB_ASSERT(env, cp->csp - cp->sp == 1);
+
+ /* After re-locking check to see if we still need to fill. */
+ if (P_FREESPACE(dbp, pg) <= factor) {
+ if (check_trunc &&
+ PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno, saved_pgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgs_done++;
+ /* Get a fresh low numbered page. */
+ if ((ret = __bam_truncate_page(dbc,
+ &pg, NULL, 1)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ if ((ret = __TLPUT(dbc, next_lock)) != 0)
+ goto err1;
+ LOCK_INIT(next_lock);
+ }
+ goto next_page;
+ }
+
+ /* If they have the same parent, just dup the cursor */
+ if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0)
+ goto err1;
+ if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0)
+ goto err1;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+
+ /*
+ * ncp->recno needs to have the recno of the next page.
+ * Bump it by the number of records on the current page.
+ */
+ ncp->recno += NUM_ENT(pg);
+ }
+
+ pgno = PGNO(cp->csp->page);
+ ppgno = PGNO(cp->csp[-1].page);
+ /* Fetch pages until we fill this one. */
+ while (!isdone && npgno != PGNO_INVALID &&
+ P_FREESPACE(dbp, pg) > factor && c_data->compact_pages != 0) {
+ /*
+ * Merging may have to free the parent page; if it does,
+ * refetch it, but do it descending the tree.
+ */
+ epg = &cp->csp[-1];
+ if ((ppg = epg->page) == NULL) {
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ cp->csp->page, dbc->priority)) != 0)
+ goto err1;
+ pg = NULL;
+ if ((ret = __memp_fget(dbmp, &ppgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &ppg)) != 0)
+ goto err1;
+ if ((ret = __memp_fget(dbmp, &pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err1;
+ epg->page = ppg;
+ cp->csp->page = pg;
+ }
+
+ /*
+ * If our current position is the last one on a parent
+ * page, then we are about to merge across different
+ * internal nodes. Thus, we need to lock higher up
+ * in the tree. We will exit the routine and commit
+ * what we have done so far. Set spanp so we know
+ * we are in this case when we come back.
+ */
+ if (epg->indx == NUM_ENT(ppg) - 1) {
+ *spanp = 1;
+ npgno = PGNO(pg);
+ next_recno = cp->recno;
+ epg->page = ppg;
+ goto next_page;
+ }
+
+ /* Lock and get the next page. */
+ TRY_LOCK(dbc, npgno,
+ saved_pgno, saved_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ if ((ret = __LPUT(dbc, ncp->lock)) != 0)
+ goto err1;
+ ncp->lock = saved_lock;
+ LOCK_INIT(saved_lock);
+ saved_pgno = PGNO_INVALID;
+
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &npg)) != 0)
+ goto err1;
+
+ if (check_trunc &&
+ PGNO(pg) > c_data->compact_truncate) {
+ if (PREV_PGNO(pg) != PGNO_INVALID) {
+ TRY_LOCK(dbc, PREV_PGNO(pg),
+ prev_pgno, prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+ pgs_done++;
+ /* Get a fresh low numbered page. */
+ if ((ret = __bam_truncate_page(dbc, &pg, npg, 1)) != 0)
+ goto err1;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err1;
+ LOCK_INIT(prev_lock);
+ pgno = PGNO(pg);
+ }
+ c_data->compact_pages_examine++;
+
+ PTRACE(dbc, "MDups", PGNO(npg), start, 0);
+ if (check_dups && (ret = __bam_compact_dups(ndbc,
+ &npg, factor, 1, c_data, &pgs_done)) != 0)
+ goto err1;
+
+ npgno = NEXT_PGNO(npg);
+ if (npgno != PGNO_INVALID) {
+ TRY_LOCK(dbc, npgno,
+ nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err1;
+ }
+
+ /* copy the common parent to the stack. */
+ BT_STK_PUSH(env, ncp, ppg,
+ epg->indx + 1, epg->lock, epg->lock_mode, ret);
+ if (ret != 0)
+ goto err1;
+
+ /* Put the page on the stack. */
+ BT_STK_ENTER(env, ncp, npg, 0, ncp->lock, DB_LOCK_WRITE, ret);
+
+ LOCK_INIT(ncp->lock);
+ npg = NULL;
+
+ /*
+ * Merge the pages. This will either free the next
+ * page or just update its parent pointer.
+ */
+ PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0);
+ if ((ret = __bam_merge(dbc,
+ ndbc, factor, stop, c_data, &isdone)) != 0)
+ goto err1;
+
+ pgs_done++;
+
+ if ((ret = __TLPUT(dbc, nnext_lock)) != 0)
+ goto err1;
+ LOCK_INIT(nnext_lock);
+
+ /*
+ * __bam_merge could have freed our stack if it
+ * deleted a page possibly collapsing the tree.
+ */
+ if (cp->csp->page == NULL)
+ goto deleted;
+ /* If we did not bump to the next page something did not fit. */
+ if (npgno != NEXT_PGNO(pg))
+ break;
+ }
+
+ /* Bottom of the main loop. Move to the next page. */
+ npgno = NEXT_PGNO(pg);
+ cp->recno += NUM_ENT(pg);
+ next_recno = cp->recno;
+
+next_page:
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (ncp->sp->page == cp->sp->page) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((ret = __bam_stkrel(ndbc,
+ pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
+ goto err;
+ }
+ /*
+ * Unlatch the tree before trying to lock the next page. We must
+ * unlatch to avoid a latch deadlock but we want to hold the
+ * lock on the parent node so this leaf cannot be unlinked.
+ */
+ pg = NULL;
+ if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0)
+ goto err;
+ if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0)
+ goto err;
+ if ((ret = __TLPUT(dbc, saved_lock)) != 0)
+ goto err;
+ if ((ret = __TLPUT(dbc, prev_lock)) != 0)
+ goto err;
+
+next_no_release:
+ pg = NULL;
+
+ if (npgno == PGNO_INVALID || c_data->compact_pages == 0)
+ isdone = 1;
+ if (!isdone) {
+ /*
+ * If we are at the end of this parent commit the
+ * transaction so we don't tie things up.
+ */
+ if (pgs_done != 0 && *spanp) {
+deleted: if (((ret = __bam_stkrel(ndbc, 0)) != 0 ||
+ (ret = __dbc_close(ndbc)) != 0))
+ goto err;
+ *donep = 0;
+ return (0);
+ }
+
+ /* Reget the next page to look at. */
+ cp->recno = next_recno;
+ if ((ret = __memp_fget(dbmp, &npgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ cp->csp->lock = next_lock;
+ LOCK_INIT(next_lock);
+ next_p = 1;
+ /* If we did not do anything we can drop the metalock. */
+ if (pgs_done == 0 && (ret = __LPUT(dbc, metalock)) != 0)
+ goto err;
+ goto next;
+ }
+
+done:
+ if (0) {
+ /*
+ * We come here if pg came from cp->csp->page and could
+ * have already been fput.
+ */
+err1: pg = NULL;
+ }
+err: /*
+ * Don't release locks (STK_PGONLY) if we had an error: we could reveal
+ * a bad tree to a dirty reader. Wait till the abort to free the locks.
+ */
+ sflag = STK_CLRDBC;
+ if (dbc->txn != NULL && ret != 0)
+ sflag |= STK_PGONLY;
+ if (ndbc != NULL) {
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ if (npg == ncp->csp->page)
+ npg = NULL;
+ if (ncp->sp->page == cp->sp->page) {
+ ncp->sp->page = NULL;
+ LOCK_INIT(ncp->sp->lock);
+ }
+ if ((t_ret = __bam_stkrel(ndbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+ else if ((t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (pg == cp->csp->page)
+ pg = NULL;
+ if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (pg != NULL && (t_ret =
+ __memp_fput(dbmp,
+ dbc->thread_info, pg, dbc->priority) != 0) && ret == 0)
+ ret = t_ret;
+ if (npg != NULL && (t_ret =
+ __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority) != 0) && ret == 0)
+ ret = t_ret;
+
+ *donep = isdone;
+
+ return (ret);
+}
+
+/*
+ * __bam_merge -- do actual merging of leaf pages.
+ */
+static int
+__bam_merge(dbc, ndbc, factor, stop, c_data, donep)
+ DBC *dbc, *ndbc;
+ u_int32_t factor;
+ DBT *stop;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ PAGE *pg, *npg;
+ db_indx_t nent;
+ int ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ DB_ASSERT(NULL, ndbc != NULL);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+
+ nent = NUM_ENT(npg);
+
+ /* If the page is empty just throw it away. */
+ if (nent == 0)
+ goto free_page;
+
+ /* Find if the stopping point is on this page. */
+ if (stop != NULL && stop->size != 0) {
+ if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0)
+ return (ret);
+ if (*donep)
+ return (0);
+ }
+
+ /*
+ * If there is too much data then just move records one at a time.
+ * Otherwise copy the data space over and fix up the index table.
+ * If we are on the leftmost child we will affect our parent's
+ * index entry, so we call merge_records to figure out key sizes.
+ */
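+ /*
+ * Sketch of the size test below: bulk-copy (merge_pages) only when pg
+ * can absorb all of npg's used space, which is (pgsize - P_OVERHEAD) -
+ * P_FREESPACE(npg) bytes, and still keep at least factor bytes free;
+ * otherwise fall back to record-at-a-time merging.
+ */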
+ if ((dbc->dbtype == DB_BTREE &&
+ ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) ||
+ (int)(P_FREESPACE(dbp, pg) -
+ ((dbp->pgsize - P_OVERHEAD(dbp)) -
+ P_FREESPACE(dbp, npg))) < (int)factor)
+ ret = __bam_merge_records(dbc, ndbc, factor, c_data);
+ else
+free_page: ret = __bam_merge_pages(dbc, ndbc, c_data);
+
+ return (ret);
+}
+
+static int
+__bam_merge_records(dbc, ndbc, factor, c_data)
+ DBC *dbc, *ndbc;
+ u_int32_t factor;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk, *tmp_bk;
+ BTREE *t;
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT a, b, data, hdr;
+ ENV *env;
+ EPG *epg;
+ PAGE *pg, *npg;
+ db_indx_t adj, indx, nent, *ninp, pind;
+ int32_t adjust;
+ u_int32_t freespace, nksize, pfree, size;
+ int first_dup, is_dup, next_dup, n_ok, ret;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ t = dbp->bt_internal;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ memset(&hdr, 0, sizeof(hdr));
+ pind = NUM_ENT(pg);
+ n_ok = 0;
+ adjust = 0;
+ ret = 0;
+ nent = NUM_ENT(npg);
+
+ DB_ASSERT(env, nent != 0);
+
+ /* See if we want to swap out this page. */
+ if (c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(npg) > c_data->compact_truncate) {
+ /* Get a fresh low numbered page. */
+ if ((ret = __bam_truncate_page(ndbc, &npg, pg, 1)) != 0)
+ goto err;
+ }
+
+ ninp = P_INP(dbp, npg);
+
+ /*
+ * pg is the page that is being filled; it is in the stack in cp.
+ * npg is the next page; it is in the stack in ncp.
+ */
+ freespace = P_FREESPACE(dbp, pg);
+
+ adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX;
+ /*
+ * Loop through the records and find the stopping point.
+ */
+ for (indx = 0; indx < nent; indx += adj) {
+ bk = GET_BKEYDATA(dbp, npg, indx);
+
+ /* Size of the key. */
+ size = BITEM_PSIZE(bk);
+
+ /* Size of the data. */
+ if (TYPE(pg) == P_LBTREE)
+ size += BITEM_PSIZE(GET_BKEYDATA(dbp, npg, indx + 1));
+ /*
+ * If we are at a duplicate set, skip ahead and
+ * get the total size for the group.
+ */
+ n_ok = adj;
+ if (TYPE(pg) == P_LBTREE &&
+ indx < nent - adj &&
+ ninp[indx] == ninp[indx + adj]) {
+ do {
+ /* Size of index for key reference. */
+ size += sizeof(db_indx_t);
+ n_ok++;
+ /* Size of data item. */
+ size += BITEM_PSIZE(
+ GET_BKEYDATA(dbp, npg, indx + n_ok));
+ n_ok++;
+ } while (indx + n_ok < nent &&
+ ninp[indx] == ninp[indx + n_ok]);
+ }
+ /* If the next set will not fit on the page, we are done. */
+ if (freespace < size)
+ break;
+
+ /*
+ * Otherwise figure out if we are past the goal and if
+ * adding this set will put us closer to the goal than
+ * we are now.
+ */
+ if ((freespace - size) < factor) {
+ if (freespace - factor > factor - (freespace - size))
+ indx += n_ok;
+ break;
+ }
+ freespace -= size;
+ indx += n_ok - adj;
+ }
+
+ /* If we have hit the first record then there is nothing we can move. */
+ if (indx == 0)
+ goto done;
+ if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) {
+ if (indx == nent)
+ return (__bam_merge_pages(dbc, ndbc, c_data));
+ goto no_check;
+ }
+ /*
+ * We need to update npg's parent key. Avoid creating a new key
+ * that will be too big. Get what space will be available on the
+ * parents. Then, if there will not be room for this key, see if
+ * prefix compression will make it work; if not, back up till we
+ * find something that will. (Needless to say, this is a very
+ * unlikely event.) If we are deleting this page then we will
+ * need to propagate the next key to our grandparents, so we
+ * see if that will fit.
+ */
+ pfree = dbp->pgsize;
+ for (epg = &ncp->csp[-1]; epg >= ncp->sp; epg--)
+ if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
+ bi = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ /* Add back in the key we will be deleting. */
+ freespace += BINTERNAL_PSIZE(bi->len);
+ if (freespace < pfree)
+ pfree = freespace;
+ if (epg->indx != 0)
+ break;
+ }
+
+ /*
+ * If we are at the end, we will delete this page. We need to
+ * check the next parent key only if we are the leftmost page and
+ * will therefore have to propagate the key up the tree.
+ */
+ if (indx == nent) {
+ if (ncp->csp[-1].indx != 0 || ncp->csp[-1].entries == 1 ||
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp,
+ ncp->csp[-1].page, 1)->len) <= pfree)
+ return (__bam_merge_pages(dbc, ndbc, c_data));
+ indx -= adj;
+ }
+ bk = GET_BKEYDATA(dbp, npg, indx);
+ if (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) {
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ } else
+ func = NULL;
+
+ /* Skip to the beginning of a duplicate set. */
+ while (indx != 0 && ninp[indx] == ninp[indx - adj])
+ indx -= adj;
+
+ while (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) {
+ if (B_TYPE(bk->type) != B_KEYDATA)
+ goto noprefix;
+ /*
+ * Figure out if we can truncate this key.
+ * Code borrowed from bt_split.c
+ */
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, npg, indx - adj);
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ memset(&a, 0, sizeof(a));
+ a.size = tmp_bk->len;
+ a.data = tmp_bk->data;
+ memset(&b, 0, sizeof(b));
+ b.size = bk->len;
+ b.data = bk->data;
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if (BINTERNAL_PSIZE(nksize) < pfree)
+ break;
+noprefix:
+ /* Skip to the beginning of a duplicate set. */
+ do {
+ indx -= adj;
+ } while (indx != 0 && ninp[indx] == ninp[indx - adj]);
+
+ bk = GET_BKEYDATA(dbp, npg, indx);
+ }
+
+ /*
+ * indx references the first record that will not move to the previous
+ * page. If it is 0 then we could not find a key that would fit in
+ * the parent that would permit us to move any records.
+ */
+ if (indx == 0)
+ goto done;
+ DB_ASSERT(env, indx <= nent);
+
+ /* Loop through the records and move them from npg to pg. */
+no_check: is_dup = first_dup = next_dup = 0;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ DB_ASSERT(env, IS_DIRTY(pg));
+ DB_ASSERT(env, IS_DIRTY(npg));
+ ninp = P_INP(dbp, npg);
+ do {
+ bk = GET_BKEYDATA(dbp, npg, 0);
+ /* Figure out if we are in a duplicate group or not. */
+ if ((NUM_ENT(npg) % 2) == 0) {
+ if (NUM_ENT(npg) > 2 && ninp[0] == ninp[2]) {
+ if (!is_dup) {
+ first_dup = 1;
+ is_dup = 1;
+ } else
+ first_dup = 0;
+
+ next_dup = 1;
+ } else if (next_dup) {
+ is_dup = 1;
+ first_dup = 0;
+ next_dup = 0;
+ } else
+ is_dup = 0;
+ }
+
+ if (is_dup && !first_dup && (pind % 2) == 0) {
+ /* Duplicate key. */
+ if ((ret = __bam_adjindx(dbc,
+ pg, pind, pind - P_INDX, 1)) != 0)
+ goto err;
+ if (!next_dup)
+ is_dup = 0;
+ } else switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ hdr.data = bk;
+ hdr.size = SSZA(BKEYDATA, data);
+ data.size = bk->len;
+ data.data = bk->data;
+ if ((ret = __db_pitem(dbc, pg, pind,
+ BKEYDATA_SIZE(bk->len), &hdr, &data)) != 0)
+ goto err;
+ break;
+ case B_OVERFLOW:
+ case B_DUPLICATE:
+ data.size = BOVERFLOW_SIZE;
+ data.data = bk;
+ if ((ret = __db_pitem(dbc, pg, pind,
+ BOVERFLOW_SIZE, &data, NULL)) != 0)
+ goto err;
+ break;
+ default:
+ __db_errx(env,
+ "Unknown record format, page %lu, indx 0",
+ (u_long)PGNO(pg));
+ ret = EINVAL;
+ goto err;
+ }
+ pind++;
+ if (next_dup && (NUM_ENT(npg) % 2) == 0) {
+ if ((ret = __bam_adjindx(ndbc,
+ npg, 0, O_INDX, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_ditem(ndbc,
+ npg, 0, BITEM_SIZE(bk))) != 0)
+ goto err;
+ }
+ adjust++;
+ } while (--indx != 0);
+
+ DB_ASSERT(env, NUM_ENT(npg) != 0);
+
+ if (adjust != 0 &&
+ (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))) {
+ if (TYPE(pg) == P_LBTREE)
+ adjust /= P_INDX;
+ if ((ret = __bam_adjust(ndbc, -adjust)) != 0)
+ goto err;
+
+ if ((ret = __bam_adjust(dbc, adjust)) != 0)
+ goto err;
+ }
+
+ /* Update parent with new key. */
+ if (ndbc->dbtype == DB_BTREE &&
+ (ret = __bam_pupdate(ndbc, pg)) != 0)
+ goto err;
+
+done: if (cp->sp->page == ncp->sp->page) {
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ }
+ ret = __bam_stkrel(ndbc, STK_CLRDBC);
+
+err: return (ret);
+}
+
+static int
+__bam_merge_pages(dbc, ndbc, c_data)
+ DBC *dbc, *ndbc;
+ DB_COMPACT *c_data;
+{
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_MPOOLFILE *dbmp;
+ PAGE *pg, *npg;
+ db_indx_t nent, *ninp, *pinp;
+ db_pgno_t ppgno;
+ u_int8_t *bp;
+ u_int32_t len;
+ int i, level, ret;
+
+ COMPQUIET(ppgno, PGNO_INVALID);
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ memset(&hdr, 0, sizeof(hdr));
+ nent = NUM_ENT(npg);
+
+ /* If the page is empty just throw it away. */
+ if (nent == 0)
+ goto free_page;
+
+ pg = cp->csp->page;
+ npg = ncp->csp->page;
+ DB_ASSERT(dbp->env, IS_DIRTY(pg));
+ DB_ASSERT(dbp->env, IS_DIRTY(npg));
+ DB_ASSERT(dbp->env, nent == NUM_ENT(npg));
+
+ /* Bulk copy the data to the new page. */
+ len = dbp->pgsize - HOFFSET(npg);
+ if (DBC_LOGGING(dbc)) {
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = npg;
+ hdr.size = LOFFSET(dbp, npg);
+ memset(&data, 0, sizeof(data));
+ data.data = (u_int8_t *)npg + HOFFSET(npg);
+ data.size = len;
+ if ((ret = __bam_merge_log(dbp,
+ dbc->txn, &LSN(pg), 0, PGNO(pg),
+ &LSN(pg), PGNO(npg), &LSN(npg), &hdr, &data, 0)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+ LSN(npg) = LSN(pg);
+ bp = (u_int8_t *)pg + HOFFSET(pg) - len;
+ memcpy(bp, (u_int8_t *)npg + HOFFSET(npg), len);
+
+ /* Copy the index table, adjusting offsets by the data already on pg. */
+ pinp = P_INP(dbp, pg) + NUM_ENT(pg);
+ ninp = P_INP(dbp, npg);
+ for (i = 0; i < NUM_ENT(npg); i++)
+ *pinp++ = *ninp++ - (dbp->pgsize - HOFFSET(pg));
+ HOFFSET(pg) -= len;
+ NUM_ENT(pg) += i;
+
+ NUM_ENT(npg) = 0;
+ HOFFSET(npg) += len;
+
+ if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * There are two cases here regarding the stack.
+ * Either we have two two-level stacks but only ndbc
+ * references the parent page, or we have a multilevel
+ * stack and only ndbc has an entry for the spanning
+ * page.
+ */
+ if (TYPE(pg) == P_LBTREE)
+ i /= P_INDX;
+ if ((ret = __bam_adjust(ndbc, -i)) != 0)
+ goto err;
+
+ if ((ret = __bam_adjust(dbc, i)) != 0)
+ goto err;
+ }
+
+free_page:
+ /*
+ * __bam_dpages may decide to collapse the tree.
+ * This can happen if we have the root and there
+ * are exactly 2 pointers left in it.
+ * If it can collapse the tree we must free the other
+ * stack since it will no longer be valid. This
+ * must be done beforehand because we cannot
+ * hold a page pinned if it might be truncated.
+ */
+ if ((ret = __bam_relink(dbc,
+ ncp->csp->page, cp->csp->page, PGNO_INVALID)) != 0)
+ goto err;
+ /* Drop the duplicate reference to the sub tree root. */
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ if (PGNO(ncp->sp->page) == ncp->root &&
+ NUM_ENT(ncp->sp->page) == 2) {
+ if ((ret = __bam_stkrel(dbc, STK_CLRDBC | STK_PGONLY)) != 0)
+ goto err;
+ level = LEVEL(ncp->sp->page);
+ ppgno = PGNO(ncp->csp[-1].page);
+ } else
+ level = 0;
+ if (c_data->compact_truncate > PGNO(npg))
+ c_data->compact_truncate--;
+ if ((ret = __bam_dpages(ndbc,
+ 0, ndbc->dbtype == DB_RECNO ? 0 : BTD_UPDATE)) != 0)
+ goto err;
+ npg = NULL;
+ c_data->compact_pages_free++;
+ c_data->compact_pages--;
+ if (level != 0) {
+ if ((ret = __memp_fget(dbmp, &ncp->root,
+ dbc->thread_info, dbc->txn, 0, &npg)) != 0)
+ goto err;
+ if (level == LEVEL(npg))
+ level = 0;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (level != 0) {
+ c_data->compact_levels++;
+ c_data->compact_pages_free++;
+ if (c_data->compact_truncate > ppgno)
+ c_data->compact_truncate--;
+ if (c_data->compact_pages != 0)
+ c_data->compact_pages--;
+ }
+ }
+
+err: return (ret);
+}
+
+/*
+ * __bam_merge_internal --
+ * Merge internal nodes of the tree.
+ */
+static int
+__bam_merge_internal(dbc, ndbc, level, c_data, merged)
+ DBC *dbc, *ndbc;
+ int level;
+ DB_COMPACT *c_data;
+ int *merged;
+{
+ BINTERNAL bi, *bip, *fip;
+ BTREE_CURSOR *cp, *ncp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_MPOOLFILE *dbmp;
+ EPG *epg, *save_csp, *nsave_csp;
+ PAGE *pg, *npg;
+ RINTERNAL *rk;
+ db_indx_t first, indx, pind;
+ db_pgno_t ppgno;
+ int32_t nrecs, trecs;
+ u_int16_t size;
+ u_int32_t freespace, pfree;
+ int ret;
+
+ COMPQUIET(bip, NULL);
+ COMPQUIET(ppgno, PGNO_INVALID);
+ DB_ASSERT(NULL, dbc != NULL);
+ DB_ASSERT(NULL, ndbc != NULL);
+
+ /*
+ * ndbc will contain the dominating parent of the subtree.
+ * dbc will have the tree containing the left child.
+ *
+ * The stacks descend to the leaf level.
+ * If this is a recno tree then both stacks will start at the root.
+ */
+ dbp = dbc->dbp;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ncp = (BTREE_CURSOR *)ndbc->internal;
+ *merged = 0;
+ ret = 0;
+
+ /*
+ * Set the stacks to the level requested.
+ * Save the old value to restore when we exit.
+ */
+ save_csp = cp->csp;
+ cp->csp = &cp->csp[-level + 1];
+ pg = cp->csp->page;
+ pind = NUM_ENT(pg);
+
+ nsave_csp = ncp->csp;
+ ncp->csp = &ncp->csp[-level + 1];
+ npg = ncp->csp->page;
+ indx = NUM_ENT(npg);
+
+ /*
+ * The caller may have two stacks that include common ancestors, we
+ * check here for convenience.
+ */
+ if (npg == pg)
+ goto done;
+
+ if (TYPE(pg) == P_IBTREE) {
+ /*
+ * Check for overflow keys on both pages while we have
+ * them locked.
+ */
+ if ((ret =
+ __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0)
+ goto err;
+ if ((ret =
+ __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0)
+ goto err;
+ }
+
+ /*
+ * If we are about to move data off the left most page of an
+ * internal node we will need to update its parents, make sure there
+ * will be room for the new key on all the parents in the stack.
+ * If not, move less data.
+ */
+ fip = NULL;
+ if (TYPE(pg) == P_IBTREE) {
+ /* See where we run out of space. */
+ freespace = P_FREESPACE(dbp, pg);
+ /*
+ * The leftmost key of an internal page is not accurate.
+ * Go up the tree to find a non-leftmost parent.
+ */
+ epg = ncp->csp;
+ while (--epg >= ncp->sp && epg->indx == 0)
+ continue;
+ fip = bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ epg = ncp->csp;
+
+ for (indx = 0;;) {
+ size = BINTERNAL_PSIZE(bip->len);
+ if (size > freespace)
+ break;
+ freespace -= size;
+ if (++indx >= NUM_ENT(npg))
+ break;
+ bip = GET_BINTERNAL(dbp, npg, indx);
+ }
+
+ /* See if we are deleting the page and we are not left most. */
+ if (indx == NUM_ENT(npg) && epg[-1].indx != 0)
+ goto fits;
+
+ pfree = dbp->pgsize;
+ for (epg--; epg >= ncp->sp; epg--)
+ if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) {
+ bip = GET_BINTERNAL(dbp, epg->page, epg->indx);
+ /* Add back in the key we will be deleting. */
+ freespace += BINTERNAL_PSIZE(bip->len);
+ if (freespace < pfree)
+ pfree = freespace;
+ if (epg->indx != 0)
+ break;
+ }
+ epg = ncp->csp;
+
+ /* If we are at the end of the page we will delete it. */
+ if (indx == NUM_ENT(npg)) {
+ if (NUM_ENT(epg[-1].page) == 1)
+ goto fits;
+ bip =
+ GET_BINTERNAL(dbp, epg[-1].page, epg[-1].indx + 1);
+ } else
+ bip = GET_BINTERNAL(dbp, npg, indx);
+
+ /* Back up until we have a key that fits. */
+ while (indx != 0 && BINTERNAL_PSIZE(bip->len) > pfree) {
+ indx--;
+ bip = GET_BINTERNAL(dbp, npg, indx);
+ }
+ if (indx == 0)
+ goto done;
+ }
+
+fits: memset(&bi, 0, sizeof(bi));
+ memset(&hdr, 0, sizeof(hdr));
+ memset(&data, 0, sizeof(data));
+ trecs = 0;
+
+ /*
+ * Copy data between internal nodes till one is full
+ * or the other is empty.
+ */
+ first = 0;
+ nrecs = 0;
+ do {
+ if (dbc->dbtype == DB_BTREE) {
+ bip = GET_BINTERNAL(dbp, npg, 0);
+ size = fip == NULL ?
+ BINTERNAL_SIZE(bip->len) :
+ BINTERNAL_SIZE(fip->len);
+ if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
+ break;
+
+ if (fip == NULL) {
+ data.size = bip->len;
+ data.data = bip->data;
+ } else {
+ data.size = fip->len;
+ data.data = fip->data;
+ }
+ bi.len = data.size;
+ B_TSET(bi.type, bip->type);
+ bi.pgno = bip->pgno;
+ bi.nrecs = bip->nrecs;
+ hdr.data = &bi;
+ hdr.size = SSZA(BINTERNAL, data);
+ if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))
+ nrecs = (int32_t)bip->nrecs;
+ } else {
+ rk = GET_RINTERNAL(dbp, npg, 0);
+ size = RINTERNAL_SIZE;
+ if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t))
+ break;
+
+ hdr.data = rk;
+ hdr.size = size;
+ nrecs = (int32_t)rk->nrecs;
+ }
+ /*
+ * Try to lock the subtree leaf records without waiting.
+ * We must lock the subtree below the record we are merging
+		 * and the one after it, since that is where a search will
+		 * wind up if it has already looked at our parent.  After
+		 * the first move we have the current subtree already locked.
+		 * If we merged any records then we will revisit this
+		 * node when we merge its leaves.  If not we will return
+		 * NOTGRANTED and our caller will do a retry.  We only
+		 * need to do this if we are in a transaction.  If not then
+ * we cannot abort and things will be hosed up on error
+ * anyway.
+ */
+ if (dbc->txn != NULL && (ret = __bam_lock_tree(ndbc,
+ ncp->csp, nsave_csp, first,
+ NUM_ENT(ncp->csp->page) == 1 ? 1 : 2)) != 0) {
+ if (ret != DB_LOCK_NOTGRANTED)
+ goto err;
+ break;
+ }
+ first = 1;
+ if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0)
+ goto err;
+ pind++;
+ if (fip != NULL) {
+			/* Reset size to that of the record being deleted. */
+ size = BINTERNAL_SIZE(bip->len);
+ fip = NULL;
+ }
+ if ((ret = __db_ditem(ndbc, npg, 0, size)) != 0)
+ goto err;
+ *merged = 1;
+ trecs += nrecs;
+ } while (--indx != 0);
+
+ if (!*merged)
+ goto done;
+
+ if (trecs != 0) {
+ cp->csp--;
+ ret = __bam_adjust(dbc, trecs);
+ if (ret != 0)
+ goto err;
+ cp->csp++;
+ ncp->csp--;
+ if ((ret = __bam_adjust(ndbc, -trecs)) != 0)
+ goto err;
+ ncp->csp++;
+ }
+
+ /*
+ * Either we emptied the page or we need to update its
+ * parent to reflect the first page we now point to.
+	 * First get rid of the bottom of the stack;
+	 * __bam_dpages will clear the stack.  Maintain transactional
+ * locks on the leaf pages to protect changes at this level.
+ */
+ do {
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ nsave_csp->page, dbc->priority)) != 0)
+ goto err;
+ nsave_csp->page = NULL;
+ if ((ret = __TLPUT(dbc, nsave_csp->lock)) != 0)
+ goto err;
+ LOCK_INIT(nsave_csp->lock);
+ nsave_csp--;
+ } while (nsave_csp != ncp->csp);
+
+ if (NUM_ENT(npg) == 0) {
+ /*
+		 * __bam_dpages may decide to collapse the tree,
+		 * so we need to free our other stack.  The tree
+		 * will change in height and our stack will no longer
+		 * be valid.
+ */
+ cp->csp = save_csp;
+ cp->sp->page = NULL;
+ LOCK_INIT(cp->sp->lock);
+ if (PGNO(ncp->sp->page) == ncp->root &&
+ NUM_ENT(ncp->sp->page) == 2) {
+ if ((ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0)
+ goto err;
+ level = LEVEL(ncp->sp->page);
+ ppgno = PGNO(ncp->csp[-1].page);
+ } else
+ level = 0;
+
+ if (c_data->compact_truncate > PGNO(npg))
+ c_data->compact_truncate--;
+ ret = __bam_dpages(ndbc,
+ 0, ndbc->dbtype == DB_RECNO ?
+ BTD_RELINK : BTD_UPDATE | BTD_RELINK);
+ c_data->compact_pages_free++;
+ if (ret == 0 && level != 0) {
+ if ((ret = __memp_fget(dbmp, &ncp->root,
+ dbc->thread_info, dbc->txn, 0, &npg)) != 0)
+ goto err;
+ if (level == LEVEL(npg))
+ level = 0;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, npg, dbc->priority)) != 0)
+ goto err;
+ npg = NULL;
+ if (level != 0) {
+ c_data->compact_levels++;
+ c_data->compact_pages_free++;
+ if (c_data->compact_truncate > ppgno)
+ c_data->compact_truncate--;
+ if (c_data->compact_pages != 0)
+ c_data->compact_pages--;
+ }
+ }
+ } else {
+ ret = __bam_pupdate(ndbc, npg);
+
+ if (NUM_ENT(npg) != 0 &&
+ c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(npg) > c_data->compact_truncate &&
+ ncp->csp != ncp->sp) {
+ if ((ret = __bam_truncate_page(ndbc, &npg, pg, 1)) != 0)
+ goto err;
+ }
+ if (c_data->compact_truncate != PGNO_INVALID &&
+ PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
+ if ((ret = __bam_truncate_page(dbc, &pg, npg, 1)) != 0)
+ goto err;
+ }
+ }
+ cp->csp = save_csp;
+
+ return (ret);
+
+done:
+err: cp->csp = save_csp;
+ ncp->csp = nsave_csp;
+
+ return (ret);
+}
+
+/*
+ * __bam_compact_dups -- try to compress off-page dup trees.
+ * We may or may not have a write lock on this page.
+ */
+static int
+__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
+ DBC *dbc;
+ PAGE **ppg;
+ u_int32_t factor;
+ int have_lock;
+ DB_COMPACT *c_data;
+ int *donep;
+{
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBC *opd;
+ DBT start;
+ DB_MPOOLFILE *dbmp;
+ ENV *env;
+ PAGE *dpg, *pg;
+ db_indx_t i;
+ db_pgno_t pgno;
+ int isdone, level, ret, span, t_ret;
+
+ span = 0;
+ ret = 0;
+ opd = NULL;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ env = dbp->env;
+ dbmp = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pg = *ppg;
+
+ for (i = 0; i < NUM_ENT(pg); i++) {
+ bo = GET_BOVERFLOW(dbp, pg, i);
+ if (B_TYPE(bo->type) == B_KEYDATA)
+ continue;
+ c_data->compact_pages_examine++;
+ if (bo->pgno > c_data->compact_truncate) {
+ (*donep)++;
+ if (!have_lock) {
+ /*
+ * The caller should have the page at
+ * least read locked. Drop the buffer
+ * and get the write lock.
+ */
+ pgno = PGNO(pg);
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ pg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err;
+ have_lock = 1;
+ if ((ret = __memp_fget(dbmp, &pgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ pg = *ppg;
+ }
+ if ((ret =
+ __bam_truncate_root_page(dbc, pg, i, c_data)) != 0)
+ goto err;
+ /* Just in case it should move. Could it? */
+ bo = GET_BOVERFLOW(dbp, pg, i);
+ }
+
+ if (B_TYPE(bo->type) == B_OVERFLOW) {
+ if ((ret = __bam_truncate_overflow(dbc,
+ bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
+ goto err;
+ (*donep)++;
+ continue;
+ }
+ /*
+ * Take a peek at the root. If it's a leaf then
+		 * there is no tree here; avoid all the trouble.
+ */
+ if ((ret = __memp_fget(dbmp, &bo->pgno,
+ dbc->thread_info, dbc->txn, 0, &dpg)) != 0)
+ goto err;
+
+ level = dpg->level;
+ if ((ret = __memp_fput(dbmp,
+ dbc->thread_info, dpg, dbc->priority)) != 0)
+ goto err;
+ if (level == LEAFLEVEL)
+ continue;
+ if ((ret = __dbc_newopd(dbc, bo->pgno, NULL, &opd)) != 0)
+ return (ret);
+ if (!have_lock) {
+ /*
+ * The caller should have the page at
+ * least read locked. Drop the buffer
+ * and get the write lock.
+ */
+ pgno = PGNO(pg);
+ if ((ret = __memp_fput(dbmp, dbc->thread_info,
+ pg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+ goto err;
+ have_lock = 1;
+ if ((ret = __memp_fget(dbmp, &pgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ pg = *ppg;
+ }
+ (*donep)++;
+ memset(&start, 0, sizeof(start));
+ do {
+ if ((ret = __bam_compact_int(opd, &start,
+ NULL, factor, &span, c_data, &isdone)) != 0)
+ break;
+ } while (!isdone);
+
+ if (start.data != NULL)
+ __os_free(env, start.data);
+
+ if (ret != 0)
+ goto err;
+
+ ret = __dbc_close(opd);
+ opd = NULL;
+ if (ret != 0)
+ goto err;
+ }
+
+err: if (opd != NULL && (t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
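+
+/*
+ * The lock upgrade used twice above follows a common pattern in this
+ * file: remember the page number, put the buffer back, take the write
+ * lock, then refetch the page dirty.  The page contents may change
+ * while the buffer is not latched, which is why bo is looked up again
+ * after __bam_truncate_root_page().
+ */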
+
+/*
+ * __bam_truncate_page -- swap a page with a lower numbered page.
+ *	The cursor has a stack which includes at least the
+ * immediate parent of this page.
+ */
+static int
+__bam_truncate_page(dbc, pgp, opg, update_parent)
+ DBC *dbc;
+ PAGE **pgp, *opg;
+ int update_parent;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT data, hdr;
+ DB_LSN lsn;
+ DB_LOCK lock;
+ EPG *epg;
+ PAGE *newpage;
+ db_pgno_t newpgno, oldpgno, *pgnop;
+ int ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ LOCK_INIT(lock);
+
+ /*
+ * We want to free a page that lives in the part of the file that
+ * can be truncated, so we're going to move it onto a free page
+ * that is in the part of the file that need not be truncated.
+ * Since the freelist is ordered now, we can simply call __db_new
+ * which will grab the first element off the freelist; we know this
+ * is the lowest numbered free page.
+ */
+ if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
+ TYPE(*pgp) == P_LBTREE ? &lock : NULL, &newpage)) != 0)
+ return (ret);
+
+ /*
+ * If newpage is null then __db_new would have had to allocate
+ * a new page from the filesystem, so there is no reason
+ * to continue this action.
+ */
+ if (newpage == NULL)
+ return (0);
+
+ /*
+ * It is possible that a higher page is allocated if other threads
+	 * are allocating at the same time; if so, just put it back.
+ */
+ if (PGNO(newpage) > PGNO(*pgp)) {
+		/* It's unfortunate, but you can't just free a new overflow. */
+ if (TYPE(newpage) == P_OVERFLOW)
+ OV_LEN(newpage) = 0;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ return (__db_free(dbc, newpage));
+ }
+
+ /* Log if necessary. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = *pgp;
+ hdr.size = P_OVERHEAD(dbp);
+ memset(&data, 0, sizeof(data));
+ if (TYPE(*pgp) == P_OVERFLOW) {
+ data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
+ data.size = OV_LEN(*pgp);
+ } else {
+ data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
+ data.size = dbp->pgsize - HOFFSET(*pgp);
+ hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
+ }
+ if ((ret = __bam_merge_log(dbp, dbc->txn,
+ &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
+ PGNO(*pgp), &LSN(*pgp), &hdr, &data, 1)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(newpage));
+
+ oldpgno = PGNO(*pgp);
+ newpgno = PGNO(newpage);
+ lsn = LSN(newpage);
+ memcpy(newpage, *pgp, dbp->pgsize);
+ PGNO(newpage) = newpgno;
+ LSN(newpage) = lsn;
+
+ /* Empty the old page. */
+ if ((ret = __memp_dirty(dbp->mpf,
+ pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ if (TYPE(*pgp) == P_OVERFLOW)
+ OV_LEN(*pgp) = 0;
+ else {
+ HOFFSET(*pgp) = dbp->pgsize;
+ NUM_ENT(*pgp) = 0;
+ }
+ LSN(*pgp) = lsn;
+
+ /* Update siblings. */
+ switch (TYPE(newpage)) {
+ case P_OVERFLOW:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ if (NEXT_PGNO(newpage) == PGNO_INVALID &&
+ PREV_PGNO(newpage) == PGNO_INVALID)
+ break;
+ if ((ret = __bam_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
+ goto err;
+ break;
+ default:
+ break;
+ }
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+	 * Now, if we free this page, it will get truncated when we free
+ * all the pages after it in the file.
+ */
+ ret = __db_free(dbc, *pgp);
+	/* __db_free always puts the page. */
+ *pgp = newpage;
+
+ if (ret != 0)
+ return (ret);
+
+ if (!update_parent)
+ goto done;
+
+ /* Update the parent. */
+ epg = &cp->csp[-1];
+
+ switch (TYPE(epg->page)) {
+ case P_IBTREE:
+ pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
+ break;
+ case P_IRECNO:
+ pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
+ break;
+ default:
+ pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
+ break;
+ }
+ DB_ASSERT(dbp->env, oldpgno == *pgnop);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_pgno_log(dbp, dbc->txn, &LSN(epg->page),
+ 0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
+ *pgnop, PGNO(newpage))) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(epg->page));
+
+ *pgnop = PGNO(newpage);
+ cp->csp->page = newpage;
+ if ((ret = __TLPUT(dbc, lock)) != 0)
+ return (ret);
+
+done: return (0);
+
+err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+}
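+
+/*
+ * A sketch of the pattern callers use (see __bam_truncate_root_page
+ * and __bam_truncate_internal; the give_up label is illustrative):
+ * call with the page to move, then check whether the page number
+ * actually changed.  If it did not, no lower numbered free page was
+ * available and the caller gives up on this page.
+ *
+ *	oldpgno = PGNO(pg);
+ *	if ((ret = __bam_truncate_page(dbc, &pg, NULL, 0)) != 0)
+ *		goto err;
+ *	if (PGNO(pg) == oldpgno)
+ *		goto give_up;
+ */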
+
+/*
+ * __bam_truncate_overflow -- find overflow pages to truncate.
+ * Walk the pages of an overflow chain and swap out
+ * high numbered pages. We are passed the first page
+ * but only deal with the second and subsequent pages.
+ */
+static int
+__bam_truncate_overflow(dbc, pgno, ppg, c_data)
+ DBC *dbc;
+ db_pgno_t pgno;
+ PAGE **ppg;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *page;
+ db_pgno_t ppgno;
+ int have_lock, ret, t_ret;
+
+ dbp = dbc->dbp;
+ page = NULL;
+ LOCK_INIT(lock);
+ have_lock = ppg == NULL;
+
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ return (ret);
+
+ while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
+ if ((ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, page, dbc->priority)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ return (ret);
+ if (pgno <= c_data->compact_truncate)
+ continue;
+ if (have_lock == 0) {
+ ppgno = PGNO(*ppg);
+ if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ *ppg, dbc->priority)) != 0)
+ goto err;
+ *ppg = NULL;
+ if ((ret = __db_lget(dbc, 0, ppgno,
+ DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(dbp->mpf, &ppgno,
+ dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+ goto err;
+ have_lock = 1;
+ }
+ if ((ret = __bam_truncate_page(dbc, &page, NULL, 0)) != 0)
+ break;
+ }
+
+err: if (page != NULL &&
+	    (t_ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bam_truncate_root_page -- swap a page which is
+ * the root of an off page dup tree or the head of an overflow.
+ *	The page is referenced by the pg/indx passed in.
+ */
+static int
+__bam_truncate_root_page(dbc, pg, indx, c_data)
+ DBC *dbc;
+ PAGE *pg;
+ u_int32_t indx;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BOVERFLOW *bo;
+ DB *dbp;
+ DBT orig;
+ PAGE *page;
+ db_pgno_t newpgno, *pgnop;
+ int ret, t_ret;
+
+ COMPQUIET(c_data, NULL);
+ COMPQUIET(bo, NULL);
+ COMPQUIET(newpgno, PGNO_INVALID);
+ dbp = dbc->dbp;
+ page = NULL;
+ if (TYPE(pg) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, pg, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ pgnop = &bo->pgno;
+ } else
+ pgnop = &bi->pgno;
+ } else {
+ bo = GET_BOVERFLOW(dbp, pg, indx);
+ pgnop = &bo->pgno;
+ }
+
+ DB_ASSERT(dbp->env, IS_DIRTY(pg));
+
+ if ((ret = __memp_fget(dbp->mpf, pgnop,
+ dbc->thread_info, dbc->txn, 0, &page)) != 0)
+ goto err;
+
+ /*
+	 * If this is a multiply referenced overflow key, then we will just
+ * copy it and decrement the reference count. This is part of a
+ * fix to get rid of multiple references.
+ */
+ if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
+ if ((ret = __db_ovref(dbc, bo->pgno)) != 0)
+ goto err;
+ memset(&orig, 0, sizeof(orig));
+ if ((ret = __db_goff(dbc, &orig, bo->tlen, bo->pgno,
+ &orig.data, &orig.size)) == 0)
+ ret = __db_poff(dbc, &orig, &newpgno);
+ if (orig.data != NULL)
+ __os_free(dbp->env, orig.data);
+ if (ret != 0)
+ goto err;
+ } else {
+ if ((ret = __bam_truncate_page(dbc, &page, NULL, 0)) != 0)
+ goto err;
+ newpgno = PGNO(page);
+		/* If we could not allocate from the free list, give up. */
+ if (newpgno == *pgnop)
+ goto err;
+ }
+
+ /* Update the reference. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_pgno_log(dbp,
+ dbc->txn, &LSN(pg), 0, PGNO(pg),
+ &LSN(pg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(pg));
+
+ *pgnop = newpgno;
+
+err: if (page != NULL && (t_ret =
+ __memp_fput(dbp->mpf, dbc->thread_info,
+ page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bam_truncate_internal_overflow -- find overflow keys
+ *	on internal pages and, if they have high page
+ *	numbers, swap them with lower pages and truncate them.
+ *	Note that if there are overflow keys in the internal
+ *	nodes they will get copied, adding pages to the database.
+ */
+static int
+__bam_truncate_internal_overflow(dbc, page, c_data)
+ DBC *dbc;
+ PAGE *page;
+ DB_COMPACT *c_data;
+{
+ BINTERNAL *bi;
+ BOVERFLOW *bo;
+ db_indx_t indx;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+ ret = 0;
+ for (indx = 0; indx < NUM_ENT(page); indx++) {
+ bi = GET_BINTERNAL(dbc->dbp, page, indx);
+ if (B_TYPE(bi->type) != B_OVERFLOW)
+ continue;
+ bo = (BOVERFLOW *)(bi->data);
+ if (bo->pgno > c_data->compact_truncate && (ret =
+ __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
+ break;
+ if ((ret = __bam_truncate_overflow(
+ dbc, bo->pgno, NULL, c_data)) != 0)
+ break;
+ }
+ return (ret);
+}
+
+/*
+ * __bam_compact_isdone ---
+ *
+ * Check to see if the stop key specified by the caller is on the
+ * current page, in which case we are done compacting.
+ */
+static int
+__bam_compact_isdone(dbc, stop, pg, isdone)
+ DBC *dbc;
+ DBT *stop;
+ PAGE *pg;
+ int *isdone;
+{
+ db_recno_t recno;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ int cmp, ret;
+
+ *isdone = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbc->dbp->bt_internal;
+
+ if (dbc->dbtype == DB_RECNO) {
+ if ((ret = __ram_getno(dbc, stop, &recno, 0)) != 0)
+ return (ret);
+ *isdone = cp->recno > recno;
+ } else {
+ DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
+ if ((ret = __bam_cmp(dbc, stop, pg, 0,
+ t->bt_compare, &cmp)) != 0)
+ return (ret);
+
+ *isdone = cmp <= 0;
+ }
+ return (0);
+}
+
+/*
+ * Lock the subtrees from the top of the stack.
+ * The 0'th child may be in the stack and locked; otherwise iterate
+ * through the records by calling __bam_lock_subtree.
+ */
+static int
+__bam_lock_tree(dbc, sp, csp, start, stop)
+ DBC *dbc;
+ EPG *sp, *csp;
+ u_int32_t start, stop;
+{
+ PAGE *cpage;
+ db_pgno_t pgno;
+ int ret;
+
+ if (dbc->dbtype == DB_RECNO)
+ pgno = GET_RINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+ else
+ pgno = GET_BINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+ cpage = (sp + 1)->page;
+ /*
+	 * First recurse down the leftmost subtree if it is in the cursor
+	 * stack.  We already have these pages latched, and locked if it's
+	 * a leaf.
+ */
+ if (start == 0 && sp + 1 != csp && pgno == PGNO(cpage) &&
+ (ret = __bam_lock_tree(dbc, sp + 1, csp, 0, NUM_ENT(cpage))) != 0)
+ return (ret);
+
+ /*
+ * Then recurse on the other records on the page if needed.
+	 * If the page is in the stack then it's already locked or
+ * was processed above.
+ */
+ if (start == 0 && pgno == PGNO(cpage))
+ start = 1;
+
+ if (start == stop)
+ return (0);
+ return (__bam_lock_subtree(dbc, sp->page, start, stop));
+}
+
+/*
+ * Lock the subtree from the current node.
+ */
+static int
+__bam_lock_subtree(dbc, page, indx, stop)
+ DBC *dbc;
+ PAGE *page;
+ u_int32_t indx, stop;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *cpage;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+
+ for (; indx < stop; indx++) {
+ if (dbc->dbtype == DB_RECNO)
+ pgno = GET_RINTERNAL(dbc->dbp, page, indx)->pgno;
+ else
+ pgno = GET_BINTERNAL(dbc->dbp, page, indx)->pgno;
+ if (LEVEL(page) - 1 == LEAFLEVEL) {
+ if ((ret = __db_lget(dbc, 0, pgno,
+ DB_LOCK_WRITE, DB_LOCK_NOWAIT, &lock)) != 0) {
+ if (ret == DB_LOCK_DEADLOCK)
+ return (DB_LOCK_NOTGRANTED);
+ return (ret);
+ }
+ } else {
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &cpage)) != 0)
+ return (ret);
+ ret = __bam_lock_subtree(dbc, cpage, 0, NUM_ENT(cpage));
+ if ((t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ cpage, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
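+
+/*
+ * In outline, the two routines above give this locking discipline:
+ * every leaf reachable from entries [start, stop) of the top stack
+ * page ends up write locked with DB_LOCK_NOWAIT, interior pages are
+ * only latched long enough to recurse, and a deadlock return is mapped
+ * to DB_LOCK_NOTGRANTED so the caller can release its latches and
+ * retry.
+ */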
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __bam_savekey -- save the key from an internal page.
+ * We need to save information so that we can
+ *	fetch the next internal node of the tree.  This means
+ * we need the btree key on this current page, or the
+ * next record number.
+ */
+static int
+__bam_savekey(dbc, next, start)
+ DBC *dbc;
+ int next;
+ DBT *start;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ ENV *env;
+ PAGE *pg;
+ RINTERNAL *ri;
+ db_indx_t indx, top;
+ db_pgno_t pgno, saved_pgno;
+ int ret, t_ret;
+ u_int32_t len;
+ u_int8_t *data;
+ int level;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ pg = cp->csp->page;
+ ret = 0;
+
+ if (dbc->dbtype == DB_RECNO) {
+ if (next)
+ for (indx = 0, top = NUM_ENT(pg); indx != top; indx++) {
+ ri = GET_RINTERNAL(dbp, pg, indx);
+ cp->recno += ri->nrecs;
+ }
+ return (__db_retcopy(env, start, &cp->recno,
+ sizeof(cp->recno), &start->data, &start->ulen));
+
+ }
+
+ bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+ data = bi->data;
+ len = bi->len;
+ LOCK_INIT(lock);
+ saved_pgno = PGNO_INVALID;
+	/* If there is a single record on the page it may have an empty key. */
+ while (len == 0) {
+ /*
+ * We should not have an empty data page, since we just
+		 * compacted things; check anyway and punt.
+ */
+ if (NUM_ENT(pg) == 0)
+ goto no_key;
+ pgno = bi->pgno;
+ level = LEVEL(pg);
+ if (pg != cp->csp->page &&
+ (ret = __memp_fput(dbp->mpf,
+ dbc->thread_info, pg, dbc->priority)) != 0) {
+ pg = NULL;
+ goto err;
+ }
+ if (level - 1 == LEAFLEVEL) {
+ TRY_LOCK(dbc, pgno, saved_pgno,
+ lock, DB_LOCK_READ, retry);
+ if (ret != 0)
+ goto err;
+ }
+ if ((ret = __memp_fget(dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+
+ /*
+		 * At the data level use the last key to try to avoid the
+		 * possibility that the user has a zero-length key; if they
+		 * do, we punt.
+ */
+ if (pg->level == LEAFLEVEL) {
+ bk = GET_BKEYDATA(dbp, pg, NUM_ENT(pg) - 2);
+ data = bk->data;
+ len = bk->len;
+ if (len == 0) {
+no_key: __db_errx(env,
+ "Compact cannot handle zero length key");
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ } else {
+ bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+ data = bi->data;
+ len = bi->len;
+ }
+ }
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(data);
+ ret = __db_goff(dbc, start, bo->tlen, bo->pgno,
+ &start->data, &start->ulen);
+	} else
+ ret = __db_retcopy(env,
+ start, data, len, &start->data, &start->ulen);
+
+err: if (pg != NULL && pg != cp->csp->page &&
+ (t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+ pg, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+retry: return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_truncate_internal --
+ * Find high numbered pages in the internal nodes of a tree and
+ * swap them.
+ */
+static int
+__bam_truncate_internal(dbp, ip, txn, c_data)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_COMPACT *c_data;
+{
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ DBT start;
+ DB_LOCK meta_lock;
+ PAGE *pg;
+ db_pgno_t pgno;
+ u_int32_t sflag;
+ int level, local_txn, ret, t_ret;
+
+ dbc = NULL;
+ memset(&start, 0, sizeof(start));
+
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ local_txn = 1;
+ txn = NULL;
+ } else
+ local_txn = 0;
+
+ level = LEAFLEVEL + 1;
+ sflag = CS_READ | CS_GETRECNO;
+ LOCK_INIT(meta_lock);
+
+new_txn:
+ if (local_txn &&
+ (ret = __txn_begin(dbp->env, ip, NULL, &txn, 0)) != 0)
+ goto err;
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ goto err;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+	 * If the root is a leaf we have nothing to do.
+ * Searching an empty RECNO tree will return NOTFOUND below and loop.
+ */
+ if ((ret = __memp_fget(dbp->mpf, &cp->root, ip, txn, 0, &pg)) != 0)
+ goto err;
+ if (LEVEL(pg) == LEAFLEVEL) {
+ ret = __memp_fput(dbp->mpf, ip, pg, dbp->priority);
+ goto err;
+ }
+ if ((ret = __memp_fput(dbp->mpf, ip, pg, dbp->priority)) != 0)
+ goto err;
+
+ pgno = PGNO_INVALID;
+ do {
+ if ((ret = __bam_csearch(dbc, &start, sflag, level)) != 0) {
+ /* No more at this level, go up one. */
+ if (ret == DB_NOTFOUND) {
+ level++;
+ if (start.data != NULL)
+ __os_free(dbp->env, start.data);
+ memset(&start, 0, sizeof(start));
+ sflag = CS_READ | CS_GETRECNO;
+ continue;
+ }
+ goto err;
+ }
+ c_data->compact_pages_examine++;
+
+ pg = cp->csp->page;
+ pgno = PGNO(pg);
+
+ sflag = CS_NEXT | CS_GETRECNO;
+ /* Grab info about the page and drop the stack. */
+ if (pgno != cp->root && (ret = __bam_savekey(dbc,
+ pgno <= c_data->compact_truncate, &start)) != 0) {
+ if (ret == DB_LOCK_NOTGRANTED)
+ continue;
+ goto err;
+ }
+
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ if (pgno == cp->root)
+ break;
+
+ if (pgno <= c_data->compact_truncate)
+ continue;
+
+ /* Get the meta page lock before latching interior nodes. */
+ if (!LOCK_ISSET(meta_lock) && (ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+
+ /* Reget the page with a write latch, and its parent too. */
+ if ((ret = __bam_csearch(dbc,
+ &start, CS_PARENT | CS_GETRECNO, level)) != 0) {
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ }
+ goto err;
+ }
+ pg = cp->csp->page;
+ pgno = PGNO(pg);
+
+ if (pgno > c_data->compact_truncate) {
+ if ((ret = __bam_truncate_page(dbc, &pg, NULL, 1)) != 0)
+ goto err;
+ if (pgno == PGNO(pg)) {
+ /* We could not allocate. Give up. */
+ pgno = cp->root;
+ }
+ }
+
+ if ((ret = __bam_stkrel(dbc,
+ pgno > c_data->compact_truncate ? 0 : STK_NOLOCK)) != 0)
+ goto err;
+
+		/* We are locking subtrees, so drop the write locks ASAP. */
+ if (local_txn && pgno > c_data->compact_truncate)
+ break;
+ } while (pgno != cp->root);
+
+ if ((ret = __dbc_close(dbc)) != 0)
+ goto err;
+ dbc = NULL;
+ if (local_txn) {
+ if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0)
+ goto err;
+ txn = NULL;
+ LOCK_INIT(meta_lock);
+ }
+ if (pgno != ((BTREE *)dbp->bt_internal)->bt_root)
+ goto new_txn;
+
+err: if (txn != NULL && ret != 0)
+ sflag = STK_PGONLY;
+ else
+ sflag = 0;
+ if (txn == NULL)
+ if ((t_ret = __LPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ if (local_txn &&
+ txn != NULL && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
+ ret = t_ret;
+ if (start.data != NULL)
+ __os_free(dbp->env, start.data);
+ return (ret);
+}
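+
+/*
+ * In outline, the sweep above works as follows: starting one level
+ * above the leaves, walk the internal pages left to right with
+ * CS_NEXT, saving a restart key with __bam_savekey() before each
+ * stack release; when a level is exhausted (DB_NOTFOUND), move up a
+ * level, and stop once the search lands on the root.
+ */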
+
+static int
+__bam_setup_freelist(dbp, list, nelems)
+ DB *dbp;
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ DB_MPOOLFILE *mpf;
+ db_pgno_t *plist;
+ int ret;
+
+ mpf = dbp->mpf;
+
+ if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
+ return (ret);
+
+ while (nelems-- != 0)
+ *plist++ = list++->pgno;
+
+ return (0);
+}
+
+static int
+__bam_free_freelist(dbp, ip, txn)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ DB_LOCK lock;
+ int auto_commit, ret, t_ret;
+
+ LOCK_INIT(lock);
+ auto_commit = ret = 0;
+
+ /*
+ * If we are not in a transaction then we need to get
+	 * a lock on the meta page; otherwise we should already
+ * have the lock.
+ */
+
+ dbc = NULL;
+ if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+ /*
+		 * We must not time out the lock or we will not free the list.
+ * We ignore errors from txn_begin as there is little that
+ * the application can do with the error and we want to
+ * get the lock and free the list if at all possible.
+ */
+ if (__txn_begin(dbp->env, ip, NULL, &txn, 0) == 0) {
+ (void)__lock_set_timeout(dbp->env,
+ txn->locker, 0, DB_SET_TXN_TIMEOUT);
+ (void)__lock_set_timeout(dbp->env,
+ txn->locker, 0, DB_SET_LOCK_TIMEOUT);
+ auto_commit = 1;
+ }
+ /* Get a cursor so we can call __db_lget. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
+ goto err;
+ }
+
+ ret = __memp_free_freelist(dbp->mpf);
+
+err: if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+	if (auto_commit && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
+		ret = t_ret;
+
+ return (ret);
+}
+#endif
diff --git a/btree/bt_compare.c b/btree/bt_compare.c
new file mode 100644
index 0000000..bc340f2
--- /dev/null
+++ b/btree/bt_compare.c
@@ -0,0 +1,213 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_cmp --
+ * Compare a key to a given record.
+ *
+ * PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *));
+ */
+int
+__bam_cmp(dbc, dbt, h, indx, func, cmpp)
+ DBC *dbc;
+ const DBT *dbt;
+ PAGE *h;
+ u_int32_t indx;
+ int (*func)__P((DB *, const DBT *, const DBT *));
+ int *cmpp;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DB *dbp;
+ DBT pg_dbt;
+
+ dbp = dbc->dbp;
+
+ /*
+ * Returns:
+ * < 0 if dbt is < page record
+ * = 0 if dbt is = page record
+ * > 0 if dbt is > page record
+ *
+ * !!!
+ * We do not clear the pg_dbt DBT even though it's likely to contain
+ * random bits. That should be okay, because the app's comparison
+ * routine had better not be looking at fields other than data, size
+ * and app_data. We don't clear it because we go through this path a
+ * lot and it's expensive.
+ */
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)bk;
+ else {
+ pg_dbt.app_data = NULL;
+ pg_dbt.data = bk->data;
+ pg_dbt.size = bk->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ case P_IBTREE:
+ /*
+ * The following code guarantees that the left-most key on an
+ * internal page at any place in the tree sorts less than any
+ * user-specified key. The reason is that if we have reached
+ * this internal page, we know the user key must sort greater
+ * than the key we're storing for this page in any internal
+ * pages at levels above us in the tree. It then follows that
+ * any user-specified key cannot sort less than the first page
+ * which we reference, and so there's no reason to call the
+ * comparison routine. While this may save us a comparison
+ * routine call or two, the real reason for this is because
+ * we don't maintain a copy of the smallest key in the tree,
+ * so that we don't have to update all the levels of the tree
+ * should the application store a new smallest key. And, so,
+ * we may not have a key to compare, which makes doing the
+ * comparison difficult and error prone.
+ */
+ if (indx == 0) {
+ *cmpp = 1;
+ return (0);
+ }
+
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ bo = (BOVERFLOW *)(bi->data);
+ else {
+ pg_dbt.app_data = NULL;
+ pg_dbt.data = bi->data;
+ pg_dbt.size = bi->len;
+ *cmpp = func(dbp, dbt, &pg_dbt);
+ return (0);
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+
+ /*
+ * Overflow.
+ */
+ return (__db_moff(dbc, dbt, bo->pgno, bo->tlen,
+ func == __bam_defcmp ? NULL : func, cmpp));
+}
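+
+/*
+ * A typical call, mirroring __bam_compact_isdone() in bt_compact.c:
+ * compare a caller-supplied key against the 0'th entry on a leaf page
+ * using the tree's comparison function.
+ *
+ *	if ((ret = __bam_cmp(dbc, stop, pg, 0, t->bt_compare, &cmp)) != 0)
+ *		return (ret);
+ *	done = cmp <= 0;
+ */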
+
+/*
+ * __bam_defcmp --
+ * Default comparison routine.
+ *
+ * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_defcmp(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+ * Returns:
+ * < 0 if a is < b
+ * = 0 if a is = b
+ * > 0 if a is > b
+ *
+ * XXX
+ * If a size_t doesn't fit into a long, or if the difference between
+ * any two characters doesn't fit into an int, this routine can lose.
+ * What we need is a signed integral type that's guaranteed to be at
+ * least as large as a size_t, and there is no such thing.
+ */
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
+ if (*p1 != *p2)
+ return ((long)*p1 - (long)*p2);
+ return ((long)a->size - (long)b->size);
+}
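+
+/*
+ * A minimal illustration of the ordering produced above: "abc" sorts
+ * before "abd" because the first differing bytes compare that way, and
+ * a key that is a proper prefix of another sorts first because the
+ * shorter size breaks the tie.
+ *
+ *	DBT a, b;
+ *
+ *	memset(&a, 0, sizeof(a));
+ *	memset(&b, 0, sizeof(b));
+ *	a.data = "ab";  a.size = 2;
+ *	b.data = "abc"; b.size = 3;
+ *	assert(__bam_defcmp(NULL, &a, &b) < 0);
+ */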
+
+/*
+ * __bam_defpfx --
+ * Default prefix routine.
+ *
+ * PUBLIC: size_t __bam_defpfx __P((DB *, const DBT *, const DBT *));
+ */
+size_t
+__bam_defpfx(dbp, a, b)
+ DB *dbp;
+ const DBT *a, *b;
+{
+ size_t cnt, len;
+ u_int8_t *p1, *p2;
+
+ COMPQUIET(dbp, NULL);
+
+ cnt = 1;
+ len = a->size > b->size ? b->size : a->size;
+ for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt)
+ if (*p1 != *p2)
+ return (cnt);
+
+ /*
+ * They match up to the smaller of the two sizes.
+ * Collate the longer after the shorter.
+ */
+ if (a->size < b->size)
+ return (a->size + 1);
+ if (b->size < a->size)
+ return (b->size + 1);
+ return (b->size);
+}
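+
+/*
+ * Worked examples of the count returned above: for a = "abc" and
+ * b = "abd" the bytes differ at the third position, so 3 bytes of b
+ * are needed to distinguish the keys; for a = "ab" and b = "abc" the
+ * keys match through a's length, so a->size + 1 == 3 bytes suffice.
+ */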
diff --git a/btree/bt_compress.c b/btree/bt_compress.c
new file mode 100644
index 0000000..bdf1e17
--- /dev/null
+++ b/btree/bt_compress.c
@@ -0,0 +1,3024 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+
+#ifdef HAVE_COMPRESSION
+
+static int __bam_compress_marshal_data __P((DB *, const DBT *, DBT *));
+static int __bam_compress_set_dbt __P((DB *, DBT *, const void *, u_int32_t));
+static int __bamc_compress_del_and_get_next __P((DBC *, DBT *, DBT *));
+static int __bamc_compress_get_bothc __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_multiple_key __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_multiple __P((DBC *, DBT *, DBT *,u_int32_t));
+static int __bamc_compress_get_next __P((DBC *, u_int32_t));
+static int __bamc_compress_get_next_dup __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_get_next_nodup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev_dup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_prev_nodup __P((DBC *, u_int32_t));
+static int __bamc_compress_get_set __P((DBC *,
+ DBT *, DBT *, u_int32_t, u_int32_t));
+static int __bamc_compress_ibulk_del __P((DBC *, DBT *, u_int32_t));
+static int __bamc_compress_idel __P((DBC *, u_int32_t));
+static int __bamc_compress_iget __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __bamc_compress_iput __P((DBC *, DBT *, DBT *, u_int32_t));
+static int __bamc_compress_relocate __P((DBC *));
+static void __bamc_compress_reset __P((DBC *));
+static int __bamc_compress_seek __P((DBC *,
+ const DBT *, const DBT *, u_int32_t));
+static int __bamc_compress_store __P((DBC *,
+ DBT *, DBT*, DBT **, DBT **, DBT *, DBT *));
+static int __bamc_next_decompress __P((DBC *));
+static int __bamc_start_decompress __P((DBC *));
+
+/*
+ * Call __dbc_iget(), resizing DBTs if DB_BUFFER_SMALL is returned.
+ * We're always using a transient cursor when this macro is used, so
+ * we have to replace the OP with DB_CURRENT when we retry.
+ */
+#define CMP_IGET_RETRY(ret, dbc, dbt1, dbt2, flags) do { \
+ DB_ASSERT((dbc)->env, F_ISSET((dbt1), DB_DBT_USERMEM)); \
+ DB_ASSERT((dbc)->env, F_ISSET((dbt2), DB_DBT_USERMEM)); \
+ if (((ret) =__dbc_iget((dbc), \
+ (dbt1), (dbt2), (flags))) == DB_BUFFER_SMALL) { \
+ if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt1))) != 0) \
+ break; \
+ if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt2))) != 0) \
+ break; \
+ (ret) = __dbc_iget((dbc), (dbt1), (dbt2), \
+ ((flags) & ~DB_OPFLAGS_MASK) | DB_CURRENT); \
+ } \
+} while (0)
+
+#define CMP_INIT_DBT(dbt) do { \
+ (dbt)->data = NULL; \
+ (dbt)->size = 0; \
+ (dbt)->ulen = 0; \
+ (dbt)->doff = 0; \
+ (dbt)->dlen = 0; \
+ (dbt)->flags = DB_DBT_USERMEM; \
+ (dbt)->app_data = NULL; \
+} while (0)
+
+#define CMP_FREE_DBT(env, dbt) do { \
+ DB_ASSERT((env), F_ISSET((dbt), DB_DBT_USERMEM)); \
+ __os_free((env), (dbt)->data); \
+} while (0)
+
+#define CMP_RESIZE_DBT(ret, env, dbt) \
+ (((dbt)->size > (dbt)->ulen) ? \
+ ((((ret) = __os_realloc((env), (dbt)->size, &(dbt)->data)) \
+ != 0) ? (ret) : (((dbt)->ulen = (dbt)->size), 0)) : 0)
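+
+/*
+ * The pattern used throughout this file with CMP_RESIZE_DBT (a sketch;
+ * "needed" stands for the required byte count): set the size wanted,
+ * then grow the DB_DBT_USERMEM buffer to match, propagating any
+ * allocation error.
+ *
+ *	dbt->size = needed;
+ *	if (CMP_RESIZE_DBT(ret, env, dbt) != 0)
+ *		return (ret);
+ */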
+
+static int
+__bam_compress_set_dbt(dbp, dbt, data, size)
+ DB *dbp;
+ DBT *dbt;
+ const void *data;
+ u_int32_t size;
+{
+ int ret;
+
+ ret = 0;
+ DB_ASSERT(dbp->env, F_ISSET(dbt, DB_DBT_USERMEM));
+
+ dbt->size = size;
+ if (CMP_RESIZE_DBT(ret, dbp->env, dbt) != 0)
+ return (ret);
+
+ memcpy(dbt->data, data, size);
+ return (0);
+}
+
+/******************************************************************************/
+
+/*
+ * Very simple key/data stream to give __bamc_compress_merge_insert()
+ * a source of data to work on.
+ */
+struct __bam_compress_stream;
+typedef struct __bam_compress_stream BTREE_COMPRESS_STREAM;
+struct __bam_compress_stream
+{
+ int (*next)(BTREE_COMPRESS_STREAM *, DBT *, DBT *);
+
+ void *kptr, *dptr;
+ DBT *key, *data;
+};
+
+/*
+ * These function prototypes cannot go at the beginning because they rely
+ * on BTREE_COMPRESS_STREAM, defined above.
+ * The prototypes are required to avoid the Microsoft C++ compiler generating
+ * warnings about mismatching parameter lists.
+ */
+static int __bam_cs_next_done __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_keyonly_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single_keyonly
+ __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_key_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_key __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_multiple_keyonly_next
+ __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_keyonly
+ __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bamc_compress_merge_insert
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *, u_int32_t));
+static int __bamc_compress_merge_delete
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+static int __bamc_compress_merge_delete_dups
+ __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+
+/* BTREE_COMPRESS_STREAM->next() for when the data has finished. */
+static int
+__bam_cs_next_done(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ COMPQUIET(stream, NULL);
+ COMPQUIET(key, NULL);
+ COMPQUIET(data, NULL);
+ return (0);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key/data pair. */
+static int
+__bam_cs_single_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ key->data = stream->key->data;
+ key->size = stream->key->size;
+ data->data = stream->data->data;
+ data->size = stream->data->size;
+ stream->next = __bam_cs_next_done;
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key/data pair. */
+static void
+__bam_cs_create_single(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ stream->next = __bam_cs_single_next;
+ stream->key = key;
+ stream->data = data;
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key. */
+static int
+__bam_cs_single_keyonly_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ key->data = stream->key->data;
+ key->size = stream->key->size;
+ if (data != NULL) {
+ data->data = NULL;
+ data->size = 0;
+ }
+ stream->next = __bam_cs_next_done;
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key. */
+static void
+__bam_cs_create_single_keyonly(stream, key)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key;
+{
+ stream->next = __bam_cs_single_keyonly_next;
+ stream->key = key;
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE_KEY
+ * format.
+ */
+static int
+__bam_cs_multiple_key_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_KEY_NEXT(stream->kptr, stream->key, key->data, key->size,
+ data->data, data->size);
+ if (key->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ return (1);
+}
+
+/*
+ * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE_KEY
+ * format.
+ */
+static void
+__bam_cs_create_multiple_key(stream, multiple)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *multiple;
+{
+ stream->next = __bam_cs_multiple_key_next;
+ stream->key = multiple;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for two buffers in the DB_MULTIPLE format. */
+static int
+__bam_cs_multiple_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size);
+ DB_MULTIPLE_NEXT(stream->dptr, stream->data, data->data, data->size);
+ if (key->data == NULL || data->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for two buffers in the DB_MULTIPLE format. */
+static void
+__bam_cs_create_multiple(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ stream->next = __bam_cs_multiple_next;
+ stream->key = key;
+ stream->data = data;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+ DB_MULTIPLE_INIT(stream->dptr, stream->data);
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE
+ * format.
+ */
+static int
+__bam_cs_multiple_keyonly_next(stream, key, data)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key, *data;
+{
+ DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size);
+ if (key->data == NULL) {
+ stream->next = __bam_cs_next_done;
+ return (0);
+ }
+ if (data != NULL) {
+ data->data = NULL;
+ data->size = 0;
+ }
+ return (1);
+}
+
+/*
+ * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE
+ * format.
+ */
+static void
+__bam_cs_create_multiple_keyonly(stream, key)
+ BTREE_COMPRESS_STREAM *stream;
+ DBT *key;
+{
+ stream->next = __bam_cs_multiple_keyonly_next;
+ stream->key = key;
+ DB_MULTIPLE_INIT(stream->kptr, stream->key);
+}
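+
+/*
+ * Usage sketch (the caller shown is hypothetical): wrap a single
+ * key/data pair in a stream and hand it to the merge routine; the
+ * flags argument follows the caller's put flags.
+ *
+ *	BTREE_COMPRESS_STREAM stream;
+ *
+ *	__bam_cs_create_single(&stream, &key, &data);
+ *	ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+ */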
+
+/******************************************************************************/
+
+/*
+ * Marshal data in initial data format into destbuf, resizing destbuf if
+ * necessary.
+ */
+static int
+__bam_compress_marshal_data(dbp, data, destbuf)
+ DB *dbp;
+ const DBT *data;
+ DBT *destbuf;
+{
+ int ret;
+ u_int8_t *ptr;
+
+ ret = 0;
+ DB_ASSERT(dbp->env, F_ISSET(destbuf, DB_DBT_USERMEM));
+
+ destbuf->size = __db_compress_count_int(data->size);
+ destbuf->size += data->size;
+ if (CMP_RESIZE_DBT(ret, dbp->env, destbuf) != 0)
+ return (ret);
+
+ ptr = (u_int8_t*)destbuf->data;
+ ptr += __db_compress_int(ptr, data->size);
+ memcpy(ptr, data->data, data->size);
+
+ return (0);
+}
+
+/*
+ * Unmarshal initial data from src into dest -- does not copy; dest points
+ * into src.
+ */
+#define CMP_UNMARSHAL_DATA(src, dest) do { \
+ (dest)->data = ((u_int8_t*)(src)->data) + \
+ __db_decompress_int32((u_int8_t*)(src)->data, \
+ &(dest)->size); \
+} while (0)
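+
+/*
+ * The marshaled layout this macro reads (written by
+ * __bam_compress_marshal_data above):
+ *
+ *	+-----------------+----------------------+
+ *	| size (varint)   | data->size raw bytes |
+ *	+-----------------+----------------------+
+ */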
+
+/******************************************************************************/
+
+/*
+ * __bam_compress_dupcmp --
+ * Duplicate comparison function for compressed BTrees.
+ *
+ * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *));
+ */
+int
+__bam_compress_dupcmp(db, a, b)
+ DB *db;
+ const DBT *a;
+ const DBT *b;
+{
+ DBT dcmp_a, dcmp_b;
+
+ /* Decompress the initial data in a */
+ CMP_UNMARSHAL_DATA(a, &dcmp_a);
+ dcmp_a.ulen = 0;
+ dcmp_a.doff = 0;
+ dcmp_a.dlen = 0;
+ dcmp_a.flags = 0;
+ dcmp_a.app_data = 0;
+
+ /* Decompress the initial data in b */
+ CMP_UNMARSHAL_DATA(b, &dcmp_b);
+ dcmp_b.ulen = 0;
+ dcmp_b.doff = 0;
+ dcmp_b.dlen = 0;
+ dcmp_b.flags = 0;
+ dcmp_b.app_data = 0;
+
+ /* Call the user's duplicate compare function */
+ return ((BTREE *)db->bt_internal)->
+ compress_dup_compare(db, &dcmp_a, &dcmp_b);
+}
+
+/*
+ * __bam_defcompress --
+ * Default compression routine.
+ *
+ * PUBLIC: int __bam_defcompress __P((DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, DBT *));
+ */
+int
+__bam_defcompress(dbp, prevKey, prevData, key, data, dest)
+ DB *dbp;
+ const DBT *prevKey, *prevData, *key, *data;
+ DBT *dest;
+{
+ u_int8_t *ptr;
+ const u_int8_t *k, *p;
+ size_t len, prefix, suffix;
+
+ COMPQUIET(dbp, NULL);
+
+ k = (const u_int8_t*)key->data;
+ p = (const u_int8_t*)prevKey->data;
+ len = key->size > prevKey->size ? prevKey->size : key->size;
+ for (; len-- && *k == *p; ++k, ++p)
+ continue;
+
+ prefix = (size_t)(k - (u_int8_t*)key->data);
+ suffix = key->size - prefix;
+
+ if (prefix == prevKey->size && suffix == 0) {
+ /* It's a duplicate - do prefix compression on the value */
+ k = (const u_int8_t*)data->data;
+ p = (const u_int8_t*)prevData->data;
+ len = data->size > prevData->size ? prevData->size : data->size;
+ for (; len-- && *k == *p; ++k, ++p)
+ continue;
+
+ prefix = (size_t)(k - (u_int8_t*)data->data);
+ suffix = data->size - prefix;
+
+ /* Check that we have enough space in dest */
+ dest->size = (u_int32_t)(1 + __db_compress_count_int(prefix) +
+ __db_compress_count_int(suffix) + suffix);
+ if (dest->size > dest->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Magic identifying byte */
+ ptr = (u_int8_t*)dest->data;
+ *ptr = CMP_INT_SPARE_VAL;
+ ++ptr;
+
+ /* prefix length */
+ ptr += __db_compress_int(ptr, prefix);
+
+ /* suffix length */
+ ptr += __db_compress_int(ptr, suffix);
+
+ /* suffix */
+ memcpy(ptr, k, suffix);
+
+ return (0);
+ }
+
+ /* Check that we have enough space in dest */
+ dest->size = (u_int32_t)(__db_compress_count_int(prefix) +
+ __db_compress_count_int(suffix) +
+ __db_compress_count_int(data->size) + suffix + data->size);
+ if (dest->size > dest->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* prefix length */
+ ptr = (u_int8_t*)dest->data;
+ ptr += __db_compress_int(ptr, prefix);
+
+ /* suffix length */
+ ptr += __db_compress_int(ptr, suffix);
+
+ /* data length */
+ ptr += __db_compress_int(ptr, data->size);
+
+ /* suffix */
+ memcpy(ptr, k, suffix);
+ ptr += suffix;
+
+ /* data */
+ memcpy(ptr, data->data, data->size);
+
+ return (0);
+}
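+
+/*
+ * Summary of the two encodings produced above, where [n] is a
+ * variable-length integer written by __db_compress_int:
+ *
+ *	new key:	[key prefix][key suffix][data size]
+ *			[key suffix bytes][data bytes]
+ *	duplicate:	[CMP_INT_SPARE_VAL][data prefix][data suffix]
+ *			[data suffix bytes]
+ */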
+
+/*
+ * __bam_defdecompress --
+ * Default decompression routine.
+ *
+ * PUBLIC: int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *,
+ * PUBLIC: DBT *, DBT *));
+ */
+int
+__bam_defdecompress(dbp, prevKey, prevData, compressed, destKey, destData)
+ DB *dbp;
+ const DBT *prevKey, *prevData;
+ DBT *compressed, *destKey, *destData;
+{
+ u_int8_t *s, *d;
+ u_int32_t prefix, suffix, size;
+
+ COMPQUIET(dbp, NULL);
+
+ /*
+ * Check for the magic identifying byte, that tells us that this is a
+ * compressed duplicate value.
+ */
+ s = (u_int8_t*)compressed->data;
+ if (*s == CMP_INT_SPARE_VAL) {
+ ++s;
+ size = 1;
+
+ /* Unmarshal prefix and suffix */
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &prefix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &suffix);
+
+ /* Check destination lengths */
+ destKey->size = prevKey->size;
+ destData->size = prefix + suffix;
+ if (destKey->size > destKey->ulen ||
+ destData->size > destData->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Write the key */
+ memcpy(destKey->data, prevKey->data, destKey->size);
+
+ /* Write the prefix */
+ if (prefix > prevData->size)
+ return (EINVAL);
+ d = (u_int8_t*)destData->data;
+ memcpy(d, prevData->data, prefix);
+ d += prefix;
+
+ /* Write the suffix */
+ size += suffix;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(d, s, suffix);
+ s += suffix;
+
+ /* Return bytes read */
+ compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data);
+ return (0);
+ }
+
+ /* Unmarshal prefix, suffix and data length */
+ size = __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &prefix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &suffix);
+
+ size += __db_decompress_count_int(s);
+ if (size > compressed->size)
+ return (EINVAL);
+ s += __db_decompress_int32(s, &destData->size);
+
+ /* Check destination lengths */
+ destKey->size = prefix + suffix;
+ if (destKey->size > destKey->ulen || destData->size > destData->ulen)
+ return (DB_BUFFER_SMALL);
+
+ /* Write the prefix */
+ if (prefix > prevKey->size)
+ return (EINVAL);
+ d = (u_int8_t*)destKey->data;
+ memcpy(d, prevKey->data, prefix);
+ d += prefix;
+
+ /* Write the suffix */
+ size += suffix;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(d, s, suffix);
+ s += suffix;
+
+ /* Write the data */
+ size += destData->size;
+ if (size > compressed->size)
+ return (EINVAL);
+ memcpy(destData->data, s, destData->size);
+ s += destData->size;
+
+ /* Return bytes read */
+ compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data);
+ return (0);
+}
+
+/******************************************************************************/
+
+/*
+ * Set dbc up to start decompressing the compressed key/data pair, dbc->key1
+ * and dbc->compressed.
+ */
+static int
+__bamc_start_decompress(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+ u_int32_t datasize;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cp->prevKey = NULL;
+ cp->prevData = NULL;
+ cp->currentKey = &cp->key1;
+ cp->currentData = &cp->data1;
+ cp->compcursor = (u_int8_t*)cp->compressed.data;
+ cp->compend = cp->compcursor + cp->compressed.size;
+ cp->prevcursor = NULL;
+ cp->prev2cursor = NULL;
+
+ /* Unmarshal the first data */
+ cp->compcursor += __db_decompress_int32(cp->compcursor, &datasize);
+ ret = __bam_compress_set_dbt(dbc->dbp,
+ cp->currentData, cp->compcursor, datasize);
+
+ if (ret == 0)
+ cp->compcursor += datasize;
+ return (ret);
+}
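+
+/*
+ * Chunk layout assumed by the routine above (an inference from the
+ * reads here and in CMP_IGET_RETRY): the btree key of a chunk is its
+ * first key (cp->key1, filled in by the caller's read), and the btree
+ * data (cp->compressed) starts with the first data item in marshaled
+ * form, followed by the compressed key/data pairs that
+ * __bamc_next_decompress() walks.
+ */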
+
+/* Decompress the next key/data pair from dbc->compressed. */
+static int
+__bamc_next_decompress(dbc)
+ DBC *dbc;
+{
+ DBT compressed;
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *db;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ db = dbc->dbp;
+
+ if (cp->compcursor >= cp->compend)
+ return (DB_NOTFOUND);
+
+ cp->prevKey = cp->currentKey;
+ cp->prevData = cp->currentData;
+ cp->prev2cursor = cp->prevcursor;
+ cp->prevcursor = cp->compcursor;
+
+ if (cp->currentKey == &cp->key1) {
+ cp->currentKey = &cp->key2;
+ cp->currentData = &cp->data2;
+ } else {
+ cp->currentKey = &cp->key1;
+ cp->currentData = &cp->data1;
+ }
+
+ compressed.flags = DB_DBT_USERMEM;
+ compressed.data = (void*)cp->compcursor;
+ compressed.ulen = compressed.size =
+ (u_int32_t)(cp->compend - cp->compcursor);
+ compressed.app_data = NULL;
+
+ while ((ret = ((BTREE *)db->bt_internal)->bt_decompress(db,
+ cp->prevKey, cp->prevData, &compressed,
+ cp->currentKey, cp->currentData)) == DB_BUFFER_SMALL) {
+ if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentKey) != 0)
+ break;
+ if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentData) != 0)
+ break;
+ }
+
+ if (ret == 0)
+ cp->compcursor += compressed.size;
+ return (ret);
+}
+
+/*
+ * Store key and data into destkey and destbuf, using the compression
+ * callback given.
+ */
+static int
+__bamc_compress_store(dbc, key, data, prevKey, prevData, destkey, destbuf)
+ DBC *dbc;
+ DBT *key, *data;
+ DBT **prevKey, **prevData;
+ DBT *destkey, *destbuf;
+{
+ int ret;
+ DBT dest;
+
+	if (*prevKey == NULL) {
+ if ((ret = __bam_compress_set_dbt(dbc->dbp,
+ destkey, key->data, key->size)) != 0)
+ return (ret);
+
+ /* Marshal data - resize if it won't fit */
+ ret = __bam_compress_marshal_data(dbc->dbp, data, destbuf);
+
+ } else if (((BTREE_CURSOR *)dbc->internal)->ovflsize > destbuf->size) {
+ /*
+ * Don't write more than cp->ovflsize bytes to the destination
+ * buffer - destbuf must be at least cp->ovflsize in size.
+ */
+ dest.flags = DB_DBT_USERMEM;
+ dest.data = (u_int8_t*)destbuf->data + destbuf->size;
+ dest.ulen =
+ ((BTREE_CURSOR *)dbc->internal)->ovflsize - destbuf->size;
+ dest.size = 0;
+ dest.app_data = NULL;
+
+ ret = ((BTREE *)dbc->dbp->bt_internal)->bt_compress(
+ dbc->dbp, *prevKey, *prevData, key, data, &dest);
+
+ if (ret == 0)
+ destbuf->size += dest.size;
+ } else
+ ret = DB_BUFFER_SMALL;
+
+ if (ret == 0) {
+ *prevKey = key;
+ *prevData = data;
+ }
+
+ return (ret);
+}
+
+/*
+ * Move dbc->dbc to the correct position to start linear searching for
+ * seek_key/seek_data - the biggest key smaller than or equal to
+ * seek_key/seek_data.
+ */
+static int
+__bamc_compress_seek(dbc, seek_key, seek_data, flags)
+ DBC *dbc;
+ const DBT *seek_key;
+ const DBT *seek_data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t method;
+ DB *dbp;
+ BTREE_CURSOR *cp;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if ((ret = __bam_compress_set_dbt(
+ dbp, &cp->key1, seek_key->data, seek_key->size)) != 0)
+ return (ret);
+
+ /*
+	 * We allow seek_data to be NULL for __bamc_compress_get_set()
+	 * with DB_SET.
+ */
+ if (F_ISSET(dbp, DB_AM_DUPSORT) && seek_data != NULL) {
+ if ((ret = __bam_compress_marshal_data(
+ dbp, seek_data, &cp->compressed)) != 0)
+ return (ret);
+
+ method = DB_GET_BOTH_LTE;
+ } else
+ method = DB_SET_LTE;
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, method | flags);
+
+ if (ret == 0 &&
+ F_ISSET(dbp, DB_AM_DUPSORT) && seek_data == NULL &&
+ __db_compare_both(dbp, seek_key, 0, &cp->key1, 0) == 0) {
+ /*
+ * Some entries for seek_key might be in the previous chunk,
+ * so we need to start searching there.
+ */
+ CMP_IGET_RETRY(ret,
+ dbc, &cp->key1, &cp->compressed, DB_PREV | flags);
+ if (ret == DB_NOTFOUND) {
+ /* No previous, we must need the first entry */
+ CMP_IGET_RETRY(ret,
+ dbc, &cp->key1, &cp->compressed, DB_FIRST | flags);
+ }
+ }
+
+ return (ret);
+}
+
+/* Reset the cursor to an uninitialized state */
+static void
+__bamc_compress_reset(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->currentKey = 0;
+ cp->currentData = 0;
+ cp->compcursor = 0;
+ cp->compend = 0;
+ cp->prevcursor = 0;
+ cp->prev2cursor = 0;
+
+ F_CLR(cp, C_COMPRESS_DELETED|C_COMPRESS_MODIFIED);
+}
+
+/*
+ * Duplicate the cursor and delete the current entry, move the original cursor
+ * on and then close the cursor we used to delete. We do that to make sure that
+ * the close method runs __bamc_physdel(), and actually gets rid of the deleted
+ * entry!
+ */
+static int
+__bamc_compress_del_and_get_next(dbc, nextk, nextc)
+ DBC *dbc;
+ DBT *nextk, *nextc;
+{
+ int ret, ret_n;
+ DBC *dbc_n;
+
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION | DB_SHALLOW_DUP)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ if ((ret = __dbc_idel(dbc_n, 0)) != 0)
+ goto err;
+
+ /* Read the next position */
+ CMP_IGET_RETRY(ret, dbc, nextk, nextc, DB_NEXT);
+
+ err:
+ if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = ret_n;
+
+ /* No need to relocate this cursor */
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+
+ return (ret);
+}
+
+/*
+ * Duplicate the cursor, re-locate the position that this cursor pointed to
+ * using the duplicate (it may have been deleted), and then swap
+ * the cursors. We do that to make sure that the close method runs
+ * __bamc_physdel(), and gets rid of the entry that may have been deleted.
+ */
+static int
+__bamc_compress_relocate(dbc)
+ DBC *dbc;
+{
+ int ret, t_ret;
+ BTREE_CURSOR *cp, *cp_n;
+ DBC *dbc_n;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ /* Find the position after the deleted entry again */
+ ret = __bamc_compress_get_set(
+ dbc_n, &cp->del_key, &cp->del_data, 0, 0);
+ if (ret == DB_NOTFOUND) {
+ __bamc_compress_reset(dbc_n);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+
+ F_SET(cp_n, C_COMPRESS_DELETED);
+
+ } else if (cp->currentKey != NULL) {
+ /* Find the current entry again */
+ ret = __bamc_compress_get_set(
+ dbc_n, cp->currentKey, cp->currentData,
+ F_ISSET(dbc->dbp, DB_AM_DUPSORT) ? DB_GET_BOTH : DB_SET, 0);
+
+ if (ret == DB_NOTFOUND) {
+ /* The current entry has been deleted */
+ if ((ret = __bam_compress_set_dbt(dbc_n->dbp,
+ &cp_n->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbc_n->dbp,
+ &cp_n->del_data, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ return (ret);
+ F_SET(cp_n, C_COMPRESS_DELETED);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+ }
+
+ err:
+ /* Cleanup and cursor resolution. This also clears the
+ C_COMPRESS_MODIFIED flag. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/******************************************************************************/
+
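+/*
+ * CMP_STORE adds a key/data pair to the chunk being assembled in destbuf.
+ * When __bamc_compress_store() reports DB_BUFFER_SMALL, the finished chunk
+ * is written out with __dbc_iput() and the buffer is reset for a new chunk.
+ */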
+#define CMP_STORE(key, data) do { \
+ while ((ret = __bamc_compress_store(dbc, (key), (data), \
+ &prevDestKey, &prevDestData, &destkey, &destbuf)) \
+ == DB_BUFFER_SMALL) { \
+ if ((ret = __dbc_iput(dbc, \
+ &destkey, &destbuf, DB_KEYLAST)) != 0) \
+ goto end; \
+ prevDestKey = NULL; \
+ prevDestData = NULL; \
+ destbuf.size = 0; \
+ } \
+} while (0)
+
+/* Merge the sorted key/data pairs from stream into the compressed database. */
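+/*
+ * This is a merge of two sorted sequences: the input stream and the
+ * entries already stored in the compressed chunks.  For each input entry
+ * we seek to the chunk that would contain it, delete that chunk, and
+ * re-emit its entries interleaved with the stream entries in sorted
+ * order, flushing each output chunk as it fills.
+ */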
+static int
+__bamc_compress_merge_insert(dbc, stream, countp, flags)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+ u_int32_t flags;
+{
+ DBT ikey1, ikey2, idata1, idata2, nextk, nextc, nextd, destkey, destbuf;
+ DBT *ikey, *idata, *prevIkey, *prevIdata, *prevDestKey, *prevDestData;
+ int ret, bulk_ret, cmp, nextExists, moreCompressed, iSmallEnough;
+ int moreStream;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey1, 0, sizeof(DBT));
+ memset(&ikey2, 0, sizeof(DBT));
+ memset(&idata1, 0, sizeof(DBT));
+ memset(&idata2, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+ CMP_INIT_DBT(&nextc);
+ memset(&nextd, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ prevIkey = NULL;
+ prevIdata = NULL;
+ ikey = &ikey1;
+ idata = &idata1;
+ if (stream->next(stream, ikey, idata) == 0)
+ goto end;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ while (moreStream != 0) {
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /* Seek the ikey/idata position */
+ ret = __bamc_compress_seek(dbc, ikey, idata, 0);
+ if (ret == 0) {
+ /*
+ * Delete the key - we might overwrite it below
+ * but it's safer to just always delete it, and it
+ * doesn't seem significantly slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk,
+ &nextc);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ } else
+ goto end;
+ ret = __bamc_start_decompress(dbc);
+ } else if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+
+ /* Read the next position */
+ CMP_IGET_RETRY(ret, dbc, &nextk, &nextc, DB_FIRST);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ }
+ }
+
+ if (ret != 0)
+ goto end;
+
+ /* !nextExists || ikey/idata < nextk/nextd */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0 || iSmallEnough != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(dbp, cp->currentKey,
+ cp->currentData, ikey, idata);
+
+ if (cmp < 0) {
+store_current: CMP_STORE(cp->currentKey, cp->currentData);
+ if (ret != 0)
+ goto end;
+ } else {
+ switch (flags) {
+ case DB_KEYLAST:
+ case DB_KEYFIRST:
+ case DB_NODUPDATA:
+ if (cmp == 0 && bulk_ret == 0 &&
+ F_ISSET(dbp, DB_AM_DUPSORT)) {
+ bulk_ret = __db_duperr(dbp,
+ flags);
+
+ /*
+ * Continue until we store
+ * the current chunk,
+ * but don't insert any
+ * more entries.
+ */
+ moreStream = 0;
+ iSmallEnough = 0;
+
+ goto store_current;
+ }
+ break;
+ default:
+ break;
+ }
+
+ CMP_STORE(ikey, idata);
+ if (ret != 0)
+ goto end;
+ ++chunk_count;
+
+ /*
+ * prevDestKey/prevDestData now point to
+ * the same DBTs as ikey/idata. We don't
+ * want to overwrite them, so swap them
+ * to point to the other DBTs.
+ */
+ if (ikey == &ikey1) {
+ ikey = &ikey2;
+ idata = &idata2;
+ prevIkey = &ikey1;
+ prevIdata = &idata1;
+ } else {
+ ikey = &ikey1;
+ idata = &idata1;
+ prevIkey = &ikey2;
+ prevIdata = &idata2;
+ }
+
+ do {
+ /* Get the next input key and data */
+ if (stream->next(
+ stream, ikey, idata) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ break;
+ }
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, __db_compare_both(dbp,
+ ikey, idata, prevIkey,
+ prevIdata) >= 0);
+#endif
+
+ /* Check for duplicates in the stream */
+ } while (__db_compare_both(dbp, ikey, idata,
+ prevIkey, prevIdata) == 0);
+
+ /*
+ * Check that !nextExists ||
+ * ikey/idata < nextk/nextd
+ */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp, ikey,
+ idata, &nextk, &nextd) >= 0)
+ iSmallEnough = 0;
+ }
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ if ((ret = __dbc_iput(
+ dbc, &destkey, &destbuf, DB_KEYLAST)) != 0)
+ goto end;
+
+ if (countp != NULL)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &nextk);
+ CMP_FREE_DBT(env, &nextc);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/******************************************************************************/
+
+/* Remove the sorted key/data pairs in stream from the compressed database. */
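+/*
+ * This mirrors the merge-insert loop: each chunk containing a stream
+ * entry is deleted and its surviving entries are written back; bulk_ret
+ * is set to DB_NOTFOUND if a requested key/data pair was not present.
+ */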
+static int
+__bamc_compress_merge_delete(dbc, stream, countp)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+{
+ DBT ikey, idata, nextk, nextc, nextd, destkey, destbuf, pdestkey;
+ DBT pdestdata;
+#ifdef DIAGNOSTIC
+ DBT pikey, pidata;
+#endif
+ DBT *prevDestKey, *prevDestData;
+ int ret, bulk_ret, cmp, moreCompressed, moreStream, nextExists;
+ int iSmallEnough;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey, 0, sizeof(DBT));
+ memset(&idata, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+ CMP_INIT_DBT(&nextc);
+ memset(&nextd, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&pdestkey);
+ CMP_INIT_DBT(&pdestdata);
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ if (stream->next(stream, &ikey, &idata) == 0)
+ goto end;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ while (moreStream != 0) {
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /* Seek the ikey/idata position */
+ if ((ret = __bamc_compress_seek(dbc, &ikey, &idata, 0)) != 0)
+ goto end;
+
+ /*
+ * Delete the key - we might overwrite it below but it's safer
+ * to just always delete it, and it doesn't seem significantly
+ * slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk, &nextc);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret == 0) {
+ CMP_UNMARSHAL_DATA(&nextc, &nextd);
+ } else
+ goto end;
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ goto end;
+
+ /* !nextExists || ikey/idata < nextk/nextd */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0 || iSmallEnough != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(dbp, cp->currentKey,
+ cp->currentData, &ikey, &idata);
+
+ if (cmp < 0) {
+ if ((ret = __bamc_compress_store(dbc,
+ cp->currentKey, cp->currentData,
+ &prevDestKey, &prevDestData,
+ &destkey, &destbuf)) != 0)
+ goto end;
+
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestkey, cp->currentKey->data,
+ cp->currentKey->size)) != 0)
+ goto end;
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestdata, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ goto end;
+ prevDestKey = &pdestkey;
+ prevDestData = &pdestdata;
+ } else {
+ if (cmp != 0) {
+ /*
+ * Continue until we store the current
+ * chunk, but don't delete any more
+ * entries.
+ */
+ bulk_ret = DB_NOTFOUND;
+ moreStream = 0;
+ iSmallEnough = 0;
+ } else
+ ++chunk_count;
+
+#ifdef DIAGNOSTIC
+ pikey = ikey;
+ pidata = idata;
+#endif
+
+ /* Get the next input key and data */
+ if (stream->next(stream, &ikey, &idata) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ }
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, moreStream == 0 ||
+ __db_compare_both(dbp, &ikey, &idata,
+ &pikey, &pidata) >= 0);
+#endif
+
+ /*
+ * Check that !nextExists ||
+ * ikey/idata < nextk/nextd
+ */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp, &ikey,
+ &idata, &nextk, &nextd) >= 0)
+ iSmallEnough = 0;
+ }
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ if ((ret = __dbc_iput(
+ dbc, &destkey, &destbuf, DB_KEYLAST)) != 0)
+ goto end;
+
+ if (countp)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &pdestkey);
+ CMP_FREE_DBT(env, &pdestdata);
+ CMP_FREE_DBT(env, &nextk);
+ CMP_FREE_DBT(env, &nextc);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/*
+ * Remove the sorted keys in stream along with all duplicate values from
+ * the compressed database.
+ */
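+/*
+ * A key's duplicates may span several chunks, so after consuming a chunk
+ * the loop re-reads the cursor's position with DB_CURRENT and keeps
+ * deleting until the next chunk starts past the current stream key.
+ */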
+static int
+__bamc_compress_merge_delete_dups(dbc, stream, countp)
+ DBC *dbc;
+ BTREE_COMPRESS_STREAM *stream;
+ u_int32_t *countp;
+{
+ DBC *dbc_n;
+ DBT ikey, nextk, noread, destkey, destbuf, pdestkey, pdestdata;
+#ifdef DIAGNOSTIC
+ DBT pikey;
+#endif
+ DBT *prevDestKey, *prevDestData;
+ int ret, ret_n, bulk_ret, cmp, moreCompressed, moreStream, nextExists;
+ int iSmallEnough, ifound;
+ u_int32_t chunk_count;
+ ENV *env;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ bulk_ret = 0;
+
+ memset(&ikey, 0, sizeof(DBT));
+
+ CMP_INIT_DBT(&nextk);
+
+ memset(&noread, 0, sizeof(DBT));
+ noread.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ CMP_INIT_DBT(&pdestkey);
+ CMP_INIT_DBT(&pdestdata);
+
+ CMP_INIT_DBT(&destkey);
+ CMP_INIT_DBT(&destbuf);
+ if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0)
+ goto end;
+ destbuf.ulen = cp->ovflsize;
+
+ if (countp != NULL)
+ *countp = 0;
+ chunk_count = 0;
+
+ /* Get the first input key and data */
+ ret = 0;
+ if (stream->next(stream, &ikey, NULL) == 0)
+ goto end;
+ ifound = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+
+ moreStream = 1;
+ iSmallEnough = 0;
+ nextExists = 0;
+ while (moreStream != 0) {
+ if (iSmallEnough != 0) {
+ if (nextExists == 0) {
+ /*
+ * We've finished deleting the last key
+ * in the database
+ */
+ if (ifound == 0) {
+ bulk_ret = DB_NOTFOUND;
+ } else
+ ++chunk_count;
+ break;
+ }
+
+ /* Move to the next chunk */
+ CMP_IGET_RETRY(
+ ret, dbc, &cp->key1, &cp->compressed, DB_CURRENT);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ break;
+ } else if (ret != 0)
+ goto end;
+ } else
+ /* Seek the ikey position */
+ if ((ret =
+ __bamc_compress_seek(dbc, &ikey, NULL, 0)) != 0)
+ goto end;
+
+ nextExists = 1;
+ moreCompressed = 1;
+
+ /*
+ * Delete the key - we might overwrite it below but it's
+ * safer to just always delete it, and it doesn't seem
+ * significantly slower to do so.
+ */
+ ret = __bamc_compress_del_and_get_next(dbc, &nextk, &noread);
+ if (ret == DB_NOTFOUND) {
+ ret = 0;
+ nextExists = 0;
+ } else if (ret != 0)
+ goto end;
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ goto end;
+
+ /* !nextExists || ikey <= nextk */
+ iSmallEnough = 1;
+
+ while (moreCompressed != 0) {
+ if (moreCompressed == 0)
+ cmp = 1;
+ else if (iSmallEnough == 0)
+ cmp = -1;
+ else
+ cmp = __db_compare_both(
+ dbp, cp->currentKey, NULL, &ikey, NULL);
+
+ if (cmp < 0) {
+ if ((ret = __bamc_compress_store(dbc,
+ cp->currentKey, cp->currentData,
+ &prevDestKey,
+ &prevDestData, &destkey, &destbuf)) != 0)
+ goto end;
+
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestkey, cp->currentKey->data,
+ cp->currentKey->size)) != 0)
+ goto end;
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &pdestdata, cp->currentData->data,
+ cp->currentData->size)) != 0)
+ goto end;
+ prevDestKey = &pdestkey;
+ prevDestData = &pdestdata;
+ } else if (cmp > 0) {
+ if (ifound == 0) {
+ /*
+ * Continue until we store the
+ * current chunk, but don't delete
+ * any more entries.
+ */
+ bulk_ret = DB_NOTFOUND;
+ moreStream = 0;
+ iSmallEnough = 0;
+ } else
+ ++chunk_count;
+
+#ifdef DIAGNOSTIC
+ pikey = ikey;
+#endif
+
+ /* Get the next input key */
+ if (stream->next(stream, &ikey, NULL) == 0) {
+ moreStream = 0;
+ iSmallEnough = 0;
+ }
+ ifound = 0;
+
+#ifdef DIAGNOSTIC
+ /* Check that the stream is sorted */
+ DB_ASSERT(env, moreStream == 0 ||
+ __db_compare_both(dbp, &ikey, NULL,
+ &pikey, NULL) >= 0);
+#endif
+
+ /* Check that !nextExists || ikey <= nextk */
+ if (moreStream != 0 && nextExists != 0 &&
+ __db_compare_both(dbp,
+ &ikey, NULL, &nextk, NULL) > 0)
+ iSmallEnough = 0;
+ } else /* cmp == 0 */
+ ifound = 1;
+
+ if (cmp <= 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ moreCompressed = 0;
+ ret = 0;
+ } else if (ret != 0)
+ goto end;
+ }
+ }
+
+ if (prevDestKey != NULL) {
+ /*
+ * Do the DBC->put() with a duplicate cursor, so that
+ * the main cursor's position isn't changed - we might
+ * need it to be the same in order to use DB_CURRENT
+ * above.
+ */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ goto end;
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ ret = __dbc_iput(dbc_n, &destkey, &destbuf, DB_KEYLAST);
+
+ if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = ret_n;
+
+ if (ret != 0)
+ goto end;
+
+ if (countp)
+ *countp += chunk_count;
+ chunk_count = 0;
+
+ prevDestKey = NULL;
+ prevDestData = NULL;
+ destbuf.size = 0;
+ }
+ }
+
+ end:
+ CMP_FREE_DBT(env, &destkey);
+ CMP_FREE_DBT(env, &destbuf);
+ CMP_FREE_DBT(env, &pdestkey);
+ CMP_FREE_DBT(env, &pdestdata);
+ CMP_FREE_DBT(env, &nextk);
+
+ return (ret != 0 ? ret : bulk_ret);
+}
+
+/******************************************************************************/
+
+/* Implements DB_PREV and DB_LAST for __bamc_compress_get() */
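+/*
+ * Decompression is forward-only, so moving backwards means either using
+ * the single cached previous entry, or re-decompressing the chunk from
+ * its start up to the entry just before the current one.
+ */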
+static int
+__bamc_compress_get_prev(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t tofind;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ F_CLR(cp, C_COMPRESS_DELETED);
+
+ if (cp->prevKey != NULL) {
+ /* Return the stored previous key */
+ cp->currentKey = cp->prevKey;
+ cp->currentData = cp->prevData;
+ cp->compcursor = cp->prevcursor;
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->prevcursor = cp->prev2cursor;
+ cp->prev2cursor = 0;
+ } else {
+ if (cp->currentKey == NULL) {
+ /* No current key, so fetch the last key */
+ flags |= DB_LAST;
+ tofind = (u_int32_t)-1;
+ } else if (cp->prevcursor == 0) {
+ /*
+			 * The current key is at the beginning of the
+			 * compressed block, so get the last key from the
+			 * previous block.
+ */
+ flags |= DB_PREV;
+ tofind = (u_int32_t)-1;
+ } else {
+ /*
+ * We have to search for the previous key in the
+ * current block
+ */
+ flags |= DB_CURRENT;
+ tofind = (u_int32_t)
+ (cp->prevcursor - (u_int8_t*)cp->compressed.data);
+ }
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+ if (ret != 0)
+ return (ret);
+
+ /* Decompress until we reach tofind */
+ ret = __bamc_start_decompress(dbc);
+ while (ret == 0 && tofind > (u_int32_t)
+ (cp->compcursor - (u_int8_t*)cp->compressed.data)) {
+ ret = __bamc_next_decompress(dbc);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ }
+
+ return (ret);
+}
+
+/* Implements DB_PREV_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_dup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (EINVAL);
+
+	/*
+	 * If this is a deleted entry, del_key is already set, otherwise
+	 * we have to set it now.
+	 */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED)) {
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+ return (ret);
+
+ if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0)
+ return (DB_NOTFOUND);
+
+ return (0);
+}
+
+/* Implements DB_PREV_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_nodup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (__bamc_compress_get_prev(dbc, flags));
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise we
+ * have to set it now.
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED))
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /*
+ * Linear search for the next non-duplicate key - this is
+ * especially inefficient for DB_PREV_NODUP, since we have to
+	 * decompress from the beginning of the chunk to find previous
+ * key/data pairs. Instead we could check for key equality as we
+ * decompress.
+ */
+ do
+ if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+ return (ret);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+ return (0);
+}
+
+/* Implements DB_NEXT and DB_FIRST for __bamc_compress_get() */
+static int
+__bamc_compress_get_next(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+ F_CLR(cp, C_COMPRESS_DELETED);
+ return (0);
+ } else if (cp->currentKey) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ flags |= DB_NEXT;
+ } else
+ flags |= DB_FIRST;
+
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * Reset the cursor, so that
+ * __bamc_compress_get_multiple_key will end up pointing
+ * to the right place
+ */
+ __bamc_compress_reset(dbc);
+ return (DB_NOTFOUND);
+ } else if (ret != 0)
+ return (ret);
+
+ ret = __bamc_start_decompress(dbc);
+
+ return (ret);
+}
+
+/* Implements DB_NEXT_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_dup(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (EINVAL);
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+ /*
+ * Check that the next entry has the same key as the
+ * deleted entry.
+ */
+ if (cp->currentKey == 0)
+ return (DB_NOTFOUND);
+ F_CLR(cp, C_COMPRESS_DELETED);
+ return (t->bt_compare(dbp,
+ cp->currentKey, &cp->del_key) == 0 ? 0 : DB_NOTFOUND);
+ }
+
+ /* Check that the next entry has the same key as the previous entry */
+ ret = __bamc_next_decompress(dbc);
+ if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0)
+ return (DB_NOTFOUND);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ if (key == NULL) {
+ /* Copy the current key to del_key */
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+ key = &cp->del_key;
+ }
+
+ /* Fetch the next chunk */
+ CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, DB_NEXT | flags);
+ if (ret == DB_NOTFOUND) {
+ /*
+ * Reset the cursor, so that __bamc_compress_get_multiple
+ * will end up pointing to the right place
+ */
+ __bamc_compress_reset(dbc);
+ return (DB_NOTFOUND);
+ } else if (ret != 0)
+ return (ret);
+
+ if ((ret = __bamc_start_decompress(dbc)) != 0)
+ return (ret);
+
+ /* Check the keys are the same */
+ if (t->bt_compare(dbp, cp->currentKey, key) != 0)
+ return (DB_NOTFOUND);
+
+ return (0);
+}
+
+/* Implements DB_NEXT_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_nodup(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ BTREE *t;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ if (cp->currentKey == 0)
+ return (__bamc_compress_get_next(dbc, flags));
+
+ /*
+ * If this is a deleted entry, del_key is already set, otherwise
+ * we have to set it now
+ */
+ if (!F_ISSET(cp, C_COMPRESS_DELETED))
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /* Linear search for the next non-duplicate key */
+ do
+ if ((ret = __bamc_compress_get_next(dbc, flags)) != 0)
+ return (ret);
+ while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+ return (ret);
+}
+
+/*
+ * Implements DB_SET, DB_SET_RANGE, DB_GET_BOTH, and DB_GET_BOTH_RANGE
+ * for __bamc_compress_get()
+ */
+static int
+__bamc_compress_get_set(dbc, key, data, method, flags)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t method;
+ u_int32_t flags;
+{
+ int ret, cmp;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+ if (method == DB_SET || method == DB_SET_RANGE)
+ data = NULL;
+
+ F_CLR(cp, C_COMPRESS_DELETED);
+
+ ret = __bamc_compress_seek(dbc, key, data, flags);
+ if (ret == DB_NOTFOUND)
+ CMP_IGET_RETRY(ret, dbc,
+ &cp->key1, &cp->compressed, DB_FIRST | flags);
+ if (ret != 0)
+ return (ret);
+
+ /* Decompress and perform a linear search for the key */
+ cmp = 0;
+ ret = __bamc_start_decompress(dbc);
+ while (ret == 0 && (cmp = __db_compare_both(dbp,
+ cp->currentKey, cp->currentData, key, data)) < 0) {
+ ret = __bamc_next_decompress(dbc);
+ if (ret == DB_NOTFOUND) {
+ CMP_IGET_RETRY(ret, dbc,
+ &cp->key1, &cp->compressed, DB_NEXT | flags);
+ if (ret == 0)
+ ret = __bamc_start_decompress(dbc);
+ }
+ }
+
+ switch (method) {
+ case DB_SET:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * We need to exactly match the key, and if cmp != 0 we
+ * might not have - so check again here.
+ */
+ if (ret == 0 &&
+ __db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) {
+ /* We didn't find the key */
+ ret = DB_NOTFOUND;
+ }
+ break;
+ case DB_GET_BOTH:
+ if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) &&
+ __bam_defcmp(dbp, cp->currentData, data) != 0))) {
+ /* We didn't find the key/data pair */
+ ret = DB_NOTFOUND;
+ }
+ break;
+ default:
+ DB_ASSERT(dbp->env, method == 0 || method == DB_SET_RANGE);
+ }
+
+ return (ret);
+}
+
+/* Implements DB_GET_BOTHC for __bamc_compress_get() */
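+/*
+ * Scans forward within the current chunk for the requested data item;
+ * once the chunk is exhausted, falls back to a DB_GET_BOTH search from
+ * the saved key.
+ */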
+static int
+__bamc_compress_get_bothc(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ int ret, cmp;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+
+	/*
+	 * Check that the data we are looking for comes after the current
+	 * position.
+	 */
+ if (__db_compare_both(dbp, cp->currentKey,
+ cp->currentData, cp->currentKey, data) >= 0)
+ return (DB_NOTFOUND);
+
+ cmp = 0;
+ /* Perform a linear search for the data in the current chunk */
+ while ((ret = __bamc_next_decompress(dbc)) == 0 &&
+ (cmp = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData, cp->prevKey, data)) < 0)
+ continue;
+
+ if (ret == 0)
+ return (cmp == 0 ? 0 : DB_NOTFOUND);
+ if (ret != DB_NOTFOUND)
+ return (ret);
+
+ /* Copy the current key to del_key */
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ return (ret);
+
+ /* Search for the data using DB_GET_BOTH */
+ return __bamc_compress_get_set(
+ dbc, &cp->del_key, data, DB_GET_BOTH, flags);
+}
+
+/* Implements DB_MULTIPLE_KEY for __bamc_compress_get() */
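+/*
+ * Packs as many key/data pairs as fit into the caller's bulk buffer
+ * using the DB_MULTIPLE_KEY_RESERVE_NEXT macro, then rewinds the cursor
+ * to the last pair actually stored.
+ */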
+static int
+__bamc_compress_get_multiple_key(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int8_t *writekey, *writedata;
+ void *mptr;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ DB_MULTIPLE_WRITE_INIT(mptr, data);
+ DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey, cp->currentKey->size,
+ writedata, cp->currentData->size);
+ if (writekey == NULL) {
+ data->size = cp->currentKey->size + cp->currentData->size +
+ 4 * sizeof(u_int32_t);
+ return DB_BUFFER_SMALL;
+ }
+ DB_ASSERT(dbc->dbp->env, writedata != NULL);
+
+ memcpy(writekey, cp->currentKey->data, cp->currentKey->size);
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+
+ while ((ret = __bamc_compress_get_next(dbc, flags)) == 0) {
+ DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey,
+ cp->currentKey->size, writedata, cp->currentData->size);
+ if (writekey == NULL)
+ break;
+ DB_ASSERT(dbc->dbp->env, writedata != NULL);
+
+ /*
+ * We could choose to optimize this by just storing one
+ * copy of a key for each set of duplicate data.
+ */
+ memcpy(writekey, cp->currentKey->data, cp->currentKey->size);
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if (ret == 0)
+ /*
+ * Rewind to the previous key/data, since we can't fit
+ * this one in the buffer
+ */
+ ret = __bamc_compress_get_prev(dbc, flags);
+
+ return (ret);
+}
+
+/* Implements DB_MULTIPLE for __bamc_compress_get() */
+static int
+__bamc_compress_get_multiple(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int8_t *writedata;
+ void *mptr;
+ BTREE_CURSOR *cp;
+
+ ret = 0;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ data->size = 0;
+
+ DB_MULTIPLE_WRITE_INIT(mptr, data);
+ DB_MULTIPLE_RESERVE_NEXT(mptr, data, writedata, cp->currentData->size);
+ data->size += cp->currentData->size + 2 * sizeof(u_int32_t);
+ if (writedata == NULL)
+ return DB_BUFFER_SMALL;
+
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+
+ while ((ret = __bamc_compress_get_next_dup(dbc, key, flags)) == 0) {
+ DB_MULTIPLE_RESERVE_NEXT(
+ mptr, data, writedata, cp->currentData->size);
+ data->size += cp->currentData->size + 2 * sizeof(u_int32_t);
+ if (writedata == NULL) {
+			/*
+			 * DBC_FROM_DB_GET indicates we need to fit all the
+			 * duplicates into the buffer or return
+			 * DB_BUFFER_SMALL.  [#17039]
+			 */
+ if (F_ISSET(dbc, DBC_FROM_DB_GET))
+ return DB_BUFFER_SMALL;
+ break;
+ }
+
+ memcpy(writedata, cp->currentData->data, cp->currentData->size);
+ }
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if (ret == 0)
+ /*
+ * Rewind to the previous key/data, as that's now our current
+ * entry.
+ */
+ ret = __bamc_compress_get_prev(dbc, flags);
+
+ return (ret);
+}
+
+/*
+ * __bamc_compress_iget --
+ * Get using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_iget(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t multiple, method;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ ret = 0;
+
+ multiple = flags & (DB_MULTIPLE|DB_MULTIPLE_KEY);
+ method = flags & DB_OPFLAGS_MASK;
+ flags = flags & ~(DB_OPFLAGS_MASK|DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ switch (method) {
+ case DB_CURRENT:
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ ret = DB_KEYEMPTY;
+ else if (cp->currentKey == NULL)
+ ret = EINVAL;
+ break;
+ case DB_FIRST:
+ __bamc_compress_reset(dbc);
+ ret = __bamc_compress_get_next(dbc, flags);
+ break;
+ case DB_NEXT:
+ ret = __bamc_compress_get_next(dbc, flags);
+ break;
+ case DB_NEXT_DUP:
+ ret = __bamc_compress_get_next_dup(dbc, 0, flags);
+ break;
+ case DB_NEXT_NODUP:
+ ret = __bamc_compress_get_next_nodup(dbc, flags);
+ break;
+ case DB_LAST:
+ __bamc_compress_reset(dbc);
+ ret = __bamc_compress_get_prev(dbc, flags);
+ break;
+ case DB_PREV:
+ ret = __bamc_compress_get_prev(dbc, flags);
+ break;
+ case DB_PREV_DUP:
+ ret = __bamc_compress_get_prev_dup(dbc, flags);
+ break;
+ case DB_PREV_NODUP:
+ ret = __bamc_compress_get_prev_nodup(dbc, flags);
+ break;
+ case DB_SET:
+ if (((BTREE *)
+ dbc->dbp->bt_internal)->bt_compare == __bam_defcmp)
+ F_SET(key, DB_DBT_ISSET);
+ /* FALL THROUGH */
+ case DB_SET_RANGE:
+ ret = __bamc_compress_get_set(dbc, key, 0, method, flags);
+ break;
+ case DB_GET_BOTH:
+ if (!F_ISSET(dbc->dbp, DB_AM_DUPSORT) || ((BTREE *)dbc->dbp->
+ bt_internal)->compress_dup_compare == __bam_defcmp)
+ F_SET(data, DB_DBT_ISSET);
+ /* FALL THROUGH */
+ case DB_GET_BOTH_RANGE:
+ if (((BTREE *)
+ dbc->dbp->bt_internal)->bt_compare == __bam_defcmp)
+ F_SET(key, DB_DBT_ISSET);
+ ret = __bamc_compress_get_set(dbc, key, data, method, flags);
+ break;
+ case DB_GET_BOTHC:
+ ret = __bamc_compress_get_bothc(dbc, data, flags);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget",
+ method);
+ break;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ switch (multiple) {
+ case 0:
+ if (!F_ISSET(key, DB_DBT_ISSET))
+ ret = __db_retcopy(dbc->env, key,
+ cp->currentKey->data, cp->currentKey->size,
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ if (!F_ISSET(data, DB_DBT_ISSET) && ret == 0)
+ ret = __db_retcopy(dbc->env, data,
+ cp->currentData->data, cp->currentData->size,
+ &dbc->rdata->data, &dbc->rdata->ulen);
+ break;
+ case DB_MULTIPLE:
+ if (!F_ISSET(key, DB_DBT_ISSET))
+ ret = __db_retcopy(dbc->env, key,
+ cp->currentKey->data, cp->currentKey->size,
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ if (ret == 0)
+ ret =
+ __bamc_compress_get_multiple(dbc, key, data, flags);
+ break;
+ case DB_MULTIPLE_KEY:
+ ret = __bamc_compress_get_multiple_key(dbc, data, flags);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget",
+ multiple);
+ break;
+ }
+
+ err:
+ F_CLR(key, DB_DBT_ISSET);
+ F_CLR(data, DB_DBT_ISSET);
+
+ return (ret);
+}
+
+/*
+ * __bamc_compress_get --
+ * Get using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_get(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n;
+ int ret, t_ret;
+ u_int32_t tmp_flags;
+
+ switch (flags & DB_OPFLAGS_MASK) {
+ case DB_CURRENT:
+ case DB_GET_BOTHC:
+ case DB_NEXT:
+ case DB_NEXT_DUP:
+ case DB_NEXT_NODUP:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED)
+ && (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+ tmp_flags = DB_POSITION;
+ break;
+ default:
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+ tmp_flags = 0;
+ break;
+ }
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if ((ret = __bamc_compress_iget(dbc_n, key, data, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_iput --
+ * Put using a compressed cursor (internal)
+ */
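+/*
+ * Puts are built on the bulk merge code: the key/data pair (or the
+ * DB_MULTIPLE buffers) is wrapped in a BTREE_COMPRESS_STREAM and handed
+ * to __bamc_compress_merge_insert().  For single puts the cursor is then
+ * repositioned on the entry just written.
+ */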
+static int
+__bamc_compress_iput(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ int ret;
+ u_int32_t multi;
+ DBT kcpy, pdata, empty;
+ BTREE_COMPRESS_STREAM stream;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ ENV *env;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbc->env;
+
+ memset(&pdata, 0, sizeof(DBT));
+ memset(&empty, 0, sizeof(DBT));
+
+ multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+ switch (flags) {
+ case DB_CURRENT:
+ if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) {
+ ret = DB_NOTFOUND;
+ goto end;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __db_buildpartial(
+ dbp, cp->currentData, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ if (F_ISSET(dbp, DB_AM_DUPSORT) &&
+ ((BTREE *)dbp->bt_internal)->compress_dup_compare(
+ dbp, cp->currentData, data) != 0) {
+ __db_errx(env,
+ "Existing data sorts differently from put data");
+ ret = EINVAL;
+ goto end;
+ }
+ CMP_INIT_DBT(&kcpy);
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &kcpy, cp->currentKey->data, cp->currentKey->size)) != 0)
+ goto end;
+
+ __bam_cs_create_single(&stream, &kcpy, data);
+ ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, &kcpy, data, DB_GET_BOTH_RANGE, 0);
+
+ CMP_FREE_DBT(env, &kcpy);
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ switch (multi) {
+ case 0:
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __bamc_compress_get_set(dbc, key,
+ data, DB_SET, 0)) != 0 &&
+ ret != DB_NOTFOUND)
+ goto end;
+ if ((ret = __db_buildpartial(dbp,
+ ret == DB_NOTFOUND ? &empty :
+ cp->currentData, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ __bam_cs_create_single(&stream, key, data);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, key, data, DB_GET_BOTH_RANGE, 0);
+ break;
+ case DB_MULTIPLE:
+ __bam_cs_create_multiple(&stream, key, data);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, &key->doff, flags);
+ break;
+ case DB_MULTIPLE_KEY:
+ __bam_cs_create_multiple_key(&stream, key);
+ ret = __bamc_compress_merge_insert(
+ dbc, &stream, &key->doff, flags);
+ break;
+ default:
+ return (__db_unknown_flag(
+ dbp->env, "__bamc_compress_iput", multi));
+ }
+ break;
+ case DB_NOOVERWRITE:
+ /* Check key doesn't already exist */
+ ret = __bamc_compress_get_set(dbc, key, 0, DB_SET, 0);
+ if (ret != DB_NOTFOUND) {
+ if (ret == 0)
+ ret = DB_KEYEXIST;
+ goto end;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL)) {
+ if ((ret = __db_buildpartial(
+ dbp, &empty, data, &pdata)) != 0)
+ goto end;
+ data = &pdata;
+ }
+
+ __bam_cs_create_single(&stream, key, data);
+ ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags);
+
+ if (ret == 0)
+ /* Position the cursor on the entry written */
+ ret = __bamc_compress_get_set(
+ dbc, key, data, DB_GET_BOTH_RANGE, 0);
+ break;
+ default:
+ return (__db_unknown_flag(
+ dbp->env, "__bamc_compress_iput", flags));
+ }
+
+ end:
+ if (pdata.data != NULL)
+ __os_free(env, pdata.data);
+ return (ret);
+}
+
+/*
+ * __bamc_compress_put --
+ * Put using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_put(dbc, key, data, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc_n;
+ int ret, t_ret;
+
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED)) {
+ if ((flags & DB_OPFLAGS_MASK) == DB_CURRENT &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+ }
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n,
+ (flags & DB_OPFLAGS_MASK) == DB_CURRENT ?
+ DB_POSITION : 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+
+ if ((ret = __bamc_compress_iput(dbc_n, key, data, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_idel --
+ * Del using a compressed cursor. (internal)
+ */
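+/*
+ * Deletes reuse the merge code as well: the current key/data pair is
+ * copied into del_key/del_data, wrapped in a single-entry stream and
+ * removed with __bamc_compress_merge_delete(), after which the cursor is
+ * left on the following entry with C_COMPRESS_DELETED set.
+ */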
+static int
+__bamc_compress_idel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret;
+ BTREE_COMPRESS_STREAM stream;
+ DB *dbp;
+ BTREE_CURSOR *cp;
+
+ COMPQUIET(flags, 0);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ return DB_KEYEMPTY;
+ if (cp->currentKey == 0)
+ return DB_NOTFOUND;
+
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+ cp->currentKey->data, cp->currentKey->size)) != 0)
+ goto err;
+ if ((ret = __bam_compress_set_dbt(dbp, &cp->del_data,
+ cp->currentData->data, cp->currentData->size)) != 0)
+ goto err;
+
+ __bam_cs_create_single(&stream, &cp->del_key, &cp->del_data);
+ if ((ret = __bamc_compress_merge_delete(dbc, &stream, NULL)) != 0)
+ goto err;
+
+	/* Position the cursor on the entry after the deleted key/data */
+ ret = __bamc_compress_get_set(dbc, &cp->del_key, &cp->del_data, 0, 0);
+ if (ret == DB_NOTFOUND) {
+ __bamc_compress_reset(dbc);
+ ret = 0;
+ } else if (ret != 0)
+ goto err;
+
+ /* Mark current as being deleted */
+ F_SET(cp, C_COMPRESS_DELETED);
+
+ err:
+ return (ret);
+}
+
+/*
+ * __bamc_compress_del --
+ * Del using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_del __P((DBC *, u_int32_t));
+ */
+int
+__bamc_compress_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+ DBC *dbc_n;
+
+ if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED) &&
+ (ret = __bamc_compress_relocate(dbc)) != 0)
+ return (ret);
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ COPY_RET_MEM(dbc, dbc_n);
+ }
+
+ if ((ret = __bamc_compress_idel(dbc_n, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_ibulk_del --
+ * Bulk del using a compressed cursor. (internal)
+ */
+static int
+__bamc_compress_ibulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ BTREE_COMPRESS_STREAM stream;
+
+ switch (flags) {
+ case 0:
+ __bam_cs_create_single_keyonly(&stream, key);
+ return (__bamc_compress_merge_delete_dups(dbc, &stream, NULL));
+ case DB_MULTIPLE:
+ __bam_cs_create_multiple_keyonly(&stream, key);
+ return (__bamc_compress_merge_delete_dups(
+ dbc, &stream, &key->doff));
+ case DB_MULTIPLE_KEY:
+ __bam_cs_create_multiple_key(&stream, key);
+ return (__bamc_compress_merge_delete(dbc, &stream, &key->doff));
+ default:
+ break;
+ }
+
+ return (__db_unknown_flag(
+ dbc->env, "__bamc_compress_ibulk_del", flags));
+}
+
+/*
+ * __bamc_compress_bulk_del --
+ * Bulk del using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_bulk_del __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__bamc_compress_bulk_del(dbc, key, flags)
+ DBC *dbc;
+ DBT *key;
+ u_int32_t flags;
+{
+ int ret, t_ret;
+ DBC *dbc_n;
+
+ F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED);
+
+ if (F_ISSET(dbc, DBC_TRANSIENT))
+ dbc_n = dbc;
+ else {
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ goto err;
+
+ /*
+ * We don't care about preserving the cursor's position on
+ * error.
+ */
+ F_SET(dbc_n, DBC_TRANSIENT);
+ }
+
+ if ((ret = __bamc_compress_ibulk_del(dbc_n, key, flags)) != 0)
+ goto err;
+
+err:
+ /* Cleanup and cursor resolution. */
+ if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 &&
+ (ret == 0 || ret == DB_BUFFER_SMALL))
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __bamc_compress_count --
+ * Count using a compressed cursor.
+ *
+ * PUBLIC: int __bamc_compress_count __P((DBC *, db_recno_t *));
+ */
+int
+__bamc_compress_count(dbc, countp)
+ DBC *dbc;
+ db_recno_t *countp;
+{
+ int ret, t_ret;
+ db_recno_t count;
+ DBT *key;
+ DBC *dbc_n;
+ BTREE_CURSOR *cp;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If the current entry is deleted use del_key, otherwise use
+ * currentKey.
+ */
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ key = &cp->del_key;
+ else
+ key = cp->currentKey;
+
+ /* Duplicate the cursor */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+
+ /* We don't care about preserving the cursor's position on error */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ /* Find the first duplicate */
+ if ((ret = __bamc_compress_get_set(dbc_n, key, 0, DB_SET, 0)) != 0)
+ goto err;
+ count = 1;
+
+ /* Count subsequent duplicates */
+ while ((ret = __bamc_compress_get_next_dup(dbc_n, key, 0)) == 0)
+ ++count;
+
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+
+ *countp = count;
+
+ err:
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bamc_compress_cmp --
+ * Compare which compressed value is pointed to.
+ *
+ * PUBLIC: int __bamc_compress_cmp __P((DBC *, DBC *, int *));
+ */
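+/*
+ * Cursor positions compare equal only when both cursors refer to the same
+ * key/data pair (a deleted position is identified by its saved
+ * del_key/del_data); *result is set to 0 on a match and 1 otherwise.
+ */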
+int
+__bamc_compress_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ DB *dbp;
+ BTREE_CURSOR *cp, *ocp;
+
+ /*
+ * At this point, we already know that the cursors point to the same
+ * DB.
+ */
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ocp = (BTREE_CURSOR *)other_dbc->internal;
+
+ if (F_ISSET(cp, C_COMPRESS_DELETED))
+ if (F_ISSET(ocp, C_COMPRESS_DELETED))
+ *result = __db_compare_both(
+ dbp, &cp->del_key, &cp->del_data,
+ &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1;
+ else {
+ if (ocp->currentKey == 0)
+ goto err;
+
+ *result = __db_compare_both(
+ dbp, &cp->del_key, &cp->del_data,
+ ocp->currentKey, ocp->currentData) == 0 ? 0 : 1;
+ }
+ else {
+ if (cp->currentKey == 0)
+ goto err;
+
+ if (F_ISSET(ocp, C_COMPRESS_DELETED))
+ *result = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData,
+ &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1;
+ else {
+ if (ocp->currentKey == 0)
+ goto err;
+
+ *result = __db_compare_both(
+ dbp, cp->currentKey, cp->currentData,
+ ocp->currentKey, ocp->currentData) == 0 ? 0 : 1;
+ }
+ }
+ return (0);
+
+ err:
+ __db_errx(dbc->env,
+ "Both cursors must be initialized before calling DBC->cmp.");
+ return (EINVAL);
+}
+
+/*
+ * __bamc_compress_dup --
+ * Duplicate the compression specific part of a btree cursor.
+ *
+ * PUBLIC: int __bamc_compress_dup __P((DBC *, DBC *, u_int32_t));
+ */
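+/*
+ * The new cursor gets its own copies of the current/previous key/data
+ * DBTs and of the compressed chunk, with the internal chunk pointers
+ * rebased to the equivalent offsets inside the new copy.
+ */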
+int
+__bamc_compress_dup(orig_dbc, new_dbc, flags)
+ DBC *orig_dbc, *new_dbc;
+ u_int32_t flags;
+{
+ int ret;
+ DB *dbp;
+ BTREE_CURSOR *orig, *new;
+
+ dbp = new_dbc->dbp;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ if (orig->currentKey != NULL && !LF_ISSET(DB_SHALLOW_DUP)) {
+ new->currentKey = &new->key1;
+ new->currentData = &new->data1;
+
+ if ((ret = __bam_compress_set_dbt(dbp, new->currentKey,
+ orig->currentKey->data, orig->currentKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, new->currentData,
+ orig->currentData->data, orig->currentData->size)) != 0)
+ return (ret);
+
+ if (orig->prevKey) {
+ new->prevKey = &new->key2;
+ new->prevData = &new->data2;
+
+ if ((ret = __bam_compress_set_dbt(dbp, new->prevKey,
+ orig->prevKey->data, orig->prevKey->size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, new->prevData,
+ orig->prevData->data, orig->prevData->size)) != 0)
+ return (ret);
+ }
+
+ if ((ret = __bam_compress_set_dbt(dbp, &new->compressed,
+ orig->compressed.data, orig->compressed.size)) != 0)
+ return (ret);
+
+ new->compcursor = (u_int8_t*)new->compressed.data +
+ (orig->compcursor - (u_int8_t*)orig->compressed.data);
+ new->compend = (u_int8_t*)new->compressed.data +
+ (orig->compend - (u_int8_t*)orig->compressed.data);
+ new->prevcursor = orig->prevcursor == NULL ? NULL :
+ (u_int8_t*)new->compressed.data + (orig->prevcursor -
+ (u_int8_t*)orig->compressed.data);
+ new->prev2cursor = orig->prev2cursor == NULL ? NULL :
+ (u_int8_t*)new->compressed.data + (orig->prev2cursor -
+ (u_int8_t*)orig->compressed.data);
+
+ if (F_ISSET(orig, C_COMPRESS_DELETED)) {
+ if ((ret = __bam_compress_set_dbt(dbp, &new->del_key,
+ orig->del_key.data, orig->del_key.size)) != 0)
+ return (ret);
+ if ((ret = __bam_compress_set_dbt(dbp, &new->del_data,
+ orig->del_data.data, orig->del_data.size)) != 0)
+ return (ret);
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_compress_salvage --
+ * Salvage the compressed data from the key/data pair
+ *
+ * PUBLIC: int __bam_compress_salvage __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: void *, int (*)(void *, const void *), DBT *, DBT *));
+ */
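+/*
+ * The first data item in a chunk is stored as a length-prefixed raw
+ * value; every following key/data pair is decompressed relative to the
+ * previous pair, so salvage walks the chunk the same way the cursor
+ * code does.
+ */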
+int
+__bam_compress_salvage(dbp, vdp, handle, callback, key, data)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key, *data;
+{
+ DBT key1, key2, data1, data2, compressed;
+ DBT *currentKey, *currentData, *prevKey, *prevData;
+ ENV *env;
+ int ret, t_ret;
+ u_int8_t *compcursor, *compend;
+ u_int32_t datasize, size;
+
+ env = dbp->env;
+
+ memset(&key1, 0, sizeof(DBT));
+ memset(&key2, 0, sizeof(DBT));
+ memset(&data1, 0, sizeof(DBT));
+ memset(&data2, 0, sizeof(DBT));
+ memset(&compressed, 0, sizeof(DBT));
+
+ key1.flags = DB_DBT_USERMEM;
+ key2.flags = DB_DBT_USERMEM;
+ data1.flags = DB_DBT_USERMEM;
+ data2.flags = DB_DBT_USERMEM;
+ compressed.flags = DB_DBT_USERMEM;
+
+ prevKey = NULL;
+ prevData = NULL;
+ currentKey = key;
+ currentData = &data2;
+ compcursor = (u_int8_t*)data->data;
+ compend = compcursor + data->size;
+
+ if (data->size == 0) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+
+ /* Unmarshal the first data */
+ size = __db_decompress_count_int(compcursor);
+ if (size == 0xFF || compcursor + size > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+ compcursor += __db_decompress_int32(compcursor, &datasize);
+
+ if (compcursor + datasize > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto unknown_data;
+ }
+ if ((ret = __bam_compress_set_dbt(
+ dbp, currentData, compcursor, datasize)) != 0)
+ goto err;
+ compcursor += datasize;
+
+	/* Output first data (first key has already been output by our caller) */
+ if ((ret = __db_vrfy_prdbt(
+ currentData, 0, " ", handle, callback, 0, vdp)) != 0)
+ goto err;
+
+ while (compcursor < compend) {
+ prevKey = currentKey;
+ prevData = currentData;
+
+ if (currentKey == &key1) {
+ currentKey = &key2;
+ currentData = &data2;
+ } else {
+ currentKey = &key1;
+ currentData = &data1;
+ }
+
+ compressed.data = (void*)compcursor;
+ compressed.ulen = compressed.size =
+ (u_int32_t)(compend - compcursor);
+
+ /* Decompress the next key/data pair */
+ while ((ret = ((BTREE *)dbp->bt_internal)->bt_decompress(
+ dbp, prevKey, prevData,
+ &compressed, currentKey, currentData)) == DB_BUFFER_SMALL) {
+ if (CMP_RESIZE_DBT(ret, env, currentKey) != 0)
+ break;
+ if (CMP_RESIZE_DBT(ret, env, currentData) != 0)
+ break;
+ }
+
+ if (ret == EINVAL) {
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ }
+ if (ret != 0)
+ goto err;
+
+ compcursor += compressed.size;
+
+ if (compcursor > compend) {
+ ret = DB_VERIFY_FATAL;
+ goto err;
+ }
+
+ /* Output the next key/data pair */
+ if ((ret = __db_vrfy_prdbt(
+ currentKey, 0, " ", handle, callback, 0, vdp)) != 0)
+ goto err;
+ if ((ret = __db_vrfy_prdbt(
+ currentData, 0, " ", handle, callback, 0, vdp)) != 0)
+ goto err;
+ }
+
+ if (0) {
+ unknown_data:
+ /*
+ * Make sure we output a data value for the key that's
+ * already been output
+ */
+ DB_INIT_DBT(
+ compressed, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+ if ((t_ret = __db_vrfy_prdbt(
+ &compressed, 0, " ", handle, callback, 0, vdp)) != 0)
+ ret = t_ret;
+ }
+
+ err:
+ __os_free(env, key1.data);
+ __os_free(env, key2.data);
+ __os_free(env, data1.data);
+ __os_free(env, data2.data);
+ return (ret);
+}
+
+/*
+ * __bam_compress_count --
+ * Calculate key and entry counts for the compressed BTree
+ *
+ * PUBLIC: int __bam_compress_count __P((DBC *, u_int32_t *, u_int32_t *));
+ */
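+/*
+ * Walks every chunk with a transient duplicate cursor, counting one data
+ * item per decompressed entry and one key whenever an entry's key differs
+ * from the previous entry's key.
+ */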
+int
+__bam_compress_count(dbc, nkeysp, ndatap)
+ DBC *dbc;
+ u_int32_t *nkeysp, *ndatap;
+{
+ int ret, t_ret;
+ u_int32_t nkeys, ndata;
+ DB *dbp;
+ BTREE *t;
+ DBC *dbc_n;
+ BTREE_CURSOR *cp_n;
+
+ dbp = dbc->dbp;
+ t = (BTREE *)dbp->bt_internal;
+
+ /* Duplicate the cursor */
+ if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+ return (ret);
+
+ /* We don't care about preserving the cursor's position on error */
+ F_SET(dbc_n, DBC_TRANSIENT);
+
+ cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+ nkeys = 0;
+ ndata = 0;
+
+ CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1, &cp_n->compressed, DB_FIRST);
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __bamc_start_decompress(dbc_n)) != 0)
+ goto err;
+ nkeys += 1;
+
+ for (;;) {
+ ndata += 1;
+
+ ret = __bamc_next_decompress(dbc_n);
+ if (ret == DB_NOTFOUND) {
+ if (cp_n->currentKey == &cp_n->key1) {
+ /*
+ * Make sure that the previous key isn't
+ * overwritten when we fetch the next chunk.
+ */
+ if ((ret = __bam_compress_set_dbt(dbp,
+ &cp_n->key2, cp_n->key1.data,
+ cp_n->key1.size)) != 0)
+ goto err;
+ }
+
+ CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1,
+ &cp_n->compressed, DB_NEXT);
+ if (ret != 0)
+ goto err;
+
+ ret = __bamc_start_decompress(dbc_n);
+
+ cp_n->prevKey = &cp_n->key2;
+ }
+
+ if (ret != 0)
+ goto err;
+
+ if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0)
+ nkeys += 1;
+ }
+
+err:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0) {
+ if (nkeysp != NULL)
+ *nkeysp = nkeys;
+ if (ndatap != NULL)
+ *ndatap = ndata;
+ }
+
+ return (ret);
+}
+
+#endif
diff --git a/btree/bt_conv.c b/btree/bt_conv.c
index 1cb208b..aa14173 100644
--- a/btree/bt_conv.c
+++ b/btree/bt_conv.c
@@ -1,221 +1,95 @@
/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
+ * See the file LICENSE for redistribution information.
*
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * $Id$
*/
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_conv.c 8.5 (Berkeley) 8/17/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/param.h>
-
-#include <stdio.h>
+#include "db_config.h"
-#include <db.h>
-#include "btree.h"
-
-static void mswap __P((PAGE *));
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
/*
- * __BT_BPGIN, __BT_BPGOUT --
- * Convert host-specific number layout to/from the host-independent
- * format stored on disk.
+ * __bam_pgin --
+ * Convert host-specific page layout from the host-independent format
+ * stored on disk.
*
- * Parameters:
- * t: tree
- * pg: page number
- * h: page to convert
+ * PUBLIC: int __bam_pgin __P((DB *, db_pgno_t, void *, DBT *));
*/
-void
-__bt_pgin(t, pg, pp)
- void *t;
- pgno_t pg;
+int
+__bam_pgin(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
void *pp;
+ DBT *cookie;
{
+ DB_PGINFO *pginfo;
PAGE *h;
- indx_t i, top;
- u_char flags;
- char *p;
- if (!F_ISSET(((BTREE *)t), B_NEEDSWAP))
- return;
- if (pg == P_META) {
- mswap(pp);
- return;
- }
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
h = pp;
- M_32_SWAP(h->pgno);
- M_32_SWAP(h->prevpg);
- M_32_SWAP(h->nextpg);
- M_32_SWAP(h->flags);
- M_16_SWAP(h->lower);
- M_16_SWAP(h->upper);
-
- top = NEXTINDEX(h);
- if ((h->flags & P_TYPE) == P_BINTERNAL)
- for (i = 0; i < top; i++) {
- M_16_SWAP(h->linp[i]);
- p = (char *)GETBINTERNAL(h, i);
- P_32_SWAP(p);
- p += sizeof(u_int32_t);
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- if (*(u_char *)p & P_BIGKEY) {
- p += sizeof(u_char);
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- P_32_SWAP(p);
- }
- }
- else if ((h->flags & P_TYPE) == P_BLEAF)
- for (i = 0; i < top; i++) {
- M_16_SWAP(h->linp[i]);
- p = (char *)GETBLEAF(h, i);
- P_32_SWAP(p);
- p += sizeof(u_int32_t);
- P_32_SWAP(p);
- p += sizeof(u_int32_t);
- flags = *(u_char *)p;
- if (flags & (P_BIGKEY | P_BIGDATA)) {
- p += sizeof(u_char);
- if (flags & P_BIGKEY) {
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- P_32_SWAP(p);
- }
- if (flags & P_BIGDATA) {
- p += sizeof(u_int32_t);
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- P_32_SWAP(p);
- }
- }
- }
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1));
}
-void
-__bt_pgout(t, pg, pp)
- void *t;
- pgno_t pg;
+/*
+ * __bam_pgout --
+ * Convert host-specific page layout to the host-independent format
+ * stored on disk.
+ *
+ * PUBLIC: int __bam_pgout __P((DB *, db_pgno_t, void *, DBT *));
+ */
+int
+__bam_pgout(dbp, pg, pp, cookie)
+ DB *dbp;
+ db_pgno_t pg;
void *pp;
+ DBT *cookie;
{
+ DB_PGINFO *pginfo;
PAGE *h;
- indx_t i, top;
- u_char flags;
- char *p;
- if (!F_ISSET(((BTREE *)t), B_NEEDSWAP))
- return;
- if (pg == P_META) {
- mswap(pp);
- return;
- }
+ pginfo = (DB_PGINFO *)cookie->data;
+ if (!F_ISSET(pginfo, DB_AM_SWAP))
+ return (0);
h = pp;
- top = NEXTINDEX(h);
- if ((h->flags & P_TYPE) == P_BINTERNAL)
- for (i = 0; i < top; i++) {
- p = (char *)GETBINTERNAL(h, i);
- P_32_SWAP(p);
- p += sizeof(u_int32_t);
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- if (*(u_char *)p & P_BIGKEY) {
- p += sizeof(u_char);
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- P_32_SWAP(p);
- }
- M_16_SWAP(h->linp[i]);
- }
- else if ((h->flags & P_TYPE) == P_BLEAF)
- for (i = 0; i < top; i++) {
- p = (char *)GETBLEAF(h, i);
- P_32_SWAP(p);
- p += sizeof(u_int32_t);
- P_32_SWAP(p);
- p += sizeof(u_int32_t);
- flags = *(u_char *)p;
- if (flags & (P_BIGKEY | P_BIGDATA)) {
- p += sizeof(u_char);
- if (flags & P_BIGKEY) {
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- P_32_SWAP(p);
- }
- if (flags & P_BIGDATA) {
- p += sizeof(u_int32_t);
- P_32_SWAP(p);
- p += sizeof(pgno_t);
- P_32_SWAP(p);
- }
- }
- M_16_SWAP(h->linp[i]);
- }
-
- M_32_SWAP(h->pgno);
- M_32_SWAP(h->prevpg);
- M_32_SWAP(h->nextpg);
- M_32_SWAP(h->flags);
- M_16_SWAP(h->lower);
- M_16_SWAP(h->upper);
+ return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) :
+ __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0));
}
/*
- * MSWAP -- Actually swap the bytes on the meta page.
+ * __bam_mswap --
+ * Swap the bytes on the btree metadata page.
*
- * Parameters:
- * p: page to convert
+ * PUBLIC: int __bam_mswap __P((ENV *, PAGE *));
*/
-static void
-mswap(pg)
+int
+__bam_mswap(env, pg)
+ ENV *env;
PAGE *pg;
{
- char *p;
+ u_int8_t *p;
+
+ COMPQUIET(env, NULL);
+
+ __db_metaswap(pg);
+ p = (u_int8_t *)pg + sizeof(DBMETA);
+
+ p += sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* minkey */
+ SWAP32(p); /* re_len */
+ SWAP32(p); /* re_pad */
+ SWAP32(p); /* root */
+ p += 92 * sizeof(u_int32_t); /* unused */
+ SWAP32(p); /* crypto_magic */
- p = (char *)pg;
- P_32_SWAP(p); /* magic */
- p += sizeof(u_int32_t);
- P_32_SWAP(p); /* version */
- p += sizeof(u_int32_t);
- P_32_SWAP(p); /* psize */
- p += sizeof(u_int32_t);
- P_32_SWAP(p); /* free */
- p += sizeof(u_int32_t);
- P_32_SWAP(p); /* nrecs */
- p += sizeof(u_int32_t);
- P_32_SWAP(p); /* flags */
- p += sizeof(u_int32_t);
+ return (0);
}
diff --git a/btree/bt_curadj.c b/btree/bt_curadj.c
new file mode 100644
index 0000000..3f6077d
--- /dev/null
+++ b/btree/bt_curadj.c
@@ -0,0 +1,620 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/mp.h"
+
+static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t));
+
+/*
+ * Cursor adjustments are logged if they are for subtransactions. This is
+ * because it's possible for a subtransaction to adjust cursors which will
+ * still be active after the subtransaction aborts, and so which must be
+ * restored to their previous locations. Cursors that can be both affected
+ * by our cursor adjustments and active after our transaction aborts can
+ * only be found in our parent transaction -- cursors in other transactions,
+ * including other child transactions of our parent, must have conflicting
+ * locker IDs, and so cannot be affected by adjustments in this transaction.
+ */
+
+/*
+ * __bam_ca_delete --
+ * Update the cursors when items are deleted and when already deleted
+ * items are overwritten. Return the number of relevant cursors found.
+ *
+ * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int, int *));
+ */
+int
+__bam_ca_delete(dbp, pgno, indx, delete, countp)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int delete, *countp;
+{
+ BTREE_CURSOR *cp;
+ DB *ldbp;
+ DBC *dbc;
+ ENV *env;
+ int count; /* !!!: Has to contain max number of cursors. */
+
+ env = dbp->env;
+
+ /*
+ * Adjust the cursors. We have the page write locked, so the
+ * only other cursors that can be pointing at a page are
+ * those in the same thread of control. Unfortunately, we don't
+ * know that they're using the same DB handle, so traverse
+ * all matching DB handles in the same ENV, then all cursors
+ * on each matching DB handle.
+ *
+ * Each cursor is single-threaded, so we only need to lock the
+ * list of DBs and then the list of cursors in each DB.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (count = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp->pgno == pgno && cp->indx == indx &&
+ !MVCC_SKIP_CURADJ(dbc, pgno)) {
+ /*
+ * [#8032] This assert is checking
+ * for possible race conditions where we
+ * hold a cursor position without a lock.
+ * Unfortunately, there are paths in the
+ * Btree code that do not satisfy these
+ * conditions. None of them are known to
+ * be a problem, but this assert should
+ * be re-activated when the Btree stack
+ * code is re-written.
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ if (delete) {
+ F_SET(cp, C_DELETED);
+ /*
+ * If we're deleting the item, we can't
+ * keep a streaming offset cached.
+ */
+ cp->stream_start_pgno = PGNO_INVALID;
+ } else
+ F_CLR(cp, C_DELETED);
+
+#ifdef HAVE_COMPRESSION
+ /*
+ * We also set the C_COMPRESS_MODIFIED flag,
+ * which prompts the compression code to look
+ * for its current entry again if it needs to.
+ *
+ * The flag isn't cleared, because the
+ * compression code still needs to do that even
+ * for an entry that becomes undeleted.
+ *
+ * This flag also needs to be set if an entry is
+ * updated, but since the compression code
+ * always deletes before an update, setting it
+ * here is sufficient.
+ */
+ F_SET(cp, C_COMPRESS_MODIFIED);
+#endif
+
+ ++count;
+ }
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ if (countp != NULL)
+ *countp = count;
+ return (0);
+}
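A standalone sketch, not part of the diff, of the two-level traversal __bam_ca_delete (and every routine below) performs: walk all DB handles open on the same file, then each handle's active cursor queue. The types and fields are hypothetical stand-ins for the DB/DBC structures, and the mutex locking is elided:

    #include <sys/queue.h>
    #include <stddef.h>

    struct cursor {
        TAILQ_ENTRY(cursor) links;
        unsigned pgno, indx;
        int deleted;
    };

    struct handle {
        TAILQ_ENTRY(handle) dblistlinks;
        int fileid;                     /* stand-in for adj_fileid */
        TAILQ_HEAD(, cursor) active_queue;
    };

    TAILQ_HEAD(handle_list, handle);

    /* Count the cursors on (pgno, indx) across all handles open on one
     * file, marking each deleted -- the shape of __bam_ca_delete. */
    static int
    ca_delete(struct handle_list *dblist,
        int fileid, unsigned pgno, unsigned indx)
    {
        struct handle *ldbp;
        struct cursor *dbc;
        int count;

        count = 0;
        TAILQ_FOREACH(ldbp, dblist, dblistlinks) {
            if (ldbp->fileid != fileid)
                continue;
            TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
                if (dbc->pgno == pgno && dbc->indx == indx) {
                    dbc->deleted = 1;
                    ++count;
                }
        }
        return (count);
    }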
+
+/*
+ * __ram_ca_delete --
+ * Return if any relevant cursors found.
+ *
+ * PUBLIC: int __ram_ca_delete __P((DB *, db_pgno_t, int *));
+ */
+int
+__ram_ca_delete(dbp, root_pgno, foundp)
+ DB *dbp;
+ db_pgno_t root_pgno;
+ int *foundp;
+{
+ DB *ldbp;
+ DBC *dbc;
+ ENV *env;
+ int found;
+
+ env = dbp->env;
+
+ /*
+ * Review the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (found = 0;
+ found == 0 && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
+ if (dbc->internal->root == root_pgno &&
+ !MVCC_SKIP_CURADJ(dbc, root_pgno)) {
+ found = 1;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ *foundp = found;
+ return (0);
+}
+
+/*
+ * __bam_ca_di --
+ * Adjust the cursors during a delete or insert.
+ *
+ * PUBLIC: int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_di(my_dbc, pgno, indx, adjust)
+ DBC *my_dbc;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ int adjust;
+{
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ ENV *env;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ env = dbp->env;
+
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (found = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ cp = dbc->internal;
+ if (cp->pgno == pgno && cp->indx >= indx &&
+ (dbc == my_dbc || !MVCC_SKIP_CURADJ(dbc, pgno))) {
+ /* Cursor indices should never be negative. */
+ DB_ASSERT(env, cp->indx != 0 || adjust > 0);
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ cp->indx += adjust;
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ }
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp, my_dbc->txn, &lsn, 0,
+ DB_CA_DI, pgno, 0, 0, (u_int32_t)adjust, indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_opd_cursor -- create a new opd cursor.
+ */
+static int
+__bam_opd_cursor(dbp, dbc, first, tpgno, ti)
+ DB *dbp;
+ DBC *dbc;
+ db_pgno_t tpgno;
+ u_int32_t first, ti;
+{
+ BTREE_CURSOR *cp, *orig_cp;
+ DBC *dbc_nopd;
+ int ret;
+
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ dbc_nopd = NULL;
+
+ /*
+ * Allocate a new cursor and create the stack. If duplicates
+ * are sorted, we've just created an off-page duplicate Btree.
+ * If duplicates aren't sorted, we've just created a Recno tree.
+ *
+ * Note that in order to get here at all, there shouldn't be
+ * an old off-page dup cursor--to augment the checking dbc_newopd
+ * will do, assert this.
+ */
+ DB_ASSERT(dbp->env, orig_cp->opd == NULL);
+ if ((ret = __dbc_newopd(dbc, tpgno, orig_cp->opd, &dbc_nopd)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc_nopd->internal;
+ cp->pgno = tpgno;
+ cp->indx = ti;
+
+ if (dbp->dup_compare == NULL) {
+ /*
+ * Converting to off-page Recno trees is tricky. The
+ * record number for the cursor is the index + 1 (to
+ * convert to 1-based record numbers).
+ */
+ cp->recno = ti + 1;
+ }
+
+ /*
+ * Transfer the deleted flag from the top-level cursor to the
+ * created one.
+ */
+ if (F_ISSET(orig_cp, C_DELETED)) {
+ F_SET(cp, C_DELETED);
+ F_CLR(orig_cp, C_DELETED);
+ }
+
+ /* Stack the cursors and reset the initial cursor's index. */
+ orig_cp->opd = dbc_nopd;
+ orig_cp->indx = first;
+ return (0);
+}
+
+/*
+ * __bam_ca_dup --
+ * Adjust the cursors when moving items from a leaf page to a duplicates
+ * page.
+ *
+ * PUBLIC: int __bam_ca_dup __P((DBC *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_dup(my_dbc, first, fpgno, fi, tpgno, ti)
+ DBC *my_dbc;
+ db_pgno_t fpgno, tpgno;
+ u_int32_t first, fi, ti;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ ENV *env;
+ int found, ret, t_ret;
+
+ dbp = my_dbc->dbp;
+ env = dbp->env;
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+ ret = 0;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (found = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+loop: MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ /* Find cursors pointing to this record. */
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+ if (orig_cp->pgno != fpgno || orig_cp->indx != fi ||
+ MVCC_SKIP_CURADJ(dbc, fpgno))
+ continue;
+
+ /*
+ * Since we rescan the list see if this is already
+ * converted.
+ */
+ if (orig_cp->opd != NULL)
+ continue;
+
+ MUTEX_UNLOCK(env, dbp->mutex);
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ orig_cp->lock_mode != DB_LOCK_NG);
+ */
+ if ((ret = __bam_opd_cursor(dbp,
+ dbc, first, tpgno, ti)) != 0)
+ goto err;
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ /* We released the mutex to get a cursor, start over. */
+ goto loop;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+err: MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((t_ret = __bam_curadj_log(dbp, my_dbc->txn,
+ &lsn, 0, DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
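The release-and-rescan idiom above (drop the handle mutex to build the opd cursor, then goto loop) is worth isolating; the orig_cp->opd != NULL check is what makes re-visiting already-converted cursors a no-op. A minimal sketch with hypothetical types, where a per-element converted flag plays that role:

    #include <pthread.h>
    #include <stddef.h>

    struct elem {
        struct elem *next;
        int matches, converted;
    };

    static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;

    static void
    convert(struct elem *e)             /* may acquire other locks */
    {
        e->converted = 1;
    }

    static void
    convert_all(struct elem *head)
    {
        struct elem *e;

    again:  pthread_mutex_lock(&mtx);
        for (e = head; e != NULL; e = e->next) {
            if (!e->matches || e->converted)
                continue;
            pthread_mutex_unlock(&mtx);
            convert(e);
            goto again;         /* list may have changed; rescan */
        }
        pthread_mutex_unlock(&mtx);
    }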
+
+/*
+ * __bam_ca_undodup --
+ * Adjust the cursors when returning items to a leaf page
+ * from a duplicate page.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undodup __P((DB *,
+ * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, u_int32_t));
+ */
+int
+__bam_ca_undodup(dbp, first, fpgno, fi, ti)
+ DB *dbp;
+ db_pgno_t fpgno;
+ u_int32_t first, fi, ti;
+{
+ BTREE_CURSOR *orig_cp;
+ DB *ldbp;
+ DBC *dbc;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+ ret = 0;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+loop: MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ orig_cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * A note on the orig_cp->opd != NULL requirement here:
+ * it's possible that there's a cursor that refers to
+ * the same duplicate set, but which has no opd cursor,
+ * because it refers to a different item and we took
+ * care of it while processing a previous record.
+ */
+ if (orig_cp->pgno != fpgno ||
+ orig_cp->indx != first ||
+ orig_cp->opd == NULL || ((BTREE_CURSOR *)
+ orig_cp->opd->internal)->indx != ti ||
+ MVCC_SKIP_CURADJ(dbc, fpgno))
+ continue;
+ MUTEX_UNLOCK(env, dbp->mutex);
+ if ((ret = __dbc_close(orig_cp->opd)) != 0)
+ goto err;
+ orig_cp->opd = NULL;
+ orig_cp->indx = fi;
+ /*
+ * We released the mutex to free a cursor,
+ * start over.
+ */
+ goto loop;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+err: MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ return (ret);
+}
+
+/*
+ * __bam_ca_rsplit --
+ * Adjust the cursors when doing reverse splits.
+ *
+ * PUBLIC: int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t));
+ */
+int
+__bam_ca_rsplit(my_dbc, fpgno, tpgno)
+ DBC* my_dbc;
+ db_pgno_t fpgno, tpgno;
+{
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ ENV *env;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ env = dbp->env;
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (found = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ if (dbc->internal->pgno == fpgno &&
+ !MVCC_SKIP_CURADJ(dbc, fpgno)) {
+ dbc->internal->pgno = tpgno;
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ dbc->internal->lock_mode != DB_LOCK_NG);
+ */
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ }
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp, my_dbc->txn,
+ &lsn, 0, DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0)
+ return (ret);
+ }
+ return (0);
+}
+
+/*
+ * __bam_ca_split --
+ * Adjust the cursors when splitting a page.
+ *
+ * PUBLIC: int __bam_ca_split __P((DBC *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft)
+ DBC *my_dbc;
+ db_pgno_t ppgno, lpgno, rpgno;
+ u_int32_t split_indx;
+ int cleft;
+{
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ DB_LSN lsn;
+ DB_TXN *my_txn;
+ ENV *env;
+ int found, ret;
+
+ dbp = my_dbc->dbp;
+ env = dbp->env;
+ my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * If splitting the page that a cursor was on, the cursor has to be
+ * adjusted to point to the same record as before the split. Most
+ * of the time we don't adjust pointers to the left page, because
+ * we're going to copy its contents back over the original page. If
+ * the cursor is on the right page, it is decremented by the number of
+ * records split to the left page.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (found = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ cp = dbc->internal;
+ if (cp->pgno == ppgno &&
+ !MVCC_SKIP_CURADJ(dbc, ppgno)) {
+ /* [#8032]
+ DB_ASSERT(env, !STD_LOCKING(dbc) ||
+ cp->lock_mode != DB_LOCK_NG);
+ */
+ if (my_txn != NULL && dbc->txn != my_txn)
+ found = 1;
+ if (cp->indx < split_indx) {
+ if (cleft)
+ cp->pgno = lpgno;
+ } else {
+ cp->pgno = rpgno;
+ cp->indx -= split_indx;
+ }
+ }
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ if (found != 0 && DBC_LOGGING(my_dbc)) {
+ if ((ret = __bam_curadj_log(dbp,
+ my_dbc->txn, &lsn, 0, DB_CA_SPLIT, ppgno, rpgno,
+ cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
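A standalone restatement, not part of the diff, of the adjustment rule __bam_ca_split applies to each cursor on the split page (names hypothetical):

    struct pos {
        unsigned pgno, indx;
    };

    /* A cursor at index i on the original page stays at i on the left
     * page (which is a new page only when cleft != 0) if i < split_indx;
     * otherwise it moves to the right page with its index rebased by
     * split_indx. */
    static struct pos
    split_adjust(struct pos c,
        unsigned lpgno, unsigned rpgno, unsigned split_indx, int cleft)
    {
        if (c.indx < split_indx) {
            if (cleft)
                c.pgno = lpgno;
        } else {
            c.pgno = rpgno;
            c.indx -= split_indx;
        }
        return (c);
    }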
+
+/*
+ * __bam_ca_undosplit --
+ * Adjust the cursors when undoing a split of a page.
+ * If we grew a level we will execute this for both the
+ * left and the right pages.
+ * Called only during undo processing.
+ *
+ * PUBLIC: int __bam_ca_undosplit __P((DB *,
+ * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t));
+ */
+int
+__bam_ca_undosplit(dbp, frompgno, topgno, lpgno, split_indx)
+ DB *dbp;
+ db_pgno_t frompgno, topgno, lpgno;
+ u_int32_t split_indx;
+{
+ DB *ldbp;
+ DBC *dbc;
+ DBC_INTERNAL *cp;
+ ENV *env;
+
+ env = dbp->env;
+
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * When backing out a split, we move the cursor back
+ * to the original offset and bump it by the split_indx.
+ */
+ MUTEX_LOCK(env, env->mtx_dblist);
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ if (dbc->dbtype == DB_RECNO)
+ continue;
+ cp = dbc->internal;
+ if (cp->pgno == topgno &&
+ !MVCC_SKIP_CURADJ(dbc, topgno)) {
+ cp->pgno = frompgno;
+ cp->indx += split_indx;
+ } else if (cp->pgno == lpgno &&
+ !MVCC_SKIP_CURADJ(dbc, lpgno))
+ cp->pgno = frompgno;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ return (0);
+}
diff --git a/btree/bt_cursor.c b/btree/bt_cursor.c
new file mode 100644
index 0000000..b0d6f7d
--- /dev/null
+++ b/btree/bt_cursor.c
@@ -0,0 +1,3055 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_bulk __P((DBC *, DBT *, u_int32_t));
+static int __bamc_close __P((DBC *, db_pgno_t, int *));
+static int __bamc_del __P((DBC *, u_int32_t));
+static int __bamc_destroy __P((DBC *));
+static int __bamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bamc_getstack __P((DBC *));
+static int __bamc_next __P((DBC *, int, int));
+static int __bamc_physdel __P((DBC *));
+static int __bamc_prev __P((DBC *));
+static int __bamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __bamc_search __P((DBC *,
+ db_pgno_t, const DBT *, u_int32_t, int *));
+static int __bamc_writelock __P((DBC *));
+static int __bam_getboth_finddatum __P((DBC *, DBT *, u_int32_t));
+static int __bam_getbothc __P((DBC *, DBT *));
+static int __bam_get_prev __P((DBC *));
+static int __bam_isopd __P((DBC *, db_pgno_t *));
+#ifdef HAVE_COMPRESSION
+static int __bam_getlte __P((DBC *, DBT *, DBT *));
+#endif
+
+/*
+ * Acquire a new page/lock. If we hold a page/lock, discard the page, and
+ * lock-couple the lock.
+ *
+ * !!!
+ * We have to handle both where we have a lock to lock-couple and where we
+ * don't -- we don't duplicate locks when we duplicate cursors if we are
+ * running in a transaction environment as there's no point if locks are
+ * never discarded. This means that the cursor may or may not hold a lock.
+ * In the case where we are descending the tree we always want to unlock
+ * the held interior page so we use ACQUIRE_COUPLE.
+ */
+#undef ACQUIRE
+#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, flags, ret) do { \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ if ((pagep) != NULL) { \
+ ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, pagep, dbc->priority); \
+ pagep = NULL; \
+ } else \
+ ret = 0; \
+ if ((ret) == 0 && STD_LOCKING(dbc)) \
+ ret = __db_lget( \
+ dbc, LCK_COUPLE, lpgno, mode, flags, &(lock)); \
+ if ((ret) == 0) \
+ ret = __memp_fget(__mpf, &(fpgno), \
+ (dbc)->thread_info, (dbc)->txn, 0, &(pagep)); \
+} while (0)
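A sketch, not part of the diff, of the lock-coupling order the ACQUIRE macro implements with LCK_COUPLE while descending: the next page's lock is acquired before the current page's lock is fully given up, so no writer can modify the child between the two steps. The node type and helpers below are hypothetical:

    #include <pthread.h>

    typedef struct node {
        pthread_rwlock_t lk;
        int is_leaf;
        struct node *child[2];
        int key;
    } node_t;

    /* Descend to the leaf covering `key`, coupling read locks: the
     * child is locked before the parent is unlocked. */
    static node_t *
    descend(node_t *root, int key)
    {
        node_t *cur, *next;

        cur = root;
        pthread_rwlock_rdlock(&cur->lk);
        while (!cur->is_leaf) {
            next = cur->child[key >= cur->key];
            pthread_rwlock_rdlock(&next->lk);   /* child first ... */
            pthread_rwlock_unlock(&cur->lk);    /* ... then parent */
            cur = next;
        }
        return (cur);           /* returned with its lock still held */
    }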
+
+/* Acquire a new page/lock for a cursor. */
+#undef ACQUIRE_CUR
+#define ACQUIRE_CUR(dbc, mode, p, flags, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ if (p != __cp->pgno) \
+ __cp->pgno = PGNO_INVALID; \
+ ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, flags, ret); \
+ if ((ret) == 0) { \
+ __cp->pgno = p; \
+ __cp->lock_mode = (mode); \
+ } \
+} while (0)
+
+/*
+ * Acquire a write lock if we don't already have one.
+ *
+ * !!!
+ * See ACQUIRE macro on why we handle cursors that don't have locks.
+ */
+#undef ACQUIRE_WRITE_LOCK
+#define ACQUIRE_WRITE_LOCK(dbc, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __get_page = 0; \
+ ret = 0; \
+ if (STD_LOCKING(dbc) && __cp->lock_mode != DB_LOCK_WRITE) { \
+ if (__cp->page != NULL) { \
+ (ret) = __memp_fput(__mpf, (dbc)->thread_info, \
+ __cp->page, (dbc)->priority); \
+ __cp->page = NULL; \
+ __get_page = 1; \
+ if ((ret) !=0) \
+ break; \
+ } \
+ if (((ret) = __db_lget((dbc), \
+ LOCK_ISSET(__cp->lock) ? LCK_COUPLE : 0, \
+ __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) != 0) \
+ break; \
+ __cp->lock_mode = DB_LOCK_WRITE; \
+ if (__get_page == 0) \
+ break; \
+ (ret) = __memp_fget(__mpf, &__cp->pgno, \
+ (dbc)->thread_info, \
+ (dbc)->txn, DB_MPOOL_DIRTY, &__cp->page); \
+ } \
+} while (0)
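One detail of ACQUIRE_WRITE_LOCK worth calling out: its break statements exit the macro's own do { ... } while (0) wrapper, not any loop at the call site. A tiny standalone demonstration with a hypothetical macro:

    #include <stdio.h>

    /* `break` inside a do { ... } while (0) macro body leaves the
     * macro, never an enclosing loop at the call site. */
    #define HALVE_IF_EVEN(x, ret) do {                              \
        if ((x) % 2 != 0) {                                         \
            (ret) = -1;                                             \
            break;              /* exits the macro only */          \
        }                                                           \
        (x) /= 2;                                                   \
        (ret) = 0;                                                  \
    } while (0)

    int
    main(void)
    {
        int i, v, ret;

        for (i = 0, v = 6; i < 3; ++i) {
            HALVE_IF_EVEN(v, ret); /* the for loop keeps running */
            printf("v=%d ret=%d\n", v, ret);
        }
        return (0);
    }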
+
+/* Discard the current page/lock for a cursor. */
+#undef DISCARD_CUR
+#define DISCARD_CUR(dbc, ret) do { \
+ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \
+ DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \
+ int __t_ret; \
+ if ((__cp->page) != NULL) { \
+ __t_ret = __memp_fput(__mpf, \
+ (dbc)->thread_info, __cp->page, dbc->priority);\
+ __cp->page = NULL; \
+ } else \
+ __t_ret = 0; \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ __t_ret = __TLPUT((dbc), __cp->lock); \
+ if (__t_ret != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ if ((ret) == 0 && !LOCK_ISSET(__cp->lock)) \
+ __cp->lock_mode = DB_LOCK_NG; \
+ __cp->stream_start_pgno = PGNO_INVALID; \
+} while (0)
+
+/* If on-page item is a deleted record. */
+#undef IS_DELETED
+#define IS_DELETED(dbp, page, indx) \
+ B_DISSET(GET_BKEYDATA(dbp, page, \
+ (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type)
+#undef IS_CUR_DELETED
+#define IS_CUR_DELETED(dbc) \
+ IS_DELETED((dbc)->dbp, (dbc)->internal->page, (dbc)->internal->indx)
+
+/*
+ * Test to see if two cursors could point to duplicates of the same key.
+ * In the case of off-page duplicates they are the same, as the cursors
+ * will be in the same off-page duplicate tree. In the case of on-page
+ * duplicates, the key index offsets must be the same. For the last test,
+ * as the original cursor may not have a valid page pointer, we use the
+ * current cursor's.
+ */
+#undef IS_DUPLICATE
+#define IS_DUPLICATE(dbc, i1, i2) \
+ (P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i1] == \
+ P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i2])
+#undef IS_CUR_DUPLICATE
+#define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \
+ (F_ISSET(dbc, DBC_OPD) || \
+ (orig_pgno == (dbc)->internal->pgno && \
+ IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx)))
+
+/*
+ * __bamc_init --
+ * Initialize the access private portion of a cursor
+ *
+ * PUBLIC: int __bamc_init __P((DBC *, DBTYPE));
+ */
+int
+__bamc_init(dbc, dbtype)
+ DBC *dbc;
+ DBTYPE dbtype;
+{
+ ENV *env;
+ int ret;
+#ifdef HAVE_COMPRESSION
+ BTREE_CURSOR *cp;
+#endif
+
+ env = dbc->env;
+
+ /* Allocate/initialize the internal structure. */
+ if (dbc->internal == NULL) {
+ if ((ret = __os_calloc(
+ env, 1, sizeof(BTREE_CURSOR), &dbc->internal)) != 0)
+ return (ret);
+
+#ifdef HAVE_COMPRESSION
+ cp = (BTREE_CURSOR*)dbc->internal;
+ cp->compressed.flags = DB_DBT_USERMEM;
+ cp->key1.flags = DB_DBT_USERMEM;
+ cp->key2.flags = DB_DBT_USERMEM;
+ cp->data1.flags = DB_DBT_USERMEM;
+ cp->data2.flags = DB_DBT_USERMEM;
+ cp->del_key.flags = DB_DBT_USERMEM;
+ cp->del_data.flags = DB_DBT_USERMEM;
+#endif
+ }
+
+ /* Initialize methods. */
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ if (dbtype == DB_BTREE) {
+ dbc->am_bulk = __bam_bulk;
+ dbc->am_close = __bamc_close;
+ dbc->am_del = __bamc_del;
+ dbc->am_destroy = __bamc_destroy;
+ dbc->am_get = __bamc_get;
+ dbc->am_put = __bamc_put;
+ dbc->am_writelock = __bamc_writelock;
+ } else {
+ dbc->am_bulk = __bam_bulk;
+ dbc->am_close = __bamc_close;
+ dbc->am_del = __ramc_del;
+ dbc->am_destroy = __bamc_destroy;
+ dbc->am_get = __ramc_get;
+ dbc->am_put = __ramc_put;
+ dbc->am_writelock = __bamc_writelock;
+ }
+
+ return (0);
+}
+
+/*
+ * __bamc_refresh
+ * Set things up properly for cursor re-use.
+ *
+ * PUBLIC: int __bamc_refresh __P((DBC *));
+ */
+int
+__bamc_refresh(dbc)
+ DBC *dbc;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * If our caller set the root page number, it's because the root was
+ * known. This is always the case for off page dup cursors. Else,
+ * pull it out of our internal information.
+ */
+ if (cp->root == PGNO_INVALID)
+ cp->root = t->bt_root;
+
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+
+ if (cp->sp == NULL) {
+ cp->sp = cp->stack;
+ cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]);
+ }
+ BT_STK_CLR(cp);
+
+#ifdef HAVE_COMPRESSION
+ /* Initialize compression */
+ cp->prevKey = 0;
+ cp->prevData = 0;
+ cp->currentKey = 0;
+ cp->currentData = 0;
+ cp->compcursor = 0;
+ cp->compend = 0;
+ cp->prevcursor = 0;
+ cp->prev2cursor = 0;
+#endif
+
+ /*
+ * The btree leaf page data structures require that two key/data pairs
+ * (or four items) fit on a page, but other than that there's no fixed
+ * requirement. The btree off-page duplicates only require two items,
+ * to be exact, but requiring four for them as well seems reasonable.
+ *
+ * Recno uses the btree bt_ovflsize value -- it's close enough.
+ */
+ cp->ovflsize = B_MINKEY_TO_OVFLSIZE(
+ dbp, F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey, dbp->pgsize);
+
+ cp->recno = RECNO_OOB;
+ cp->order = INVALID_ORDER;
+ cp->flags = 0;
+
+ /* Initialize for record numbers. */
+ if (F_ISSET(dbc, DBC_OPD) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_AM_RECNUM)) {
+ F_SET(cp, C_RECNUM);
+
+ /*
+ * All btrees that support record numbers, optionally standard
+ * recno trees, and all off-page duplicate recno trees have
+ * mutable record numbers.
+ */
+ if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) ||
+ F_ISSET(dbp, DB_AM_RECNUM | DB_AM_RENUMBER))
+ F_SET(cp, C_RENUMBER);
+ }
+
+ return (0);
+}
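The ovflsize set here caps how large an item may be stored on a leaf page before it is pushed to overflow pages: with a minimum of minkey key/data pairs per page, each item gets a proportional share of the usable space. A rough standalone sketch of that proportionality, not the real B_MINKEY_TO_OVFLSIZE macro (the overhead constants are hypothetical, and the real macro accounts for per-item header sizes differently):

    /* Hypothetical overheads, not the DB constants. */
    #define PAGE_OVERHEAD   26      /* page header */
    #define ITEM_OVERHEAD   8       /* per-item header + index slot */

    /* If minkey key/data pairs must fit, each of the 2 * minkey items
     * may occupy at most its share of the usable page space. */
    static unsigned
    ovflsize(unsigned pgsize, unsigned minkey)
    {
        return ((pgsize - PAGE_OVERHEAD) / (minkey * 2) - ITEM_OVERHEAD);
    }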
+
+/*
+ * __bamc_close --
+ * Close down the cursor.
+ */
+static int
+__bamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ BTREE_CURSOR *cp, *cp_opd, *cp_c;
+ DB *dbp;
+ DBC *dbc_opd, *dbc_c;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ int cdb_lock, count, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp_opd = (dbc_opd = cp->opd) == NULL ?
+ NULL : (BTREE_CURSOR *)dbc_opd->internal;
+ cdb_lock = ret = 0;
+
+ /*
+ * There are 3 ways this function is called:
+ *
+ * 1. Closing a primary cursor: we get called with a pointer to a
+ * primary cursor that has a NULL opd field. This happens when
+ * closing a btree/recno database cursor without an associated
+ * off-page duplicate tree.
+ *
+ * 2. Closing a primary and an off-page duplicate cursor stack: we
+ * get called with a pointer to the primary cursor which has a
+ * non-NULL opd field. This happens when closing a btree cursor
+ * into database with an associated off-page btree/recno duplicate
+ * tree. (It can't be a primary recno database, recno databases
+ * don't support duplicates.)
+ *
+ * 3. Closing an off-page duplicate cursor stack: we get called with
+ * a pointer to the off-page duplicate cursor. This happens when
+ * closing a non-btree database that has an associated off-page
+ * btree/recno duplicate tree or for a btree database when the
+ * opd tree is not empty (root_pgno == PGNO_INVALID).
+ *
+ * If either the primary or off-page duplicate cursor deleted a btree
+ * key/data pair, check to see if the item is still referenced by a
+ * different cursor. If it is, confirm that cursor's delete flag is
+ * set and leave it to that cursor to do the delete.
+ *
+ * NB: The test for == 0 below is correct. Our caller already removed
+ * our cursor argument from the active queue, we won't find it when we
+ * search the queue in __bam_ca_delete().
+ * NB: It can't be true that both the primary and off-page duplicate
+ * cursors have deleted a btree key/data pair. Either the primary
+ * cursor may have deleted an item and there's no off-page duplicate
+ * cursor, or there's an off-page duplicate cursor and it may have
+ * deleted an item.
+ *
+ * Primary recno databases aren't an issue here. Recno keys are either
+ * deleted immediately or never deleted, and do not have to be handled
+ * here.
+ *
+ * Off-page duplicate recno databases are an issue here, cases #2 and
+ * #3 above can both be off-page recno databases. The problem is the
+ * same as the final problem for off-page duplicate btree databases.
+ * If we no longer need the off-page duplicate tree, we want to remove
+ * it. For off-page duplicate btrees, we are done with the tree when
+ * we delete the last item it contains, i.e., there can be no further
+ * references to it when it's empty. For off-page duplicate recnos,
+ * we remove items from the tree as the application calls the remove
+ * function, so we are done with the tree when we close the last cursor
+ * that references it.
+ *
+ * We optionally take the root page number from our caller. If the
+ * primary database is a btree, we can get it ourselves because dbc
+ * is the primary cursor. If the primary database is not a btree,
+ * the problem is that we may be dealing with a stack of pages. The
+ * cursor we're using to do the delete points at the bottom of that
+ * stack and we need the top of the stack.
+ */
+ if (F_ISSET(cp, C_DELETED)) {
+ dbc_c = dbc;
+ switch (dbc->dbtype) {
+ case DB_BTREE: /* Case #1, #3. */
+ if ((ret = __bam_ca_delete(
+ dbp, cp->pgno, cp->indx, 1, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if (!F_ISSET(dbc, DBC_OPD)) /* Case #1. */
+ goto done;
+ /* Case #3. */
+ if ((ret = __ram_ca_delete(dbp, cp->root, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ env, "DbCursor.close", dbc->dbtype);
+ goto err;
+ }
+ }
+
+ if (dbc_opd == NULL)
+ goto done;
+
+ if (F_ISSET(cp_opd, C_DELETED)) { /* Case #2. */
+ /*
+ * We will not have been provided a root page number. Acquire
+ * one from the primary database.
+ */
+ if ((h = cp->page) == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+ root_pgno = GET_BOVERFLOW(dbp, h, cp->indx + O_INDX)->pgno;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ cp->page = NULL;
+
+ dbc_c = dbc_opd;
+ switch (dbc_opd->dbtype) {
+ case DB_BTREE:
+ if ((ret = __bam_ca_delete(
+ dbp, cp_opd->pgno, cp_opd->indx, 1, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_RECNO:
+ if ((ret =
+ __ram_ca_delete(dbp, cp_opd->root, &count)) != 0)
+ goto err;
+ if (count == 0)
+ goto lock;
+ goto done;
+ case DB_HASH:
+ case DB_QUEUE:
+ case DB_UNKNOWN:
+ default:
+ ret = __db_unknown_type(
+ env, "DbCursor.close", dbc->dbtype);
+ goto err;
+ }
+ }
+ goto done;
+
+lock: cp_c = (BTREE_CURSOR *)dbc_c->internal;
+
+ /*
+ * If this is CDB, upgrade the lock if necessary. While we acquired
+ * the write lock to logically delete the record, we released it when
+ * we returned from that call, and so may not be holding a write lock
+ * at the moment.
+ */
+ if (CDB_LOCKING(env)) {
+ if (F_ISSET(dbc, DBC_WRITECURSOR)) {
+ if ((ret = __lock_get(env,
+ dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt,
+ DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ cdb_lock = 1;
+ }
+ goto delete;
+ }
+
+ /*
+ * The variable dbc_c has been initialized to reference the cursor in
+ * which we're going to do the delete. Initialize the cursor's lock
+ * structures as necessary.
+ *
+ * First, we may not need to acquire any locks. If we're in case #3,
+ * that is, the primary database isn't a btree database, our caller
+ * is responsible for acquiring any necessary locks before calling us.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ goto delete;
+
+ /*
+ * Otherwise, acquire a write lock on the primary database's page.
+ *
+ * Lock the primary database page, regardless of whether we're deleting
+ * an item on a primary database page or an off-page duplicates page.
+ *
+ * If the cursor that did the initial logical deletion (and had a write
+ * lock) is not the same cursor doing the physical deletion (which may
+ * have only ever had a read lock on the item), we need to upgrade to a
+ * write lock. The confusion comes as follows:
+ *
+ * C1 created, acquires item read lock
+ * C2 dup C1, create C2, also has item read lock.
+ * C1 acquire write lock, delete item
+ * C1 close
+ * C2 close, needs a write lock to physically delete item.
+ *
+ * If we're in a TXN, we know that C2 will be able to acquire the write
+ * lock, because no locker other than the one shared by C1 and C2 can
+ * acquire a write lock -- the original write lock C1 acquired was never
+ * discarded.
+ *
+ * If we're not in a TXN, it's nastier. Other cursors might acquire
+ * read locks on the item after C1 closed, discarding its write lock,
+ * and such locks would prevent C2 from acquiring a read lock. That's
+ * OK, though, we'll simply wait until we can acquire a write lock, or
+ * we'll deadlock. (Which better not happen, since we're not in a TXN.)
+ *
+ * There are similar scenarios with dirty reads, where the cursor may
+ * have downgraded its write lock to a was-write lock.
+ */
+ if (STD_LOCKING(dbc))
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0)
+ goto err;
+
+delete: /*
+ * If the delete occurred in a Btree, we're going to look at the page
+ * to see if the item has to be physically deleted. Otherwise, we do
+ * not need the actual page (and it may not even exist, it might have
+ * been truncated from the file after an allocation aborted).
+ *
+ * Delete the on-page physical item referenced by the cursor.
+ */
+ if (dbc_c->dbtype == DB_BTREE) {
+ if ((ret = __memp_fget(mpf, &cp_c->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp_c->page)) != 0)
+ goto err;
+ if ((ret = __bamc_physdel(dbc_c)) != 0)
+ goto err;
+ }
+
+ /*
+ * If we're not working in an off-page duplicate tree, then we're
+ * done.
+ */
+ if (!F_ISSET(dbc_c, DBC_OPD) || root_pgno == PGNO_INVALID)
+ goto done;
+
+ /*
+ * We may have just deleted the last element in the off-page duplicate
+ * tree, and closed the last cursor in the tree. For an off-page btree
+ * there are no other cursors in the tree by definition, if the tree is
+ * empty. For an off-page recno we know we have closed the last cursor
+ * in the tree because the __ram_ca_delete call above returned 0 only
+ * in that case. So, if the off-page duplicate tree is empty at this
+ * point, we want to remove it.
+ */
+ if (((h = dbc_c->internal->page) == NULL || h->pgno != root_pgno) &&
+ (ret = __memp_fget(mpf,
+ &root_pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+ if (NUM_ENT(h) == 0) {
+ if (h != dbc_c->internal->page)
+ DISCARD_CUR(dbc_c, ret);
+ else
+ dbc_c->internal->page = NULL;
+ if (ret != 0)
+ goto err;
+ if ((ret = __db_free(dbc, h)) != 0)
+ goto err;
+ } else {
+ if (h != dbc_c->internal->page && (ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ goto done;
+ }
+
+ /*
+ * When removing the tree, we have to do one of two things. If this is
+ * case #2, that is, the primary tree is a btree, delete the key that's
+ * associated with the tree from the btree leaf page. We know we are
+ * the only reference to it and we already have the correct lock. We
+ * detect this case because the cursor that was passed to us references
+ * an off-page duplicate cursor.
+ *
+ * If this is case #3, that is, the primary tree isn't a btree, pass
+ * the information back to our caller, it's their job to do cleanup on
+ * the primary page.
+ */
+ if (dbc_opd != NULL) {
+ if ((ret = __memp_fget(mpf, &cp->pgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0)
+ goto err;
+ if ((ret = __bamc_physdel(dbc)) != 0)
+ goto err;
+ } else
+ *rmroot = 1;
+err:
+done: /*
+ * Discard the page references and locks, and confirm that the stack
+ * has been emptied.
+ */
+ if (dbc_opd != NULL)
+ DISCARD_CUR(dbc_opd, ret);
+ DISCARD_CUR(dbc, ret);
+
+ /* Downgrade any CDB lock we acquired. */
+ if (cdb_lock)
+ (void)__lock_downgrade(env, &dbc->mylock, DB_LOCK_IWRITE, 0);
+
+ return (ret);
+}
+
+/*
+ * __bamc_cmp --
+ * Compare two btree cursors for equality.
+ *
+ * This function is only called with two cursors that point to the same item.
+ * It only distinguishes cursors pointing to deleted and undeleted items at
+ * the same location.
+ *
+ * PUBLIC: int __bamc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__bamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ ENV *env;
+ BTREE_CURSOR *bcp, *obcp;
+
+ env = dbc->env;
+ bcp = (BTREE_CURSOR *)dbc->internal;
+ obcp = (BTREE_CURSOR *)other_dbc->internal;
+
+	DB_ASSERT(env, bcp->pgno == obcp->pgno);
+	DB_ASSERT(env, bcp->indx == obcp->indx);
+
+ /* Check to see if both cursors have the same deleted flag. */
+ *result =
+ ((F_ISSET(bcp, C_DELETED)) == F_ISSET(obcp, C_DELETED)) ? 0 : 1;
+ return (0);
+}
+
+/*
+ * __bamc_destroy --
+ * Close a single cursor -- internal version.
+ */
+static int
+__bamc_destroy(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ ENV *env;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ env = dbc->env;
+
+ /* Discard the structures. */
+ if (cp->sp != cp->stack)
+ __os_free(env, cp->sp);
+
+#ifdef HAVE_COMPRESSION
+ /* Free the memory used for compression */
+ __os_free(env, cp->compressed.data);
+ __os_free(env, cp->key1.data);
+ __os_free(env, cp->key2.data);
+ __os_free(env, cp->data1.data);
+ __os_free(env, cp->data2.data);
+ __os_free(env, cp->del_key.data);
+ __os_free(env, cp->del_data.data);
+#endif
+
+ __os_free(env, cp);
+
+ return (0);
+}
+
+/*
+ * __bamc_count --
+ * Return a count of on and off-page duplicates.
+ *
+ * PUBLIC: int __bamc_count __P((DBC *, db_recno_t *));
+ */
+int
+__bamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ db_indx_t indx, top;
+ db_recno_t recno;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Called with the top-level cursor that may reference an off-page
+ * duplicates tree. We don't have to acquire any new locks, we have
+ * to have a read lock to even get here.
+ */
+ if (cp->opd == NULL) {
+ /*
+ * On-page duplicates, get the page and count.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * Move back to the beginning of the set of duplicates and
+ * then count forward.
+ */
+ for (indx = cp->indx;; indx -= P_INDX)
+ if (indx == 0 ||
+ !IS_DUPLICATE(dbc, indx, indx - P_INDX))
+ break;
+ for (recno = 0,
+ top = NUM_ENT(cp->page) - P_INDX;; indx += P_INDX) {
+ if (!IS_DELETED(dbp, cp->page, indx))
+ ++recno;
+ if (indx == top ||
+ !IS_DUPLICATE(dbc, indx, indx + P_INDX))
+ break;
+ }
+ } else {
+ /*
+ * Off-page duplicates tree, get the root page of the off-page
+ * duplicate tree.
+ */
+ if ((ret = __memp_fget(mpf, &cp->opd->internal->root,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * If the page is an internal page use the page's count as it's
+ * up-to-date and reflects the status of cursors in the tree.
+ * If the page is a leaf page for unsorted duplicates, use the
+ * page's count as cursors don't mark items deleted on the page
+ * and wait; cursors delete items immediately.
+ * If the page is a leaf page for sorted duplicates, there may
+ * be cursors on the page marking deleted items -- count.
+ */
+ if (TYPE(cp->page) == P_LDUP)
+ for (recno = 0, indx = 0,
+ top = NUM_ENT(cp->page) - O_INDX;; indx += O_INDX) {
+ if (!IS_DELETED(dbp, cp->page, indx))
+ ++recno;
+ if (indx == top)
+ break;
+ }
+ else
+ recno = RE_NREC(cp->page);
+ }
+
+ *recnop = recno;
+
+ ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+ cp->page = NULL;
+
+ return (ret);
+}
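A standalone sketch, not part of the diff, of the on-page count above: leaf entries are key/data pairs (P_INDX == 2), duplicates of one key share the same key offset in the page's index array, and logically deleted entries are skipped. The arrays below are hypothetical stand-ins for the page structures:

    /* inp[]: per-entry key offsets; deleted[]: per-entry delete flags;
     * nent: number of entries; indx: any entry in the duplicate set. */
    static unsigned
    count_dups(const unsigned *inp, const int *deleted,
        unsigned nent, unsigned indx)
    {
        unsigned i, n;

        /* Move back to the first entry of the duplicate set... */
        while (indx >= 2 && inp[indx] == inp[indx - 2])
            indx -= 2;
        /* ...then count forward over entries sharing the key. */
        for (n = 0, i = indx; i < nent && inp[i] == inp[indx]; i += 2)
            if (!deleted[i])
                ++n;
        return (n);
    }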
+
+/*
+ * __bamc_del --
+ * Delete using a cursor.
+ */
+static int
+__bamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ int count, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+ COMPQUIET(flags, 0);
+
+ /* If the item was already deleted, return failure. */
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_KEYEMPTY);
+
+ /*
+ * This code is always called with a page lock but no page.
+ */
+ DB_ASSERT(dbp->env, cp->page == NULL);
+
+ /*
+ * We don't physically delete the record until the cursor moves, so
+ * we have to have a long-lived write lock on the page instead of a
+ * long-lived read lock. Note, we have to have a read lock to even
+ * get here.
+ *
+ * If we're maintaining record numbers, we lock the entire tree, else
+ * we lock the single page.
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ if ((ret = __bamc_getstack(dbc)) != 0)
+ goto err;
+ cp->page = cp->csp->page;
+ } else {
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, cp->pgno, 0, ret);
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Mark the page dirty. */
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cdel_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Set the intent-to-delete flag on the page. */
+ if (TYPE(cp->page) == P_LBTREE)
+ B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx + O_INDX)->type);
+ else
+ B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type);
+
+err: /*
+ * If we've been successful so far and the tree has record numbers,
+ * adjust the record counts. Either way, release acquired page(s).
+ */
+ if (F_ISSET(cp, C_RECNUM)) {
+ cp->csp->page = cp->page;
+ if (ret == 0)
+ ret = __bam_adjust(dbc, -1);
+ (void)__bam_stkrel(dbc, 0);
+ } else
+ if (cp->page != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ cp->page = NULL;
+
+ /*
+ * Update the cursors last, after all chance of recoverable failure
+ * is past.
+ */
+ if (ret == 0)
+ ret = __bam_ca_delete(dbp, cp->pgno, cp->indx, 1, &count);
+
+ return (ret);
+}
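The intent-to-delete flag set above is the logical half of btree deletion; physical removal is deferred to __bamc_physdel, once no cursor still references the item. A sketch of the flag-bit mechanics (the 0x80 value and macro names here are assumptions, not necessarily the DB constants):

    #include <stdio.h>

    #define B_DELETE_BIT    0x80            /* assumed flag value */
    #define DSET(t)         ((t) |= B_DELETE_BIT)
    #define DCLR(t)         ((t) &= ~B_DELETE_BIT)
    #define DISSET(t)       (((t) & B_DELETE_BIT) != 0)

    int
    main(void)
    {
        unsigned char type = 0x01;      /* hypothetical item type byte */

        DSET(type);                     /* logical delete: flag only */
        printf("deleted=%d type=0x%02x\n", DISSET(type), type);
        DCLR(type);                     /* e.g., the delete is undone */
        printf("deleted=%d type=0x%02x\n", DISSET(type), type);
        return (0);
    }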
+
+/*
+ * __bamc_dup --
+ * Duplicate a btree cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __bamc_dup __P((DBC *, DBC *, u_int32_t));
+ */
+int
+__bamc_dup(orig_dbc, new_dbc, flags)
+ DBC *orig_dbc, *new_dbc;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *orig, *new;
+
+ orig = (BTREE_CURSOR *)orig_dbc->internal;
+ new = (BTREE_CURSOR *)new_dbc->internal;
+
+ new->ovflsize = orig->ovflsize;
+ new->recno = orig->recno;
+ new->flags = orig->flags;
+
+#ifdef HAVE_COMPRESSION
+ /* Copy the compression state */
+ return (__bamc_compress_dup(orig_dbc, new_dbc, flags));
+#else
+ COMPQUIET(flags, 0);
+
+ return (0);
+#endif
+}
+
+/*
+ * __bamc_get --
+ * Get using a cursor (btree).
+ */
+static int
+__bamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ db_pgno_t orig_pgno;
+ db_indx_t orig_indx;
+ int exact, newopd, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ orig_pgno = cp->pgno;
+ orig_indx = cp->indx;
+
+ newopd = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ /* It's not possible to return a deleted record. */
+ if (F_ISSET(cp, C_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+ * write lock, but upgrading to a write lock has no better
+ * chance of succeeding now than later, so don't try.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_FIRST:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * There are two ways to get here based on DBcursor->get
+ * with the DB_GET_BOTH/DB_GET_BOTH_RANGE flags set:
+ *
+ * 1. Searching a sorted off-page duplicate tree: do a tree
+ * search.
+ *
+ * 2. Searching btree: do a tree search. If it returns a
+ * reference to off-page duplicate tree, return immediately
+ * and let our caller deal with it. If the search doesn't
+ * return a reference to off-page duplicate tree, continue
+ * with an on-page search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, data, flags, &exact)) != 0)
+ goto err;
+ if (flags == DB_GET_BOTH) {
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ }
+
+ /*
+ * We didn't require an exact match, so the search may
+ * have returned an entry past the end of the page,
+ * or we may be referencing a deleted record. If so,
+ * move to the next entry.
+ */
+ if ((cp->indx == NUM_ENT(cp->page) ||
+ IS_CUR_DELETED(dbc)) &&
+ (ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ } else {
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, key, flags, &exact)) != 0)
+ return (ret);
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop)) {
+ newopd = 1;
+ break;
+ }
+ if ((ret =
+ __bam_getboth_finddatum(dbc, data, flags)) != 0)
+ goto err;
+ }
+ break;
+#ifdef HAVE_COMPRESSION
+ case DB_SET_LTE:
+ if ((ret = __bam_getlte(dbc, key, NULL)) != 0)
+ goto err;
+ break;
+ case DB_GET_BOTH_LTE:
+ if ((ret = __bam_getlte(dbc, key, data)) != 0)
+ goto err;
+ break;
+#endif
+ case DB_GET_BOTHC:
+ if ((ret = __bam_getbothc(dbc, data)) != 0)
+ goto err;
+ break;
+ case DB_LAST:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_NEXT:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0)
+ goto err;
+ } else
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ break;
+ case DB_NEXT_DUP:
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_NEXT_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bamc_next(dbc, 1, 0)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_PREV:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ goto err;
+ } else
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ break;
+ case DB_PREV_DUP:
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_PREV_NODUP:
+ newopd = 1;
+ if (cp->pgno == PGNO_INVALID) {
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ goto err;
+ } else
+ do {
+ if ((ret = __bamc_prev(dbc)) != 0)
+ goto err;
+ } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx));
+ break;
+ case DB_SET:
+ case DB_SET_RECNO:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, key, flags, &exact)) != 0)
+ goto err;
+ break;
+ case DB_SET_RANGE:
+ newopd = 1;
+ if ((ret = __bamc_search(dbc,
+ PGNO_INVALID, key, flags, &exact)) != 0)
+ goto err;
+
+ /*
+ * As we didn't require an exact match, the search function
+ * may have returned an entry past the end of the page. Or,
+ * we may be referencing a deleted record. If so, move to
+ * the next entry.
+ */
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc))
+ if ((ret = __bamc_next(dbc, 0, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_get", flags);
+ goto err;
+ }
+
+ /*
+ * We may have moved to an off-page duplicate tree. Return that
+ * information to our caller.
+ */
+ if (newopd && pgnop != NULL)
+ (void)__bam_isopd(dbc, pgnop);
+
+err: /*
+ * Regardless of whether we were successful or not, if the cursor
+ * moved, clear the delete flag, DBcursor->get never references a
+ * deleted key, if it moved at all.
+ */
+ if (F_ISSET(cp, C_DELETED) &&
+ (cp->pgno != orig_pgno || cp->indx != orig_indx))
+ F_CLR(cp, C_DELETED);
+
+ return (ret);
+}
+
+static int
+__bam_get_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DBT key, data;
+ db_pgno_t pgno;
+ int ret;
+
+ if ((ret = __bamc_prev(dbc)) != 0)
+ return (ret);
+
+ if (__bam_isopd(dbc, &pgno)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ return (ret);
+ if ((ret = cp->opd->am_get(cp->opd,
+ &key, &data, DB_LAST, NULL)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __bam_bulk -- Return bulk data from a btree.
+ */
+static int
+__bam_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ PAGE *pg;
+ db_indx_t *inp, indx, pg_keyoff;
+ int32_t *endp, key_off, *offp, *saveoffp;
+ u_int8_t *dbuf, *dp, *np;
+ u_int32_t key_size, pagesize, size, space;
+ int adj, is_key, need_pg, next_key, no_dup, rec_key, ret;
+
+ ret = 0;
+ key_off = 0;
+ size = 0;
+ pagesize = dbc->dbp->pgsize;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * dp tracks the beginning of the page in the buffer.
+ * np is the next place to copy things into the buffer.
+ * dbuf always stays at the beginning of the buffer.
+ */
+ dbuf = data->data;
+ np = dp = dbuf;
+
+ /* Keep track of space that is left; a termination entry is reserved. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (int32_t *)((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+
+ key_size = 0;
+
+ /*
+ * Distinguish between BTREE and RECNO.
+ * There are no keys in RECNO. If MULTIPLE_KEY is specified
+ * then we return the record numbers.
+ * is_key indicates that multiple btree keys are returned.
+ * rec_key is set if we are returning record numbers.
+ * next_key is set if we are going after the next key rather than dup.
+ */
+ if (dbc->dbtype == DB_BTREE) {
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1: 0;
+ rec_key = 0;
+ next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ adj = 2;
+ } else {
+ is_key = 0;
+ rec_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ next_key = LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ adj = 1;
+ }
+ no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
+
+next_pg:
+ indx = cp->indx;
+ pg = cp->page;
+
+ inp = P_INP(dbc->dbp, pg);
+ /* The current page is not yet in the buffer. */
+ need_pg = 1;
+
+ /*
+ * Keep track of the offset of the current key on the page.
+ * If we are returning keys, set it to 0 first so we force
+ * the copy of the key to the buffer.
+ */
+ pg_keyoff = 0;
+ if (is_key == 0)
+ pg_keyoff = inp[indx];
+
+ do {
+ if (IS_DELETED(dbc->dbp, pg, indx)) {
+ if (dbc->dbtype != DB_RECNO)
+ continue;
+
+ cp->recno++;
+ /*
+ * If we are not returning recnos then we
+ * need to fill in every slot so the user
+ * can calculate the record numbers.
+ */
+ if (rec_key != 0)
+ continue;
+
+ space -= 2 * sizeof(*offp);
+ /* Check if space has underflowed. */
+ if (space > data->ulen)
+ goto back_up;
+
+ /* Just mark the empty recno slots. */
+ *offp-- = 0;
+ *offp-- = 0;
+ continue;
+ }
+
+ /*
+ * Check to see if we have a new key.
+ * If so, then see if we need to put the
+ * key into the buffer. If it's already there
+ * then we just point to it.
+ */
+ if (is_key && pg_keyoff != inp[indx]) {
+ bk = GET_BKEYDATA(dbc->dbp, pg, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = key_size = bo->tlen;
+ if (key_size > space)
+ goto get_key_space;
+ if ((ret = __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= key_size;
+ key_off = (int32_t)(np - dbuf);
+ np += key_size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+get_key_space:
+ /* If nothing was added, then error. */
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ pagesize, 1024);
+ return
+ (DB_BUFFER_SMALL);
+ }
+ /*
+ * We need to back up to the
+ * last record put into the
+ * buffer so that it is
+ * CURRENT.
+ */
+ if (indx != 0)
+ indx -= P_INDX;
+ else {
+ if ((ret =
+ __bam_get_prev(
+ dbc)) != 0)
+ return (ret);
+ indx = cp->indx;
+ pg = cp->page;
+ }
+ break;
+ }
+ /*
+ * Move the data part of the page
+ * to the buffer.
+ */
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ key_size = bk->len;
+ key_off = (int32_t)((inp[indx] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ pg_keyoff = inp[indx];
+ }
+ }
+
+ /*
+ * Reserve space for the pointers and sizes.
+ * Either key/data pair or just for a data item.
+ */
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (rec_key)
+ space -= sizeof(*offp);
+
+ /* Check to see if space has underflowed. */
+ if (space > data->ulen)
+ goto back_up;
+
+ /*
+ * Determine if the next record is in the
+ * buffer already or if it needs to be copied in.
+ * If we have an off page dup, then copy as many
+ * as will fit into the buffer.
+ */
+ bk = GET_BKEYDATA(dbc->dbp, pg, indx + adj - 1);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ bo = (BOVERFLOW *)bk;
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ /*
+ * We pass the offset of the current key.
+ * On return we check to see if offp has
+ * moved to see if any data fit.
+ */
+ saveoffp = offp;
+ if ((ret = __bam_bulk_duplicates(dbc, bo->pgno,
+ dbuf, is_key ? offp + P_INDX : NULL,
+ &offp, &np, &space, no_dup)) != 0) {
+ if (ret == DB_BUFFER_SMALL) {
+ size = space;
+ space = 0;
+ /* If nothing was added, then error. */
+ if (offp == saveoffp) {
+ offp += 2;
+ goto back_up;
+ }
+ goto get_space;
+ }
+ return (ret);
+ }
+ } else if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = bo->tlen;
+ if (size > space)
+ goto back_up;
+ if ((ret =
+ __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= size;
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ } else if (rec_key)
+ *offp-- = (int32_t)cp->recno;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ *offp-- = (int32_t)size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+back_up:
+ /*
+ * Back up the index so that the
+ * last record in the buffer is CURRENT
+ */
+ if (indx >= adj)
+ indx -= adj;
+ else {
+ if ((ret =
+ __bam_get_prev(dbc)) != 0 &&
+ ret != DB_NOTFOUND)
+ return (ret);
+ indx = cp->indx;
+ pg = cp->page;
+ }
+ if (dbc->dbtype == DB_RECNO)
+ cp->recno--;
+get_space:
+ /*
+ * See if we put anything in the
+ * buffer, or, if we are doing a
+ * DBP->get, whether we got all of
+ * the data.
+ */
+ if (offp >=
+ (is_key ? &endp[-1] : endp) ||
+ F_ISSET(dbc, DBC_FROM_DB_GET)) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ break;
+ }
+ memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ /*
+ * Add the offsets and sizes to the end of the buffer.
+ * First add the key info then the data info.
+ */
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ } else if (rec_key)
+ *offp-- = (int32_t)cp->recno;
+ *offp-- = (int32_t)((inp[indx + adj - 1] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ *offp-- = bk->len;
+ }
+ if (dbc->dbtype == DB_RECNO)
+ cp->recno++;
+ else if (no_dup) {
+ while (indx + adj < NUM_ENT(pg) &&
+ pg_keyoff == inp[indx + adj])
+ indx += adj;
+ }
+ /*
+ * Stop when we either run off the page or we move to the next key and
+ * we are not returning multiple keys.
+ */
+ } while ((indx += adj) < NUM_ENT(pg) &&
+ (next_key || pg_keyoff == inp[indx]));
+
+ /* If we are off the page then try to move to the next page. */
+ if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
+ cp->indx = indx;
+ ret = __bamc_next(dbc, 0, 1);
+ if (ret == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ }
+
+ /*
+ * If we did a DBP->get, we must return an error if we did not
+ * return all the data for the current key: the caller has no
+ * way to know the result is incomplete, nor any interface to
+ * fetch the balance.
+ */
+
+ if (ret == 0 && indx < pg->entries &&
+ F_ISSET(dbc, DBC_TRANSIENT) && pg_keyoff == inp[indx]) {
+ data->size = (data->ulen - space) + size;
+ return (DB_BUFFER_SMALL);
+ }
+ /*
+ * Must leave the index pointing at the last record fetched.
+ * If we are not fetching keys, we may have stepped to the
+ * next key.
+ */
+ if (ret == DB_BUFFER_SMALL || next_key || pg_keyoff == inp[indx])
+ cp->indx = indx;
+ else
+ cp->indx = indx - P_INDX;
+
+ if (rec_key == 1)
+ *offp = RECNO_OOB;
+ else
+ *offp = -1;
+ return (0);
+}
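A sketch, not part of the diff, of the bulk buffer layout __bam_bulk produces: record bytes are copied forward from the start of the user buffer while an int32_t offset/length table grows backward from its end, finished with a -1 terminator (RECNO_OOB when record numbers are returned). Below, one item is hand-built and walked the way the DB_MULTIPLE reader macros would walk it; the layout code is illustrative only:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
        int32_t storage[16];            /* 64-byte, int32-aligned buffer */
        uint8_t *buf = (uint8_t *)storage;
        int32_t *offp, off, len;

        /* Writer: data grows forward, the table grows backward. */
        offp = storage + 16 - 1;
        memcpy(buf, "hello", 5);
        *offp-- = 0;                    /* offset of the item */
        *offp-- = 5;                    /* length of the item */
        *offp = -1;                     /* terminator */

        /* Reader: walk the table back from the end of the buffer. */
        for (offp = storage + 16 - 1; *offp != -1;) {
            off = *offp--;
            len = *offp--;
            printf("%.*s\n", (int)len, (const char *)buf + off);
        }
        return (0);
    }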
+
+/*
+ * __bam_bulk_overflow --
+ * Dump overflow record into the buffer.
+ * The space requirements have already been checked.
+ * PUBLIC: int __bam_bulk_overflow
+ * PUBLIC: __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *));
+ */
+int
+__bam_bulk_overflow(dbc, len, pgno, dp)
+ DBC *dbc;
+ u_int32_t len;
+ db_pgno_t pgno;
+ u_int8_t *dp;
+{
+ DBT dbt;
+
+ memset(&dbt, 0, sizeof(dbt));
+ F_SET(&dbt, DB_DBT_USERMEM);
+ dbt.ulen = len;
+ dbt.data = (void *)dp;
+ return (__db_goff(dbc, &dbt, len, pgno, NULL, NULL));
+}
+
+/*
+ * __bam_bulk_duplicates --
+ * Put as many off page duplicates as will fit into the buffer.
+ * This routine will adjust the cursor to reflect the position in
+ * the overflow tree.
+ * PUBLIC: int __bam_bulk_duplicates __P((DBC *,
+ * PUBLIC: db_pgno_t, u_int8_t *, int32_t *,
+ * PUBLIC: int32_t **, u_int8_t **, u_int32_t *, int));
+ */
+int
+__bam_bulk_duplicates(dbc, pgno, dbuf, keyoff, offpp, dpp, spacep, no_dup)
+ DBC *dbc;
+ db_pgno_t pgno;
+ u_int8_t *dbuf;
+ int32_t *keyoff, **offpp;
+ u_int8_t **dpp;
+ u_int32_t *spacep;
+ int no_dup;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBC *opd;
+ DBT key, data;
+ PAGE *pg;
+ db_indx_t indx, *inp;
+ int32_t *offp;
+ u_int32_t pagesize, size, space;
+ u_int8_t *dp, *np;
+ int first, need_pg, ret, t_ret;
+
+ ret = 0;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ opd = cp->opd;
+
+ if (opd == NULL) {
+ if ((ret = __dbc_newopd(dbc, pgno, NULL, &opd)) != 0)
+ return (ret);
+ cp->opd = opd;
+ if ((ret = opd->am_get(opd,
+ &key, &data, DB_FIRST, NULL)) != 0)
+ goto close_opd;
+ }
+
+ pagesize = opd->dbp->pgsize;
+ cp = (BTREE_CURSOR *)opd->internal;
+ space = *spacep;
+ /* Get current offset slot. */
+ offp = *offpp;
+
+ /*
+ * np is the next place to put data.
+ * dp is the beginning of the current page in the buffer.
+ */
+ np = dp = *dpp;
+ first = 1;
+ indx = cp->indx;
+
+ do {
+ /* Fetch the current record. No initial move. */
+ if ((ret = __bamc_next(opd, 0, 0)) != 0)
+ break;
+ pg = cp->page;
+ indx = cp->indx;
+ inp = P_INP(dbp, pg);
+ /* We need to copy the page to the buffer. */
+ need_pg = 1;
+
+ do {
+ if (IS_DELETED(dbp, pg, indx))
+ goto contin;
+ bk = GET_BKEYDATA(dbp, pg, indx);
+ space -= 2 * sizeof(*offp);
+ /* Allocate space for key if needed. */
+ if (first == 0 && keyoff != NULL)
+ space -= 2 * sizeof(*offp);
+
+ /* Did space underflow? */
+ if (space > *spacep) {
+ ret = DB_BUFFER_SMALL;
+ if (first == 1) {
+ /* Get the absolute value. */
+ space = -(int32_t)space;
+ space = *spacep + space;
+ if (need_pg)
+ space += pagesize - HOFFSET(pg);
+ }
+ break;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ size = bo->tlen;
+ if (size > space) {
+ ret = DB_BUFFER_SMALL;
+ space = *spacep + size;
+ break;
+ }
+ if (first == 0 && keyoff != NULL) {
+ *offp-- = keyoff[0];
+ *offp-- = keyoff[-1];
+ }
+ if ((ret = __bam_bulk_overflow(dbc,
+ bo->tlen, bo->pgno, np)) != 0)
+ return (ret);
+ space -= size;
+ *offp-- = (int32_t)(np - dbuf);
+ np += size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+ ret = DB_BUFFER_SMALL;
+ /* Return space required. */
+ space = *spacep + size;
+ break;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ if (first == 0 && keyoff != NULL) {
+ *offp-- = keyoff[0];
+ *offp-- = keyoff[-1];
+ }
+ size = bk->len;
+ *offp-- = (int32_t)((inp[indx] - HOFFSET(pg))
+ + (dp - dbuf) + SSZA(BKEYDATA, data));
+ }
+ *offp-- = (int32_t)size;
+ first = 0;
+ if (no_dup)
+ break;
+contin:
+ indx++;
+ if (opd->dbtype == DB_RECNO)
+ cp->recno++;
+ } while (indx < NUM_ENT(pg));
+ if (no_dup)
+ break;
+ cp->indx = indx;
+
+ } while (ret == 0);
+
+ /* Return the updated information. */
+ *spacep = space;
+ *offpp = offp;
+ *dpp = np;
+
+	/*
+	 * If we ran out of space, back up the cursor.
+	 * If we did not return any dups, or we reached the end, close
+	 * the opd cursor.
+	 */
+ if (ret == DB_BUFFER_SMALL) {
+ if (opd->dbtype == DB_RECNO) {
+ if (--cp->recno == 0)
+ goto close_opd;
+ } else if (indx != 0)
+ cp->indx--;
+ else {
+ t_ret = __bamc_prev(opd);
+ if (t_ret == DB_NOTFOUND)
+ goto close_opd;
+ if (t_ret != 0)
+ ret = t_ret;
+ }
+ } else if (keyoff == NULL && ret == DB_NOTFOUND) {
+ cp->indx--;
+ if (opd->dbtype == DB_RECNO)
+ --cp->recno;
+ } else if (indx == 0 || ret == DB_NOTFOUND) {
+close_opd:
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = __dbc_close(opd)) != 0 && ret == 0)
+ ret = t_ret;
+ ((BTREE_CURSOR *)dbc->internal)->opd = NULL;
+ }
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+
+ return (ret);
+}
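+
+/*
+ * Buffer layout, as implied by the code above (a sketch, not a
+ * normative spec): the bulk buffer fills from both ends.  Page images
+ * and overflow data are copied in at the front (np grows upward),
+ * while int32_t offset and length slots are written at the back (offp
+ * grows downward).  A consumer therefore starts at the last int32_t
+ * in the buffer and walks down, roughly:
+ *
+ *	int32_t *p = (int32_t *)((u_int8_t *)dbuf + ulen) - 1;
+ *	while (*p != -1) {		(or RECNO_OOB for recno keys)
+ *		use(dbuf + p[0], (u_int32_t)p[-1]);
+ *		p -= 2;
+ *	}
+ *
+ * where "use" is a hypothetical consumer.  The supported interface
+ * for this walk is the DB_MULTIPLE*() macro family in db.h.
+ */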
+
+/*
+ * __bam_getbothc --
+ * Search for a matching data item on a join.
+ */
+static int
+__bam_getbothc(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ int cmp, exact, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Acquire the current page. We have at least a read-lock
+ * already. The caller may have set DB_RMW asking for a
+	 * write lock, but upgrading to a write lock has no better
+	 * chance of succeeding now than later, so don't try.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+
+ /*
+ * An off-page duplicate cursor. Search the remaining duplicates
+ * for one which matches (do a normal btree search, then verify
+ * that the retrieved record is greater than the original one).
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ /*
+ * Check to make sure the desired item comes strictly after
+ * the current position; if it doesn't, return DB_NOTFOUND.
+ */
+ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
+ dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare,
+ &cmp)) != 0)
+ return (ret);
+
+ if (cmp <= 0)
+ return (DB_NOTFOUND);
+
+ /* Discard the current page, we're going to do a full search. */
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+
+ return (__bamc_search(dbc,
+ PGNO_INVALID, data, DB_GET_BOTH, &exact));
+ }
+
+ /*
+ * We're doing a DBC->get(DB_GET_BOTHC) and we're already searching
+ * a set of on-page duplicates (either sorted or unsorted). Continue
+ * a linear search from after the current position.
+ *
+ * (Note that we could have just finished a "set" of one duplicate,
+ * i.e. not a duplicate at all, but the following check will always
+ * return DB_NOTFOUND in this case, which is the desired behavior.)
+ */
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ return (DB_NOTFOUND);
+ cp->indx += P_INDX;
+
+ return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH));
+}
+
+#ifdef HAVE_COMPRESSION
+/*
+ * __bam_getlte --
+ * Search for the largest entry <= key/data - used by compression.
+ *
+ * data == NULL indicates the DB_SET_LTE flag
+ * data != NULL indicates the DB_GET_BOTH_LTE flag
+ *
+ * Only works for a primary cursor - not an OPD cursor. Handles the
+ * OPD manipulation as well - no need to return to the caller to
+ * perform more OPD movements.
+ */
+static int
+__bam_getlte(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ BTREE_CURSOR *cp, *ocp;
+ DB *dbp;
+ db_pgno_t pgno;
+ int exact, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /* Begin by searching for the key */
+ ret = __bamc_search(dbc, PGNO_INVALID, key, DB_SET_RANGE, &exact);
+ if (ret == DB_NOTFOUND)
+ goto find_last;
+ if (ret != 0)
+ goto end;
+
+ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) {
+ /*
+ * Move to the next entry if we're past the end of the
+ * page or on a deleted entry.
+ */
+ ret = __bamc_next(dbc, 0, 0);
+ if (ret == DB_NOTFOUND)
+ goto find_last;
+ if (ret != 0)
+ goto end;
+
+ /* Check if we're still on the correct key */
+ if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx,
+ ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ }
+
+ if (exact == 0) {
+ ret = __bam_get_prev(dbc);
+ goto end;
+ }
+
+ if (__bam_isopd(dbc, &pgno)) {
+ /*
+ * We want to do unusual things with off-page duplicates, so
+ * deal with them here rather than returning to handle them.
+ */
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ goto end;
+
+ /* Search for the correct duplicate */
+ ret = __bamc_search(cp->opd, PGNO_INVALID, data,
+ data == NULL ? DB_FIRST : DB_SET_RANGE, &exact);
+ if (ret == DB_NOTFOUND)
+ goto find_last_dup;
+ if (ret != 0)
+ goto end;
+
+ ocp = (BTREE_CURSOR *)cp->opd->internal;
+ if (ocp->indx == NUM_ENT(ocp->page) ||
+ IS_CUR_DELETED(cp->opd)) {
+ /*
+ * Move to the next entry if we're past the end of the
+ * page or on a deleted entry.
+ */
+ ret = __bamc_next(cp->opd, 0, 0);
+ if (ret == DB_NOTFOUND)
+ goto find_last_dup;
+ if (ret != 0)
+ goto end;
+
+ if (data != NULL) {
+ /* Check if we're still on the correct data */
+ if ((ret = __bam_cmp(
+ dbc, data, ocp->page, ocp->indx,
+ dbp->dup_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ } else
+ exact = 1;
+ }
+
+ if (exact == 0) {
+ /* Move to the previous entry */
+ ret = __bamc_prev(cp->opd);
+ if (ret == DB_NOTFOUND) {
+ if ((ret = __dbc_close(cp->opd)) != 0)
+ goto end;
+ cp->opd = NULL;
+ ret = __bam_get_prev(dbc);
+ }
+ }
+	} else if (data != NULL) {
+ /*
+ * If we got an exact match with on-page duplicates, we need to
+ * search in them.
+ */
+ ret = __bam_getboth_finddatum(dbc, data, DB_GET_BOTH_RANGE);
+ if (ret == DB_NOTFOUND)
+ exact = 0;
+ else if (ret != 0)
+ goto end;
+ else {
+ /* Check if we're still on the correct data */
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0)
+ goto end;
+ exact = (exact == 0);
+ }
+
+ if (exact == 0) {
+ ret = __bam_get_prev(dbc);
+ }
+ }
+
+ end:
+ return (ret);
+
+ find_last:
+ if ((ret = __bamc_search(
+ dbc, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ return (ret);
+
+ if (__bam_isopd(dbc, &pgno)) {
+ if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0)
+ return (ret);
+ find_last_dup:
+ if ((ret = __bamc_search(
+ cp->opd, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0)
+ return (ret);
+ }
+
+ return (ret);
+}
+#endif
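+
+/*
+ * The "largest entry <= key" logic above reduces to a two-step recipe
+ * (a sketch; deleted-entry handling and error paths omitted).  Since
+ * DB_SET_RANGE positions on the smallest entry >= key:
+ *
+ *	ret = __bamc_search(dbc,
+ *	    PGNO_INVALID, key, DB_SET_RANGE, &exact);
+ *	if (ret == DB_NOTFOUND)		(everything sorts below key)
+ *		find the last entry in the tree;
+ *	else if (!exact)		(we landed on a larger entry)
+ *		step back one entry;
+ *
+ * The additional branches in the real code repeat the same dance
+ * inside an off-page duplicate tree when the primary key matches
+ * exactly.
+ */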
+
+/*
+ * __bam_getboth_finddatum --
+ * Find a matching on-page data item.
+ */
+static int
+__bam_getboth_finddatum(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_indx_t base, lim, top;
+ int cmp, ret;
+
+ COMPQUIET(cmp, 0);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Called (sometimes indirectly) from DBC->get to search on-page data
+ * item(s) for a matching value. If the original flag was DB_GET_BOTH
+ * or DB_GET_BOTH_RANGE, the cursor is set to the first undeleted data
+ * item for the key. If the original flag was DB_GET_BOTHC, the cursor
+ * argument is set to the first data item we can potentially return.
+ * In both cases, there may or may not be additional duplicate data
+ * items to search.
+ *
+ * If the duplicates are not sorted, do a linear search.
+ */
+ if (dbp->dup_compare == NULL) {
+ for (;; cp->indx += P_INDX) {
+ if (!IS_CUR_DELETED(dbc) &&
+ (ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ return (0);
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX))
+ break;
+ }
+ return (DB_NOTFOUND);
+ }
+
+ /*
+ * If the duplicates are sorted, do a binary search. The reason for
+ * this is that large pages and small key/data pairs result in large
+ * numbers of on-page duplicates before they get pushed off-page.
+ *
+ * Find the top and bottom of the duplicate set. Binary search
+ * requires at least two items, don't loop if there's only one.
+ */
+ for (base = top = cp->indx; top < NUM_ENT(cp->page); top += P_INDX)
+ if (!IS_DUPLICATE(dbc, cp->indx, top))
+ break;
+ if (base == (top - P_INDX)) {
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE))
+			return (0);
+		cp->indx = top;
+		return (DB_NOTFOUND);
+ }
+
+ for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) {
+ cp->indx = base + ((lim >> 1) * P_INDX);
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0) {
+ /*
+ * XXX
+ * No duplicate duplicates in sorted duplicate sets,
+ * so there can be only one.
+ */
+ if (!IS_CUR_DELETED(dbc))
+ return (0);
+ break;
+ }
+ if (cmp > 0) {
+ base = cp->indx + P_INDX;
+ --lim;
+ }
+ }
+
+ /* No match found; if we're looking for an exact match, we're done. */
+ if (flags == DB_GET_BOTH)
+ return (DB_NOTFOUND);
+
+ /*
+ * Base is the smallest index greater than the data item, may be zero
+ * or a last + O_INDX index, and may be deleted. Find an undeleted
+ * item.
+ */
+ cp->indx = base;
+ while (cp->indx < top && IS_CUR_DELETED(dbc))
+ cp->indx += P_INDX;
+ return (cp->indx < top ? 0 : DB_NOTFOUND);
+}
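+
+/*
+ * The sorted case above is the classic "base/lim" binary search: lim
+ * counts the remaining candidates, each pass probes half-way into
+ * them, and a probe that sorts low moves base past the probe.  The
+ * same shape on a plain array (a standalone sketch; nitems, item and
+ * compare are hypothetical):
+ *
+ *	for (base = 0, lim = nitems; lim != 0; lim >>= 1) {
+ *		indx = base + (lim >> 1);
+ *		if ((cmp = compare(key, item[indx])) == 0)
+ *			return (indx);
+ *		if (cmp > 0) {
+ *			base = indx + 1;
+ *			--lim;
+ *		}
+ *	}
+ *
+ * On a miss, base is the smallest index greater than the key.  The
+ * real code scales every index by P_INDX because btree leaf pages
+ * store key and data entries in pairs.
+ */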
+
+/*
+ * __bamc_put --
+ * Put using a cursor.
+ */
+static int
+__bamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ db_pgno_t root_pgno;
+ int cmp, exact, own, ret, stack;
+ u_int32_t iiop;
+ void *arg;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+split: ret = stack = 0;
+ switch (flags) {
+ case DB_CURRENT:
+ if (F_ISSET(cp, C_DELETED))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_AFTER:
+ case DB_BEFORE:
+ iiop = flags;
+ own = 1;
+
+ /* Acquire the current page with a write lock. */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ if (ret != 0)
+ goto err;
+ if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ goto err;
+ break;
+ case DB_KEYFIRST:
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ own = 0;
+ /*
+ * Searching off-page, sorted duplicate tree: do a tree search
+ * for the correct item; __bamc_search returns the smallest
+ * slot greater than the key, use it.
+ *
+ * See comment below regarding where we can start the search.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if ((ret = __bamc_search(dbc,
+ F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno,
+ data, flags, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (exact != 0) {
+ if (flags == DB_OVERWRITE_DUP ||
+ IS_DELETED(dbp, cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /*
+ * Searching a btree.
+ *
+ * If we've done a split, we can start the search from the
+ * parent of the split page, which __bam_split returned
+ * for us in root_pgno, unless we're in a Btree with record
+ * numbering. In that case, we'll need the true root page
+ * in order to adjust the record count.
+ */
+ if ((ret = __bamc_search(dbc,
+ F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, key,
+ flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
+ DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /*
+ * If we don't have an exact match, __bamc_search returned
+ * the smallest slot greater than the key, use it.
+ */
+ if (!exact) {
+ iiop = DB_KEYFIRST;
+ break;
+
+ /*
+ * Check for NOOVERWRITE. It is possible that there
+ * is a key with an empty duplicate page attached.
+ */
+ } else if (flags == DB_NOOVERWRITE && !IS_CUR_DELETED(dbc)) {
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ ret = __bam_opd_exists(dbc, *pgnop);
+ else
+ ret = DB_KEYEXIST;
+ if (ret != 0)
+ goto err;
+ }
+
+ /*
+ * If duplicates aren't supported, replace the current item.
+ */
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+
+ /*
+ * If we find a matching entry, it may be an off-page duplicate
+ * tree. Return the page number to our caller, we need a new
+ * cursor.
+ */
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ goto done;
+
+ /* If the duplicates aren't sorted, move to the right slot. */
+ if (dbp->dup_compare == NULL) {
+ if (flags == DB_KEYFIRST)
+ iiop = DB_BEFORE;
+ else
+ for (;; cp->indx += P_INDX)
+ if (cp->indx + P_INDX >=
+ NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx,
+ cp->indx + P_INDX)) {
+ iiop = DB_AFTER;
+ break;
+ }
+ break;
+ }
+
+ /*
+ * We know that we're looking at the first of a set of sorted
+ * on-page duplicates. Walk the list to find the right slot.
+ */
+ for (;; cp->indx += P_INDX) {
+ if ((ret = __bam_cmp(dbc, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ goto err;
+ if (cmp < 0) {
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (cmp == 0) {
+ if (flags == DB_OVERWRITE_DUP ||
+ IS_DELETED(dbp, cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ P_INP(dbp, ((PAGE *)cp->page))[cp->indx] !=
+ P_INP(dbp, ((PAGE *)cp->page))[cp->indx + P_INDX]) {
+ iiop = DB_AFTER;
+ break;
+ }
+ }
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__bamc_put", flags);
+ goto err;
+ }
+
+ switch (ret = __bam_iitem(dbc, key, data, iiop, 0)) {
+ case 0:
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * To split, we need a key for the page. Either use the key
+ * argument or get a copy of the key from the page.
+ */
+ if (flags == DB_AFTER ||
+ flags == DB_BEFORE || flags == DB_CURRENT) {
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbc, cp->page, 0, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ arg = &dbt;
+ } else
+ arg = F_ISSET(dbc, DBC_OPD) ? data : key;
+
+ /*
+ * Discard any locks and pinned pages (the locks are discarded
+ * even if we're running with transactions, as they lock pages
+ * that we're sorry we ever acquired). If stack is set and the
+ * cursor entries are valid, they point to the same entries as
+ * the stack, don't free them twice.
+ */
+ if (stack)
+ ret = __bam_stkrel(dbc, STK_CLRDBC | STK_NOLOCK);
+ else
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * SR [#6059]
+		 * If we no longer own a lock on the page, clear the cursor
+		 * so we don't point at it. Even though we call __bam_stkrel
+		 * above, we may still have entered the routine with the
+		 * cursor positioned at a particular record; this is the
+		 * case when C_RECNUM is set.
+ */
+ if (own == 0) {
+ cp->pgno = PGNO_INVALID;
+ cp->indx = 0;
+ }
+
+ /* Split the tree. */
+ if ((ret = __bam_split(dbc, arg, &root_pgno)) != 0)
+ return (ret);
+
+ goto split;
+ default:
+ goto err;
+ }
+
+err:
+done: /*
+ * If we inserted a key into the first or last slot of the tree,
+ * remember where it was so we can do it more quickly next time.
+ * If the tree has record numbers, we need a complete stack so
+ * that we can adjust the record counts, so skipping the tree search
+ * isn't possible. For subdatabases we need to be careful that the
+ * page does not move from one db to another, so we track its LSN.
+ *
+ * If there are duplicates and we are inserting into the last slot,
+ * the cursor will point _to_ the last item, not after it, which
+ * is why we subtract P_INDX below.
+ */
+
+ t = dbp->bt_internal;
+ if (ret == 0 && TYPE(cp->page) == P_LBTREE &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST) &&
+ !F_ISSET(cp, C_RECNUM) &&
+ (!F_ISSET(dbp, DB_AM_SUBDB) ||
+ (LOGGING_ON(dbp->env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE))) &&
+ ((NEXT_PGNO(cp->page) == PGNO_INVALID &&
+ cp->indx >= NUM_ENT(cp->page) - P_INDX) ||
+ (PREV_PGNO(cp->page) == PGNO_INVALID && cp->indx == 0))) {
+ t->bt_lpgno = cp->pgno;
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ t->bt_llsn = LSN(cp->page);
+ } else
+ t->bt_lpgno = PGNO_INVALID;
+ /*
+ * Discard any pages pinned in the tree and their locks, except for
+ * the leaf page. Note, the leaf page participated in any stack we
+ * acquired, and so we have to adjust the stack as necessary. If
+ * there was only a single page on the stack, we don't have to free
+ * further stack pages.
+ */
+ if (stack && BT_STK_POP(cp) != NULL)
+ (void)__bam_stkrel(dbc, 0);
+
+ /*
+ * Regardless of whether we were successful or not, clear the delete
+ * flag. If we're successful, we either moved the cursor or the item
+ * is no longer deleted. If we're not successful, then we're just a
+ * copy, no need to have the flag set.
+ *
+ * We may have instantiated off-page duplicate cursors during the put,
+ * so clear the deleted bit from the off-page duplicate cursor as well.
+ */
+ F_CLR(cp, C_DELETED);
+ if (cp->opd != NULL) {
+ cp = (BTREE_CURSOR *)cp->opd->internal;
+ F_CLR(cp, C_DELETED);
+ }
+
+ return (ret);
+}
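+
+/*
+ * Control flow note for the function above (a sketch of the pattern,
+ * not new behavior): __bam_iitem fails with DB_NEEDSPLIT when the
+ * target page cannot hold the new item, __bam_split makes room, and
+ * we jump back to "split:" to redo the search from scratch:
+ *
+ *	retry:	switch (ret = do_insert()) {	(stands in for __bam_iitem)
+ *		case DB_NEEDSPLIT:
+ *			if ((ret = split_tree()) != 0)	(__bam_split)
+ *				return (ret);
+ *			goto retry;
+ *		...
+ *		}
+ *
+ * All locks and pinned pages are dropped before the split so that the
+ * split can acquire its own write-locked stack without deadlocking
+ * against this cursor.
+ */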
+
+/*
+ * __bamc_rget --
+ * Return the record number for a cursor.
+ *
+ * PUBLIC: int __bamc_rget __P((DBC *, DBT *));
+ */
+int
+__bamc_rget(dbc, data)
+ DBC *dbc;
+ DBT *data;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ db_recno_t recno;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it.
+ * Get a copy of the key.
+ * Release the page, making sure we don't release it twice.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &cp->page)) != 0)
+ return (ret);
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbc, cp->page, cp->indx, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
+ cp->page = NULL;
+ if (ret != 0)
+ return (ret);
+
+ if ((ret = __bam_search(dbc, PGNO_INVALID, &dbt,
+ F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
+ 1, &recno, &exact)) != 0)
+ goto err;
+
+ ret = __db_retcopy(dbc->env, data,
+ &recno, sizeof(recno), &dbc->rdata->data, &dbc->rdata->ulen);
+
+ /* Release the stack. */
+err: if ((t_ret = __bam_stkrel(dbc, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bamc_writelock --
+ * Upgrade the cursor to a write lock.
+ */
+static int
+__bamc_writelock(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ if (cp->lock_mode == DB_LOCK_WRITE)
+ return (0);
+
+ /*
+ * When writing to an off-page duplicate tree, we need to have the
+ * appropriate page in the primary tree locked. The general DBC
+ * code calls us first with the primary cursor so we can acquire the
+ * appropriate lock.
+ */
+ ACQUIRE_WRITE_LOCK(dbc, ret);
+ return (ret);
+}
+
+/*
+ * __bamc_next --
+ * Move to the next record.
+ */
+static int
+__bamc_next(dbc, initial_move, deleted_okay)
+ DBC *dbc;
+ int initial_move, deleted_okay;
+{
+ BTREE_CURSOR *cp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (initial_move)
+ cp->indx += adjust;
+
+ for (;;) {
+ /*
+ * If at the end of the page, move to a subsequent page.
+ *
+ * !!!
+ * Check for >= NUM_ENT. If the original search landed us on
+ * NUM_ENT, we may have incremented indx before the test.
+ */
+ if (cp->indx >= NUM_ENT(cp->page)) {
+ if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ cp->indx = 0;
+ continue;
+ }
+ if (!deleted_okay && IS_CUR_DELETED(dbc)) {
+ cp->indx += adjust;
+ continue;
+ }
+ break;
+ }
+ return (0);
+}
+
+/*
+ * __bamc_prev --
+ * Move to the previous record.
+ */
+static int
+__bamc_prev(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ db_indx_t adjust;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
+
+ /*
+ * We're either moving through a page of duplicates or a btree leaf
+ * page.
+ *
+ * !!!
+ * This code handles empty pages and pages with only deleted entries.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ adjust = O_INDX;
+ lock_mode = DB_LOCK_NG;
+ } else {
+ adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX;
+ lock_mode =
+ F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ }
+ if (cp->page == NULL) {
+ ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+ }
+
+ for (;;) {
+ /* If at the beginning of the page, move to a previous one. */
+ if (cp->indx == 0) {
+ if ((pgno =
+ PREV_PGNO(cp->page)) == PGNO_INVALID)
+ return (DB_NOTFOUND);
+
+ ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret);
+ if (ret != 0)
+ return (ret);
+
+ if ((cp->indx = NUM_ENT(cp->page)) == 0)
+ continue;
+ }
+
+ /* Ignore deleted records. */
+ cp->indx -= adjust;
+ if (IS_CUR_DELETED(dbc))
+ continue;
+
+ break;
+ }
+ return (0);
+}
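+
+/*
+ * A note on the two movement functions above (describing the existing
+ * invariants, not changing them): __bamc_next tests the index before
+ * using it and __bamc_prev decrements it before using it, so both
+ * handle empty pages with the same boundary test:
+ *
+ *	next: if (indx >= NUM_ENT(page)), chain to NEXT_PGNO, indx = 0
+ *	prev: if (indx == 0), chain to PREV_PGNO, indx = NUM_ENT(page)
+ *
+ * A page with NUM_ENT() == 0 trips the boundary test again on the
+ * next iteration, so runs of empty pages are skipped without any
+ * special casing.
+ */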
+
+/*
+ * __bamc_search --
+ * Move to a specified record.
+ */
+static int
+__bamc_search(dbc, root_pgno, key, flags, exactp)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ const DBT *key;
+ u_int32_t flags;
+ int *exactp;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ PAGE *h;
+ db_indx_t base, indx, *inp, lim;
+ db_pgno_t bt_lpgno;
+ db_recno_t recno;
+ u_int32_t sflags;
+ int bulk, cmp, ret, t_ret;
+
+ COMPQUIET(cmp, 0);
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ret = 0;
+ bulk = (F_ISSET(dbc, DBC_BULK) && cp->pgno != PGNO_INVALID);
+
+ /*
+ * Find an entry in the database. Discard any lock we currently hold,
+ * we're going to search the tree.
+ */
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ return (ret);
+
+ switch (flags) {
+ case DB_FIRST:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MIN;
+ goto search;
+ case DB_LAST:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MAX;
+ goto search;
+ case DB_SET_RECNO:
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ return (ret);
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT;
+ if ((ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp)) != 0)
+ return (ret);
+ goto done;
+ case DB_SET:
+ case DB_GET_BOTH:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT;
+ if (bulk)
+ break;
+ goto search;
+ case DB_GET_BOTH_RANGE:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND);
+ goto search;
+ case DB_SET_RANGE:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_DUPFIRST;
+ goto search;
+ case DB_KEYFIRST:
+ case DB_NOOVERWRITE:
+ sflags = SR_KEYFIRST;
+ break;
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ case DB_OVERWRITE_DUP:
+ sflags = SR_KEYLAST;
+ break;
+ default:
+ return (__db_unknown_flag(dbp->env, "__bamc_search", flags));
+ }
+
+ /*
+ * If the application has a history of inserting into the first or last
+ * pages of the database, we check those pages first to avoid doing a
+ * full search. Similarly, if the cursor is configured as a bulk
+ * cursor, check whether this operation belongs on the same page as the
+ * last one.
+ */
+ if (bulk)
+ bt_lpgno = cp->pgno;
+ else {
+ if (F_ISSET(dbc, DBC_OPD))
+ goto search;
+
+ /*
+ * !!!
+ * We do not mutex protect the t->bt_lpgno field, which means
+		 * that it can only be used in an advisory manner. If we find
+		 * a page we can use, great. If we don't, we don't care; we do
+ * it the slow way instead. Regardless, copy it into a local
+ * variable, otherwise we might acquire a lock for a page and
+ * then read a different page because it changed underfoot.
+ */
+ bt_lpgno = t->bt_lpgno;
+ }
+
+ /*
+ * If the tree has no history of insertion, do it the slow way.
+ */
+ if (bt_lpgno == PGNO_INVALID)
+ goto search;
+
+ /*
+ * Lock and retrieve the page on which we last inserted.
+ *
+ * The page may not exist: if a transaction created the page
+ * and then aborted, the page might have been truncated from
+ * the end of the file. We don't want to wait on the lock.
+ * The page may not even be relevant to this search.
+ */
+ h = NULL;
+ ACQUIRE_CUR(dbc, DB_LOCK_WRITE, bt_lpgno, DB_LOCK_NOWAIT, ret);
+ if (ret != 0) {
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED ||
+ ret == DB_PAGE_NOTFOUND)
+ ret = 0;
+ goto fast_miss;
+ }
+
+ h = cp->page;
+ inp = P_INP(dbp, h);
+
+ /*
+	 * It's okay if the page type isn't right or it's empty; it
+ * just means that the world changed.
+ */
+ if (TYPE(h) != P_LBTREE || NUM_ENT(h) == 0)
+ goto fast_miss;
+
+ /* Verify that this page cannot have moved to another db. */
+ if (F_ISSET(dbp, DB_AM_SUBDB) &&
+ LOG_COMPARE(&t->bt_llsn, &LSN(h)) != 0)
+ goto fast_miss;
+
+ /*
+ * What we do here is test to see if we're at the beginning or
+ * end of the tree and if the new item sorts before/after the
+ * first/last page entry. We only try to catch inserts into
+ * the middle of the tree for bulk cursors.
+ */
+ if (h->next_pgno == PGNO_INVALID) {
+ indx = NUM_ENT(h) - P_INDX;
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+ if (cmp > 0)
+ indx += P_INDX;
+ if (cmp >= 0)
+ goto fast_hit;
+ }
+ if (h->prev_pgno == PGNO_INVALID) {
+ indx = 0;
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+ if (cmp <= 0)
+ goto fast_hit;
+ }
+ if (bulk) {
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX);
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ t->bt_compare, &cmp)) != 0)
+ goto fast_miss;
+
+ if (cmp == 0)
+ goto fast_hit;
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
+ lim, P_INDX);
+ }
+ /*
+ * No match found: base is the smallest index greater than
+ * the key and may be zero or NUM_ENT(h).
+ */
+ indx = base;
+ if (indx > 0 && indx < NUM_ENT(h)) {
+ if (FLD_ISSET(sflags, SR_EXACT))
+ return (DB_NOTFOUND);
+ goto fast_hit;
+ }
+ }
+ goto fast_miss;
+
+fast_hit:
+ if (cmp == 0) {
+ /*
+ * Found a duplicate. Deal with DB_KEYFIRST / DB_KEYLAST.
+ */
+ if (FLD_ISSET(sflags, SR_DUPFIRST))
+ while (indx > 0 && inp[indx - P_INDX] == inp[indx])
+ indx -= P_INDX;
+ else if (FLD_ISSET(sflags, SR_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ inp[indx] == inp[indx + P_INDX])
+ indx += P_INDX;
+ }
+
+ /* Set the exact match flag, we may have found a duplicate. */
+ *exactp = (cmp == 0);
+
+ /*
+ * Insert the entry in the stack. (Our caller is likely to
+ * call __bam_stkrel() after our return.)
+ */
+ BT_STK_CLR(cp);
+ BT_STK_ENTER(dbp->env,
+ cp, h, indx, cp->lock, cp->lock_mode, ret);
+ if (ret != 0)
+ return (ret);
+ goto done;
+
+fast_miss:
+ /*
+ * This was not the right page, so we do not need to retain
+ * the lock even in the presence of transactions.
+ *
+ * This is also an error path, so ret may have been set.
+ */
+ DISCARD_CUR(dbc, ret);
+ cp->pgno = PGNO_INVALID;
+ if ((t_ret = __LPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+search:
+ if ((ret = __bam_search(dbc, root_pgno,
+ key, sflags, 1, NULL, exactp)) != 0)
+ return (ret);
+
+done: /* Initialize the cursor from the stack. */
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ /* If on an empty page or a deleted record, move to the next one. */
+ if (flags == DB_FIRST &&
+ (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)))
+ if ((ret = __bamc_next(dbc, 0, 0)) != 0)
+ return (ret);
+ if (flags == DB_LAST &&
+ (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)))
+ if ((ret = __bamc_prev(dbc)) != 0)
+ return (ret);
+
+ return (0);
+}
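+
+/*
+ * Fast path note for the function above (a sketch): t->bt_lpgno
+ * remembers the page that took the last insert.  Because it is read
+ * without holding a mutex, it is advisory only, and the checks before
+ * fast_hit revalidate everything that could have changed underfoot:
+ * the page must still exist and be lockable without waiting, must
+ * still be a non-empty P_LBTREE leaf, must carry the remembered LSN
+ * if subdatabases could have recycled it, and the key must still sort
+ * at the page's edge (or, for bulk cursors, within it via the binary
+ * search).  Any failed check falls through to fast_miss and the
+ * ordinary top-down __bam_search.
+ */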
+
+/*
+ * __bamc_physdel --
+ * Physically remove an item from the page.
+ */
+static int
+__bamc_physdel(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT key;
+ DB_LOCK next_lock, prev_lock;
+ db_pgno_t pgno;
+ int delete_page, empty_page, exact, ret;
+
+ dbp = dbc->dbp;
+ memset(&key, 0, sizeof(DBT));
+ cp = (BTREE_CURSOR *)dbc->internal;
+ delete_page = empty_page = ret = 0;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(prev_lock);
+
+ /* If the page is going to be emptied, consider deleting it. */
+ delete_page = empty_page =
+ NUM_ENT(cp->page) == (TYPE(cp->page) == P_LBTREE ? 2 : 1);
+
+ /*
+ * Check if the application turned off reverse splits. Applications
+	 * can't turn off reverse splits in off-page duplicate trees; that
+ * space will never be reused unless the exact same key is specified.
+ */
+ if (delete_page &&
+ !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_AM_REVSPLITOFF))
+ delete_page = 0;
+
+ /*
+ * We never delete the last leaf page. (Not really true -- we delete
+ * the last leaf page of off-page duplicate trees, but that's handled
+ * by our caller, not down here.)
+ */
+ if (delete_page && cp->pgno == cp->root)
+ delete_page = 0;
+
+ /*
+ * To delete a leaf page other than an empty root page, we need a
+ * copy of a key from the page. Use the 0th page index since it's
+ * the last key the page held.
+ *
+ * !!!
+ * Note that because __bamc_physdel is always called from a cursor
+ * close, it should be safe to use the cursor's own "my_rkey" memory
+ * to temporarily hold this key. We shouldn't own any returned-data
+ * memory of interest--if we do, we're in trouble anyway.
+ */
+ if (delete_page) {
+ if ((ret = __db_ret(dbc, cp->page, 0, &key,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0)
+ return (ret);
+ }
+
+ /*
+	 * Delete the items. If the page isn't empty, we adjust the cursors.
+ *
+ * !!!
+ * The following operations to delete a page may deadlock. The easy
+ * scenario is if we're deleting an item because we're closing cursors
+ * because we've already deadlocked and want to call txn->abort. If
+ * we fail due to deadlock, we'll leave a locked, possibly empty page
+ * in the tree, which won't be empty long because we'll undo the delete
+ * when we undo the transaction's modifications.
+ *
+ * !!!
+ * Delete the key item first, otherwise the on-page duplicate checks
+ * in __bam_ditem() won't work!
+ */
+ if ((ret = __memp_dirty(dbp->mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ if (TYPE(cp->page) == P_LBTREE) {
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ return (ret);
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc,
+ PGNO(cp->page), cp->indx, -1)) != 0)
+ return (ret);
+ }
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ return (ret);
+
+ /* Clear the deleted flag, the item is gone. */
+ F_CLR(cp, C_DELETED);
+
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc, PGNO(cp->page), cp->indx, -1)) != 0)
+ return (ret);
+
+ /*
+ * Need to downgrade write locks here or non-txn locks will get stuck.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED)) {
+ if ((ret = __TLPUT(dbc, cp->lock)) != 0)
+ return (ret);
+ cp->lock_mode = DB_LOCK_WWRITE;
+ if (cp->page != NULL &&
+ (ret = __memp_shared(dbp->mpf, cp->page)) != 0)
+ return (ret);
+ }
+ /* If we're not going to try and delete the page, we're done. */
+ if (!delete_page)
+ return (0);
+
+ /*
+ * Lock the previous and next pages before latching the parent
+	 * subtree.
+ */
+ if (STD_LOCKING(dbc)) {
+ if ((pgno = PREV_PGNO(cp->page)) != PGNO_INVALID &&
+ (ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &prev_lock)) != 0)
+ return (ret);
+ if ((pgno = NEXT_PGNO(cp->page)) != PGNO_INVALID &&
+ (ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &next_lock)) != 0) {
+ (void)__TLPUT(dbc, next_lock);
+ return (ret);
+ }
+ }
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+ ret = __bam_search(dbc, PGNO_INVALID, &key, SR_DEL, 0, NULL, &exact);
+
+ /*
+ * If everything worked, delete the stack, otherwise, release the
+ * stack and page locks without further damage.
+ */
+ if (ret == 0)
+ ret = __bam_dpages(dbc, 1, BTD_RELINK);
+ else
+ (void)__bam_stkrel(dbc, 0);
+
+err: (void)__TLPUT(dbc, prev_lock);
+ (void)__TLPUT(dbc, next_lock);
+ return (ret);
+}
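+
+/*
+ * Outline of the flow above (a recap of the existing behavior):
+ *
+ *	1. Decide up front whether the page will empty (2 entries on a
+ *	   P_LBTREE page, 1 elsewhere) and, if so, save a key from it.
+ *	2. Delete the key item and then the data item; key first, so
+ *	   the on-page duplicate checks in __bam_ditem still work.
+ *	3. If the page emptied and reverse splits are enabled, lock the
+ *	   sibling pages, re-search with SR_DEL, and let __bam_dpages
+ *	   unlink and free the now-empty subtree.
+ */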
+
+/*
+ * __bamc_getstack --
+ * Acquire a full stack for a cursor.
+ */
+static int
+__bamc_getstack(dbc)
+ DBC *dbc;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ int exact, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Get the page with the current item on it. The caller of this
+ * routine has to already hold a read lock on the page, so there
+ * is no additional lock to acquire.
+ */
+ if ((ret = __memp_fget(mpf, &cp->pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ /* Get a copy of a key from the page. */
+ memset(&dbt, 0, sizeof(DBT));
+ ret = __db_ret(dbc, h, 0, &dbt,
+ &dbc->my_rkey.data, &dbc->my_rkey.ulen);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ /* Get a write-locked stack for the page. */
+ exact = 0;
+ ret = __bam_search(dbc, PGNO_INVALID,
+ &dbt, SR_KEYFIRST, 1, NULL, &exact);
+
+ return (ret);
+}
+
+/*
+ * __bam_isopd --
+ *	Return whether the cursor references an off-page duplicate tree
+ *	via its page number.
+ */
+static int
+__bam_isopd(dbc, pgnop)
+ DBC *dbc;
+ db_pgno_t *pgnop;
+{
+ BOVERFLOW *bo;
+
+ if (TYPE(dbc->internal->page) != P_LBTREE)
+ return (0);
+
+ bo = GET_BOVERFLOW(dbc->dbp,
+ dbc->internal->page, dbc->internal->indx + O_INDX);
+ if (B_TYPE(bo->type) == B_DUPLICATE) {
+ *pgnop = bo->pgno;
+ return (1);
+ }
+ return (0);
+}
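+
+/*
+ * Usage sketch for the predicate above (mirroring the calls made in
+ * __bamc_put): the caller passes a db_pgno_t slot that is filled in
+ * only on a hit, so the test and the capture happen together:
+ *
+ *	if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ *		goto done;	(*pgnop now names the duplicate root)
+ *
+ * The O_INDX offset is what aims the check at the data item of the
+ * current key/data pair rather than at the key itself.
+ */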
+
+/*
+ * __bam_opd_exists --
+ *	Return whether the current position has any data.
+ * PUBLIC: int __bam_opd_exists __P((DBC *, db_pgno_t));
+ */
+int
+__bam_opd_exists(dbc, pgno)
+ DBC *dbc;
+ db_pgno_t pgno;
+{
+ PAGE *h;
+ int ret;
+
+ if ((ret = __memp_fget(dbc->dbp->mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ /*
+ * We always collapse OPD trees so we only need to check
+ * the number of entries on the root. If there is a non-empty
+ * tree then there will be duplicates.
+ */
+ if (NUM_ENT(h) == 0)
+ ret = 0;
+ else
+ ret = DB_KEYEXIST;
+
+ (void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority);
+
+ return (ret);
+}
diff --git a/btree/bt_debug.c b/btree/bt_debug.c
deleted file mode 100644
index 3aefbe7..0000000
--- a/btree/bt_debug.c
+++ /dev/null
@@ -1,329 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_debug.c 8.5 (Berkeley) 8/17/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/param.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <db.h>
-#include "btree.h"
-
-#ifdef DEBUG
-/*
- * BT_DUMP -- Dump the tree
- *
- * Parameters:
- * dbp: pointer to the DB
- */
-void
-__bt_dump(dbp)
- DB *dbp;
-{
- BTREE *t;
- PAGE *h;
- pgno_t i;
- char *sep;
-
- t = dbp->internal;
- (void)fprintf(stderr, "%s: pgsz %d",
- F_ISSET(t, B_INMEM) ? "memory" : "disk", t->bt_psize);
- if (F_ISSET(t, R_RECNO))
- (void)fprintf(stderr, " keys %lu", t->bt_nrecs);
-#undef X
-#define X(flag, name) \
- if (F_ISSET(t, flag)) { \
- (void)fprintf(stderr, "%s%s", sep, name); \
- sep = ", "; \
- }
- if (t->flags != 0) {
- sep = " flags (";
- X(R_FIXLEN, "FIXLEN");
- X(B_INMEM, "INMEM");
- X(B_NODUPS, "NODUPS");
- X(B_RDONLY, "RDONLY");
- X(R_RECNO, "RECNO");
- X(B_METADIRTY,"METADIRTY");
- (void)fprintf(stderr, ")\n");
- }
-#undef X
-
- for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
- __bt_dpage(h);
- (void)mpool_put(t->bt_mp, h, 0);
- }
-}
-
-/*
- * BT_DMPAGE -- Dump the meta page
- *
- * Parameters:
- * h: pointer to the PAGE
- */
-void
-__bt_dmpage(h)
- PAGE *h;
-{
- BTMETA *m;
- char *sep;
-
- m = (BTMETA *)h;
- (void)fprintf(stderr, "magic %lx\n", m->magic);
- (void)fprintf(stderr, "version %lu\n", m->version);
- (void)fprintf(stderr, "psize %lu\n", m->psize);
- (void)fprintf(stderr, "free %lu\n", m->free);
- (void)fprintf(stderr, "nrecs %lu\n", m->nrecs);
- (void)fprintf(stderr, "flags %lu", m->flags);
-#undef X
-#define X(flag, name) \
- if (m->flags & flag) { \
- (void)fprintf(stderr, "%s%s", sep, name); \
- sep = ", "; \
- }
- if (m->flags) {
- sep = " (";
- X(B_NODUPS, "NODUPS");
- X(R_RECNO, "RECNO");
- (void)fprintf(stderr, ")");
- }
-}
-
-/*
- * BT_DNPAGE -- Dump the page
- *
- * Parameters:
- * n: page number to dump.
- */
-void
-__bt_dnpage(dbp, pgno)
- DB *dbp;
- pgno_t pgno;
-{
- BTREE *t;
- PAGE *h;
-
- t = dbp->internal;
- if ((h = mpool_get(t->bt_mp, pgno, 0)) != NULL) {
- __bt_dpage(h);
- (void)mpool_put(t->bt_mp, h, 0);
- }
-}
-
-/*
- * BT_DPAGE -- Dump the page
- *
- * Parameters:
- * h: pointer to the PAGE
- */
-void
-__bt_dpage(h)
- PAGE *h;
-{
- BINTERNAL *bi;
- BLEAF *bl;
- RINTERNAL *ri;
- RLEAF *rl;
- indx_t cur, top;
- char *sep;
-
- (void)fprintf(stderr, " page %d: (", h->pgno);
-#undef X
-#define X(flag, name) \
- if (h->flags & flag) { \
- (void)fprintf(stderr, "%s%s", sep, name); \
- sep = ", "; \
- }
- sep = "";
- X(P_BINTERNAL, "BINTERNAL") /* types */
- X(P_BLEAF, "BLEAF")
- X(P_RINTERNAL, "RINTERNAL") /* types */
- X(P_RLEAF, "RLEAF")
- X(P_OVERFLOW, "OVERFLOW")
- X(P_PRESERVE, "PRESERVE");
- (void)fprintf(stderr, ")\n");
-#undef X
-
- (void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg);
- if (h->flags & P_OVERFLOW)
- return;
-
- top = NEXTINDEX(h);
- (void)fprintf(stderr, " lower %3d upper %3d nextind %d\n",
- h->lower, h->upper, top);
- for (cur = 0; cur < top; cur++) {
- (void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]);
- switch (h->flags & P_TYPE) {
- case P_BINTERNAL:
- bi = GETBINTERNAL(h, cur);
- (void)fprintf(stderr,
- "size %03d pgno %03d", bi->ksize, bi->pgno);
- if (bi->flags & P_BIGKEY)
- (void)fprintf(stderr, " (indirect)");
- else if (bi->ksize)
- (void)fprintf(stderr,
- " {%.*s}", (int)bi->ksize, bi->bytes);
- break;
- case P_RINTERNAL:
- ri = GETRINTERNAL(h, cur);
- (void)fprintf(stderr, "entries %03d pgno %03d",
- ri->nrecs, ri->pgno);
- break;
- case P_BLEAF:
- bl = GETBLEAF(h, cur);
- if (bl->flags & P_BIGKEY)
- (void)fprintf(stderr,
- "big key page %lu size %u/",
- *(pgno_t *)bl->bytes,
- *(u_int32_t *)(bl->bytes + sizeof(pgno_t)));
- else if (bl->ksize)
- (void)fprintf(stderr, "%s/", bl->bytes);
- if (bl->flags & P_BIGDATA)
- (void)fprintf(stderr,
- "big data page %lu size %u",
- *(pgno_t *)(bl->bytes + bl->ksize),
- *(u_int32_t *)(bl->bytes + bl->ksize +
- sizeof(pgno_t)));
- else if (bl->dsize)
- (void)fprintf(stderr, "%.*s",
- (int)bl->dsize, bl->bytes + bl->ksize);
- break;
- case P_RLEAF:
- rl = GETRLEAF(h, cur);
- if (rl->flags & P_BIGDATA)
- (void)fprintf(stderr,
- "big data page %lu size %u",
- *(pgno_t *)rl->bytes,
- *(u_int32_t *)(rl->bytes + sizeof(pgno_t)));
- else if (rl->dsize)
- (void)fprintf(stderr,
- "%.*s", (int)rl->dsize, rl->bytes);
- break;
- }
- (void)fprintf(stderr, "\n");
- }
-}
-#endif
-
-#ifdef STATISTICS
-/*
- * BT_STAT -- Gather/print the tree statistics
- *
- * Parameters:
- * dbp: pointer to the DB
- */
-void
-__bt_stat(dbp)
- DB *dbp;
-{
- extern u_long bt_cache_hit, bt_cache_miss, bt_pfxsaved, bt_rootsplit;
- extern u_long bt_sortsplit, bt_split;
- BTREE *t;
- PAGE *h;
- pgno_t i, pcont, pinternal, pleaf;
- u_long ifree, lfree, nkeys;
- int levels;
-
- t = dbp->internal;
- pcont = pinternal = pleaf = 0;
- nkeys = ifree = lfree = 0;
- for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) {
- switch (h->flags & P_TYPE) {
- case P_BINTERNAL:
- case P_RINTERNAL:
- ++pinternal;
- ifree += h->upper - h->lower;
- break;
- case P_BLEAF:
- case P_RLEAF:
- ++pleaf;
- lfree += h->upper - h->lower;
- nkeys += NEXTINDEX(h);
- break;
- case P_OVERFLOW:
- ++pcont;
- break;
- }
- (void)mpool_put(t->bt_mp, h, 0);
- }
-
- /* Count the levels of the tree. */
- for (i = P_ROOT, levels = 0 ;; ++levels) {
- h = mpool_get(t->bt_mp, i, 0);
- if (h->flags & (P_BLEAF|P_RLEAF)) {
- if (levels == 0)
- levels = 1;
- (void)mpool_put(t->bt_mp, h, 0);
- break;
- }
- i = F_ISSET(t, R_RECNO) ?
- GETRINTERNAL(h, 0)->pgno :
- GETBINTERNAL(h, 0)->pgno;
- (void)mpool_put(t->bt_mp, h, 0);
- }
-
- (void)fprintf(stderr, "%d level%s with %ld keys",
- levels, levels == 1 ? "" : "s", nkeys);
- if (F_ISSET(t, R_RECNO))
- (void)fprintf(stderr, " (%ld header count)", t->bt_nrecs);
- (void)fprintf(stderr,
- "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n",
- pinternal + pleaf + pcont, pleaf, pinternal, pcont);
- (void)fprintf(stderr, "%ld cache hits, %ld cache misses\n",
- bt_cache_hit, bt_cache_miss);
- (void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n",
- bt_split, bt_rootsplit, bt_sortsplit);
- pleaf *= t->bt_psize - BTDATAOFF;
- if (pleaf)
- (void)fprintf(stderr,
- "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n",
- ((double)(pleaf - lfree) / pleaf) * 100,
- pleaf - lfree, lfree);
- pinternal *= t->bt_psize - BTDATAOFF;
- if (pinternal)
- (void)fprintf(stderr,
- "%.0f%% internal fill (%ld bytes used, %ld bytes free\n",
- ((double)(pinternal - ifree) / pinternal) * 100,
- pinternal - ifree, ifree);
- if (bt_pfxsaved)
- (void)fprintf(stderr, "prefix checking removed %lu bytes.\n",
- bt_pfxsaved);
-}
-#endif
diff --git a/btree/bt_delete.c b/btree/bt_delete.c
index ece1ab6..f76aa05 100644
--- a/btree/bt_delete.c
+++ b/btree/bt_delete.c
@@ -1,5 +1,14 @@
/*-
- * Copyright (c) 1990, 1993, 1994
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
@@ -13,11 +22,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -32,626 +37,611 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $Id$
*/
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_delete.c 8.13 (Berkeley) 7/28/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
-
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
+#include "db_config.h"
-#include <db.h>
-#include "btree.h"
-
-static int __bt_bdelete __P((BTREE *, const DBT *));
-static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int));
-static int __bt_pdelete __P((BTREE *, PAGE *));
-static int __bt_relink __P((BTREE *, PAGE *));
-static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *));
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
/*
- * __bt_delete
- * Delete the item(s) referenced by a key.
+ * __bam_ditem --
+ * Delete one or more entries from a page.
*
- * Return RET_SPECIAL if the key is not found.
+ * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
*/
int
-__bt_delete(dbp, key, flags)
- const DB *dbp;
- const DBT *key;
- u_int flags;
-{
- BTREE *t;
- CURSOR *c;
+__bam_ditem(dbc, h, indx)
+ DBC *dbc;
PAGE *h;
- int status;
-
- t = dbp->internal;
-
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
- }
-
- /* Check for change to a read-only tree. */
- if (F_ISSET(t, B_RDONLY)) {
- errno = EPERM;
- return (RET_ERROR);
- }
-
- switch (flags) {
- case 0:
- status = __bt_bdelete(t, key);
+ u_int32_t indx;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ u_int32_t nbytes;
+ int ret;
+ db_indx_t *inp;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /* The page should already have been dirtied by our caller. */
+ DB_ASSERT(dbp->env, IS_DIRTY(h));
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, indx);
+ switch (B_TYPE(bi->type)) {
+ case B_DUPLICATE:
+ case B_KEYDATA:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_SIZE(bi->len);
+ if ((ret =
+ __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
break;
- case R_CURSOR:
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ case P_LBTREE:
/*
- * If flags is R_CURSOR, delete the cursor. Must already
- * have started a scan and not have already deleted it.
+ * If it's a duplicate key, discard the index and don't touch
+ * the actual page item.
+ *
+ * !!!
+ * This works because no data item can have an index matching
+ * any other index so even if the data item is in a key "slot",
+ * it won't match any other index.
*/
- c = &t->bt_cursor;
- if (F_ISSET(c, CURS_INIT)) {
- if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
- return (RET_SPECIAL);
- if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
- return (RET_ERROR);
-
+ if ((indx % 2) == 0) {
/*
- * If the page is about to be emptied, we'll need to
- * delete it, which means we have to acquire a stack.
+ * Check for a duplicate after us on the page. NOTE:
+ * we have to delete the key item before deleting the
+ * data item, otherwise the "indx + P_INDX" calculation
+ * won't work!
*/
- if (NEXTINDEX(h) == 1)
- if (__bt_stkacq(t, &h, &t->bt_cursor))
- return (RET_ERROR);
-
- status = __bt_dleaf(t, NULL, h, c->pg.index);
-
- if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) {
- if (__bt_pdelete(t, h))
- return (RET_ERROR);
- } else
- mpool_put(t->bt_mp,
- h, status == RET_SUCCESS ? MPOOL_DIRTY : 0);
- break;
+ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
+ inp[indx] == inp[indx + P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx + O_INDX, 0));
+ /*
+ * Check for a duplicate before us on the page. It
+ * doesn't matter if we delete the key item before or
+ * after the data item for the purposes of this one.
+ */
+ if (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ return (__bam_adjindx(dbc,
+ h, indx, indx - P_INDX, 0));
}
/* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ bk = GET_BKEYDATA(dbp, h, indx);
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case B_OVERFLOW:
+ nbytes = BOVERFLOW_SIZE;
+ if ((ret = __db_doff(
+ dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
+ return (ret);
+ break;
+ case B_KEYDATA:
+ nbytes = BKEYDATA_SIZE(bk->len);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, PGNO(h)));
+ }
+ break;
default:
- errno = EINVAL;
- return (RET_ERROR);
+ return (__db_pgfmt(dbp->env, PGNO(h)));
}
- if (status == RET_SUCCESS)
- F_SET(t, B_MODIFIED);
- return (status);
+
+ /* Delete the item and mark the page dirty. */
+ if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
+ return (ret);
+
+ return (0);
}
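+
+/*
+ * Worked example for the duplicate-key case above (a sketch): a
+ * P_LBTREE page holding key K with two data items D1 and D2 aliases
+ * the key offset in its index array:
+ *
+ *	inp[] = { off(K), off(D1), off(K), off(D2) }
+ *
+ * Deleting the key at indx 0 therefore only drops the inp[0] slot,
+ * via __bam_adjindx; the physical K item stays on the page because
+ * the second key slot (inp[2], inp[1] after the shuffle) still
+ * references it.
+ */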
/*
- * __bt_stkacq --
- * Acquire a stack so we can delete a cursor entry.
+ * __bam_adjindx --
+ * Adjust an index on the page.
*
- * Parameters:
- * t: tree
- * hp: pointer to current, pinned PAGE pointer
- * c: pointer to the cursor
+ * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
+ */
+int
+__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, indx_copy;
+ int is_insert;
+{
+ DB *dbp;
+ db_indx_t copy, *inp;
+ int ret;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0,
+ PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ /* Shuffle the indices and mark the page dirty. */
+ if (is_insert) {
+ copy = inp[indx_copy];
+ if (indx != NUM_ENT(h))
+ memmove(&inp[indx + O_INDX], &inp[indx],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ inp[indx] = copy;
+ ++NUM_ENT(h);
+ } else {
+ --NUM_ENT(h);
+ if (indx != NUM_ENT(h))
+ memmove(&inp[indx], &inp[indx + O_INDX],
+ sizeof(db_indx_t) * (NUM_ENT(h) - indx));
+ }
+
+ return (0);
+}
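+
+/*
+ * Shuffle sketch for the insert case above: inserting at indx opens a
+ * hole by sliding the tail of the index array up one slot, then drops
+ * a copy of inp[indx_copy] into the hole.  With indx = 1 and
+ * indx_copy = 0:
+ *
+ *	before:	inp = { a, b, c }
+ *	after:	inp = { a, a, b, c }, NUM_ENT incremented
+ *
+ * Note that inp[indx_copy] is read before the memmove, so the caller
+ * may safely name a slot that the shuffle is about to displace.
+ */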
+
+/*
+ * __bam_dpages --
+ * Delete a set of locked pages.
*
- * Returns:
- * 0 on success, 1 on failure
+ * PUBLIC: int __bam_dpages __P((DBC *, int, int));
*/
-static int
-__bt_stkacq(t, hp, c)
- BTREE *t;
- PAGE **hp;
- CURSOR *c;
+int
+__bam_dpages(dbc, use_top, flags)
+ DBC *dbc;
+ int use_top;
+ int flags;
{
BINTERNAL *bi;
- EPG *e;
- EPGNO *parent;
- PAGE *h;
- indx_t index;
- pgno_t pgno;
- recno_t nextpg, prevpg;
- int exact, level;
-
- /*
- * Find the first occurrence of the key in the tree. Toss the
- * currently locked page so we don't hit an already-locked page.
- */
- h = *hp;
- mpool_put(t->bt_mp, h, 0);
- if ((e = __bt_search(t, &c->key, &exact)) == NULL)
- return (1);
- h = e->page;
-
- /* See if we got it in one shot. */
- if (h->pgno == c->pg.pgno)
- goto ret;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b;
+ DB_LOCK c_lock, p_lock;
+ DB_MPOOLFILE *mpf;
+ EPG *epg, *save_sp, *stack_epg;
+ PAGE *child, *parent;
+ db_indx_t nitems;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int done, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ nitems = 0;
+ pgno = PGNO_INVALID;
/*
- * Move right, looking for the page. At each move we have to move
- * up the stack until we don't have to move to the next page. If
- * we have to change pages at an internal level, we have to fix the
- * stack back up.
+ * We have the entire stack of deletable pages locked.
+ *
+	 * Btree calls us with a stack in which the first page is to have a
+	 * single item deleted, and the rest of the pages are to be removed.
+	 *
+	 * Recno always has a stack to the root and __bam_merge operations
+	 * may have unneeded items in the stack. We find the lowest page
+ * in the stack that has more than one record in it and start there.
*/
- while (h->pgno != c->pg.pgno) {
- if ((nextpg = h->nextpg) == P_INVALID)
- break;
- mpool_put(t->bt_mp, h, 0);
-
- /* Move up the stack. */
- for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
- /* Get the parent page. */
- if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
- return (1);
-
- /* Move to the next index. */
- if (parent->index != NEXTINDEX(h) - 1) {
- index = parent->index + 1;
- BT_PUSH(t, h->pgno, index);
+ ret = 0;
+ if (use_top)
+ stack_epg = cp->sp;
+ else
+ for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg)
+ if (NUM_ENT(stack_epg->page) > 1)
break;
- }
- mpool_put(t->bt_mp, h, 0);
- }
+ epg = stack_epg;
+ /*
+ * !!!
+ * There is an interesting deadlock situation here. We have to relink
+ * the leaf page chain around the leaf page being deleted. Consider
+ * a cursor walking through the leaf pages, that has the previous page
+ * read-locked and is waiting on a lock for the page we're deleting.
+ * It will deadlock here. Before we unlink the subtree, we relink the
+ * leaf page chain.
+ */
+ if (LF_ISSET(BTD_RELINK) && LEVEL(cp->csp->page) == 1 &&
+ (ret = __bam_relink(dbc, cp->csp->page, NULL, PGNO_INVALID)) != 0)
+ goto discard;
- /* Restore the stack. */
- while (level--) {
- /* Push the next level down onto the stack. */
- bi = GETBINTERNAL(h, index);
- pgno = bi->pgno;
- BT_PUSH(t, pgno, 0);
+ /*
+ * Delete the last item that references the underlying pages that are
+ * to be deleted, and adjust cursors that reference that page. Then,
+ * save that page's page number and item count and release it. If
+ * the application isn't retaining locks because it's running without
+ * transactions, this lets the rest of the tree get back to business
+ * immediately.
+ */
+ if ((ret = __memp_dirty(mpf,
+ &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto discard;
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto discard;
+ if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
+ goto discard;
+
+ if (LF_ISSET(BTD_UPDATE) && epg->indx == 0) {
+ save_sp = cp->csp;
+ cp->csp = epg;
+ ret = __bam_pupdate(dbc, epg->page);
+ cp->csp = save_sp;
+ if (ret != 0)
+ goto discard;
+ }
- /* Lose the currently pinned page. */
- mpool_put(t->bt_mp, h, 0);
+ pgno = PGNO(epg->page);
+ nitems = NUM_ENT(epg->page);
+
+ ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority);
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_inc;
+
+ /* Then, discard any pages that we don't care about. */
+discard: for (epg = cp->sp; epg < stack_epg; ++epg) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if (ret != 0)
+ goto err;
+
+ /* Free the rest of the pages in the stack. */
+ while (++epg <= cp->csp) {
+ if ((ret = __memp_dirty(mpf, &epg->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+ /*
+ * Delete page entries so they will be restored as part of
+ * recovery. We don't need to do cursor adjustment here as
+ * the pages are being emptied by definition and so cannot
+ * be referenced by a cursor.
+ */
+ if (NUM_ENT(epg->page) != 0) {
+ DB_ASSERT(dbp->env, LEVEL(epg->page) != 1);
- /* Get the next level down. */
- if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
- return (1);
- index = 0;
+ if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
+ goto err;
+ /*
+ * Sheer paranoia: if we find any pages that aren't
+ * emptied by the delete, someone else added an item
+ * while we were walking the tree, and we discontinue
+ * the delete. Shouldn't be possible, but we check
+ * regardless.
+ */
+ if (NUM_ENT(epg->page) != 0)
+ goto err;
}
- mpool_put(t->bt_mp, h, 0);
- if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL)
- return (1);
- }
- if (h->pgno == c->pg.pgno)
- goto ret;
+ ret = __db_free(dbc, epg->page);
+ if (cp->page == epg->page)
+ cp->page = NULL;
+ epg->page = NULL;
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err_inc;
+ }
- /* Reacquire the original stack. */
- mpool_put(t->bt_mp, h, 0);
- if ((e = __bt_search(t, &c->key, &exact)) == NULL)
- return (1);
- h = e->page;
+ if (0) {
+err_inc: ++epg;
+err: for (; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ (void)__memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority);
+ epg->page = NULL;
+ }
+ (void)__TLPUT(dbc, epg->lock);
+ }
+ BT_STK_CLR(cp);
+ return (ret);
+ }
+ BT_STK_CLR(cp);
/*
- * Move left, looking for the page. At each move we have to move
- * up the stack until we don't have to change pages to move to the
- * next page. If we have to change pages at an internal level, we
- * have to fix the stack back up.
+ * If we just deleted the next-to-last item from the root page, the
+ * tree can collapse one or more levels. While there remains only a
+ * single item on the root page, write lock the last page referenced
+ * by the root page and copy it over the root page.
*/
- while (h->pgno != c->pg.pgno) {
- if ((prevpg = h->prevpg) == P_INVALID)
+ root_pgno = cp->root;
+ if (pgno != root_pgno || nitems != 1)
+ return (0);
+
+ for (done = 0; !done;) {
+ /* Initialize. */
+ parent = child = NULL;
+ LOCK_INIT(p_lock);
+ LOCK_INIT(c_lock);
+
+ /* Lock the root. */
+ pgno = root_pgno;
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0)
+ goto stop;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &parent)) != 0)
+ goto stop;
+
+ if (NUM_ENT(parent) != 1)
+ goto stop;
+
+ switch (TYPE(parent)) {
+ case P_IBTREE:
+ /*
+ * If this is overflow, then try to delete it.
+ * The child may or may not still point at it.
+ */
+ bi = GET_BINTERNAL(dbp, parent, 0);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ if ((ret = __db_doff(dbc,
+ ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ goto stop;
+ pgno = bi->pgno;
break;
- mpool_put(t->bt_mp, h, 0);
-
- /* Move up the stack. */
- for (level = 0; (parent = BT_POP(t)) != NULL; ++level) {
- /* Get the parent page. */
- if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
- return (1);
-
- /* Move to the next index. */
- if (parent->index != 0) {
- index = parent->index - 1;
- BT_PUSH(t, h->pgno, index);
- break;
- }
- mpool_put(t->bt_mp, h, 0);
+ case P_IRECNO:
+ pgno = GET_RINTERNAL(dbp, parent, 0)->pgno;
+ break;
+ default:
+ goto stop;
}
- /* Restore the stack. */
- while (level--) {
- /* Push the next level down onto the stack. */
- bi = GETBINTERNAL(h, index);
- pgno = bi->pgno;
+ /* Lock the child page. */
+ if ((ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
+ goto stop;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &child)) != 0)
+ goto stop;
+
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&a, 0, sizeof(a));
+ a.data = child;
+ a.size = dbp->pgsize;
+ memset(&b, 0, sizeof(b));
+ b.data = P_ENTRY(dbp, parent, 0);
+ b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
+ BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
+ if ((ret = __bam_rsplit_log(dbp, dbc->txn,
+ &child->lsn, 0, PGNO(child), &a, PGNO(parent),
+ RE_NREC(parent), &b, &parent->lsn)) != 0)
+ goto stop;
+ } else
+ LSN_NOT_LOGGED(child->lsn);
- /* Lose the currently pinned page. */
- mpool_put(t->bt_mp, h, 0);
+ /*
+ * Make the switch.
+ *
+ * One fixup -- internal pages below the top level do not store
+ * a record count, so we have to preserve it if we're not
+ * converting to a leaf page. Note also that we are about to
+ * overwrite the parent page, including its LSN. This is OK
+ * because the log message we wrote describing this update
+ * stores its LSN on the child page. When the child is copied
+ * onto the parent, the correct LSN is copied into place.
+ */
+ COMPQUIET(rcnt, 0);
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ rcnt = RE_NREC(parent);
+ memcpy(parent, child, dbp->pgsize);
+ PGNO(parent) = root_pgno;
+ if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
+ RE_NREC_SET(parent, rcnt);
+
+ /* Adjust the cursors. */
+ if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
+ goto stop;
- /* Get the next level down. */
- if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL)
- return (1);
+ /*
+ * Free the page copied onto the root page and discard its
+ * lock. (The call to __db_free() discards our reference
+ * to the page.)
+ */
+ if ((ret = __db_free(dbc, child)) != 0) {
+ child = NULL;
+ goto stop;
+ }
+ child = NULL;
- index = NEXTINDEX(h) - 1;
- BT_PUSH(t, pgno, index);
+ if (0) {
+stop: done = 1;
}
- mpool_put(t->bt_mp, h, 0);
- if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL)
- return (1);
+ if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (parent != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ parent, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (child != NULL &&
+ (t_ret = __memp_fput(mpf, dbc->thread_info,
+ child, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
}
-
-ret: mpool_put(t->bt_mp, h, 0);
- return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL);
+ return (ret);
}
/*
- * __bt_bdelete --
- * Delete all key/data pairs matching the specified key.
+ * __bam_relink --
+ * Relink around a deleted page.
*
- * Parameters:
- * t: tree
- * key: key to delete
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
+ *	Otherp can be either the previous or the next page to use if
+ *	the caller already holds that page.
+ *
+ * PUBLIC: int __bam_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
*/
-static int
-__bt_bdelete(t, key)
- BTREE *t;
- const DBT *key;
+int
+__bam_relink(dbc, pagep, otherp, new_pgno)
+ DBC *dbc;
+ PAGE *pagep, *otherp;
+ db_pgno_t new_pgno;
{
- EPG *e;
- PAGE *h;
- int deleted, exact, redo;
-
- deleted = 0;
-
- /* Find any matching record; __bt_search pins the page. */
-loop: if ((e = __bt_search(t, key, &exact)) == NULL)
- return (deleted ? RET_SUCCESS : RET_ERROR);
- if (!exact) {
- mpool_put(t->bt_mp, e->page, 0);
- return (deleted ? RET_SUCCESS : RET_SPECIAL);
- }
+ DB *dbp;
+ DB_LOCK npl, ppl;
+ DB_LSN *nlsnp, *plsnp, ret_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *np, *pp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ np = pp = NULL;
+ LOCK_INIT(npl);
+ LOCK_INIT(ppl);
+ nlsnp = plsnp = NULL;
+ mpf = dbp->mpf;
+ ret = 0;
/*
- * Delete forward, then delete backward, from the found key. If
- * there are duplicates and we reach either side of the page, do
- * the key search again, so that we get them all.
+ * Retrieve the one/two pages. The caller must have them locked
+ * because the parent is latched. For a remove, we may need
+	 * two pages (the before and after).  For an add, we only need one
+	 * because the split took care of the previous page.
*/
- redo = 0;
- h = e->page;
- do {
- if (__bt_dleaf(t, key, h, e->index)) {
- mpool_put(t->bt_mp, h, 0);
- return (RET_ERROR);
- }
- if (F_ISSET(t, B_NODUPS)) {
- if (NEXTINDEX(h) == 0) {
- if (__bt_pdelete(t, h))
- return (RET_ERROR);
- } else
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
- return (RET_SUCCESS);
- }
- deleted = 1;
- } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0);
-
- /* Check for right-hand edge of the page. */
- if (e->index == NEXTINDEX(h))
- redo = 1;
-
- /* Delete from the key to the beginning of the page. */
- while (e->index-- > 0) {
- if (__bt_cmp(t, key, e) != 0)
- break;
- if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) {
- mpool_put(t->bt_mp, h, 0);
- return (RET_ERROR);
+ if (pagep->next_pgno != PGNO_INVALID) {
+ if (((np = otherp) == NULL ||
+ PGNO(otherp) != pagep->next_pgno) &&
+ (ret = __memp_fget(mpf, &pagep->next_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
+ ret = __db_pgerr(dbp, pagep->next_pgno, ret);
+ goto err;
}
- if (e->index == 0)
- redo = 1;
+ nlsnp = &np->lsn;
}
-
- /* Check for an empty page. */
- if (NEXTINDEX(h) == 0) {
- if (__bt_pdelete(t, h))
- return (RET_ERROR);
- goto loop;
+ if (pagep->prev_pgno != PGNO_INVALID) {
+ if (((pp = otherp) == NULL ||
+ PGNO(otherp) != pagep->prev_pgno) &&
+ (ret = __memp_fget(mpf, &pagep->prev_pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
+ ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
+ goto err;
+ }
+ plsnp = &pp->lsn;
}
- /* Put the page. */
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
-
- if (redo)
- goto loop;
- return (RET_SUCCESS);
-}
-
-/*
- * __bt_pdelete --
- * Delete a single page from the tree.
- *
- * Parameters:
- * t: tree
- * h: leaf page
- *
- * Returns:
- * RET_SUCCESS, RET_ERROR.
- *
- * Side-effects:
- * mpool_put's the page
- */
-static int
-__bt_pdelete(t, h)
- BTREE *t;
- PAGE *h;
-{
- BINTERNAL *bi;
- PAGE *pg;
- EPGNO *parent;
- indx_t cnt, index, *ip, offset;
- u_int32_t nksize;
- char *from;
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_relink_log(dbp, dbc->txn, &ret_lsn, 0,
+ pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
+ pagep->next_pgno, nlsnp)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(ret_lsn);
+ if (np != NULL)
+ np->lsn = ret_lsn;
+ if (pp != NULL)
+ pp->lsn = ret_lsn;
/*
- * Walk the parent page stack -- a LIFO stack of the pages that were
- * traversed when we searched for the page where the delete occurred.
- * Each stack entry is a page number and a page index offset. The
- * offset is for the page traversed on the search. We've just deleted
- * a page, so we have to delete the key from the parent page.
- *
- * If the delete from the parent page makes it empty, this process may
- * continue all the way up the tree. We stop if we reach the root page
- * (which is never deleted, it's just not worth the effort) or if the
- * delete does not empty the page.
+ * Modify and release the two pages.
*/
- while ((parent = BT_POP(t)) != NULL) {
- /* Get the parent page. */
- if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
- return (RET_ERROR);
-
- index = parent->index;
- bi = GETBINTERNAL(pg, index);
-
- /* Free any overflow pages. */
- if (bi->flags & P_BIGKEY &&
- __ovfl_delete(t, bi->bytes) == RET_ERROR) {
- mpool_put(t->bt_mp, pg, 0);
- return (RET_ERROR);
- }
-
- /*
- * Free the parent if it has only the one key and it's not the
- * root page. If it's the rootpage, turn it back into an empty
- * leaf page.
- */
- if (NEXTINDEX(pg) == 1)
- if (pg->pgno == P_ROOT) {
- pg->lower = BTDATAOFF;
- pg->upper = t->bt_psize;
- pg->flags = P_BLEAF;
- } else {
- if (__bt_relink(t, pg) || __bt_free(t, pg))
- return (RET_ERROR);
- continue;
- }
- else {
- /* Pack remaining key items at the end of the page. */
- nksize = NBINTERNAL(bi->ksize);
- from = (char *)pg + pg->upper;
- memmove(from + nksize, from, (char *)bi - from);
- pg->upper += nksize;
-
- /* Adjust indices' offsets, shift the indices down. */
- offset = pg->linp[index];
- for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip)
- if (ip[0] < offset)
- ip[0] += nksize;
- for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip)
- ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1];
- pg->lower -= sizeof(indx_t);
- }
-
- mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
- break;
+ if (np != NULL) {
+ if (new_pgno == PGNO_INVALID)
+ np->prev_pgno = pagep->prev_pgno;
+ else
+ np->prev_pgno = new_pgno;
+ if (np != otherp)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, np, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
}
- /* Free the leaf page, as long as it wasn't the root. */
- if (h->pgno == P_ROOT) {
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
- return (RET_SUCCESS);
+ if (pp != NULL) {
+ if (new_pgno == PGNO_INVALID)
+ pp->next_pgno = pagep->next_pgno;
+ else
+ pp->next_pgno = new_pgno;
+ if (pp != otherp)
+ ret = __memp_fput(mpf,
+ dbc->thread_info, pp, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
}
- return (__bt_relink(t, h) || __bt_free(t, h));
+ return (0);
+
+err: if (np != NULL && np != otherp)
+ (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
+ if (pp != NULL && pp != otherp)
+ (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
+ return (ret);
}
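Stripped of locking, logging, and LSN maintenance, __bam_relink above is a
doubly-linked list unlink, or a splice when new_pgno names a replacement
page.  A sketch of just the pointer manipulation, with illustrative types:

	#include <stddef.h>

	struct lpage {			/* Illustrative leaf-chain node. */
		struct lpage *prev, *next;
	};

	/*
	 * Unlink p from the chain when repl is NULL; otherwise splice repl
	 * into p's position (the new_pgno != PGNO_INVALID case above).
	 */
	static void
	relink(struct lpage *p, struct lpage *repl)
	{
		if (p->next != NULL)
			p->next->prev = repl != NULL ? repl : p->prev;
		if (p->prev != NULL)
			p->prev->next = repl != NULL ? repl : p->next;
		if (repl != NULL) {
			repl->prev = p->prev;
			repl->next = p->next;
		}
	}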
/*
- * __bt_dleaf --
- * Delete a single record from a leaf page.
+ * __bam_pupdate --
+ * Update parent key pointers up the tree.
*
- * Parameters:
- * t: tree
- * key: referenced key
- * h: page
- * index: index on page to delete
- *
- * Returns:
- * RET_SUCCESS, RET_ERROR.
+ * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *));
*/
int
-__bt_dleaf(t, key, h, index)
- BTREE *t;
- const DBT *key;
- PAGE *h;
- u_int index;
+__bam_pupdate(dbc, lpg)
+ DBC *dbc;
+ PAGE *lpg;
{
- BLEAF *bl;
- indx_t cnt, *ip, offset;
- u_int32_t nbytes;
- void *to;
- char *from;
-
- /* If this record is referenced by the cursor, delete the cursor. */
- if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
- !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
- t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index &&
- __bt_curdel(t, key, h, index))
- return (RET_ERROR);
-
- /* If the entry uses overflow pages, make them available for reuse. */
- to = bl = GETBLEAF(h, index);
- if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR)
- return (RET_ERROR);
- if (bl->flags & P_BIGDATA &&
- __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR)
- return (RET_ERROR);
-
- /* Pack the remaining key/data items at the end of the page. */
- nbytes = NBLEAF(bl);
- from = (char *)h + h->upper;
- memmove(from + nbytes, from, (char *)to - from);
- h->upper += nbytes;
-
- /* Adjust the indices' offsets, shift the indices down. */
- offset = h->linp[index];
- for (cnt = index, ip = &h->linp[0]; cnt--; ++ip)
- if (ip[0] < offset)
- ip[0] += nbytes;
- for (cnt = NEXTINDEX(h) - index; --cnt; ++ip)
- ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1];
- h->lower -= sizeof(indx_t);
-
- /* If the cursor is on this page, adjust it as necessary. */
- if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
- !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
- t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index)
- --t->bt_cursor.pg.index;
-
- return (RET_SUCCESS);
-}
+ BTREE_CURSOR *cp;
+ ENV *env;
+ EPG *epg;
+ int ret;
-/*
- * __bt_curdel --
- * Delete the cursor.
- *
- * Parameters:
- * t: tree
- * key: referenced key (or NULL)
- * h: page
- * index: index on page to delete
- *
- * Returns:
- * RET_SUCCESS, RET_ERROR.
- */
-static int
-__bt_curdel(t, key, h, index)
- BTREE *t;
- const DBT *key;
- PAGE *h;
- u_int index;
-{
- CURSOR *c;
- EPG e;
- PAGE *pg;
- int curcopy, status;
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ ret = 0;
/*
- * If there are duplicates, move forward or backward to one.
- * Otherwise, copy the key into the cursor area.
+	 * Update the parents up the tree.  __bam_pinsert only looks at the
+	 * left child if it is a leaf page, so we don't need to change it.  We
+ * just do a delete and insert; a replace is possible but reusing
+ * pinsert is better.
*/
- c = &t->bt_cursor;
- F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE);
-
- curcopy = 0;
- if (!F_ISSET(t, B_NODUPS)) {
- /*
- * We're going to have to do comparisons. If we weren't
- * provided a copy of the key, i.e. the user is deleting
- * the current cursor position, get one.
- */
- if (key == NULL) {
- e.page = h;
- e.index = index;
- if ((status = __bt_ret(t, &e,
- &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS)
- return (status);
- curcopy = 1;
- key = &c->key;
- }
- /* Check previous key, if not at the beginning of the page. */
- if (index > 0) {
- e.page = h;
- e.index = index - 1;
- if (__bt_cmp(t, key, &e) == 0) {
- F_SET(c, CURS_BEFORE);
- goto dup2;
+ for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) {
+ if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ return (ret);
+ epg->indx--;
+ if ((ret = __bam_pinsert(dbc, epg, 0,
+ lpg, epg[1].page, BPI_NORECNUM | BPI_REPLACE)) != 0) {
+ if (ret == DB_NEEDSPLIT) {
+ /* This should not happen. */
+ __db_errx(env,
+ "Not enough room in parent: %s: page %lu",
+ dbc->dbp->fname, (u_long)PGNO(epg->page));
+ ret = __env_panic(env, EINVAL);
}
+ epg->indx++;
+ return (ret);
}
- /* Check next key, if not at the end of the page. */
- if (index < NEXTINDEX(h) - 1) {
- e.page = h;
- e.index = index + 1;
- if (__bt_cmp(t, key, &e) == 0) {
- F_SET(c, CURS_AFTER);
- goto dup2;
- }
- }
- /* Check previous key if at the beginning of the page. */
- if (index == 0 && h->prevpg != P_INVALID) {
- if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
- return (RET_ERROR);
- e.page = pg;
- e.index = NEXTINDEX(pg) - 1;
- if (__bt_cmp(t, key, &e) == 0) {
- F_SET(c, CURS_BEFORE);
- goto dup1;
- }
- mpool_put(t->bt_mp, pg, 0);
- }
- /* Check next key if at the end of the page. */
- if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) {
- if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
- return (RET_ERROR);
- e.page = pg;
- e.index = 0;
- if (__bt_cmp(t, key, &e) == 0) {
- F_SET(c, CURS_AFTER);
-dup1: mpool_put(t->bt_mp, pg, 0);
-dup2: c->pg.pgno = e.page->pgno;
- c->pg.index = e.index;
- return (RET_SUCCESS);
- }
- mpool_put(t->bt_mp, pg, 0);
- }
- }
- e.page = h;
- e.index = index;
- if (curcopy || (status =
- __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) {
- F_SET(c, CURS_ACQUIRE);
- return (RET_SUCCESS);
- }
- return (status);
-}
-
-/*
- * __bt_relink --
- * Link around a deleted page.
- *
- * Parameters:
- * t: tree
- * h: page to be deleted
- */
-static int
-__bt_relink(t, h)
- BTREE *t;
- PAGE *h;
-{
- PAGE *pg;
-
- if (h->nextpg != P_INVALID) {
- if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
- return (RET_ERROR);
- pg->prevpg = h->prevpg;
- mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
+ epg->indx++;
}
- if (h->prevpg != P_INVALID) {
- if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
- return (RET_ERROR);
- pg->nextpg = h->nextpg;
- mpool_put(t->bt_mp, pg, MPOOL_DIRTY);
- }
- return (0);
+ return (ret);
}
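__bam_pupdate above replaces each parent's divider key by deleting the stale
item and re-inserting it through __bam_pinsert with BPI_REPLACE, walking the
saved search stack from just above the leaf (csp - 1) down to the root (sp).
A tiny sketch of that traversal order, with an illustrative stack type:

	#include <stdio.h>

	struct sepg {			/* Illustrative stack entry. */
		unsigned int pgno;	/* Parent page number. */
		unsigned int indx;	/* Slot within the parent. */
	};

	/*
	 * Visit parents from just above the leaf down to the root, the
	 * same order __bam_pupdate uses; each visit replaces the divider
	 * key one slot to the left of the recorded index.
	 */
	static void
	walk_parents(struct sepg *sp, struct sepg *csp)
	{
		struct sepg *epg;

		for (epg = csp - 1; epg >= sp; epg--)
			printf("replace key on page %u, slot %u\n",
			    epg->pgno, epg->indx - 1);
	}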
diff --git a/btree/bt_get.c b/btree/bt_get.c
deleted file mode 100644
index 74824c7..0000000
--- a/btree/bt_get.c
+++ /dev/null
@@ -1,105 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_get.c 8.6 (Berkeley) 7/20/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
-
-#include <errno.h>
-#include <stddef.h>
-#include <stdio.h>
-
-#include <db.h>
-#include "btree.h"
-
-/*
- * __BT_GET -- Get a record from the btree.
- *
- * Parameters:
- * dbp: pointer to access method
- * key: key to find
- * data: data to return
- * flag: currently unused
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found.
- */
-int
-__bt_get(dbp, key, data, flags)
- const DB *dbp;
- const DBT *key;
- DBT *data;
- u_int flags;
-{
- BTREE *t;
- EPG *e;
- int exact, status;
-
- t = dbp->internal;
-
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
- }
-
- /* Get currently doesn't take any flags. */
- if (flags) {
- errno = EINVAL;
- return (RET_ERROR);
- }
-
- if ((e = __bt_search(t, key, &exact)) == NULL)
- return (RET_ERROR);
- if (!exact) {
- mpool_put(t->bt_mp, e->page, 0);
- return (RET_SPECIAL);
- }
-
- status = __bt_ret(t, e, NULL, NULL, data, &t->bt_rdata, 0);
-
- /*
- * If the user is doing concurrent access, we copied the
- * key/data, toss the page.
- */
- if (F_ISSET(t, B_DB_LOCK))
- mpool_put(t->bt_mp, e->page, 0);
- else
- t->bt_pinned = e->page;
- return (status);
-}
diff --git a/btree/bt_method.c b/btree/bt_method.c
new file mode 100644
index 0000000..d27fe3d
--- /dev/null
+++ b/btree/bt_method.c
@@ -0,0 +1,734 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/qam.h"
+
+static int __bam_set_bt_minkey __P((DB *, u_int32_t));
+static int __bam_get_bt_compare
+ __P((DB *, int (**)(DB *, const DBT *, const DBT *)));
+static int __bam_get_bt_prefix
+ __P((DB *, size_t(**)(DB *, const DBT *, const DBT *)));
+static int __bam_set_bt_prefix
+ __P((DB *, size_t(*)(DB *, const DBT *, const DBT *)));
+static int __bam_get_bt_compress __P((DB *,
+ int (**)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *),
+ int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+static int __ram_get_re_delim __P((DB *, int *));
+static int __ram_set_re_delim __P((DB *, int));
+static int __ram_set_re_len __P((DB *, u_int32_t));
+static int __ram_set_re_pad __P((DB *, int));
+static int __ram_get_re_source __P((DB *, const char **));
+static int __ram_set_re_source __P((DB *, const char *));
+
+/*
+ * __bam_db_create --
+ * Btree specific initialization of the DB structure.
+ *
+ * PUBLIC: int __bam_db_create __P((DB *));
+ */
+int
+__bam_db_create(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ int ret;
+
+ /* Allocate and initialize the private btree structure. */
+ if ((ret = __os_calloc(dbp->env, 1, sizeof(BTREE), &t)) != 0)
+ return (ret);
+ dbp->bt_internal = t;
+
+ t->bt_minkey = DEFMINKEYPAGE; /* Btree */
+ t->bt_compare = __bam_defcmp;
+ t->bt_prefix = __bam_defpfx;
+#ifdef HAVE_COMPRESSION
+ t->bt_compress = NULL;
+ t->bt_decompress = NULL;
+ t->compress_dup_compare = NULL;
+
+ /*
+ * DB_AM_COMPRESS may have been set in __bam_metachk before the
+ * bt_internal structure existed.
+ */
+ if (F_ISSET(dbp, DB_AM_COMPRESS) &&
+ (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
+ return (ret);
+#endif
+
+ dbp->get_bt_compare = __bam_get_bt_compare;
+ dbp->set_bt_compare = __bam_set_bt_compare;
+ dbp->get_bt_minkey = __bam_get_bt_minkey;
+ dbp->set_bt_minkey = __bam_set_bt_minkey;
+ dbp->get_bt_prefix = __bam_get_bt_prefix;
+ dbp->set_bt_prefix = __bam_set_bt_prefix;
+ dbp->get_bt_compress = __bam_get_bt_compress;
+ dbp->set_bt_compress = __bam_set_bt_compress;
+
+ t->re_pad = ' '; /* Recno */
+ t->re_delim = '\n';
+ t->re_eof = 1;
+
+ dbp->get_re_delim = __ram_get_re_delim;
+ dbp->set_re_delim = __ram_set_re_delim;
+ dbp->get_re_len = __ram_get_re_len;
+ dbp->set_re_len = __ram_set_re_len;
+ dbp->get_re_pad = __ram_get_re_pad;
+ dbp->set_re_pad = __ram_set_re_pad;
+ dbp->get_re_source = __ram_get_re_source;
+ dbp->set_re_source = __ram_set_re_source;
+
+ return (0);
+}
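The function-pointer assignments above are what make a generic DB handle
dispatch into the btree code.  A sketch of exercising them through the
public db.h API; my_compare, open_btree, and the literal values are
illustrative, not part of the library:

	#include <string.h>
	#include <db.h>

	/* A byte-wise comparator matching the signature installed above. */
	static int
	my_compare(DB *dbp, const DBT *a, const DBT *b)
	{
		size_t len;
		int cmp;

		(void)dbp;
		len = a->size < b->size ? a->size : b->size;
		if ((cmp = memcmp(a->data, b->data, len)) != 0)
			return (cmp);
		return ((a->size > b->size) - (a->size < b->size));
	}

	/* Open a btree with a custom comparator and minimum keys per page. */
	int
	open_btree(const char *path, DB **dbpp)
	{
		DB *dbp;
		int ret;

		if ((ret = db_create(&dbp, NULL, 0)) != 0)
			return (ret);
		if ((ret = dbp->set_bt_compare(dbp, my_compare)) != 0 ||
		    (ret = dbp->set_bt_minkey(dbp, 4)) != 0 ||
		    (ret = dbp->open(dbp, NULL,
		    path, NULL, DB_BTREE, DB_CREATE, 0644)) != 0) {
			(void)dbp->close(dbp, 0);
			return (ret);
		}
		*dbpp = dbp;
		return (0);
	}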
+
+/*
+ * __bam_db_close --
+ * Btree specific discard of the DB structure.
+ *
+ * PUBLIC: int __bam_db_close __P((DB *));
+ */
+int
+__bam_db_close(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+
+ if ((t = dbp->bt_internal) == NULL)
+ return (0);
+ /* Recno */
+ /* Close any backing source file descriptor. */
+ if (t->re_fp != NULL)
+ (void)fclose(t->re_fp);
+
+ /* Free any backing source file name. */
+ if (t->re_source != NULL)
+ __os_free(dbp->env, t->re_source);
+
+ __os_free(dbp->env, t);
+ dbp->bt_internal = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_map_flags --
+ * Map Btree specific flags from public to the internal values.
+ *
+ * PUBLIC: void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__bam_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_DUP)) {
+ FLD_SET(*outflagsp, DB_AM_DUP);
+ FLD_CLR(*inflagsp, DB_DUP);
+ }
+ if (FLD_ISSET(*inflagsp, DB_DUPSORT)) {
+ FLD_SET(*outflagsp, DB_AM_DUP | DB_AM_DUPSORT);
+ FLD_CLR(*inflagsp, DB_DUPSORT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_RECNUM)) {
+ FLD_SET(*outflagsp, DB_AM_RECNUM);
+ FLD_CLR(*inflagsp, DB_RECNUM);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REVSPLITOFF)) {
+ FLD_SET(*outflagsp, DB_AM_REVSPLITOFF);
+ FLD_CLR(*inflagsp, DB_REVSPLITOFF);
+ }
+}
+
+/*
+ * __bam_set_flags --
+ * Set Btree specific flags.
+ *
+ * PUBLIC: int __bam_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__bam_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ BTREE *t;
+ u_int32_t flags;
+
+ t = dbp->bt_internal;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF))
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+
+ /*
+ * The DB_DUP and DB_DUPSORT flags are shared by the Hash
+ * and Btree access methods.
+ */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT))
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH);
+
+ if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF))
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */
+ if (LF_ISSET(DB_DUP | DB_DUPSORT) && F_ISSET(dbp, DB_AM_RECNUM))
+ goto incompat;
+
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (LF_ISSET(DB_RECNUM) && F_ISSET(dbp, DB_AM_DUP))
+ goto incompat;
+
+ /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */
+ if (LF_ISSET(DB_RECNUM) && LF_ISSET(DB_DUP | DB_DUPSORT))
+ goto incompat;
+
+#ifdef HAVE_COMPRESSION
+ /* DB_RECNUM is incompatible with compression */
+ if (LF_ISSET(DB_RECNUM) && DB_IS_COMPRESSED(dbp)) {
+ __db_errx(dbp->env,
+ "DB_RECNUM cannot be used with compression");
+ return (EINVAL);
+ }
+
+ /* DB_DUP without DB_DUPSORT is incompatible with compression */
+ if (LF_ISSET(DB_DUP) && !LF_ISSET(DB_DUPSORT) &&
+ !F_ISSET(dbp, DB_AM_DUPSORT) && DB_IS_COMPRESSED(dbp)) {
+ __db_errx(dbp->env,
+ "DB_DUP cannot be used with compression without DB_DUPSORT");
+ return (EINVAL);
+ }
+#endif
+
+ if (LF_ISSET(DB_DUPSORT) && dbp->dup_compare == NULL) {
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp)) {
+ dbp->dup_compare = __bam_compress_dupcmp;
+ t->compress_dup_compare = __bam_defcmp;
+ } else
+#endif
+ dbp->dup_compare = __bam_defcmp;
+ }
+
+ __bam_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+
+incompat:
+ return (__db_ferr(dbp->env, "DB->set_flags", 1));
+}
+
+/*
+ * __bam_get_bt_compare --
+ * Get the comparison function.
+ */
+static int
+__bam_get_bt_compare(dbp, funcp)
+ DB *dbp;
+ int (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (funcp != NULL)
+ *funcp = t->bt_compare;
+
+ return (0);
+}
+
+/*
+ * __bam_set_bt_compare --
+ * Set the comparison function.
+ *
+ * PUBLIC: int __bam_set_bt_compare
+ * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *)));
+ */
+int
+__bam_set_bt_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /*
+ * Can't default the prefix routine if the user supplies a comparison
+ * routine; shortening the keys can break their comparison algorithm.
+ */
+ t->bt_compare = func;
+ if (t->bt_prefix == __bam_defpfx)
+ t->bt_prefix = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_get_bt_compress --
+ * Get the compression functions.
+ */
+static int
+__bam_get_bt_compress(dbp, compressp, decompressp)
+ DB *dbp;
+ int (**compressp) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ int (**decompressp) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+{
+#ifdef HAVE_COMPRESSION
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (compressp != NULL)
+ *compressp = t->bt_compress;
+ if (decompressp != NULL)
+ *decompressp = t->bt_decompress;
+
+ return (0);
+#else
+ COMPQUIET(compressp, NULL);
+ COMPQUIET(decompressp, NULL);
+
+ __db_errx(dbp->env, "compression support has not been compiled in");
+ return (EINVAL);
+#endif
+}
+
+/*
+ * __bam_set_bt_compress --
+ * Set the compression functions.
+ *
+ * PUBLIC: int __bam_set_bt_compress __P((DB *,
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, DBT *),
+ * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *)));
+ */
+int
+__bam_set_bt_compress(dbp, compress, decompress)
+ DB *dbp;
+ int (*compress) __P((DB *, const DBT *, const DBT *, const DBT *,
+ const DBT *, DBT *));
+ int (*decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *,
+ DBT *));
+{
+#ifdef HAVE_COMPRESSION
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compress");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+	/* Compression is incompatible with DB_RECNUM. */
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(dbp->env,
+ "compression cannot be used with DB_RECNUM");
+ return (EINVAL);
+ }
+
+	/* Compression is incompatible with DB_DUP without DB_DUPSORT. */
+ if (F_ISSET(dbp, DB_AM_DUP) && !F_ISSET(dbp, DB_AM_DUPSORT)) {
+ __db_errx(dbp->env,
+ "compression cannot be used with DB_DUP without DB_DUPSORT");
+ return (EINVAL);
+ }
+
+ if (compress != 0 && decompress != 0) {
+ t->bt_compress = compress;
+ t->bt_decompress = decompress;
+ } else if (compress == 0 && decompress == 0) {
+ t->bt_compress = __bam_defcompress;
+ t->bt_decompress = __bam_defdecompress;
+ } else {
+ __db_errx(dbp->env,
+ "to enable compression you need to supply both function arguments");
+ return (EINVAL);
+ }
+ F_SET(dbp, DB_AM_COMPRESS);
+
+	/*
+	 * Copy dup_compare to compress_dup_compare, and use the
+	 * compression duplicate compare.
+	 */
+ if (F_ISSET(dbp, DB_AM_DUPSORT)) {
+ t->compress_dup_compare = dbp->dup_compare;
+ dbp->dup_compare = __bam_compress_dupcmp;
+ }
+
+ return (0);
+#else
+ COMPQUIET(compress, NULL);
+ COMPQUIET(decompress, NULL);
+
+ __db_errx(dbp->env, "compression support has not been compiled in");
+ return (EINVAL);
+#endif
+}
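Per the logic above, passing NULL for both callbacks selects the built-in
__bam_defcompress/__bam_defdecompress pair.  A sketch of enabling that
default from application code, assuming a library built with
HAVE_COMPRESSION; sorted duplicates are the only duplicate mode compatible
with compression:

	#include <db.h>

	/*
	 * Enable the built-in compression pair: NULL for both callbacks
	 * selects the defaults in __bam_set_bt_compress.
	 */
	static int
	enable_default_compression(DB *dbp)
	{
		int ret;

		if ((ret = dbp->set_flags(dbp, DB_DUPSORT)) != 0)
			return (ret);
		return (dbp->set_bt_compress(dbp, NULL, NULL));
	}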
+
+/*
+ * __bam_get_bt_minkey --
+ * Get the minimum keys per page.
+ *
+ * PUBLIC: int __bam_get_bt_minkey __P((DB *, u_int32_t *));
+ */
+int
+__bam_get_bt_minkey(dbp, bt_minkeyp)
+ DB *dbp;
+ u_int32_t *bt_minkeyp;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+ *bt_minkeyp = t->bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_minkey --
+ * Set the minimum keys per page.
+ */
+static int
+__bam_set_bt_minkey(dbp, bt_minkey)
+ DB *dbp;
+ u_int32_t bt_minkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_minkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_minkey < 2) {
+ __db_errx(dbp->env, "minimum bt_minkey value is 2");
+ return (EINVAL);
+ }
+
+ t->bt_minkey = bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_get_bt_prefix --
+ * Get the prefix function.
+ */
+static int
+__bam_get_bt_prefix(dbp, funcp)
+ DB *dbp;
+ size_t (**funcp) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+ if (funcp != NULL)
+ *funcp = t->bt_prefix;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_prefix --
+ * Set the prefix function.
+ */
+static int
+__bam_set_bt_prefix(dbp, func)
+ DB *dbp;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_prefix");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ t->bt_prefix = func;
+ return (0);
+}
+
+/*
+ * __bam_copy_config --
+ *	Copy the configuration of one DB handle to another.
+ *
+ * PUBLIC: void __bam_copy_config __P((DB *, DB *, u_int32_t));
+ */
+void
+__bam_copy_config(src, dst, nparts)
+ DB *src, *dst;
+ u_int32_t nparts;
+{
+ BTREE *s, *d;
+
+ COMPQUIET(nparts, 0);
+
+ s = src->bt_internal;
+ d = dst->bt_internal;
+ d->bt_compare = s->bt_compare;
+	d->bt_minkey = s->bt_minkey;
+ d->bt_prefix = s->bt_prefix;
+#ifdef HAVE_COMPRESSION
+ d->bt_compress = s->bt_compress;
+ d->bt_decompress = s->bt_decompress;
+ d->compress_dup_compare = s->compress_dup_compare;
+#endif
+}
+
+/*
+ * __ram_map_flags --
+ * Map Recno specific flags from public to the internal values.
+ *
+ * PUBLIC: void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *));
+ */
+void
+__ram_map_flags(dbp, inflagsp, outflagsp)
+ DB *dbp;
+ u_int32_t *inflagsp, *outflagsp;
+{
+ COMPQUIET(dbp, NULL);
+
+ if (FLD_ISSET(*inflagsp, DB_RENUMBER)) {
+ FLD_SET(*outflagsp, DB_AM_RENUMBER);
+ FLD_CLR(*inflagsp, DB_RENUMBER);
+ }
+ if (FLD_ISSET(*inflagsp, DB_SNAPSHOT)) {
+ FLD_SET(*outflagsp, DB_AM_SNAPSHOT);
+ FLD_CLR(*inflagsp, DB_SNAPSHOT);
+ }
+}
+
+/*
+ * __ram_set_flags --
+ * Set Recno specific flags.
+ *
+ * PUBLIC: int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__ram_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ u_int32_t flags;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) {
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ }
+
+ __ram_map_flags(dbp, flagsp, &dbp->flags);
+ return (0);
+}
+
+/*
+ * __ram_get_re_delim --
+ * Get the variable-length input record delimiter.
+ */
+static int
+__ram_get_re_delim(dbp, re_delimp)
+ DB *dbp;
+ int *re_delimp;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+ t = dbp->bt_internal;
+ *re_delimp = t->re_delim;
+ return (0);
+}
+
+/*
+ * __ram_set_re_delim --
+ * Set the variable-length input record delimiter.
+ */
+static int
+__ram_set_re_delim(dbp, re_delim)
+ DB *dbp;
+ int re_delim;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_delim");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ t->re_delim = re_delim;
+ F_SET(dbp, DB_AM_DELIMITER);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_len --
+ * Get the variable-length input record length.
+ *
+ * PUBLIC: int __ram_get_re_len __P((DB *, u_int32_t *));
+ */
+int
+__ram_get_re_len(dbp, re_lenp)
+ DB *dbp;
+ u_int32_t *re_lenp;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ /*
+ * This has to work for all access methods, before or after opening the
+ * database. When the record length is set with __ram_set_re_len, the
+ * value in both the BTREE and QUEUE structs will be correct.
+	 * Otherwise, this only makes sense after the database is opened, in
+ * which case we know the type.
+ */
+ if (dbp->type == DB_QUEUE) {
+ q = dbp->q_internal;
+ *re_lenp = q->re_len;
+ } else {
+ t = dbp->bt_internal;
+ *re_lenp = t->re_len;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_len --
+ * Set the variable-length input record length.
+ */
+static int
+__ram_set_re_len(dbp, re_len)
+ DB *dbp;
+ u_int32_t re_len;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_len");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_len = re_len;
+
+ q = dbp->q_internal;
+ q->re_len = re_len;
+
+ F_SET(dbp, DB_AM_FIXEDLEN);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_pad --
+ * Get the fixed-length record pad character.
+ *
+ * PUBLIC: int __ram_get_re_pad __P((DB *, int *));
+ */
+int
+__ram_get_re_pad(dbp, re_padp)
+ DB *dbp;
+ int *re_padp;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ /*
+ * This has to work for all access methods, before or after opening the
+	 * database.  When the pad character is set with __ram_set_re_pad, the
+	 * value in both the BTREE and QUEUE structs will be correct.
+	 * Otherwise, this only makes sense after the database is opened, in
+ * which case we know the type.
+ */
+ if (dbp->type == DB_QUEUE) {
+ q = dbp->q_internal;
+ *re_padp = q->re_pad;
+ } else {
+ t = dbp->bt_internal;
+ *re_padp = t->re_pad;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_pad --
+ * Set the fixed-length record pad character.
+ */
+static int
+__ram_set_re_pad(dbp, re_pad)
+ DB *dbp;
+ int re_pad;
+{
+ BTREE *t;
+ QUEUE *q;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_pad");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ t->re_pad = re_pad;
+
+ q = dbp->q_internal;
+ q->re_pad = re_pad;
+
+ F_SET(dbp, DB_AM_PAD);
+
+ return (0);
+}
+
+/*
+ * __ram_get_re_source --
+ * Get the backing source file name.
+ */
+static int
+__ram_get_re_source(dbp, re_sourcep)
+ DB *dbp;
+ const char **re_sourcep;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+ *re_sourcep = t->re_source;
+ return (0);
+}
+
+/*
+ * __ram_set_re_source --
+ * Set the backing source file name.
+ */
+static int
+__ram_set_re_source(dbp, re_source)
+ DB *dbp;
+ const char *re_source;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_source");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ return (__os_strdup(dbp->env, re_source, &t->re_source));
+}
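A sketch of driving the __ram_* setters above from application code to
configure a fixed-length Recno database with a backing text file; the file
names and the 128-byte record length are illustrative:

	#include <db.h>

	/* Fixed-length Recno backed by a flat text file. */
	static int
	open_recno(DB *dbp)
	{
		int ret;

		if ((ret = dbp->set_re_len(dbp, 128)) != 0 ||
		    (ret = dbp->set_re_pad(dbp, ' ')) != 0 ||
		    (ret = dbp->set_re_source(dbp, "records.txt")) != 0)
			return (ret);
		return (dbp->open(dbp, NULL,
		    "records.db", NULL, DB_RECNO, DB_CREATE, 0644));
	}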
diff --git a/btree/bt_open.c b/btree/bt_open.c
index f052249..1fdfea5 100644
--- a/btree/bt_open.c
+++ b/btree/bt_open.c
@@ -1,5 +1,14 @@
/*-
- * Copyright (c) 1990, 1993, 1994
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
@@ -13,11 +22,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -32,413 +37,633 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $Id$
*/
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94";
-#endif /* LIBC_SCCS and not lint */
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_swap.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+#include "dbinc/fop.h"
+
+static void __bam_init_meta __P((DB *, BTMETA *, db_pgno_t, DB_LSN *));
/*
- * Implementation of btree access method for 4.4BSD.
+ * __bam_open --
+ * Open a btree.
*
- * The design here was originally based on that of the btree access method
- * used in the Postgres database system at UC Berkeley. This implementation
- * is wholly independent of the Postgres code.
+ * PUBLIC: int __bam_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
*/
+int
+__bam_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
-#include <sys/param.h>
-#include <sys/stat.h>
-
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
+ COMPQUIET(name, NULL);
+ t = dbp->bt_internal;
-#include <db.h>
-#include "btree.h"
+ /*
+ * We don't permit the user to specify a prefix routine if they didn't
+	 * also specify a comparison routine; they can't know enough about our
+ * comparison routine to get it right.
+ */
+ if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) {
+ __db_errx(dbp->env,
+"prefix comparison may not be specified for default comparison routine");
+ return (EINVAL);
+ }
-#ifdef DEBUG
-#undef MINPSIZE
-#define MINPSIZE 128
-#endif
+ /*
+ * Verify that the bt_minkey value specified won't cause the
+ * calculation of ovflsize to underflow [#2406] for this pagesize.
+ */
+ if (B_MINKEY_TO_OVFLSIZE(dbp, t->bt_minkey, dbp->pgsize) >
+ B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
+ __db_errx(dbp->env,
+ "bt_minkey value of %lu too high for page size of %lu",
+ (u_long)t->bt_minkey, (u_long)dbp->pgsize);
+ return (EINVAL);
+ }
-static int byteorder __P((void));
-static int nroot __P((BTREE *));
-static int tmp __P((void));
+ /* Start up the tree. */
+ return (__bam_read_root(dbp, ip, txn, base_pgno, flags));
+}
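The bt_minkey check above guards a later calculation: items larger than the
page's overflow threshold are pushed to overflow pages, and that threshold
shrinks as bt_minkey grows.  A back-of-the-envelope sketch of the
relationship; the overhead constants are illustrative stand-ins, not the
real B_MINKEY_TO_OVFLSIZE macro:

	#include <stdio.h>

	/*
	 * Roughly: each page must hold bt_minkey key/data pairs, so the
	 * per-item budget shrinks as bt_minkey grows; items over the
	 * budget move to overflow pages.
	 */
	static unsigned int
	approx_ovflsize(unsigned int pgsize, unsigned int minkey)
	{
		unsigned int page_overhead = 26;	/* Illustrative. */
		unsigned int item_overhead = 12;	/* Illustrative. */

		return ((pgsize - page_overhead) / (minkey * 2) -
		    item_overhead);
	}

	int
	main(void)
	{
		printf("minkey 2: %u bytes, minkey 100: %u bytes\n",
		    approx_ovflsize(4096, 2), approx_ovflsize(4096, 100));
		return (0);
	}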
/*
- * __BT_OPEN -- Open a btree.
- *
- * Creates and fills a DB struct, and calls the routine that actually
- * opens the btree.
- *
- * Parameters:
- * fname: filename (NULL for in-memory trees)
- * flags: open flag bits
- * mode: open permission bits
- * b: BTREEINFO pointer
- *
- * Returns:
- * NULL on failure, pointer to DB on success.
+ * __bam_metachk --
+ *	Check a btree metadata page against the DB handle's configuration.
*
+ * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *));
*/
-DB *
-__bt_open(fname, flags, mode, openinfo, dflags)
- const char *fname;
- int flags, mode, dflags;
- const BTREEINFO *openinfo;
-{
- struct stat sb;
- BTMETA m;
- BTREE *t;
- BTREEINFO b;
+int
+__bam_metachk(dbp, name, btm)
DB *dbp;
- pgno_t ncache;
- ssize_t nr;
- int machine_lorder;
+ const char *name;
+ BTMETA *btm;
+{
+ ENV *env;
+ u_int32_t vers;
+ int ret;
- t = NULL;
+ env = dbp->env;
/*
- * Intention is to make sure all of the user's selections are okay
- * here and then use them without checking. Can't be complete, since
- * we don't know the right page size, lorder or flags until the backing
- * file is opened. Also, the file's page size can cause the cachesize
- * to change.
+ * At this point, all we know is that the magic number is for a Btree.
+	 * Check the version; the database may be out of date.
*/
- machine_lorder = byteorder();
- if (openinfo) {
- b = *openinfo;
-
- /* Flags: R_DUP. */
- if (b.flags & ~(R_DUP))
- goto einval;
-
- /*
- * Page size must be indx_t aligned and >= MINPSIZE. Default
- * page size is set farther on, based on the underlying file
- * transfer size.
- */
- if (b.psize &&
- (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 ||
- b.psize & sizeof(indx_t) - 1))
- goto einval;
-
- /* Minimum number of keys per page; absolute minimum is 2. */
- if (b.minkeypage) {
- if (b.minkeypage < 2)
- goto einval;
- } else
- b.minkeypage = DEFMINKEYPAGE;
-
- /* If no comparison, use default comparison and prefix. */
- if (b.compare == NULL) {
- b.compare = __bt_defcmp;
- if (b.prefix == NULL)
- b.prefix = __bt_defpfx;
- }
-
- if (b.lorder == 0)
- b.lorder = machine_lorder;
- } else {
- b.compare = __bt_defcmp;
- b.cachesize = 0;
- b.flags = 0;
- b.lorder = machine_lorder;
- b.minkeypage = DEFMINKEYPAGE;
- b.prefix = __bt_defpfx;
- b.psize = 0;
+ vers = btm->dbmeta.version;
+ if (F_ISSET(dbp, DB_AM_SWAP))
+ M_32_SWAP(vers);
+ switch (vers) {
+ case 6:
+ case 7:
+ __db_errx(env,
+ "%s: btree version %lu requires a version upgrade",
+ name, (u_long)vers);
+ return (DB_OLD_VERSION);
+ case 8:
+ case 9:
+ break;
+ default:
+ __db_errx(env,
+ "%s: unsupported btree version: %lu", name, (u_long)vers);
+ return (EINVAL);
}
- /* Check for the ubiquitous PDP-11. */
- if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN)
- goto einval;
-
- /* Allocate and initialize DB and BTREE structures. */
- if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL)
- goto err;
- memset(t, 0, sizeof(BTREE));
- t->bt_fd = -1; /* Don't close unopened fd on error. */
- t->bt_lorder = b.lorder;
- t->bt_order = NOT;
- t->bt_cmp = b.compare;
- t->bt_pfx = b.prefix;
- t->bt_rfd = -1;
-
- if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL)
- goto err;
- memset(t->bt_dbp, 0, sizeof(DB));
- if (t->bt_lorder != machine_lorder)
- F_SET(t, B_NEEDSWAP);
-
- dbp->type = DB_BTREE;
- dbp->internal = t;
- dbp->close = __bt_close;
- dbp->del = __bt_delete;
- dbp->fd = __bt_fd;
- dbp->get = __bt_get;
- dbp->put = __bt_put;
- dbp->seq = __bt_seq;
- dbp->sync = __bt_sync;
+ /* Swap the page if we need to. */
+ if (F_ISSET(dbp, DB_AM_SWAP) &&
+ (ret = __bam_mswap(env, (PAGE *)btm)) != 0)
+ return (ret);
/*
- * If no file name was supplied, this is an in-memory btree and we
- * open a backing temporary file. Otherwise, it's a disk-based tree.
+ * Check application info against metadata info, and set info, flags,
+ * and type based on metadata info.
*/
- if (fname) {
- switch (flags & O_ACCMODE) {
- case O_RDONLY:
- F_SET(t, B_RDONLY);
- break;
- case O_RDWR:
- break;
- case O_WRONLY:
- default:
- goto einval;
- }
-
- if ((t->bt_fd = open(fname, flags, mode)) < 0)
- goto err;
-
+ if ((ret =
+ __db_fchk(env, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0)
+ return (ret);
+
+ if (F_ISSET(&btm->dbmeta, BTM_RECNO)) {
+ if (dbp->type == DB_BTREE)
+ goto wrong_type;
+ dbp->type = DB_RECNO;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
} else {
- if ((flags & O_ACCMODE) != O_RDWR)
- goto einval;
- if ((t->bt_fd = tmp()) == -1)
- goto err;
- F_SET(t, B_INMEM);
+ if (dbp->type == DB_RECNO)
+ goto wrong_type;
+ dbp->type = DB_BTREE;
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
}
- if (fcntl(t->bt_fd, F_SETFD, 1) == -1)
- goto err;
+ if (F_ISSET(&btm->dbmeta, BTM_DUP))
+ F_SET(dbp, DB_AM_DUP);
+ else
+ if (F_ISSET(dbp, DB_AM_DUP)) {
+ __db_errx(env,
+ "%s: DB_DUP specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
- if (fstat(t->bt_fd, &sb))
- goto err;
- if (sb.st_size) {
- if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0)
- goto err;
- if (nr != sizeof(BTMETA))
- goto eftype;
-
- /*
- * Read in the meta-data. This can change the notion of what
- * the lorder, page size and flags are, and, when the page size
- * changes, the cachesize value can change too. If the user
- * specified the wrong byte order for an existing database, we
- * don't bother to return an error, we just clear the NEEDSWAP
- * bit.
- */
- if (m.magic == BTREEMAGIC)
- F_CLR(t, B_NEEDSWAP);
- else {
- F_SET(t, B_NEEDSWAP);
- M_32_SWAP(m.magic);
- M_32_SWAP(m.version);
- M_32_SWAP(m.psize);
- M_32_SWAP(m.free);
- M_32_SWAP(m.nrecs);
- M_32_SWAP(m.flags);
+ if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) {
+ if (dbp->type != DB_BTREE)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_RECNUM);
+
+ if ((ret = __db_fcchk(env,
+ "DB->open", dbp->flags, DB_AM_DUP, DB_AM_RECNUM)) != 0)
+ return (ret);
+ } else
+ if (F_ISSET(dbp, DB_AM_RECNUM)) {
+ __db_errx(env,
+ "%s: DB_RECNUM specified to open method but not set in database",
+ name);
+ return (EINVAL);
}
- if (m.magic != BTREEMAGIC || m.version != BTREEVERSION)
- goto eftype;
- if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 ||
- m.psize & sizeof(indx_t) - 1)
- goto eftype;
- if (m.flags & ~SAVEMETA)
- goto eftype;
- b.psize = m.psize;
- F_SET(t, m.flags);
- t->bt_free = m.free;
- t->bt_nrecs = m.nrecs;
- } else {
- /*
- * Set the page size to the best value for I/O to this file.
- * Don't overflow the page offset type.
- */
- if (b.psize == 0) {
- b.psize = sb.st_blksize;
- if (b.psize < MINPSIZE)
- b.psize = MINPSIZE;
- if (b.psize > MAX_PAGE_OFFSET + 1)
- b.psize = MAX_PAGE_OFFSET + 1;
+
+ if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_FIXEDLEN);
+ } else
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ __db_errx(env,
+ "%s: DB_FIXEDLEN specified to open method but not set in database",
+ name);
+ return (EINVAL);
}
- /* Set flag if duplicates permitted. */
- if (!(b.flags & R_DUP))
- F_SET(t, B_NODUPS);
+ if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) {
+ if (dbp->type != DB_RECNO)
+ goto wrong_type;
+ F_SET(dbp, DB_AM_RENUMBER);
+ } else
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ __db_errx(env,
+ "%s: DB_RENUMBER specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
- t->bt_free = P_INVALID;
- t->bt_nrecs = 0;
- F_SET(t, B_METADIRTY);
- }
+ if (F_ISSET(&btm->dbmeta, BTM_SUBDB))
+ F_SET(dbp, DB_AM_SUBDB);
+ else
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ __db_errx(env,
+ "%s: multiple databases specified but not supported by file",
+ name);
+ return (EINVAL);
+ }
- t->bt_psize = b.psize;
+ if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ F_SET(dbp, DB_AM_DUPSORT);
+ } else
+ if (dbp->dup_compare != NULL) {
+ __db_errx(env,
+ "%s: duplicate sort specified but not supported in database",
+ name);
+ return (EINVAL);
+ }
- /* Set the cache size; must be a multiple of the page size. */
- if (b.cachesize && b.cachesize & b.psize - 1)
- b.cachesize += (~b.cachesize & b.psize - 1) + 1;
- if (b.cachesize < b.psize * MINCACHE)
- b.cachesize = b.psize * MINCACHE;
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
+ F_SET(dbp, DB_AM_COMPRESS);
+ if ((BTREE *)dbp->bt_internal != NULL &&
+ !DB_IS_COMPRESSED(dbp) &&
+ (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0)
+ return (ret);
+ } else {
+ if ((BTREE *)dbp->bt_internal != NULL &&
+ DB_IS_COMPRESSED(dbp)) {
+ __db_errx(env,
+			    "%s: compression specified to open method but not set in database",
+ name);
+ return (EINVAL);
+ }
+ }
+#else
+ if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) {
+ __db_errx(env,
+ "%s: compression support has not been compiled in",
+ name);
+ return (EINVAL);
+ }
+#endif
- /* Calculate number of pages to cache. */
- ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize;
+ /* Set the page size. */
+ dbp->pgsize = btm->dbmeta.pagesize;
- /*
- * The btree data structure requires that at least two keys can fit on
- * a page, but other than that there's no fixed requirement. The user
- * specified a minimum number per page, and we translated that into the
- * number of bytes a key/data pair can use before being placed on an
- * overflow page. This calculation includes the page header, the size
- * of the index referencing the leaf item and the size of the leaf item
- * structure. Also, don't let the user specify a minkeypage such that
- * a key/data pair won't fit even if both key and data are on overflow
- * pages.
- */
- t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage -
- (sizeof(indx_t) + NBLEAFDBT(0, 0));
- if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t))
- t->bt_ovflsize =
- NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t);
-
- /* Initialize the buffer pool. */
- if ((t->bt_mp =
- mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL)
- goto err;
- if (!F_ISSET(t, B_INMEM))
- mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t);
+ /* Copy the file's ID. */
+ memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN);
- /* Create a root page if new tree. */
- if (nroot(t) == RET_ERROR)
- goto err;
+ return (0);
- /* Global flags. */
- if (dflags & DB_LOCK)
- F_SET(t, B_DB_LOCK);
- if (dflags & DB_SHMEM)
- F_SET(t, B_DB_SHMEM);
- if (dflags & DB_TXN)
- F_SET(t, B_DB_TXN);
+wrong_type:
+ if (dbp->type == DB_BTREE)
+ __db_errx(env,
+ "open method type is Btree, database type is Recno");
+ else
+ __db_errx(env,
+ "open method type is Recno, database type is Btree");
+ return (EINVAL);
+}
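The version switch above runs before the page is byte-swapped, hence the
explicit M_32_SWAP of the version field when DB_AM_SWAP is set.  A
standalone sketch of that check pattern, with the 32-bit swap written out by
hand; the version numbers mirror the cases above:

	#include <stdint.h>

	static uint32_t
	swap32(uint32_t v)
	{
		return ((v >> 24) | ((v >> 8) & 0xff00) |
		    ((v << 8) & 0xff0000) | (v << 24));
	}

	/* Returns 0 if usable, -1 if the version is too old or unknown. */
	static int
	check_version(uint32_t vers, int needs_swap)
	{
		if (needs_swap)
			vers = swap32(vers);
		switch (vers) {
		case 8:			/* Current formats. */
		case 9:
			return (0);
		case 6:			/* Old formats: upgrade required. */
		case 7:
		default:
			return (-1);
		}
	}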
- return (dbp);
+/*
+ * __bam_read_root --
+ * Read the root page and check a tree.
+ *
+ * PUBLIC: int __bam_read_root __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t));
+ */
+int
+__bam_read_root(dbp, ip, txn, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ LOCK_INIT(metalock);
+ mpf = dbp->mpf;
+ ret = 0;
+
+ /* Get a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Get the metadata page. */
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0)
+ goto err;
-einval: errno = EINVAL;
- goto err;
+ /*
+ * If the magic number is set, the tree has been created. Correct
+ * any fields that may not be right. Note, all of the local flags
+ * were set by DB->open.
+ *
+ * Otherwise, we'd better be in recovery or abort, in which case the
+ * metadata page will be created/initialized elsewhere.
+ */
+ if (meta->dbmeta.magic == DB_BTREEMAGIC) {
+ t->bt_minkey = meta->minkey;
+ t->re_pad = (int)meta->re_pad;
+ t->re_len = meta->re_len;
+
+ t->bt_meta = base_pgno;
+ t->bt_root = meta->root;
+#ifndef HAVE_FTRUNCATE
+ if (PGNO(meta) == PGNO_BASE_MD &&
+ !F_ISSET(dbp, DB_AM_RECOVER) && !IS_VERSION(dbp, meta))
+ __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno);
+#endif
+ } else {
+ DB_ASSERT(dbp->env,
+ IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER));
+ }
-eftype: errno = EFTYPE;
- goto err;
+ /*
+ * !!!
+ * If creating a subdatabase, we've already done an insert when
+ * we put the subdatabase's entry into the master database, so
+ * our last-page-inserted value is wrongly initialized for the
+ * master database, not the subdatabase we're creating. I'm not
+	 * sure where the *right* place to clear this value is; it's not
+	 * intuitively obvious that it belongs here.
+ */
+ t->bt_lpgno = PGNO_INVALID;
+
+err: /* Put the metadata page back. */
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ ip, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
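+
+/*
+ * A note on the cleanup idiom above: the "err:" label runs every
+ * release step unconditionally, and the pattern
+ * "if ((t_ret = ...) != 0 && ret == 0) ret = t_ret;" preserves the
+ * first error seen while still putting back the metadata page, the
+ * metadata lock and the cursor.
+ */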
-err: if (t) {
- if (t->bt_dbp)
- free(t->bt_dbp);
- if (t->bt_fd != -1)
- (void)close(t->bt_fd);
- free(t);
+/*
+ * __bam_init_meta --
+ *
+ * Initialize a btree meta-data page. The following fields may need
+ * to be updated later: last_pgno, root.
+ */
+static void
+__bam_init_meta(dbp, meta, pgno, lsnp)
+ DB *dbp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ DB_LSN *lsnp;
+{
+ BTREE *t;
+#ifdef HAVE_PARTITION
+ DB_PARTITION *part;
+#endif
+ ENV *env;
+
+ env = dbp->env;
+ t = dbp->bt_internal;
+
+ memset(meta, 0, sizeof(BTMETA));
+ meta->dbmeta.lsn = *lsnp;
+ meta->dbmeta.pgno = pgno;
+ meta->dbmeta.magic = DB_BTREEMAGIC;
+ meta->dbmeta.version = DB_BTREEVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM);
+ if (F_ISSET(dbp, DB_AM_ENCRYPT)) {
+ meta->dbmeta.encrypt_alg = env->crypto_handle->alg;
+ DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0);
+ meta->crypto_magic = meta->dbmeta.magic;
}
- return (NULL);
+ meta->dbmeta.type = P_BTREEMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ meta->dbmeta.last_pgno = pgno;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, BTM_DUP);
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+ F_SET(&meta->dbmeta, BTM_FIXEDLEN);
+ if (F_ISSET(dbp, DB_AM_RECNUM))
+ F_SET(&meta->dbmeta, BTM_RECNUM);
+ if (F_ISSET(dbp, DB_AM_RENUMBER))
+ F_SET(&meta->dbmeta, BTM_RENUMBER);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, BTM_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, BTM_DUPSORT);
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp))
+ F_SET(&meta->dbmeta, BTM_COMPRESS);
+#endif
+ if (dbp->type == DB_RECNO)
+ F_SET(&meta->dbmeta, BTM_RECNO);
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ meta->minkey = t->bt_minkey;
+ meta->re_len = t->re_len;
+ meta->re_pad = (u_int32_t)t->re_pad;
+
+#ifdef HAVE_PARTITION
+ if ((part = dbp->p_internal) != NULL) {
+ meta->dbmeta.nparts = part->nparts;
+ if (F_ISSET(part, PART_CALLBACK))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK);
+ if (F_ISSET(part, PART_RANGE))
+ FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE);
+ }
+#endif
}
/*
- * NROOT -- Create the root of a new tree.
+ * __bam_new_file --
+ * Create the necessary pages to begin a new database file.
*
- * Parameters:
- * t: tree
+ * This code appears more complex than it is because of the two cases (named
+ * and unnamed). The way to read the code is that for each page being created,
+ * there are three parts: 1) a "get page" chunk (which either uses malloc'd
+ * memory or calls __memp_fget), 2) the initialization, and 3) the "put page"
+ * chunk which either does a fop write or an __memp_fput.
*
- * Returns:
- * RET_ERROR, RET_SUCCESS
+ * PUBLIC: int __bam_new_file __P((DB *,
+ * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *));
*/
-static int
-nroot(t)
- BTREE *t;
+int
+__bam_new_file(dbp, ip, txn, fhp, name)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ DB_FH *fhp;
+ const char *name;
{
- PAGE *meta, *root;
- pgno_t npg;
+ BTMETA *meta;
+ DBT pdbt;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ DB_PGINFO pginfo;
+ ENV *env;
+ PAGE *root;
+ db_pgno_t pgno;
+ int ret, t_ret;
+ void *buf;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ root = NULL;
+ meta = NULL;
+ buf = NULL;
+
+ if (F_ISSET(dbp, DB_AM_INMEM)) {
+ /* Build the meta-data page. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __memp_fget(mpf, &pgno, ip, txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0)
+ return (ret);
+ LSN_NOT_LOGGED(lsn);
+ __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->root = 1;
+ meta->dbmeta.last_pgno = 1;
+ if ((ret =
+ __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, meta, dbp->priority);
+ meta = NULL;
+ if (ret != 0)
+ goto err;
- if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) {
- mpool_put(t->bt_mp, meta, 0);
- return (RET_SUCCESS);
- }
- if (errno != EINVAL) /* It's OK to not exist. */
- return (RET_ERROR);
- errno = 0;
-
- if ((meta = mpool_new(t->bt_mp, &npg)) == NULL)
- return (RET_ERROR);
-
- if ((root = mpool_new(t->bt_mp, &npg)) == NULL)
- return (RET_ERROR);
-
- if (npg != P_ROOT)
- return (RET_ERROR);
- root->pgno = npg;
- root->prevpg = root->nextpg = P_INVALID;
- root->lower = BTDATAOFF;
- root->upper = t->bt_psize;
- root->flags = P_BLEAF;
- memset(meta, 0, t->bt_psize);
- mpool_put(t->bt_mp, meta, MPOOL_DIRTY);
- mpool_put(t->bt_mp, root, MPOOL_DIRTY);
- return (RET_SUCCESS);
-}
+ /* Build the root page. */
+ pgno = 1;
+ if ((ret = __memp_fget(mpf, &pgno,
+ ip, txn, DB_MPOOL_CREATE, &root)) != 0)
+ goto err;
+ P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
+ LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+ LSN_NOT_LOGGED(root->lsn);
+ if ((ret =
+ __db_log_page(dbp, txn, &root->lsn, pgno, root)) != 0)
+ goto err;
+ ret = __memp_fput(mpf, ip, root, dbp->priority);
+ root = NULL;
+ if (ret != 0)
+ goto err;
+ } else {
+ memset(&pdbt, 0, sizeof(pdbt));
+
+ /* Build the meta-data page. */
+ pginfo.db_pagesize = dbp->pgsize;
+ pginfo.flags =
+ F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
+ pginfo.type = dbp->type;
+ pdbt.data = &pginfo;
+ pdbt.size = sizeof(pginfo);
+ if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0)
+ return (ret);
+ meta = (BTMETA *)buf;
+ LSN_NOT_LOGGED(lsn);
+ __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn);
+ meta->root = 1;
+ meta->dbmeta.last_pgno = 1;
+ if ((ret = __db_pgout(
+ dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0)
+ goto err;
+ if ((ret = __fop_write(env, txn, name, dbp->dirname,
+ DB_APP_DATA, fhp,
+ dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ meta = NULL;
-static int
-tmp()
-{
- sigset_t set, oset;
- int fd;
- char *envtmp;
- char path[MAXPATHLEN];
-
- envtmp = getenv("TMPDIR");
- (void)snprintf(path,
- sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp");
-
- (void)sigfillset(&set);
- (void)sigprocmask(SIG_BLOCK, &set, &oset);
- if ((fd = mkstemp(path)) != -1)
- (void)unlink(path);
- (void)sigprocmask(SIG_SETMASK, &oset, NULL);
- return(fd);
-}
+ /* Build the root page. */
+#ifdef DIAGNOSTIC
+ memset(buf, CLEAR_BYTE, dbp->pgsize);
+#endif
+ root = (PAGE *)buf;
+ P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID,
+ LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE);
+ LSN_NOT_LOGGED(root->lsn);
+ if ((ret =
+ __db_pgout(dbp->dbenv, root->pgno, root, &pdbt)) != 0)
+ goto err;
+ if ((ret =
+ __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA,
+ fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET(
+ dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0)
+ goto err;
+ root = NULL;
+ }
-static int
-byteorder()
-{
- u_int32_t x;
- u_char *p;
-
- x = 0x01020304;
- p = (u_char *)&x;
- switch (*p) {
- case 1:
- return (BIG_ENDIAN);
- case 4:
- return (LITTLE_ENDIAN);
- default:
- return (0);
+err: if (buf != NULL)
+ __os_free(env, buf);
+ else {
+ if (meta != NULL &&
+ (t_ret = __memp_fput(mpf, ip,
+ meta, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL &&
+ (t_ret = __memp_fput(mpf, ip,
+ root, dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
}
+ return (ret);
}
+/*
+ * __bam_new_subdb --
+ * Create a metadata page and a root page for a new btree.
+ *
+ * PUBLIC: int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *));
+ */
int
-__bt_fd(dbp)
- const DB *dbp;
+__bam_new_subdb(mdbp, dbp, ip, txn)
+ DB *mdbp, *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
{
- BTREE *t;
+ BTMETA *meta;
+ DBC *dbc;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *root;
+ int ret, t_ret;
+
+ env = mdbp->env;
+ mpf = mdbp->mpf;
+ dbc = NULL;
+ meta = NULL;
+ root = NULL;
+
+ if ((ret = __db_cursor(mdbp, ip, txn,
+ &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
+ return (ret);
+
+	/* Get, and optionally create, the metadata page. */
+ if ((ret = __db_lget(dbc,
+ 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &dbp->meta_pgno,
+ ip, txn, DB_MPOOL_CREATE, &meta)) != 0)
+ goto err;
- t = dbp->internal;
+ /* Build meta-data page. */
+ lsn = meta->dbmeta.lsn;
+ __bam_init_meta(dbp, meta, dbp->meta_pgno, &lsn);
+ if ((ret = __db_log_page(mdbp,
+ txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0)
+ goto err;
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
- }
+ /* Create and initialize a root page. */
+ if ((ret = __db_new(dbc,
+ dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, NULL, &root)) != 0)
+ goto err;
+ root->level = LEAFLEVEL;
- /* In-memory database can't have a file descriptor. */
- if (F_ISSET(t, B_INMEM)) {
- errno = ENOENT;
- return (-1);
- }
- return (t->bt_fd);
+ if (DBENV_LOGGING(env) &&
+#if !defined(DEBUG_WOP)
+ txn != NULL &&
+#endif
+ (ret = __bam_root_log(mdbp, txn, &meta->dbmeta.lsn, 0,
+ meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0)
+ goto err;
+
+ meta->root = root->pgno;
+ if ((ret =
+ __db_log_page(mdbp, txn, &root->lsn, root->pgno, root)) != 0)
+ goto err;
+
+ /* Release the metadata and root pages. */
+ if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+ if ((ret = __memp_fput(mpf, ip, root, dbc->priority)) != 0)
+ goto err;
+ root = NULL;
+err:
+ if (meta != NULL)
+ if ((t_ret = __memp_fput(mpf, ip,
+ meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (root != NULL)
+ if ((t_ret = __memp_fput(mpf, ip,
+ root, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL)
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
}
diff --git a/btree/bt_overflow.c b/btree/bt_overflow.c
deleted file mode 100644
index b28b8e0..0000000
--- a/btree/bt_overflow.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_overflow.c 8.5 (Berkeley) 7/16/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/param.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <db.h>
-#include "btree.h"
-
-/*
- * Big key/data code.
- *
- * Big key and data entries are stored on linked lists of pages. The initial
- * reference is a byte string stored with the key or data and is the page number
- * and size. The actual record is stored in a chain of pages linked by the
- * nextpg field of the PAGE header.
- *
- * The first page of the chain has a special property. If the record is used
- * by an internal page, it cannot be deleted and the P_PRESERVE bit will be set
- * in the header.
- *
- * XXX
- * A single DBT is written to each chain, so a lot of space on the last page
- * is wasted. This is a fairly major bug for some data sets.
- */
-
-/*
- * __OVFL_GET -- Get an overflow key/data item.
- *
- * Parameters:
- * t: tree
- * p: pointer to { pgno_t, u_int32_t }
- * buf: storage address
- * bufsz: storage size
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- */
-int
-__ovfl_get(t, p, ssz, buf, bufsz)
- BTREE *t;
- void *p;
- size_t *ssz;
- void **buf;
- size_t *bufsz;
-{
- PAGE *h;
- pgno_t pg;
- size_t nb, plen;
- u_int32_t sz;
-
- memmove(&pg, p, sizeof(pgno_t));
- memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t));
- *ssz = sz;
-
-#ifdef DEBUG
- if (pg == P_INVALID || sz == 0)
- abort();
-#endif
- /* Make the buffer bigger as necessary. */
- if (*bufsz < sz) {
- *buf = (char *)(*buf == NULL ? malloc(sz) : realloc(*buf, sz));
- if (*buf == NULL)
- return (RET_ERROR);
- *bufsz = sz;
- }
-
- /*
- * Step through the linked list of pages, copying the data on each one
- * into the buffer. Never copy more than the data's length.
- */
- plen = t->bt_psize - BTDATAOFF;
- for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) {
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
-
- nb = MIN(sz, plen);
- memmove(p, (char *)h + BTDATAOFF, nb);
- mpool_put(t->bt_mp, h, 0);
-
- if ((sz -= nb) == 0)
- break;
- }
- return (RET_SUCCESS);
-}
-
-/*
- * __OVFL_PUT -- Store an overflow key/data item.
- *
- * Parameters:
- * t: tree
- * data: DBT to store
- * pgno: storage page number
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- */
-int
-__ovfl_put(t, dbt, pg)
- BTREE *t;
- const DBT *dbt;
- pgno_t *pg;
-{
- PAGE *h, *last;
- void *p;
- pgno_t npg;
- size_t nb, plen;
- u_int32_t sz;
-
- /*
- * Allocate pages and copy the key/data record into them. Store the
- * number of the first page in the chain.
- */
- plen = t->bt_psize - BTDATAOFF;
- for (last = NULL, p = dbt->data, sz = dbt->size;;
- p = (char *)p + plen, last = h) {
- if ((h = __bt_new(t, &npg)) == NULL)
- return (RET_ERROR);
-
- h->pgno = npg;
- h->nextpg = h->prevpg = P_INVALID;
- h->flags = P_OVERFLOW;
- h->lower = h->upper = 0;
-
- nb = MIN(sz, plen);
- memmove((char *)h + BTDATAOFF, p, nb);
-
- if (last) {
- last->nextpg = h->pgno;
- mpool_put(t->bt_mp, last, MPOOL_DIRTY);
- } else
- *pg = h->pgno;
-
- if ((sz -= nb) == 0) {
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
- break;
- }
- }
- return (RET_SUCCESS);
-}
-
-/*
- * __OVFL_DELETE -- Delete an overflow chain.
- *
- * Parameters:
- * t: tree
- * p: pointer to { pgno_t, u_int32_t }
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- */
-int
-__ovfl_delete(t, p)
- BTREE *t;
- void *p;
-{
- PAGE *h;
- pgno_t pg;
- size_t plen;
- u_int32_t sz;
-
- memmove(&pg, p, sizeof(pgno_t));
- memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t));
-
-#ifdef DEBUG
- if (pg == P_INVALID || sz == 0)
- abort();
-#endif
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
-
- /* Don't delete chains used by internal pages. */
- if (h->flags & P_PRESERVE) {
- mpool_put(t->bt_mp, h, 0);
- return (RET_SUCCESS);
- }
-
- /* Step through the chain, calling the free routine for each page. */
- for (plen = t->bt_psize - BTDATAOFF;; sz -= plen) {
- pg = h->nextpg;
- __bt_free(t, h);
- if (sz <= plen)
- break;
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
- }
- return (RET_SUCCESS);
-}
diff --git a/btree/bt_page.c b/btree/bt_page.c
deleted file mode 100644
index 0d9d138..0000000
--- a/btree/bt_page.c
+++ /dev/null
@@ -1,98 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_page.c 8.3 (Berkeley) 7/14/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
-
-#include <stdio.h>
-
-#include <db.h>
-#include "btree.h"
-
-/*
- * __bt_free --
- * Put a page on the freelist.
- *
- * Parameters:
- * t: tree
- * h: page to free
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- *
- * Side-effect:
- * mpool_put's the page.
- */
-int
-__bt_free(t, h)
- BTREE *t;
- PAGE *h;
-{
- /* Insert the page at the head of the free list. */
- h->prevpg = P_INVALID;
- h->nextpg = t->bt_free;
- t->bt_free = h->pgno;
-
- /* Make sure the page gets written back. */
- return (mpool_put(t->bt_mp, h, MPOOL_DIRTY));
-}
-
-/*
- * __bt_new --
- * Get a new page, preferably from the freelist.
- *
- * Parameters:
- * t: tree
- * npg: storage for page number.
- *
- * Returns:
- * Pointer to a page, NULL on error.
- */
-PAGE *
-__bt_new(t, npg)
- BTREE *t;
- pgno_t *npg;
-{
- PAGE *h;
-
- if (t->bt_free != P_INVALID &&
- (h = mpool_get(t->bt_mp, t->bt_free, 0)) != NULL) {
- *npg = t->bt_free;
- t->bt_free = h->nextpg;
- return (h);
- }
- return (mpool_new(t->bt_mp, npg));
-}
diff --git a/btree/bt_put.c b/btree/bt_put.c
index 952be09..683b09c 100644
--- a/btree/bt_put.c
+++ b/btree/bt_put.c
@@ -1,5 +1,14 @@
/*-
- * Copyright (c) 1990, 1993, 1994
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
@@ -13,11 +22,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -32,289 +37,1033 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $Id$
*/
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_put.c 8.8 (Berkeley) 7/26/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
+#include "db_config.h"
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
-#include <db.h>
-#include "btree.h"
-
-static EPG *bt_fast __P((BTREE *, const DBT *, const DBT *, int *));
+static int __bam_build
+ __P((DBC *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_dup_check __P((DBC *, u_int32_t,
+ PAGE *, u_int32_t, u_int32_t, db_indx_t *));
+static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_ovput
+ __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
+static u_int32_t
+ __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
/*
- * __BT_PUT -- Add a btree item to the tree.
- *
- * Parameters:
- * dbp: pointer to access method
- * key: key
- * data: data
- * flag: R_NOOVERWRITE
+ * __bam_iitem --
+ * Insert an item into the tree.
*
- * Returns:
- * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the
- * tree and R_NOOVERWRITE specified.
+ * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
*/
int
-__bt_put(dbp, key, data, flags)
- const DB *dbp;
- DBT *key;
- const DBT *data;
- u_int flags;
+__bam_iitem(dbc, key, data, op, flags)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t op, flags;
{
+ BKEYDATA *bk, bk_tmp;
BTREE *t;
- DBT tkey, tdata;
- EPG *e;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT bk_hdr, tdbt;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
PAGE *h;
- indx_t index, nxtindex;
- pgno_t pg;
- u_int32_t nbytes;
- int dflags, exact, status;
- char *dest, db[NOVFLSIZE], kb[NOVFLSIZE];
+ db_indx_t cnt, indx;
+ u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace;
+ char tmp_ch;
+ int cmp, bigkey, bigdata, del, dupadjust;
+ int padrec, replace, ret, t_ret, was_deleted;
+
+ COMPQUIET(cnt, 0);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ h = cp->page;
+ indx = cp->indx;
+ del = dupadjust = replace = was_deleted = 0;
+
+ /*
+ * Fixed-length records with partial puts: it's an error to specify
+	 * anything other than a simple overwrite.
+ */
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+ F_ISSET(data, DB_DBT_PARTIAL) && data->size != data->dlen)
+ return (__db_rec_repl(env, data->size, data->dlen));
+
+ /*
+ * Figure out how much space the data will take, including if it's a
+ * partial record.
+ *
+ * Fixed-length records: it's an error to specify a record that's
+ * longer than the fixed-length, and we never require less than
+ * the fixed-length record size.
+ */
+ data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+ __bam_partsize(dbp, op, data, h, indx) : data->size;
+ padrec = 0;
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ if (data_size > t->re_len)
+ return (__db_rec_toobig(env, data_size, t->re_len));
+
+ /* Records that are deleted anyway needn't be padded out. */
+ if (!LF_ISSET(BI_DELETED) && data_size < t->re_len) {
+ padrec = 1;
+ data_size = t->re_len;
+ }
+ }
- t = dbp->internal;
+ /*
+ * Handle partial puts or short fixed-length records: check whether we
+ * can just append the data or else build the real record. We can't
+ * append if there are secondaries: we need the whole data item for the
+ * application's secondary callback.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare == NULL &&
+ F_ISSET(data, DB_DBT_PARTIAL) && !DB_IS_PRIMARY(dbp)) {
+ bk = GET_BKEYDATA(
+ dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ /*
+ * If the item is an overflow type, and the input DBT is
+		 * partial and begins at the length of the current item, then
+		 * it is an append.  Avoid deleting and re-creating the entire
+ * offpage item.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ data->doff == ((BOVERFLOW *)bk)->tlen) {
+ /*
+ * If the cursor has not already cached the last page
+			 * in the offpage chain, we need to walk the chain
+ * to be sure that the page has been read.
+ */
+ if (cp->stream_start_pgno != ((BOVERFLOW *)bk)->pgno ||
+ cp->stream_off > data->doff || data->doff >
+ cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+ memset(&tdbt, 0, sizeof(DBT));
+ tdbt.doff = data->doff - 1;
+ /*
+ * Set the length to 1, to force __db_goff
+ * to do the traversal.
+ */
+ tdbt.dlen = tdbt.ulen = 1;
+ tdbt.data = &tmp_ch;
+ tdbt.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+ /*
+ * Read to the last page. It will be cached
+ * in the cursor.
+ */
+ if ((ret = __db_goff(
+ dbc, &tdbt, ((BOVERFLOW *)bk)->tlen,
+ ((BOVERFLOW *)bk)->pgno, NULL, NULL)) != 0)
+ return (ret);
+ }
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
+ /*
+ * Since this is an append, dlen is irrelevant (there
+ * are no bytes to overwrite). We need the caller's
+ * DBT size to end up with the total size of the item.
+ * From now on, use dlen as the length of the user's
+ * data that we are going to append.
+ * Don't futz with the caller's DBT any more than we
+ * have to in order to send back the size.
+ */
+ tdbt = *data;
+ tdbt.dlen = data->size;
+ tdbt.size = data_size;
+ data = &tdbt;
+ F_SET(data, DB_DBT_STREAMING);
+ }
+ }
+ if (!F_ISSET(data, DB_DBT_STREAMING) &&
+ (padrec || F_ISSET(data, DB_DBT_PARTIAL))) {
+ tdbt = *data;
+ if ((ret =
+ __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
+ return (ret);
+ data = &tdbt;
}
- /* Check for change to a read-only tree. */
- if (F_ISSET(t, B_RDONLY)) {
- errno = EPERM;
- return (RET_ERROR);
+ /*
+ * If the user has specified a duplicate comparison function, return
+ * an error if DB_CURRENT was specified and the replacement data
+ * doesn't compare equal to the current data. This stops apps from
+ * screwing up the duplicate sort order. We have to do this after
+ * we build the real record so that we're comparing the real items.
+ */
+ if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+ if ((ret = __bam_cmp(dbc, data, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp != 0) {
+ __db_errx(env,
+ "Existing data sorts differently from put data");
+ return (EINVAL);
+ }
}
- switch (flags) {
- case 0:
- case R_NOOVERWRITE:
+ /*
+ * If the key or data item won't fit on a page, we'll have to store
+ * them on overflow pages.
+ */
+ needed = 0;
+ bigdata = data_size > cp->ovflsize;
+ switch (op) {
+ case DB_KEYFIRST:
+ /* We're adding a new key and data pair. */
+ bigkey = key->size > cp->ovflsize;
+ if (bigkey)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(key->size);
+ if (bigdata)
+ needed += BOVERFLOW_PSIZE;
+ else
+ needed += BKEYDATA_PSIZE(data_size);
break;
- case R_CURSOR:
+ case DB_AFTER:
+ case DB_BEFORE:
+ case DB_CURRENT:
/*
- * If flags is R_CURSOR, put the cursor. Must already
- * have started a scan and not have already deleted it.
+ * We're either overwriting the data item of a key/data pair
+ * or we're creating a new on-page duplicate and only adding
+ * a data item.
+ *
+ * !!!
+ * We're not currently correcting for space reclaimed from
+ * already deleted items, but I don't think it's worth the
+ * complexity.
*/
- if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
- !F_ISSET(&t->bt_cursor,
- CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE))
- break;
- /* FALLTHROUGH */
+ bigkey = 0;
+ if (op == DB_CURRENT) {
+ bk = GET_BKEYDATA(dbp, h,
+ indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ if (B_TYPE(bk->type) == B_KEYDATA)
+ have_bytes = BKEYDATA_PSIZE(bk->len);
+ else
+ have_bytes = BOVERFLOW_PSIZE;
+ need_bytes = 0;
+ } else {
+ have_bytes = 0;
+ need_bytes = sizeof(db_indx_t);
+ }
+ if (bigdata)
+ need_bytes += BOVERFLOW_PSIZE;
+ else
+ need_bytes += BKEYDATA_PSIZE(data_size);
+
+ if (have_bytes < need_bytes)
+ needed += need_bytes - have_bytes;
+ break;
default:
- errno = EINVAL;
- return (RET_ERROR);
+ return (__db_unknown_flag(env, "DB->put", op));
+ }
+
+ /* Split the page if there's not enough room. */
+ if (P_FREESPACE(dbp, h) < needed)
+ return (DB_NEEDSPLIT);
+
+ /*
+ * Check to see if we will convert to off page duplicates -- if
+ * so, we'll need a page.
+ */
+ if (F_ISSET(dbp, DB_AM_DUP) &&
+ TYPE(h) == P_LBTREE && op != DB_KEYFIRST &&
+ P_FREESPACE(dbp, h) - needed <= dbp->pgsize / 2 &&
+ __bam_dup_check(dbc, op, h, indx, needed, &cnt)) {
+ pages = 1;
+ dupadjust = 1;
+ } else
+ pages = 0;
+
+ /*
+ * If we are not using transactions and there is a page limit
+ * set on the file, then figure out if things will fit before
+ * taking action.
+ */
+ if (dbc->txn == NULL && mpf->mfp->maxpgno != 0) {
+ pagespace = P_MAXSPACE(dbp, dbp->pgsize);
+ if (bigdata)
+ pages += ((data_size - 1) / pagespace) + 1;
+ if (bigkey)
+ pages += ((key->size - 1) / pagespace) + 1;
+
+ if (pages > (mpf->mfp->maxpgno - mpf->mfp->last_pgno))
+ return (__db_space_err(dbp));
}
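+	/*
+	 * For illustration: ((n - 1) / pagespace) + 1 above is ceiling
+	 * division, e.g. a 10000-byte item with 4000 usable bytes per
+	 * page needs ((10000 - 1) / 4000) + 1 = 3 overflow pages.
+	 */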
+ ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0);
+ if (cp->csp->page == cp->page)
+ cp->csp->page = h;
+ cp->page = h;
+ if (ret != 0)
+ return (ret);
+
/*
- * If the key/data pair won't fit on a page, store it on overflow
- * pages. Only put the key on the overflow page if the pair are
- * still too big after moving the data to an overflow page.
+ * The code breaks it up into five cases:
*
- * XXX
- * If the insert fails later on, the overflow pages aren't recovered.
+ * 1. Insert a new key/data pair.
+ * 2. Append a new data item (a new duplicate).
+ * 3. Insert a new data item (a new duplicate).
+ * 4. Delete and re-add the data item (overflow item).
+ * 5. Overwrite the data item.
*/
- dflags = 0;
- if (key->size + data->size > t->bt_ovflsize) {
- if (key->size > t->bt_ovflsize) {
-storekey: if (__ovfl_put(t, key, &pg) == RET_ERROR)
- return (RET_ERROR);
- tkey.data = kb;
- tkey.size = NOVFLSIZE;
- memmove(kb, &pg, sizeof(pgno_t));
- memmove(kb + sizeof(pgno_t),
- &key->size, sizeof(u_int32_t));
- dflags |= P_BIGKEY;
- key = &tkey;
+ switch (op) {
+ case DB_KEYFIRST: /* 1. Insert a new key/data pair. */
+ if (bigkey) {
+ if ((ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0)
+ return (ret);
+ } else
+ if ((ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(key->size), NULL, key)) != 0)
+ return (ret);
+
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ ++indx;
+ break;
+ case DB_AFTER: /* 2. Append a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret =
+ __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0)
+ return (ret);
+ if ((ret =
+ __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0)
+ return (ret);
+
+ indx += 3;
+
+ cp->indx += 2;
+ } else {
+ ++indx;
+ cp->indx += 1;
}
- if (key->size + data->size > t->bt_ovflsize) {
- if (__ovfl_put(t, data, &pg) == RET_ERROR)
- return (RET_ERROR);
- tdata.data = db;
- tdata.size = NOVFLSIZE;
- memmove(db, &pg, sizeof(pgno_t));
- memmove(db + sizeof(pgno_t),
- &data->size, sizeof(u_int32_t));
- dflags |= P_BIGDATA;
- data = &tdata;
+ break;
+ case DB_BEFORE: /* 3. Insert a new data item. */
+ if (TYPE(h) == P_LBTREE) {
+ /* Copy the key for the duplicate and adjust cursors. */
+ if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0)
+ return (ret);
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+
+ ++indx;
}
- if (key->size + data->size > t->bt_ovflsize)
- goto storekey;
+ break;
+ case DB_CURRENT:
+ /*
+ * Clear the cursor's deleted flag. The problem is that if
+ * we deadlock or fail while deleting the overflow item or
+ * replacing the non-overflow item, a subsequent cursor close
+ * will try and remove the item because the cursor's delete
+ * flag is set.
+ */
+ if ((ret = __bam_ca_delete(dbp, PGNO(h), indx, 0, NULL)) != 0)
+ return (ret);
+
+ if (TYPE(h) == P_LBTREE)
+ ++indx;
+ bk = GET_BKEYDATA(dbp, h, indx);
+
+ /*
+		 * In a Btree, deleted records aren't counted (deleted records
+ * are counted in a Recno because all accesses are based on
+ * record number). If it's a Btree and it's a DB_CURRENT
+ * operation overwriting a previously deleted record, increment
+ * the record count.
+ */
+ if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
+ was_deleted = B_DISSET(bk->type);
+
+ /*
+ * 4. Delete and re-add the data item.
+ *
+ * If we're changing the type of the on-page structure, or we
+ * are referencing offpage items, we have to delete and then
+ * re-add the item. We do not do any cursor adjustments here
+ * because we're going to immediately re-add the item into the
+ * same slot.
+ */
+ if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+ /*
+ * If streaming, don't delete the overflow item,
+ * just delete the item pointing to the overflow item.
+ * It will be added back in later, with the new size.
+ * We can't simply adjust the size of the item on the
+ * page, because there is no easy way to log a
+ * modification.
+ */
+ if (F_ISSET(data, DB_DBT_STREAMING)) {
+ if ((ret = __db_ditem(
+ dbc, h, indx, BOVERFLOW_SIZE)) != 0)
+ return (ret);
+ } else if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+ return (ret);
+ del = 1;
+ break;
+ }
+
+ /* 5. Overwrite the data item. */
+ replace = 1;
+ break;
+ default:
+ return (__db_unknown_flag(env, "DB->put", op));
+ }
+
+ /* Add the data. */
+ if (bigdata) {
+ /*
+ * We do not have to handle deleted (BI_DELETED) records
+ * in this case; the actual records should never be created.
+ */
+ DB_ASSERT(env, !LF_ISSET(BI_DELETED));
+ ret = __bam_ovput(dbc,
+ B_OVERFLOW, PGNO_INVALID, h, indx, data);
+ } else {
+ if (LF_ISSET(BI_DELETED)) {
+ B_TSET_DELETED(bk_tmp.type, B_KEYDATA);
+ bk_tmp.len = data->size;
+ bk_hdr.data = &bk_tmp;
+ bk_hdr.size = SSZA(BKEYDATA, data);
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), &bk_hdr, data);
+ } else if (replace)
+ ret = __bam_ritem(dbc, h, indx, data, 0);
+ else
+ ret = __db_pitem(dbc, h, indx,
+ BKEYDATA_SIZE(data->size), NULL, data);
+ }
+ if (ret != 0) {
+ if (del == 1 && (t_ret =
+ __bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) {
+ __db_err(env, t_ret,
+ "cursor adjustment after delete failed");
+ return (__env_panic(env, t_ret));
+ }
+ return (ret);
}
- /* Replace the cursor. */
- if (flags == R_CURSOR) {
- if ((h = mpool_get(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL)
- return (RET_ERROR);
- index = t->bt_cursor.pg.index;
- goto delete;
+ /*
+ * Re-position the cursors if necessary and reset the current cursor
+ * to point to the new item.
+ */
+ if (op != DB_CURRENT) {
+ if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+ return (ret);
+ cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
}
/*
- * Find the key to delete, or, the location at which to insert.
- * Bt_fast and __bt_search both pin the returned page.
+ * If we've changed the record count, update the tree. There's no
+	 * need to adjust the count if the operation was performed on the
+	 * current record, unless that record was previously deleted.
*/
- if (t->bt_order == NOT || (e = bt_fast(t, key, data, &exact)) == NULL)
- if ((e = __bt_search(t, key, &exact)) == NULL)
- return (RET_ERROR);
- h = e->page;
- index = e->index;
+ if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
+ if ((ret = __bam_adjust(dbc, 1)) != 0)
+ return (ret);
/*
- * Add the key/data pair to the tree. If an identical key is already
- * in the tree, and R_NOOVERWRITE is set, an error is returned. If
- * R_NOOVERWRITE is not set, the key is either added (if duplicates are
- * permitted) or an error is returned.
+ * If a Btree leaf page is at least 50% full and we may have added or
+ * modified a duplicate data item, see if the set of duplicates takes
+ * up at least 25% of the space on the page. If it does, move it onto
+ * its own page.
*/
- switch (flags) {
- case R_NOOVERWRITE:
- if (!exact)
- break;
- mpool_put(t->bt_mp, h, 0);
- return (RET_SPECIAL);
- default:
- if (!exact || !F_ISSET(t, B_NODUPS))
- break;
+ if (dupadjust &&
+ (ret = __bam_dup_convert(dbc, h, indx - O_INDX, cnt)) != 0)
+ return (ret);
+
+ /* If we've modified a recno file, set the flag. */
+ if (dbc->dbtype == DB_RECNO)
+ t->re_modified = 1;
+
+ return (ret);
+}
+
+/*
+ * __bam_partsize --
+ * Figure out how much space a partial data item is in total.
+ */
+static u_int32_t
+__bam_partsize(dbp, op, data, h, indx)
+ DB *dbp;
+ u_int32_t op, indx;
+ DBT *data;
+ PAGE *h;
+{
+ BKEYDATA *bk;
+ u_int32_t nbytes;
+
+ /*
+ * If the record doesn't already exist, it's simply the data we're
+ * provided.
+ */
+ if (op != DB_CURRENT)
+ return (data->doff + data->size);
+
+ /*
+ * Otherwise, it's the data provided plus any already existing data
+ * that we're not replacing.
+ */
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+ nbytes =
+ B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+
+ return (__db_partsize(nbytes, data));
+}
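+
+/*
+ * For illustration, the partial-put sizing rule above restated as a
+ * stand-alone sketch (the helper name is ours, not part of DB),
+ * assuming __db_partsize() computes "bytes kept plus bytes provided":
+ * for a 100-byte record, doff=90, dlen=20, size=30 replaces a region
+ * running past the end, giving doff + size = 120; doff=10, dlen=20,
+ * size=5 is interior, giving 100 + 5 - 20 = 85.
+ */
+static u_int32_t
+example_partsize(nbytes, doff, dlen, size)
+	u_int32_t nbytes, doff, dlen, size;
+{
+	/* The replaced region runs past the end of the old record. */
+	if (nbytes < doff + dlen)
+		return (doff + size);
+
+	/* Interior replacement: old length, plus added, minus replaced. */
+	return (nbytes + size - dlen);
+}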
+
+/*
+ * __bam_build --
+ * Build the real record for a partial put, or short fixed-length record.
+ */
+static int
+__bam_build(dbc, op, dbt, h, indx, nbytes)
+ DBC *dbc;
+ u_int32_t op, indx, nbytes;
+ DBT *dbt;
+ PAGE *h;
+{
+ BKEYDATA *bk, tbk;
+ BOVERFLOW *bo;
+ BTREE *t;
+ DB *dbp;
+ DBT copy, *rdata;
+ u_int32_t len, tlen;
+ u_int8_t *p;
+ int ret;
+
+ COMPQUIET(bo, NULL);
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+
+ /* We use the record data return memory, it's only a short-term use. */
+ rdata = &dbc->my_rdata;
+ if (rdata->ulen < nbytes) {
+ if ((ret = __os_realloc(dbp->env,
+ nbytes, &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ }
+ rdata->ulen = nbytes;
+ }
+
+ /*
+ * We use nul or pad bytes for any part of the record that isn't
+ * specified; get it over with.
+ */
+ memset(rdata->data,
+ F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_pad : 0, nbytes);
+
+ /*
+ * In the next clauses, we need to do three things: a) set p to point
+ * to the place at which to copy the user's data, b) set tlen to the
+ * total length of the record, not including the bytes contributed by
+ * the user, and c) copy any valid data from an existing record. If
+ * it's not a partial put (this code is called for both partial puts
+ * and fixed-length record padding) or it's a new key, we can cut to
+ * the chase.
+ */
+ if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
+ p = (u_int8_t *)rdata->data + dbt->doff;
+ tlen = dbt->doff;
+ goto user_copy;
+ }
+
+ /* Find the current record. */
+ if (indx < NUM_ENT(h)) {
+ bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ?
+ O_INDX : 0));
+ bo = (BOVERFLOW *)bk;
+ } else {
+ bk = &tbk;
+ B_TSET(bk->type, B_KEYDATA);
+ bk->len = 0;
+ }
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
/*
- * !!!
- * Note, the delete may empty the page, so we need to put a
- * new entry into the page immediately.
+ * In the case of an overflow record, we shift things around
+ * in the current record rather than allocate a separate copy.
*/
-delete: if (__bt_dleaf(t, key, h, index) == RET_ERROR) {
- mpool_put(t->bt_mp, h, 0);
- return (RET_ERROR);
+ memset(&copy, 0, sizeof(copy));
+ if ((ret = __db_goff(dbc, &copy, bo->tlen, bo->pgno,
+ &rdata->data, &rdata->ulen)) != 0)
+ return (ret);
+
+ /* Skip any leading data from the original record. */
+ tlen = dbt->doff;
+ p = (u_int8_t *)rdata->data + dbt->doff;
+
+ /*
+ * Copy in any trailing data from the original record.
+ *
+ * If the original record was larger than the original offset
+ * plus the bytes being deleted, there is trailing data in the
+ * original record we need to preserve. If we aren't deleting
+ * the same number of bytes as we're inserting, copy it up or
+ * down, into place.
+ *
+		 * Use memmove(); the regions may overlap.
+ */
+ if (bo->tlen > dbt->doff + dbt->dlen) {
+ len = bo->tlen - (dbt->doff + dbt->dlen);
+ if (dbt->dlen != dbt->size)
+ memmove(p + dbt->size, p + dbt->dlen, len);
+ tlen += len;
+ }
+ } else {
+ /* Copy in any leading data from the original record. */
+ memcpy(rdata->data,
+ bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
+ tlen = dbt->doff;
+ p = (u_int8_t *)rdata->data + dbt->doff;
+
+ /* Copy in any trailing data from the original record. */
+ len = dbt->doff + dbt->dlen;
+ if (bk->len > len) {
+ memcpy(p + dbt->size, bk->data + len, bk->len - len);
+ tlen += bk->len - len;
}
- break;
}
+user_copy:
/*
- * If not enough room, or the user has put a ceiling on the number of
- * keys permitted in the page, split the page. The split code will
- * insert the key and data and unpin the current page. If inserting
- * into the offset array, shift the pointers up.
+ * Copy in the application provided data -- p and tlen must have been
+ * initialized above.
*/
- nbytes = NBLEAFDBT(key->size, data->size);
- if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
- if ((status = __bt_split(t, h, key,
- data, dflags, nbytes, index)) != RET_SUCCESS)
- return (status);
- goto success;
+ memcpy(p, dbt->data, dbt->size);
+ tlen += dbt->size;
+
+ /* Set the DBT to reference our new record. */
+ rdata->size = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : tlen;
+ rdata->dlen = 0;
+ rdata->doff = 0;
+ rdata->flags = 0;
+ *dbt = *rdata;
+ return (0);
+}
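+
+/*
+ * Worked example (for illustration): applying a partial put with
+ * doff=3, dlen=4 and 2 bytes of data ("xy") to an existing 10-byte
+ * record "ABCDEFGHIJ" keeps the 3 leading bytes, drops the 4 replaced
+ * bytes and copies in the 3 trailing bytes, producing the 8-byte
+ * record "ABCxyHIJ" -- matching __db_partsize()'s 10 + 2 - 4 = 8.
+ */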
+
+/*
+ * __bam_ritem --
+ * Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *, u_int32_t));
+ */
+int
+__bam_ritem(dbc, h, indx, data, typeflag)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *data;
+ u_int32_t typeflag;
+{
+ BKEYDATA *bk;
+ BINTERNAL *bi;
+ DB *dbp;
+ DBT orig, repl;
+ db_indx_t cnt, lo, ln, min, off, prefix, suffix;
+ int32_t nbytes;
+ u_int32_t len;
+ int ret;
+ db_indx_t *inp;
+ u_int8_t *dp, *p, *t, type;
+
+ dbp = dbc->dbp;
+ bi = NULL;
+ bk = NULL;
+
+ /*
+	 * Replace a single item on a page.  The logic for figuring out where
+ * to insert and whether it fits is handled in the caller. All we do
+ * here is manage the page shuffling.
+ */
+ if (TYPE(h) == P_IBTREE) {
+ /* Point at the part of the internal struct past the type. */
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW)
+ len = BOVERFLOW_SIZE;
+ else
+ len = bi->len;
+ len += SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused);
+ dp = &bi->unused;
+ type = typeflag == 0 ? bi->type :
+ (bi->type == B_KEYDATA ? B_OVERFLOW : B_KEYDATA);
+ } else {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ len = bk->len;
+ dp = bk->data;
+ type = bk->type;
+ typeflag = B_DISSET(type);
}
- if (index < (nxtindex = NEXTINDEX(h)))
- memmove(h->linp + index + 1, h->linp + index,
- (nxtindex - index) * sizeof(indx_t));
- h->lower += sizeof(indx_t);
-
- h->linp[index] = h->upper -= nbytes;
- dest = (char *)h + h->upper;
- WR_BLEAF(dest, key, data, dflags);
-
- /* If the cursor is on this page, adjust it as necessary. */
- if (F_ISSET(&t->bt_cursor, CURS_INIT) &&
- !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) &&
- t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index >= index)
- ++t->bt_cursor.pg.index;
-
- if (t->bt_order == NOT)
- if (h->nextpg == P_INVALID) {
- if (index == NEXTINDEX(h) - 1) {
- t->bt_order = FORWARD;
- t->bt_last.index = index;
- t->bt_last.pgno = h->pgno;
- }
- } else if (h->prevpg == P_INVALID) {
- if (index == 0) {
- t->bt_order = BACK;
- t->bt_last.index = 0;
- t->bt_last.pgno = h->pgno;
- }
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ /*
+ * We might as well check to see if the two data items share
+ * a common prefix and suffix -- it can save us a lot of log
+		 * space if they're large.
+ */
+ min = data->size < len ? data->size : len;
+ for (prefix = 0,
+ p = dp, t = data->data;
+ prefix < min && *p == *t; ++prefix, ++p, ++t)
+ ;
+
+ min -= prefix;
+ for (suffix = 0,
+ p = (u_int8_t *)dp + len - 1,
+ t = (u_int8_t *)data->data + data->size - 1;
+ suffix < min && *p == *t; ++suffix, --p, --t)
+ ;
+
+ /* We only log the parts of the keys that have changed. */
+ orig.data = (u_int8_t *)dp + prefix;
+ orig.size = len - (prefix + suffix);
+ repl.data = (u_int8_t *)data->data + prefix;
+ repl.size = data->size - (prefix + suffix);
+ if ((ret = __bam_repl_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h),
+ &LSN(h), (u_int32_t)indx, typeflag,
+ &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ /*
+ * Set references to the first in-use byte on the page and the
+ * first byte of the item being replaced.
+ */
+ inp = P_INP(dbp, h);
+ p = (u_int8_t *)h + HOFFSET(h);
+ if (TYPE(h) == P_IBTREE) {
+ t = (u_int8_t *)bi;
+ lo = (db_indx_t)BINTERNAL_SIZE(bi->len);
+ ln = (db_indx_t)BINTERNAL_SIZE(data->size -
+ (SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused)));
+ } else {
+ t = (u_int8_t *)bk;
+ lo = (db_indx_t)BKEYDATA_SIZE(bk->len);
+ ln = (db_indx_t)BKEYDATA_SIZE(data->size);
+ }
+
+ /*
+ * If the entry is growing in size, shift the beginning of the data
+ * part of the page down. If the entry is shrinking in size, shift
+	 * the beginning of the data part of the page up.  Use memmove(3);
+	 * the regions overlap.
+ */
+ if (lo != ln) {
+ nbytes = lo - ln; /* Signed difference. */
+ if (p == t) /* First index is fast. */
+ inp[indx] += nbytes;
+ else { /* Else, shift the page. */
+ memmove(p + nbytes, p, (size_t)(t - p));
+
+ /* Adjust the indices' offsets. */
+ off = inp[indx];
+ for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
+ if (inp[cnt] <= off)
+ inp[cnt] += nbytes;
}
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+ /* Clean up the page and adjust the item's reference. */
+ HOFFSET(h) += nbytes;
+ t += nbytes;
+ }
-success:
- if (flags == R_SETCURSOR)
- __bt_setcur(t, e->page->pgno, e->index);
+ /* Copy the new item onto the page. */
+ bk = (BKEYDATA *)t;
+ bk->len = data->size;
+ B_TSET(bk->type, type);
+ memcpy(bk->data, data->data, bk->len);
- F_SET(t, B_MODIFIED);
- return (RET_SUCCESS);
-}
+ /* Remove the length of the internal header elements. */
+ if (TYPE(h) == P_IBTREE)
+ bk->len -= SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused);
-#ifdef STATISTICS
-u_long bt_cache_hit, bt_cache_miss;
-#endif
+ return (0);
+}
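+
+/*
+ * For illustration, the common prefix/suffix computation used by the
+ * logging path in __bam_ritem(), as a stand-alone sketch (the helper
+ * name is ours, not part of DB).  Only the bytes between the shared
+ * prefix and suffix are logged; recovery rebuilds either image of the
+ * item from the other plus the two lengths.
+ */
+static void
+example_presuf(od, olen, nd, nlen, prefixp, suffixp)
+	u_int8_t *od, *nd;
+	u_int32_t olen, nlen, *prefixp, *suffixp;
+{
+	u_int32_t min, prefix, suffix;
+
+	min = olen < nlen ? olen : nlen;
+	for (prefix = 0;
+	    prefix < min && od[prefix] == nd[prefix]; ++prefix)
+		;
+	min -= prefix;
+	for (suffix = 0; suffix < min &&
+	    od[olen - 1 - suffix] == nd[nlen - 1 - suffix]; ++suffix)
+		;
+	*prefixp = prefix;
+	*suffixp = suffix;
+}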
/*
- * BT_FAST -- Do a quick check for sorted data.
+ * __bam_irep --
+ * Replace an item on an internal page.
*
- * Parameters:
- * t: tree
- * key: key to insert
- *
- * Returns:
- * EPG for new record or NULL if not found.
+ * PUBLIC: int __bam_irep __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *));
*/
-static EPG *
-bt_fast(t, key, data, exactp)
- BTREE *t;
- const DBT *key, *data;
- int *exactp;
+int
+__bam_irep(dbc, h, indx, hdr, data)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx;
+ DBT *hdr;
+ DBT *data;
{
+ BINTERNAL *bi, *bn;
+ DB *dbp;
+ DBT dbt;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ bi = GET_BINTERNAL(dbp, h, indx);
+ bn = (BINTERNAL *) hdr->data;
+
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
+ return (ret);
+
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = hdr->size + data->size - SSZ(BINTERNAL, unused);
+ if ((ret = __os_malloc(dbp->env, dbt.size, &dbt.data)) != 0)
+ return (ret);
+ memcpy(dbt.data,
+ (u_int8_t *)hdr->data + SSZ(BINTERNAL, unused),
+ hdr->size - SSZ(BINTERNAL, unused));
+ memcpy((u_int8_t *)dbt.data +
+ hdr->size - SSZ(BINTERNAL, unused), data->data, data->size);
+
+ ret = __bam_ritem(dbc, h, indx, &dbt, bi->type != bn->type);
+
+ __os_free(dbp->env, dbt.data);
+ return (ret);
+}
+
+/*
+ * __bam_dup_check --
+ * Check to see if the duplicate set at indx should have its own page.
+ */
+static int
+__bam_dup_check(dbc, op, h, indx, sz, cntp)
+ DBC *dbc;
+ u_int32_t op;
PAGE *h;
- u_int32_t nbytes;
- int cmp;
+ u_int32_t indx, sz;
+ db_indx_t *cntp;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ db_indx_t cnt, first, *inp;
+
+ dbp = dbc->dbp;
+ inp = P_INP(dbp, h);
+
+ /*
+ * Count the duplicate records and calculate how much room they're
+ * using on the page.
+ */
+ while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
- if ((h = mpool_get(t->bt_mp, t->bt_last.pgno, 0)) == NULL) {
- t->bt_order = NOT;
- return (NULL);
+ /* Count the key once. */
+ bk = GET_BKEYDATA(dbp, h, indx);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+
+ /* Sum up all the data items. */
+ first = indx;
+
+ /*
+ * Account for the record being inserted. If we are replacing it,
+ * don't count it twice.
+ *
+ * We execute the loop with first == indx to get the size of the
+ * first record.
+ */
+ cnt = op == DB_CURRENT ? 0 : 1;
+ for (first = indx;
+ indx < NUM_ENT(h) && inp[first] == inp[indx];
+ ++cnt, indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
+ sz += B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
}
- t->bt_cur.page = h;
- t->bt_cur.index = t->bt_last.index;
/*
- * If won't fit in this page or have too many keys in this page,
- * have to search to get split stack.
+ * We have to do these checks when the user is replacing the cursor's
+ * data item -- if the application replaces a duplicate item with a
+ * larger data item, it can increase the amount of space used by the
+ * duplicates, requiring this check. But that means we may have done
+ * this check when it wasn't a duplicate item after all.
+ */
+ if (cnt == 1)
+ return (0);
+
+ /*
+ * If this set of duplicates is using more than 25% of the page, move
+ * them off. The choice of 25% is a WAG, but the value must be small
+ * enough that we can always split a page without putting duplicates
+ * on two different pages.
*/
- nbytes = NBLEAFDBT(key->size, data->size);
- if (h->upper - h->lower < nbytes + sizeof(indx_t))
- goto miss;
-
- if (t->bt_order == FORWARD) {
- if (t->bt_cur.page->nextpg != P_INVALID)
- goto miss;
- if (t->bt_cur.index != NEXTINDEX(h) - 1)
- goto miss;
- if ((cmp = __bt_cmp(t, key, &t->bt_cur)) < 0)
- goto miss;
- t->bt_last.index = cmp ? ++t->bt_cur.index : t->bt_cur.index;
+ if (sz < dbp->pgsize / 4)
+ return (0);
+
+ *cntp = cnt;
+ return (1);
+}
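+
+/*
+ * For illustration: with a 4096-byte page, the check above moves a
+ * duplicate set off-page once the key and its data items consume at
+ * least 4096 / 4 = 1024 bytes -- small enough that a split can always
+ * place two duplicate sets on different pages.
+ */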
+
+/*
+ * __bam_dup_convert --
+ * Move a set of duplicates off-page and into their own tree.
+ */
+static int
+__bam_dup_convert(dbc, h, indx, cnt)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t indx, cnt;
+{
+ BKEYDATA *bk;
+ DB *dbp;
+ DBT hdr;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *dp;
+ db_indx_t cpindx, dindx, first, *inp;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ inp = P_INP(dbp, h);
+
+ /* Move to the beginning of the dup set. */
+ while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+
+ /* Get a new page. */
+ if ((ret = __db_new(dbc,
+ dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &lock, &dp)) != 0)
+ return (ret);
+ P_INIT(dp, dbp->pgsize, dp->pgno,
+ PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+ /*
+ * Move this set of duplicates off the page. First points to the first
+ * key of the first duplicate key/data pair, cnt is the number of pairs
+ * we're dealing with.
+ */
+ memset(&hdr, 0, sizeof(hdr));
+ first = indx;
+ dindx = indx;
+ cpindx = 0;
+ do {
+ /* Move cursors referencing the old entry to the new entry. */
+ if ((ret = __bam_ca_dup(dbc, first,
+ PGNO(h), indx, PGNO(dp), cpindx)) != 0)
+ goto err;
+
+ /*
+		 * Copy the entry to the new page.  If the off-page duplicate
+		 * page is a Btree page (i.e. dup_compare will be non-NULL;
+		 * we use Btree pages for sorted dups and Recno pages for
+		 * unsorted dups), move all entries
+ * normally, even deleted ones. If it's a Recno page,
+ * deleted entries are discarded (if the deleted entry is
+ * overflow, then free up those pages).
+ */
+ bk = GET_BKEYDATA(dbp, h, dindx + 1);
+ hdr.data = bk;
+ hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
+ BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+ if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
+ /*
+			 * Unsorted dups (i.e. a recno page) and a deleted
+			 * entry: don't move it, but if it was
+ * an overflow entry, we need to free those pages.
+ */
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_doff(dbc,
+ (GET_BOVERFLOW(dbp, h, dindx + 1))->pgno)) != 0)
+ goto err;
+ } else {
+ if ((ret = __db_pitem(
+ dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
+ goto err;
+ ++cpindx;
+ }
+ /* Delete all but the last reference to the key. */
+ if (cnt != 1) {
+ if ((ret = __bam_adjindx(dbc,
+ h, dindx, first + 1, 0)) != 0)
+ goto err;
+ } else
+ dindx++;
+
+ /* Delete the data item. */
+ if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
+ goto err;
+ indx += P_INDX;
+ } while (--cnt);
+
+ /* Put in a new data item that points to the duplicates page. */
+ if ((ret = __bam_ovput(dbc,
+ B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
+ goto err;
+
+ /* Adjust cursors for all the above movements. */
+ ret = __bam_ca_di(dbc,
+ PGNO(h), first + P_INDX, (int)(first + P_INDX - indx));
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, dp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+}
+
+/*
+ * __bam_ovput --
+ * Build an item for an off-page duplicates page or overflow page and
+ * insert it on the page.
+ */
+static int
+__bam_ovput(dbc, type, pgno, h, indx, item)
+ DBC *dbc;
+ u_int32_t type, indx;
+ db_pgno_t pgno;
+ PAGE *h;
+ DBT *item;
+{
+ BOVERFLOW bo;
+ DBT hdr;
+ int ret;
+
+ UMRW_SET(bo.unused1);
+ B_TSET(bo.type, type);
+ UMRW_SET(bo.unused2);
+
+ /*
+ * If we're creating an overflow item, do so and acquire the page
+ * number for it. If we're creating an off-page duplicates tree,
+	 * we are given the page number as an argument.
+ */
+ if (type == B_OVERFLOW) {
+ if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0)
+ return (ret);
+ bo.tlen = item->size;
} else {
- if (t->bt_cur.page->prevpg != P_INVALID)
- goto miss;
- if (t->bt_cur.index != 0)
- goto miss;
- if ((cmp = __bt_cmp(t, key, &t->bt_cur)) > 0)
- goto miss;
- t->bt_last.index = 0;
+ bo.pgno = pgno;
+ bo.tlen = 0;
}
- *exactp = cmp == 0;
-#ifdef STATISTICS
- ++bt_cache_hit;
-#endif
- return (&t->bt_cur);
-
-miss:
-#ifdef STATISTICS
- ++bt_cache_miss;
-#endif
- t->bt_order = NOT;
- mpool_put(t->bt_mp, h, 0);
- return (NULL);
+
+ /* Store the new record on the page. */
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bo;
+ hdr.size = BOVERFLOW_SIZE;
+ return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
}
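+
+/*
+ * A note on the record built above: the BOVERFLOW header is the
+ * on-page "initial reference" to the off-page item.  For B_OVERFLOW
+ * items, bo.pgno names the first page of the chain and bo.tlen the
+ * total item length; for B_DUPLICATE items, bo.pgno is the root of the
+ * off-page duplicate tree and the length is zero.
+ */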
diff --git a/btree/bt_rec.c b/btree/bt_rec.c
new file mode 100644
index 0000000..9650d92
--- /dev/null
+++ b/btree/bt_rec.c
@@ -0,0 +1,2035 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+#define IS_BTREE_PAGE(pagep) \
+ (TYPE(pagep) == P_IBTREE || \
+ TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP)
+
+/*
+ * __bam_split_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN *plsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, parent_pgno;
+ u_int32_t ptype, size;
+ int cmp, l_update, p_update, r_update, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_split_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_read, ip, 0);
+
+ if ((ret = __db_cursor_int(file_dbp, ip, NULL,
+ (argp->opflags & SPL_RECNO) ? DB_RECNO : DB_BTREE,
+ PGNO_INVALID, 0, NULL, &dbc)) != 0)
+ goto out;
+ if (argp->opflags & SPL_NRECS)
+ F_SET((BTREE_CURSOR *)dbc->internal, C_RECNUM);
+ F_SET(dbc, DBC_RECOVER);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ parent_pgno = argp->ppgno;
+ rootsplit = parent_pgno == pgno;
+
+ /* Get the pages going down the tree. */
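+	/*
+	 * REC_FGET branches to the given label when the page no longer
+	 * exists (e.g., it was freed by a later, already-recovered
+	 * operation), so a missing page simply skips its own handling.
+	 */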
+ REC_FGET(mpf, ip, parent_pgno, &pp, left);
+left: REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (pp != NULL) {
+ if (rootsplit)
+ plsnp = &LSN(argp->pg.data);
+ else
+ plsnp = &argp->plsn;
+ cmp = LOG_COMPARE(&LSN(pp), plsnp);
+ CHECK_LSN(env, op, cmp, &LSN(pp), plsnp);
+ if (cmp == 0)
+ p_update = 1;
+ }
+
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ }
+
+ /*
+ * Drop the latches on the lower level pages before
+ * getting an exclusive latch on the higher level page.
+ */
+		if (lp != NULL && (ret = __memp_fput(mpf,
+		    ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ /*
+		 * If the parent page is wrong, update it: initialize the
+		 * page, and if it is a root page, update the record counts
+		 * if needed and insert the first record.  Then insert the
+		 * record for the right-hand child page.
+ */
+ if (p_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if (argp->opflags & SPL_RECNO)
+ ptype = P_IRECNO;
+ else
+ ptype = P_IBTREE;
+
+ if (rootsplit) {
+ P_INIT(pp, file_dbp->pgsize, pgno, PGNO_INVALID,
+ PGNO_INVALID, _lp->level + 1, ptype);
+ if (argp->opflags & SPL_NRECS) {
+ RE_NREC_SET(pp,
+ __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp));
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp,
+ argp->pindx, argp->pentry.size,
+ &argp->pentry, NULL)) != 0)
+ goto out;
+
+ }
+ if ((ret = __db_pitem_nolog(dbc, pp, argp->pindx + 1,
+ argp->rentry.size, &argp->rentry, NULL)) != 0)
+ goto out;
+ pp->lsn = *lsnp;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ REC_FGET(mpf, ip, argp->npgno, &np, done);
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ }
+ }
+ } else {
+ /*
+		 * If it's a root split and the left child ever existed,
+		 * update its LSN; otherwise the left child is the split
+		 * page itself.  If the right child ever existed, root
+		 * split or not, update its LSN.  The undo of the page
+		 * allocation(s) will restore them to the free list.
+ */
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ }
+ /*
+ * Drop the lower level pages before getting an exclusive
+ * latch on the parent.
+ */
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+
+ /*
+		 * Check the state of the split page.  If it's a root split,
+		 * that's the root page; otherwise it's the left page.
+ */
+ if (rootsplit) {
+ DB_ASSERT(env, pgno == argp->ppgno);
+ if (lp != NULL && (ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = pp;
+ pp = NULL;
+ }
+ if (lp != NULL) {
+ cmp = LOG_COMPARE(lsnp, &LSN(lp));
+ CHECK_ABORT(env, op, cmp, &LSN(lp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, argp->pg.data, argp->pg.size);
+				if ((ret = __memp_fput(mpf,
+				    ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ }
+
+ /*
+ * Next we can update the parent removing the new index.
+ */
+ if (pp != NULL) {
+ DB_ASSERT(env, !rootsplit);
+ cmp = LOG_COMPARE(lsnp, &LSN(pp));
+ CHECK_ABORT(env, op, cmp, &LSN(pp), lsnp);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ if (argp->opflags & SPL_RECNO)
+ size = RINTERNAL_SIZE;
+ else
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp,
+ pp, argp->pindx + 1)->len);
+
+ if ((ret = __db_ditem(dbc, pp,
+ argp->pindx + 1, size)) != 0)
+ goto out;
+ pp->lsn = argp->plsn;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that are left. */
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
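+
+/*
+ * All the recovery functions in this file follow the same LSN-matching
+ * idiom, sketched here for reference: roll a page forward only if its
+ * LSN still matches the LSN the log record saw before the update, and
+ * roll it back only if it matches the LSN the log record itself wrote.
+ *
+ *	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ *	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ *	if (cmp_p == 0 && DB_REDO(op)) {
+ *		[re-apply the logged change]
+ *		LSN(pagep) = *lsnp;
+ *	} else if (cmp_n == 0 && DB_UNDO(op)) {
+ *		[reverse the logged change]
+ *		LSN(pagep) = argp->lsn;
+ *	}
+ *
+ * Any other combination means the page already reflects the desired
+ * state and is left untouched.
+ */
+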
+/*
+ * __bam_split_42_recover --
+ * Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_42_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_split_42_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+ db_pgno_t pgno, root_pgno;
+ u_int32_t ptype;
+ int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__bam_split_42_print);
+
+ _lp = lp = np = pp = _rp = rp = NULL;
+ sp = NULL;
+
+ REC_INTRO(__bam_split_42_read, ip, 0);
+
+ /*
+ * There are two kinds of splits that we have to recover from. The
+ * first is a root-page split, where the root page is split from a
+ * leaf page into an internal page and two new leaf pages are created.
+ * The second is where a page is split into two pages, and a new key
+ * is inserted into the parent page.
+ *
+ * DBTs are not aligned in log records, so we need to copy the page
+ * so that we can access fields within it throughout this routine.
+ * Although we could hardcode the unaligned copies in this routine,
+ * we will be calling into regular btree functions with this page,
+ * so it's got to be aligned. Copying it into allocated memory is
+ * the only way to guarantee this.
+ */
+ if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+ goto out;
+ memcpy(sp, argp->pg.data, argp->pg.size);
+
+ pgno = PGNO(sp);
+ root_pgno = argp->root_pgno;
+ rootsplit = root_pgno != PGNO_INVALID;
+ REC_FGET(mpf, ip, argp->left, &lp, right);
+right: REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo: if (DB_REDO(op)) {
+ l_update = r_update = p_update = 0;
+ /*
+ * Decide if we need to resplit the page.
+ *
+ * If this is a root split, then the root has to exist unless
+ * we have truncated it due to a future deallocation.
+ */
+ if (rootsplit) {
+ REC_FGET(mpf, ip, root_pgno, &pp, do_left);
+ cmp = LOG_COMPARE(&LSN(pp), &LSN(argp->pg.data));
+ CHECK_LSN(env, op,
+ cmp, &LSN(pp), &LSN(argp->pg.data));
+ p_update = cmp == 0;
+ }
+
+do_left: if (lp != NULL) {
+ cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+ CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+ if (cmp == 0)
+ l_update = 1;
+ }
+
+ if (rp != NULL) {
+ cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+ CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+ if (cmp == 0)
+ r_update = 1;
+ }
+
+ if (!p_update && !l_update && !r_update)
+ goto check_next;
+
+ /* Allocate and initialize new left/right child pages. */
+ if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+ (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+ goto out;
+ if (rootsplit) {
+ P_INIT(_lp, file_dbp->pgsize, argp->left,
+ PGNO_INVALID,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+ PGNO_INVALID, LEVEL(sp), TYPE(sp));
+ } else {
+ P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+ ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+ LEVEL(sp), TYPE(sp));
+ P_INIT(_rp, file_dbp->pgsize, argp->right,
+ ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+ ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+ LEVEL(sp), TYPE(sp));
+ }
+
+ /* Split the page. */
+ if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+ (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+ NUM_ENT(sp))) != 0)
+ goto out;
+
+ if (l_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ memcpy(lp, _lp, file_dbp->pgsize);
+ lp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+
+ if (r_update) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ memcpy(rp, _rp, file_dbp->pgsize);
+ rp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ }
+
+ /*
+ * If the parent page is wrong, update it. This is of interest
+ * only if it was a root split, since root splits create parent
+ * pages. All other splits modify a parent page, but those are
+ * separately logged and recovered.
+ */
+ if (rootsplit && p_update) {
+ if (IS_BTREE_PAGE(sp)) {
+ ptype = P_IBTREE;
+ rc = argp->opflags & SPL_NRECS ? 1 : 0;
+ } else {
+ ptype = P_IRECNO;
+ rc = 1;
+ }
+
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ P_INIT(pp, file_dbp->pgsize, root_pgno,
+ PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype);
+ RE_NREC_SET(pp, rc ? __bam_total(file_dbp, _lp) +
+ __bam_total(file_dbp, _rp) : 0);
+
+ pp->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+check_next: /*
+ * Finally, redo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. The next
+ * page must exist because we're redoing the operation.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, 0, &np)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(
+ file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+ CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+ if (cmp == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->right;
+ np->lsn = *lsnp;
+ if ((ret = __memp_fput(mpf, ip,
+ np, file_dbp->priority)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ } else {
+ /*
+ * If the split page is wrong, replace its contents with the
+ * logged page contents. If the page doesn't exist, it means
+ * that the create of the page never happened, nor did any of
+ * the adds onto the page that caused the split, and there's
+ * really no undo-ing to be done.
+ */
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
+ DB_MPOOL_EDIT, &pp)) != 0) {
+ pp = NULL;
+ goto lrundo;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(pp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+ memcpy(pp, argp->pg.data, argp->pg.size);
+ if ((ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0)
+ goto out;
+ pp = NULL;
+ }
+
+ /*
+ * If it's a root split and the left child ever existed, update
+ * its LSN. (If it's not a root split, we've updated the left
+ * page already -- it's the same as the split page.) If the
+ * right child ever existed, root split or not, update its LSN.
+ * The undo of the page allocation(s) will restore them to the
+ * free list.
+ */
+lrundo: if ((rootsplit && lp != NULL) || rp != NULL) {
+ if (rootsplit && lp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+ lp->lsn = argp->llsn;
+ if ((ret = __memp_fput(mpf, ip,
+ lp, file_dbp->priority)) != 0)
+ goto out;
+ lp = NULL;
+ }
+ if (rp != NULL &&
+ LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+ rp->lsn = argp->rlsn;
+ if ((ret = __memp_fput(mpf, ip,
+ rp, file_dbp->priority)) != 0)
+ goto out;
+ rp = NULL;
+ }
+ }
+
+ /*
+ * Finally, undo the next-page link if necessary. This is of
+ * interest only if it wasn't a root split -- inserting a new
+ * page in the tree requires that any following page have its
+ * previous-page pointer updated to our new page. Since it's
+ * possible that the next-page never existed, we ignore it as
+ * if there's nothing to undo.
+ */
+ if (!rootsplit && argp->npgno != PGNO_INVALID) {
+ if ((ret = __memp_fget(mpf, &argp->npgno,
+ ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+ np = NULL;
+ goto done;
+ }
+ if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+ PREV_PGNO(np) = argp->left;
+ np->lsn = argp->nlsn;
+				if ((ret = __memp_fput(mpf,
+				    ip, np, file_dbp->priority)) != 0)
+ goto out;
+ np = NULL;
+ }
+ }
+ }
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: /* Free any pages that weren't dirtied. */
+ if (pp != NULL && (t_ret = __memp_fput(mpf,
+ ip, pp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ ip, lp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (np != NULL && (t_ret = __memp_fput(mpf,
+ ip, np, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ ip, rp, file_dbp->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Free any allocated space. */
+ if (_lp != NULL)
+ __os_free(env, _lp);
+ if (_rp != NULL)
+ __os_free(env, _rp);
+ if (sp != NULL)
+ __os_free(env, sp);
+
+ REC_CLOSE;
+}
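+
+/*
+ * The _42_ suffix marks the variant kept so that log records written in
+ * the older (release 4.2) format can still be recovered; records in the
+ * current format are handled by __bam_split_recover above.
+ */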
+
+/*
+ * __bam_rsplit_recover --
+ * Recovery function for a reverse split.
+ *
+ * PUBLIC: int __bam_rsplit_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rsplit_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rsplit_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_LSN copy_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_pgno_t pgno, root_pgno;
+ db_recno_t rcnt;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_rsplit_print);
+ REC_INTRO(__bam_rsplit_read, ip, 1);
+
+ /* Fix the root page. */
+ pgno = root_pgno = argp->root_pgno;
+ if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, pgno, ret);
+ goto out;
+ } else
+ goto do_page;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->rootlsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->rootlsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Copy the new data to the root page. If it is not now a
+ * leaf page we need to restore the record number. We could
+ * try to determine if C_RECNUM was set in the btree, but
+ * that's not really necessary since the field is not used
+ * otherwise.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ rcnt = RE_NREC(pagep);
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ if (LEVEL(pagep) > LEAFLEVEL)
+ RE_NREC_SET(pagep, rcnt);
+ pagep->pgno = root_pgno;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ P_INIT(pagep, file_dbp->pgsize, root_pgno,
+ argp->nrec, PGNO_INVALID, pagep->level + 1,
+ IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO);
+ if ((ret = __db_pitem(dbc, pagep, 0,
+ argp->rootent.size, &argp->rootent, NULL)) != 0)
+ goto out;
+ pagep->lsn = argp->rootlsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+do_page:
+ /*
+ * Fix the page copied over the root page. It's possible that the
+	 * page never made it to disk or was truncated, so if the page
+	 * doesn't exist, that's okay and there's nothing further to do.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+ (void)__ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN));
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_adj_recover --
+ * Recovery function for adj.
+ *
+ * PUBLIC: int __bam_adj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_adj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_adj_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_adj_print);
+ REC_INTRO(__bam_adj_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
+ goto out;
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if ((ret = __bam_adjindx(dbc,
+ pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
+ goto out;
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_cadjust_recover --
+ * Recovery function for the adjust of a count change in an internal
+ * page.
+ *
+ * PUBLIC: int __bam_cadjust_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cadjust_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cadjust_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_cadjust_print);
+ REC_INTRO(__bam_cadjust_read, ip, 0);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ } else {
+ GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs +=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, argp->adjust);
+ }
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (IS_BTREE_PAGE(pagep)) {
+ GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ } else {
+ GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs -=
+ argp->adjust;
+ if (argp->opflags & CAD_UPDATEROOT)
+ RE_NREC_ADJ(pagep, -(argp->adjust));
+ }
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_cdel_recover --
+ * Recovery function for the intent-to-delete of a cursor record.
+ *
+ * PUBLIC: int __bam_cdel_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cdel_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_cdel_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ u_int32_t indx;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_cdel_print);
+ REC_INTRO(__bam_cdel_read, ip, 0);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
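+		/*
+		 * On a P_LBTREE leaf, entries are key/data pairs, so the
+		 * data item for this key sits O_INDX entries past the key.
+		 */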
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DSET(GET_BKEYDATA(file_dbp, pagep, indx)->type);
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Need to undo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0);
+ B_DCLR(GET_BKEYDATA(file_dbp, pagep, indx)->type);
+
+ if ((ret = __bam_ca_delete(
+ file_dbp, argp->pgno, argp->indx, 0, NULL)) != 0)
+ goto out;
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_repl_recover --
+ * Recovery function for page item replacement.
+ *
+ * PUBLIC: int __bam_repl_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_repl_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_repl_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ BINTERNAL *bi;
+ DB *file_dbp;
+ DBC *dbc;
+ DBT dbt;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+ u_int32_t len;
+ u_int8_t *dp, *p;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_repl_print);
+ REC_INTRO(__bam_repl_read, ip, 1);
+
+ /* Get the page; if it never existed and we're undoing, we're done. */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * Need to redo update described.
+ *
+ * Re-build the replacement item.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_IBTREE) {
+ /* Point at the internal struct past the type. */
+ bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
+ dp = &bi->unused;
+ len = bi->len +
+ SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused);
+ } else {
+ bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
+ dp = bk->data;
+ len = bk->len;
+ }
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->repl.size;
+ if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
+ goto out;
+ p = dbt.data;
+ memcpy(p, dp, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->repl.data, argp->repl.size);
+ p += argp->repl.size;
+ memcpy(p, dp + (len - argp->suffix), argp->suffix);
+
+		/* For a non-leaf replace, isdeleted is reused as the type flag. */
+ ret = __bam_ritem(dbc,
+ pagep, argp->indx, &dbt, argp->isdeleted);
+ __os_free(env, dbt.data);
+ if (ret != 0)
+ goto out;
+
+ LSN(pagep) = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /*
+ * Need to undo update described.
+ *
+ * Re-build the original item.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_IBTREE) {
+ /* Point at the internal struct past the type. */
+ bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
+ dp = &bi->unused;
+ len = bi->len +
+ SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused);
+ } else {
+ bk = GET_BKEYDATA(file_dbp, pagep, argp->indx);
+ dp = bk->data;
+ len = bk->len;
+ }
+ memset(&dbt, 0, sizeof(dbt));
+ dbt.size = argp->prefix + argp->suffix + argp->orig.size;
+ if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0)
+ goto out;
+ p = dbt.data;
+ memcpy(p, dp, argp->prefix);
+ p += argp->prefix;
+ memcpy(p, argp->orig.data, argp->orig.size);
+ p += argp->orig.size;
+ memcpy(p, dp + (len - argp->suffix), argp->suffix);
+
+ ret = __bam_ritem(dbc,
+ pagep, argp->indx, &dbt, argp->isdeleted);
+ __os_free(env, dbt.data);
+ if (ret != 0)
+ goto out;
+
+ /* Reset the deleted flag, if necessary. */
+ if (argp->isdeleted && LEVEL(pagep) == LEAFLEVEL)
+ B_DSET(GET_BKEYDATA(file_dbp, pagep, argp->indx)->type);
+
+ LSN(pagep) = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, dbc->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_root_recover --
+ * Recovery function for setting the root page on the meta-data page.
+ *
+ * PUBLIC: int __bam_root_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_root_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_root_args *argp;
+ DB_THREAD_INFO *ip;
+ BTMETA *meta;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ meta = NULL;
+ REC_PRINT(__bam_root_print);
+ REC_INTRO(__bam_root_read, ip, 0);
+
+ if ((ret = __memp_fget(mpf, &argp->meta_pgno, ip, NULL,
+ 0, &meta)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
+ cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
+ CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ meta->root = argp->root_pgno;
+ meta->dbmeta.lsn = *lsnp;
+ ((BTREE *)file_dbp->bt_internal)->bt_root = meta->root;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Nothing to undo except lsn. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
+ meta->dbmeta.lsn = argp->meta_lsn;
+ }
+ if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
+ goto out;
+ meta = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (meta != NULL)
+ (void)__memp_fput(mpf, ip, meta, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_curadj_recover --
+ * Transaction abort function to undo cursor adjustments.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_curadj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_curadj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_curadj_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ int ret;
+
+ COMPQUIET(mpf, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_curadj_print);
+ REC_INTRO(__bam_curadj_read, ip, 1);
+
+ ret = 0;
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ switch (argp->mode) {
+ case DB_CA_DI:
+ if ((ret = __bam_ca_di(dbc, argp->from_pgno,
+ argp->from_indx, -(int)argp->first_indx)) != 0)
+ goto out;
+ break;
+ case DB_CA_DUP:
+ if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
+ argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_RSPLIT:
+ if ((ret =
+ __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
+ goto out;
+ break;
+
+ case DB_CA_SPLIT:
+ if ((ret = __bam_ca_undosplit(file_dbp, argp->from_pgno,
+ argp->to_pgno, argp->left_pgno, argp->from_indx)) != 0)
+ goto out;
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_rcuradj_recover --
+ * Transaction abort function to undo cursor adjustments in rrecno.
+ * This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_rcuradj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rcuradj_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_rcuradj_args *argp;
+ DB_THREAD_INFO *ip;
+ BTREE_CURSOR *cp;
+ DB *file_dbp;
+ DBC *dbc, *rdbc;
+ DB_MPOOLFILE *mpf;
+ int ret, t_ret;
+
+ COMPQUIET(mpf, NULL);
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ rdbc = NULL;
+ REC_PRINT(__bam_rcuradj_print);
+ REC_INTRO(__bam_rcuradj_read, ip, 1);
+
+ ret = t_ret = 0;
+
+ if (op != DB_TXN_ABORT)
+ goto done;
+
+ /*
+	 * We don't know whether we're in an offpage dup set, and thus
+	 * don't know whether the dbc that REC_INTRO has handed us is of
+	 * a reasonable type.  It's certainly unset, so if this is an
+	 * offpage dup set, we don't have an OPD cursor.  The simplest
+	 * solution is just to allocate a whole new cursor for our use;
+	 * we're only really using it to pass some state into __ram_ca,
+	 * and this way we don't need to make this function know anything
+	 * about how offpage dups work.
+ */
+ if ((ret = __db_cursor_int(file_dbp, NULL,
+ NULL, DB_RECNO, argp->root, 0, NULL, &rdbc)) != 0)
+ goto out;
+
+ cp = (BTREE_CURSOR *)rdbc->internal;
+ F_SET(cp, C_RENUMBER);
+ cp->recno = argp->recno;
+
+ switch (argp->mode) {
+ case CA_DELETE:
+ /*
+ * The way to undo a delete is with an insert. Since
+ * we're undoing it, the delete flag must be set.
+ */
+ F_SET(cp, C_DELETED);
+ F_SET(cp, C_RENUMBER); /* Just in case. */
+ cp->order = argp->order;
+ if ((ret = __ram_ca(rdbc, CA_ICURRENT, NULL)) != 0)
+ goto out;
+ break;
+ case CA_IAFTER:
+ case CA_IBEFORE:
+ case CA_ICURRENT:
+ /*
+ * The way to undo an insert is with a delete. The delete
+ * flag is unset to start with.
+ */
+ F_CLR(cp, C_DELETED);
+ cp->order = INVALID_ORDER;
+ if ((ret = __ram_ca(rdbc, CA_DELETE, NULL)) != 0)
+ goto out;
+ break;
+ }
+
+done: *lsnp = argp->prev_lsn;
+out: if (rdbc != NULL && (t_ret = __dbc_close(rdbc)) != 0 && ret == 0)
+ ret = t_ret;
+ REC_CLOSE;
+}
+
+/*
+ * __bam_relink_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __bam_relink_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_relink_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_relink_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_relink_print);
+ REC_INTRO(__bam_relink_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if (argp->next == PGNO_INVALID)
+ goto prev;
+ if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->next, ret);
+ goto out;
+ } else
+ goto prev;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the remove or replace. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->new_pgno == PGNO_INVALID)
+ pagep->prev_pgno = argp->prev;
+ else
+ pagep->prev_pgno = argp->new_pgno;
+
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the remove or replace. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+
+ pagep->lsn = argp->lsn_next;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+prev: if (argp->prev == PGNO_INVALID)
+ goto done;
+ if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->prev, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ if (argp->new_pgno == PGNO_INVALID)
+ pagep->next_pgno = argp->next;
+ else
+ pagep->next_pgno = argp->new_pgno;
+
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ pagep->lsn = argp->lsn_prev;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
+
+/*
+ * __bam_merge_44_recover --
+ * Recovery function for merge.
+ *
+ * PUBLIC: int __bam_merge_44_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_merge_44_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_merge_44_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t indx, *ninp, *pinp;
+ u_int32_t size;
+ u_int8_t *bp;
+ int cmp_n, cmp_p, i, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_merge_44_print);
+ REC_INTRO(__bam_merge_44_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+		 * If the header is provided, the page must be empty;
+		 * copy the needed data.
+ */
+ DB_ASSERT(env, argp->hdr.size == 0 || NUM_ENT(pagep) == 0);
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->hdr.size != 0) {
+ P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
+ PREV_PGNO(argp->hdr.data),
+ NEXT_PGNO(argp->hdr.data),
+ LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
+ }
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ /* Copy the data segment. */
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table offset past the current entries. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = argp->ind.data;
+ for (i = 0;
+ i < (int)(argp->ind.size / sizeof(*ninp)); i++)
+ *pinp++ = *ninp++
+ - (file_dbp->pgsize - HOFFSET(pagep));
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /*
+ * Since logging is logical at the page level
+ * we cannot just truncate the data space. Delete
+ * the proper number of items from the logical end
+ * of the page.
+ */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ for (i = 0; i < (int)(argp->ind.size / sizeof(*ninp)); i++) {
+ indx = NUM_ENT(pagep) - 1;
+ if (P_INP(file_dbp, pagep)[indx] ==
+ P_INP(file_dbp, pagep)[indx - P_INDX]) {
+ NUM_ENT(pagep)--;
+ continue;
+ }
+ switch (TYPE(pagep)) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ bk = GET_BKEYDATA(file_dbp, pagep, indx);
+ size = BITEM_SIZE(bk);
+ break;
+
+ case P_IBTREE:
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp, pagep, indx)->len);
+ break;
+ case P_IRECNO:
+ size = RINTERNAL_SIZE;
+ break;
+
+ default:
+ ret = __db_pgfmt(env, PGNO(pagep));
+ goto out;
+ }
+ if ((ret =
+ __db_ditem(dbc, pagep, indx, size)) != 0)
+ goto out;
+ }
+ if (argp->ind.size == 0)
+ HOFFSET(pagep) = file_dbp->pgsize;
+ pagep->lsn = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+			ret = __db_pgerr(file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to truncate the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ HOFFSET(pagep) = file_dbp->pgsize;
+ NUM_ENT(pagep) = 0;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /* Need to put the data back on the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = argp->ind.data;
+ for (i = 0;
+ i < (int)(argp->ind.size / sizeof(*ninp)); i++)
+ *pinp++ = *ninp++;
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) = i;
+ }
+ pagep->lsn = argp->nlsn;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ goto out;
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_merge_recover --
+ * Recovery function for merge.
+ *
+ * PUBLIC: int __bam_merge_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_merge_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_merge_args *argp;
+ DB_THREAD_INFO *ip;
+ BKEYDATA *bk;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ db_indx_t indx, *ninp, *pinp;
+ u_int32_t size;
+ u_int8_t *bp;
+ int cmp_n, cmp_p, i, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_merge_print);
+ REC_INTRO(__bam_merge_read, ip, 1);
+
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /*
+ * When pg_copy is set, we are copying onto a new page.
+ */
+ DB_ASSERT(env, !argp->pg_copy || NUM_ENT(pagep) == 0);
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (argp->pg_copy) {
+ P_INIT(pagep, file_dbp->pgsize, pagep->pgno,
+ PREV_PGNO(argp->hdr.data),
+ NEXT_PGNO(argp->hdr.data),
+ LEVEL(argp->hdr.data), TYPE(argp->hdr.data));
+ }
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ /* Copy the data segment. */
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table offset past the current entries. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = P_INP(file_dbp, argp->hdr.data);
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+ *pinp++ = *ninp++
+ - (file_dbp->pgsize - HOFFSET(pagep));
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ HOFFSET(pagep) = file_dbp->pgsize;
+ goto setlsn;
+ }
+
+ /*
+ * Since logging is logical at the page level we cannot just
+ * truncate the data space. Delete the proper number of items
+ * from the logical end of the page.
+ */
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++) {
+ indx = NUM_ENT(pagep) - 1;
+ if (P_INP(file_dbp, pagep)[indx] ==
+ P_INP(file_dbp, pagep)[indx - P_INDX]) {
+ NUM_ENT(pagep)--;
+ continue;
+ }
+ switch (TYPE(pagep)) {
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ bk = GET_BKEYDATA(file_dbp, pagep, indx);
+ size = BITEM_SIZE(bk);
+ break;
+
+ case P_IBTREE:
+ size = BINTERNAL_SIZE(
+ GET_BINTERNAL(file_dbp, pagep, indx)->len);
+ break;
+ case P_IRECNO:
+ size = RINTERNAL_SIZE;
+ break;
+
+ default:
+ ret = __db_pgfmt(env, PGNO(pagep));
+ goto out;
+ }
+ if ((ret = __db_ditem(dbc, pagep, indx, size)) != 0)
+ goto out;
+ }
+setlsn: pagep->lsn = argp->lsn;
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
+ goto out;
+
+next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+			ret = __db_pgerr(file_dbp, argp->npgno, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn);
+
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Need to truncate the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ HOFFSET(pagep) = file_dbp->pgsize;
+ NUM_ENT(pagep) = 0;
+ pagep->lsn = *lsnp;
+ } else if (cmp_n == 0 && !DB_REDO(op)) {
+ /* Need to put the data back on the page. */
+ REC_DIRTY(mpf, ip, dbc->priority, &pagep);
+ if (TYPE(pagep) == P_OVERFLOW) {
+ OV_REF(pagep) = OV_REF(argp->hdr.data);
+ OV_LEN(pagep) = OV_LEN(argp->hdr.data);
+ bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp);
+ memcpy(bp, argp->data.data, argp->data.size);
+ } else {
+ bp = (u_int8_t *)pagep +
+ (db_indx_t)(HOFFSET(pagep) - argp->data.size);
+ memcpy(bp, argp->data.data, argp->data.size);
+
+ /* Copy index table. */
+ pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+ ninp = P_INP(file_dbp, argp->hdr.data);
+ for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+ *pinp++ = *ninp++;
+ HOFFSET(pagep) -= argp->data.size;
+ NUM_ENT(pagep) += i;
+ }
+ pagep->lsn = argp->nlsn;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, dbc->priority)) != 0)
+ goto out;
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_pgno_recover --
+ *	Recovery function for page number replacement.
+ *
+ * PUBLIC: int __bam_pgno_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_pgno_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ BINTERNAL *bi;
+ __bam_pgno_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep, *npagep;
+ db_pgno_t *pgnop;
+ int cmp_n, cmp_p, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ REC_PRINT(__bam_pgno_print);
+ REC_INTRO(__bam_pgno_read, ip, 0);
+
+ REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+ if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && !DB_REDO(op))) {
+ switch (TYPE(pagep)) {
+ case P_IBTREE:
+ /*
+			 * An internal record can have both an overflow
+			 * and a child pointer.  Fetch the page to see
+			 * which one this is.
+ */
+ bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ REC_FGET(mpf, ip, argp->npgno, &npagep, out);
+
+ if (TYPE(npagep) == P_OVERFLOW)
+ pgnop =
+ &((BOVERFLOW *)(bi->data))->pgno;
+ else
+ pgnop = &bi->pgno;
+ if ((ret = __memp_fput(mpf, ip,
+ npagep, file_dbp->priority)) != 0)
+ goto out;
+ break;
+ }
+ pgnop = &bi->pgno;
+ break;
+ case P_IRECNO:
+ pgnop =
+ &GET_RINTERNAL(file_dbp, pagep, argp->indx)->pgno;
+ break;
+ default:
+ pgnop =
+ &GET_BOVERFLOW(file_dbp, pagep, argp->indx)->pgno;
+ break;
+ }
+
+ if (DB_REDO(op)) {
+ /* Need to redo update described. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ *pgnop = argp->npgno;
+ pagep->lsn = *lsnp;
+ } else {
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ *pgnop = argp->opgno;
+ pagep->lsn = argp->lsn;
+ }
+ }
+
+ if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+
+done:
+ *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: REC_CLOSE;
+}
+
+/*
+ * __bam_relink_43_recover --
+ * Recovery function for relink.
+ *
+ * PUBLIC: int __bam_relink_43_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_relink_43_recover(env, dbtp, lsnp, op, info)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops op;
+ void *info;
+{
+ __bam_relink_43_args *argp;
+ DB_THREAD_INFO *ip;
+ DB *file_dbp;
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *pagep;
+ int cmp_n, cmp_p, modified, ret;
+
+ ip = ((DB_TXNHEAD *)info)->thread_info;
+ pagep = NULL;
+ REC_PRINT(__bam_relink_43_print);
+ REC_INTRO(__bam_relink_43_read, ip, 0);
+
+ /*
+ * There are up to three pages we need to check -- the page, and the
+ * previous and next pages, if they existed. For a page add operation,
+ * the current page is the result of a split and is being recovered
+ * elsewhere, so all we need do is recover the next page.
+ */
+ if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->pgno, ret);
+ goto out;
+ } else
+ goto next2;
+ }
+
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->lsn = *lsnp;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ pagep->prev_pgno = argp->prev;
+ pagep->lsn = argp->lsn;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->next, ret);
+ goto out;
+ } else
+ goto prev;
+ }
+
+ modified = 0;
+ cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the remove or undo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->prev;
+ modified = 1;
+ } else if (cmp_n == 0 && DB_UNDO(op)) {
+ /* Undo the remove or redo the add. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->prev_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_next;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) {
+ if (ret != DB_PAGE_NOTFOUND) {
+ ret = __db_pgerr(file_dbp, argp->prev, ret);
+ goto out;
+ } else
+ goto done;
+ }
+
+ modified = 0;
+ cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev);
+ CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev);
+ if (cmp_p == 0 && DB_REDO(op)) {
+ /* Redo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->next;
+ modified = 1;
+ } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) {
+ /* Undo the relink. */
+ REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+ pagep->next_pgno = argp->pgno;
+ modified = 1;
+ }
+ if (modified) {
+ if (DB_UNDO(op))
+ pagep->lsn = argp->lsn_prev;
+ else
+ pagep->lsn = *lsnp;
+ }
+ if ((ret = __memp_fput(mpf,
+ ip, pagep, file_dbp->priority)) != 0)
+ goto out;
+ pagep = NULL;
+
+done: *lsnp = argp->prev_lsn;
+ ret = 0;
+
+out: if (pagep != NULL)
+ (void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
+ REC_CLOSE;
+}
diff --git a/btree/bt_reclaim.c b/btree/bt_reclaim.c
new file mode 100644
index 0000000..835bf9f
--- /dev/null
+++ b/btree/bt_reclaim.c
@@ -0,0 +1,97 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+
+/*
+ * __bam_reclaim --
+ * Free a database.
+ *
+ * PUBLIC: int __bam_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *));
+ */
+int
+__bam_reclaim(dbp, ip, txn)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+{
+ DBC *dbc;
+ DB_LOCK meta_lock;
+ int ret, t_ret;
+
+ /* Acquire a cursor. */
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Write lock the metapage for deallocations. */
+ if ((ret = __db_lget(dbc,
+ 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+ goto err;
+
+ /* Avoid locking every page, we have the handle locked exclusive. */
+ F_SET(dbc, DBC_DONTLOCK);
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc,
+ DB_LOCK_WRITE, dbc->internal->root, __db_reclaim_callback, NULL);
+
+ if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the cursor. */
+err: if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __bam_truncate --
+ * Truncate a database.
+ *
+ * PUBLIC: int __bam_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__bam_truncate(dbc, countp)
+ DBC *dbc;
+ u_int32_t *countp;
+{
+ u_int32_t count;
+ int ret;
+
+#ifdef HAVE_COMPRESSION
+ u_int32_t comp_count;
+
+ comp_count = 0;
+ if (DB_IS_COMPRESSED(dbc->dbp) &&
+ (ret = __bam_compress_count(dbc, NULL, &comp_count)) != 0)
+ return (ret);
+#endif
+
+ count = 0;
+
+ /* Walk the tree, freeing pages. */
+ ret = __bam_traverse(dbc,
+ DB_LOCK_WRITE, dbc->internal->root, __db_truncate_callback, &count);
+
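+	/*
+	 * For a compressed btree the physical page walk above counts
+	 * on-page items rather than user records, so the count reported
+	 * to the caller is the one computed by __bam_compress_count.
+	 */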
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbc->dbp)) {
+ if (countp != NULL)
+ *countp = comp_count;
+ } else
+#endif
+ if (countp != NULL)
+ *countp = count;
+
+ return (ret);
+}
diff --git a/btree/bt_recno.c b/btree/bt_recno.c
new file mode 100644
index 0000000..524de46
--- /dev/null
+++ b/btree/bt_recno.c
@@ -0,0 +1,1385 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're
+ * re-numbering records, it means the record was implicitly created. We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag.  This is different from the B+tree
+ * code, which maintains the deleted state in the cursor itself and does
+ * not always check the on-page value.
+ */
+#define CD_SET(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) \
+ F_SET(cp, C_DELETED); \
+}
+#define CD_CLR(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) { \
+ F_CLR(cp, C_DELETED); \
+ cp->order = INVALID_ORDER; \
+ } \
+}
+#define CD_ISSET(cp) \
+ (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
+
+/*
+ * Macros for comparing the ordering of two cursors.
+ * cp1 comes before cp2 iff one of the following holds:
+ * cp1's recno is less than cp2's recno
+ * recnos are equal, both deleted, and cp1's order is less than cp2's
+ * recnos are equal, cp1 deleted, and cp2 not deleted
+ */
+#define C_LESSTHAN(cp1, cp2) \
+ (((cp1)->recno < (cp2)->recno) || \
+ (((cp1)->recno == (cp2)->recno) && \
+ ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
+ (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
+
+/*
+ * cp1 is equal to cp2 iff their recnos and delete flags are identical,
+ * and if the delete flag is set their orders are also identical.
+ */
+#define C_EQUAL(cp1, cp2) \
+ ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
+ (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
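+
+/*
+ * Illustrative sketch, not part of the original source: the C_LESSTHAN
+ * tie-breaking rules restated over a hypothetical plain struct. With
+ * equal recnos, a deleted cursor sorts before a live one, and two
+ * deleted cursors sort by the order in which they were deleted.
+ */
+struct example_cpos { db_recno_t recno; int deleted; u_int32_t order; };
+
+static int
+example_lessthan(a, b)
+ const struct example_cpos *a, *b;
+{
+ if (a->recno != b->recno)
+ return (a->recno < b->recno);
+ if (a->deleted && b->deleted)
+ return (a->order < b->order);
+ return (a->deleted && !b->deleted);
+}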
+
+/*
+ * Do we need to log the current cursor adjustment?
+ */
+#define CURADJ_LOG(dbc) \
+ (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
+
+/*
+ * After a search, copy the found page into the cursor, discarding any
+ * currently held lock.
+ */
+#define STACK_TO_CURSOR(cp, ret) { \
+ int __t_ret; \
+ (cp)->page = (cp)->csp->page; \
+ (cp)->pgno = (cp)->csp->page->pgno; \
+ (cp)->indx = (cp)->csp->indx; \
+ if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \
+ ret = __t_ret; \
+ (cp)->lock = (cp)->csp->lock; \
+ (cp)->lock_mode = (cp)->csp->lock_mode; \
+}
+
+/*
+ * __ram_open --
+ * Recno open function.
+ *
+ * PUBLIC: int __ram_open __P((DB *, DB_THREAD_INFO *,
+ * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__ram_open(dbp, ip, txn, name, base_pgno, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DBC *dbc;
+ int ret, t_ret;
+
+ COMPQUIET(name, NULL);
+ t = dbp->bt_internal;
+
+ /* Start up the tree. */
+ if ((ret = __bam_read_root(dbp, ip, txn, base_pgno, flags)) != 0)
+ return (ret);
+
+ /*
+ * If the user specified a source tree, open it and map it in.
+ *
+ * !!!
+ * We don't complain if the user specified transactions or threads.
+ * It's possible to make it work, but you'd better know what you're
+ * doing!
+ */
+ if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
+ return (ret);
+
+ /* If we're snapshotting an underlying source file, do it now. */
+ if (F_ISSET(dbp, DB_AM_SNAPSHOT)) {
+ /* Allocate a cursor. */
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Do the snapshot. */
+ if ((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
+ ret = 0;
+
+ /* Discard the cursor. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (ret);
+}
+
+/*
+ * __ram_append --
+ * Recno append function.
+ *
+ * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *));
+ */
+int
+__ram_append(dbc, key, data)
+ DBC *dbc;
+ DBT *key, *data;
+{
+ BTREE_CURSOR *cp;
+ int ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * Make sure we've read in all of the backing source file. If
+ * we found the record or it simply didn't exist, add the
+ * user's record.
+ */
+ ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0);
+
+ /* Return the record number. */
+ if (ret == 0 && key != NULL)
+ ret = __db_retcopy(dbc->env, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ return (ret);
+}
+
+/*
+ * __ramc_del --
+ * Recno DBC->del function.
+ *
+ * PUBLIC: int __ramc_del __P((DBC *, u_int32_t));
+ */
+int
+__ramc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ BKEYDATA bk;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, data;
+ DB_LOCK next_lock, prev_lock;
+ DB_LSN lsn;
+ db_pgno_t npgno, ppgno, save_npgno, save_ppgno;
+ int exact, nc, ret, stack, t_ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ stack = 0;
+ save_npgno = save_ppgno = PGNO_INVALID;
+ LOCK_INIT(next_lock);
+ LOCK_INIT(prev_lock);
+ COMPQUIET(flags, 0);
+
+ /*
+ * The semantics of cursors during delete are as follows: in
+ * non-renumbering recnos, records are replaced with a marker
+ * containing a delete flag. If the record referenced by this cursor
+ * has already been deleted, we will detect that as part of the delete
+ * operation, and fail.
+ *
+ * In renumbering recnos, cursors which represent deleted items
+ * are flagged with the C_DELETED flag, and it is an error to
+ * call c_del a second time without an intervening cursor motion.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+
+ /* Search the tree for the key; delete only deletes exact matches. */
+retry: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ stack = 1;
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag can only mean
+ * that this record was implicitly created. Applications aren't
+ * permitted to delete records they never created; return an error.
+ *
+ * If not re-numbering records, the on-page deleted flag means that
+ * this record was implicitly created, or, was deleted at some time.
+ * The former is an error because applications aren't permitted to
+ * delete records they never created; the latter is an error because
+ * if the record was "deleted", we could never have found it.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (F_ISSET(cp, C_RENUMBER)) {
+ /* If we are going to drop the page, lock its neighbors. */
+ if (STD_LOCKING(dbc) &&
+ NUM_ENT(cp->page) == 1 && PGNO(cp->page) != cp->root) {
+ if ((npgno = NEXT_PGNO(cp->page)) != PGNO_INVALID)
+ TRY_LOCK(dbc, npgno, save_npgno,
+ next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ if ((ppgno = PREV_PGNO(cp->page)) != PGNO_INVALID)
+ TRY_LOCK(dbc, ppgno, save_ppgno,
+ prev_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
+ }
+ /* Delete the item, adjust the counts, adjust the cursors. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+ if ((ret = __bam_adjust(dbc, -1)) != 0)
+ goto err;
+ if ((ret = __ram_ca(dbc, CA_DELETE, &nc)) != 0)
+ goto err;
+ if (nc > 0 &&
+ CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn,
+ &lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+
+ /*
+ * If the page is empty, delete it.
+ *
+ * We never delete a root page. First, root pages of primary
+ * databases never go away, recno or otherwise. However, if
+ * it's the root page of an off-page duplicates database, then
+ * it can be deleted. We don't delete it here because we have
+ * no way of telling the primary database page holder (e.g.,
+ * the hash access method) that its page element should be
+ * cleaned up because the underlying tree is gone. So, we keep
+ * the page around until the last cursor referencing the empty
+ * tree is closed, and then clean it up.
+ */
+ if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
+ /*
+ * We want to delete a single item out of the last page
+ * that we're not deleting.
+ */
+ ret = __bam_dpages(dbc, 0, BTD_RELINK);
+
+ /*
+ * Regardless of the return from __bam_dpages, it will
+ * discard our stack and pinned page.
+ */
+ stack = 0;
+ cp->page = NULL;
+ LOCK_INIT(cp->lock);
+ cp->lock_mode = DB_LOCK_NG;
+ }
+ } else {
+ /* Use a delete/put pair to replace the record with a marker. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ B_TSET_DELETED(bk.type, B_KEYDATA);
+ bk.len = 0;
+ DB_INIT_DBT(hdr, &bk, SSZA(BKEYDATA, data));
+ DB_INIT_DBT(data, "", 0);
+ if ((ret = __db_pitem(dbc,
+ cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+ goto err;
+ }
+
+ t->re_modified = 1;
+
+err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, next_lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, prev_lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ramc_get --
+ * Recno DBC->get function.
+ *
+ * PUBLIC: int __ramc_get
+ * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ramc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int cmp, exact, ret;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+retry: switch (flags) {
+ case DB_CURRENT:
+ /*
+ * If we're using mutable records and the deleted flag is
+ * set, the cursor is pointing at a nonexistent record;
+ * return an error.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+ break;
+ case DB_NEXT_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * next duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the next item assuredly is
+ * a dup, so we set flags to DB_NEXT and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_NEXT_NODUP:
+ /*
+ * Recno databases don't have duplicates; set flags to DB_NEXT
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ flags = DB_NEXT;
+ /*
+ * If record numbers are mutable: if we just deleted a record,
+ * we have to avoid incrementing the record number so that we
+ * return the right record by virtue of renumbering the tree.
+ */
+ if (CD_ISSET(cp)) {
+ /*
+ * Clear the flag, we've moved off the deleted record.
+ */
+ CD_CLR(cp);
+ break;
+ }
+
+ if (cp->recno != RECNO_OOB) {
+ ++cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ cp->recno = 1;
+ break;
+ case DB_PREV_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * previous duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the previous item assuredly
+ * is a dup, so we set flags to DB_PREV and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_PREV_NODUP:
+ /*
+ * Recno databases don't have duplicates; set flags to DB_PREV
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_PREV:
+ flags = DB_PREV;
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ --cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ flags = DB_PREV;
+ if (((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+ if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
+ goto err;
+ if (cp->recno == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_GET_BOTHC:
+ /*
+ * If we're doing a join and these are offpage dups,
+ * we want to keep searching forward from after the
+ * current cursor position. Increment the recno by 1,
+ * then proceed as for a DB_SET.
+ *
+ * Otherwise, we know there is no additional matching
+ * data, as recnos don't have dups; return DB_NOTFOUND.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno++;
+ break;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * If we're searching a set of off-page dups, we start
+ * a new linear search from the first record. Otherwise,
+ * we compare the single data item associated with the
+ * requested record for a match.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__ramc_get", flags);
+ goto err;
+ }
+
+ /*
+ * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
+ * called __ram_update() to make sure sufficient records have been
+ * read from the backing source file. Do it now for DB_CURRENT (if
+ * the current record was deleted we may need more records from the
+ * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
+ * (We don't have to test for flags == DB_FIRST, because the switch
+ * statement above re-set flags to DB_NEXT in that case.)
+ */
+ if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
+ __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+
+ for (;; ++cp->recno) {
+ /* Search the tree for the record. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno,
+ F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND,
+ 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this
+ * record was implicitly created. If not re-numbering records,
+ * the on-page deleted flag means this record was implicitly
+ * created, or, it was deleted at some time. Regardless, we
+ * skip such records if doing cursor next/prev operations or
+ * walking through off-page duplicates, and fail if they were
+ * requested explicitly by the application.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type))
+ switch (flags) {
+ case DB_NEXT:
+ case DB_PREV:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ goto retry;
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ /*
+ * If we're an OPD tree, we don't care about
+ * matching a record number on a DB_GET_BOTH
+ * -- everything belongs to the same tree. A
+ * normal recno should give up and return
+ * DB_NOTFOUND if the matching recno is deleted.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ continue;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ default:
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
+ if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx,
+ __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ break;
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ } else
+ break;
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (!F_ISSET(dbc, DBC_OPD) && !F_ISSET(key, DB_DBT_ISSET)) {
+ ret = __db_retcopy(dbp->env,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey->data, &dbc->rkey->ulen);
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ramc_put --
+ * Recno DBC->put function.
+ *
+ * PUBLIC: int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ramc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ ENV *env;
+ u_int32_t iiflags;
+ int exact, nc, ret, t_ret;
+ void *arg;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * DB_KEYFIRST and DB_KEYLAST mean different things if they're
+ * used in an off-page duplicate tree. If we're an off-page
+ * duplicate tree, they really mean "put at the beginning of the
+ * tree" and "put at the end of the tree" respectively, so translate
+ * them to something else.
+ */
+ if (F_ISSET(dbc, DBC_OPD))
+ switch (flags) {
+ case DB_KEYFIRST:
+ cp->recno = 1;
+ flags = DB_BEFORE;
+ break;
+ case DB_KEYLAST:
+ if ((ret = __ram_add(dbc,
+ &cp->recno, data, DB_APPEND, 0)) != 0)
+ return (ret);
+ if (CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
+ CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
+ return (ret);
+ return (0);
+ default:
+ break;
+ }
+
+ /*
+ * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has
+ * no duplicates, these are identical and mean "put the given
+ * datum at the given recno".
+ */
+ if (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP) {
+ ret = __ram_getno(dbc, key, &cp->recno, 1);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &cp->recno, data, flags, 0);
+ return (ret);
+ }
+
+ /*
+ * If we're putting with a cursor that's marked C_DELETED, we need to
+ * take special care; the cursor doesn't "really" reference the item
+ * corresponding to its current recno, but instead is "between" that
+ * record and the current one. Translate the actual insert into
+ * DB_BEFORE, and let the __ram_ca work out the gory details of what
+ * should wind up pointing where.
+ */
+ if (CD_ISSET(cp))
+ iiflags = DB_BEFORE;
+ else
+ iiflags = flags;
+
+split: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_INSERT, 1, &exact)) != 0)
+ goto err;
+ /*
+ * An inexact match is okay; it just means we're one record past the
+ * end, which is reasonable if we're marked deleted.
+ */
+ DB_ASSERT(env, exact || CD_ISSET(cp));
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ ret = __bam_iitem(dbc, key, data, iiflags, 0);
+ t_ret = __bam_stkrel(dbc, STK_CLRDBC);
+
+ if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
+ ret = t_ret;
+ else if (ret == DB_NEEDSPLIT) {
+ arg = &cp->recno;
+ if ((ret = __bam_split(dbc, arg, NULL)) != 0)
+ goto err;
+ goto split;
+ }
+ if (ret != 0)
+ goto err;
+
+ switch (flags) { /* Adjust the cursors. */
+ case DB_AFTER:
+ if ((ret = __ram_ca(dbc, CA_IAFTER, &nc)) != 0)
+ goto err;
+
+ /*
+ * We only need to adjust this cursor forward if we truly added
+ * the item after the current recno, rather than remapping it
+ * to DB_BEFORE.
+ */
+ if (iiflags == DB_AFTER)
+ ++cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_BEFORE:
+ if ((ret = __ram_ca(dbc, CA_IBEFORE, &nc)) != 0)
+ goto err;
+ --cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_CURRENT:
+ /*
+ * We only need to do an adjustment if we actually
+ * added an item, which we only would have done if the
+ * cursor was marked deleted.
+ */
+ if (!CD_ISSET(cp))
+ break;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if ((ret = __ram_ca(dbc, CA_ICURRENT, &nc)) != 0)
+ goto err;
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0,
+ CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ default:
+ break;
+ }
+
+ /* Return the key if we've created a new record. */
+ if (!F_ISSET(dbc, DBC_OPD) &&
+ (flags == DB_AFTER || flags == DB_BEFORE) && key != NULL)
+ ret = __db_retcopy(env, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen);
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ram_ca --
+ * Adjust cursors. Returns the number of relevant cursors.
+ *
+ * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg, int *));
+ */
+int
+__ram_ca(dbc_arg, op, foundp)
+ DBC *dbc_arg;
+ ca_recno_arg op;
+ int *foundp;
+{
+ BTREE_CURSOR *cp, *cp_arg;
+ DB *dbp, *ldbp;
+ DBC *dbc;
+ ENV *env;
+ db_recno_t recno;
+ u_int32_t order;
+ int adjusted, found;
+
+ dbp = dbc_arg->dbp;
+ env = dbp->env;
+ cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
+ recno = cp_arg->recno;
+
+ /*
+ * It only makes sense to adjust cursors if we're a renumbering
+ * recno; we should only be called if this is one.
+ */
+ DB_ASSERT(env, F_ISSET(cp_arg, C_RENUMBER));
+
+ MUTEX_LOCK(env, env->mtx_dblist);
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ *
+ * If we're doing a delete, we need to find the highest
+ * order of any cursor currently pointing at this item,
+ * so we can assign a higher order to the newly deleted
+ * cursor. Unfortunately, this requires a second pass through
+ * the cursor list.
+ */
+ if (op == CA_DELETE) {
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (order = 1;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp_arg->root == cp->root &&
+ recno == cp->recno && CD_ISSET(cp) &&
+ order <= cp->order &&
+ !MVCC_SKIP_CURADJ(dbc, cp->root))
+ order = cp->order + 1;
+ }
+ MUTEX_UNLOCK(env, dbp->mutex);
+ }
+ } else
+ order = INVALID_ORDER;
+
+ /* Now go through and do the actual adjustments. */
+ FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+ for (found = 0;
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+ MUTEX_LOCK(env, dbp->mutex);
+ TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp_arg->root != cp->root ||
+ MVCC_SKIP_CURADJ(dbc, cp->root))
+ continue;
+ ++found;
+ adjusted = 0;
+ switch (op) {
+ case CA_DELETE:
+ if (recno < cp->recno) {
+ --cp->recno;
+ /*
+ * If the adjustment made them equal,
+ * we have to merge the orders.
+ */
+ if (recno == cp->recno && CD_ISSET(cp))
+ cp->order += order;
+ } else if (recno == cp->recno &&
+ !CD_ISSET(cp)) {
+ CD_SET(cp);
+ cp->order = order;
+ /*
+ * If we're deleting the item, we can't
+ * keep a streaming offset cached.
+ */
+ cp->stream_start_pgno = PGNO_INVALID;
+ }
+ break;
+ case CA_IBEFORE:
+ /*
+ * IBEFORE is just like IAFTER, except that we
+ * adjust cursors on the current record too.
+ */
+ if (C_EQUAL(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ goto iafter;
+ case CA_ICURRENT:
+
+ /*
+ * If the original cursor wasn't deleted, we
+ * just did a replacement and so there's no
+ * need to adjust anything--we shouldn't have
+ * gotten this far. Otherwise, we behave
+ * much like an IAFTER, except that all
+ * cursors pointing to the current item get
+ * marked undeleted and point to the new
+ * item.
+ */
+ DB_ASSERT(env, CD_ISSET(cp_arg));
+ if (C_EQUAL(cp_arg, cp)) {
+ CD_CLR(cp);
+ break;
+ }
+ /* FALLTHROUGH */
+ case CA_IAFTER:
+iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ if (recno == cp->recno && adjusted)
+ /*
+ * If we've moved this cursor's recno,
+ * split its order number--i.e.,
+ * decrement it by enough so that
+ * the lowest cursor moved has order 1.
+ * cp_arg->order is the split point,
+ * so decrement by one less than that.
+ */
+ cp->order -= (cp_arg->order - 1);
+ break;
+ }
+ }
+ MUTEX_UNLOCK(dbp->env, dbp->mutex);
+ }
+ MUTEX_UNLOCK(env, env->mtx_dblist);
+
+ if (foundp != NULL)
+ *foundp = found;
+ return (0);
+}
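+
+/*
+ * example_delete_adjust --
+ * Illustrative sketch, not part of the original source: the CA_DELETE
+ * renumbering above reduced to plain integers. Cursors past the deleted
+ * recno slide down by one; a cursor sitting on the deleted recno is
+ * marked deleted instead of being moved.
+ */
+static void
+example_delete_adjust(deleted, recnop, deletedp)
+ db_recno_t deleted, *recnop;
+ int *deletedp;
+{
+ if (deleted < *recnop)
+ --*recnop;
+ else if (deleted == *recnop)
+ *deletedp = 1;
+}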
+
+/*
+ * __ram_getno --
+ * Check the user's record number, and make sure we've seen it.
+ *
+ * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+ */
+int
+__ram_getno(dbc, key, rep, can_create)
+ DBC *dbc;
+ const DBT *key;
+ db_recno_t *rep;
+ int can_create;
+{
+ DB *dbp;
+ db_recno_t recno;
+
+ dbp = dbc->dbp;
+
+ /* If passed an empty DBT from Java, key->data may be NULL */
+ if (key->size != sizeof(db_recno_t)) {
+ __db_errx(dbp->env, "illegal record number size");
+ return (EINVAL);
+ }
+
+ /* Check the user's record number. */
+ if ((recno = *(db_recno_t *)key->data) == 0) {
+ __db_errx(dbp->env, "illegal record number of 0");
+ return (EINVAL);
+ }
+ if (rep != NULL)
+ *rep = recno;
+
+ /*
+ * Btree can neither create records nor read them in. Recno can
+ * do both; see if we can find the record.
+ */
+ return (dbc->dbtype == DB_RECNO ?
+ __ram_update(dbc, recno, can_create) : 0);
+}
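+
+/*
+ * example_recno_key --
+ * Illustrative sketch, not part of the original source: how a caller is
+ * expected to present a record number, given the size check in
+ * __ram_getno above. The DBT must carry exactly sizeof(db_recno_t)
+ * bytes of caller-owned storage.
+ */
+static void
+example_recno_key(key, store, recno)
+ DBT *key;
+ db_recno_t *store, recno;
+{
+ memset(key, 0, sizeof(*key));
+ *store = recno; /* Caller-owned storage. */
+ key->data = store;
+ key->size = sizeof(db_recno_t);
+}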
+
+/*
+ * __ram_update --
+ * Ensure the tree has records up to and including the specified one.
+ */
+static int
+__ram_update(dbc, recno, can_create)
+ DBC *dbc;
+ db_recno_t recno;
+ int can_create;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT *rdata;
+ db_recno_t nrecs;
+ int ret;
+
+ dbp = dbc->dbp;
+ t = dbp->bt_internal;
+
+ /*
+ * If we can't create records and we've read the entire backing input
+ * file, we're done.
+ */
+ if (!can_create && t->re_eof)
+ return (0);
+
+ /*
+ * If we haven't seen this record yet, try to get it from the original
+ * file.
+ */
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ if (!t->re_eof && recno > nrecs) {
+ if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If we can create records, create empty ones up to the requested
+ * record.
+ */
+ if (!can_create || recno <= nrecs + 1)
+ return (0);
+
+ rdata = &dbc->my_rdata;
+ rdata->flags = 0;
+ rdata->size = 0;
+
+ while (recno > ++nrecs)
+ if ((ret = __ram_add(dbc,
+ &nrecs, rdata, 0, BI_DELETED)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * __ram_source --
+ * Load information about the backing file.
+ */
+static int
+__ram_source(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ ENV *env;
+ char *source;
+ int ret;
+
+ env = dbp->env;
+ t = dbp->bt_internal;
+
+ /* Find the real name, and swap out the one we had before. */
+ if ((ret = __db_appname(env,
+ DB_APP_DATA, t->re_source, NULL, &source)) != 0)
+ return (ret);
+ __os_free(env, t->re_source);
+ t->re_source = source;
+
+ /*
+ * !!!
+ * It's possible that the backing source file is read-only. We don't
+ * much care other than we'll complain if there are any modifications
+ * when it comes time to write the database back to the source.
+ */
+ if ((t->re_fp = fopen(t->re_source, "rb")) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ return (ret);
+ }
+
+ t->re_eof = 0;
+ return (0);
+}
+
+/*
+ * __ram_writeback --
+ * Rewrite the backing file.
+ *
+ * PUBLIC: int __ram_writeback __P((DB *));
+ */
+int
+__ram_writeback(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ DBC *dbc;
+ DBT key, data;
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ FILE *fp;
+ db_recno_t keyno;
+ int ret, t_ret;
+ u_int8_t delim, *pad;
+
+ t = dbp->bt_internal;
+ env = dbp->env;
+ fp = NULL;
+ pad = NULL;
+
+ /* If the file wasn't modified, we're done. */
+ if (!t->re_modified)
+ return (0);
+
+ /* If there's no backing source file, we're done. */
+ if (t->re_source == NULL) {
+ t->re_modified = 0;
+ return (0);
+ }
+
+ /*
+ * We step through the records, writing each one out. Use the record
+ * number and the dbp->get() function, instead of a cursor, so we find
+ * and write out "deleted" or non-existent records. The DB handle may
+ * be threaded, so allocate memory as we go.
+ */
+ memset(&key, 0, sizeof(key));
+ key.size = sizeof(db_recno_t);
+ key.data = &keyno;
+ memset(&data, 0, sizeof(data));
+ F_SET(&data, DB_DBT_REALLOC);
+
+ /* Allocate a cursor. */
+ ENV_GET_THREAD_INFO(env, ip);
+ if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Read any remaining records into the tree.
+ *
+ * !!!
+ * This is why we can't support transactions when applications specify
+ * backing (re_source) files. At this point we have to read in the
+ * rest of the records from the file so that we can write all of the
+ * records back out again, which could modify a page for which we'd
+ * have to log changes and which we don't have locked. This could be
+ * partially fixed by taking a snapshot of the entire file during the
+ * DB->open as DB->open is transaction protected. But, if a checkpoint
+ * occurs then, the part of the log holding the copy of the file could
+ * be discarded, and that would make it impossible to recover in the
+ * face of disaster. This could all probably be fixed, but it would
+ * require transaction protecting the backing source file.
+ *
+ * XXX
+ * This could be made to work now that we have transactions protecting
+ * file operations. Margo has specifically asked for the privilege of
+ * doing this work.
+ */
+ if ((ret =
+ __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
+ goto err;
+
+ /*
+ * Close any existing file handle and re-open the file, truncating it.
+ */
+ if (t->re_fp != NULL) {
+ if (fclose(t->re_fp) != 0) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ goto err;
+ }
+ t->re_fp = NULL;
+ }
+ if ((fp = fopen(t->re_source, "wb")) == NULL) {
+ ret = __os_get_errno();
+ __db_err(env, ret, "%s", t->re_source);
+ goto err;
+ }
+
+ /*
+ * We'll need the delimiter if we're doing variable-length records,
+ * and the pad character if we're doing fixed-length records.
+ */
+ delim = t->re_delim;
+ for (keyno = 1;; ++keyno) {
+ switch (ret = __db_get(dbp, ip, NULL, &key, &data, 0)) {
+ case 0:
+ if (data.size != 0 &&
+ fwrite(data.data, 1, data.size, fp) != data.size)
+ goto write_err;
+ break;
+ case DB_KEYEMPTY:
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+ if (pad == NULL) {
+ if ((ret = __os_malloc(
+ env, t->re_len, &pad)) != 0)
+ goto err;
+ memset(pad, t->re_pad, t->re_len);
+ }
+ if (fwrite(pad, 1, t->re_len, fp) != t->re_len)
+ goto write_err;
+ }
+ break;
+ case DB_NOTFOUND:
+ ret = 0;
+ goto done;
+ default:
+ goto err;
+ }
+ if (!F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+ fwrite(&delim, 1, 1, fp) != 1) {
+write_err: ret = __os_get_errno();
+ __db_err(env, ret,
+ "%s: write failed to backing file", t->re_source);
+ goto err;
+ }
+ }
+
+err:
+done: /* Close the file descriptor. */
+ if (fp != NULL && fclose(fp) != 0) {
+ t_ret = __os_get_errno();
+ __db_err(env, t_ret, "%s", t->re_source);
+ if (ret == 0)
+ ret = t_ret;
+ }
+
+ /* Discard the cursor. */
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard memory allocated to hold the data items. */
+ if (data.data != NULL)
+ __os_ufree(env, data.data);
+ if (pad != NULL)
+ __os_free(env, pad);
+
+ if (ret == 0)
+ t->re_modified = 0;
+
+ return (ret);
+}
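+
+/*
+ * example_write_record --
+ * Illustrative sketch, not part of the original source: the per-record
+ * framing __ram_writeback applies. Fixed-length records are written
+ * verbatim (deleted records are padded with re_pad bytes elsewhere);
+ * variable-length records are terminated with re_delim.
+ */
+static int
+example_write_record(fp, buf, len, fixedlen, delim)
+ FILE *fp;
+ u_int8_t *buf, delim;
+ size_t len;
+ int fixedlen;
+{
+ if (len != 0 && fwrite(buf, 1, len, fp) != len)
+ return (__os_get_errno());
+ if (!fixedlen && fwrite(&delim, 1, 1, fp) != 1)
+ return (__os_get_errno());
+ return (0);
+}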
+
+/*
+ * __ram_sread --
+ * Read records from a source file.
+ */
+static int
+__ram_sread(dbc, top)
+ DBC *dbc;
+ db_recno_t top;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data, *rdata;
+ db_recno_t recno;
+ size_t len;
+ int ch, ret, was_modified;
+
+ t = dbc->dbp->bt_internal;
+ dbp = dbc->dbp;
+ was_modified = t->re_modified;
+
+ if ((ret = __bam_nrecs(dbc, &recno)) != 0)
+ return (ret);
+
+ /*
+ * Use the record key return memory; it's only a short-term use.
+ * The record data return memory is used by __bam_iitem, which
+ * we'll indirectly call, so use the key so as not to collide.
+ */
+ len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256;
+ rdata = &dbc->my_rkey;
+ if (rdata->ulen < len) {
+ if ((ret = __os_realloc(
+ dbp->env, len, &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ }
+ rdata->ulen = (u_int32_t)len;
+ }
+
+ memset(&data, 0, sizeof(data));
+ while (recno < top) {
+ data.data = rdata->data;
+ data.size = 0;
+ if (F_ISSET(dbp, DB_AM_FIXEDLEN))
+ for (len = t->re_len; len > 0; --len) {
+ if ((ch = fgetc(t->re_fp)) == EOF) {
+ if (data.size == 0)
+ goto eof;
+ break;
+ }
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ }
+ else
+ for (;;) {
+ if ((ch = fgetc(t->re_fp)) == EOF) {
+ if (data.size == 0)
+ goto eof;
+ break;
+ }
+ if (ch == t->re_delim)
+ break;
+
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ if (data.size == rdata->ulen) {
+ if ((ret = __os_realloc(dbp->env,
+ rdata->ulen *= 2,
+ &rdata->data)) != 0) {
+ rdata->ulen = 0;
+ rdata->data = NULL;
+ return (ret);
+ } else
+ data.data = rdata->data;
+ }
+ }
+
+ /*
+ * Another process may have read this record from the input
+ * file and stored it into the database already, in which
+ * case we don't need to repeat that operation. We detect
+ * this by checking if the last record we've read is greater
+ * than or equal to the number of records in the database.
+ */
+ if (t->re_last >= recno) {
+ ++recno;
+ if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
+ goto err;
+ }
+ ++t->re_last;
+ }
+
+ if (0) {
+eof: t->re_eof = 1;
+ ret = DB_NOTFOUND;
+ }
+err: if (!was_modified)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_add --
+ * Add records into the tree.
+ */
+static int
+__ram_add(dbc, recnop, data, flags, bi_flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ DBT *data;
+ u_int32_t flags, bi_flags;
+{
+ BTREE_CURSOR *cp;
+ int exact, ret, stack, t_ret;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: /* Find the slot for insertion. */
+ if ((ret = __bam_rsearch(dbc, recnop,
+ SR_INSERT | (flags == DB_APPEND ? SR_APPEND : 0), 1, &exact)) != 0)
+ return (ret);
+ stack = 1;
+
+ /* Copy the page into the cursor. */
+ STACK_TO_CURSOR(cp, ret);
+ if (ret != 0)
+ goto err;
+
+ if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) &&
+ !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
+
+ /*
+ * The application may modify the data based on the selected record
+ * number.
+ */
+ if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
+ (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
+ goto err;
+
+ /*
+ * Select the arguments for __bam_iitem() and do the insert. If the
+ * key is an exact match, or we're replacing the data item with a
+ * new data item, replace the current item. If the key isn't an exact
+ * match, we're inserting a new key/data pair, before the search
+ * location.
+ */
+ switch (ret = __bam_iitem(dbc,
+ NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
+ case 0:
+ /*
+ * Don't adjust anything.
+ *
+ * If we inserted a record, no cursors need adjusting because
+ * the only new record it's possible to insert is at the very
+ * end of the tree. The necessary adjustments to the internal
+ * page counts were made by __bam_iitem().
+ *
+ * If we overwrote a record, no cursors need adjusting because
+ * future DBcursor->get calls will simply return the underlying
+ * record (there's no adjustment made for the DB_CURRENT flag
+ * when a cursor get operation immediately follows a cursor
+ * delete operation, and the normal adjustment for the DB_NEXT
+ * flag is still correct).
+ */
+ break;
+ case DB_NEEDSPLIT:
+ /* Discard the stack of pages and split the page. */
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ stack = 0;
+
+ if ((ret = __bam_split(dbc, recnop, NULL)) != 0)
+ goto err;
+
+ goto retry;
+ /* NOTREACHED */
+ default:
+ goto err;
+ }
+
+err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
diff --git a/btree/bt_rsearch.c b/btree/bt_rsearch.c
new file mode 100644
index 0000000..1d5581a
--- /dev/null
+++ b/btree/bt_rsearch.c
@@ -0,0 +1,502 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_rsearch --
+ * Search a btree for a record number.
+ *
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+ */
+int
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+ int stop, *exactp;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t adjust, deloffset, indx, top;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg;
+ db_recno_t recno, t_recno, total;
+ u_int32_t get_mode;
+ int ret, stack, t_ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ h = NULL;
+
+ BT_STK_CLR(cp);
+
+ /*
+ * There are several ways we search a btree tree. The flags argument
+ * specifies if we're acquiring read or write locks and if we are
+ * locking pairs of pages. In addition, if we're adding or deleting
+ * an item, we have to lock the entire tree, regardless. See btree.h
+ * for more details.
+ *
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in the tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+
+ if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0)
+ return (ret);
+ lock_mode = cp->csp->lock_mode;
+ get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+ lock = cp->csp->lock;
+ h = cp->csp->page;
+
+ BT_STK_CLR(cp);
+ /*
+ * If appending to the tree, set the record number now -- we have the
+ * root page locked.
+ *
+ * Delete only deletes exact matches, read only returns exact matches.
+ * Note, this is different from __bam_search(), which returns non-exact
+ * matches for read.
+ *
+ * The record may not exist. We can only return the correct location
+ * for the record immediately after the last record in the tree, so do
+ * a fast check now.
+ */
+ total = RE_NREC(h);
+ if (LF_ISSET(SR_APPEND)) {
+ *exactp = 0;
+ *recnop = recno = total + 1;
+ } else {
+ recno = *recnop;
+ if (recno <= total)
+ *exactp = 1;
+ else {
+ *exactp = 0;
+ if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) {
+ /*
+ * Keep the page locked for serializability.
+ *
+ * XXX
+ * This leaves the root page locked, which will
+ * eliminate any concurrency. A possible fix
+ * would be to lock the last leaf page instead.
+ */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ if ((t_ret =
+ __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret == 0 ? DB_NOTFOUND : ret);
+ }
+ }
+ }
+
+ /*
+ * !!!
+ * Record numbers in the tree are 0-based, but the recno is
+ * 1-based. All of the calculations below have to take this
+ * into account.
+ */
+ for (total = 0;;) {
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h) - 2;
+ goto enter;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h) - 1;
+ goto enter;
+ }
+ recno -= total;
+ /*
+ * There may be logically deleted records on the page.
+ * If there are enough, the record may not exist.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ adjust = P_INDX;
+ deloffset = O_INDX;
+ } else {
+ adjust = O_INDX;
+ deloffset = 0;
+ }
+ for (t_recno = 0, indx = 0;; indx += adjust) {
+ if (indx >= NUM_ENT(h)) {
+ *exactp = 0;
+ if (!LF_ISSET(SR_PAST_EOF) ||
+ recno > t_recno + 1) {
+ ret = __memp_fput(mpf,
+ dbc->thread_info,
+ h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __TLPUT(dbc,
+ lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret == 0)
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+ if (!B_DISSET(GET_BKEYDATA(dbp, h,
+ indx + deloffset)->type) &&
+ ++t_recno == recno)
+ break;
+ }
+
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(SR_BOTH))
+ goto get_prev;
+ return (0);
+ case P_IBTREE:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h);
+ bi = GET_BINTERNAL(dbp, h, indx - 1);
+ } else for (indx = 0, top = NUM_ENT(h);;) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (++indx == top || total + bi->nrecs >= recno)
+ break;
+ total += bi->nrecs;
+ }
+ pg = bi->pgno;
+ break;
+ case P_LRECNO:
+ if (LF_ISSET(SR_MAX))
+ recno = NUM_ENT(h);
+ else
+ recno -= total;
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ --recno;
+enter: BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ if (LF_ISSET(SR_BOTH)) {
+get_prev: DB_ASSERT(env, LF_ISSET(SR_NEXT));
+ /*
+ * We have a NEXT tree; now add the subtree
+ * that gets to the previous page.
+ */
+ cp->csp++;
+ indx = cp->sp->indx - 1;
+ h = cp->sp->page;
+ if (TYPE(h) == P_IRECNO) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ pg = ri->pgno;
+ } else {
+ DB_ASSERT(env, TYPE(h) == P_IBTREE);
+ bi = GET_BINTERNAL(dbp, h, indx);
+ pg = bi->pgno;
+ }
+ LF_CLR(SR_NEXT | SR_BOTH);
+ LF_SET(SR_MAX);
+ stack = 1;
+ h = NULL;
+ goto lock_next;
+ }
+ return (0);
+ case P_IRECNO:
+ if (LF_ISSET(SR_MAX)) {
+ indx = NUM_ENT(h);
+ ri = GET_RINTERNAL(dbp, h, indx - 1);
+ } else for (indx = 0, top = NUM_ENT(h);;) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ if (++indx == top || total + ri->nrecs >= recno)
+ break;
+ total += ri->nrecs;
+ }
+ pg = ri->pgno;
+ break;
+ default:
+ return (__db_pgfmt(env, h->pgno));
+ }
+ --indx;
+
+ /* Return if this is the lowest page wanted. */
+ if (stop == LEVEL(h)) {
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ }
+ if (stack) {
+ BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ h = NULL;
+
+ lock_mode = DB_LOCK_WRITE;
+ get_mode = DB_MPOOL_DIRTY;
+ if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else if (LF_ISSET(SR_NEXT)) {
+ /*
+ * For RECNO, if we are doing a NEXT search the
+ * search recno is the one we are looking for, but
+ * we want to keep the stack from the spanning
+ * node on down. We only know we have the spanning
+ * node when its child's index is 0, so save
+ * each node and discard the tree when we find out
+ * it's not needed.
+ */
+ if (indx != 0 && cp->sp->page != NULL) {
+ BT_STK_POP(cp);
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ }
+
+ BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+ h = NULL;
+ if (ret != 0)
+ goto err;
+lock_next: if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else {
+ /*
+ * Decide if we want to return a pointer to the next
+ * page in the stack. If we do, write lock it and
+ * never unlock it.
+ */
+ if ((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
+ (LEVEL(h) - 1) == LEAFLEVEL)
+ stack = 1;
+
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ lock_mode = stack &&
+ LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * If we fail, discard the lock we held. This
+ * is OK because this only happens when we are
+ * descending the tree holding read-locks.
+ */
+ (void)__LPUT(dbc, lock);
+ goto err;
+ }
+ }
+
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+ goto err;
+ }
+ /* NOTREACHED */
+
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ BT_STK_POP(cp);
+ (void)__bam_stkrel(dbc, 0);
+
+ return (ret);
+}
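+
+/*
+ * example_find_nth_live --
+ * Illustrative sketch, not part of the original source: the leaf-page
+ * scan in __bam_rsearch in miniature. Walk the entries, skipping
+ * logically deleted ones, until the n-th live record is reached.
+ */
+static int
+example_find_nth_live(deleted, nent, n)
+ const int *deleted;
+ int nent, n;
+{
+ int indx, live;
+
+ for (live = 0, indx = 0; indx < nent; ++indx)
+ if (!deleted[indx] && ++live == n)
+ return (indx);
+ return (-1); /* Fewer than n live records. */
+}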
+
+/*
+ * __bam_adjust --
+ * Adjust the tree after adding or deleting a record.
+ *
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
+ */
+int
+__bam_adjust(dbc, adjust)
+ DBC *dbc;
+ int32_t adjust;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ EPG *epg;
+ PAGE *h;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+ /* Update the record counts for the tree. */
+ for (epg = cp->sp; epg <= cp->csp; ++epg) {
+ h = epg->page;
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
+ ret = __memp_dirty(mpf, &h,
+ dbc->thread_info, dbc->txn, dbc->priority, 0);
+ epg->page = h;
+ if (ret != 0)
+ return (ret);
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(h), 0, PGNO(h), &LSN(h),
+ (u_int32_t)epg->indx, adjust,
+ PGNO(h) == root_pgno ?
+ CAD_UPDATEROOT : 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(h));
+
+ if (TYPE(h) == P_IBTREE)
+ GET_BINTERNAL(dbp, h, epg->indx)->nrecs +=
+ adjust;
+ else
+ GET_RINTERNAL(dbp, h, epg->indx)->nrecs +=
+ adjust;
+
+ if (PGNO(h) == root_pgno)
+ RE_NREC_ADJ(h, adjust);
+ }
+ }
+ return (0);
+}
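+
+/*
+ * example_adjust_counts --
+ * Illustrative sketch, not part of the original source: the effect of
+ * __bam_adjust, with an array standing in for the nrecs fields of the
+ * internal pages along the cursor's stack. Every spanning entry is
+ * incremented or decremented by the same amount.
+ */
+static void
+example_adjust_counts(nrecs_stack, depth, adjust)
+ u_int32_t *nrecs_stack;
+ int depth;
+ int32_t adjust;
+{
+ int i;
+
+ for (i = 0; i < depth; ++i)
+ nrecs_stack[i] += (u_int32_t)adjust;
+}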
+
+/*
+ * __bam_nrecs --
+ * Return the number of records in the tree.
+ *
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
+ */
+int
+__bam_nrecs(dbc, rep)
+ DBC *dbc;
+ db_recno_t *rep;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+
+ pgno = dbc->internal->root;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ return (ret);
+
+ *rep = RE_NREC(h);
+
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
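+
+/*
+ * example_last_recno --
+ * Illustrative sketch, not part of the original source: the DB_LAST
+ * pattern from __ramc_get, using __bam_nrecs to find the final record
+ * and treating an empty tree as DB_NOTFOUND.
+ */
+static int
+example_last_recno(dbc, lastp)
+ DBC *dbc;
+ db_recno_t *lastp;
+{
+ int ret;
+
+ if ((ret = __bam_nrecs(dbc, lastp)) != 0)
+ return (ret);
+ return (*lastp == 0 ? DB_NOTFOUND : 0);
+}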
+
+/*
+ * __bam_total --
+ * Return the number of records below a page.
+ *
+ * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *));
+ */
+db_recno_t
+__bam_total(dbp, h)
+ DB *dbp;
+ PAGE *h;
+{
+ db_recno_t nrecs;
+ db_indx_t indx, top;
+
+ nrecs = 0;
+ top = NUM_ENT(h);
+
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += P_INDX)
+ if (!B_DISSET(
+ GET_BKEYDATA(dbp, h, indx + O_INDX)->type))
+ ++nrecs;
+ break;
+ case P_LDUP:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
+ ++nrecs;
+ break;
+ case P_IBTREE:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs;
+ break;
+ case P_LRECNO:
+ nrecs = NUM_ENT(h);
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs;
+ break;
+ }
+
+ return (nrecs);
+}
diff --git a/btree/bt_search.c b/btree/bt_search.c
index 485afcb..6176b86 100644
--- a/btree/bt_search.c
+++ b/btree/bt_search.c
@@ -1,5 +1,14 @@
/*-
- * Copyright (c) 1990, 1993, 1994
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
@@ -13,11 +22,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -32,182 +37,929 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+/*
+ * __bam_get_root --
+ * Fetch the root of a tree and see if we want to keep
+ * it in the stack.
+ *
+ * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *));
*/
+int
+__bam_get_root(dbc, pg, slevel, flags, stack)
+ DBC *dbc;
+ db_pgno_t pg;
+ int slevel;
+ u_int32_t flags;
+ int *stack;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_lockmode_t lock_mode;
+ u_int32_t get_mode;
+ int ret, t_ret;
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_search.c 8.8 (Berkeley) 7/31/94";
-#endif /* LIBC_SCCS and not lint */
+ LOCK_INIT(lock);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ /*
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in the tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+try_again:
+ *stack = LF_ISSET(SR_STACK) &&
+ (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM));
+ lock_mode = DB_LOCK_READ;
+ if (*stack ||
+ LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE)))
+ lock_mode = DB_LOCK_WRITE;
+ if ((lock_mode == DB_LOCK_WRITE || F_ISSET(dbc, DBC_DOWNREV) ||
+ dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM))) {
+lock_it: if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ }
-#include <sys/types.h>
+ /*
+ * Get the root. If the root happens to be a leaf page then
+ * we are supposed to get a read lock on it before latching
+ * it. So if we have not locked it, do a try-get first.
+ * If we can't get the root shared, then get a lock on it and
+ * then wait for the latch.
+ */
+ if (lock_mode == DB_LOCK_WRITE)
+ get_mode = DB_MPOOL_DIRTY;
+ else if (LOCK_ISSET(lock) || !STD_LOCKING(dbc))
+ get_mode = 0;
+ else
+ get_mode = DB_MPOOL_TRY;
-#include <stdio.h>
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0) {
+ if (ret == DB_LOCK_NOTGRANTED)
+ goto lock_it;
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+
+ /*
+ * Decide if we need to dirty and/or lock this page.
+ * We must not hold the latch while we get the lock.
+ */
+ if (!*stack &&
+ ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
+ LEVEL(h) == LEAFLEVEL ||
+ (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
+ *stack = 1;
+ /* If we already have the write lock, we are done. */
+ if (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)) {
+ if (lock_mode == DB_LOCK_WRITE)
+ goto done;
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Now that we know what level the root is at, do we need a
+ * write lock? If not and we got the lock before latching
+ * we are done.
+ */
+ if (LEVEL(h) != LEAFLEVEL || LF_ISSET(SR_WRITE)) {
+ lock_mode = DB_LOCK_WRITE;
+ /* Drop the read lock if we got it above. */
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ return (ret);
+ } else if (LOCK_ISSET(lock))
+ goto done;
+ if (!STD_LOCKING(dbc)) {
+ if (lock_mode != DB_LOCK_WRITE)
+ goto done;
+ if ((ret = __memp_dirty(mpf, &h, dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0) {
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ return (ret);
+ }
+ } else {
+ /* Try to lock the page without waiting first. */
+ if ((ret = __db_lget(dbc,
+ 0, pg, lock_mode, DB_LOCK_NOWAIT, &lock)) == 0) {
+ if (lock_mode == DB_LOCK_WRITE && (ret =
+ __memp_dirty(mpf, &h, dbc->thread_info,
+ dbc->txn, dbc->priority, 0)) != 0) {
+ if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h,
+ dbc->priority);
+ return (ret);
+ }
+ goto done;
+ }
+
+ t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+
+ if (ret == DB_LOCK_DEADLOCK ||
+ ret == DB_LOCK_NOTGRANTED)
+ ret = 0;
+ if (ret == 0)
+ ret = t_ret;
+
+ if (ret != 0)
+ return (ret);
+
+ if ((ret = __db_lget(dbc,
+ 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf,
+ &pg, dbc->thread_info, dbc->txn,
+ lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0,
+ &h)) != 0) {
+ /* Did not read it, release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ }
+ /*
+ * While getting the page dirtied or locked we had to drop
+ * the latch, so someone else may have gotten in and split
+ * the root.
+ */
+ if (!((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(slevel + 1) >= LEVEL(h)) ||
+ LEVEL(h) == LEAFLEVEL ||
+ (LF_ISSET(SR_START) && slevel == LEVEL(h)))) {
+ /* Someone else split the root, start over. */
+ ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+ goto try_again;
+ }
+ }
-#include <db.h>
-#include "btree.h"
+done: BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret);
-static int __bt_snext __P((BTREE *, PAGE *, const DBT *, int *));
-static int __bt_sprev __P((BTREE *, PAGE *, const DBT *, int *));
+ return (ret);
+}
/*
- * __bt_search --
+ * __bam_search --
* Search a btree for a key.
*
- * Parameters:
- * t: tree to search
- * key: key to find
- * exactp: pointer to exact match flag
- *
- * Returns:
- * The EPG for matching record, if any, or the EPG for the location
- * of the key, if it were inserted into the tree, is entered into
- * the bt_cur field of the tree. A pointer to the field is returned.
+ * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
+ * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *));
*/
-EPG *
-__bt_search(t, key, exactp)
- BTREE *t;
+int
+__bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
+ DBC *dbc;
+ db_pgno_t root_pgno;
const DBT *key;
- int *exactp;
+ u_int32_t flags;
+ int slevel, *exactp;
+ db_recno_t *recnop;
{
- PAGE *h;
- indx_t base, index, lim;
- pgno_t pg;
- int cmp;
-
- BT_CLR(t);
- for (pg = P_ROOT;;) {
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (NULL);
-
- /* Do a binary search on the current page. */
- t->bt_cur.page = h;
- for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) {
- t->bt_cur.index = index = base + (lim >> 1);
- if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) {
- if (h->flags & P_BLEAF) {
- *exactp = 1;
- return (&t->bt_cur);
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock, saved_lock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h, *parent_h;
+ db_indx_t base, i, indx, *inp, lim;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg, saved_pg;
+ db_recno_t recno;
+ int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
+ int getlock, was_next;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t get_mode, wait;
+ u_int8_t level, saved_level;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ h = NULL;
+ parent_h = NULL;
+ t = dbp->bt_internal;
+ recno = 0;
+ t_ret = 0;
+
+ BT_STK_CLR(cp);
+ LOCK_INIT(saved_lock);
+ LOCK_INIT(lock);
+ was_next = LF_ISSET(SR_NEXT);
+ wait = DB_LOCK_NOWAIT;
+
+ /*
+ * There are several ways we search a btree tree. The flags argument
+ * specifies if we're acquiring read or write latches, if we position
+ * to the first or last item in a set of duplicates, if we return
+ * deleted items, and if we are latching pairs of pages. In addition,
+ * if we're modifying record numbers, we have to latch the entire tree
+ * regardless. See btree.h for more details.
+ */
+
+ if (root_pgno == PGNO_INVALID)
+ root_pgno = cp->root;
+ saved_pg = root_pgno;
+ saved_level = MAXBTREELEVEL;
+retry: if ((ret = __bam_get_root(dbc, root_pgno, slevel, flags, &stack)) != 0)
+ goto err;
+ lock_mode = cp->csp->lock_mode;
+ get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+ h = cp->csp->page;
+ pg = PGNO(h);
+ lock = cp->csp->lock;
+ set_stack = stack;
+ /*
+ * Determine if we need to lock interior nodes.
+ * If we have record numbers we always lock. Otherwise we only
+ * need to do this if we are write locking and we are returning
+ * a stack of nodes. SR_NEXT will eventually get a stack and
+ * release the locks above that level.
+ */
+ if (F_ISSET(dbc, DBC_DOWNREV)) {
+ getlock = 1;
+ wait = 0;
+ } else
+ getlock = F_ISSET(cp, C_RECNUM) ||
+ (lock_mode == DB_LOCK_WRITE &&
+ (stack || LF_ISSET(SR_NEXT | SR_DEL)));
+
+ /*
+ * If we are asked a level that is above the root,
+ * just return the root. This can happen if the tree
+ * collapses while we are trying to lock the root.
+ */
+ if (!LF_ISSET(SR_START) && LEVEL(h) < slevel)
+ goto done;
+
+ BT_STK_CLR(cp);
+
+ /* Choose a comparison function. */
+ func = F_ISSET(dbc, DBC_OPD) ?
+ (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
+ t->bt_compare;
+
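+ /*
+ * For illustration, any user-supplied comparator must match the func
+ * declaration above; a minimal lexicographic sketch (lex_cmp is a
+ * hypothetical name, __bam_defcmp plays this role by default):
+ *
+ *     static int
+ *     lex_cmp(dbp, a, b)
+ *             DB *dbp;
+ *             const DBT *a, *b;
+ *     {
+ *             size_t len;
+ *             int cmp;
+ *
+ *             len = a->size < b->size ? a->size : b->size;
+ *             if ((cmp = memcmp(a->data, b->data, len)) != 0)
+ *                     return (cmp);
+ *             return (a->size < b->size ? -1 : a->size > b->size);
+ *     }
+ */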
+ for (;;) {
+ if (TYPE(h) == P_LBTREE)
+ adjust = P_INDX;
+ else {
+ /*
+ * It is possible to catch an internal page as a change
+ * is being backed out. Its leaf pages will be locked
+ * but we must be sure we get to one. If the page
+ * is not populated enough, lock it.
+ */
+ if (TYPE(h) != P_LDUP && NUM_ENT(h) == 0) {
+ getlock = 1;
+ level = LEVEL(h) + 1;
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0)
+ goto err;
+ goto lock_next;
+ }
+ adjust = O_INDX;
+ }
+ inp = P_INP(dbp, h);
+ if (LF_ISSET(SR_MIN | SR_MAX)) {
+ if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0)
+ indx = 0;
+ else if (TYPE(h) == P_LBTREE)
+ indx = NUM_ENT(h) - 2;
+ else
+ indx = NUM_ENT(h) - 1;
+
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
+ if (LF_ISSET(SR_NEXT))
+ goto get_next;
+ goto found;
+ }
+ goto next;
+ }
+ /*
+ * Do a binary search on the current page. If we're searching
+ * a Btree leaf page, we have to walk the indices in groups of
+ * two. If we're searching an internal page or an off-page dup
+ * page, they're an index per page item. If we find an exact
+ * match on a leaf page, we're done.
+ */
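+ /*
+ * As a sketch only, the DB_BINARY_SEARCH_* macros below amount to
+ * an ordinary binary search whose probe moves in "adjust"-sized
+ * steps so that it always lands on a key index:
+ *
+ *     for (base = 0, lim = NUM_ENT(h) / adjust; lim != 0; lim >>= 1) {
+ *             indx = base + (lim >> 1) * adjust;
+ *             if (key > key_at(indx)) {
+ *                     base = indx + adjust;
+ *                     --lim;
+ *             }
+ *     }
+ *
+ * where key_at() stands in for the __bam_cmp call.
+ */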
+ DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) {
+ DB_BINARY_SEARCH_INCR(indx, base, lim, adjust);
+ if ((ret = __bam_cmp(dbc, key, h, indx,
+ func, &cmp)) != 0)
+ goto err;
+ if (cmp == 0) {
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) &&
+ LEVEL(h) == slevel)) {
+ if (LF_ISSET(SR_NEXT))
+ goto get_next;
+ goto found;
}
goto next;
}
- if (cmp > 0) {
- base = index + 1;
- --lim;
+ if (cmp > 0)
+ DB_BINARY_SEARCH_SHIFT_BASE(indx, base,
+ lim, adjust);
+ }
+
+ /*
+ * No match found. Base is the smallest index greater than
+ * key and may be zero or a last + O_INDX index.
+ *
+ * If it's a leaf page or the stopping point,
+ * return base as the "found" value.
+ * Delete only deletes exact matches.
+ */
+ if (LEVEL(h) == LEAFLEVEL ||
+ (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) {
+ *exactp = 0;
+
+ if (LF_ISSET(SR_EXACT)) {
+ ret = DB_NOTFOUND;
+ goto err;
}
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ BT_STK_NUM(env, cp, h, base, ret);
+ if ((t_ret =
+ __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (LF_ISSET(SR_NEXT)) {
+get_next: /*
+ * The caller could have asked for a NEXT
+ * at the root if the tree recently collapsed.
+ */
+ if (PGNO(h) == root_pgno) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ indx = cp->sp->indx + 1;
+ if (indx == NUM_ENT(cp->sp->page)) {
+ ret = DB_NOTFOUND;
+ cp->csp++;
+ goto err;
+ }
+ /*
+ * If we want both the key page and the next
+ * page, push the key page on the stack;
+ * otherwise, save the root of the subtree
+ * and drop the rest of the subtree.
+ * Search down again starting at the
+ * next child of the root of this subtree.
+ */
+ LF_SET(SR_MIN);
+ LF_CLR(SR_NEXT);
+ set_stack = stack = 1;
+ if (LF_ISSET(SR_BOTH)) {
+ cp->csp++;
+ BT_STK_PUSH(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ h = cp->sp->page;
+ pg = GET_BINTERNAL(dbp, h, indx)->pgno;
+ level = LEVEL(h);
+ h = NULL;
+ goto lock_next;
+ } else {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info,
+ h, dbc->priority)) != 0)
+ goto err;
+ h = cp->sp->page;
+ cp->sp->page = NULL;
+ lock = cp->sp->lock;
+ LOCK_INIT(cp->sp->lock);
+ if ((ret = __bam_stkrel(dbc,
+ STK_NOLOCK)) != 0)
+ goto err;
+ goto next;
+ }
+ }
+
+ /*
+ * !!!
+ * Possibly returning a deleted record -- DB_SET_RANGE,
+ * DB_KEYFIRST and DB_KEYLAST don't require an exact
+ * match, and we don't want to walk multiple pages here
+ * to find an undeleted record. This is handled by the
+ * calling routine.
+ */
+ if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
+ cp->csp++;
+ BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
}
/*
- * If it's a leaf page, we're almost done. If no duplicates
- * are allowed, or we have an exact match, we're done. Else,
- * it's possible that there were matching keys on this page,
- * which later deleted, and we're on a page with no matches
- * while there are matches on other pages. If at the start or
- * end of a page, check the adjacent page.
+ * If it's not a leaf page, record the internal page (which is
+ * a parent page for the key). Decrement the base by 1 if it's
+ * non-zero so that if a split later occurs, the inserted page
+ * will be to the right of the saved page.
+ */
+ indx = base > 0 ? base - O_INDX : base;
+
+ /*
+ * If we're trying to calculate the record number, sum up
+ * all the record numbers on this page up to the indx point.
*/
- if (h->flags & P_BLEAF) {
- if (!F_ISSET(t, B_NODUPS)) {
- if (base == 0 &&
- h->prevpg != P_INVALID &&
- __bt_sprev(t, h, key, exactp))
- return (&t->bt_cur);
- if (base == NEXTINDEX(h) &&
- h->nextpg != P_INVALID &&
- __bt_snext(t, h, key, exactp))
- return (&t->bt_cur);
+next: if (recnop != NULL)
+ for (i = 0; i < indx; ++i)
+ recno += GET_BINTERNAL(dbp, h, i)->nrecs;
+
+ pg = GET_BINTERNAL(dbp, h, indx)->pgno;
+ level = LEVEL(h);
+
+ /* See if we are at the level to start stacking. */
+ if (LF_ISSET(SR_START) && slevel == level)
+ set_stack = stack = 1;
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ if (slevel == LEVEL(h)) {
+ BT_STK_NUM(env, cp, h, indx, ret);
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ h = NULL;
+ if (ret != 0)
+ goto err;
+ goto done;
}
- *exactp = 0;
- t->bt_cur.index = base;
- return (&t->bt_cur);
+ BT_STK_NUMPUSH(env, cp, h, indx, ret);
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ } else if (stack) {
+ /* Return if this is the lowest page wanted. */
+ if (LF_ISSET(SR_PARENT) && slevel == level) {
+ BT_STK_ENTER(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ goto done;
+ }
+ if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) {
+ /*
+ * There was a page with a singleton pointer
+ * to a non-empty subtree.
+ */
+ cp->csp--;
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ set_stack = stack = 0;
+ goto do_del;
+ }
+ BT_STK_PUSH(env,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+
+ LOCK_INIT(lock);
+ get_mode = DB_MPOOL_DIRTY;
+ lock_mode = DB_LOCK_WRITE;
+ goto lock_next;
+ } else {
+ /*
+ * Decide if we want to return a reference to the next
+ * page in the return stack. If so, latch it and don't
+ * unlatch it. We will want to stack things on the
+ * next iteration. The stack variable cannot be
+ * set until we leave this clause. If we are locking
+ * then we must lock this level before getting the page.
+ */
+ if ((LF_ISSET(SR_PARENT) &&
+ (u_int8_t)(slevel + 1) >= (level - 1)) ||
+ (level - 1) == LEAFLEVEL)
+ set_stack = 1;
+
+ /*
+ * Check for a normal search. If so, we need to
+ * latch-couple the parent/child buffers.
+ */
+ if (!LF_ISSET(SR_DEL | SR_NEXT)) {
+ parent_h = h;
+ goto lock_next;
+ }
+
+ /*
+ * Returning a subtree. See if we have hit the start
+ * point; if so, save the parent and set stack.
+ * Otherwise free the parent and temporarily
+ * save this one.
+ * For SR_DEL we need to find a page with 1 entry.
+ * For SR_NEXT we want to find the minimal subtree
+ * that contains the key and the next page.
+ * We save pages as long as we are at the right
+ * edge of the subtree. When we leave the right
+ * edge, then drop the subtree.
+ */
+
+ if ((LF_ISSET(SR_DEL) && NUM_ENT(h) == 1)) {
+ /*
+ * We are pushing things on the stack;
+ * set the stack variable now to indicate this
+ * has happened.
+ */
+ stack = set_stack = 1;
+ LF_SET(SR_WRITE);
+ /* Push the parent. */
+ cp->csp++;
+ /* Push this node. */
+ BT_STK_PUSH(env, cp, h,
+ indx, lock, DB_LOCK_NG, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ } else {
+ /*
+ * See if we want to save the tree so far.
+ * If we are looking for the next key,
+ * then we must save this node if we are
+ * at the end of the page. If not, then
+ * discard anything we have saved so far.
+ * For delete, only keep one node until
+ * we find a singleton.
+ */
+do_del: if (cp->csp->page != NULL) {
+ if (LF_ISSET(SR_NEXT) &&
+ indx == NUM_ENT(h) - 1)
+ cp->csp++;
+ else if ((ret =
+ __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ }
+ /* Save this node. */
+ BT_STK_ENTER(env, cp,
+ h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ LOCK_INIT(lock);
+ }
+
+lock_next: h = NULL;
+
+ if (set_stack && LF_ISSET(SR_WRITE)) {
+ lock_mode = DB_LOCK_WRITE;
+ get_mode = DB_MPOOL_DIRTY;
+ getlock = 1;
+ }
+ /*
+ * If we are retrying and we are back at the same
+ * page then we already have it locked. If we are
+ * at a different page we want to lock-couple and
+ * release that lock.
+ */
+ if (level - 1 == saved_level) {
+ if ((ret = __LPUT(dbc, lock)) != 0)
+ goto err;
+ lock = saved_lock;
+ LOCK_INIT(saved_lock);
+ saved_level = MAXBTREELEVEL;
+ if (pg == saved_pg)
+ goto skip_lock;
+ }
+ if ((getlock || level - 1 == LEAFLEVEL) &&
+ (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS,
+ pg, lock_mode, wait, &lock)) != 0) {
+ /*
+ * If we are doing DEL or NEXT then we
+ * have an extra level saved in the stack,
+ * push it so it will get freed.
+ */
+ if (LF_ISSET(SR_DEL | SR_NEXT) && !stack)
+ cp->csp++;
+ /*
+ * If we fail, discard the lock we held.
+ * This is ok because we will either search
+ * again or exit without actually looking
+ * at the data.
+ */
+ if ((t_ret = __LPUT(dbc, lock)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ /*
+ * If we blocked at a different level release
+ * the previous saved lock.
+ */
+ if ((t_ret = __LPUT(dbc, saved_lock)) != 0 &&
+ ret == 0)
+ ret = t_ret;
+ if (wait == 0 || (ret != DB_LOCK_NOTGRANTED &&
+ ret != DB_LOCK_DEADLOCK))
+ goto err;
+
+ /* Release the parent if we are holding it. */
+ if (parent_h != NULL &&
+ (ret = __memp_fput(mpf, dbc->thread_info,
+ parent_h, dbc->priority)) != 0)
+ goto err;
+ parent_h = NULL;
+
+ BT_STK_POP(cp);
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, pg, lock_mode, 0, &saved_lock)) != 0)
+ goto err;
+ /*
+ * A very strange case: if this page was
+ * freed while we waited, then we cannot hold
+ * the lock on it while we re-get the root
+ * latch, because allocation is one place
+ * we lock while holding a latch.
+ * No one can have a free page locked, so
+ * check for that case. We do this by
+ * checking the level, since it will be 0
+ * if free and we might as well see if this
+ * page moved and drop the lock in that case.
+ */
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info,
+ dbc->txn, get_mode, &h)) != 0 &&
+ ret != DB_PAGE_NOTFOUND)
+ goto err;
+
+ if (ret != 0 || LEVEL(h) != level - 1) {
+ ret = __LPUT(dbc, saved_lock);
+ if (ret != 0)
+ goto err;
+ pg = root_pgno;
+ saved_level = MAXBTREELEVEL;
+ }
+ if (h != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ if (was_next) {
+ LF_CLR(SR_MIN);
+ LF_SET(SR_NEXT);
+ }
+ /*
+ * We have the lock but we dropped the
+ * latch so we need to search again. If
+ * we get back to the same page then all
+ * is good, otherwise we need to try to
+ * lock the new page.
+ */
+ saved_pg = pg;
+ saved_level = level - 1;
+ goto retry;
+ }
+skip_lock: stack = set_stack;
}
+ /* Get the child page. */
+ if ((ret = __memp_fget(mpf, &pg,
+ dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+ goto err;
+ /* Release the parent. */
+ if (parent_h != NULL && (ret = __memp_fput(mpf,
+ dbc->thread_info, parent_h, dbc->priority)) != 0)
+ goto err;
+ parent_h = NULL;
+ }
+ /* NOTREACHED */
+
+found: *exactp = 1;
+
+ /*
+ * If we got here, we know that we have a Btree leaf or off-page
+ * duplicates page. If it's a Btree leaf page, we have to handle
+ * on-page duplicates.
+ *
+ * If there are duplicates, go to the first/last one. This is
+ * safe because we know that we're not going to leave the page,
+ * all duplicate sets that are not on overflow pages exist on a
+ * single leaf page.
+ */
+ if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) {
+ if (LF_ISSET(SR_DUPLAST))
+ while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
+ inp[indx] == inp[indx + P_INDX])
+ indx += P_INDX;
+ else if (LF_ISSET(SR_DUPFIRST))
+ while (indx > 0 &&
+ inp[indx] == inp[indx - P_INDX])
+ indx -= P_INDX;
+ }
+
+ /*
+ * Now check if we are allowed to return deleted items; if not, then
+ * find the next (or previous) non-deleted duplicate entry. (We do
+ * not move from the original found key on the basis of the SR_DELNO
+ * flag.)
+ */
+ DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO));
+ if (LF_ISSET(SR_DELNO)) {
+ deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
+ if (LF_ISSET(SR_DUPLAST))
+ while (B_DISSET(GET_BKEYDATA(dbp,
+ h, indx + deloffset)->type) && indx > 0 &&
+ inp[indx] == inp[indx - adjust])
+ indx -= adjust;
+ else
+ while (B_DISSET(GET_BKEYDATA(dbp,
+ h, indx + deloffset)->type) &&
+ indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
+ inp[indx] == inp[indx + adjust])
+ indx += adjust;
/*
- * No match found. Base is the smallest index greater than
- * key and may be zero or a last + 1 index. If it's non-zero,
- * decrement by one, and record the internal page which should
- * be a parent page for the key. If a split later occurs, the
- * inserted page will be to the right of the saved page.
+ * If we weren't able to find a non-deleted duplicate, return
+ * DB_NOTFOUND.
+ */
+ if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /*
+ * Increment the record counter to point to the found element.
+ * Ignore any deleted key/data pairs. There doesn't need to
+ * be any correction for duplicates, as Btree doesn't support
+ * duplicates and record numbers in the same tree.
*/
- index = base ? base - 1 : base;
+ if (recnop != NULL) {
+ DB_ASSERT(env, TYPE(h) == P_LBTREE);
-next: BT_PUSH(t, h->pgno, index);
- pg = GETBINTERNAL(h, index)->pgno;
- mpool_put(t->bt_mp, h, 0);
+ for (i = 0; i < indx; i += P_INDX)
+ if (!B_DISSET(
+ GET_BKEYDATA(dbp, h, i + O_INDX)->type))
+ ++recno;
+
+ /* Correct the number for a 0-base. */
+ *recnop = recno + 1;
+ }
}
+
+ if (LF_ISSET(SR_STK_ONLY)) {
+ BT_STK_NUM(env, cp, h, indx, ret);
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ } else {
+ if (LF_ISSET(SR_DEL) && cp->csp == cp->sp)
+ cp->csp++;
+ BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+ }
+ if (ret != 0)
+ goto err;
+
+ cp->csp->lock = lock;
+ DB_ASSERT(env, parent_h == NULL);
+
+done: if ((ret = __LPUT(dbc, saved_lock)) != 0)
+ return (ret);
+
+ return (0);
+
+err: if (ret == 0)
+ ret = t_ret;
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (parent_h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, parent_h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Keep any not-found page locked for serializability. */
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ (void)__LPUT(dbc, saved_lock);
+
+ BT_STK_POP(cp);
+ (void)__bam_stkrel(dbc, 0);
+
+ return (ret);
}
/*
- * __bt_snext --
- * Check for an exact match after the key.
- *
- * Parameters:
- * t: tree
- * h: current page
- * key: key
- * exactp: pointer to exact match flag
+ * __bam_stkrel --
+ * Release all pages currently held in the stack.
*
- * Returns:
- * If an exact match found.
+ * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
*/
-static int
-__bt_snext(t, h, key, exactp)
- BTREE *t;
- PAGE *h;
- const DBT *key;
- int *exactp;
+int
+__bam_stkrel(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
{
- EPG e;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ EPG *epg;
+ int ret, t_ret;
+
+ DB_ASSERT(NULL, dbc != NULL);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ cp = (BTREE_CURSOR *)dbc->internal;
/*
- * Get the next page. The key is either an exact
- * match, or not as good as the one we already have.
+ * Release inner pages first.
+ *
+ * The caller must be sure that setting STK_NOLOCK will not affect
+ * either serializability or recoverability.
*/
- if ((e.page = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL)
- return (0);
- e.index = 0;
- if (__bt_cmp(t, key, &e) == 0) {
- mpool_put(t->bt_mp, h, 0);
- t->bt_cur = e;
- *exactp = 1;
- return (1);
+ for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
+ if (epg->page != NULL) {
+ if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
+ cp->page = NULL;
+ LOCK_INIT(cp->lock);
+ }
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ epg->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ epg->page = NULL;
+ }
+ /*
+ * We set this if we need to release our pins,
+ * but are not logically ready to have the pages
+ * visible.
+ */
+ if (LF_ISSET(STK_PGONLY))
+ continue;
+ if (LF_ISSET(STK_NOLOCK)) {
+ if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ } else
+ if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0)
+ ret = t_ret;
}
- mpool_put(t->bt_mp, e.page, 0);
- return (0);
+
+ /* Clear the stack, all pages have been released. */
+ if (!LF_ISSET(STK_PGONLY))
+ BT_STK_CLR(cp);
+
+ return (ret);
}
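+
+/*
+ * For illustration, a minimal sketch of the usual pairing: a search
+ * leaves a pinned, locked stack behind, and the caller must release
+ * it exactly once, choosing the flags by context:
+ *
+ *     if ((ret = __bam_search(dbc, pgno,
+ *         &key, flags, 1, NULL, &exact)) == 0) {
+ *             ...examine cp->csp->page...
+ *             ret = __bam_stkrel(dbc, STK_NOLOCK);
+ *     }
+ *
+ * STK_NOLOCK here assumes the caller made no changes that need the
+ * locks to persist; transactional callers pass 0 so the locks follow
+ * the __TLPUT transactional rules instead.
+ */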
/*
- * __bt_sprev --
- * Check for an exact match before the key.
- *
- * Parameters:
- * t: tree
- * h: current page
- * key: key
- * exactp: pointer to exact match flag
+ * __bam_stkgrow --
+ * Grow the stack.
*
- * Returns:
- * If an exact match found.
+ * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *));
*/
-static int
-__bt_sprev(t, h, key, exactp)
- BTREE *t;
- PAGE *h;
- const DBT *key;
- int *exactp;
+int
+__bam_stkgrow(env, cp)
+ ENV *env;
+ BTREE_CURSOR *cp;
{
- EPG e;
+ EPG *p;
+ size_t entries;
+ int ret;
- /*
- * Get the previous page. The key is either an exact
- * match, or not as good as the one we already have.
- */
- if ((e.page = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL)
- return (0);
- e.index = NEXTINDEX(e.page) - 1;
- if (__bt_cmp(t, key, &e) == 0) {
- mpool_put(t->bt_mp, h, 0);
- t->bt_cur = e;
- *exactp = 1;
- return (1);
- }
- mpool_put(t->bt_mp, e.page, 0);
+ entries = cp->esp - cp->sp;
+
+ if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0)
+ return (ret);
+ memcpy(p, cp->sp, entries * sizeof(EPG));
+ if (cp->sp != cp->stack)
+ __os_free(env, cp->sp);
+ cp->sp = p;
+ cp->csp = p + entries;
+ cp->esp = p + entries * 2;
return (0);
}
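+
+/*
+ * For illustration, the stack-push macros in btree.h grow the stack
+ * on demand; roughly:
+ *
+ *     if (cp->csp == cp->esp &&
+ *         (ret = __bam_stkgrow(env, cp)) != 0)
+ *             return (ret);
+ *     cp->csp->page = h;
+ *     ...
+ *
+ * Doubling the allocation keeps the amortized cost low for deep trees.
+ */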
diff --git a/btree/bt_seq.c b/btree/bt_seq.c
deleted file mode 100644
index 303b481..0000000
--- a/btree/bt_seq.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_seq.c 8.7 (Berkeley) 7/20/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
-
-#include <errno.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <db.h>
-#include "btree.h"
-
-static int __bt_first __P((BTREE *, const DBT *, EPG *, int *));
-static int __bt_seqadv __P((BTREE *, EPG *, int));
-static int __bt_seqset __P((BTREE *, EPG *, DBT *, int));
-
-/*
- * Sequential scan support.
- *
- * The tree can be scanned sequentially, starting from either end of the
- * tree or from any specific key. A scan request before any scanning is
- * done is initialized as starting from the least node.
- */
-
-/*
- * __bt_seq --
- * Btree sequential scan interface.
- *
- * Parameters:
- * dbp: pointer to access method
- * key: key for positioning and return value
- * data: data return value
- * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV.
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
- */
-int
-__bt_seq(dbp, key, data, flags)
- const DB *dbp;
- DBT *key, *data;
- u_int flags;
-{
- BTREE *t;
- EPG e;
- int status;
-
- t = dbp->internal;
-
- /* Toss any page pinned across calls. */
- if (t->bt_pinned != NULL) {
- mpool_put(t->bt_mp, t->bt_pinned, 0);
- t->bt_pinned = NULL;
- }
-
- /*
- * If scan unitialized as yet, or starting at a specific record, set
- * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin
- * the page the cursor references if they're successful.
- */
- switch (flags) {
- case R_NEXT:
- case R_PREV:
- if (F_ISSET(&t->bt_cursor, CURS_INIT)) {
- status = __bt_seqadv(t, &e, flags);
- break;
- }
- /* FALLTHROUGH */
- case R_FIRST:
- case R_LAST:
- case R_CURSOR:
- status = __bt_seqset(t, &e, key, flags);
- break;
- default:
- errno = EINVAL;
- return (RET_ERROR);
- }
-
- if (status == RET_SUCCESS) {
- __bt_setcur(t, e.page->pgno, e.index);
-
- status =
- __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0);
-
- /*
- * If the user is doing concurrent access, we copied the
- * key/data, toss the page.
- */
- if (F_ISSET(t, B_DB_LOCK))
- mpool_put(t->bt_mp, e.page, 0);
- else
- t->bt_pinned = e.page;
- }
- return (status);
-}
-
-/*
- * __bt_seqset --
- * Set the sequential scan to a specific key.
- *
- * Parameters:
- * t: tree
- * ep: storage for returned key
- * key: key for initial scan position
- * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV
- *
- * Side effects:
- * Pins the page the cursor references.
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
- */
-static int
-__bt_seqset(t, ep, key, flags)
- BTREE *t;
- EPG *ep;
- DBT *key;
- int flags;
-{
- PAGE *h;
- pgno_t pg;
- int exact;
-
- /*
- * Find the first, last or specific key in the tree and point the
- * cursor at it. The cursor may not be moved until a new key has
- * been found.
- */
- switch (flags) {
- case R_CURSOR: /* Keyed scan. */
- /*
- * Find the first instance of the key or the smallest key
- * which is greater than or equal to the specified key.
- */
- if (key->data == NULL || key->size == 0) {
- errno = EINVAL;
- return (RET_ERROR);
- }
- return (__bt_first(t, key, ep, &exact));
- case R_FIRST: /* First record. */
- case R_NEXT:
- /* Walk down the left-hand side of the tree. */
- for (pg = P_ROOT;;) {
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
-
- /* Check for an empty tree. */
- if (NEXTINDEX(h) == 0) {
- mpool_put(t->bt_mp, h, 0);
- return (RET_SPECIAL);
- }
-
- if (h->flags & (P_BLEAF | P_RLEAF))
- break;
- pg = GETBINTERNAL(h, 0)->pgno;
- mpool_put(t->bt_mp, h, 0);
- }
- ep->page = h;
- ep->index = 0;
- break;
- case R_LAST: /* Last record. */
- case R_PREV:
- /* Walk down the right-hand side of the tree. */
- for (pg = P_ROOT;;) {
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
-
- /* Check for an empty tree. */
- if (NEXTINDEX(h) == 0) {
- mpool_put(t->bt_mp, h, 0);
- return (RET_SPECIAL);
- }
-
- if (h->flags & (P_BLEAF | P_RLEAF))
- break;
- pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno;
- mpool_put(t->bt_mp, h, 0);
- }
-
- ep->page = h;
- ep->index = NEXTINDEX(h) - 1;
- break;
- }
- return (RET_SUCCESS);
-}
-
-/*
- * __bt_seqadvance --
- * Advance the sequential scan.
- *
- * Parameters:
- * t: tree
- * flags: R_NEXT, R_PREV
- *
- * Side effects:
- * Pins the page the new key/data record is on.
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key.
- */
-static int
-__bt_seqadv(t, ep, flags)
- BTREE *t;
- EPG *ep;
- int flags;
-{
- CURSOR *c;
- PAGE *h;
- indx_t index;
- pgno_t pg;
- int exact;
-
- /*
- * There are a couple of states that we can be in. The cursor has
- * been initialized by the time we get here, but that's all we know.
- */
- c = &t->bt_cursor;
-
- /*
- * The cursor was deleted where there weren't any duplicate records,
- * so the key was saved. Find out where that key would go in the
- * current tree. It doesn't matter if the returned key is an exact
- * match or not -- if it's an exact match, the record was added after
- * the delete so we can just return it. If not, as long as there's
- * a record there, return it.
- */
- if (F_ISSET(c, CURS_ACQUIRE))
- return (__bt_first(t, &c->key, ep, &exact));
-
- /* Get the page referenced by the cursor. */
- if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL)
- return (RET_ERROR);
-
- /*
- * Find the next/previous record in the tree and point the cursor at
- * it. The cursor may not be moved until a new key has been found.
- */
- switch (flags) {
- case R_NEXT: /* Next record. */
- /*
- * The cursor was deleted in duplicate records, and moved
- * forward to a record that has yet to be returned. Clear
- * that flag, and return the record.
- */
- if (F_ISSET(c, CURS_AFTER))
- goto usecurrent;
- index = c->pg.index;
- if (++index == NEXTINDEX(h)) {
- pg = h->nextpg;
- mpool_put(t->bt_mp, h, 0);
- if (pg == P_INVALID)
- return (RET_SPECIAL);
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
- index = 0;
- }
- break;
- case R_PREV: /* Previous record. */
- /*
- * The cursor was deleted in duplicate records, and moved
- * backward to a record that has yet to be returned. Clear
- * that flag, and return the record.
- */
- if (F_ISSET(c, CURS_BEFORE)) {
-usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE);
- ep->page = h;
- ep->index = c->pg.index;
- return (RET_SUCCESS);
- }
- index = c->pg.index;
- if (index == 0) {
- pg = h->prevpg;
- mpool_put(t->bt_mp, h, 0);
- if (pg == P_INVALID)
- return (RET_SPECIAL);
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
- index = NEXTINDEX(h) - 1;
- } else
- --index;
- break;
- }
-
- ep->page = h;
- ep->index = index;
- return (RET_SUCCESS);
-}
-
-/*
- * __bt_first --
- * Find the first entry.
- *
- * Parameters:
- * t: the tree
- * key: the key
- * erval: return EPG
- * exactp: pointer to exact match flag
- *
- * Returns:
- * The first entry in the tree greater than or equal to key,
- * or RET_SPECIAL if no such key exists.
- */
-static int
-__bt_first(t, key, erval, exactp)
- BTREE *t;
- const DBT *key;
- EPG *erval;
- int *exactp;
-{
- PAGE *h;
- EPG *ep, save;
- pgno_t pg;
-
- /*
- * Find any matching record; __bt_search pins the page.
- *
- * If it's an exact match and duplicates are possible, walk backwards
- * in the tree until we find the first one. Otherwise, make sure it's
- * a valid key (__bt_search may return an index just past the end of a
- * page) and return it.
- */
- if ((ep = __bt_search(t, key, exactp)) == NULL)
- return (NULL);
- if (*exactp) {
- if (F_ISSET(t, B_NODUPS)) {
- *erval = *ep;
- return (RET_SUCCESS);
- }
-
- /*
- * Walk backwards, as long as the entry matches and there are
- * keys left in the tree. Save a copy of each match in case
- * we go too far.
- */
- save = *ep;
- h = ep->page;
- do {
- if (save.page->pgno != ep->page->pgno) {
- mpool_put(t->bt_mp, save.page, 0);
- save = *ep;
- } else
- save.index = ep->index;
-
- /*
- * Don't unpin the page the last (or original) match
- * was on, but make sure it's unpinned if an error
- * occurs.
- */
- if (ep->index == 0) {
- if (h->prevpg == P_INVALID)
- break;
- if (h->pgno != save.page->pgno)
- mpool_put(t->bt_mp, h, 0);
- if ((h = mpool_get(t->bt_mp,
- h->prevpg, 0)) == NULL) {
- if (h->pgno == save.page->pgno)
- mpool_put(t->bt_mp,
- save.page, 0);
- return (RET_ERROR);
- }
- ep->page = h;
- ep->index = NEXTINDEX(h);
- }
- --ep->index;
- } while (__bt_cmp(t, key, ep) == 0);
-
- /*
- * Reach here with the last page that was looked at pinned,
- * which may or may not be the same as the last (or original)
- * match page. If it's not useful, release it.
- */
- if (h->pgno != save.page->pgno)
- mpool_put(t->bt_mp, h, 0);
-
- *erval = save;
- return (RET_SUCCESS);
- }
-
- /* If at the end of a page, find the next entry. */
- if (ep->index == NEXTINDEX(ep->page)) {
- h = ep->page;
- pg = h->nextpg;
- mpool_put(t->bt_mp, h, 0);
- if (pg == P_INVALID)
- return (RET_SPECIAL);
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
- ep->index = 0;
- ep->page = h;
- }
- *erval = *ep;
- return (RET_SUCCESS);
-}
-
-/*
- * __bt_setcur --
- * Set the cursor to an entry in the tree.
- *
- * Parameters:
- * t: the tree
- * pgno: page number
- * index: page index
- */
-void
-__bt_setcur(t, pgno, index)
- BTREE *t;
- pgno_t pgno;
- u_int index;
-{
- /* Lose any already deleted key. */
- if (t->bt_cursor.key.data != NULL) {
- free(t->bt_cursor.key.data);
- t->bt_cursor.key.size = 0;
- t->bt_cursor.key.data = NULL;
- }
- F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE);
-
- /* Update the cursor. */
- t->bt_cursor.pg.pgno = pgno;
- t->bt_cursor.pg.index = index;
- F_SET(&t->bt_cursor, CURS_INIT);
-}
diff --git a/btree/bt_split.c b/btree/bt_split.c
index 1646d82..fcf9aab 100644
--- a/btree/bt_split.c
+++ b/btree/bt_split.c
@@ -1,9 +1,15 @@
/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
+ * See the file LICENSE for redistribution information.
*
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -13,11 +19,7 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
@@ -32,796 +34,1277 @@
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
+ *
+ * $Id$
*/
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
+#include "db_config.h"
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/btree.h"
-#include <db.h>
-#include "btree.h"
-
-static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *));
-static PAGE *bt_page
- __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
-static int bt_preserve __P((BTREE *, pgno_t));
-static PAGE *bt_psplit
- __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t));
-static PAGE *bt_root
- __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t));
-static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *));
-static recno_t rec_total __P((PAGE *));
-
-#ifdef STATISTICS
-u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved;
-#endif
+static int __bam_page __P((DBC *, EPG *, EPG *));
+static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *));
+static int __bam_root __P((DBC *, EPG *));
/*
- * __BT_SPLIT -- Split the tree.
+ * __bam_split --
+ * Split a page.
*
- * Parameters:
- * t: tree
- * sp: page to split
- * key: key to insert
- * data: data to insert
- * flags: BIGKEY/BIGDATA flags
- * ilen: insert length
- * skip: index to leave open
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
+ * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *));
*/
int
-__bt_split(t, sp, key, data, flags, ilen, argskip)
- BTREE *t;
- PAGE *sp;
- const DBT *key, *data;
- int flags;
- size_t ilen;
- u_int32_t argskip;
+__bam_split(dbc, arg, root_pgnop)
+ DBC *dbc;
+ void *arg;
+ db_pgno_t *root_pgnop;
{
- BINTERNAL *bi;
- BLEAF *bl, *tbl;
- DBT a, b;
- EPGNO *parent;
- PAGE *h, *l, *r, *lchild, *rchild;
- indx_t nxtindex;
- u_int16_t skip;
- u_int32_t n, nbytes, nksize;
- int parentsplit;
- char *dest;
+ BTREE_CURSOR *cp;
+ DB_LOCK metalock, next_lock;
+ enum { UP, DOWN } dir;
+ db_pgno_t pgno, next_pgno, root_pgno;
+ int exact, level, ret;
- /*
- * Split the page into two pages, l and r. The split routines return
- * a pointer to the page into which the key should be inserted and with
- * skip set to the offset which should be used. Additionally, l and r
- * are pinned.
- */
- skip = argskip;
- h = sp->pgno == P_ROOT ?
- bt_root(t, sp, &l, &r, &skip, ilen) :
- bt_page(t, sp, &l, &r, &skip, ilen);
- if (h == NULL)
- return (RET_ERROR);
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+ LOCK_INIT(next_lock);
+ next_pgno = PGNO_INVALID;
/*
- * Insert the new key/data pair into the leaf page. (Key inserts
- * always cause a leaf page to split first.)
+ * First get a lock on the metadata page, we will have to allocate
+ * pages and cannot get a lock while we have the search tree pinned.
*/
- h->linp[skip] = h->upper -= ilen;
- dest = (char *)h + h->upper;
- if (F_ISSET(t, R_RECNO))
- WR_RLEAF(dest, data, flags)
- else
- WR_BLEAF(dest, key, data, flags)
- /* If the root page was split, make it look right. */
- if (sp->pgno == P_ROOT &&
- (F_ISSET(t, R_RECNO) ?
- bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
- goto err2;
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
/*
- * Now we walk the parent page stack -- a LIFO stack of the pages that
- * were traversed when we searched for the page that split. Each stack
- * entry is a page number and a page index offset. The offset is for
- * the page traversed on the search. We've just split a page, so we
- * have to insert a new key into the parent page.
+ * The locking protocol we use to avoid deadlock is to acquire locks by
+ * walking down the tree, but we do it as lazily as possible, locking
+ * the root only as a last resort. We expect all stack pages to have
+ * been discarded before we're called; we discard all short-term locks.
*
- * If the insert into the parent page causes it to split, may have to
- * continue splitting all the way up the tree. We stop if the root
- * splits or the page inserted into didn't have to split to hold the
- * new key. Some algorithms replace the key for the old page as well
- * as the new page. We don't, as there's no reason to believe that the
- * first key on the old page is any better than the key we have, and,
- * in the case of a key being placed at index 0 causing the split, the
- * key is unavailable.
+ * When __bam_split is first called, we know that a leaf page was too
+ * full for an insert. We don't know what leaf page it was, but we
+ * have the key/recno that caused the problem. We call XX_search to
+ * reacquire the leaf page, but this time get both the leaf page and
+ * its parent, locked. We then split the leaf page and see if the new
+ * internal key will fit into the parent page. If it will, we're done.
+ *
+ * If it won't, we discard our current locks and repeat the process,
+ * only this time acquiring the parent page and its parent, locked.
+ * This process repeats until we succeed in the split, splitting the
+ * root page as the final resort. The entire process then repeats,
+ * as necessary, until we split a leaf page.
*
- * There are a maximum of 5 pages pinned at any time. We keep the left
- * and right pages pinned while working on the parent. The 5 are the
- * two children, left parent and right parent (when the parent splits)
- * and the root page or the overflow key page when calling bt_preserve.
- * This code must make sure that all pins are released other than the
- * root page or overflow page which is unlocked elsewhere.
+ * XXX
+ * A traditional method of speeding this up is to maintain a stack of
+ * the pages traversed in the original search. You can detect if the
+ * stack is correct by storing the page's LSN when it was searched and
+ * comparing that LSN with the current one when it's locked during the
+ * split. This would be an easy change for this code, but I have no
+ * numbers that indicate it's worthwhile.
*/
- while ((parent = BT_POP(t)) != NULL) {
- lchild = l;
- rchild = r;
-
- /* Get the parent page. */
- if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL)
- goto err2;
-
- /*
- * The new key goes ONE AFTER the index, because the split
- * was to the right.
+ for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) {
+ /*
+ * Acquire a page and its parent, locked.
*/
- skip = parent->index + 1;
+retry: if ((ret = (dbc->dbtype == DB_BTREE ?
+ __bam_search(dbc, PGNO_INVALID,
+ arg, SR_WRPAIR, level, NULL, &exact) :
+ __bam_rsearch(dbc,
+ (db_recno_t *)arg, SR_WRPAIR, level, &exact))) != 0)
+ break;
+
+ if (cp->csp[0].page->pgno == root_pgno) {
+ /* We can overshoot the top of the tree. */
+ level = cp->csp[0].page->level;
+ if (root_pgnop != NULL)
+ *root_pgnop = root_pgno;
+ } else if (root_pgnop != NULL)
+ *root_pgnop = cp->csp[-1].page->pgno;
/*
- * Calculate the space needed on the parent page.
- *
- * Prefix trees: space hack when inserting into BINTERNAL
- * pages. Retain only what's needed to distinguish between
- * the new entry and the LAST entry on the page to its left.
- * If the keys compare equal, retain the entire key. Note,
- * we don't touch overflow keys, and the entire key must be
- * retained for the next-to-left most key on the leftmost
- * page of each level, or the search will fail. Applicable
- * ONLY to internal pages that have leaf pages as children.
- * Further reduction of the key between pairs of internal
- * pages loses too much information.
+ * Split the page if it still needs it (it's possible another
+ * thread of control has already split the page). If we are
+ * guaranteed that two items will fit on the page, the split
+ * is no longer necessary.
*/
- switch (rchild->flags & P_TYPE) {
- case P_BINTERNAL:
- bi = GETBINTERNAL(rchild, 0);
- nbytes = NBINTERNAL(bi->ksize);
- break;
- case P_BLEAF:
- bl = GETBLEAF(rchild, 0);
- nbytes = NBINTERNAL(bl->ksize);
- if (t->bt_pfx && !(bl->flags & P_BIGKEY) &&
- (h->prevpg != P_INVALID || skip > 1)) {
- tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1);
- a.size = tbl->ksize;
- a.data = tbl->bytes;
- b.size = bl->ksize;
- b.data = bl->bytes;
- nksize = t->bt_pfx(&a, &b);
- n = NBINTERNAL(nksize);
- if (n < nbytes) {
-#ifdef STATISTICS
- bt_pfxsaved += nbytes - n;
-#endif
- nbytes = n;
- } else
- nksize = 0;
- } else
- nksize = 0;
- break;
- case P_RINTERNAL:
- case P_RLEAF:
- nbytes = NRINTERNAL;
- break;
- default:
- abort();
+ if (2 * B_MAXSIZEONPAGE(cp->ovflsize)
+ <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) {
+ if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+ goto err;
+ goto no_split;
}
- /* Split the parent page if necessary or shift the indices. */
- if (h->upper - h->lower < nbytes + sizeof(indx_t)) {
- sp = h;
- h = h->pgno == P_ROOT ?
- bt_root(t, h, &l, &r, &skip, nbytes) :
- bt_page(t, h, &l, &r, &skip, nbytes);
- if (h == NULL)
- goto err1;
- parentsplit = 1;
- } else {
- if (skip < (nxtindex = NEXTINDEX(h)))
- memmove(h->linp + skip + 1, h->linp + skip,
- (nxtindex - skip) * sizeof(indx_t));
- h->lower += sizeof(indx_t);
- parentsplit = 0;
+ /*
+ * We need to try to lock the next page so we can update
+ * its PREV.
+ */
+ if (dbc->dbtype == DB_BTREE && ISLEAF(cp->csp->page) &&
+ (pgno = NEXT_PGNO(cp->csp->page)) != PGNO_INVALID) {
+ TRY_LOCK(dbc, pgno,
+ next_pgno, next_lock, DB_LOCK_WRITE, retry);
+ if (ret != 0)
+ goto err;
}
-
- /* Insert the key into the parent page. */
- switch (rchild->flags & P_TYPE) {
- case P_BINTERNAL:
- h->linp[skip] = h->upper -= nbytes;
- dest = (char *)h + h->linp[skip];
- memmove(dest, bi, nbytes);
- ((BINTERNAL *)dest)->pgno = rchild->pgno;
- break;
- case P_BLEAF:
- h->linp[skip] = h->upper -= nbytes;
- dest = (char *)h + h->linp[skip];
- WR_BINTERNAL(dest, nksize ? nksize : bl->ksize,
- rchild->pgno, bl->flags & P_BIGKEY);
- memmove(dest, bl->bytes, nksize ? nksize : bl->ksize);
- if (bl->flags & P_BIGKEY &&
- bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
- goto err1;
- break;
- case P_RINTERNAL:
- /*
- * Update the left page count. If split
- * added at index 0, fix the correct page.
- */
- if (skip > 0)
- dest = (char *)h + h->linp[skip - 1];
- else
- dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
- ((RINTERNAL *)dest)->nrecs = rec_total(lchild);
- ((RINTERNAL *)dest)->pgno = lchild->pgno;
-
- /* Update the right page count. */
- h->linp[skip] = h->upper -= nbytes;
- dest = (char *)h + h->linp[skip];
- ((RINTERNAL *)dest)->nrecs = rec_total(rchild);
- ((RINTERNAL *)dest)->pgno = rchild->pgno;
+ ret = cp->csp[0].page->pgno == root_pgno ?
+ __bam_root(dbc, &cp->csp[0]) :
+ __bam_page(dbc, &cp->csp[-1], &cp->csp[0]);
+ BT_STK_CLR(cp);
+
+ switch (ret) {
+ case 0:
+no_split: /* Once we've split the leaf page, we're done. */
+ if (level == LEAFLEVEL)
+ goto done;
+
+ /* Switch directions. */
+ if (dir == UP)
+ dir = DOWN;
break;
- case P_RLEAF:
+ case DB_NEEDSPLIT:
/*
- * Update the left page count. If split
- * added at index 0, fix the correct page.
+ * It's possible to fail to split repeatedly, as other
+ * threads may be modifying the tree, or the page usage
+ * is sufficiently bad that we don't get enough space
+ * the first time.
*/
- if (skip > 0)
- dest = (char *)h + h->linp[skip - 1];
- else
- dest = (char *)l + l->linp[NEXTINDEX(l) - 1];
- ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild);
- ((RINTERNAL *)dest)->pgno = lchild->pgno;
-
- /* Update the right page count. */
- h->linp[skip] = h->upper -= nbytes;
- dest = (char *)h + h->linp[skip];
- ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild);
- ((RINTERNAL *)dest)->pgno = rchild->pgno;
+ if (dir == DOWN)
+ dir = UP;
break;
default:
- abort();
+ goto err;
}
+ }
- /* Unpin the held pages. */
- if (!parentsplit) {
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
- break;
- }
+err: if (root_pgnop != NULL)
+ *root_pgnop = cp->root;
+done: (void)__LPUT(dbc, metalock);
+ (void)__TLPUT(dbc, next_lock);
+ return (ret);
+}
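+
+/*
+ * For illustration, the insert path loops around this routine; a
+ * minimal sketch of the convention (the caller's identity is not
+ * shown in this section):
+ *
+ *     for (;;) {
+ *             ret = ...attempt the insert...;
+ *             if (ret != DB_NEEDSPLIT ||
+ *                 (ret = __bam_split(dbc, &key, &root_pgno)) != 0)
+ *                     break;
+ *     }
+ */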
- /* If the root page was split, make it look right. */
- if (sp->pgno == P_ROOT &&
- (F_ISSET(t, R_RECNO) ?
- bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR)
- goto err1;
+/*
+ * __bam_root --
+ * Split the root page of a btree.
+ */
+static int
+__bam_root(dbc, cp)
+ DBC *dbc;
+ EPG *cp;
+{
+ DB *dbp;
+ DBT log_dbt, rootent[2];
+ DB_LOCK llock, rlock;
+ DB_LSN log_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ lp = rp = NULL;
+ LOCK_INIT(llock);
+ LOCK_INIT(rlock);
+ COMPQUIET(log_dbt.data, NULL);
+
+ /* Yeah, right. */
+ if (cp->page->level >= MAXBTREELEVEL) {
+ __db_errx(dbp->env,
+ "Too many btree levels: %d", cp->page->level);
+ return (ENOSPC);
+ }
- mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
- mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
+ if ((ret = __memp_dirty(mpf,
+ &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+ /* Create new left and right pages for the split. */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &llock, &lp)) != 0 ||
+ (ret = __db_new(dbc, TYPE(cp->page), &rlock, &rp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, lp->pgno,
+ PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno,
+ cp->page->level, TYPE(cp->page));
+ P_INIT(rp, dbp->pgsize, rp->pgno,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID,
+ cp->page->level, TYPE(cp->page));
+
+ /* Split the page. */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ if ((ret =
+ __os_malloc(dbp->env, dbp->pgsize, &log_dbt.data)) != 0)
+ goto err;
+ log_dbt.size = dbp->pgsize;
+ memcpy(log_dbt.data, cp->page, dbp->pgsize);
}
- /* Unpin the held pages. */
- mpool_put(t->bt_mp, l, MPOOL_DIRTY);
- mpool_put(t->bt_mp, r, MPOOL_DIRTY);
+ /* Clean up the new root page. */
+ if ((ret = (dbc->dbtype == DB_RECNO ?
+ __ram_root(dbc, cp->page, lp, rp) :
+ __bam_broot(dbc, cp->page, split, lp, rp))) != 0) {
+ if (DBC_LOGGING(dbc))
+ __os_free(dbp->env, log_dbt.data);
+ goto err;
+ }
- /* Clear any pages left on the stack. */
- return (RET_SUCCESS);
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(rootent, 0, sizeof(rootent));
+ rootent[0].data = GET_BINTERNAL(dbp, cp->page, 0);
+ rootent[1].data = GET_BINTERNAL(dbp, cp->page, 1);
+ if (dbc->dbtype == DB_RECNO)
+ rootent[0].size = rootent[1].size = RINTERNAL_SIZE;
+ else {
+ rootent[0].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[0].data)->len);
+ rootent[1].size = BINTERNAL_SIZE(
+ ((BINTERNAL *)rootent[1].data)->len);
+ }
+ ZERO_LSN(log_lsn);
+ opflags = F_ISSET(
+ (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO)
+ opflags |= SPL_RECNO;
+ ret = __bam_split_log(dbp,
+ dbc->txn, &LSN(cp->page), 0, PGNO(lp), &LSN(lp), PGNO(rp),
+ &LSN(rp), (u_int32_t)NUM_ENT(lp), PGNO_INVALID, &log_lsn,
+ dbc->internal->root, &LSN(cp->page), 0,
+ &log_dbt, &rootent[0], &rootent[1], opflags);
+
+ __os_free(dbp->env, log_dbt.data);
+
+ if (ret != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+
+ /* Adjust any cursors. */
+ ret = __bam_ca_split(dbc, cp->page->pgno, lp->pgno, rp->pgno, split, 1);
+
+ /* Success or error: release pages and locks. */
+err: if (cp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ cp->page = NULL;
/*
- * If something fails in the above loop we were already walking back
- * up the tree and the tree is now inconsistent. Nothing much we can
- * do about it but release any memory we're holding.
+ * We are done. Put or downgrade all our locks and release
+ * the pages.
*/
-err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY);
- mpool_put(t->bt_mp, rchild, MPOOL_DIRTY);
-
-err2: mpool_put(t->bt_mp, l, 0);
- mpool_put(t->bt_mp, r, 0);
- __dbpanic(t->bt_dbp);
- return (RET_ERROR);
+ if ((t_ret = __TLPUT(dbc, llock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rlock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, cp->lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (lp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, lp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (rp != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
}
/*
- * BT_PAGE -- Split a non-root page of a btree.
- *
- * Parameters:
- * t: tree
- * h: root page
- * lp: pointer to left page pointer
- * rp: pointer to right page pointer
- * skip: pointer to index to leave open
- * ilen: insert length
- *
- * Returns:
- * Pointer to page in which to insert or NULL on error.
+ * __bam_page --
+ * Split the non-root page of a btree.
*/
-static PAGE *
-bt_page(t, h, lp, rp, skip, ilen)
- BTREE *t;
- PAGE *h, **lp, **rp;
- indx_t *skip;
- size_t ilen;
+static int
+__bam_page(dbc, pp, cp)
+ DBC *dbc;
+ EPG *pp, *cp;
{
- PAGE *l, *r, *tp;
- pgno_t npg;
-
-#ifdef STATISTICS
- ++bt_split;
-#endif
- /* Put the new right page for the split into place. */
- if ((r = __bt_new(t, &npg)) == NULL)
- return (NULL);
- r->pgno = npg;
- r->lower = BTDATAOFF;
- r->upper = t->bt_psize;
- r->nextpg = h->nextpg;
- r->prevpg = h->pgno;
- r->flags = h->flags & P_TYPE;
+ BTREE_CURSOR *bc;
+ DB *dbp;
+ DBT log_dbt, rentry;
+ DB_LOCK rplock;
+ DB_LSN log_lsn;
+ DB_LSN save_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *lp, *rp, *alloc_rp, *tp;
+ db_indx_t split;
+ u_int32_t opflags;
+ int ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ alloc_rp = lp = rp = tp = NULL;
+ LOCK_INIT(rplock);
+ ret = -1;
/*
- * If we're splitting the last page on a level because we're appending
- * a key to it (skip is NEXTINDEX()), it's likely that the data is
- * sorted. Adding an empty page on the side of the level is less work
- * and can push the fill factor much higher than normal. If we're
- * wrong it's no big deal, we'll just do the split the right way next
- * time. It may look like it's equally easy to do a similar hack for
- * reverse sorted data, that is, split the tree left, but it's not.
- * Don't even try.
+ * Create new left page for the split, and fill in everything
+ * except its LSN and next-page page number.
+ *
+ * Create a new right page for the split, and fill in everything
+ * except its LSN and page number.
+ *
+ * We malloc space for both the left and right pages, so we don't get
+ * a new page from the underlying buffer pool until we know the split
+ * is going to succeed. The reason is that we can't release locks
+ * acquired during the get-a-new-page process because metadata page
+ * locks can't be discarded on failure since we may have modified the
+ * free list. So, if you assume that we're holding a write lock on the
+ * leaf page which ran out of space and started this split (e.g., we
+ * have already written records to the page, or we retrieved a record
+ * from it with the DB_RMW flag set), failing in a split with both a
+ * leaf page locked and the metadata page locked can potentially lock
+ * up the tree badly, because we've violated the rule of always locking
+ * down the tree, and never up.
*/
- if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) {
-#ifdef STATISTICS
- ++bt_sortsplit;
-#endif
- h->nextpg = r->pgno;
- r->lower = BTDATAOFF + sizeof(indx_t);
- *skip = 0;
- *lp = h;
- *rp = r;
- return (r);
- }
+ if ((ret = __os_malloc(dbp->env, dbp->pgsize * 2, &lp)) != 0)
+ goto err;
+ P_INIT(lp, dbp->pgsize, PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : 0,
+ cp->page->level, TYPE(cp->page));
+
+ rp = (PAGE *)((u_int8_t *)lp + dbp->pgsize);
+ P_INIT(rp, dbp->pgsize, 0,
+ ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page),
+ ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page),
+ cp->page->level, TYPE(cp->page));
- /* Put the new left page for the split into place. */
- if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) {
- mpool_put(t->bt_mp, r, 0);
- return (NULL);
- }
-#ifdef PURIFY
- memset(l, 0xff, t->bt_psize);
-#endif
- l->pgno = h->pgno;
- l->nextpg = r->pgno;
- l->prevpg = h->prevpg;
- l->lower = BTDATAOFF;
- l->upper = t->bt_psize;
- l->flags = h->flags & P_TYPE;
-
- /* Fix up the previous pointer of the page after the split page. */
- if (h->nextpg != P_INVALID) {
- if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) {
- free(l);
- /* XXX mpool_free(t->bt_mp, r->pgno); */
- return (NULL);
+ /*
+ * Split right.
+ *
+ * Only the indices are sorted on the page, i.e., the key/data pairs
+ * aren't, so it's simpler to copy the data from the split page onto
+ * two new pages instead of copying half the data to a new right page
+ * and compacting the left page in place. Since the left page can't
+ * change, we swap the original and the allocated left page after the
+ * split.
+ */
+ if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+ goto err;
+
+ /*
+ * Test to see if we are going to be able to insert the new pages into
+ * the parent page. The interesting failure here is that the parent
+ * page can't hold the new keys, and has to be split in turn, in which
+ * case we want to release all the locks we can.
+ */
+ if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_SPACEONLY)) != 0)
+ goto err;
+
+ /*
+ * We've got everything locked down we need, and we know the split
+ * is going to succeed. Go and get the additional page we'll need.
+ */
+ if ((ret = __db_new(dbc, TYPE(cp->page), &rplock, &alloc_rp)) != 0)
+ goto err;
+
+ /*
+ * Prepare to fix up the previous pointer of any leaf page following
+ * the split page. Our caller has already write locked the page so
+ * we can get it without deadlocking on the parent latch.
+ */
+ if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID &&
+ (ret = __memp_fget(mpf, &NEXT_PGNO(cp->page),
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &tp)) != 0)
+ goto err;
+
+ /*
+ * Fix up the page numbers we didn't have before. We have to do this
+ * before calling __bam_pinsert because it may copy a page number onto
+ * the parent page and it takes the page number from its page argument.
+ */
+ PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp);
+
+ DB_ASSERT(dbp->env, IS_DIRTY(cp->page));
+ DB_ASSERT(dbp->env, IS_DIRTY(pp->page));
+
+ /* Actually update the parent page. */
+ if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_NOLOGGING)) != 0)
+ goto err;
+
+ bc = (BTREE_CURSOR *)dbc->internal;
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ log_dbt.data = cp->page;
+ log_dbt.size = dbp->pgsize;
+ memset(&rentry, 0, sizeof(rentry));
+ rentry.data = GET_BINTERNAL(dbp, pp->page, pp->indx + 1);
+ opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0;
+ if (dbc->dbtype == DB_RECNO) {
+ opflags |= SPL_RECNO;
+ rentry.size = RINTERNAL_SIZE;
+ } else
+ rentry.size =
+ BINTERNAL_SIZE(((BINTERNAL *)rentry.data)->len);
+ if (tp == NULL)
+ ZERO_LSN(log_lsn);
+ if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0,
+ PGNO(cp->page), &LSN(cp->page), PGNO(alloc_rp),
+ &LSN(alloc_rp), (u_int32_t)NUM_ENT(lp),
+ tp == NULL ? 0 : PGNO(tp), tp == NULL ? &log_lsn : &LSN(tp),
+ PGNO(pp->page), &LSN(pp->page), pp->indx,
+ &log_dbt, NULL, &rentry, opflags)) != 0) {
+ /*
+ * Undo the update to the parent page, which has not
+ * been logged yet. This must succeed.
+ */
+ t_ret = __db_ditem_nolog(dbc, pp->page,
+ pp->indx + 1, rentry.size);
+ DB_ASSERT(dbp->env, t_ret == 0);
+
+ goto err;
}
- tp->prevpg = r->pgno;
- mpool_put(t->bt_mp, tp, MPOOL_DIRTY);
+
+ } else
+ LSN_NOT_LOGGED(LSN(cp->page));
+
+ /* Update the LSNs for all involved pages. */
+ LSN(alloc_rp) = LSN(cp->page);
+ LSN(lp) = LSN(cp->page);
+ LSN(rp) = LSN(cp->page);
+ LSN(pp->page) = LSN(cp->page);
+ if (tp != NULL) {
+ /* Log record has been written; now it is safe to update the next page. */
+ PREV_PGNO(tp) = PGNO(rp);
+ LSN(tp) = LSN(cp->page);
}
/*
- * Split right. The key/data pairs aren't sorted in the btree page so
- * it's simpler to copy the data from the split page onto two new pages
- * instead of copying half the data to the right page and compacting
- * the left page in place. Since the left page can't change, we have
- * to swap the original and the allocated left page after the split.
+ * Copy the left and right pages into place. There are two paths
+ * through here. Either we are logging, in which case we set the LSNs
+ * in the logging path above, or we are not logging, and we do not
+ * have valid LSNs on lp or rp. The correct LSNs to use are the
+ * ones on the page we got from __db_new or the one that was
+ * originally on cp->page. In both cases, we save the LSN from the
+ * real database page (not a malloc'd one) and reapply it after we
+ * do the copy.
*/
- tp = bt_psplit(t, h, l, r, skip, ilen);
+ save_lsn = alloc_rp->lsn;
+ memcpy(alloc_rp, rp, LOFFSET(dbp, rp));
+ memcpy((u_int8_t *)alloc_rp + HOFFSET(rp),
+ (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp));
+ alloc_rp->lsn = save_lsn;
- /* Move the new left page onto the old left page. */
- memmove(h, l, t->bt_psize);
- if (tp == l)
- tp = h;
- free(l);
+ save_lsn = cp->page->lsn;
+ memcpy(cp->page, lp, LOFFSET(dbp, lp));
+ memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+ (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+ cp->page->lsn = save_lsn;
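
A minimal sketch of the save/reapply pattern used above, with a hypothetical
simplified page header rather than the real db_page.h layout:

	/* Hypothetical, simplified page header -- for illustration only. */
	struct xpage {
		unsigned long lsn;	/* LSN of the last change to the page */
		unsigned char body[4096 - sizeof(unsigned long)];
	};

	/* Overwrite a real page with scratch content, keeping its LSN. */
	static void
	copy_preserving_lsn(struct xpage *real, const struct xpage *scratch)
	{
		unsigned long save_lsn;

		save_lsn = real->lsn;	/* set earlier by the logging path */
		*real = *scratch;	/* clobber the whole page ... */
		real->lsn = save_lsn;	/* ... then reapply the saved LSN */
	}
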
- *lp = h;
- *rp = r;
- return (tp);
-}
+ /* Adjust any cursors. */
+ if ((ret = __bam_ca_split(dbc,
+ PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0)
+ goto err;
-/*
- * BT_ROOT -- Split the root page of a btree.
- *
- * Parameters:
- * t: tree
- * h: root page
- * lp: pointer to left page pointer
- * rp: pointer to right page pointer
- * skip: pointer to index to leave open
- * ilen: insert length
- *
- * Returns:
- * Pointer to page in which to insert or NULL on error.
- */
-static PAGE *
-bt_root(t, h, lp, rp, skip, ilen)
- BTREE *t;
- PAGE *h, **lp, **rp;
- indx_t *skip;
- size_t ilen;
-{
- PAGE *l, *r, *tp;
- pgno_t lnpg, rnpg;
-
-#ifdef STATISTICS
- ++bt_split;
- ++bt_rootsplit;
-#endif
- /* Put the new left and right pages for the split into place. */
- if ((l = __bt_new(t, &lnpg)) == NULL ||
- (r = __bt_new(t, &rnpg)) == NULL)
- return (NULL);
- l->pgno = lnpg;
- r->pgno = rnpg;
- l->nextpg = r->pgno;
- r->prevpg = l->pgno;
- l->prevpg = r->nextpg = P_INVALID;
- l->lower = r->lower = BTDATAOFF;
- l->upper = r->upper = t->bt_psize;
- l->flags = r->flags = h->flags & P_TYPE;
-
- /* Split the root page. */
- tp = bt_psplit(t, h, l, r, skip, ilen);
-
- *lp = l;
- *rp = r;
- return (tp);
-}
+ __os_free(dbp->env, lp);
-/*
- * BT_RROOT -- Fix up the recno root page after it has been split.
- *
- * Parameters:
- * t: tree
- * h: root page
- * l: left page
- * r: right page
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
- */
-static int
-bt_rroot(t, h, l, r)
- BTREE *t;
- PAGE *h, *l, *r;
-{
- char *dest;
+ /*
+ * Success -- write the real pages back to the store.
+ */
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, rplock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (tp != NULL) {
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, tp, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+ if ((t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+
+err: if (lp != NULL)
+ __os_free(dbp->env, lp);
+ if (alloc_rp != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, alloc_rp, dbc->priority);
+ if (tp != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, tp, dbc->priority);
+
+ if (pp->page != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, pp->page, dbc->priority);
+
+ if (ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, pp->lock);
+ else
+ (void)__TLPUT(dbc, pp->lock);
- /* Insert the left and right keys, set the header information. */
- h->linp[0] = h->upper = t->bt_psize - NRINTERNAL;
- dest = (char *)h + h->upper;
- WR_RINTERNAL(dest,
- l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno);
+ (void)__memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority);
- h->linp[1] = h->upper -= NRINTERNAL;
- dest = (char *)h + h->upper;
- WR_RINTERNAL(dest,
- r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno);
+ /*
+	 * We don't drop the left and right page locks.  If we are doing
+	 * dirty reads then we need to hold the locks until we abort the
+	 * transaction.  If we are not transactional, we are hosed anyway
+	 * as the tree is trashed.  It may be better not to leak the locks.
+ */
- h->lower = BTDATAOFF + 2 * sizeof(indx_t);
+ if (dbc->txn == NULL)
+ (void)__LPUT(dbc, rplock);
- /* Unpin the root page, set to recno internal page. */
- h->flags &= ~P_TYPE;
- h->flags |= P_RINTERNAL;
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+ if (dbc->txn == NULL || ret == DB_NEEDSPLIT)
+ (void)__LPUT(dbc, cp->lock);
- return (RET_SUCCESS);
+ return (ret);
}
/*
- * BT_BROOT -- Fix up the btree root page after it has been split.
- *
- * Parameters:
- * t: tree
- * h: root page
- * l: left page
- * r: right page
- *
- * Returns:
- * RET_ERROR, RET_SUCCESS
+ * __bam_broot --
+ * Fix up the btree root page after it has been split.
+ * PUBLIC: int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *));
*/
-static int
-bt_broot(t, h, l, r)
- BTREE *t;
- PAGE *h, *l, *r;
+int
+__bam_broot(dbc, rootp, split, lp, rp)
+ DBC *dbc;
+ u_int32_t split;
+ PAGE *rootp, *lp, *rp;
{
- BINTERNAL *bi;
- BLEAF *bl;
- u_int32_t nbytes;
- char *dest;
-
+ BINTERNAL bi, bi0, *child_bi;
+ BKEYDATA *child_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT hdr, hdr0, data;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ child_bo = NULL;
+ data.data = NULL;
+ memset(&bi, 0, sizeof(bi));
+
+ switch (TYPE(rootp)) {
+ case P_IBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bi = GET_BINTERNAL(dbp, rootp, split);
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bi->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bi->data, child_bi->len);
+ data.size = child_bi->len;
+ break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ /* Copy the first key of the child page onto the root page. */
+ child_bk = GET_BKEYDATA(dbp, rootp, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ bi.len = child_bk->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ if ((ret = __os_malloc(dbp->env,
+ child_bk->len, &data.data)) != 0)
+ return (ret);
+ memcpy(data.data, child_bk->data, child_bk->len);
+ data.size = child_bk->len;
+ break;
+ case B_OVERFLOW:
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rp->pgno;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
+ }
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, rp->pgno));
+ }
/*
* If the root page was a leaf page, change it into an internal page.
* We copy the key we split on (but not the key's data, in the case of
* a leaf page) to the new root page.
- *
- * The btree comparison code guarantees that the left-most key on any
- * level of the tree is never used, so it doesn't need to be filled in.
*/
- nbytes = NBINTERNAL(0);
- h->linp[0] = h->upper = t->bt_psize - nbytes;
- dest = (char *)h + h->upper;
- WR_BINTERNAL(dest, 0, l->pgno, 0);
-
- switch (h->flags & P_TYPE) {
- case P_BLEAF:
- bl = GETBLEAF(r, 0);
- nbytes = NBINTERNAL(bl->ksize);
- h->linp[1] = h->upper -= nbytes;
- dest = (char *)h + h->upper;
- WR_BINTERNAL(dest, bl->ksize, r->pgno, 0);
- memmove(dest, bl->bytes, bl->ksize);
+ root_pgno = cp->root;
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE);
- /*
- * If the key is on an overflow page, mark the overflow chain
- * so it isn't deleted when the leaf copy of the key is deleted.
- */
- if (bl->flags & P_BIGKEY &&
- bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR)
- return (RET_ERROR);
- break;
- case P_BINTERNAL:
- bi = GETBINTERNAL(r, 0);
- nbytes = NBINTERNAL(bi->ksize);
- h->linp[1] = h->upper -= nbytes;
- dest = (char *)h + h->upper;
- memmove(dest, bi, nbytes);
- ((BINTERNAL *)dest)->pgno = r->pgno;
- break;
- default:
- abort();
+ /*
+ * The btree comparison code guarantees that the left-most key on any
+ * internal btree page is never used, so it doesn't need to be filled
+ * in. Set the record count if necessary.
+ */
+ memset(&bi0, 0, sizeof(bi0));
+ B_TSET(bi0.type, B_KEYDATA);
+ bi0.pgno = lp->pgno;
+ if (F_ISSET(cp, C_RECNUM)) {
+ bi0.nrecs = __bam_total(dbp, lp);
+ RE_NREC_SET(rootp, bi0.nrecs);
+ bi.nrecs = __bam_total(dbp, rp);
+ RE_NREC_ADJ(rootp, bi.nrecs);
}
+ DB_SET_DBT(hdr0, &bi0, SSZA(BINTERNAL, data));
+ if ((ret = __db_pitem_nolog(dbc, rootp,
+ 0, BINTERNAL_SIZE(0), &hdr0, NULL)) != 0)
+ goto err;
+ ret = __db_pitem_nolog(dbc, rootp, 1,
+ BINTERNAL_SIZE(data.size), &hdr, &data);
+
+err: if (data.data != NULL && child_bo == NULL)
+ __os_free(dbp->env, data.data);
+ return (ret);
+}
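
The root layout __bam_broot leaves behind is worth picturing: an internal
page with exactly two children, where entry 0 carries an empty key (the
comparison code never uses the leftmost key on an internal page) and entry 1
carries the key promoted from the split:

	            rootp (P_IBTREE, level + 1)
	        [ "" -> lp ]   [ split key -> rp ]

__ram_root below builds the same two-entry shape for recno trees, with
record counts in place of keys.
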
+
+/*
+ * __ram_root --
+ * Fix up the recno root page after it has been split.
+ * PUBLIC: int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *));
+ */
+int
+__ram_root(dbc, rootp, lp, rp)
+ DBC *dbc;
+ PAGE *rootp, *lp, *rp;
+{
+ DB *dbp;
+ DBT hdr;
+ RINTERNAL ri;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ root_pgno = dbc->internal->root;
- /* There are two keys on the page. */
- h->lower = BTDATAOFF + 2 * sizeof(indx_t);
+ /* Initialize the page. */
+ P_INIT(rootp, dbp->pgsize,
+ root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO);
- /* Unpin the root page, set to btree internal page. */
- h->flags &= ~P_TYPE;
- h->flags |= P_BINTERNAL;
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
+ /* Initialize the header. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
- return (RET_SUCCESS);
+ /* Insert the left and right keys, set the header information. */
+ ri.pgno = lp->pgno;
+ ri.nrecs = __bam_total(dbp, lp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_SET(rootp, ri.nrecs);
+ ri.pgno = rp->pgno;
+ ri.nrecs = __bam_total(dbp, rp);
+ if ((ret = __db_pitem_nolog(dbc,
+ rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0)
+ return (ret);
+ RE_NREC_ADJ(rootp, ri.nrecs);
+ return (0);
}
/*
- * BT_PSPLIT -- Do the real work of splitting the page.
- *
- * Parameters:
- * t: tree
- * h: page to be split
- * l: page to put lower half of data
- * r: page to put upper half of data
- * pskip: pointer to index to leave open
- * ilen: insert length
+ * __bam_pinsert --
+ * Insert a new key into a parent page, completing the split.
*
- * Returns:
- * Pointer to page in which to insert.
+ * PUBLIC: int __bam_pinsert
+ * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int));
*/
-static PAGE *
-bt_psplit(t, h, l, r, pskip, ilen)
- BTREE *t;
- PAGE *h, *l, *r;
- indx_t *pskip;
- size_t ilen;
+int
+__bam_pinsert(dbc, parent, split, lchild, rchild, flags)
+ DBC *dbc;
+ EPG *parent;
+ u_int32_t split;
+ PAGE *lchild, *rchild;
+ int flags;
{
- BINTERNAL *bi;
- BLEAF *bl;
- CURSOR *c;
- RLEAF *rl;
- PAGE *rval;
- void *src;
- indx_t full, half, nxt, off, skip, top, used;
- u_int32_t nbytes;
- int bigkeycnt, isbigkey;
+ BINTERNAL bi, *child_bi;
+ BKEYDATA *child_bk, *tmp_bk;
+ BOVERFLOW bo, *child_bo;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DBT a, b, hdr, data;
+ EPG *child;
+ PAGE *ppage;
+ RINTERNAL ri;
+ db_indx_t off;
+ db_recno_t nrecs;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+ int (*pitem) __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *));
+ u_int32_t n, nbytes, nksize, oldsize, size;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ppage = parent->page;
+ child = parent + 1;
+
+ /* If handling record numbers, count records split to the right page. */
+ nrecs = F_ISSET(cp, C_RECNUM) &&
+ !LF_ISSET(BPI_SPACEONLY) ? __bam_total(dbp, rchild) : 0;
/*
- * Split the data to the left and right pages. Leave the skip index
- * open. Additionally, make some effort not to split on an overflow
- * key. This makes internal page processing faster and can save
- * space as overflow keys used by internal pages are never deleted.
+ * Now we insert the new page's first key into the parent page, which
+ * completes the split. The parent points to a PAGE and a page index
+ * offset, where the new key goes ONE AFTER the index, because we split
+ * to the right.
+ *
+ * XXX
+ * Some btree algorithms replace the key for the old page as well as
+ * the new page. We don't, as there's no reason to believe that the
+ * first key on the old page is any better than the key we have, and,
+ * in the case of a key being placed at index 0 causing the split, the
+ * key is unavailable.
*/
- bigkeycnt = 0;
- skip = *pskip;
- full = t->bt_psize - BTDATAOFF;
- half = full / 2;
- used = 0;
- for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) {
- if (skip == off) {
- nbytes = ilen;
- isbigkey = 0; /* XXX: not really known. */
- } else
- switch (h->flags & P_TYPE) {
- case P_BINTERNAL:
- src = bi = GETBINTERNAL(h, nxt);
- nbytes = NBINTERNAL(bi->ksize);
- isbigkey = bi->flags & P_BIGKEY;
- break;
- case P_BLEAF:
- src = bl = GETBLEAF(h, nxt);
- nbytes = NBLEAF(bl);
- isbigkey = bl->flags & P_BIGKEY;
- break;
- case P_RINTERNAL:
- src = GETRINTERNAL(h, nxt);
- nbytes = NRINTERNAL;
- isbigkey = 0;
- break;
- case P_RLEAF:
- src = rl = GETRLEAF(h, nxt);
- nbytes = NRLEAF(rl);
- isbigkey = 0;
- break;
- default:
- abort();
- }
+ off = parent->indx + O_INDX;
+ if (LF_ISSET(BPI_REPLACE))
+ oldsize = TYPE(ppage) == P_IRECNO ? RINTERNAL_PSIZE :
+ BINTERNAL_PSIZE(GET_BINTERNAL(dbp, ppage, off)->len);
+ else
+ oldsize = 0;
- /*
- * If the key/data pairs are substantial fractions of the max
- * possible size for the page, it's possible to get situations
- * where we decide to try and copy too much onto the left page.
- * Make sure that doesn't happen.
- */
- if (skip <= off && used + nbytes >= full) {
- --off;
+ /*
+ * Calculate the space needed on the parent page.
+ *
+ * Prefix trees: space hack used when inserting into BINTERNAL pages.
+ * Retain only what's needed to distinguish between the new entry and
+ * the LAST entry on the page to its left. If the keys compare equal,
+ * retain the entire key. We ignore overflow keys, and the entire key
+ * must be retained for the next-to-leftmost key on the leftmost page
+ * of each level, or the search will fail. Applicable ONLY to internal
+ * pages that have leaf pages as children. Further reduction of the
+ * key between pairs of internal pages loses too much information.
+ */
+ switch (TYPE(child->page)) {
+ case P_IBTREE:
+ child_bi = GET_BINTERNAL(dbp, child->page, split);
+ nbytes = BINTERNAL_PSIZE(child_bi->len);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ switch (B_TYPE(child_bi->type)) {
+ case B_KEYDATA:
+ /* Add a new record for the right page. */
+ memset(&bi, 0, sizeof(bi));
+ bi.len = child_bi->len;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bi->data, child_bi->len);
+ size = BINTERNAL_SIZE(child_bi->len);
break;
+ case B_OVERFLOW:
+ /* Reuse the overflow key. */
+ child_bo = (BOVERFLOW *)child_bi->data;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ bo.pgno = child_bo->pgno;
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
}
+ break;
+ case P_LDUP:
+ case P_LBTREE:
+ child_bk = GET_BKEYDATA(dbp, child->page, split);
+ switch (B_TYPE(child_bk->type)) {
+ case B_KEYDATA:
+ nbytes = BINTERNAL_PSIZE(child_bk->len);
+ nksize = child_bk->len;
- /* Copy the key/data pair, if not the skipped index. */
- if (skip != off) {
- ++nxt;
+ /*
+ * Prefix compression:
+ * We set t->bt_prefix to NULL if we have a comparison
+ * callback but no prefix compression callback. But,
+ * if we're splitting in an off-page duplicates tree,
+ * we still have to do some checking. If using the
+ * default off-page duplicates comparison routine we
+ * can use the default prefix compression callback. If
+ * not using the default off-page duplicates comparison
+ * routine, we can't do any kind of prefix compression
+ * as there's no way for an application to specify a
+ * prefix compression callback that corresponds to its
+ * comparison callback.
+ *
+ * No prefix compression if we don't have a compression
+ * function, or the key we'd compress isn't a normal
+ * key (for example, it references an overflow page).
+ *
+ * Generate a parent page key for the right child page
+ * from a comparison of the last key on the left child
+ * page and the first key on the right child page.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ if (dbp->dup_compare == __bam_defcmp)
+ func = __bam_defpfx;
+ else
+ func = NULL;
+ } else
+ func = t->bt_prefix;
+ if (func == NULL)
+ goto noprefix;
+ tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) -
+ (TYPE(lchild) == P_LDUP ? O_INDX : P_INDX));
+ if (B_TYPE(tmp_bk->type) != B_KEYDATA)
+ goto noprefix;
+ DB_INIT_DBT(a, tmp_bk->data, tmp_bk->len);
+ DB_INIT_DBT(b, child_bk->data, child_bk->len);
+ nksize = (u_int32_t)func(dbp, &a, &b);
+ if ((n = BINTERNAL_PSIZE(nksize)) < nbytes)
+ nbytes = n;
+ else
+ nksize = child_bk->len;
+
+noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = nksize;
+ B_TSET(bi.type, B_KEYDATA);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, child_bk->data, nksize);
+ size = BINTERNAL_SIZE(nksize);
+ break;
+ case B_OVERFLOW:
+ nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE);
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Copy the overflow key. */
+ child_bo = (BOVERFLOW *)child_bk;
+ memset(&bo, 0, sizeof(bo));
+ bo.type = B_OVERFLOW;
+ bo.tlen = child_bo->tlen;
+ memset(&hdr, 0, sizeof(hdr));
+ if ((ret = __db_goff(dbc, &hdr, child_bo->tlen,
+ child_bo->pgno, &hdr.data, &hdr.size)) == 0)
+ ret = __db_poff(dbc, &hdr, &bo.pgno);
+
+ if (hdr.data != NULL)
+ __os_free(dbp->env, hdr.data);
+ if (ret != 0)
+ return (ret);
+
+ memset(&bi, 0, sizeof(bi));
+ bi.len = BOVERFLOW_SIZE;
+ B_TSET(bi.type, B_OVERFLOW);
+ bi.pgno = rchild->pgno;
+ bi.nrecs = nrecs;
+ DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data));
+ DB_SET_DBT(data, &bo, BOVERFLOW_SIZE);
+ size = BINTERNAL_SIZE(BOVERFLOW_SIZE);
- l->linp[off] = l->upper -= nbytes;
- memmove((char *)l + l->upper, src, nbytes);
+ break;
+ case B_DUPLICATE:
+ default:
+ goto pgfmt;
}
-
- used += nbytes;
- if (used >= half) {
- if (!isbigkey || bigkeycnt == 3)
- break;
- else
- ++bigkeycnt;
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ nbytes = RINTERNAL_PSIZE;
+
+ if (P_FREESPACE(dbp, ppage) + oldsize < nbytes)
+ return (DB_NEEDSPLIT);
+ if (LF_ISSET(BPI_SPACEONLY))
+ return (0);
+
+ /* Add a new record for the right page. */
+ DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE);
+ ri.pgno = rchild->pgno;
+ ri.nrecs = nrecs;
+ size = RINTERNAL_SIZE;
+ data.size = 0;
+ /*
+		 * For now, we are locking internal recno nodes, so we
+		 * do the replace in two steps: delete, then insert.
+ */
+ if (LF_ISSET(BPI_REPLACE)) {
+ if ((ret = __bam_ditem(dbc, ppage, off)) != 0)
+ return (ret);
+ LF_CLR(BPI_REPLACE);
}
+ break;
+ default:
+pgfmt: return (__db_pgfmt(dbp->env, PGNO(child->page)));
+ }
+
+ if (LF_ISSET(BPI_REPLACE)) {
+ DB_ASSERT(dbp->env, !LF_ISSET(BPI_NOLOGGING));
+ if ((ret = __bam_irep(dbc, ppage,
+ off, &hdr, data.size != 0 ? &data : NULL)) != 0)
+ return (ret);
+ } else {
+ if (LF_ISSET(BPI_NOLOGGING))
+ pitem = __db_pitem_nolog;
+ else
+ pitem = __db_pitem;
+
+ if ((ret = pitem(dbc, ppage,
+ off, size, &hdr, data.size != 0 ? &data : NULL)) != 0)
+ return (ret);
}
/*
- * Off is the last offset that's valid for the left page.
- * Nxt is the first offset to be placed on the right page.
+	 * If this is a Recno tree, a Btree with record numbers, or an
+	 * off-page duplicates tree, adjust the parent page's left-page
+	 * record count.
*/
- l->lower += (off + 1) * sizeof(indx_t);
+ if (F_ISSET(cp, C_RECNUM) && !LF_ISSET(BPI_NORECNUM)) {
+ /* Log the change. */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __bam_cadjust_log(dbp, dbc->txn,
+ &LSN(ppage), 0, PGNO(ppage), &LSN(ppage),
+ parent->indx, -(int32_t)nrecs, 0)) != 0)
+ return (ret);
+ } else
+ LSN_NOT_LOGGED(LSN(ppage));
+
+ /* Update the left page count. */
+ if (dbc->dbtype == DB_RECNO)
+ GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ else
+ GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs;
+ }
+
+ return (0);
+}
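
The prefix calculation above needs only enough of the right child's first
key to separate it from the last key on the left child. A minimal sketch of
such a prefix function, assuming plain byte-comparable keys (it mirrors the
default routine's logic; it is not the library's internal callback):

	#include <stddef.h>

	/* Bytes needed to distinguish b from a, where a sorts before b. */
	static size_t
	prefix_len(const unsigned char *a, size_t alen,
	    const unsigned char *b, size_t blen)
	{
		size_t cnt, len;

		len = alen < blen ? alen : blen;
		for (cnt = 1; len--; ++a, ++b, ++cnt)
			if (*a != *b)
				return (cnt);
		/* a is a prefix of b (or equal): need one byte past a. */
		return (alen < blen ? alen + 1 : alen);
	}

For example, prefix_len("smith", 5, "stone", 5) is 2: promoting just "st"
still sorts above everything on the left page.
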
+
+/*
+ * __bam_psplit --
+ * Do the real work of splitting the page.
+ */
+static int
+__bam_psplit(dbc, cp, lp, rp, splitret)
+ DBC *dbc;
+ EPG *cp;
+ PAGE *lp, *rp;
+ db_indx_t *splitret;
+{
+ DB *dbp;
+ PAGE *pp;
+ db_indx_t half, *inp, nbytes, off, splitp, top;
+ int adjust, cnt, iflag, isbigkey, ret;
+
+ dbp = dbc->dbp;
+ pp = cp->page;
+ inp = P_INP(dbp, pp);
+ adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX;
/*
- * If splitting the page that the cursor was on, the cursor has to be
- * adjusted to point to the same record as before the split. If the
- * cursor is at or past the skipped slot, the cursor is incremented by
- * one. If the cursor is on the right page, it is decremented by the
- * number of records split to the left page.
+ * If we're splitting the first (last) page on a level because we're
+ * inserting (appending) a key to it, it's likely that the data is
+ * sorted. Moving a single item to the new page is less work and can
+	 * push the fill factor higher than normal.  This is trivial when we
+	 * are splitting a new page before the beginning of the tree, where
+	 * all of the interesting tests are against values of 0.
+ *
+ * Catching appends to the tree is harder. In a simple append, we're
+ * inserting an item that sorts past the end of the tree; the cursor
+ * will point past the last element on the page. But, in trees with
+ * duplicates, the cursor may point to the last entry on the page --
+ * in this case, the entry will also be the last element of a duplicate
+ * set (the last because the search call specified the SR_DUPLAST flag).
+ * The only way to differentiate between an insert immediately before
+ * the last item in a tree or an append after a duplicate set which is
+ * also the last item in the tree is to call the comparison function.
+ * When splitting internal pages during an append, the search code
+ * guarantees the cursor always points to the largest page item less
+ * than the new internal entry. To summarize, we want to catch three
+ * possible index values:
+ *
+ * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree
+ * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT
+ * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set
+ * of duplicates
+ *
+	 * two of which (NUM_ENT(page) - O_INDX or P_INDX) might be inserts
+	 * near the end of the tree, and not past the end of the tree at all.
+	 * We do a simple test, which might be wrong, because calling the
+	 * comparison functions is expensive.  Regardless, it's not a big
+	 * deal if we're wrong: we'll do the split the right way next time.
*/
- c = &t->bt_cursor;
- if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) {
- if (c->pg.index >= skip)
- ++c->pg.index;
- if (c->pg.index < nxt) /* Left page. */
- c->pg.pgno = l->pgno;
- else { /* Right page. */
- c->pg.pgno = r->pgno;
- c->pg.index -= nxt;
- }
- }
+ off = 0;
+ if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust)
+ off = NUM_ENT(pp) - adjust;
+ else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+ off = adjust;
+ if (off != 0)
+ goto sort;
/*
- * If the skipped index was on the left page, just return that page.
- * Otherwise, adjust the skip index to reflect the new position on
- * the right page.
+ * Split the data to the left and right pages. Try not to split on
+ * an overflow key. (Overflow keys on internal pages will slow down
+ * searches.) Refuse to split in the middle of a set of duplicates.
+ *
+ * First, find the optimum place to split.
+ *
+ * It's possible to try and split past the last record on the page if
+ * there's a very large record at the end of the page. Make sure this
+ * doesn't happen by bounding the check at the next-to-last entry on
+ * the page.
+ *
+ * Note, we try and split half the data present on the page. This is
+ * because another process may have already split the page and left
+ * it half empty. We don't try and skip the split -- we don't know
+ * how much space we're going to need on the page, and we may need up
+ * to half the page for a big item, so there's no easy test to decide
+ * if we need to split or not. Besides, if two threads are inserting
+ * data into the same place in the database, we're probably going to
+ * need more space soon anyway.
*/
- if (skip <= off) {
- skip = 0;
- rval = l;
- } else {
- rval = r;
- *pskip -= nxt;
- }
+ top = NUM_ENT(pp) - adjust;
+ half = (dbp->pgsize - HOFFSET(pp)) / 2;
+ for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA)
+ nbytes += BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, off)->len);
+ else
+ nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
- for (off = 0; nxt < top; ++off) {
- if (skip == nxt) {
++off;
- skip = 0;
- }
- switch (h->flags & P_TYPE) {
- case P_BINTERNAL:
- src = bi = GETBINTERNAL(h, nxt);
- nbytes = NBINTERNAL(bi->ksize);
- break;
- case P_BLEAF:
- src = bl = GETBLEAF(h, nxt);
- nbytes = NBLEAF(bl);
- break;
- case P_RINTERNAL:
- src = GETRINTERNAL(h, nxt);
- nbytes = NRINTERNAL;
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)
+ nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, off)->len);
+ else
+ nbytes += BOVERFLOW_SIZE;
break;
- case P_RLEAF:
- src = rl = GETRLEAF(h, nxt);
- nbytes = NRLEAF(rl);
+ case P_IRECNO:
+ nbytes += RINTERNAL_SIZE;
break;
default:
- abort();
+ return (__db_pgfmt(dbp->env, pp->pgno));
}
- ++nxt;
- r->linp[off] = r->upper -= nbytes;
- memmove((char *)r + r->upper, src, nbytes);
- }
- r->lower += off * sizeof(indx_t);
+sort: splitp = off;
- /* If the key is being appended to the page, adjust the index. */
- if (skip == top)
- r->lower += sizeof(indx_t);
+ /*
+ * Splitp is either at or just past the optimum split point. If the
+ * tree type is such that we're going to promote a key to an internal
+ * page, and our current choice is an overflow key, look for something
+ * close by that's smaller.
+ */
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ iflag = 1;
+ isbigkey =
+ B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA;
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ iflag = 0;
+ isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) !=
+ B_KEYDATA;
+ break;
+ default:
+ iflag = isbigkey = 0;
+ }
+ if (isbigkey)
+ for (cnt = 1; cnt <= 3; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < (db_indx_t)NUM_ENT(pp) &&
+ ((iflag && B_TYPE(
+ GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) ||
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA)) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (iflag ? B_TYPE(
+ GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA :
+ B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) ==
+ B_KEYDATA) {
+ splitp = off;
+ break;
+ }
+ }
- return (rval);
-}
+ /*
+	 * We can't split in the middle of a set of duplicates.  We know that
+ * no duplicate set can take up more than about 25% of the page,
+ * because that's the point where we push it off onto a duplicate
+ * page set. So, this loop can't be unbounded.
+ */
+ if (TYPE(pp) == P_LBTREE &&
+ inp[splitp] == inp[splitp - adjust])
+ for (cnt = 1;; ++cnt) {
+ off = splitp + cnt * adjust;
+ if (off < NUM_ENT(pp) &&
+ inp[splitp] != inp[off]) {
+ splitp = off;
+ break;
+ }
+ if (splitp <= (db_indx_t)(cnt * adjust))
+ continue;
+ off = splitp - cnt * adjust;
+ if (inp[splitp] != inp[off]) {
+ splitp = off + adjust;
+ break;
+ }
+ }
-/*
- * BT_PRESERVE -- Mark a chain of pages as used by an internal node.
- *
- * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the
- * record that references them gets deleted. Chains pointed to by internal
- * pages never get deleted. This routine marks a chain as pointed to by an
- * internal page.
- *
- * Parameters:
- * t: tree
- * pg: page number of first page in the chain.
- *
- * Returns:
- * RET_SUCCESS, RET_ERROR.
- */
-static int
-bt_preserve(t, pg)
- BTREE *t;
- pgno_t pg;
-{
- PAGE *h;
+ /* We're going to split at splitp. */
+ if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0)
+ return (ret);
+ if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0)
+ return (ret);
- if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
- return (RET_ERROR);
- h->flags |= P_PRESERVE;
- mpool_put(t->bt_mp, h, MPOOL_DIRTY);
- return (RET_SUCCESS);
+ *splitret = splitp;
+ return (0);
}
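
The byte-counting phase of __bam_psplit reduces to a simple accumulation; a
toy version over an array of entry sizes (hypothetical sizes; the real code
also dodges overflow keys and duplicate sets, as above):

	/* First index where the running total reaches half the used bytes. */
	static int
	pick_split(const unsigned int *sizes, int nent, unsigned int used)
	{
		unsigned int half, nbytes;
		int off;

		half = used / 2;
		nbytes = 0;
		/* Bound at nent - 1 so the right page gets an entry. */
		for (off = 0; off < nent - 1 && nbytes < half; ++off)
			nbytes += sizes[off];
		return (off);
	}

With sizes {400, 400, 400, 400} and used = 1600 this returns 2, so entries
0 and 1 go to the left page and entries 2 and 3 go to the right.
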
/*
- * REC_TOTAL -- Return the number of recno entries below a page.
- *
- * Parameters:
- * h: page
- *
- * Returns:
- * The number of recno entries below a page.
+ * __bam_copy --
+ * Copy a set of records from one page to another.
*
- * XXX
- * These values could be set by the bt_psplit routine. The problem is that the
- * entry has to be popped off of the stack etc. or the values have to be passed
- * all the way back to bt_split/bt_rroot and it's not very clean.
+ * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t));
*/
-static recno_t
-rec_total(h)
- PAGE *h;
+int
+__bam_copy(dbp, pp, cp, nxt, stop)
+ DB *dbp;
+ PAGE *pp, *cp;
+ u_int32_t nxt, stop;
{
- recno_t recs;
- indx_t nxt, top;
+ BINTERNAL internal;
+ db_indx_t *cinp, nbytes, off, *pinp;
- for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt)
- recs += GETRINTERNAL(h, nxt)->nrecs;
- return (recs);
+ cinp = P_INP(dbp, cp);
+ pinp = P_INP(dbp, pp);
+ /*
+	 * Nxt indexes the next record to be copied from the source page;
+	 * off is where it lands on the target page.
+ */
+ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) {
+ switch (TYPE(pp)) {
+ case P_IBTREE:
+ if (off == 0 && nxt != 0)
+ nbytes = BINTERNAL_SIZE(0);
+ else if (B_TYPE(
+ GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA)
+ nbytes = BINTERNAL_SIZE(
+ GET_BINTERNAL(dbp, pp, nxt)->len);
+ else
+ nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE);
+ break;
+ case P_LBTREE:
+ /*
+ * If we're on a key and it's a duplicate, just copy
+ * the offset.
+ */
+ if (off != 0 && (nxt % P_INDX) == 0 &&
+ pinp[nxt] == pinp[nxt - P_INDX]) {
+ cinp[off] = cinp[off - P_INDX];
+ continue;
+ }
+ /* FALLTHROUGH */
+ case P_LDUP:
+ case P_LRECNO:
+ if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) ==
+ B_KEYDATA)
+ nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp,
+ pp, nxt)->len);
+ else
+ nbytes = BOVERFLOW_SIZE;
+ break;
+ case P_IRECNO:
+ nbytes = RINTERNAL_SIZE;
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, pp->pgno));
+ }
+ cinp[off] = HOFFSET(cp) -= nbytes;
+ if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) {
+ internal.len = 0;
+ UMRW_SET(internal.unused);
+ internal.type = B_KEYDATA;
+ internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno;
+ internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs;
+ memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes);
+		} else
+ memcpy(P_ENTRY(dbp, cp, off),
+ P_ENTRY(dbp, pp, nxt), nbytes);
+ }
+ return (0);
}
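
The duplicate shortcut in the P_LBTREE case leans on how leaf pages store
on-page duplicates: one copy of the key, with several index slots pointing
at it. A toy picture of the index array (illustrative offsets, not the real
page layout):

	/*
	 * Key/data pairs occupy two index slots each (P_INDX == 2); a
	 * duplicated key is stored once and its key slots share an offset.
	 */
	unsigned short inp[6] = {
		4000, 3900,	/* pair 0: key "a" at 4000, data at 3900 */
		4000, 3800,	/* pair 1: same key "a", data at 3800 */
		3700, 3600	/* pair 2: key "b" at 3700, data at 3600 */
	};
	/* inp[2] == inp[0], which is exactly what the pinp test detects. */
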
diff --git a/btree/bt_stat.c b/btree/bt_stat.c
new file mode 100644
index 0000000..912a166
--- /dev/null
+++ b/btree/bt_stat.c
@@ -0,0 +1,669 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+#ifdef HAVE_STATISTICS
+/*
+ * __bam_stat --
+ * Gather/print the btree statistics
+ *
+ * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t));
+ */
+int
+__bam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ BTMETA *meta;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ DB_LOCK lock, metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret, t_ret, write_meta;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ meta = NULL;
+ t = dbp->bt_internal;
+ sp = NULL;
+ LOCK_INIT(metalock);
+ LOCK_INIT(lock);
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = write_meta = 0;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /* Allocate and clear the structure. */
+ if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0)
+ goto err;
+ memset(sp, 0, sizeof(*sp));
+
+ /* Get the metadata page for the entire database. */
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &meta)) != 0)
+ goto err;
+
+ if (flags == DB_FAST_STAT)
+ goto meta_only;
+
+ /* Walk the metadata free list, counting pages. */
+ for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) {
+ ++sp->bt_free;
+
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ pgno = h->next_pgno;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /* Get the root page. */
+ pgno = cp->root;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ /* Get the levels from the root page. */
+ sp->bt_levels = h->level;
+
+ /* Discard the root page. */
+ ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ h = NULL;
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ /* Walk the tree. */
+ if ((ret = __bam_traverse(dbc,
+ DB_LOCK_READ, cp->root, __bam_stat_callback, sp)) != 0)
+ goto err;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && (ret = __bam_compress_count(dbc,
+ &sp->bt_nkeys, &sp->bt_ndata)) != 0)
+ goto err;
+#endif
+
+ /*
+ * Get the subdatabase metadata page if it's not the same as the
+ * one we already have.
+ */
+ write_meta = !F_ISSET(dbp, DB_AM_RDONLY) &&
+ (!MULTIVERSION(dbp) || dbc->txn != NULL);
+meta_only:
+ if (t->bt_meta != PGNO_BASE_MD || write_meta) {
+ ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ meta = NULL;
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+
+ if ((ret = __db_lget(dbc,
+ 0, t->bt_meta, write_meta ? DB_LOCK_WRITE : DB_LOCK_READ,
+ 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &t->bt_meta,
+ dbc->thread_info, dbc->txn,
+ write_meta ? DB_MPOOL_DIRTY : 0, &meta)) != 0)
+ goto err;
+ }
+ if (flags == DB_FAST_STAT) {
+ if (dbp->type == DB_RECNO ||
+ (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) {
+ if ((ret = __db_lget(dbc, 0,
+ cp->root, DB_LOCK_READ, 0, &lock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &cp->root,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ sp->bt_nkeys = RE_NREC(h);
+ } else
+ sp->bt_nkeys = meta->dbmeta.key_count;
+
+ sp->bt_ndata = dbp->type == DB_RECNO ?
+ sp->bt_nkeys : meta->dbmeta.record_count;
+ }
+
+ /* Get metadata page statistics. */
+ sp->bt_metaflags = meta->dbmeta.flags;
+ sp->bt_minkey = meta->minkey;
+ sp->bt_re_len = meta->re_len;
+ sp->bt_re_pad = meta->re_pad;
+ /*
+ * Don't take the page number from the meta-data page -- that value is
+	 * only maintained in the primary database; we may have been called on
+ * a subdatabase. (Yes, I read the primary database meta-data page
+ * earlier in this function, but I'm asking the underlying cache so the
+ * code for the Hash and Btree methods is the same.)
+ */
+ if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0)
+ goto err;
+ sp->bt_pagecnt = pgno + 1;
+ sp->bt_pagesize = meta->dbmeta.pagesize;
+ sp->bt_magic = meta->dbmeta.magic;
+ sp->bt_version = meta->dbmeta.version;
+
+ if (write_meta != 0) {
+ meta->dbmeta.key_count = sp->bt_nkeys;
+ meta->dbmeta.record_count = sp->bt_ndata;
+ }
+
+ *(DB_BTREE_STAT **)spp = sp;
+
+err: /* Discard the second page. */
+ if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the metadata page. */
+ if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret != 0 && sp != NULL) {
+ __os_ufree(env, sp);
+ *(DB_BTREE_STAT **)spp = NULL;
+ }
+
+ return (ret);
+}
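
Applications reach __bam_stat through the public DB->stat method. A minimal
usage sketch (file name hypothetical, error handling trimmed):

	#include <stdio.h>
	#include <stdlib.h>
	#include <db.h>

	int
	main()
	{
		DB *dbp;
		DB_BTREE_STAT *sp;

		if (db_create(&dbp, NULL, 0) != 0)
			return (1);
		if (dbp->open(dbp,
		    NULL, "example.db", NULL, DB_BTREE, 0, 0) != 0)
			return (1);

		/* DB_FAST_STAT skips the full tree walk done above. */
		if (dbp->stat(dbp, NULL, &sp, DB_FAST_STAT) == 0) {
			printf("keys %lu, levels %lu\n",
			    (unsigned long)sp->bt_nkeys,
			    (unsigned long)sp->bt_levels);
			free(sp);
		}
		return (dbp->close(dbp, 0));
	}
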
+
+/*
+ * __bam_stat_print --
+ * Display btree/recno statistics.
+ *
+ * PUBLIC: int __bam_stat_print __P((DBC *, u_int32_t));
+ */
+int
+__bam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ static const FN fn[] = {
+ { BTM_DUP, "duplicates" },
+ { BTM_RECNO, "recno" },
+ { BTM_RECNUM, "record-numbers" },
+ { BTM_FIXEDLEN, "fixed-length" },
+ { BTM_RENUMBER, "renumber" },
+ { BTM_SUBDB, "multiple-databases" },
+ { BTM_DUPSORT, "sorted duplicates" },
+ { BTM_COMPRESS, "compressed" },
+ { 0, NULL }
+ };
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ ENV *env;
+ int lorder, ret;
+ const char *s;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+#ifdef HAVE_PARTITION
+ if (DB_IS_PARTITIONED(dbp)) {
+ if ((ret = __partition_stat(dbc, &sp, flags)) != 0)
+ return (ret);
+ } else
+#endif
+ if ((ret = __bam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0)
+ return (ret);
+
+ if (LF_ISSET(DB_STAT_ALL)) {
+ __db_msg(env, "%s", DB_GLOBAL(db_line));
+ __db_msg(env, "Default Btree/Recno database information:");
+ }
+
+ __db_msg(env, "%lx\tBtree magic number", (u_long)sp->bt_magic);
+ __db_msg(env, "%lu\tBtree version number", (u_long)sp->bt_version);
+
+ (void)__db_get_lorder(dbp, &lorder);
+ switch (lorder) {
+ case 1234:
+ s = "Little-endian";
+ break;
+ case 4321:
+ s = "Big-endian";
+ break;
+ default:
+ s = "Unrecognized byte order";
+ break;
+ }
+ __db_msg(env, "%s\tByte order", s);
+ __db_prflags(env, NULL, sp->bt_metaflags, fn, NULL, "\tFlags");
+ if (dbp->type == DB_BTREE)
+ __db_dl(env, "Minimum keys per-page", (u_long)sp->bt_minkey);
+ if (dbp->type == DB_RECNO) {
+ __db_dl(env,
+ "Fixed-length record size", (u_long)sp->bt_re_len);
+ __db_msg(env,
+ "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad);
+ }
+ __db_dl(env,
+ "Underlying database page size", (u_long)sp->bt_pagesize);
+ if (dbp->type == DB_BTREE)
+ __db_dl(env, "Overflow key/data size",
+ ((BTREE_CURSOR *)dbc->internal)->ovflsize);
+ __db_dl(env, "Number of levels in the tree", (u_long)sp->bt_levels);
+ __db_dl(env, dbp->type == DB_BTREE ?
+ "Number of unique keys in the tree" :
+ "Number of records in the tree", (u_long)sp->bt_nkeys);
+ __db_dl(env,
+ "Number of data items in the tree", (u_long)sp->bt_ndata);
+
+ __db_dl(env,
+ "Number of tree internal pages", (u_long)sp->bt_int_pg);
+ __db_dl_pct(env,
+ "Number of bytes free in tree internal pages",
+ (u_long)sp->bt_int_pgfree,
+ DB_PCT_PG(sp->bt_int_pgfree, sp->bt_int_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree leaf pages", (u_long)sp->bt_leaf_pg);
+ __db_dl_pct(env, "Number of bytes free in tree leaf pages",
+ (u_long)sp->bt_leaf_pgfree, DB_PCT_PG(
+ sp->bt_leaf_pgfree, sp->bt_leaf_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree duplicate pages", (u_long)sp->bt_dup_pg);
+ __db_dl_pct(env,
+ "Number of bytes free in tree duplicate pages",
+ (u_long)sp->bt_dup_pgfree,
+ DB_PCT_PG(sp->bt_dup_pgfree, sp->bt_dup_pg, sp->bt_pagesize), "ff");
+
+ __db_dl(env,
+ "Number of tree overflow pages", (u_long)sp->bt_over_pg);
+ __db_dl_pct(env, "Number of bytes free in tree overflow pages",
+ (u_long)sp->bt_over_pgfree, DB_PCT_PG(
+ sp->bt_over_pgfree, sp->bt_over_pg, sp->bt_pagesize), "ff");
+ __db_dl(env, "Number of empty pages", (u_long)sp->bt_empty_pg);
+
+ __db_dl(env, "Number of pages on the free list", (u_long)sp->bt_free);
+
+ __os_ufree(env, sp);
+
+ return (0);
+}
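
The printed form above is what applications get from the public
DB->stat_print entry point, e.g. dbp->stat_print(dbp, DB_STAT_ALL) once the
database handle is open.
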
+
+/*
+ * __bam_stat_callback --
+ * Statistics callback.
+ *
+ * PUBLIC: int __bam_stat_callback __P((DBC *, PAGE *, void *, int *));
+ */
+int
+__bam_stat_callback(dbc, h, cookie, putp)
+ DBC *dbc;
+ PAGE *h;
+ void *cookie;
+ int *putp;
+{
+ DB *dbp;
+ DB_BTREE_STAT *sp;
+ db_indx_t indx, *inp, top;
+ u_int8_t type;
+
+ dbp = dbc->dbp;
+ sp = cookie;
+ *putp = 0;
+ top = NUM_ENT(h);
+ inp = P_INP(dbp, h);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ ++sp->bt_int_pg;
+ sp->bt_int_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_LBTREE:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /* Correct for on-page duplicates and deleted items. */
+ for (indx = 0; indx < top; indx += P_INDX) {
+ type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type;
+ /* Ignore deleted items. */
+ if (B_DISSET(type))
+ continue;
+
+ /* Ignore duplicate keys. */
+ if (indx + P_INDX >= top ||
+ inp[indx] != inp[indx + P_INDX])
+ ++sp->bt_nkeys;
+
+ /* Ignore off-page duplicates. */
+ if (B_TYPE(type) != B_DUPLICATE)
+ ++sp->bt_ndata;
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_LRECNO:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /*
+ * If walking a recno tree, then each of these items is a key.
+ * Otherwise, we're walking an off-page duplicate set.
+ */
+ if (dbp->type == DB_RECNO) {
+ /*
+ * Correct for deleted items in non-renumbering Recno
+ * databases.
+ */
+ if (F_ISSET(dbp, DB_AM_RENUMBER)) {
+ sp->bt_nkeys += top;
+ sp->bt_ndata += top;
+ } else
+ for (indx = 0; indx < top; indx += O_INDX) {
+ type = GET_BKEYDATA(dbp, h, indx)->type;
+ if (!B_DISSET(type)) {
+ ++sp->bt_ndata;
+ ++sp->bt_nkeys;
+ }
+ }
+
+ ++sp->bt_leaf_pg;
+ sp->bt_leaf_pgfree += P_FREESPACE(dbp, h);
+ } else {
+ sp->bt_ndata += top;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
+ }
+ break;
+ case P_LDUP:
+ if (top == 0)
+ ++sp->bt_empty_pg;
+
+ /* Correct for deleted items. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type))
+ ++sp->bt_ndata;
+
+ ++sp->bt_dup_pg;
+ sp->bt_dup_pgfree += P_FREESPACE(dbp, h);
+ break;
+ case P_OVERFLOW:
+ ++sp->bt_over_pg;
+ sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h);
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+ return (0);
+}
+
+/*
+ * __bam_print_cursor --
+ * Display the current internal cursor.
+ *
+ * PUBLIC: void __bam_print_cursor __P((DBC *));
+ */
+void
+__bam_print_cursor(dbc)
+ DBC *dbc;
+{
+ static const FN fn[] = {
+ { C_DELETED, "C_DELETED" },
+ { C_RECNUM, "C_RECNUM" },
+ { C_RENUMBER, "C_RENUMBER" },
+ { 0, NULL }
+ };
+ ENV *env;
+ BTREE_CURSOR *cp;
+
+ env = dbc->env;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ STAT_ULONG("Overflow size", cp->ovflsize);
+ if (dbc->dbtype == DB_RECNO)
+ STAT_ULONG("Recno", cp->recno);
+ STAT_ULONG("Order", cp->order);
+ __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags");
+}
+
+#else /* !HAVE_STATISTICS */
+
+int
+__bam_stat(dbc, spp, flags)
+ DBC *dbc;
+ void *spp;
+ u_int32_t flags;
+{
+ COMPQUIET(spp, NULL);
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+
+int
+__bam_stat_print(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ COMPQUIET(flags, 0);
+
+ return (__db_stat_not_built(dbc->env));
+}
+#endif
+
+#ifndef HAVE_BREW
+/*
+ * __bam_key_range --
+ *	Return the proportion of keys relative to the given key.  The
+ *	numbers are slightly skewed due to on-page duplicates.
+ *
+ * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t));
+ */
+int
+__bam_key_range(dbc, dbt, kp, flags)
+ DBC *dbc;
+ DBT *dbt;
+ DB_KEY_RANGE *kp;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ EPG *sp;
+ double factor;
+ int exact, ret;
+
+ COMPQUIET(flags, 0);
+
+ if ((ret = __bam_search(dbc, PGNO_INVALID,
+ dbt, SR_STK_ONLY, 1, NULL, &exact)) != 0)
+ return (ret);
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+ kp->less = kp->greater = 0.0;
+
+ factor = 1.0;
+
+ /* Correct the leaf page. */
+ cp->csp->entries /= 2;
+ cp->csp->indx /= 2;
+ for (sp = cp->sp; sp <= cp->csp; ++sp) {
+ /*
+		 * At each level we know that pages greater than indx contain
+		 * keys greater than what we are looking for, and those less
+		 * than indx contain keys that are less.  The page pointed to
+		 * by indx may have some less, some greater, or even some
+		 * equal.  If indx is
+ * equal to the number of entries, then the key is out of range
+ * and everything is less.
+ */
+ if (sp->indx == 0)
+ kp->greater += factor * (sp->entries - 1)/sp->entries;
+ else if (sp->indx == sp->entries)
+ kp->less += factor;
+ else {
+ kp->less += factor * sp->indx / sp->entries;
+ kp->greater += factor *
+ ((sp->entries - sp->indx) - 1) / sp->entries;
+ }
+ factor *= 1.0/sp->entries;
+ }
+
+ /*
+	 * If there was an exact match, assign one n'th (the final factor)
+	 * to the key itself.
+ * Otherwise that factor belongs to those greater than the key, unless
+ * the key was out of range.
+ */
+ if (exact)
+ kp->equal = factor;
+ else {
+ if (kp->less != 1)
+ kp->greater += factor;
+ kp->equal = 0;
+ }
+
+ BT_STK_CLR(cp);
+
+ return (0);
+}
+#endif
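
To make the factor arithmetic concrete: with a root of 10 entries descended
at indx 4 and a leaf of 50 entries at indx 20 (after the halving
correction), less = 4/10 + (1/10)(20/50) = 0.44, greater = 5/10 +
(1/10)(29/50) = 0.558, and on an exact match equal gets the final factor
1/500 = 0.002; the three sum to 1. Applications reach this through the
public DB->key_range method; a minimal sketch (dbp assumed open as a
DB_BTREE):

	#include <stdio.h>
	#include <string.h>
	#include <db.h>

	/* Report where the key "m" falls in dbp's key space. */
	static int
	where_is(DB *dbp)
	{
		DBT key;
		DB_KEY_RANGE range;
		int ret;

		memset(&key, 0, sizeof(key));
		key.data = "m";
		key.size = 1;
		if ((ret = dbp->key_range(dbp, NULL, &key, &range, 0)) == 0)
			printf("less %.3f equal %.3f greater %.3f\n",
			    range.less, range.equal, range.greater);
		return (ret);
	}
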
+
+/*
+ * __bam_traverse --
+ * Walk a Btree database.
+ *
+ * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t,
+ * PUBLIC: db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *));
+ */
+int
+__bam_traverse(dbc, mode, root_pgno, callback, cookie)
+ DBC *dbc;
+ db_lockmode_t mode;
+ db_pgno_t root_pgno;
+ int (*callback)__P((DBC *, PAGE *, void *, int *));
+ void *cookie;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ DB *dbp;
+ DB_LOCK lock;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t indx, *inp;
+ int already_put, ret, t_ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ already_put = 0;
+
+ if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = __memp_fget(mpf, &root_pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0) {
+ (void)__TLPUT(dbc, lock);
+ return (ret);
+ }
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bi = GET_BINTERNAL(dbp, h, indx);
+ if (B_TYPE(bi->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ ((BOVERFLOW *)bi->data)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if ((ret = __bam_traverse(
+ dbc, mode, bi->pgno, callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ ri = GET_RINTERNAL(dbp, h, indx);
+ if ((ret = __bam_traverse(
+ dbc, mode, ri->pgno, callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LBTREE:
+ inp = P_INP(dbp, h);
+ for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (indx + P_INDX >= NUM_ENT(h) ||
+ inp[indx] != inp[indx + P_INDX])) {
+ if ((ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ bk = GET_BKEYDATA(dbp, h, indx + O_INDX);
+ if (B_TYPE(bk->type) == B_DUPLICATE &&
+ (ret = __bam_traverse(dbc, mode,
+ GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ case P_LDUP:
+ case P_LRECNO:
+ for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_OVERFLOW &&
+ (ret = __db_traverse_big(dbc,
+ GET_BOVERFLOW(dbp, h, indx)->pgno,
+ callback, cookie)) != 0)
+ goto err;
+ }
+ break;
+ default:
+ return (__db_pgfmt(dbp->env, h->pgno));
+ }
+
+ ret = callback(dbc, h, cookie, &already_put);
+
+err: if (!already_put && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
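
Each page the walk visits is handed to the callback along with the cookie;
__bam_stat_callback above is the in-tree example. A minimal hypothetical
callback that only counts leaf pages might look like this (same shape, K&R
style to match the file):

	/* Count btree leaf pages; cookie points at an int counter. */
	static int
	count_leaves(dbc, h, cookie, putp)
		DBC *dbc;
		PAGE *h;
		void *cookie;
		int *putp;
	{
		COMPQUIET(dbc, NULL);
		*putp = 0;		/* the walk still owns the page */
		if (TYPE(h) == P_LBTREE)
			++*(int *)cookie;
		return (0);
	}
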
diff --git a/btree/bt_upgrade.c b/btree/bt_upgrade.c
new file mode 100644
index 0000000..edf6718
--- /dev/null
+++ b/btree/bt_upgrade.c
@@ -0,0 +1,153 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_upgrade.h"
+#include "dbinc/btree.h"
+
+/*
+ * __bam_30_btreemeta --
+ * Upgrade the metadata pages from version 6 to version 7.
+ *
+ * PUBLIC: int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__bam_30_btreemeta(dbp, real_name, buf)
+ DB *dbp;
+ char *real_name;
+ u_int8_t *buf;
+{
+ BTMETA2X *oldmeta;
+ BTMETA30 *newmeta;
+ ENV *env;
+ int ret;
+
+ env = dbp->env;
+
+ newmeta = (BTMETA30 *)buf;
+ oldmeta = (BTMETA2X *)buf;
+
+ /*
+	 * Move fields from the end of the structure up, so we do not
+	 * overwrite anything.  We are going to create a new uid, so we
+	 * can move the fields at the end of the structure first,
+	 * overwriting the old uid.
+ */
+
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ newmeta->dbmeta.free = oldmeta->free;
+ newmeta->dbmeta.flags = oldmeta->flags;
+ newmeta->dbmeta.type = P_BTREEMETA;
+
+ newmeta->dbmeta.version = 7;
+ /* Replace the unique ID. */
+ if ((ret = __os_fileid(env, real_name, 1, buf + 36)) != 0)
+ return (ret);
+
+ newmeta->root = 1;
+
+ return (0);
+}
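
The move-from-the-end-up rule generalizes to any in-place layout change
where a field grows: everything after it must shift before the field is
rewritten. A small sketch under that assumption (generic buffer arithmetic,
not the BTMETA layouts; the caller guarantees the buffer can hold the grown
record):

	#include <string.h>

	/*
	 * Grow a field inside a packed record, in place.  memmove (not
	 * memcpy) because source and destination overlap; the tail is
	 * shifted before the grown field is initialized.
	 */
	static void
	grow_field(unsigned char *rec, size_t off, size_t oldsz,
	    size_t newsz, size_t reclen)	/* reclen: bytes in use */
	{
		memmove(rec + off + newsz, rec + off + oldsz,
		    reclen - off - oldsz);
		memset(rec + off + oldsz, 0, newsz - oldsz);
	}
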
+
+/*
+ * __bam_31_btreemeta --
+ * Upgrade the database from version 7 to version 8.
+ *
+ * PUBLIC: int __bam_31_btreemeta
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BTMETA30 *oldmeta;
+ BTMETA31 *newmeta;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(real_name, NULL);
+ COMPQUIET(fhp, NULL);
+
+ newmeta = (BTMETA31 *)h;
+ oldmeta = (BTMETA30 *)h;
+
+ /*
+	 * Copy the affected fields down the page.  The fields may
+	 * overlap each other, so we start at the bottom and use memmove.
+ */
+ newmeta->root = oldmeta->root;
+ newmeta->re_pad = oldmeta->re_pad;
+ newmeta->re_len = oldmeta->re_len;
+ newmeta->minkey = oldmeta->minkey;
+ newmeta->maxkey = oldmeta->maxkey;
+ memmove(newmeta->dbmeta.uid,
+ oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+ newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+ newmeta->dbmeta.record_count = 0;
+ newmeta->dbmeta.key_count = 0;
+ ZERO_LSN(newmeta->dbmeta.unused3);
+
+ /* Set the version number. */
+ newmeta->dbmeta.version = 8;
+
+ /* Upgrade the flags. */
+ if (LF_ISSET(DB_DUPSORT))
+ F_SET(&newmeta->dbmeta, BTM_DUPSORT);
+
+ *dirtyp = 1;
+ return (0);
+}
+
+/*
+ * __bam_31_lbtree --
+ * Upgrade the database btree leaf pages.
+ *
+ * PUBLIC: int __bam_31_lbtree
+ * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp)
+ DB *dbp;
+ char *real_name;
+ u_int32_t flags;
+ DB_FH *fhp;
+ PAGE *h;
+ int *dirtyp;
+{
+ BKEYDATA *bk;
+ db_pgno_t pgno;
+ db_indx_t indx;
+ int ret;
+
+ ret = 0;
+ for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) {
+ bk = GET_BKEYDATA(dbp, h, indx);
+ if (B_TYPE(bk->type) == B_DUPLICATE) {
+ pgno = GET_BOVERFLOW(dbp, h, indx)->pgno;
+ if ((ret = __db_31_offdup(dbp, real_name, fhp,
+ LF_ISSET(DB_DUPSORT) ? 1 : 0, &pgno)) != 0)
+ break;
+ if (pgno != GET_BOVERFLOW(dbp, h, indx)->pgno) {
+ *dirtyp = 1;
+ GET_BOVERFLOW(dbp, h, indx)->pgno = pgno;
+ }
+ }
+ }
+
+ return (ret);
+}
diff --git a/btree/bt_utils.c b/btree/bt_utils.c
deleted file mode 100644
index 9c1438e..0000000
--- a/btree/bt_utils.c
+++ /dev/null
@@ -1,260 +0,0 @@
-/*-
- * Copyright (c) 1990, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_utils.c 8.8 (Berkeley) 7/20/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/param.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <db.h>
-#include "btree.h"
-
-/*
- * __bt_ret --
- * Build return key/data pair.
- *
- * Parameters:
- * t: tree
- * e: key/data pair to be returned
- * key: user's key structure (NULL if not to be filled in)
- * rkey: memory area to hold key
- * data: user's data structure (NULL if not to be filled in)
- * rdata: memory area to hold data
- * copy: always copy the key/data item
- *
- * Returns:
- * RET_SUCCESS, RET_ERROR.
- */
-int
-__bt_ret(t, e, key, rkey, data, rdata, copy)
- BTREE *t;
- EPG *e;
- DBT *key, *rkey, *data, *rdata;
- int copy;
-{
- BLEAF *bl;
- void *p;
-
- bl = GETBLEAF(e->page, e->index);
-
- /*
-	 * We must copy big keys/data to make them contiguous.  Otherwise,
- * leave the page pinned and don't copy unless the user specified
- * concurrent access.
- */
- if (key == NULL)
- goto dataonly;
-
- if (bl->flags & P_BIGKEY) {
- if (__ovfl_get(t, bl->bytes,
- &key->size, &rkey->data, &rkey->size))
- return (RET_ERROR);
- key->data = rkey->data;
- } else if (copy || F_ISSET(t, B_DB_LOCK)) {
- if (bl->ksize > rkey->size) {
- p = (void *)(rkey->data == NULL ?
- malloc(bl->ksize) : realloc(rkey->data, bl->ksize));
- if (p == NULL)
- return (RET_ERROR);
- rkey->data = p;
- rkey->size = bl->ksize;
- }
- memmove(rkey->data, bl->bytes, bl->ksize);
- key->size = bl->ksize;
- key->data = rkey->data;
- } else {
- key->size = bl->ksize;
- key->data = bl->bytes;
- }
-
-dataonly:
- if (data == NULL)
- return (RET_SUCCESS);
-
- if (bl->flags & P_BIGDATA) {
- if (__ovfl_get(t, bl->bytes + bl->ksize,
- &data->size, &rdata->data, &rdata->size))
- return (RET_ERROR);
- data->data = rdata->data;
- } else if (copy || F_ISSET(t, B_DB_LOCK)) {
- /* Use +1 in case the first record retrieved is 0 length. */
- if (bl->dsize + 1 > rdata->size) {
- p = (void *)(rdata->data == NULL ?
- malloc(bl->dsize + 1) :
- realloc(rdata->data, bl->dsize + 1));
- if (p == NULL)
- return (RET_ERROR);
- rdata->data = p;
- rdata->size = bl->dsize + 1;
- }
- memmove(rdata->data, bl->bytes + bl->ksize, bl->dsize);
- data->size = bl->dsize;
- data->data = rdata->data;
- } else {
- data->size = bl->dsize;
- data->data = bl->bytes + bl->ksize;
- }
-
- return (RET_SUCCESS);
-}
-
-/*
- * __BT_CMP -- Compare a key to a given record.
- *
- * Parameters:
- * t: tree
- * k1: DBT pointer of first arg to comparison
- * e: pointer to EPG for comparison
- *
- * Returns:
- * < 0 if k1 is < record
- * = 0 if k1 is = record
- * > 0 if k1 is > record
- */
-int
-__bt_cmp(t, k1, e)
- BTREE *t;
- const DBT *k1;
- EPG *e;
-{
- BINTERNAL *bi;
- BLEAF *bl;
- DBT k2;
- PAGE *h;
- void *bigkey;
-
- /*
- * The left-most key on internal pages, at any level of the tree, is
- * guaranteed by the following code to be less than any user key.
- * This saves us from having to update the leftmost key on an internal
- * page when the user inserts a new key in the tree smaller than
- * anything we've yet seen.
- */
- h = e->page;
- if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & P_BLEAF))
- return (1);
-
- bigkey = NULL;
- if (h->flags & P_BLEAF) {
- bl = GETBLEAF(h, e->index);
- if (bl->flags & P_BIGKEY)
- bigkey = bl->bytes;
- else {
- k2.data = bl->bytes;
- k2.size = bl->ksize;
- }
- } else {
- bi = GETBINTERNAL(h, e->index);
- if (bi->flags & P_BIGKEY)
- bigkey = bi->bytes;
- else {
- k2.data = bi->bytes;
- k2.size = bi->ksize;
- }
- }
-
- if (bigkey) {
- if (__ovfl_get(t, bigkey,
- &k2.size, &t->bt_rdata.data, &t->bt_rdata.size))
- return (RET_ERROR);
- k2.data = t->bt_rdata.data;
- }
- return ((*t->bt_cmp)(k1, &k2));
-}
-
-/*
- * __BT_DEFCMP -- Default comparison routine.
- *
- * Parameters:
- * a: DBT #1
- * b: DBT #2
- *
- * Returns:
- * < 0 if a is < b
- * = 0 if a is = b
- * > 0 if a is > b
- */
-int
-__bt_defcmp(a, b)
- const DBT *a, *b;
-{
- register size_t len;
- register u_char *p1, *p2;
-
- /*
- * XXX
- * If a size_t doesn't fit in an int, this routine can lose.
-	 * What we need is an integral type which is guaranteed to be
- * larger than a size_t, and there is no such thing.
- */
- len = MIN(a->size, b->size);
- for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2)
- if (*p1 != *p2)
- return ((int)*p1 - (int)*p2);
- return ((int)a->size - (int)b->size);
-}
-
-/*
- * __BT_DEFPFX -- Default prefix routine.
- *
- * Parameters:
- * a: DBT #1
- * b: DBT #2
- *
- * Returns:
- * Number of bytes needed to distinguish b from a.
- */
-size_t
-__bt_defpfx(a, b)
- const DBT *a, *b;
-{
- register u_char *p1, *p2;
- register size_t cnt, len;
-
- cnt = 1;
- len = MIN(a->size, b->size);
- for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt)
- if (*p1 != *p2)
- return (cnt);
-
- /* a->size must be <= b->size, or they wouldn't be in this order. */
- return (a->size < b->size ? a->size + 1 : a->size);
-}
diff --git a/btree/bt_verify.c b/btree/bt_verify.c
new file mode 100644
index 0000000..1c561d2
--- /dev/null
+++ b/btree/bt_verify.c
@@ -0,0 +1,2746 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_verify.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *,
+ PAGE *, u_int32_t, int, DBT *, int *));
+static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *,
+ BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *),
+ u_int32_t));
+static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ db_indx_t *, u_int32_t));
+
+/*
+ * __bam_vrfy_meta --
+ * Verify the btree-specific part of a metadata page.
+ *
+ * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: db_pgno_t, u_int32_t));
+ */
+int
+__bam_vrfy_meta(dbp, vdp, meta, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *meta;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int isbad, t_ret, ret;
+ db_indx_t ovflsize;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ /*
+	 * If VRFY_INCOMPLETE is not set, then we didn't come through
+	 * __db_vrfy_pagezero, so this page hasn't been checked at all;
+	 * call __db_vrfy_meta here to verify the common fields.
+ *
+ * If VRFY_INCOMPLETE is set, we've already done all the same work
+ * in __db_vrfy_pagezero, so skip the check.
+ */
+ if (!F_ISSET(pip, VRFY_INCOMPLETE) &&
+ (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /* bt_minkey: must be >= 2; must produce sensible ovflsize */
+
+ /* avoid division by zero */
+ ovflsize = meta->minkey > 0 ?
+ B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0;
+
+ if (meta->minkey < 2 ||
+ ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) {
+ pip->bt_minkey = 0;
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: nonsensical bt_minkey value %lu on metadata page",
+ (u_long)pgno, (u_long)meta->minkey));
+ } else
+ pip->bt_minkey = meta->minkey;
+
+ /* re_len: no constraints on this (may be zero or huge--we make rope) */
+ pip->re_pad = meta->re_pad;
+ pip->re_len = meta->re_len;
+
+ /*
+	 * The root must not be the current page or page 0, and it must be
+	 * within the database.  If this metadata page is the master metadata
+	 * page of the file, then the root page had better be page 1.
+ */
+ pip->root = 0;
+ if (meta->root == PGNO_INVALID ||
+ meta->root == pgno || !IS_VALID_PGNO(meta->root) ||
+ (pgno == PGNO_BASE_MD && meta->root != 1)) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: nonsensical root page %lu on metadata page",
+ (u_long)pgno, (u_long)meta->root));
+ } else
+ pip->root = meta->root;
+
+ /* Flags. */
+ if (F_ISSET(&meta->dbmeta, BTM_RENUMBER))
+ F_SET(pip, VRFY_IS_RRECNO);
+
+ if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) {
+ /*
+ * If this is a master db meta page, it had better not have
+ * duplicates.
+ */
+ if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) {
+ isbad = 1;
+ EPRINT((env,
+"Page %lu: Btree metadata page has both duplicates and multiple databases",
+ (u_long)pgno));
+ }
+ F_SET(pip, VRFY_HAS_SUBDBS);
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_DUP))
+ F_SET(pip, VRFY_HAS_DUPS);
+ if (F_ISSET(&meta->dbmeta, BTM_DUPSORT))
+ F_SET(pip, VRFY_HAS_DUPSORT);
+ if (F_ISSET(&meta->dbmeta, BTM_RECNUM))
+ F_SET(pip, VRFY_HAS_RECNUMS);
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env,
+ "Page %lu: Btree metadata page illegally has both recnums and dups",
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_RECNO)) {
+ F_SET(pip, VRFY_IS_RECNO);
+ dbp->type = DB_RECNO;
+ } else if (F_ISSET(pip, VRFY_IS_RRECNO)) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: metadata page has renumber flag set but is not recno",
+ (u_long)pgno));
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (F_ISSET(&meta->dbmeta, BTM_COMPRESS)) {
+ F_SET(pip, VRFY_HAS_COMPRESS);
+ if (!DB_IS_COMPRESSED(dbp)) {
+ ((BTREE *)dbp->bt_internal)->bt_compress =
+ __bam_defcompress;
+ ((BTREE *)dbp->bt_internal)->bt_decompress =
+ __bam_defdecompress;
+ }
+ /*
+ * Copy dup_compare to compress_dup_compare, and use the
+ * compression duplicate compare.
+ */
+ if (F_ISSET(pip, VRFY_HAS_DUPSORT)) {
+ if (dbp->dup_compare == NULL)
+ dbp->dup_compare = __bam_defcmp;
+ if (((BTREE *)dbp->bt_internal)->compress_dup_compare
+ == NULL) {
+ ((BTREE *)dbp->bt_internal)->
+ compress_dup_compare = dbp->dup_compare;
+ dbp->dup_compare = __bam_compress_dupcmp;
+ }
+ }
+ }
+
+ if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_COMPRESS)) {
+ EPRINT((env,
+ "Page %lu: Btree metadata page illegally has both recnums and compression",
+ (u_long)pgno));
+ isbad = 1;
+ }
+ if (F_ISSET(pip, VRFY_HAS_DUPS) && !F_ISSET(pip, VRFY_HAS_DUPSORT) &&
+ F_ISSET(pip, VRFY_HAS_COMPRESS)) {
+ EPRINT((env,
+	"Page %lu: Btree metadata page illegally has both unsorted duplicates and compression",
+		    (u_long)pgno));
+ isbad = 1;
+ }
+#endif
+
+ if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env,
+ "Page %lu: recno metadata page specifies duplicates",
+ (u_long)pgno));
+ isbad = 1;
+ }
+
+ if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN))
+ F_SET(pip, VRFY_IS_FIXEDLEN);
+ else if (pip->re_len > 0) {
+ /*
+ * It's wrong to have an re_len if it's not a fixed-length
+ * database
+ */
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: re_len of %lu in non-fixed-length database",
+ (u_long)pgno, (u_long)pip->re_len));
+ }
+
+ /*
+ * We do not check that the rest of the page is 0, because it may
+ * not be and may still be correct.
+ */
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (LF_ISSET(DB_SALVAGE) &&
+ (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
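+
+/*
+ * All of the verifiers in this file share the error-folding convention
+ * used above: recoverable corruption sets "isbad" and verification
+ * continues, a hard failure jumps to err, and the return value folds
+ * the two together.  A minimal sketch of the pattern, where check_fn()
+ * is a hypothetical stand-in for any individual test:
+ */
+#if 0
+	isbad = 0;
+	if ((ret = check_fn()) != 0) {
+		if (ret == DB_VERIFY_BAD)
+			isbad = 1;	/* Note the damage, keep going. */
+		else
+			goto err;	/* Hard failure, stop now. */
+	}
+err:	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+#endif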
+
+/*
+ * __ram_vrfy_leaf --
+ * Verify a recno leaf page.
+ *
+ * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__ram_vrfy_leaf(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ int ret, t_ret, isbad;
+ u_int32_t re_len_guess, len;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_LRECNO) {
+ ret = __db_unknown_path(env, "__ram_vrfy_leaf");
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD;
+ * further checks are dangerous.
+ */
+ if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ EPRINT((env,
+ "Page %lu: Recno database has dups", (u_long)pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Walk through inp and see if the lengths of all the records are the
+ * same--if so, this may be a fixed-length database, and we want to
+ * save off this value. We know inp to be safe if we've gotten this
+ * far.
+ */
+ re_len_guess = 0;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ bk = GET_BKEYDATA(dbp, h, i);
+ /* KEYEMPTY. Go on. */
+ if (B_DISSET(bk->type))
+ continue;
+ if (bk->type == B_OVERFLOW)
+ len = ((BOVERFLOW *)bk)->tlen;
+ else if (bk->type == B_KEYDATA)
+ len = bk->len;
+ else {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: nonsensical type for item %lu",
+ (u_long)pgno, (u_long)i));
+ continue;
+ }
+ if (re_len_guess == 0)
+ re_len_guess = len;
+
+ /*
+ * Is this item's len the same as the last one's? If not,
+ * reset to 0 and break--we don't have a single re_len.
+ * Otherwise, go on to the next item.
+ */
+ if (re_len_guess != len) {
+ re_len_guess = 0;
+ break;
+ }
+ }
+ pip->re_len = re_len_guess;
+
+ /* Save off record count. */
+ pip->rec_cnt = NUM_ENT(h);
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
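+
+/*
+ * The re_len detection above reduces to: if every record length on the
+ * page is the same, that common length is the guess; any mismatch (or an
+ * empty page) leaves the guess at zero.  A sketch of the same logic over
+ * a hypothetical array of item lengths (len[] and nitems are stand-ins):
+ */
+#if 0
+	guess = 0;
+	for (i = 0; i < nitems; i++) {
+		if (guess == 0)
+			guess = len[i];
+		if (guess != len[i]) {
+			guess = 0;	/* Mixed lengths: not fixed-length. */
+			break;
+		}
+	}
+#endif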
+
+/*
+ * __bam_vrfy --
+ * Verify a btree leaf or internal page.
+ *
+ * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_vrfy(dbp, vdp, h, pgno, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t flags;
+{
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+
+ env = dbp->env;
+ isbad = 0;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LDUP:
+ break;
+ default:
+ ret = __db_unknown_path(env, "__bam_vrfy");
+ goto err;
+ }
+
+ /*
+ * Verify (and, if relevant, save off) page fields common to
+ * all PAGEs.
+ */
+ if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+ /*
+ * The record count is, on internal pages, stored in an overloaded
+ * next_pgno field. Save it off; we'll verify it when we check
+ * overall database structure. We could overload the field
+ * in VRFY_PAGEINFO, too, but this seems gross, and space
+ * is not at such a premium.
+ */
+ pip->rec_cnt = RE_NREC(h);
+
+ /*
+ * Verify inp[].
+ */
+ if (TYPE(h) == P_IRECNO) {
+ if ((ret = __ram_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0)
+ goto err;
+ } else if ((ret = __bam_vrfy_inp(dbp,
+ vdp, h, pgno, &pip->entries, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ EPRINT((env,
+ "Page %lu: item order check unsafe: skipping",
+ (u_long)pgno));
+ } else if (!LF_ISSET(DB_NOORDERCHK) && (ret =
+ __bam_vrfy_itemorder(dbp,
+ vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) {
+ /*
+ * We know that the elements of inp are reasonable.
+ *
+ * Check that elements fall in the proper order.
+ */
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __ram_vrfy_inp --
+ * Verify that all entries in a P_IRECNO inp[] array are reasonable,
+ * and count them. Note that P_LRECNO uses __bam_vrfy_inp;
+ * P_IRECNOs are a special, and simpler, case, since they have
+ * RINTERNALs rather than BKEYDATA/BINTERNALs.
+ */
+static int
+__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ ENV *env;
+ RINTERNAL *ri;
+ VRFY_CHILDINFO child;
+ VRFY_PAGEINFO *pip;
+ int ret, t_ret, isbad;
+ u_int32_t himark, i, offset, nentries;
+ db_indx_t *inp;
+ u_int8_t *pagelayout, *p;
+
+ env = dbp->env;
+ isbad = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ nentries = 0;
+ pagelayout = NULL;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ if (TYPE(h) != P_IRECNO) {
+ ret = __db_unknown_path(env, "__ram_vrfy_inp");
+ goto err;
+ }
+
+ himark = dbp->pgsize;
+ if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0)
+ goto err;
+ memset(pagelayout, 0, dbp->pgsize);
+ inp = P_INP(dbp, h);
+ for (i = 0; i < NUM_ENT(h); i++) {
+ if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) {
+ EPRINT((env,
+ "Page %lu: entries listing %lu overlaps data",
+ (u_long)pgno, (u_long)i));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ offset = inp[i];
+ /*
+ * Check that the item offset is reasonable: it points
+ * somewhere after the inp array and before the end of the
+ * page.
+ */
+ if (offset <= (u_int32_t)((u_int8_t *)inp + i -
+ (u_int8_t *)h) ||
+ offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: bad offset %lu at index %lu",
+ (u_long)pgno, (u_long)offset, (u_long)i));
+ continue;
+ }
+
+ /* Update the high-water mark (what HOFFSET should be) */
+ if (offset < himark)
+ himark = offset;
+
+ nentries++;
+
+ /* Make sure this RINTERNAL is not multiply referenced. */
+ ri = GET_RINTERNAL(dbp, h, i);
+ if (pagelayout[offset] == 0) {
+ pagelayout[offset] = 1;
+ child.pgno = ri->pgno;
+ child.type = V_RECNO;
+ child.nrecs = ri->nrecs;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ } else {
+ EPRINT((env,
+ "Page %lu: RINTERNAL structure at offset %lu referenced twice",
+ (u_long)pgno, (u_long)offset));
+ isbad = 1;
+ }
+ }
+
+ for (p = pagelayout + himark;
+ p < pagelayout + dbp->pgsize;
+ p += RINTERNAL_SIZE)
+ if (*p != 1) {
+ EPRINT((env,
+ "Page %lu: gap between items at offset %lu",
+ (u_long)pgno, (u_long)(p - pagelayout)));
+ isbad = 1;
+ }
+
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((env,
+ "Page %lu: bad HOFFSET %lu, appears to be %lu",
+ (u_long)pgno, (u_long)(HOFFSET(h)), (u_long)himark));
+ isbad = 1;
+ }
+
+ *nentriesp = nentries;
+
+err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (pagelayout != NULL)
+ __os_free(env, pagelayout);
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
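+
+/*
+ * The pagelayout byte map used above is the heart of this check: every
+ * offset referenced from inp[] is marked, so a second reference to the
+ * same offset is caught immediately, and because RINTERNALs are fixed
+ * size, one pass from the high-water mark to the end of the page finds
+ * any gaps.  In outline (offsets[], nent and complain() are hypothetical
+ * stand-ins for the decoded inp[] array and for EPRINT):
+ */
+#if 0
+	memset(pagelayout, 0, pgsize);
+	for (i = 0; i < nent; i++)
+		if (pagelayout[offsets[i]]++ != 0)
+			complain("referenced twice", offsets[i]);
+	for (off = himark; off < pgsize; off += RINTERNAL_SIZE)
+		if (pagelayout[off] != 1)
+			complain("gap between items", off);
+#endif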
+
+typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM;
+
+/*
+ * __bam_vrfy_inp --
+ * Verify that all entries in inp[] array are reasonable;
+ * count them.
+ */
+static int
+__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ db_pgno_t pgno;
+ db_indx_t *nentriesp;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ ENV *env;
+ VRFY_CHILDINFO child;
+ VRFY_ITEM *pagelayout;
+ VRFY_PAGEINFO *pip;
+ u_int32_t himark, offset; /*
+ * These would be db_indx_ts
+ * but for alignment.
+ */
+ u_int32_t i, endoff, nentries;
+ int isbad, initem, isdupitem, ret, t_ret;
+
+ env = dbp->env;
+ isbad = isdupitem = 0;
+ nentries = 0;
+ memset(&child, 0, sizeof(VRFY_CHILDINFO));
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LBTREE:
+ case P_LDUP:
+ case P_LRECNO:
+ break;
+ default:
+ /*
+ * In the salvager, we might call this from a page which
+ * we merely suspect is a btree page. Otherwise, it
+ * shouldn't get called--if it is, that's a verifier bug.
+ */
+ if (LF_ISSET(DB_SALVAGE))
+ break;
+ ret = __db_unknown_path(env, "__bam_vrfy_inp");
+ goto err;
+ }
+
+ /*
+ * Loop through inp[], the array of items, until we either
+ * run out of entries or collide with the data. Keep track
+ * of h_offset in himark.
+ *
+ * For each element in inp[i], make sure it references a region
+ * that starts after the end of the inp array (as defined by
+ * NUM_ENT(h)), ends before the beginning of the page, doesn't
+ * overlap any other regions, and doesn't have a gap between
+ * it and the region immediately after it.
+ */
+ himark = dbp->pgsize;
+ if ((ret = __os_calloc(
+ env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0)
+ goto err;
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (ret = __db_vrfy_inpitem(dbp,
+ h, pgno, i, 1, flags, &himark, &offset)) {
+ case 0:
+ break;
+ case DB_VERIFY_BAD:
+ isbad = 1;
+ continue;
+ case DB_VERIFY_FATAL:
+ isbad = 1;
+ goto err;
+ default:
+ DB_ASSERT(env, ret != 0);
+ break;
+ }
+
+ /*
+ * We now have a plausible beginning for the item, and we know
+ * its length is safe.
+ *
+ * Mark the beginning and end in pagelayout so we can make sure
+ * items have no overlaps or gaps.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (pagelayout[offset] == VRFY_ITEM_NOTSET)
+ pagelayout[offset] = VRFY_ITEM_BEGIN;
+ else if (pagelayout[offset] == VRFY_ITEM_BEGIN) {
+ /*
+ * Having two inp entries that point at the same patch
+ * of page is legal if and only if the page is
+ * a btree leaf and they're onpage duplicate keys--
+ * that is, if (i % P_INDX) == 0.
+ */
+ if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) {
+ /* Flag for later. */
+ F_SET(pip, VRFY_HAS_DUPS);
+
+ /* Bump up nentries so we don't undercount. */
+ nentries++;
+
+ /*
+ * We'll check to make sure the end is
+ * equal, too.
+ */
+ isdupitem = 1;
+ } else {
+ isbad = 1;
+ EPRINT((env, "Page %lu: duplicated item %lu",
+ (u_long)pgno, (u_long)i));
+ }
+ }
+
+ /*
+ * Mark the end. Its location varies with the page type
+ * and the item type.
+ *
+	 * If the end already has a mark other than 0, do nothing--
+ * it's an overlap that we'll catch later.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ if (TYPE(h) == P_IBTREE)
+ /* It's a BINTERNAL. */
+ endoff = offset + BINTERNAL_SIZE(bk->len) - 1;
+ else
+ endoff = offset + BKEYDATA_SIZE(bk->len) - 1;
+ break;
+ case B_DUPLICATE:
+ /*
+ * Flag that we have dups; we'll check whether
+ * that's okay during the structure check.
+ */
+ F_SET(pip, VRFY_HAS_DUPS);
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ /*
+ * Overflow entries on internal pages are stored
+ * as the _data_ of a BINTERNAL; overflow entries
+ * on leaf pages are stored as the entire entry.
+ */
+ endoff = offset +
+ ((TYPE(h) == P_IBTREE) ?
+ BINTERNAL_SIZE(BOVERFLOW_SIZE) :
+ BOVERFLOW_SIZE) - 1;
+ break;
+ default:
+ /*
+ * We'll complain later; for now, just mark
+ * a minimum.
+ */
+ endoff = offset + BKEYDATA_SIZE(0) - 1;
+ break;
+ }
+
+ /*
+ * If this is an onpage duplicate key we've seen before,
+ * the end had better coincide too.
+ */
+ if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) {
+ EPRINT((env, "Page %lu: duplicated item %lu",
+ (u_long)pgno, (u_long)i));
+ isbad = 1;
+ } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET)
+ pagelayout[endoff] = VRFY_ITEM_END;
+ isdupitem = 0;
+
+ /*
+ * There should be no deleted items in a quiescent tree,
+ * except in recno.
+ */
+ if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
+ isbad = 1;
+ EPRINT((env, "Page %lu: item %lu marked deleted",
+ (u_long)pgno, (u_long)i));
+ }
+
+ /*
+ * Check the type and such of bk--make sure it's reasonable
+ * for the pagetype.
+ */
+ switch (B_TYPE(bk->type)) {
+ case B_KEYDATA:
+ /*
+ * This is a normal, non-overflow BKEYDATA or BINTERNAL.
+ * The only thing to check is the len, and that's
+ * already been done.
+ */
+ break;
+ case B_DUPLICATE:
+ if (TYPE(h) == P_IBTREE) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: duplicate page referenced by internal btree page at item %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ } else if (TYPE(h) == P_LRECNO) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: duplicate page referenced by recno page at item %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+ /* FALLTHROUGH */
+ case B_OVERFLOW:
+ bo = (TYPE(h) == P_IBTREE) ?
+ (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
+ (BOVERFLOW *)bk;
+
+ if (B_TYPE(bk->type) == B_OVERFLOW)
+ /* Make sure tlen is reasonable. */
+ if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: impossible tlen %lu, item %lu",
+ (u_long)pgno,
+ (u_long)bo->tlen, (u_long)i));
+ /* Don't save as a child. */
+ break;
+ }
+
+ if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
+ bo->pgno == PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: offpage item %lu has bad pgno %lu",
+ (u_long)pgno, (u_long)i, (u_long)bo->pgno));
+ /* Don't save as a child. */
+ break;
+ }
+
+ child.pgno = bo->pgno;
+ child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
+ V_OVERFLOW : V_DUPLICATE);
+ child.tlen = bo->tlen;
+ if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+ goto err;
+ break;
+ default:
+ isbad = 1;
+ EPRINT((env, "Page %lu: item %lu of invalid type %lu",
+ (u_long)pgno, (u_long)i, (u_long)B_TYPE(bk->type)));
+ break;
+ }
+ }
+
+ /*
+ * Now, loop through and make sure the items are contiguous and
+ * non-overlapping.
+ */
+ initem = 0;
+ for (i = himark; i < dbp->pgsize; i++)
+ if (initem == 0)
+ switch (pagelayout[i]) {
+ case VRFY_ITEM_NOTSET:
+ /* May be just for alignment. */
+ if (i != DB_ALIGN(i, sizeof(u_int32_t)))
+ continue;
+
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: gap between items at offset %lu",
+ (u_long)pgno, (u_long)i));
+ /* Find the end of the gap */
+				for (; (size_t)(i + 1) < dbp->pgsize &&
+				    pagelayout[i + 1] == VRFY_ITEM_NOTSET; i++)
+ ;
+ break;
+ case VRFY_ITEM_BEGIN:
+ /* We've found an item. Check its alignment. */
+ if (i != DB_ALIGN(i, sizeof(u_int32_t))) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: offset %lu unaligned",
+ (u_long)pgno, (u_long)i));
+ }
+ initem = 1;
+ nentries++;
+ break;
+ case VRFY_ITEM_END:
+ /*
+ * We've hit the end of an item even though
+ * we don't think we're in one; must
+ * be an overlap.
+ */
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: overlapping items at offset %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+ else
+ switch (pagelayout[i]) {
+ case VRFY_ITEM_NOTSET:
+ /* In the middle of an item somewhere. Okay. */
+ break;
+ case VRFY_ITEM_END:
+ /* End of an item; switch to out-of-item mode.*/
+ initem = 0;
+ break;
+ case VRFY_ITEM_BEGIN:
+ /*
+ * Hit a second item beginning without an
+ * end. Overlap.
+ */
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: overlapping items at offset %lu",
+ (u_long)pgno, (u_long)i));
+ break;
+ }
+
+ __os_free(env, pagelayout);
+
+ /* Verify HOFFSET. */
+ if ((db_indx_t)himark != HOFFSET(h)) {
+ EPRINT((env, "Page %lu: bad HOFFSET %lu, appears to be %lu",
+ (u_long)pgno, (u_long)HOFFSET(h), (u_long)himark));
+ isbad = 1;
+ }
+
+err: if (nentriesp != NULL)
+ *nentriesp = nentries;
+
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret);
+}
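+
+/*
+ * Because btree items are variable length, the check above can't walk
+ * the page in fixed-size steps the way the recno version does.  Instead
+ * it marks each item's first and last byte in pagelayout[] and scans the
+ * map with a one-bit state machine: a BEGIN while inside an item, or an
+ * END while outside one, is an overlap.  The core of the scan, with gap
+ * detection elided and complain() a hypothetical stand-in for EPRINT:
+ */
+#if 0
+	initem = 0;
+	for (i = himark; i < pgsize; i++)
+		if (initem == 0 && pagelayout[i] == VRFY_ITEM_BEGIN)
+			initem = 1;		/* Entered an item. */
+		else if (initem == 0 && pagelayout[i] == VRFY_ITEM_END)
+			complain("overlap", i);	/* End with no begin. */
+		else if (initem == 1 && pagelayout[i] == VRFY_ITEM_END)
+			initem = 0;		/* Left the item. */
+		else if (initem == 1 && pagelayout[i] == VRFY_ITEM_BEGIN)
+			complain("overlap", i);	/* Begin inside an item. */
+#endif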
+
+/*
+ * __bam_vrfy_itemorder --
+ * Make sure the items on a page sort correctly.
+ *
+ * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are
+ * reasonable; be sure that __bam_vrfy_inp has been called first.
+ *
+ * If ovflok is set, it also assumes that overflow page chains
+ * hanging off the current page have been sanity-checked, and so we
+ * can use __bam_cmp to verify their ordering. If it is not set,
+ * and we run into an overflow page, carp and return DB_VERIFY_BAD;
+ * we shouldn't be called if any exist.
+ *
+ * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *,
+ * PUBLIC: PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t));
+ */
+int
+__bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ db_pgno_t pgno;
+ u_int32_t nentries;
+ int ovflok, hasdups;
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ BTREE *bt;
+ DBC *dbc;
+ DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp;
+ ENV *env;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i, *inp;
+ int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret;
+ int (*dupfunc) __P((DB *, const DBT *, const DBT *));
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ void *buf1, *buf2, *tmpbuf;
+
+ /*
+ * We need to work in the ORDERCHKONLY environment where we might
+ * not have a pip, but we also may need to work in contexts where
+ * NUM_ENT isn't safe.
+ */
+ if (vdp != NULL) {
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+ nentries = pip->entries;
+ } else
+ pip = NULL;
+
+ env = dbp->env;
+ ret = isbad = 0;
+ bo = NULL; /* Shut up compiler. */
+
+ memset(&dbta, 0, sizeof(DBT));
+ F_SET(&dbta, DB_DBT_REALLOC);
+
+ memset(&dbtb, 0, sizeof(DBT));
+ F_SET(&dbtb, DB_DBT_REALLOC);
+
+ buf1 = buf2 = NULL;
+
+ DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK));
+
+ dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare;
+ if (TYPE(h) == P_LDUP)
+ func = dupfunc;
+ else {
+ func = __bam_defcmp;
+ if (dbp->bt_internal != NULL) {
+ bt = (BTREE *)dbp->bt_internal;
+ if (bt->bt_compare != NULL)
+ func = bt->bt_compare;
+ }
+ }
+
+ /*
+ * We alternate our use of dbta and dbtb so that we can walk
+ * through the page key-by-key without copying a dbt twice.
+ * p1 is always the dbt for index i - 1, and p2 for index i.
+ * Reset the data pointers in case we are retrying.
+ */
+retry: p1 = &dbta;
+ p1->data = NULL;
+ p2 = &dbtb;
+ p2->data = NULL;
+
+ /*
+ * Loop through the entries. nentries ought to contain the
+ * actual count, and so is a safe way to terminate the loop; whether
+ * we inc. by one or two depends on whether we're a leaf page--
+	 * we increment by one or two depends on whether we're a leaf page--
+ * and LDUP pages, we want to check the order of all entries.
+ *
+ * Note that on IBTREE pages or the index page of a partitioned
+ * database, we start with item 1, since item 0 doesn't get looked
+ * at by __bam_cmp.
+ */
+ inp = P_INP(dbp, h);
+ adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX;
+ for (i = (TYPE(h) == P_IBTREE || dbp->p_internal != NULL) ? adj : 0;
+ i < nentries; i += adj) {
+ /*
+ * Put key i-1, now in p2, into p1, by swapping DBTs and bufs.
+ */
+ tmp = p1;
+ p1 = p2;
+ p2 = tmp;
+ tmpbuf = buf1;
+ buf1 = buf2;
+ buf2 = tmpbuf;
+
+ /*
+ * Get key i into p2.
+ */
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, i);
+ if (B_TYPE(bi->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)(bi->data);
+ goto overflow;
+ } else {
+ p2->data = bi->data;
+ p2->size = bi->len;
+ }
+
+ /*
+ * The leftmost key on an internal page must be
+ * len 0, since it's just a placeholder and
+ * automatically sorts less than all keys.
+ *
+ * XXX
+ * This criterion does not currently hold!
+ * See todo list item #1686. Meanwhile, it's harmless
+ * to just not check for it.
+ */
+#if 0
+ if (i == 0 && bi->len != 0) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: lowest key on internal page of nonzero length",
+ (u_long)pgno));
+ }
+#endif
+ break;
+ case P_LBTREE:
+ case P_LDUP:
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ bo = (BOVERFLOW *)bk;
+ goto overflow;
+ } else {
+ p2->data = bk->data;
+ p2->size = bk->len;
+ }
+ break;
+ default:
+ /*
+ * This means our caller screwed up and sent us
+ * an inappropriate page.
+ */
+ ret = __db_unknown_path(env, "__bam_vrfy_itemorder");
+ goto err;
+ }
+
+ if (0) {
+ /*
+ * If ovflok != 1, we can't safely go chasing
+ * overflow pages with the normal routines now;
+ * they might be unsafe or nonexistent. Mark this
+ * page as incomplete and return.
+ *
+ * Note that we don't need to worry about freeing
+ * buffers, since they can't have been allocated
+ * if overflow items are unsafe.
+ */
+overflow: if (!ovflok) {
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * Overflow items are safe to chase. Do so.
+ * Fetch the overflow item into p2->data,
+ * NULLing it or reallocing it as appropriate.
+ *
+ * (We set p2->data to buf2 before the call
+ * so we're sure to realloc if we can and if p2
+ * was just pointing at a non-overflow item.)
+ */
+ p2->data = buf2;
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ goto err;
+ if ((ret = __db_goff(dbc,
+ p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: error %lu in fetching overflow item %lu",
+ (u_long)pgno, (u_long)ret, (u_long)i));
+ }
+ /* In case it got realloc'ed and thus changed. */
+ buf2 = p2->data;
+ }
+
+ /* Compare with the last key. */
+ if (p1->data != NULL && p2->data != NULL) {
+ cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2);
+
+ /* comparison succeeded */
+ if (cmp > 0) {
+ /*
+ * If we are looking at an internal page, we
+ * don't know whether it is part of the main
+ * database or in an off-page-duplicate tree.
+ * If the main comparator fails, retry with
+ * the duplicate comparator.
+ */
+ if (TYPE(h) == P_IBTREE && func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: out-of-order key at entry %lu",
+ (u_long)pgno, (u_long)i));
+ /* proceed */
+ } else if (cmp == 0) {
+ if (inp[i] != inp[i - adj]) {
+ /* See above. */
+ if (TYPE(h) == P_IBTREE &&
+ func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: non-dup dup key at entry %lu",
+ (u_long)pgno, (u_long)i));
+ }
+ /*
+ * If they compared equally, this
+ * had better be a (sub)database with dups.
+ * Mark it so we can check during the
+ * structure check.
+ */
+ if (pip != NULL)
+ F_SET(pip, VRFY_HAS_DUPS);
+ else if (hasdups == 0) {
+ /* See above. */
+ if (TYPE(h) == P_IBTREE &&
+ func != dupfunc) {
+ func = dupfunc;
+ goto retry;
+ }
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: database with no duplicates has duplicated keys",
+ (u_long)pgno));
+ }
+
+ /*
+ * If we're a btree leaf, check to see
+ * if the data items of these on-page dups are
+ * in sorted order. If not, flag this, so
+ * that we can make sure during the
+ * structure checks that the DUPSORT flag
+ * is unset.
+ *
+ * At this point i points to a duplicate key.
+ * Compare the datum before it (same key)
+ * to the datum after it, i.e. i-1 to i+1.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ /*
+ * Unsafe; continue and we'll pick
+ * up the bogus nentries later.
+ */
+ if (i + 1 >= (db_indx_t)nentries)
+ continue;
+
+ /*
+ * We don't bother with clever memory
+ * management with on-page dups,
+ * as it's only really a big win
+ * in the overflow case, and overflow
+ * dups are probably (?) rare.
+ */
+ if (((ret = __bam_safe_getdata(dbp,
+ ip, h, i - 1, ovflok,
+ &dup_1, &freedup_1)) != 0) ||
+ ((ret = __bam_safe_getdata(dbp,
+ ip, h, i + 1, ovflok,
+ &dup_2, &freedup_2)) != 0))
+ goto err;
+
+ /*
+ * If either of the data are NULL,
+ * it's because they're overflows and
+ * it's not safe to chase them now.
+ * Mark an incomplete and return.
+ */
+ if (dup_1.data == NULL ||
+ dup_2.data == NULL) {
+ DB_ASSERT(env, !ovflok);
+ F_SET(pip, VRFY_INCOMPLETE);
+ goto err;
+ }
+
+ /*
+ * If the dups are out of order,
+ * flag this. It's not an error
+ * until we do the structure check
+ * and see whether DUPSORT is set.
+ */
+ if (dupfunc(dbp, &dup_1, &dup_2) > 0)
+ F_SET(pip, VRFY_DUPS_UNSORTED);
+
+ if (freedup_1)
+ __os_ufree(env, dup_1.data);
+ if (freedup_2)
+ __os_ufree(env, dup_2.data);
+ }
+ }
+ }
+ }
+
+err: if (pip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0)
+ ret = t_ret;
+
+ if (buf1 != NULL)
+ __os_ufree(env, buf1);
+ if (buf2 != NULL)
+ __os_ufree(env, buf2);
+
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
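+
+/*
+ * The p1/p2 handling above is simple double buffering: each pass through
+ * the loop, the current key becomes the previous key by exchanging DBT
+ * pointers and their backing buffers, so every key is fetched from the
+ * page (or its overflow chain) exactly once.  The swap in isolation:
+ */
+#if 0
+	tmp = p1;		/* p1 takes what p2 held (key i - 1)... */
+	p1 = p2;
+	p2 = tmp;		/* ...and p2 is free to receive key i. */
+	tmpbuf = buf1;		/* Keep the malloc'd buffers paired with */
+	buf1 = buf2;		/* the DBTs that may point into them. */
+	buf2 = tmpbuf;
+#endif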
+
+/*
+ * __bam_vrfy_structure --
+ * Verify the tree structure of a btree database (including the master
+ * database containing subdbs).
+ *
+ * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC: void *, void *, u_int32_t));
+ */
+int
+__bam_vrfy_structure(dbp, vdp, meta_pgno, lp, rp, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t meta_pgno;
+ void *lp, *rp;
+ u_int32_t flags;
+{
+ DB *pgset;
+ ENV *env;
+ VRFY_PAGEINFO *mip, *rip;
+ db_pgno_t root, p;
+ int t_ret, ret;
+ u_int32_t nrecs, level, relen, stflags;
+
+ env = dbp->env;
+ mip = rip = 0;
+ pgset = vdp->pgset;
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
+ return (ret);
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, meta_pgno, (int *)&p)) != 0)
+ goto err;
+ if (p != 0) {
+ EPRINT((env,
+ "Page %lu: btree metadata page observed twice",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret =
+ __db_vrfy_pgset_inc(pgset, vdp->thread_info, meta_pgno)) != 0)
+ goto err;
+
+ root = mip->root;
+
+ if (root == 0) {
+ EPRINT((env,
+ "Page %lu: btree metadata page has no root",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
+ goto err;
+
+ switch (rip->type) {
+ case P_IBTREE:
+ case P_LBTREE:
+ stflags = flags | DB_ST_TOPLEVEL;
+ if (F_ISSET(mip, VRFY_HAS_DUPS))
+ stflags |= DB_ST_DUPOK;
+ if (F_ISSET(mip, VRFY_HAS_DUPSORT))
+ stflags |= DB_ST_DUPSORT;
+ if (F_ISSET(mip, VRFY_HAS_RECNUMS))
+ stflags |= DB_ST_RECNUM;
+ ret = __bam_vrfy_subtree(dbp,
+ vdp, root, lp, rp, stflags, NULL, NULL, NULL);
+ break;
+ case P_IRECNO:
+ case P_LRECNO:
+ stflags =
+ flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL;
+ if (mip->re_len > 0)
+ stflags |= DB_ST_RELEN;
+ if ((ret = __bam_vrfy_subtree(dbp, vdp,
+ root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
+ goto err;
+ /*
+		 * Even if mip->re_len > 0, relen may come back zero if the
+		 * tree is empty.  It is okay to just skip the check in that
+		 * case: if the tree contains any non-deleted keys at all,
+		 * relen will be nonzero.
+ */
+ if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
+ EPRINT((env,
+ "Page %lu: recno database has bad re_len %lu",
+ (u_long)meta_pgno, (u_long)relen));
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ ret = 0;
+ break;
+ case P_LDUP:
+ EPRINT((env,
+ "Page %lu: duplicate tree referenced from metadata page",
+ (u_long)meta_pgno));
+ ret = DB_VERIFY_BAD;
+ break;
+ default:
+ EPRINT((env,
+ "Page %lu: btree root of incorrect type %lu on metadata page",
+ (u_long)meta_pgno, (u_long)rip->type));
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+
+err: if (mip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0)
+ ret = t_ret;
+ if (rip != NULL && ((t_ret =
+ __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
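+
+/*
+ * The pgset calls above are how the verifier notices a page linked into
+ * the tree twice: each page visited bumps a per-page counter, so a
+ * nonzero count on arrival means some other path already claimed the
+ * page.  The idiom, reduced to its essentials:
+ */
+#if 0
+	if ((ret = __db_vrfy_pgset_get(pgset,
+	    vdp->thread_info, pgno, &p)) != 0)
+		goto err;
+	if (p != 0)
+		ret = DB_VERIFY_BAD;	/* Seen before: linked twice. */
+	else if ((ret =
+	    __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
+		goto err;		/* First visit: record it. */
+#endif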
+
+/*
+ * __bam_vrfy_subtree --
+ * Verify a subtree (or entire) btree with specified root.
+ *
+ * Note that this is public because it must be called to verify
+ * offpage dup trees, including from hash.
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ void *l, *r;
+ u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+ BINTERNAL *li, *ri;
+ DB *pgset;
+ DBC *cc;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ VRFY_CHILDINFO *child;
+ VRFY_PAGEINFO *pip;
+ db_indx_t i;
+ db_pgno_t next_pgno, prev_pgno;
+ db_recno_t child_nrecs, nrecs;
+ u_int32_t child_level, child_relen, j, level, relen, stflags;
+ u_int8_t leaf_type;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ int isbad, p, ret, t_ret, toplevel;
+
+ if (levelp != NULL) /* Don't leave uninitialized on error. */
+ *levelp = 0;
+ if (nrecsp != NULL)
+ *nrecsp = 0;
+
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ next_pgno = prev_pgno = PGNO_INVALID;
+ nrecs = 0;
+ relen = 0;
+ leaf_type = P_INVALID;
+ isbad = ret = 0;
+
+ /* Provide feedback on our progress to the application. */
+ if (!LF_ISSET(DB_SALVAGE))
+ __db_vrfy_struct_feedback(dbp, vdp);
+
+ if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+ return (ret);
+
+ cc = NULL;
+ level = pip->bt_level;
+
+ toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
+ LF_CLR(DB_ST_TOPLEVEL);
+
+ /*
+ * If this is the root, initialize the vdp's prev- and next-pgno
+ * accounting.
+ *
+ * For each leaf page we hit, we'll want to make sure that
+ * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
+ * our page number. Then, we'll set vdp->next_pgno to pip->next_pgno
+ * and vdp->prev_pgno to our page number, and the next leaf page in
+ * line should be able to do the same verification.
+ */
+ if (toplevel) {
+ /*
+ * Cache the values stored in the vdp so that if we're an
+ * auxiliary tree such as an off-page duplicate set, our
+ * caller's leaf page chain doesn't get lost.
+ */
+ prev_pgno = vdp->prev_pgno;
+ next_pgno = vdp->next_pgno;
+ leaf_type = vdp->leaf_type;
+ vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
+ vdp->leaf_type = P_INVALID;
+ }
+
+ /*
+ * We are recursively descending a btree, starting from the root
+ * and working our way out to the leaves.
+ *
+	 * There are five cases we need to deal with:
+ * 1. pgno is a recno leaf page. Any children are overflows.
+ * 2. pgno is a duplicate leaf page. Any children
+ * are overflow pages; traverse them, and then return
+ * level and nrecs.
+ * 3. pgno is an ordinary leaf page. Check whether dups are
+ * allowed, and if so, traverse any off-page dups or
+ * overflows. Then return nrecs and level.
+ * 4. pgno is a recno internal page. Recursively check any
+ * child pages, making sure their levels are one lower
+ * and their nrecs sum to ours.
+ * 5. pgno is a btree internal page. Same as #4, plus we
+ * must verify that for each pair of BINTERNAL entries
+ * N and N+1, the leftmost item on N's child sorts
+ * greater than N, and the rightmost item on N's child
+ * sorts less than N+1.
+ *
+ * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+ * we need to verify the internal sort order is correct if,
+ * due to overflow items, we were not able to do so earlier.
+ */
+ switch (pip->type) {
+ case P_LRECNO:
+ case P_LDUP:
+ case P_LBTREE:
+ /*
+ * Cases 1, 2 and 3.
+ *
+ * We're some sort of leaf page; verify
+ * that our linked list of leaves is consistent.
+ */
+ if (vdp->leaf_type == P_INVALID) {
+ /*
+ * First leaf page. Set the type that all its
+ * successors should be, and verify that our prev_pgno
+ * is PGNO_INVALID.
+ */
+ vdp->leaf_type = pip->type;
+ if (pip->prev_pgno != PGNO_INVALID)
+ goto bad_prev;
+ } else {
+ /*
+ * Successor leaf page. Check our type, the previous
+ * page's next_pgno, and our prev_pgno.
+ */
+ if (pip->type != vdp->leaf_type) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)",
+ (u_long)pip->pgno, (u_long)pip->type,
+ (u_long)vdp->leaf_type));
+ }
+
+ /*
+ * Don't do the prev/next_pgno checks if we've lost
+ * leaf pages due to another corruption.
+ */
+ if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) {
+ if (pip->pgno != vdp->next_pgno) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)",
+ (u_long)vdp->prev_pgno,
+ (u_long)vdp->next_pgno,
+ (u_long)pip->pgno));
+ }
+ if (pip->prev_pgno != vdp->prev_pgno) {
+bad_prev: isbad = 1;
+ EPRINT((env,
+ "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)",
+ (u_long)pip->pgno,
+ (u_long)pip->prev_pgno,
+ (u_long)vdp->prev_pgno));
+ }
+ }
+ }
+ vdp->prev_pgno = pip->pgno;
+ vdp->next_pgno = pip->next_pgno;
+ F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN);
+
+ /*
+ * Overflow pages are common to all three leaf types;
+ * traverse the child list, looking for overflows.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_OVERFLOW &&
+ (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+ child->pgno, child->tlen,
+ flags | DB_ST_OVFL_LEAF)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* Case 1 */
+ if (pip->type == P_LRECNO) {
+ if (!LF_ISSET(DB_ST_IS_RECNO) &&
+ !(LF_ISSET(DB_ST_DUPOK) &&
+ !LF_ISSET(DB_ST_DUPSORT))) {
+ isbad = 1;
+ EPRINT((env,
+				    "Page %lu: recno leaf page in non-recno tree",
+ (u_long)pgno));
+ goto done;
+ }
+ goto leaf;
+ } else if (LF_ISSET(DB_ST_IS_RECNO)) {
+ /*
+ * It's a non-recno leaf. Had better not be a recno
+ * subtree.
+ */
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: non-recno leaf page in recno tree",
+ (u_long)pgno));
+ goto done;
+ }
+
+ /* Case 2--no more work. */
+ if (pip->type == P_LDUP)
+ goto leaf;
+
+ /* Case 3 */
+
+ /* Check if we have any dups. */
+ if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+ /* If dups aren't allowed in this btree, trouble. */
+ if (!LF_ISSET(DB_ST_DUPOK)) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: duplicates in non-dup btree",
+ (u_long)pgno));
+ } else {
+ /*
+ * We correctly have dups. If any are off-page,
+ * traverse those btrees recursively.
+ */
+ if ((ret =
+ __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child);
+ ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child)) {
+ stflags =
+ flags | DB_ST_RECNUM | DB_ST_DUPSET;
+ /* Skip any overflow entries. */
+ if (child->type == V_DUPLICATE) {
+ if ((ret = __db_vrfy_duptype(
+ dbp, vdp, child->pgno,
+ stflags)) != 0) {
+ isbad = 1;
+ /* Next child. */
+ continue;
+ }
+ if ((ret = __bam_vrfy_subtree(
+ dbp, vdp, child->pgno,
+ NULL, NULL,
+ stflags | DB_ST_TOPLEVEL,
+ NULL, NULL, NULL)) != 0) {
+ if (ret ==
+ DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /*
+ * If VRFY_DUPS_UNSORTED is set,
+ * DB_ST_DUPSORT had better not be.
+ */
+ if (F_ISSET(pip, VRFY_DUPS_UNSORTED) &&
+ LF_ISSET(DB_ST_DUPSORT)) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: unsorted duplicate set in sorted-dup database",
+ (u_long)pgno));
+ }
+ }
+ }
+ goto leaf;
+ case P_IBTREE:
+ case P_IRECNO:
+ /* We handle these below. */
+ break;
+ default:
+ /*
+ * If a P_IBTREE or P_IRECNO contains a reference to an
+ * invalid page, we'll wind up here; handle it gracefully.
+ * Note that the code at the "done" label assumes that the
+ * current page is a btree/recno one of some sort; this
+ * is not the case here, so we goto err.
+ *
+ * If the page is entirely zeroed, its pip->type will be a lie
+ * (we assumed it was a hash page, as they're allowed to be
+ * zeroed); handle this case specially.
+ */
+ if (F_ISSET(pip, VRFY_IS_ALLZEROES))
+ ZEROPG_ERR_PRINT(env, pgno, "btree or recno page");
+ else
+ EPRINT((env,
+ "Page %lu: btree or recno page is of inappropriate type %lu",
+ (u_long)pgno, (u_long)pip->type));
+
+ /*
+ * We probably lost a leaf page (or more if this was an
+ * internal page) from our prev/next_pgno chain. Flag
+ * that this is expected; we don't want or need to
+ * spew error messages about erroneous prev/next_pgnos,
+ * since that's probably not the real problem.
+ */
+ F_SET(vdp, VRFY_LEAFCHAIN_BROKEN);
+
+ ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ /*
+ * Cases 4 & 5: This is a btree or recno internal page. For each child,
+ * recurse, keeping a running count of nrecs and making sure the level
+ * is always reasonable.
+ */
+ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+ goto err;
+ for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+ ret = __db_vrfy_ccnext(cc, &child))
+ if (child->type == V_RECNO) {
+ if (pip->type != P_IRECNO) {
+ ret = __db_unknown_path(
+ env, "__bam_vrfy_subtree");
+ goto err;
+ }
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno,
+ NULL, NULL, flags, &child_level, &child_nrecs,
+ &child_relen)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if (LF_ISSET(DB_ST_RELEN)) {
+ if (relen == 0)
+ relen = child_relen;
+ /*
+ * child_relen may be zero if the child subtree
+ * is empty.
+ */
+ else if (child_relen > 0 &&
+ relen != child_relen) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: recno page returned bad re_len %lu",
+ (u_long)child->pgno,
+ (u_long)child_relen));
+ }
+ if (relenp)
+ *relenp = relen;
+ }
+ if (LF_ISSET(DB_ST_RECNUM)) {
+ if (child->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: record count incorrect: actual %lu, in record %lu",
+ (u_long)child->pgno,
+ (u_long)child_nrecs,
+ (u_long)child->nrecs));
+ }
+ nrecs += child_nrecs;
+ }
+ if (isbad == 0 && level != child_level + 1) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: recno level incorrect: got %lu, expected %lu",
+ (u_long)child->pgno, (u_long)child_level,
+ (u_long)(level - 1)));
+ }
+ } else if (child->type == V_OVERFLOW) {
+ /*
+ * It is possible for one internal page to reference
+ * a single overflow page twice, if all the items
+ * in the subtree referenced by slot 0 are deleted,
+ * then a similar number of items are put back
+ * before the key that formerly had been in slot 1.
+ *
+ * (Btree doesn't look at the key in slot 0, so the
+ * fact that the key formerly at slot 1 is the "wrong"
+ * parent of the stuff in the slot 0 subtree isn't
+ * really incorrect.)
+ *
+ * __db_vrfy_ovfl_structure is designed to be
+ * efficiently called multiple times for multiple
+ * references; call it here as many times as is
+ * appropriate.
+ */
+
+ /* Otherwise, __db_vrfy_childput would be broken. */
+ DB_ASSERT(env, child->refcnt >= 1);
+
+ /*
+ * An overflow referenced more than twice here
+ * shouldn't happen.
+ */
+ if (child->refcnt > 2) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: overflow page %lu referenced more than twice from internal page",
+ (u_long)pgno, (u_long)child->pgno));
+ } else
+ for (j = 0; j < child->refcnt; j++)
+ if ((ret = __db_vrfy_ovfl_structure(dbp,
+ vdp, child->pgno, child->tlen,
+ flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+ }
+
+ if ((ret = __db_vrfy_ccclose(cc)) != 0)
+ goto err;
+ cc = NULL;
+
+ /* We're done with case 4. */
+ if (pip->type == P_IRECNO)
+ goto done;
+
+ /*
+ * Case 5. Btree internal pages.
+ * As described above, we need to iterate through all the
+ * items on the page and make sure that our children sort appropriately
+ * with respect to them.
+ *
+ * For each entry, li will be the "left-hand" key for the entry
+ * itself, which must sort lower than all entries on its child;
+ * ri will be the key to its right, which must sort greater.
+ */
+ if (h == NULL &&
+ (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ for (i = 0; i < pip->entries; i += O_INDX) {
+ li = GET_BINTERNAL(dbp, h, i);
+ ri = (i + O_INDX < pip->entries) ?
+ GET_BINTERNAL(dbp, h, i + O_INDX) : r;
+
+ /*
+ * The leftmost key is forcibly sorted less than all entries,
+ * so don't bother passing it.
+ */
+ if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno,
+ i == 0 ? NULL : li, ri, flags, &child_level,
+ &child_nrecs, NULL)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto done;
+ }
+
+ if (LF_ISSET(DB_ST_RECNUM)) {
+ /*
+ * Keep a running tally on the actual record count so
+ * we can return it to our parent (if we have one) or
+ * compare it to the NRECS field if we're a root page.
+ */
+ nrecs += child_nrecs;
+
+ /*
+ * Make sure the actual record count of the child
+ * is equal to the value in the BINTERNAL structure.
+ */
+ if (li->nrecs != child_nrecs) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: item %lu has incorrect record count of %lu, should be %lu",
+ (u_long)pgno, (u_long)i, (u_long)li->nrecs,
+ (u_long)child_nrecs));
+ }
+ }
+
+ if (level != child_level + 1) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: Btree level incorrect: got %lu, expected %lu",
+ (u_long)li->pgno,
+ (u_long)child_level, (u_long)(level - 1)));
+ }
+ }
+
+ if (0) {
+leaf: level = LEAFLEVEL;
+ if (LF_ISSET(DB_ST_RECNUM))
+ nrecs = pip->rec_cnt;
+
+		/*
+		 * XXX
+ * We should verify that the record count on a leaf page
+ * is the sum of the number of keys and the number of
+ * records in its off-page dups. This requires looking
+ * at the page again, however, and it may all be changing
+ * soon, so for now we don't bother.
+ */
+
+ if (LF_ISSET(DB_ST_RELEN) && relenp)
+ *relenp = pip->re_len;
+ }
+done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) {
+ /*
+ * During the page-by-page pass, item order verification was
+ * not finished due to the presence of overflow items. If
+ * isbad == 0, though, it's now safe to do so, as we've
+ * traversed any child overflow pages. Do it.
+ */
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+ if ((ret = __bam_vrfy_itemorder(dbp,
+ vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0)
+ goto err;
+ F_CLR(pip, VRFY_INCOMPLETE);
+ }
+
+ /*
+ * It's possible to get to this point with a page that has no
+ * items, but without having detected any sort of failure yet.
+ * Having zero items is legal if it's a leaf--it may be the
+ * root page in an empty tree, or the tree may have been
+ * modified with the DB_REVSPLITOFF flag set (there's no way
+ * to tell from what's on disk). For an internal page,
+ * though, having no items is a problem (all internal pages
+ * must have children).
+ */
+ if (isbad == 0 && ret == 0) {
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ if (NUM_ENT(h) == 0 && ISINTERNAL(h)) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: internal page is empty and should not be",
+ (u_long)pgno));
+ goto err;
+ }
+ }
+
+ /*
+ * Our parent has sent us BINTERNAL pointers to parent records
+ * so that we can verify our place with respect to them. If it's
+ * appropriate--we have a default sort function--verify this.
+ */
+ if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) &&
+ pip->type != P_IRECNO && pip->type != P_LRECNO) {
+ if (h == NULL && (ret = __memp_fget(mpf, &pgno,
+ vdp->thread_info, NULL, 0, &h)) != 0)
+ goto err;
+
+ /*
+ * __bam_vrfy_treeorder needs to know what comparison function
+ * to use. If DB_ST_DUPSET is set, we're in a duplicate tree
+ * and we use the duplicate comparison function; otherwise,
+ * use the btree one. If unset, use the default, of course.
+ */
+ func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare :
+ ((BTREE *)dbp->bt_internal)->bt_compare;
+ if (func == NULL)
+ func = __bam_defcmp;
+
+ if ((ret = __bam_vrfy_treeorder(dbp,
+ vdp->thread_info, h, l, r, func, flags)) != 0) {
+ if (ret == DB_VERIFY_BAD)
+ isbad = 1;
+ else
+ goto err;
+ }
+ }
+
+ /*
+ * This is guaranteed to succeed for leaf pages, but no harm done.
+ *
+ * Internal pages below the top level do not store their own
+ * record numbers, so we skip them.
+ */
+ if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
+ isbad = 1;
+ EPRINT((env,
+ "Page %lu: bad record count: has %lu records, claims %lu",
+ (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt));
+ }
+
+ if (levelp)
+ *levelp = level;
+ if (nrecsp)
+ *nrecsp = nrecs;
+
+ pgset = vdp->pgset;
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, pgno, &p)) != 0)
+ goto err;
+ if (p != 0) {
+ isbad = 1;
+ EPRINT((env, "Page %lu: linked twice", (u_long)pgno));
+ } else if ((ret =
+ __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
+ goto err;
+
+ if (toplevel)
+ /*
+ * The last page's next_pgno in the leaf chain should have been
+ * PGNO_INVALID.
+ */
+ if (vdp->next_pgno != PGNO_INVALID) {
+ isbad = 1;
+ EPRINT((env, "Page %lu: unterminated leaf chain",
+ (u_long)vdp->prev_pgno));
+ }
+
+err: if (toplevel) {
+ /* Restore our caller's settings. */
+ vdp->next_pgno = next_pgno;
+ vdp->prev_pgno = prev_pgno;
+ vdp->leaf_type = leaf_type;
+ }
+
+ if (h != NULL && (t_ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
+ ret = t_ret;
+ if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+ ret = t_ret;
+ return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
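+
+/*
+ * The leaf-chain bookkeeping above works because the recursion reaches
+ * leaves strictly left to right, so the vdp only ever needs to remember
+ * one (prev_pgno, next_pgno) pair.  Each leaf checks itself against that
+ * pair and then advances it, verifying the whole doubly-linked leaf list
+ * in a single pass.  The per-leaf step, with the first-leaf special case
+ * elided and complain() a hypothetical stand-in for EPRINT:
+ */
+#if 0
+	if (pip->pgno != vdp->next_pgno)	/* Predecessor's forward link. */
+		complain("incorrect next_pgno");
+	if (pip->prev_pgno != vdp->prev_pgno)	/* Our back link. */
+		complain("incorrect prev_pgno");
+	vdp->prev_pgno = pip->pgno;		/* Advance the pair. */
+	vdp->next_pgno = pip->next_pgno;
+#endif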
+
+/*
+ * __bam_vrfy_treeorder --
+ * Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ * If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys. No need to check it.
+ *
+ * If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ BINTERNAL *lp, *rp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+ u_int32_t flags;
+{
+ BOVERFLOW *bo;
+ DBC *dbc;
+ DBT dbt;
+ ENV *env;
+ db_indx_t last;
+ int ret, cmp;
+
+ env = dbp->env;
+ memset(&dbt, 0, sizeof(DBT));
+ F_SET(&dbt, DB_DBT_MALLOC);
+ ret = 0;
+
+ /*
+ * Empty pages are sorted correctly by definition. We check
+ * to see whether they ought to be empty elsewhere; leaf
+ * pages legally may be.
+ */
+ if (NUM_ENT(h) == 0)
+ return (0);
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_LDUP:
+ last = NUM_ENT(h) - O_INDX;
+ break;
+ case P_LBTREE:
+ last = NUM_ENT(h) - P_INDX;
+ break;
+ default:
+ return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
+ }
+
+ /* Populate a dummy cursor. */
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ /*
+ * The key on page h, the child page, is more likely to be
+ * an overflow page, so we pass its offset, rather than lp/rp's,
+ * into __bam_cmp. This will take advantage of __db_moff.
+ */
+
+ /*
+ * Skip first-item check if we're an internal page--the first
+ * entry on an internal page is treated specially by __bam_cmp,
+ * so what's on the page shouldn't matter. (Plus, since we're passing
+	 * our page and item 0 to __bam_cmp, we'll sort before our
+ * parent and falsely report a failure.)
+ */
+ if (lp != NULL && TYPE(h) != P_IBTREE) {
+ if (lp->type == B_KEYDATA) {
+ dbt.data = lp->data;
+ dbt.size = lp->len;
+ } else if (lp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)lp->data;
+ if ((ret = __db_goff(dbc, &dbt,
+ bo->tlen, bo->pgno, NULL, NULL)) != 0)
+ return (ret);
+ } else
+ return (
+ __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbc, &dbt, h, 0, func, &cmp)) == 0) {
+ if (cmp > 0) {
+ EPRINT((env,
+	    "Page %lu: first item on page sorted less than parent entry",
+ (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((env,
+ "Page %lu: first item on page had comparison error",
+ (u_long)PGNO(h)));
+
+ if (dbt.data != lp->data)
+ __os_ufree(env, dbt.data);
+ if (ret != 0)
+ return (ret);
+ }
+
+ if (rp != NULL) {
+ if (rp->type == B_KEYDATA) {
+ dbt.data = rp->data;
+ dbt.size = rp->len;
+ } else if (rp->type == B_OVERFLOW) {
+ bo = (BOVERFLOW *)rp->data;
+ if ((ret = __db_goff(dbc, &dbt,
+ bo->tlen, bo->pgno, NULL, NULL)) != 0)
+ return (ret);
+ } else
+ return (
+ __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+ /* On error, fall through, free if needed, and return. */
+ if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) {
+ if (cmp < 0) {
+ EPRINT((env,
+ "Page %lu: last item on page sorted greater than parent entry",
+ (u_long)PGNO(h)));
+ ret = DB_VERIFY_BAD;
+ }
+ } else
+ EPRINT((env,
+ "Page %lu: last item on page had comparison error",
+ (u_long)PGNO(h)));
+
+ if (dbt.data != rp->data)
+ __os_ufree(env, dbt.data);
+ }
+
+ return (ret);
+}
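
As a hedged illustration, not part of the source: the invariant __bam_vrfy_treeorder enforces can be pictured with a minimal, self-contained sketch. check_child_order and its plain string keys are invented stand-ins for the DBTs and the comparison callback; note that equality with rp passes, matching the cmp < 0 test above, which tolerates duplicate keys.

#include <stddef.h>
#include <string.h>

/*
 * Treeorder sketch: with lp/rp the parent separators bracketing a
 * child page, every child key k must satisfy lp <= k and k <= rp.
 * A NULL lp means "leftmost child" (no lower bound); a NULL rp
 * means "rightmost child" (no upper bound).
 */
static int
check_child_order(const char *lp, const char *rp,
    const char **keys, size_t nkeys)
{
	if (nkeys == 0)		/* Empty pages are sorted by definition. */
		return (0);
	if (lp != NULL && strcmp(keys[0], lp) < 0)
		return (-1);	/* First key sorts below the parent entry. */
	if (rp != NULL && strcmp(keys[nkeys - 1], rp) > 0)
		return (-1);	/* Last key sorts above the next entry. */
	return (0);
}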
+
+/*
+ * __bam_salvage --
+ *	Safely dump out anything that looks like a key on an alleged
+ * btree leaf page, and mark overflow pages as seen.  For internal btree
+ * pages, just mark any overflow pages as seen.
+ *
+ * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *,
+ * PUBLIC: db_pgno_t, u_int32_t, PAGE *, void *,
+ * PUBLIC: int (*)(void *, const void *), DBT *, u_int32_t));
+ */
+int
+__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+ PAGE *h;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ DBT *key;
+ u_int32_t flags;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBT dbt, repldbt, unknown_key, unknown_data;
+ ENV *env;
+ VRFY_ITEM *pgmap;
+ db_indx_t i, last, beg, end, *inp;
+ db_pgno_t ovflpg;
+ u_int32_t himark, ovfl_bufsz;
+ void *ovflbuf;
+ int adj, ret, t_ret, t2_ret;
+#ifdef HAVE_COMPRESSION
+ DBT kcpy, *last_key;
+ int unknown_dup_key;
+#endif
+
+ env = dbp->env;
+ ovflbuf = pgmap = NULL;
+ inp = P_INP(dbp, h);
+
+ memset(&dbt, 0, sizeof(DBT));
+ dbt.flags = DB_DBT_REALLOC;
+ memset(&repldbt, 0, sizeof(DBT));
+
+#ifdef HAVE_COMPRESSION
+ memset(&kcpy, 0, sizeof(DBT));
+ unknown_dup_key = LF_ISSET(DB_SA_UNKNOWNKEY);
+ last_key = unknown_dup_key ? NULL : key;
+#endif
+ LF_CLR(DB_SA_UNKNOWNKEY);
+
+ DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1);
+ DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+
+ /*
+ * Allocate a buffer for overflow items. Start at one page;
+ * __db_safe_goff will realloc as needed.
+ */
+ if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
+ goto err;
+ ovfl_bufsz = dbp->pgsize;
+
+ if (LF_ISSET(DB_AGGRESSIVE) && (ret =
+ __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0)
+ goto err;
+
+ /*
+ * Loop through the inp array, spitting out key/data pairs.
+ *
+ * If we're salvaging normally, loop from 0 through NUM_ENT(h). If
+ * we're being aggressive, loop until we hit the end of the page --
+ * NUM_ENT() may be bogus.
+ */
+ himark = dbp->pgsize;
+ for (i = 0, last = UINT16_MAX;; i += O_INDX) {
+ /*
+ * If we're not aggressive, or if we're on an internal page,
+ * break when we hit NUM_ENT(h).
+ */
+ if ((!LF_ISSET(DB_AGGRESSIVE) ||
+ pgtype == P_IBTREE) && i >= NUM_ENT(h))
+ break;
+
+ /* Verify the current item. */
+ t_ret =
+ __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL);
+
+ if (t_ret != 0) {
+ /*
+ * If this is a btree leaf and we've printed out a key
+ * but not its associated data item, fix this imbalance
+ * by printing an "UNKNOWN_DATA".
+ */
+ if (pgtype == P_LBTREE && i % P_INDX == 1 &&
+ last == i - 1 && (t2_ret = __db_vrfy_prdbt(
+ &unknown_data,
+ 0, " ", handle, callback, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t2_ret;
+ goto err;
+ }
+
+ /*
+ * Don't return DB_VERIFY_FATAL; it's private and means
+ * only that we can't go on with this page, not with
+ * the whole database. It's not even an error if we've
+ * run into it after NUM_ENT(h).
+ */
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (i < NUM_ENT(h) && ret == 0)
+ ret = DB_VERIFY_BAD;
+ break;
+ }
+ continue;
+ }
+
+ /*
+ * If this returned 0, it's safe to print or (carefully)
+ * try to fetch.
+ *
+ * We only print deleted items if DB_AGGRESSIVE is set.
+ */
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type))
+ continue;
+
+ /*
+ * If this is a btree leaf and we're about to print out a data
+ * item for which we didn't print out a key, fix this imbalance
+ * by printing an "UNKNOWN_KEY".
+ */
+ if (pgtype == P_LBTREE && i % P_INDX == 1 && last != i - 1) {
+#ifdef HAVE_COMPRESSION
+ last_key = NULL;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(&unknown_key,
+ 0, " ", handle, callback, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ }
+ last = i;
+
+ /*
+		 * We're now going to try to print the next item.  If key is
+ * non-NULL, we're a dup page, so we've got to print the key
+ * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the
+ * first entry.
+ */
+ if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) {
+#ifdef HAVE_COMPRESSION
+ last_key = unknown_dup_key ? NULL : key;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(key,
+ 0, " ", handle, callback, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ }
+
+ beg = end = inp[i];
+ switch (B_TYPE(bk->type)) {
+ case B_DUPLICATE:
+ if (pgtype == P_IBTREE)
+ break;
+
+ end = beg + BOVERFLOW_SIZE - 1;
+ /*
+ * If we're not on a normal btree leaf page, there
+ * shouldn't be off-page dup sets. Something's
+ * confused; just drop it, and the code to pick up
+ * unlinked offpage dup sets will print it out
+ * with key "UNKNOWN" later.
+ */
+ if (pgtype != P_LBTREE)
+ break;
+
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * If the page number is unreasonable, or if this is
+ * supposed to be a key item, output "UNKNOWN_KEY" --
+ * the best we can do is run into the data items in
+ * the unlinked offpage dup pass.
+ */
+ if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) {
+ /* Not much to do on failure. */
+#ifdef HAVE_COMPRESSION
+ if (key == NULL && i % P_INDX == 0)
+ last_key = NULL;
+#endif
+ if ((t_ret = __db_vrfy_prdbt(
+ i % P_INDX == 0 ? &unknown_key : &unknown_data,
+ 0, " ", handle, callback, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ break;
+ }
+
+ /* Don't stop on error. */
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bo->pgno, &dbt, handle, callback,
+ flags | DB_SA_SKIPFIRSTKEY
+#ifdef HAVE_COMPRESSION
+ | (last_key == NULL ? DB_SA_UNKNOWNKEY : 0)
+#endif
+ )) != 0 && ret == 0)
+ ret = t_ret;
+
+ break;
+ case B_KEYDATA:
+ if (pgtype == P_IBTREE)
+ break;
+
+ end = (db_indx_t)DB_ALIGN(
+ beg + bk->len, sizeof(u_int32_t)) - 1;
+
+ dbt.data = bk->data;
+ dbt.size = bk->len;
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && last_key != NULL &&
+ (key != NULL || (i % P_INDX == 1))) {
+ /* Decompress the key/data pair - the key
+ is in last_key, and the data is in dbt */
+ if ((t_ret = __bam_compress_salvage(dbp, vdp,
+ handle, callback, last_key, &dbt)) != 0) {
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (ret == 0)
+ ret = DB_VERIFY_BAD;
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ goto err;
+ } else if (ret == 0) {
+ ret = t_ret;
+ goto err;
+ }
+ }
+ } else {
+ if (key == NULL && i % P_INDX == 0) {
+ if ((ret = __os_realloc(
+ env, dbt.size, &kcpy.data)) != 0)
+ goto err;
+ memcpy(kcpy.data, dbt.data, dbt.size);
+ kcpy.size = dbt.size;
+ last_key = &kcpy;
+ }
+#endif
+
+ if ((t_ret = __db_vrfy_prdbt(&dbt,
+ 0, " ", handle, callback, 0, vdp)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+#ifdef HAVE_COMPRESSION
+ }
+#endif
+ break;
+ case B_OVERFLOW:
+ if (pgtype != P_IBTREE)
+ end = beg + BOVERFLOW_SIZE - 1;
+ bo = (BOVERFLOW *)bk;
+
+ /*
+ * Check for replicated overflow keys, so that we only
+ * call __db_safe_goff once per overflow page. If we
+		 * get the same offset as the previous key, just re-use
+ * the previous dbt.
+ *
+ * P_IBTREE pages will never have replicated overflow
+ * keys.
+ */
+ adj = pgtype == P_IBTREE ? O_INDX : P_INDX;
+ if (pgtype == P_IBTREE) {
+ /*
+ * If we're looking at a P_IBTREE, we just want
+ * to mark the overflow page as seen.
+ *
+ * Note that this call to __db_safe_goff differs
+ * from the non-P_IBTREE call.
+ *
+ * Only call __db_safe_goff if the overflow page
+ * hasn't been seen.
+ */
+ ovflpg = ((BOVERFLOW *)
+ ((BINTERNAL *)bk)->data)->pgno;
+ if (__db_salvage_isdone(vdp, ovflpg) == 0 &&
+ (t_ret =__db_safe_goff(dbp, vdp, ovflpg,
+ &dbt, &ovflbuf,
+ &ovfl_bufsz, flags)) != 0 && ret == 0)
+ ret = t_ret;
+ break;
+ } else if (i > adj - 1 &&
+ i % adj == 0 && inp[i] == inp[i - adj])
+ dbt = repldbt;
+ else {
+ /* Don't stop on error. */
+ if ((t_ret = __db_safe_goff(dbp, vdp,
+ bo->pgno, &dbt, &ovflbuf,
+ &ovfl_bufsz, flags)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /*
+ * If this is a key, save it in case the next
+ * key is a replicated overflow, so we don't
+ * call __db_safe_goff again. Copy out dbt.data
+ * in case that pointer gets realloc'd when
+ * getting a data item.
+ */
+ if (i % P_INDX == 0) {
+ if (t_ret == 0) {
+ if ((t_ret = __os_realloc(env,
+ dbt.size,
+ &repldbt.data)) != 0) {
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+ memcpy(repldbt.data,
+ dbt.data, dbt.size);
+ repldbt.size = dbt.size;
+ } else {
+ if (__os_realloc(env,
+ unknown_key.size,
+ &repldbt.data) != 0)
+ goto err;
+ memcpy(repldbt.data,
+ unknown_key.data,
+ unknown_key.size);
+ repldbt.size = unknown_key.size;
+ }
+ }
+
+ }
+
+#ifdef HAVE_COMPRESSION
+ if (DB_IS_COMPRESSED(dbp) && last_key && t_ret == 0 &&
+ (key != NULL || (i % P_INDX == 1))) {
+ /* Decompress the key/data pair - the key
+ is in last_key, and the data is in dbt */
+ if ((t_ret = __bam_compress_salvage(dbp, vdp,
+ handle, callback, last_key, &dbt)) != 0) {
+ if (t_ret == DB_VERIFY_FATAL) {
+ if (ret == 0)
+ ret = DB_VERIFY_BAD;
+ if (!LF_ISSET(DB_AGGRESSIVE))
+ goto err;
+ } else if (ret == 0) {
+ ret = t_ret;
+ goto err;
+ }
+ }
+ } else {
+ if (key == NULL && i % P_INDX == 0) {
+ if (t_ret == 0) {
+ if ((ret = __os_realloc(env,
+ dbt.size, &kcpy.data)) != 0)
+ goto err;
+ memcpy(kcpy.data, dbt.data,
+ dbt.size);
+ kcpy.size = dbt.size;
+ last_key = &kcpy;
+ } else
+ last_key = NULL;
+ }
+#endif
+
+ if ((t_ret = __db_vrfy_prdbt(
+ t_ret == 0 ? &dbt : &unknown_key,
+ 0, " ", handle, callback, 0, vdp))
+ != 0 && ret == 0)
+ ret = t_ret;
+#ifdef HAVE_COMPRESSION
+ }
+#endif
+ break;
+ default:
+ /*
+ * We should never get here; __db_vrfy_inpitem should
+ * not be returning 0 if bk->type is unrecognizable.
+ */
+ t_ret = __db_unknown_path(env, "__bam_salvage");
+ if (ret == 0)
+ ret = t_ret;
+ goto err;
+ }
+
+ /*
+ * If we're being aggressive, mark the beginning and end of
+ * the item; we'll come back and print whatever "junk" is in
+ * the gaps in case we had any bogus inp elements and thereby
+ * missed stuff.
+ */
+ if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) {
+ pgmap[beg] = VRFY_ITEM_BEGIN;
+ pgmap[end] = VRFY_ITEM_END;
+ }
+ }
+
+err: if (pgmap != NULL)
+ __os_free(env, pgmap);
+ if (ovflbuf != NULL)
+ __os_free(env, ovflbuf);
+ if (repldbt.data != NULL)
+ __os_free(env, repldbt.data);
+#ifdef HAVE_COMPRESSION
+ if (kcpy.data != NULL)
+ __os_free(env, kcpy.data);
+#endif
+
+ /* Mark this page as done. */
+ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
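
A hedged sketch of the pairing repair above; dump_pairs and its NULL-slot convention are hypothetical, not DB API. On a P_LBTREE leaf, slots alternate key, data, so an unreadable slot gets an UNKNOWN_* placeholder to keep the dump paired, just as __bam_salvage emits unknown_key and unknown_data.

#include <stdio.h>

/*
 * Keys sit at even indices, data items at odd ones; items[i] == NULL
 * stands in for an inp entry that failed verification.  Substituting
 * a placeholder keeps every key lined up with a data item.
 */
static void
dump_pairs(const char **items, int nitems)
{
	int i;
	const char *s;

	for (i = 0; i < nitems; i++) {
		s = items[i] != NULL ? items[i] :
		    (i % 2 == 0 ? "UNKNOWN_KEY" : "UNKNOWN_DATA");
		printf("%s\n", s);
	}
}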
+
+/*
+ * __bam_salvage_walkdupint --
+ * Walk a known-good btree or recno internal page which is part of
+ * a dup tree, calling __db_salvage_duptree on each child page.
+ *
+ * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ PAGE *h;
+ DBT *key;
+ void *handle;
+ int (*callback) __P((void *, const void *));
+ u_int32_t flags;
+{
+ BINTERNAL *bi;
+ ENV *env;
+ RINTERNAL *ri;
+ int ret, t_ret;
+ db_indx_t i;
+
+ env = dbp->env;
+ ret = 0;
+
+ for (i = 0; i < NUM_ENT(h); i++) {
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ bi = GET_BINTERNAL(dbp, h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, bi->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ case P_IRECNO:
+ ri = GET_RINTERNAL(dbp, h, i);
+ if ((t_ret = __db_salvage_duptree(dbp,
+ vdp, ri->pgno, key, handle, callback, flags)) != 0)
+ ret = t_ret;
+ break;
+ default:
+ return (__db_unknown_path(
+ env, "__bam_salvage_walkdupint"));
+ }
+ /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
+ flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
+ }
+
+ return (ret);
+}
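
The flag handling at the bottom of that loop deserves a sketch of its own; walk_children, visit_child, and SA_SKIPFIRSTKEY are invented names. DB_SA_SKIPFIRSTKEY describes only the very first key of the whole dup tree, so it must reach the 0th child and no other, and the loop clears it after the first iteration.

#include <stdint.h>
#include <stdio.h>

#define SA_SKIPFIRSTKEY	0x01	/* Stand-in for DB_SA_SKIPFIRSTKEY. */

static int
visit_child(uint32_t pgno, uint32_t flags)
{
	printf("page %u flags %#x\n", (unsigned)pgno, (unsigned)flags);
	return (0);
}

static int
walk_children(const uint32_t *pgnos, int n, uint32_t flags)
{
	int i, ret;

	for (i = 0; i < n; i++) {
		if ((ret = visit_child(pgnos[i], flags)) != 0)
			return (ret);
		/* Only the 0th child inherits the skip-first-key flag. */
		flags &= ~SA_SKIPFIRSTKEY;
	}
	return (0);
}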
+
+/*
+ * __bam_meta2pgset --
+ *	Given a known-good meta page, populate pgset with the db_pgno_t's
+ *	corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly: we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones), and return.
+ * This avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC: u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+ DB *dbp;
+ VRFY_DBINFO *vdp;
+ BTMETA *btmeta;
+ u_int32_t flags;
+ DB *pgset;
+{
+ BINTERNAL *bi;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_pgno_t current, p;
+ int err_ret, ret;
+
+ DB_ASSERT(dbp->env, pgset != NULL);
+
+ mpf = dbp->mpf;
+ h = NULL;
+ ret = err_ret = 0;
+
+ for (current = btmeta->root;;) {
+ if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+ if ((ret = __memp_fget(mpf, &current,
+ vdp->thread_info, NULL, 0, &h)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+
+ switch (TYPE(h)) {
+ case P_IBTREE:
+ case P_IRECNO:
+ if ((ret = __bam_vrfy(dbp,
+ vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+ err_ret = ret;
+ goto err;
+ }
+ if (TYPE(h) == P_IBTREE) {
+ bi = GET_BINTERNAL(dbp, h, 0);
+ current = bi->pgno;
+ } else { /* P_IRECNO */
+ ri = GET_RINTERNAL(dbp, h, 0);
+ current = ri->pgno;
+ }
+ break;
+ case P_LBTREE:
+ case P_LRECNO:
+ goto traverse;
+ default:
+ err_ret = DB_VERIFY_BAD;
+ goto err;
+ }
+
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+ /*
+ * At this point, current is the pgno of leaf page h, the 0th in the
+ * tree we're concerned with.
+ */
+traverse:
+ while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+ if (h == NULL && (ret = __memp_fget(mpf,
+ &current, vdp->thread_info, NULL, 0, &h)) != 0) {
+ err_ret = ret;
+ break;
+ }
+
+ if ((ret = __db_vrfy_pgset_get(pgset,
+ vdp->thread_info, current, (int *)&p)) != 0)
+ goto err;
+
+ if (p != 0) {
+ /*
+ * We've found a cycle. Return success anyway--
+ * our caller may as well use however much of
+ * the pgset we've come up with.
+ */
+ break;
+ }
+ if ((ret =
+ __db_vrfy_pgset_inc(pgset, vdp->thread_info, current)) != 0)
+ goto err;
+
+ current = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+ err_ret = ret;
+ h = NULL;
+ }
+
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
+
+ return (ret == 0 ? err_ret : ret);
+}
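
A hedged miniature of that chain walk; mark_chain, next[], NPAGES, and PG_INVALID are invented stand-ins for NEXT_PGNO, the file's page count, and PGNO_INVALID. The seen[] marks play the role of the pgset counts: a revisited page means a cycle, and the walk stops while keeping the pages gathered so far.

#define NPAGES		16
#define PG_INVALID	0

static int
mark_chain(const unsigned next[NPAGES], unsigned first, char seen[NPAGES])
{
	unsigned cur;

	for (cur = first; cur != PG_INVALID && cur < NPAGES;
	    cur = next[cur]) {
		if (seen[cur])	/* Cycle: stop, keep what we have. */
			break;
		seen[cur] = 1;
	}
	return (0);
}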
+
+/*
+ * __bam_safe_getdata --
+ *	Utility function for __bam_vrfy_itemorder.  Safely gets the datum at
+ * index i on page h, and sticks it in DBT dbt.  If ovflok is 1 and item i
+ * is an overflow item, we do a safe_goff to get the item and signal that
+ * we need to free dbt->data; if ovflok is 0, we leave the DBT zeroed.
+ */
+static int
+__bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ PAGE *h;
+ u_int32_t i;
+ int ovflok;
+ DBT *dbt;
+ int *freedbtp;
+{
+ BKEYDATA *bk;
+ BOVERFLOW *bo;
+ DBC *dbc;
+ int ret;
+
+ memset(dbt, 0, sizeof(DBT));
+ *freedbtp = 0;
+
+ bk = GET_BKEYDATA(dbp, h, i);
+ if (B_TYPE(bk->type) == B_OVERFLOW) {
+ if (!ovflok)
+ return (0);
+
+ if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+ PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+ return (ret);
+ bo = (BOVERFLOW *)bk;
+ F_SET(dbt, DB_DBT_MALLOC);
+
+ *freedbtp = 1;
+ return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, NULL, NULL));
+ } else {
+ dbt->data = bk->data;
+ dbt->size = bk->len;
+ }
+
+ return (0);
+}
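
A hedged caller-side fragment, assumed rather than lifted from the source, with dbp, ip, h, and i in scope as in the verify code above: an itemorder pass can run cheaply with ovflok 0 and simply skip overflow items, while a pass with ovflok 1 must free the copied buffer whenever freedbtp came back set.

DBT dbt;
int freedbt;

if (__bam_safe_getdata(dbp, ip, h, i, 1 /* ovflok */,
    &dbt, &freedbt) == 0) {
	if (dbt.data != NULL) {
		/* ... examine dbt.data / dbt.size ... */
	}
	if (freedbt)	/* Only overflow fetches allocate. */
		__os_ufree(dbp->env, dbt.data);
}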
diff --git a/btree/btree.h b/btree/btree.h
deleted file mode 100644
index 36d35c9..0000000
--- a/btree/btree.h
+++ /dev/null
@@ -1,383 +0,0 @@
-/*-
- * Copyright (c) 1991, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Mike Olson.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)btree.h 8.11 (Berkeley) 8/17/94
- */
-
-/* Macros to set/clear/test flags. */
-#define F_SET(p, f) (p)->flags |= (f)
-#define F_CLR(p, f) (p)->flags &= ~(f)
-#define F_ISSET(p, f) ((p)->flags & (f))
-
-#include <mpool.h>
-
-#define DEFMINKEYPAGE (2) /* Minimum keys per page */
-#define MINCACHE (5) /* Minimum cached pages */
-#define MINPSIZE (512) /* Minimum page size */
-
-/*
- * Page 0 of a btree file contains a copy of the meta-data. This page is also
- * used as an out-of-band page, i.e. page pointers that point to nowhere point
- * to page 0. Page 1 is the root of the btree.
- */
-#define P_INVALID 0 /* Invalid tree page number. */
-#define P_META 0 /* Tree metadata page number. */
-#define P_ROOT 1 /* Tree root page number. */
-
-/*
- * There are five page layouts in the btree: btree internal pages (BINTERNAL),
- * btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages
- * (RLEAF) and overflow pages. All five page types have a page header (PAGE).
- * This implementation requires that values within structures NOT be padded.
- * (ANSI C permits random padding.) If your compiler pads randomly you'll have
- * to do some work to get this package to run.
- */
-typedef struct _page {
- pgno_t pgno; /* this page's page number */
- pgno_t prevpg; /* left sibling */
- pgno_t nextpg; /* right sibling */
-
-#define P_BINTERNAL 0x01 /* btree internal page */
-#define P_BLEAF 0x02 /* leaf page */
-#define P_OVERFLOW 0x04 /* overflow page */
-#define P_RINTERNAL 0x08 /* recno internal page */
-#define P_RLEAF 0x10 /* leaf page */
-#define P_TYPE 0x1f /* type mask */
-#define P_PRESERVE 0x20 /* never delete this chain of pages */
- u_int32_t flags;
-
- indx_t lower; /* lower bound of free space on page */
- indx_t upper; /* upper bound of free space on page */
- indx_t linp[1]; /* indx_t-aligned VAR. LENGTH DATA */
-} PAGE;
-
-/* First and next index. */
-#define BTDATAOFF \
- (sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) + \
- sizeof(u_int32_t) + sizeof(indx_t) + sizeof(indx_t))
-#define NEXTINDEX(p) (((p)->lower - BTDATAOFF) / sizeof(indx_t))
-
-/*
- * For pages other than overflow pages, there is an array of offsets into the
- * rest of the page immediately following the page header. Each offset is to
- * an item which is unique to the type of page. The h_lower offset is just
- * past the last filled-in index. The h_upper offset is the first item on the
- * page. Offsets are from the beginning of the page.
- *
- * If an item is too big to store on a single page, a flag is set and the item
- * is a { page, size } pair such that the page is the first page of an overflow
- * chain with size bytes of item. Overflow pages are simply bytes without any
- * external structure.
- *
- * The page number and size fields in the items are pgno_t-aligned so they can
- * be manipulated without copying. (This presumes that 32 bit items can be
- * manipulated on this system.)
- */
-#define LALIGN(n) (((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1))
-#define NOVFLSIZE (sizeof(pgno_t) + sizeof(u_int32_t))
-
-/*
- * For the btree internal pages, the item is a key. BINTERNALs are {key, pgno}
- * pairs, such that the key compares less than or equal to all of the records
- * on that page. For a tree without duplicate keys, an internal page with two
- * consecutive keys, a and b, will have all records greater than or equal to a
- * and less than b stored on the page associated with a. Duplicate keys are
- * somewhat special and can cause duplicate internal and leaf page records and
- * some minor modifications of the above rule.
- */
-typedef struct _binternal {
- u_int32_t ksize; /* key size */
- pgno_t pgno; /* page number stored on */
-#define P_BIGDATA 0x01 /* overflow data */
-#define P_BIGKEY 0x02 /* overflow key */
- u_char flags;
- char bytes[1]; /* data */
-} BINTERNAL;
-
-/* Get the page's BINTERNAL structure at index indx. */
-#define GETBINTERNAL(pg, indx) \
- ((BINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
-
-/* Get the number of bytes in the entry. */
-#define NBINTERNAL(len) \
- LALIGN(sizeof(u_int32_t) + sizeof(pgno_t) + sizeof(u_char) + (len))
-
-/* Copy a BINTERNAL entry to the page. */
-#define WR_BINTERNAL(p, size, pgno, flags) { \
- *(u_int32_t *)p = size; \
- p += sizeof(u_int32_t); \
- *(pgno_t *)p = pgno; \
- p += sizeof(pgno_t); \
- *(u_char *)p = flags; \
- p += sizeof(u_char); \
-}
-
-/*
- * For the recno internal pages, the item is a page number with the number of
- * keys found on that page and below.
- */
-typedef struct _rinternal {
- recno_t nrecs; /* number of records */
- pgno_t pgno; /* page number stored below */
-} RINTERNAL;
-
-/* Get the page's RINTERNAL structure at index indx. */
-#define GETRINTERNAL(pg, indx) \
- ((RINTERNAL *)((char *)(pg) + (pg)->linp[indx]))
-
-/* Get the number of bytes in the entry. */
-#define NRINTERNAL \
- LALIGN(sizeof(recno_t) + sizeof(pgno_t))
-
-/* Copy a RINTERNAL entry to the page. */
-#define WR_RINTERNAL(p, nrecs, pgno) { \
- *(recno_t *)p = nrecs; \
- p += sizeof(recno_t); \
- *(pgno_t *)p = pgno; \
-}
-
-/* For the btree leaf pages, the item is a key and data pair. */
-typedef struct _bleaf {
- u_int32_t ksize; /* size of key */
- u_int32_t dsize; /* size of data */
- u_char flags; /* P_BIGDATA, P_BIGKEY */
- char bytes[1]; /* data */
-} BLEAF;
-
-/* Get the page's BLEAF structure at index indx. */
-#define GETBLEAF(pg, indx) \
- ((BLEAF *)((char *)(pg) + (pg)->linp[indx]))
-
-/* Get the number of bytes in the entry. */
-#define NBLEAF(p) NBLEAFDBT((p)->ksize, (p)->dsize)
-
-/* Get the number of bytes in the user's key/data pair. */
-#define NBLEAFDBT(ksize, dsize) \
- LALIGN(sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_char) + \
- (ksize) + (dsize))
-
-/* Copy a BLEAF entry to the page. */
-#define WR_BLEAF(p, key, data, flags) { \
- *(u_int32_t *)p = key->size; \
- p += sizeof(u_int32_t); \
- *(u_int32_t *)p = data->size; \
- p += sizeof(u_int32_t); \
- *(u_char *)p = flags; \
- p += sizeof(u_char); \
- memmove(p, key->data, key->size); \
- p += key->size; \
- memmove(p, data->data, data->size); \
-}
-
-/* For the recno leaf pages, the item is a data entry. */
-typedef struct _rleaf {
- u_int32_t dsize; /* size of data */
- u_char flags; /* P_BIGDATA */
- char bytes[1];
-} RLEAF;
-
-/* Get the page's RLEAF structure at index indx. */
-#define GETRLEAF(pg, indx) \
- ((RLEAF *)((char *)(pg) + (pg)->linp[indx]))
-
-/* Get the number of bytes in the entry. */
-#define NRLEAF(p) NRLEAFDBT((p)->dsize)
-
-/* Get the number of bytes from the user's data. */
-#define NRLEAFDBT(dsize) \
- LALIGN(sizeof(u_int32_t) + sizeof(u_char) + (dsize))
-
-/* Copy a RLEAF entry to the page. */
-#define WR_RLEAF(p, data, flags) { \
- *(u_int32_t *)p = data->size; \
- p += sizeof(u_int32_t); \
- *(u_char *)p = flags; \
- p += sizeof(u_char); \
- memmove(p, data->data, data->size); \
-}
-
-/*
- * A record in the tree is either a pointer to a page and an index in the page
- * or a page number and an index. These structures are used as a cursor, stack
- * entry and search returns as well as to pass records to other routines.
- *
- * One comment about searches. Internal page searches must find the largest
- * record less than key in the tree so that descents work. Leaf page searches
- * must find the smallest record greater than key so that the returned index
- * is the record's correct position for insertion.
- */
-typedef struct _epgno {
- pgno_t pgno; /* the page number */
- indx_t index; /* the index on the page */
-} EPGNO;
-
-typedef struct _epg {
- PAGE *page; /* the (pinned) page */
- indx_t index; /* the index on the page */
-} EPG;
-
-/*
- * About cursors. The cursor (and the page that contained the key/data pair
- * that it referenced) can be deleted, which makes things a bit tricky. If
- * there are no duplicates of the cursor key in the tree (i.e. B_NODUPS is set
- * or there simply aren't any duplicates of the key) we copy the key that it
- * referenced when it's deleted, and reacquire a new cursor key if the cursor
- * is used again.  If there are duplicate keys, we move to the next/previous
- * key, and set a flag so that we know what happened. NOTE: if duplicate (to
- * the cursor) keys are added to the tree during this process, it is undefined
- * if they will be returned or not in a cursor scan.
- *
- * The flags determine the possible states of the cursor:
- *
- * CURS_INIT The cursor references *something*.
- * CURS_ACQUIRE The cursor was deleted, and a key has been saved so that
- * we can reacquire the right position in the tree.
- * CURS_AFTER, CURS_BEFORE
- * The cursor was deleted, and now references a key/data pair
- * that has not yet been returned, either before or after the
- * deleted key/data pair.
- * XXX
- * This structure is broken out so that we can eventually offer multiple
- * cursors as part of the DB interface.
- */
-typedef struct _cursor {
- EPGNO pg; /* B: Saved tree reference. */
- DBT key; /* B: Saved key, or key.data == NULL. */
- recno_t rcursor; /* R: recno cursor (1-based) */
-
-#define CURS_ACQUIRE 0x01 /* B: Cursor needs to be reacquired. */
-#define CURS_AFTER 0x02 /* B: Unreturned cursor after key. */
-#define CURS_BEFORE 0x04 /* B: Unreturned cursor before key. */
-#define CURS_INIT 0x08 /* RB: Cursor initialized. */
- u_int8_t flags;
-} CURSOR;
-
-/*
- * The metadata of the tree. The nrecs field is used only by the RECNO code.
- * This is because the btree doesn't really need it and it requires that every
- * put or delete call modify the metadata.
- */
-typedef struct _btmeta {
- u_int32_t magic; /* magic number */
- u_int32_t version; /* version */
- u_int32_t psize; /* page size */
- u_int32_t free; /* page number of first free page */
- u_int32_t nrecs; /* R: number of records */
-
-#define SAVEMETA (B_NODUPS | R_RECNO)
- u_int32_t flags; /* bt_flags & SAVEMETA */
-} BTMETA;
-
-/* The in-memory btree/recno data structure. */
-typedef struct _btree {
- MPOOL *bt_mp; /* memory pool cookie */
-
- DB *bt_dbp; /* pointer to enclosing DB */
-
- EPG bt_cur; /* current (pinned) page */
- PAGE *bt_pinned; /* page pinned across calls */
-
- CURSOR bt_cursor; /* cursor */
-
-#define BT_PUSH(t, p, i) { \
- t->bt_sp->pgno = p; \
- t->bt_sp->index = i; \
- ++t->bt_sp; \
-}
-#define BT_POP(t) (t->bt_sp == t->bt_stack ? NULL : --t->bt_sp)
-#define BT_CLR(t) (t->bt_sp = t->bt_stack)
- EPGNO bt_stack[50]; /* stack of parent pages */
- EPGNO *bt_sp; /* current stack pointer */
-
- DBT bt_rkey; /* returned key */
- DBT bt_rdata; /* returned data */
-
- int bt_fd; /* tree file descriptor */
-
- pgno_t bt_free; /* next free page */
- u_int32_t bt_psize; /* page size */
- indx_t bt_ovflsize; /* cut-off for key/data overflow */
- int bt_lorder; /* byte order */
- /* sorted order */
- enum { NOT, BACK, FORWARD } bt_order;
- EPGNO bt_last; /* last insert */
-
- /* B: key comparison function */
- int (*bt_cmp) __P((const DBT *, const DBT *));
- /* B: prefix comparison function */
- size_t (*bt_pfx) __P((const DBT *, const DBT *));
- /* R: recno input function */
- int (*bt_irec) __P((struct _btree *, recno_t));
-
- FILE *bt_rfp; /* R: record FILE pointer */
- int bt_rfd; /* R: record file descriptor */
-
- caddr_t bt_cmap; /* R: current point in mapped space */
- caddr_t bt_smap; /* R: start of mapped space */
- caddr_t bt_emap; /* R: end of mapped space */
- size_t bt_msize; /* R: size of mapped region. */
-
- recno_t bt_nrecs; /* R: number of records */
- size_t bt_reclen; /* R: fixed record length */
- u_char bt_bval; /* R: delimiting byte/pad character */
-
-/*
- * NB:
- * B_NODUPS and R_RECNO are stored on disk, and may not be changed.
- */
-#define B_INMEM 0x00001 /* in-memory tree */
-#define B_METADIRTY 0x00002 /* need to write metadata */
-#define B_MODIFIED 0x00004 /* tree modified */
-#define B_NEEDSWAP 0x00008 /* if byte order requires swapping */
-#define B_RDONLY 0x00010 /* read-only tree */
-
-#define B_NODUPS 0x00020 /* no duplicate keys permitted */
-#define R_RECNO 0x00080 /* record oriented tree */
-
-#define R_CLOSEFP 0x00040 /* opened a file pointer */
-#define R_EOF 0x00100 /* end of input file reached. */
-#define R_FIXLEN 0x00200 /* fixed length records */
-#define R_MEMMAPPED 0x00400 /* memory mapped file. */
-#define R_INMEM 0x00800 /* in-memory file */
-#define R_MODIFIED 0x01000 /* modified file */
-#define R_RDONLY 0x02000 /* read-only file */
-
-#define B_DB_LOCK 0x04000 /* DB_LOCK specified. */
-#define B_DB_SHMEM 0x08000 /* DB_SHMEM specified. */
-#define B_DB_TXN 0x10000 /* DB_TXN specified. */
- u_int32_t flags;
-} BTREE;
-
-#include "extern.h"
diff --git a/btree/btree.src b/btree/btree.src
new file mode 100644
index 0000000..b6198e1
--- /dev/null
+++ b/btree/btree.src
@@ -0,0 +1,291 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+DBPRIVATE
+PREFIX __bam
+
+INCLUDE #include "db_int.h"
+INCLUDE #include "dbinc/crypto.h"
+INCLUDE #include "dbinc/db_page.h"
+INCLUDE #include "dbinc/db_am.h"
+INCLUDE #include "dbinc/btree.h"
+INCLUDE #include "dbinc/log.h"
+INCLUDE #include "dbinc/txn.h"
+INCLUDE
+
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number.
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * ppgno: the parent page number.
+ * plsn: the parent page's original LSN.
+ * pg: the split page's contents before the split.
+ * opflags: SPL_NRECS: if splitting a tree that maintains a record count.
+ * pindx: index of new record in parent page.
+ */
+BEGIN split 48 62
+DB fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG ppgno db_pgno_t lu
+POINTER plsn DB_LSN * lu
+ARG pindx u_int32_t lu
+PGDBT pg DBT s
+DBT pentry DBT s
+DBT rentry DBT s
+ARG opflags u_int32_t lu
+END
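+
Each BEGIN/END description in this file is compiled by gen_rec.awk into the marshaling code of btree_auto.c below; the ARG/POINTER/DBT lines become, in order, the parameters of the generated logging routine. For the split record above that works out to the following signature, reproduced with its parameter names from the generated file:

int __bam_split_log(DB *dbp, DB_TXN *txnp, DB_LSN *ret_lsnp, u_int32_t flags,
    db_pgno_t left, DB_LSN *llsn, db_pgno_t right, DB_LSN *rlsn,
    u_int32_t indx, db_pgno_t npgno, DB_LSN *nlsn, db_pgno_t ppgno,
    DB_LSN *plsn, u_int32_t pindx, const DBT *pg, const DBT *pentry,
    const DBT *rentry, u_int32_t opflags);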
+
+BEGIN_COMPAT split 42 62
+DB fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG root_pgno db_pgno_t lu
+PGDBT pg DBT s
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse-split
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied on the root page.
+ * root_pgno: the root page number.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+BEGIN rsplit 42 63
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+PGDBT pgdbt DBT s
+ARG root_pgno db_pgno_t lu
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-adj: used to log the adjustment of an index.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index adjusted.
+ * indx_copy: the index to copy if inserting.
+ * is_insert: 0 if a delete, 1 if an insert.
+ */
+BEGIN adj 42 55
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG indx_copy u_int32_t lu
+ARG is_insert u_int32_t lu
+END
+
+/*
+ * BTREE-cadjust: used to adjust the count change in an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be adjusted.
+ * adjust: the signed adjustment.
+ * opflags: CAD_UPDATEROOT: if root page count was adjusted.
+ */
+BEGIN cadjust 42 56
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG adjust int32_t ld
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-cdel: used to log the intent-to-delete of a cursor record.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be deleted.
+ */
+BEGIN cdel 42 57
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+END
+
+/*
+ * BTREE-repl: used to log the replacement of an item.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be replaced.
+ * isdeleted: set if the record was previously deleted.
+ * orig: the original data.
+ * repl: the replacement data.
+ * prefix: the prefix of the replacement that matches the original.
+ * suffix: the suffix of the replacement that matches the original.
+ */
+BEGIN repl 42 58
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG isdeleted u_int32_t lu
+DBT orig DBT s
+DBT repl DBT s
+ARG prefix u_int32_t lu
+ARG suffix u_int32_t lu
+END
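+
A hedged sketch of where the prefix and suffix lengths can come from; compute_affixes is a hypothetical helper, not a DB function. They are the longest shared prefix and the longest non-overlapping shared suffix of the original and replacement items, so recovery needs only the differing middle plus the two lengths.

#include <stddef.h>

static void
compute_affixes(const char *orig, size_t olen,
    const char *repl, size_t rlen, size_t *prefixp, size_t *suffixp)
{
	size_t p, s, max;

	for (p = 0; p < olen && p < rlen && orig[p] == repl[p]; p++)
		;
	/* Don't let the suffix overlap the prefix. */
	max = (olen < rlen ? olen : rlen) - p;
	for (s = 0; s < max &&
	    orig[olen - 1 - s] == repl[rlen - 1 - s]; s++)
		;
	*prefixp = p;
	*suffixp = s;
}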
+
+/*
+ * BTREE-root: log the assignment of a root btree page.
+ */
+BEGIN root 42 59
+DB fileid int32_t ld
+ARG meta_pgno db_pgno_t lu
+ARG root_pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+END
+
+/*
+ * BTREE-curadj: undo cursor adjustments on txn abort.
+ * Should only be processed during DB_TXN_ABORT.
+ * NOTE: the first_indx field is used to hold a signed
+ * index adjustment in one case; care should be taken
+ * if its size is changed.
+ */
+BEGIN curadj 42 64
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* Which adjustment. */
+ARG mode db_ca_mode ld
+/* Page entry is from. */
+ARG from_pgno db_pgno_t lu
+/* Page entry went to. */
+ARG to_pgno db_pgno_t lu
+/* Left page of root split. */
+ARG left_pgno db_pgno_t lu
+/* First index of dup set. Also used as adjustment. */
+ARG first_indx u_int32_t lu
+/* Index entry is from. */
+ARG from_indx u_int32_t lu
+/* Index where entry went. */
+ARG to_indx u_int32_t lu
+END
+
+/*
+ * BTREE-rcuradj: undo cursor adjustments on txn abort in
+ * renumbering recno trees.
+ * Should only be processed during DB_TXN_ABORT.
+ */
+BEGIN rcuradj 42 65
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* Which adjustment. */
+ARG mode ca_recno_arg ld
+/* Root page number. */
+ARG root db_pgno_t ld
+/* Recno of the adjustment. */
+ARG recno db_recno_t ld
+/* Order number of the adjustment. */
+ARG order u_int32_t lu
+END
+
+/*
+ * BTREE-relink -- Handles relinking around a deleted leaf page.
+ */
+BEGIN_COMPAT relink 43 147
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* The page being removed. */
+ARG pgno db_pgno_t lu
+/* The page's original lsn. */
+POINTER lsn DB_LSN * lu
+/* The previous page. */
+ARG prev db_pgno_t lu
+/* The previous page's original lsn. */
+POINTER lsn_prev DB_LSN * lu
+/* The next page. */
+ARG next db_pgno_t lu
+/* The next page's original lsn. */
+POINTER lsn_next DB_LSN * lu
+END
+
+BEGIN relink 44 147
+/* Fileid of db affected. */
+DB fileid int32_t ld
+/* The page being removed. */
+ARG pgno db_pgno_t lu
+/* The new page number, if any. */
+ARG new_pgno db_pgno_t lu
+/* The previous page. */
+ARG prev db_pgno_t lu
+/* The previous page's original lsn. */
+POINTER lsn_prev DB_LSN * lu
+/* The next page. */
+ARG next db_pgno_t lu
+/* The next page's original lsn. */
+POINTER lsn_next DB_LSN * lu
+END
+
+/*
+ * BTREE-merge -- Handles merging of pages during a compaction.
+ */
+BEGIN_COMPAT merge 44 148
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+DBT hdr DBT s
+DBT data DBT s
+DBT ind DBT s
+END
+
+BEGIN merge 47 148
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+PGDBT hdr DBT s
+PGDDBT data DBT s
+ARG pg_copy int32_t lu
+END
+
+/*
+ * BTREE-pgno -- Handles replacing a page number in the record
+ * reference on pgno by indx.
+ */
+BEGIN pgno 44 149
+DB fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG opgno db_pgno_t lu
+ARG npgno db_pgno_t lu
+END
diff --git a/btree/btree_auto.c b/btree/btree_auto.c
new file mode 100644
index 0000000..460f038
--- /dev/null
+++ b/btree/btree_auto.c
@@ -0,0 +1,3547 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
+/*
+ * PUBLIC: int __bam_split_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_split_args **));
+ */
+int
+__bam_split_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_split_args **argpp;
+{
+ __bam_split_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_split_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->left = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->llsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->right = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->rlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->npgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->nlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->ppgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->plsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->pindx, bp);
+ bp += sizeof(argp->pindx);
+
+ memset(&argp->pg, 0, sizeof(argp->pg));
+ LOGCOPY_32(env,&argp->pg.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->pg.data = bp;
+ bp += argp->pg.size;
+ if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+ int t_ret;
+ if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->pg.data,
+ (size_t)argp->pg.size, NULL, 1)) != 0)
+ return (t_ret);
+ }
+
+ memset(&argp->pentry, 0, sizeof(argp->pentry));
+ LOGCOPY_32(env,&argp->pentry.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->pentry.data = bp;
+ bp += argp->pentry.size;
+
+ memset(&argp->rentry, 0, sizeof(argp->rentry));
+ LOGCOPY_32(env,&argp->rentry.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->rentry.data = bp;
+ bp += argp->rentry.size;
+
+ LOGCOPY_32(env, &argp->opflags, bp);
+ bp += sizeof(argp->opflags);
+
+ *argpp = argp;
+ return (ret);
+}
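
The routine above shows the unmarshaling pattern every *_read function in this file follows: a byte cursor bp walks the record, copying each fixed-width field and advancing by its size. A hedged, stand-alone reduction of the idea, with memcpy standing in for LOGCOPY_32 (which additionally byte-swaps when the log was written with the other endianness):

#include <stdint.h>
#include <string.h>

struct rec_hdr {
	uint32_t type;		/* Record type, e.g. DB___bam_split. */
	uint32_t txnid;		/* Owning transaction's id. */
};

static const uint8_t *
read_rec_hdr(const uint8_t *bp, struct rec_hdr *h)
{
	memcpy(&h->type, bp, sizeof(h->type));
	bp += sizeof(h->type);
	memcpy(&h->txnid, bp, sizeof(h->txnid));
	bp += sizeof(h->txnid);
	return (bp);	/* Caller continues with the next field. */
}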
+
+/*
+ * PUBLIC: int __bam_split_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, u_int32_t,
+ * PUBLIC: db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, u_int32_t, const DBT *,
+ * PUBLIC: const DBT *, const DBT *, u_int32_t));
+ */
+int
+__bam_split_log(dbp, txnp, ret_lsnp, flags, left, llsn, right, rlsn, indx,
+ npgno, nlsn, ppgno, plsn, pindx, pg,
+ pentry, rentry, opflags)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t left;
+ DB_LSN * llsn;
+ db_pgno_t right;
+ DB_LSN * rlsn;
+ u_int32_t indx;
+ db_pgno_t npgno;
+ DB_LSN * nlsn;
+ db_pgno_t ppgno;
+ DB_LSN * plsn;
+ u_int32_t pindx;
+ const DBT *pg;
+ const DBT *pentry;
+ const DBT *rentry;
+ u_int32_t opflags;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_split;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*llsn)
+ + sizeof(u_int32_t)
+ + sizeof(*rlsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*nlsn)
+ + sizeof(u_int32_t)
+ + sizeof(*plsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size)
+ + sizeof(u_int32_t) + (pentry == NULL ? 0 : pentry->size)
+ + sizeof(u_int32_t) + (rentry == NULL ? 0 : rentry->size)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)left;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (llsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(llsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, llsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, llsn);
+ } else
+ memset(bp, 0, sizeof(*llsn));
+ bp += sizeof(*llsn);
+
+ uinttmp = (u_int32_t)right;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (rlsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(rlsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, rlsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, rlsn);
+ } else
+ memset(bp, 0, sizeof(*rlsn));
+ bp += sizeof(*rlsn);
+
+ LOGCOPY_32(env, bp, &indx);
+ bp += sizeof(indx);
+
+ uinttmp = (u_int32_t)npgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (nlsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(nlsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, nlsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, nlsn);
+ } else
+ memset(bp, 0, sizeof(*nlsn));
+ bp += sizeof(*nlsn);
+
+ uinttmp = (u_int32_t)ppgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (plsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(plsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, plsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, plsn);
+ } else
+ memset(bp, 0, sizeof(*plsn));
+ bp += sizeof(*plsn);
+
+ LOGCOPY_32(env, bp, &pindx);
+ bp += sizeof(pindx);
+
+ if (pg == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &pg->size);
+ bp += sizeof(pg->size);
+ memcpy(bp, pg->data, pg->size);
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)pg->size, (DBT *)NULL, 0)) != 0)
+ return (ret);
+ bp += pg->size;
+ }
+
+ if (pentry == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &pentry->size);
+ bp += sizeof(pentry->size);
+ memcpy(bp, pentry->data, pentry->size);
+ bp += pentry->size;
+ }
+
+ if (rentry == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &rentry->size);
+ bp += sizeof(rentry->size);
+ memcpy(bp, rentry->data, rentry->size);
+ bp += rentry->size;
+ }
+
+ LOGCOPY_32(env, bp, &opflags);
+ bp += sizeof(opflags);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_split_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
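
The DBT arguments above are written length-first. A hedged stand-alone sketch of that encoding, where put_dbt is an invented name and memcpy again stands in for LOGCOPY_32: a NULL DBT is logged as a bare zero length, otherwise the 32-bit size is followed by the raw bytes, which is exactly the shape __bam_split_read unpacks.

#include <stdint.h>
#include <string.h>

static uint8_t *
put_dbt(uint8_t *bp, const void *data, uint32_t size)
{
	memcpy(bp, &size, sizeof(size));	/* Length prefix. */
	bp += sizeof(size);
	if (data != NULL && size > 0) {
		memcpy(bp, data, size);		/* Raw item bytes. */
		bp += size;
	}
	return (bp);
}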
+
+/*
+ * PUBLIC: int __bam_split_42_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __bam_split_42_args **));
+ */
+int
+__bam_split_42_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_split_42_args **argpp;
+{
+ __bam_split_42_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_split_42_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->left = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->llsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->right = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->rlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->npgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->nlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->root_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->pg, 0, sizeof(argp->pg));
+ LOGCOPY_32(env,&argp->pg.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->pg.data = bp;
+ bp += argp->pg.size;
+
+ LOGCOPY_32(env, &argp->opflags, bp);
+ bp += sizeof(argp->opflags);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_rsplit_args **));
+ */
+int
+__bam_rsplit_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_rsplit_args **argpp;
+{
+ __bam_rsplit_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_rsplit_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->pgdbt, 0, sizeof(argp->pgdbt));
+ LOGCOPY_32(env,&argp->pgdbt.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->pgdbt.data = bp;
+ bp += argp->pgdbt.size;
+ if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+ int t_ret;
+ if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->pgdbt.data,
+ (size_t)argp->pgdbt.size, NULL, 1)) != 0)
+ return (t_ret);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->root_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->nrec = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ memset(&argp->rootent, 0, sizeof(argp->rootent));
+ LOGCOPY_32(env,&argp->rootent.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->rootent.data = bp;
+ bp += argp->rootent.size;
+
+ LOGCOPY_TOLSN(env, &argp->rootlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, const DBT *, db_pgno_t, db_pgno_t,
+ * PUBLIC: const DBT *, DB_LSN *));
+ */
+int
+__bam_rsplit_log(dbp, txnp, ret_lsnp, flags, pgno, pgdbt, root_pgno, nrec, rootent,
+ rootlsn)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ const DBT *pgdbt;
+ db_pgno_t root_pgno;
+ db_pgno_t nrec;
+ const DBT *rootent;
+ DB_LSN * rootlsn;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_rsplit;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (rootent == NULL ? 0 : rootent->size)
+ + sizeof(*rootlsn);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (pgdbt == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &pgdbt->size);
+ bp += sizeof(pgdbt->size);
+ memcpy(bp, pgdbt->data, pgdbt->size);
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)pgdbt->size, (DBT *)NULL, 0)) != 0)
+ return (ret);
+ bp += pgdbt->size;
+ }
+
+ uinttmp = (u_int32_t)root_pgno;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)nrec;
+ LOGCOPY_32(env,bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (rootent == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &rootent->size);
+ bp += sizeof(rootent->size);
+ memcpy(bp, rootent->data, rootent->size);
+ bp += rootent->size;
+ }
+
+ if (rootlsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(rootlsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, rootlsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, rootlsn);
+ } else
+ memset(bp, 0, sizeof(*rootlsn));
+ bp += sizeof(*rootlsn);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+ if ((ret = __log_put(env, rlsnp,(DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_rsplit_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_adj_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_adj_args **));
+ */
+int
+__bam_adj_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_adj_args **argpp;
+{
+ __bam_adj_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_adj_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ LOGCOPY_32(env, &argp->indx_copy, bp);
+ bp += sizeof(argp->indx_copy);
+
+ LOGCOPY_32(env, &argp->is_insert, bp);
+ bp += sizeof(argp->is_insert);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_adj_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, u_int32_t,
+ * PUBLIC: u_int32_t));
+ */
+int
+__bam_adj_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, indx_copy, is_insert)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+ u_int32_t indx_copy;
+ u_int32_t is_insert;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_adj;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ LOGCOPY_32(env, bp, &indx);
+ bp += sizeof(indx);
+
+ LOGCOPY_32(env, bp, &indx_copy);
+ bp += sizeof(indx_copy);
+
+ LOGCOPY_32(env, bp, &is_insert);
+ bp += sizeof(is_insert);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_adj_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_cadjust_args **));
+ */
+int
+__bam_cadjust_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_cadjust_args **argpp;
+{
+ __bam_cadjust_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_cadjust_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->adjust = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &argp->opflags, bp);
+ bp += sizeof(argp->opflags);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, int32_t, u_int32_t));
+ */
+int
+__bam_cadjust_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, adjust, opflags)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+ int32_t adjust;
+ u_int32_t opflags;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_cadjust;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ LOGCOPY_32(env, bp, &indx);
+ bp += sizeof(indx);
+
+ uinttmp = (u_int32_t)adjust;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, bp, &opflags);
+ bp += sizeof(opflags);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_cadjust_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_cdel_args **));
+ */
+int
+__bam_cdel_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_cdel_args **argpp;
+{
+ __bam_cdel_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_cdel_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t));
+ */
+int
+__bam_cdel_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_cdel;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ LOGCOPY_32(env, bp, &indx);
+ bp += sizeof(indx);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_cdel_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_repl_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_repl_args **));
+ */
+int
+__bam_repl_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_repl_args **argpp;
+{
+ __bam_repl_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_repl_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ LOGCOPY_32(env, &argp->isdeleted, bp);
+ bp += sizeof(argp->isdeleted);
+
+ memset(&argp->orig, 0, sizeof(argp->orig));
+	LOGCOPY_32(env, &argp->orig.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->orig.data = bp;
+	bp += argp->orig.size;
+
+	memset(&argp->repl, 0, sizeof(argp->repl));
+	LOGCOPY_32(env, &argp->repl.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->repl.data = bp;
+ bp += argp->repl.size;
+
+ LOGCOPY_32(env, &argp->prefix, bp);
+ bp += sizeof(argp->prefix);
+
+ LOGCOPY_32(env, &argp->suffix, bp);
+ bp += sizeof(argp->suffix);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_repl_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, u_int32_t,
+ * PUBLIC: const DBT *, const DBT *, u_int32_t, u_int32_t));
+ */
+int
+__bam_repl_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, isdeleted, orig,
+ repl, prefix, suffix)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ u_int32_t indx;
+ u_int32_t isdeleted;
+ const DBT *orig;
+ const DBT *repl;
+ u_int32_t prefix;
+ u_int32_t suffix;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_repl;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t) + (orig == NULL ? 0 : orig->size)
+ + sizeof(u_int32_t) + (repl == NULL ? 0 : repl->size)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ LOGCOPY_32(env, bp, &indx);
+ bp += sizeof(indx);
+
+ LOGCOPY_32(env, bp, &isdeleted);
+ bp += sizeof(isdeleted);
+
+ if (orig == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &orig->size);
+ bp += sizeof(orig->size);
+ memcpy(bp, orig->data, orig->size);
+ bp += orig->size;
+ }
+
+ if (repl == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &repl->size);
+ bp += sizeof(repl->size);
+ memcpy(bp, repl->data, repl->size);
+ bp += repl->size;
+ }
+
+ LOGCOPY_32(env, bp, &prefix);
+ bp += sizeof(prefix);
+
+ LOGCOPY_32(env, bp, &suffix);
+ bp += sizeof(suffix);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_repl_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_root_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_root_args **));
+ */
+int
+__bam_root_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_root_args **argpp;
+{
+ __bam_root_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_root_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->meta_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->root_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->meta_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_root_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t, DB_LSN *));
+ */
+int
+__bam_root_log(dbp, txnp, ret_lsnp, flags, meta_pgno, root_pgno, meta_lsn)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t meta_pgno;
+ db_pgno_t root_pgno;
+ DB_LSN * meta_lsn;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_root;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*meta_lsn);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)meta_pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)root_pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (meta_lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, meta_lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, meta_lsn);
+ } else
+ memset(bp, 0, sizeof(*meta_lsn));
+ bp += sizeof(*meta_lsn);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_root_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_curadj_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_curadj_args **));
+ */
+int
+__bam_curadj_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_curadj_args **argpp;
+{
+ __bam_curadj_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_curadj_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->mode = (db_ca_mode)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->from_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->to_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->left_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &argp->first_indx, bp);
+ bp += sizeof(argp->first_indx);
+
+ LOGCOPY_32(env, &argp->from_indx, bp);
+ bp += sizeof(argp->from_indx);
+
+ LOGCOPY_32(env, &argp->to_indx, bp);
+ bp += sizeof(argp->to_indx);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_curadj_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_ca_mode, db_pgno_t, db_pgno_t, db_pgno_t,
+ * PUBLIC: u_int32_t, u_int32_t, u_int32_t));
+ */
+int
+__bam_curadj_log(dbp, txnp, ret_lsnp, flags, mode, from_pgno, to_pgno, left_pgno, first_indx,
+ from_indx, to_indx)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_ca_mode mode;
+ db_pgno_t from_pgno;
+ db_pgno_t to_pgno;
+ db_pgno_t left_pgno;
+ u_int32_t first_indx;
+ u_int32_t from_indx;
+ u_int32_t to_indx;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_curadj;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)mode;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)from_pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)to_pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)left_pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, bp, &first_indx);
+ bp += sizeof(first_indx);
+
+ LOGCOPY_32(env, bp, &from_indx);
+ bp += sizeof(from_indx);
+
+ LOGCOPY_32(env, bp, &to_indx);
+ bp += sizeof(to_indx);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_curadj_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_rcuradj_args **));
+ */
+int
+__bam_rcuradj_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_rcuradj_args **argpp;
+{
+ __bam_rcuradj_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_rcuradj_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->mode = (ca_recno_arg)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->root = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->recno = (db_recno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &argp->order, bp);
+ bp += sizeof(argp->order);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, ca_recno_arg, db_pgno_t, db_recno_t, u_int32_t));
+ */
+int
+__bam_rcuradj_log(dbp, txnp, ret_lsnp, flags, mode, root, recno, order)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ ca_recno_arg mode;
+ db_pgno_t root;
+ db_recno_t recno;
+ u_int32_t order;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_rcuradj;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)mode;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)root;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)recno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, bp, &order);
+ bp += sizeof(order);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_rcuradj_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_relink_43_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __bam_relink_43_args **));
+ */
+int
+__bam_relink_43_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_relink_43_args **argpp;
+{
+ __bam_relink_43_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_relink_43_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->prev = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn_prev, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->next = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn_next, bp);
+ bp += sizeof(DB_LSN);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_relink_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_relink_args **));
+ */
+int
+__bam_relink_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_relink_args **argpp;
+{
+ __bam_relink_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_relink_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->new_pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->prev = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn_prev, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->next = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn_next, bp);
+ bp += sizeof(DB_LSN);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_relink_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t, db_pgno_t, DB_LSN *, db_pgno_t,
+ * PUBLIC: DB_LSN *));
+ */
+int
+__bam_relink_log(dbp, txnp, ret_lsnp, flags, pgno, new_pgno, prev, lsn_prev, next,
+ lsn_next)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ db_pgno_t new_pgno;
+ db_pgno_t prev;
+ DB_LSN * lsn_prev;
+ db_pgno_t next;
+ DB_LSN * lsn_next;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_relink;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn_prev)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn_next);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)new_pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+	bp += sizeof(uinttmp);
+
+	uinttmp = (u_int32_t)prev;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn_prev != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn_prev, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn_prev)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn_prev);
+ } else
+ memset(bp, 0, sizeof(*lsn_prev));
+ bp += sizeof(*lsn_prev);
+
+ uinttmp = (u_int32_t)next;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn_next != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn_next, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn_next)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn_next);
+ } else
+ memset(bp, 0, sizeof(*lsn_next));
+ bp += sizeof(*lsn_next);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_relink_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_merge_44_read __P((ENV *, DB **, void *,
+ * PUBLIC: void *, __bam_merge_44_args **));
+ */
+int
+__bam_merge_44_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_merge_44_args **argpp;
+{
+ __bam_merge_44_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_merge_44_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->npgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->nlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ memset(&argp->hdr, 0, sizeof(argp->hdr));
+	LOGCOPY_32(env, &argp->hdr.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->hdr.data = bp;
+	bp += argp->hdr.size;
+
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env, &argp->data.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->data.data = bp;
+	bp += argp->data.size;
+
+	memset(&argp->ind, 0, sizeof(argp->ind));
+	LOGCOPY_32(env, &argp->ind.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->ind.data = bp;
+ bp += argp->ind.size;
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_merge_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_merge_args **));
+ */
+int
+__bam_merge_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_merge_args **argpp;
+{
+ __bam_merge_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_merge_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->npgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->nlsn, bp);
+ bp += sizeof(DB_LSN);
+
+ memset(&argp->hdr, 0, sizeof(argp->hdr));
+	LOGCOPY_32(env, &argp->hdr.size, bp);
+	bp += sizeof(u_int32_t);
+	argp->hdr.data = bp;
+	bp += argp->hdr.size;
+
+	memset(&argp->data, 0, sizeof(argp->data));
+	LOGCOPY_32(env, &argp->data.size, bp);
+ bp += sizeof(u_int32_t);
+ argp->data.data = bp;
+ bp += argp->data.size;
+	if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) {
+		int t_ret;
+		if ((t_ret = __db_pageswap(*dbpp,
+		    (PAGE *)argp->hdr.data, (size_t)argp->hdr.size,
+		    &argp->data, 1)) != 0) {
+			/* Free the argument buffer before failing. */
+			__os_free(env, argp);
+			return (t_ret);
+		}
+	}
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pg_copy = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_merge_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, const DBT *,
+ * PUBLIC: const DBT *, int32_t));
+ */
+int
+__bam_merge_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, npgno, nlsn, hdr,
+ data, pg_copy)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+ DB_LSN * lsn;
+ db_pgno_t npgno;
+ DB_LSN * nlsn;
+ const DBT *hdr;
+ const DBT *data;
+ int32_t pg_copy;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t zero, uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_merge;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+ /*
+ * We need to assign begin_lsn while holding region mutex.
+ * That assignment is done inside the DbEnv->log_put call,
+ * so pass in the appropriate memory location to be filled
+ * in by the log_put code.
+ */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn)
+ + sizeof(u_int32_t)
+ + sizeof(*nlsn)
+ + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size)
+ + sizeof(u_int32_t) + (data == NULL ? 0 : data->size)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ uinttmp = (u_int32_t)npgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (nlsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(nlsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, nlsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, nlsn);
+ } else
+ memset(bp, 0, sizeof(*nlsn));
+ bp += sizeof(*nlsn);
+
+ if (hdr == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &hdr->size);
+ bp += sizeof(hdr->size);
+ memcpy(bp, hdr->data, hdr->size);
+ if (LOG_SWAPPED(env))
+ if ((ret = __db_pageswap(dbp,
+ (PAGE *)bp, (size_t)hdr->size, (DBT *)data, 0)) != 0)
+ return (ret);
+ bp += hdr->size;
+ }
+
+ if (data == NULL) {
+ zero = 0;
+ LOGCOPY_32(env, bp, &zero);
+ bp += sizeof(u_int32_t);
+ } else {
+ LOGCOPY_32(env, bp, &data->size);
+ bp += sizeof(data->size);
+ memcpy(bp, data->data, data->size);
+ if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC))
+ __os_free(env, data->data);
+ bp += data->size;
+ }
+
+ uinttmp = (u_int32_t)pg_copy;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_merge_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_pgno_read __P((ENV *, DB **, void *, void *,
+ * PUBLIC: __bam_pgno_args **));
+ */
+int
+__bam_pgno_read(env, dbpp, td, recbuf, argpp)
+ ENV *env;
+ DB **dbpp;
+ void *td;
+ void *recbuf;
+ __bam_pgno_args **argpp;
+{
+ __bam_pgno_args *argp;
+ u_int32_t uinttmp;
+ u_int8_t *bp;
+ int ret;
+
+ if ((ret = __os_malloc(env,
+ sizeof(__bam_pgno_args) + sizeof(DB_TXN), &argp)) != 0)
+ return (ret);
+ bp = recbuf;
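+	/*
+	 * The faked DB_TXN lives in the same allocation, immediately
+	 * after the args structure, so one __os_free releases both.
+	 */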
+ argp->txnp = (DB_TXN *)&argp[1];
+ memset(argp->txnp, 0, sizeof(DB_TXN));
+
+ argp->txnp->td = td;
+ LOGCOPY_32(env, &argp->type, bp);
+ bp += sizeof(argp->type);
+
+ LOGCOPY_32(env, &argp->txnp->txnid, bp);
+ bp += sizeof(argp->txnp->txnid);
+
+ LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->fileid = (int32_t)uinttmp;
+ bp += sizeof(uinttmp);
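+	/*
+	 * Resolve the logged file id back to its DB handle; any error
+	 * from the lookup is returned to the caller below.
+	 */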
+ if (dbpp != NULL) {
+ *dbpp = NULL;
+ ret = __dbreg_id_to_db(
+ env, argp->txnp, dbpp, argp->fileid, 1);
+ }
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->pgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_TOLSN(env, &argp->lsn, bp);
+ bp += sizeof(DB_LSN);
+
+ LOGCOPY_32(env, &argp->indx, bp);
+ bp += sizeof(argp->indx);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->opgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ LOGCOPY_32(env, &uinttmp, bp);
+ argp->npgno = (db_pgno_t)uinttmp;
+ bp += sizeof(uinttmp);
+
+ *argpp = argp;
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_pgno_log __P((DB *, DB_TXN *, DB_LSN *,
+ * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, db_pgno_t,
+ * PUBLIC: db_pgno_t));
+ */
+int
+__bam_pgno_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, opgno, npgno)
+ DB *dbp;
+ DB_TXN *txnp;
+ DB_LSN *ret_lsnp;
+ u_int32_t flags;
+ db_pgno_t pgno;
+	DB_LSN *lsn;
+ u_int32_t indx;
+ db_pgno_t opgno;
+ db_pgno_t npgno;
+{
+ DBT logrec;
+ DB_LSN *lsnp, null_lsn, *rlsnp;
+ DB_TXNLOGREC *lr;
+ ENV *env;
+ u_int32_t uinttmp, rectype, txn_num;
+ u_int npad;
+ u_int8_t *bp;
+ int is_durable, ret;
+
+ COMPQUIET(lr, NULL);
+
+ env = dbp->env;
+ rlsnp = ret_lsnp;
+ rectype = DB___bam_pgno;
+ npad = 0;
+ ret = 0;
+
+ if (LF_ISSET(DB_LOG_NOT_DURABLE) ||
+ F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
+ if (txnp == NULL)
+ return (0);
+ is_durable = 0;
+ } else
+ is_durable = 1;
+
+ if (txnp == NULL) {
+ txn_num = 0;
+ lsnp = &null_lsn;
+ null_lsn.file = null_lsn.offset = 0;
+ } else {
+ if (TAILQ_FIRST(&txnp->kids) != NULL &&
+ (ret = __txn_activekids(env, rectype, txnp)) != 0)
+ return (ret);
+		/*
+		 * We need to assign begin_lsn while holding the region
+		 * mutex.  That assignment is done inside the DbEnv->log_put
+		 * call, so pass in the appropriate memory location to be
+		 * filled in by the log_put code.
+		 */
+ DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp);
+ txn_num = txnp->txnid;
+ }
+
+ DB_ASSERT(env, dbp->log_filename != NULL);
+ if (dbp->log_filename->id == DB_LOGFILEID_INVALID &&
+ (ret = __dbreg_lazy_id(dbp)) != 0)
+ return (ret);
+
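+	/*
+	 * Record layout: header (type, txn id, prev LSN), then fileid,
+	 * pgno, lsn, indx, opgno and npgno.
+	 */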
+ logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(*lsn)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t)
+ + sizeof(u_int32_t);
+ if (CRYPTO_ON(env)) {
+ npad = env->crypto_handle->adj_size(logrec.size);
+ logrec.size += npad;
+ }
+
+ if (is_durable || txnp == NULL) {
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0)
+ return (ret);
+ } else {
+ if ((ret = __os_malloc(env,
+ logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0)
+ return (ret);
+#ifdef DIAGNOSTIC
+ if ((ret =
+ __os_malloc(env, logrec.size, &logrec.data)) != 0) {
+ __os_free(env, lr);
+ return (ret);
+ }
+#else
+ logrec.data = lr->data;
+#endif
+ }
+ if (npad > 0)
+ memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad);
+
+ bp = logrec.data;
+
+ LOGCOPY_32(env, bp, &rectype);
+ bp += sizeof(rectype);
+
+ LOGCOPY_32(env, bp, &txn_num);
+ bp += sizeof(txn_num);
+
+ LOGCOPY_FROMLSN(env, bp, lsnp);
+ bp += sizeof(DB_LSN);
+
+ uinttmp = (u_int32_t)dbp->log_filename->id;
+ LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)pgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ if (lsn != NULL) {
+ if (txnp != NULL) {
+ LOG *lp = env->lg_handle->reginfo.primary;
+ if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret =
+ __log_check_page_lsn(env, dbp, lsn)) != 0)
+ return (ret);
+ }
+ LOGCOPY_FROMLSN(env, bp, lsn);
+ } else
+ memset(bp, 0, sizeof(*lsn));
+ bp += sizeof(*lsn);
+
+ LOGCOPY_32(env, bp, &indx);
+ bp += sizeof(indx);
+
+ uinttmp = (u_int32_t)opgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ uinttmp = (u_int32_t)npgno;
+	LOGCOPY_32(env, bp, &uinttmp);
+ bp += sizeof(uinttmp);
+
+ DB_ASSERT(env,
+ (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size);
+
+ if (is_durable || txnp == NULL) {
+		if ((ret = __log_put(env, rlsnp, (DBT *)&logrec,
+ flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) {
+ *lsnp = *rlsnp;
+ if (rlsnp != ret_lsnp)
+ *ret_lsnp = *rlsnp;
+ }
+ } else {
+ ret = 0;
+#ifdef DIAGNOSTIC
+ /*
+ * Set the debug bit if we are going to log non-durable
+ * transactions so they will be ignored by recovery.
+ */
+ memcpy(lr->data, logrec.data, logrec.size);
+ rectype |= DB_debug_FLAG;
+ LOGCOPY_32(env, logrec.data, &rectype);
+
+ if (!IS_REP_CLIENT(env))
+ ret = __log_put(env,
+ rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY);
+#endif
+ STAILQ_INSERT_HEAD(&txnp->logs, lr, links);
+ F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY);
+ LSN_NOT_LOGGED(*ret_lsnp);
+ }
+
+#ifdef LOG_DIAGNOSTIC
+ if (ret != 0)
+ (void)__bam_pgno_print(env,
+ (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL);
+#endif
+
+#ifdef DIAGNOSTIC
+ __os_free(env, logrec.data);
+#else
+ if (is_durable || txnp == NULL)
+ __os_free(env, logrec.data);
+#endif
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __bam_init_recover __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_recover(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
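+	/*
+	 * Register one recovery function per btree log-record type; the
+	 * recovery dispatcher selects the function by the record's type.
+	 */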
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_recover, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_recover, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_recover, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_recover, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_recover, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_recover, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_recover, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_recover, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_recover, DB___bam_rcuradj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_relink_recover, DB___bam_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_merge_recover, DB___bam_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_pgno_recover, DB___bam_pgno)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/btree/btree_autop.c b/btree/btree_autop.c
new file mode 100644
index 0000000..54cb501
--- /dev/null
+++ b/btree/btree_autop.c
@@ -0,0 +1,766 @@
+/* Do not edit: automatically built by gen_rec.awk. */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_am.h"
+#include "dbinc/btree.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
+
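+/*
+ * Print routines for the btree log records, registered by
+ * __bam_init_print and used by log-display utilities such as
+ * db_printlog.
+ */
+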
+/*
+ * PUBLIC: int __bam_split_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_split_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_split_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_split%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tleft: %lu\n", (u_long)argp->left);
+ (void)printf("\tllsn: [%lu][%lu]\n",
+ (u_long)argp->llsn.file, (u_long)argp->llsn.offset);
+ (void)printf("\tright: %lu\n", (u_long)argp->right);
+ (void)printf("\trlsn: [%lu][%lu]\n",
+ (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ (void)printf("\tnlsn: [%lu][%lu]\n",
+ (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+ (void)printf("\tppgno: %lu\n", (u_long)argp->ppgno);
+ (void)printf("\tplsn: [%lu][%lu]\n",
+ (u_long)argp->plsn.file, (u_long)argp->plsn.offset);
+ (void)printf("\tpindx: %lu\n", (u_long)argp->pindx);
+ (void)printf("\tpg: ");
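+	/* Printable bytes are shown as characters, the rest as hex. */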
+ for (i = 0; i < argp->pg.size; i++) {
+ ch = ((u_int8_t *)argp->pg.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpentry: ");
+ for (i = 0; i < argp->pentry.size; i++) {
+ ch = ((u_int8_t *)argp->pentry.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\trentry: ");
+ for (i = 0; i < argp->rentry.size; i++) {
+ ch = ((u_int8_t *)argp->rentry.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\topflags: %lu\n", (u_long)argp->opflags);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_split_42_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_split_42_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_split_42_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_split_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_split_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tleft: %lu\n", (u_long)argp->left);
+ (void)printf("\tllsn: [%lu][%lu]\n",
+ (u_long)argp->llsn.file, (u_long)argp->llsn.offset);
+ (void)printf("\tright: %lu\n", (u_long)argp->right);
+ (void)printf("\trlsn: [%lu][%lu]\n",
+ (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ (void)printf("\tnlsn: [%lu][%lu]\n",
+ (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+ (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno);
+ (void)printf("\tpg: ");
+ for (i = 0; i < argp->pg.size; i++) {
+ ch = ((u_int8_t *)argp->pg.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\topflags: %lu\n", (u_long)argp->opflags);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_rsplit_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rsplit_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_rsplit_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_rsplit_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_rsplit%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tpgdbt: ");
+ for (i = 0; i < argp->pgdbt.size; i++) {
+ ch = ((u_int8_t *)argp->pgdbt.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno);
+ (void)printf("\tnrec: %lu\n", (u_long)argp->nrec);
+ (void)printf("\trootent: ");
+ for (i = 0; i < argp->rootent.size; i++) {
+ ch = ((u_int8_t *)argp->rootent.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\trootlsn: [%lu][%lu]\n",
+ (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_adj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_adj_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_adj_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_adj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_adj%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy);
+ (void)printf("\tis_insert: %lu\n", (u_long)argp->is_insert);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_cadjust_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cadjust_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_cadjust_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_cadjust_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_cadjust%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tadjust: %ld\n", (long)argp->adjust);
+ (void)printf("\topflags: %lu\n", (u_long)argp->opflags);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_cdel_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_cdel_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_cdel_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_cdel_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_cdel%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_repl_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_repl_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_repl_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_repl_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_repl%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\tisdeleted: %lu\n", (u_long)argp->isdeleted);
+ (void)printf("\torig: ");
+ for (i = 0; i < argp->orig.size; i++) {
+ ch = ((u_int8_t *)argp->orig.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\trepl: ");
+ for (i = 0; i < argp->repl.size; i++) {
+ ch = ((u_int8_t *)argp->repl.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tprefix: %lu\n", (u_long)argp->prefix);
+ (void)printf("\tsuffix: %lu\n", (u_long)argp->suffix);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_root_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_root_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_root_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_root_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_root%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno);
+ (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno);
+ (void)printf("\tmeta_lsn: [%lu][%lu]\n",
+ (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_curadj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_curadj_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_curadj_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_curadj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_curadj%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmode: %ld\n", (long)argp->mode);
+ (void)printf("\tfrom_pgno: %lu\n", (u_long)argp->from_pgno);
+ (void)printf("\tto_pgno: %lu\n", (u_long)argp->to_pgno);
+ (void)printf("\tleft_pgno: %lu\n", (u_long)argp->left_pgno);
+ (void)printf("\tfirst_indx: %lu\n", (u_long)argp->first_indx);
+ (void)printf("\tfrom_indx: %lu\n", (u_long)argp->from_indx);
+ (void)printf("\tto_indx: %lu\n", (u_long)argp->to_indx);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_rcuradj_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_rcuradj_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_rcuradj_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_rcuradj_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_rcuradj%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tmode: %ld\n", (long)argp->mode);
+ (void)printf("\troot: %ld\n", (long)argp->root);
+ (void)printf("\trecno: %ld\n", (long)argp->recno);
+ (void)printf("\torder: %lu\n", (u_long)argp->order);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_relink_43_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_relink_43_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_relink_43_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_relink_43_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_relink_43%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tprev: %lu\n", (u_long)argp->prev);
+ (void)printf("\tlsn_prev: [%lu][%lu]\n",
+ (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlsn_next: [%lu][%lu]\n",
+ (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_relink_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_relink_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_relink_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_relink_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_relink%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tnew_pgno: %lu\n", (u_long)argp->new_pgno);
+ (void)printf("\tprev: %lu\n", (u_long)argp->prev);
+ (void)printf("\tlsn_prev: [%lu][%lu]\n",
+ (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset);
+ (void)printf("\tnext: %lu\n", (u_long)argp->next);
+ (void)printf("\tlsn_next: [%lu][%lu]\n",
+ (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_merge_44_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_merge_44_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_merge_44_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_merge_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_merge_44%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ (void)printf("\tnlsn: [%lu][%lu]\n",
+ (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+ (void)printf("\thdr: ");
+ for (i = 0; i < argp->hdr.size; i++) {
+ ch = ((u_int8_t *)argp->hdr.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tind: ");
+ for (i = 0; i < argp->ind.size; i++) {
+ ch = ((u_int8_t *)argp->ind.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_merge_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_merge_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_merge_args *argp;
+ u_int32_t i;
+ int ch;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_merge_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_merge%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ (void)printf("\tnlsn: [%lu][%lu]\n",
+ (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset);
+ (void)printf("\thdr: ");
+ for (i = 0; i < argp->hdr.size; i++) {
+ ch = ((u_int8_t *)argp->hdr.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tdata: ");
+ for (i = 0; i < argp->data.size; i++) {
+ ch = ((u_int8_t *)argp->data.data)[i];
+ printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch);
+ }
+ (void)printf("\n");
+ (void)printf("\tpg_copy: %lu\n", (u_long)argp->pg_copy);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_pgno_print __P((ENV *, DBT *, DB_LSN *,
+ * PUBLIC: db_recops, void *));
+ */
+int
+__bam_pgno_print(env, dbtp, lsnp, notused2, notused3)
+ ENV *env;
+ DBT *dbtp;
+ DB_LSN *lsnp;
+ db_recops notused2;
+ void *notused3;
+{
+ __bam_pgno_args *argp;
+ int ret;
+
+ notused2 = DB_TXN_PRINT;
+ notused3 = NULL;
+
+ if ((ret =
+ __bam_pgno_read(env, NULL, NULL, dbtp->data, &argp)) != 0)
+ return (ret);
+ (void)printf(
+ "[%lu][%lu]__bam_pgno%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n",
+ (u_long)lsnp->file, (u_long)lsnp->offset,
+ (argp->type & DB_debug_FLAG) ? "_debug" : "",
+ (u_long)argp->type,
+ (u_long)argp->txnp->txnid,
+ (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset);
+ (void)printf("\tfileid: %ld\n", (long)argp->fileid);
+ (void)printf("\tpgno: %lu\n", (u_long)argp->pgno);
+ (void)printf("\tlsn: [%lu][%lu]\n",
+ (u_long)argp->lsn.file, (u_long)argp->lsn.offset);
+ (void)printf("\tindx: %lu\n", (u_long)argp->indx);
+ (void)printf("\topgno: %lu\n", (u_long)argp->opgno);
+ (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno);
+ (void)printf("\n");
+ __os_free(env, argp);
+ return (0);
+}
+
+/*
+ * PUBLIC: int __bam_init_print __P((ENV *, DB_DISTAB *));
+ */
+int
+__bam_init_print(env, dtabp)
+ ENV *env;
+ DB_DISTAB *dtabp;
+{
+ int ret;
+
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_split_print, DB___bam_split)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rsplit_print, DB___bam_rsplit)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_adj_print, DB___bam_adj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cadjust_print, DB___bam_cadjust)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_cdel_print, DB___bam_cdel)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_repl_print, DB___bam_repl)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_root_print, DB___bam_root)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_curadj_print, DB___bam_curadj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_rcuradj_print, DB___bam_rcuradj)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_relink_print, DB___bam_relink)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_merge_print, DB___bam_merge)) != 0)
+ return (ret);
+ if ((ret = __db_add_recovery_int(env, dtabp,
+ __bam_pgno_print, DB___bam_pgno)) != 0)
+ return (ret);
+ return (0);
+}
diff --git a/btree/extern.h b/btree/extern.h
deleted file mode 100644
index ebd9c54..0000000
--- a/btree/extern.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*-
- * Copyright (c) 1991, 1993, 1994
- * The Regents of the University of California. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)extern.h 8.10 (Berkeley) 7/20/94
- */
-
-int __bt_close __P((DB *));
-int __bt_cmp __P((BTREE *, const DBT *, EPG *));
-int __bt_crsrdel __P((BTREE *, EPGNO *));
-int __bt_defcmp __P((const DBT *, const DBT *));
-size_t __bt_defpfx __P((const DBT *, const DBT *));
-int __bt_delete __P((const DB *, const DBT *, u_int));
-int __bt_dleaf __P((BTREE *, const DBT *, PAGE *, u_int));
-int __bt_fd __P((const DB *));
-int __bt_free __P((BTREE *, PAGE *));
-int __bt_get __P((const DB *, const DBT *, DBT *, u_int));
-PAGE *__bt_new __P((BTREE *, pgno_t *));
-void __bt_pgin __P((void *, pgno_t, void *));
-void __bt_pgout __P((void *, pgno_t, void *));
-int __bt_push __P((BTREE *, pgno_t, int));
-int __bt_put __P((const DB *dbp, DBT *, const DBT *, u_int));
-int __bt_ret __P((BTREE *, EPG *, DBT *, DBT *, DBT *, DBT *, int));
-EPG *__bt_search __P((BTREE *, const DBT *, int *));
-int __bt_seq __P((const DB *, DBT *, DBT *, u_int));
-void __bt_setcur __P((BTREE *, pgno_t, u_int));
-int __bt_split __P((BTREE *, PAGE *,
- const DBT *, const DBT *, int, size_t, u_int32_t));
-int __bt_sync __P((const DB *, u_int));
-
-int __ovfl_delete __P((BTREE *, void *));
-int __ovfl_get __P((BTREE *, void *, size_t *, void **, size_t *));
-int __ovfl_put __P((BTREE *, const DBT *, pgno_t *));
-
-#ifdef DEBUG
-void __bt_dnpage __P((DB *, pgno_t));
-void __bt_dpage __P((PAGE *));
-void __bt_dump __P((DB *));
-#endif
-#ifdef STATISTICS
-void __bt_stat __P((DB *));
-#endif
diff --git a/btree/tags b/btree/tags
deleted file mode 120000
index 7ab656b..0000000
--- a/btree/tags
+++ /dev/null
@@ -1 +0,0 @@
-../db/tags \ No newline at end of file