summaryrefslogtreecommitdiff
path: root/db/btree/bt_recno.c
diff options
context:
space:
mode:
Diffstat (limited to 'db/btree/bt_recno.c')
-rw-r--r--db/btree/bt_recno.c1369
1 files changed, 1369 insertions, 0 deletions
diff --git a/db/btree/bt_recno.c b/db/btree/bt_recno.c
new file mode 100644
index 000000000..6ac0cac35
--- /dev/null
+++ b/db/btree/bt_recno.c
@@ -0,0 +1,1369 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_recno.c,v 11.65 2001/01/18 14:33:22 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "db_ext.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
+#include "qam.h"
+#include "txn.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag. If we're
+ * re-numbering records, it means the record was implicitly created. We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested.. If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag. This is different from the B+tree code.
+ * It also maintains whether the cursor references a deleted record in the
+ * cursor, and it doesn't always check the on-page value.
+ */
+#define CD_SET(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) \
+ F_SET(cp, C_DELETED); \
+}
+#define CD_CLR(cp) { \
+ if (F_ISSET(cp, C_RENUMBER)) { \
+ F_CLR(cp, C_DELETED); \
+ cp->order = INVALID_ORDER; \
+ } \
+}
+#define CD_ISSET(cp) \
+ (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED))
+
+/*
+ * Macros for comparing the ordering of two cursors.
+ * cp1 comes before cp2 iff one of the following holds:
+ * cp1's recno is less than cp2's recno
+ * recnos are equal, both deleted, and cp1's order is less than cp2's
+ * recnos are equal, cp1 deleted, and cp2 not deleted
+ */
+#define C_LESSTHAN(cp1, cp2) \
+ (((cp1)->recno < (cp2)->recno) || \
+ (((cp1)->recno == (cp2)->recno) && \
+ ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \
+ (CD_ISSET((cp1)) && !CD_ISSET((cp2))))))
+
+/*
+ * cp1 is equal to cp2 iff their recnos and delete flags are identical,
+ * and if the delete flag is set their orders are also identical.
+ */
+#define C_EQUAL(cp1, cp2) \
+ ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \
+ (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order))
+
+/*
+ * Do we need to log the current cursor adjustment?
+ */
+#define CURADJ_LOG(dbc) \
+ (DB_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL)
+
+/*
+ * __ram_open --
+ * Recno open function.
+ *
+ * PUBLIC: int __ram_open __P((DB *, const char *, db_pgno_t, u_int32_t));
+ */
+int
+__ram_open(dbp, name, base_pgno, flags)
+ DB *dbp;
+ const char *name;
+ db_pgno_t base_pgno;
+ u_int32_t flags;
+{
+ BTREE *t;
+ DBC *dbc;
+ int ret, t_ret;
+
+ t = dbp->bt_internal;
+
+ /* Initialize the remaining fields/methods of the DB. */
+ dbp->del = __ram_delete;
+ dbp->put = __ram_put;
+ dbp->stat = __bam_stat;
+
+ /* Start up the tree. */
+ if ((ret = __bam_read_root(dbp, name, base_pgno, flags)) != 0)
+ return (ret);
+
+ /*
+ * If the user specified a source tree, open it and map it in.
+ *
+ * !!!
+ * We don't complain if the user specified transactions or threads.
+ * It's possible to make it work, but you'd better know what you're
+ * doing!
+ */
+ if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0)
+ return (ret);
+
+ /* If we're snapshotting an underlying source file, do it now. */
+ if (F_ISSET(dbp, DB_RE_SNAPSHOT)) {
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /* Do the snapshot. */
+ if ((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND)
+ ret = 0;
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ }
+
+ return (0);
+}
+
+/*
+ * __ram_delete --
+ * Recno db->del function.
+ */
+static int
+__ram_delete(dbp, txn, key, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key;
+ u_int32_t flags;
+{
+ BTREE_CURSOR *cp;
+ DBC *dbc;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+
+ /* Check for invalid flags. */
+ if ((ret = __db_delchk(dbp,
+ key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
+ return (ret);
+
+ /* Acquire a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags);
+
+ /* Check the user's record number and fill in as necessary. */
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ goto err;
+
+ /* Do the delete. */
+ cp = (BTREE_CURSOR *)dbc->internal;
+ cp->recno = recno;
+
+ ret = __ram_c_del(dbc);
+
+ /* Release the cursor. */
+err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
+
+/*
+ * __ram_put --
+ * Recno db->put function.
+ */
+static int
+__ram_put(dbp, txn, key, data, flags)
+ DB *dbp;
+ DB_TXN *txn;
+ DBT *key, *data;
+ u_int32_t flags;
+{
+ DBC *dbc;
+ db_recno_t recno;
+ int ret, t_ret;
+
+ PANIC_CHECK(dbp->dbenv);
+
+ /* Check for invalid flags. */
+ if ((ret = __db_putchk(dbp,
+ key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0)
+ return (ret);
+
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags);
+
+ /*
+ * If we're appending to the tree, make sure we've read in all of
+ * the backing source file. Otherwise, check the user's record
+ * number and fill in as necessary. If we found the record or it
+ * simply didn't exist, add the user's record.
+ */
+ if (flags == DB_APPEND)
+ ret = __ram_update(dbc, DB_MAX_RECORDS, 0);
+ else
+ ret = __ram_getno(dbc, key, &recno, 1);
+ if (ret == 0 || ret == DB_NOTFOUND)
+ ret = __ram_add(dbc, &recno, data, flags, 0);
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Return the record number if we're appending to the tree. */
+ if (ret == 0 && flags == DB_APPEND)
+ ret = __db_retcopy(dbp, key, &recno, sizeof(recno),
+ &dbc->rkey.data, &dbc->rkey.ulen);
+
+ return (ret);
+}
+
+/*
+ * __ram_c_del --
+ * Recno cursor->c_del function.
+ *
+ * PUBLIC: int __ram_c_del __P((DBC *));
+ */
+int
+__ram_c_del(dbc)
+ DBC *dbc;
+{
+ BKEYDATA bk;
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ DBT hdr, data;
+ EPG *epg;
+ int exact, ret, stack;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ stack = 0;
+
+ /*
+ * The semantics of cursors during delete are as follows: in
+ * non-renumbering recnos, records are replaced with a marker
+ * containing a delete flag. If the record referenced by this cursor
+ * has already been deleted, we will detect that as part of the delete
+ * operation, and fail.
+ *
+ * In renumbering recnos, cursors which represent deleted items
+ * are flagged with the C_DELETED flag, and it is an error to
+ * call c_del a second time without an intervening cursor motion.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+
+ /* Search the tree for the key; delete only deletes exact matches. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ stack = 1;
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+
+ /*
+ * If re-numbering records, the on-page deleted flag can only mean
+ * that this record was implicitly created. Applications aren't
+ * permitted to delete records they never created, return an error.
+ *
+ * If not re-numbering records, the on-page deleted flag means that
+ * this record was implicitly created, or, was deleted at some time.
+ * The former is an error because applications aren't permitted to
+ * delete records they never created, the latter is an error because
+ * if the record was "deleted", we could never have found it.
+ */
+ if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (F_ISSET(cp, C_RENUMBER)) {
+ /* Delete the item, adjust the counts, adjust the cursors. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+ __bam_adjust(dbc, -1);
+ if (__ram_ca(dbc, CA_DELETE) > 0 &&
+ CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_DELETE,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+
+ /*
+ * If the page is empty, delete it.
+ *
+ * We never delete a root page. First, root pages of primary
+ * databases never go away, recno or otherwise. However, if
+ * it's the root page of an off-page duplicates database, then
+ * it can be deleted. We don't delete it here because we have
+ * no way of telling the primary database page holder (e.g.,
+ * the hash access method) that its page element should cleaned
+ * up because the underlying tree is gone. So, we keep the page
+ * around until the last cursor referencing the empty tree is
+ * are closed, and then clean it up.
+ */
+ if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) {
+ /*
+ * We already have a locked stack of pages. However,
+ * there are likely entries in the stack that aren't
+ * going to be emptied by removing the single reference
+ * to the emptied page (or one of its parents).
+ */
+ for (epg = cp->sp; epg <= cp->csp; ++epg)
+ if (NUM_ENT(epg->page) <= 1)
+ break;
+
+ /*
+ * We want to delete a single item out of the last page
+ * that we're not deleting, back up to that page.
+ */
+ ret = __bam_dpages(dbc, --epg);
+
+ /*
+ * Regardless of the return from __bam_dpages, it will
+ * discard our stack and pinned page.
+ */
+ stack = 0;
+ cp->page = NULL;
+ }
+ } else {
+ /* Use a delete/put pair to replace the record with a marker. */
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ goto err;
+
+ B_TSET(bk.type, B_KEYDATA, 1);
+ bk.len = 0;
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.data = &bk;
+ hdr.size = SSZA(BKEYDATA, data);
+ memset(&data, 0, sizeof(data));
+ data.data = (void *)"";
+ data.size = 0;
+ if ((ret = __db_pitem(dbc,
+ cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0)
+ goto err;
+ }
+
+ t->re_modified = 1;
+
+err: if (stack)
+ __bam_stkrel(dbc, STK_CLRDBC);
+
+ return (ret);
+}
+
+/*
+ * __ram_c_get --
+ * Recno cursor->c_get function.
+ *
+ * PUBLIC: int __ram_c_get
+ * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ram_c_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ int cmp, exact, ret;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: switch (flags) {
+ case DB_CURRENT:
+ /*
+ * If we're using mutable records and the deleted flag is
+ * set, the cursor is pointing at a nonexistent record;
+ * return an error.
+ */
+ if (CD_ISSET(cp))
+ return (DB_KEYEMPTY);
+ break;
+ case DB_NEXT_DUP:
+ /*
+ * If we're not in an off-page dup set, we know there's no
+ * next duplicate since recnos don't have them. If we
+ * are in an off-page dup set, the next item assuredly is
+ * a dup, so we set flags to DB_NEXT and keep going.
+ */
+ if (!F_ISSET(dbc, DBC_OPD))
+ return (DB_NOTFOUND);
+ /* FALLTHROUGH */
+ case DB_NEXT_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_NEXT
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ flags = DB_NEXT;
+ /*
+ * If record numbers are mutable: if we just deleted a record,
+ * we have to avoid incrementing the record number so that we
+ * return the right record by virtue of renumbering the tree.
+ */
+ if (CD_ISSET(cp))
+ break;
+
+ if (cp->recno != RECNO_OOB) {
+ ++cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ flags = DB_NEXT;
+ cp->recno = 1;
+ break;
+ case DB_PREV_NODUP:
+ /*
+ * Recno databases don't have duplicates, set flags to DB_PREV
+ * and keep going.
+ */
+ /* FALLTHROUGH */
+ case DB_PREV:
+ flags = DB_PREV;
+ if (cp->recno != RECNO_OOB) {
+ if (cp->recno == 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ --cp->recno;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ flags = DB_PREV;
+ if (((ret = __ram_update(dbc,
+ DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+ if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0)
+ goto err;
+ if (cp->recno == 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ break;
+ case DB_GET_BOTHC:
+ /*
+ * If we're doing a join and these are offpage dups,
+ * we want to keep searching forward from after the
+ * current cursor position. Increment the recno by 1,
+ * then proceed as for a DB_SET.
+ *
+ * Otherwise, we know there are no additional matching
+ * data, as recnos don't have dups. return DB_NOTFOUND.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno++;
+ break;
+ }
+ ret = DB_NOTFOUND;
+ goto err;
+ /* NOTREACHED */
+ case DB_GET_BOTH:
+ /*
+ * If we're searching a set of off-page dups, we start
+ * a new linear search from the first record. Otherwise,
+ * we compare the single data item associated with the
+ * requested record for a match.
+ */
+ if (F_ISSET(dbc, DBC_OPD)) {
+ cp->recno = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_SET:
+ case DB_SET_RANGE:
+ if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0)
+ goto err;
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->dbenv, "__ram_c_get", flags);
+ goto err;
+ }
+
+ /*
+ * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already
+ * called __ram_update() to make sure sufficient records have been
+ * read from the backing source file. Do it now for DB_CURRENT (if
+ * the current record was deleted we may need more records from the
+ * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT.
+ */
+ if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret =
+ __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND)
+ goto err;
+
+ for (;; ++cp->recno) {
+ /* Search the tree for the record. */
+ if ((ret = __bam_rsearch(dbc, &cp->recno,
+ F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND,
+ 1, &exact)) != 0)
+ goto err;
+ if (!exact) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+
+ /*
+ * Copy the page into the cursor, discarding any lock we
+ * are currently holding.
+ */
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+ (void)__TLPUT(dbc, cp->lock);
+ cp->lock = cp->csp->lock;
+ cp->lock_mode = cp->csp->lock_mode;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this
+ * record was implicitly created. If not re-numbering records,
+ * the on-page deleted flag means this record was implicitly
+ * created, or, it was deleted at some time. Regardless, we
+ * skip such records if doing cursor next/prev operations or
+ * walking through off-page duplicates, and fail if they were
+ * requested explicitly by the application.
+ */
+ if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type))
+ switch (flags) {
+ case DB_NEXT:
+ case DB_PREV:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ goto retry;
+ case DB_GET_BOTH:
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ continue;
+ default:
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC) {
+ if ((ret = __bam_cmp(dbp, data,
+ cp->page, cp->indx, __bam_defcmp, &cmp)) != 0)
+ return (ret);
+ if (cmp == 0)
+ break;
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ } else
+ break;
+ }
+
+ /* Return the key if the user didn't give us one. */
+ if (!F_ISSET(dbc, DBC_OPD)) {
+ if (flags != DB_SET && flags != DB_SET_RANGE)
+ ret = __db_retcopy(dbp,
+ key, &cp->recno, sizeof(cp->recno),
+ &dbc->rkey.data, &dbc->rkey.ulen);
+ F_SET(key, DB_DBT_ISSET);
+ }
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ram_c_put --
+ * Recno cursor->c_put function.
+ *
+ * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+ */
+int
+__ram_c_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key, *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LSN lsn;
+ int exact, nc, ret, t_ret;
+ u_int32_t iiflags;
+ void *arg;
+
+ COMPQUIET(pgnop, NULL);
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ /*
+ * DB_KEYFIRST and DB_KEYLAST will only be set if we're dealing with
+ * an off-page duplicate tree, they can't be specified at user level.
+ * Translate them into something else.
+ */
+ switch (flags) {
+ case DB_KEYFIRST:
+ cp->recno = 1;
+ flags = DB_BEFORE;
+ break;
+ case DB_KEYLAST:
+ if ((ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0)) != 0)
+ return (ret);
+ if (CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_ICURRENT,
+ cp->root, cp->recno, cp->order)))
+ return (ret);
+ return (0);
+ }
+
+ /*
+ * If we're putting with a cursor that's marked C_DELETED, we need to
+ * take special care; the cursor doesn't "really" reference the item
+ * corresponding to its current recno, but instead is "between" that
+ * record and the current one. Translate the actual insert into
+ * DB_BEFORE, and let the __ram_ca work out the gory details of what
+ * should wind up pointing where.
+ */
+ if (CD_ISSET(cp))
+ iiflags = DB_BEFORE;
+ else
+ iiflags = flags;
+
+split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0)
+ goto err;
+ /*
+ * An inexact match is okay; it just means we're one record past the
+ * end, which is reasonable if we're marked deleted.
+ */
+ DB_ASSERT(exact || CD_ISSET(cp));
+
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+
+ ret = __bam_iitem(dbc, key, data, iiflags, 0);
+ t_ret = __bam_stkrel(dbc, STK_CLRDBC);
+
+ if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT))
+ ret = t_ret;
+ else if (ret == DB_NEEDSPLIT) {
+ arg = &cp->recno;
+ if ((ret = __bam_split(dbc, arg)) != 0)
+ goto err;
+ goto split;
+ }
+ if (ret != 0)
+ goto err;
+
+ switch (flags) { /* Adjust the cursors. */
+ case DB_AFTER:
+ nc = __ram_ca(dbc, CA_IAFTER);
+
+ /*
+ * We only need to adjust this cursor forward if we truly added
+ * the item after the current recno, rather than remapping it
+ * to DB_BEFORE.
+ */
+ if (iiflags == DB_AFTER)
+ ++cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_IAFTER,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_BEFORE:
+ nc = __ram_ca(dbc, CA_IBEFORE);
+ --cp->recno;
+
+ /* Only log if __ram_ca found any relevant cursors. */
+ if (nc > 0 && CURADJ_LOG(dbc) &&
+ (ret = __bam_rcuradj_log(dbp->dbenv,
+ dbc->txn, &lsn, 0, dbp->log_fileid, CA_IBEFORE,
+ cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ case DB_CURRENT:
+ /*
+ * We only need to do an adjustment if we actually
+ * added an item, which we only would have done if the
+ * cursor was marked deleted.
+ *
+ * Only log if __ram_ca found any relevant cursors.
+ */
+ if (CD_ISSET(cp) && __ram_ca(dbc, CA_ICURRENT) > 0 &&
+ CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(
+ dbp->dbenv, dbc->txn, &lsn, 0, dbp->log_fileid,
+ CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0)
+ goto err;
+ break;
+ }
+
+ /* Return the key if we've created a new record. */
+ if (!F_ISSET(dbc, DBC_OPD) && (flags == DB_AFTER || flags == DB_BEFORE))
+ ret = __db_retcopy(dbp, key, &cp->recno,
+ sizeof(cp->recno), &dbc->rkey.data, &dbc->rkey.ulen);
+
+ /* The cursor was reset, no further delete adjustment is necessary. */
+err: CD_CLR(cp);
+
+ return (ret);
+}
+
+/*
+ * __ram_ca --
+ * Adjust cursors. Returns the number of relevant cursors.
+ *
+ * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg));
+ */
+int
+__ram_ca(dbc_arg, op)
+ DBC *dbc_arg;
+ ca_recno_arg op;
+{
+ BTREE_CURSOR *cp, *cp_arg;
+ DB *dbp, *ldbp;
+ DB_ENV *dbenv;
+ DBC *dbc;
+ db_recno_t recno;
+ int adjusted, found;
+ u_int32_t order;
+
+ dbp = dbc_arg->dbp;
+ dbenv = dbp->dbenv;
+ cp_arg = (BTREE_CURSOR *)dbc_arg->internal;
+ recno = cp_arg->recno;
+
+ found = 0;
+
+ /*
+ * It only makes sense to adjust cursors if we're a renumbering
+ * recno; we should only be called if this is one.
+ */
+ DB_ASSERT(F_ISSET(cp_arg, C_RENUMBER));
+
+ MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp);
+ /*
+ * Adjust the cursors. See the comment in __bam_ca_delete().
+ */
+ /*
+ * If we're doing a delete, we need to find the highest
+ * order of any cursor currently pointing at this item,
+ * so we can assign a higher order to the newly deleted
+ * cursor. Unfortunately, this requires a second pass through
+ * the cursor list.
+ */
+ if (op == CA_DELETE) {
+ order = 1;
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp_arg->root == cp->root &&
+ recno == cp->recno && CD_ISSET(cp) &&
+ order <= cp->order)
+ order = cp->order + 1;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+ }
+ } else
+ order = INVALID_ORDER;
+
+ /* Now go through and do the actual adjustments. */
+ for (ldbp = __dblist_get(dbenv, dbp->adj_fileid);
+ ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+ ldbp = LIST_NEXT(ldbp, dblistlinks)) {
+ MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+ for (dbc = TAILQ_FIRST(&ldbp->active_queue);
+ dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
+ cp = (BTREE_CURSOR *)dbc->internal;
+ if (cp_arg->root != cp->root)
+ continue;
+ ++found;
+ adjusted = 0;
+ switch (op) {
+ case CA_DELETE:
+ if (recno < cp->recno) {
+ --cp->recno;
+ /*
+ * If the adjustment made them equal,
+ * we have to merge the orders.
+ */
+ if (recno == cp->recno && CD_ISSET(cp))
+ cp->order += order;
+ } else if (recno == cp->recno &&
+ !CD_ISSET(cp)) {
+ CD_SET(cp);
+ cp->order = order;
+ }
+ break;
+ case CA_IBEFORE:
+ /*
+ * IBEFORE is just like IAFTER, except that we
+ * adjust cursors on the current record too.
+ */
+ if (C_EQUAL(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ goto iafter;
+ case CA_ICURRENT:
+
+ /*
+ * If the original cursor wasn't deleted, we
+ * just did a replacement and so there's no
+ * need to adjust anything--we shouldn't have
+ * gotten this far. Otherwise, we behave
+ * much like an IAFTER, except that all
+ * cursors pointing to the current item get
+ * marked undeleted and point to the new
+ * item.
+ */
+ DB_ASSERT(CD_ISSET(cp_arg));
+ if (C_EQUAL(cp_arg, cp)) {
+ CD_CLR(cp);
+ break;
+ }
+ /* FALLTHROUGH */
+ case CA_IAFTER:
+iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) {
+ ++cp->recno;
+ adjusted = 1;
+ }
+ if (recno == cp->recno && adjusted)
+ /*
+ * If we've moved this cursor's recno,
+ * split its order number--i.e.,
+ * decrement it by enough so that
+ * the lowest cursor moved has order 1.
+ * cp_arg->order is the split point,
+ * so decrement by one less than that.
+ */
+ cp->order -= (cp_arg->order - 1);
+ break;
+ }
+ }
+ MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp);
+
+ return (found);
+}
+
+/*
+ * __ram_getno --
+ * Check the user's record number, and make sure we've seen it.
+ *
+ * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int));
+ */
+int
+__ram_getno(dbc, key, rep, can_create)
+ DBC *dbc;
+ const DBT *key;
+ db_recno_t *rep;
+ int can_create;
+{
+ DB *dbp;
+ db_recno_t recno;
+
+ dbp = dbc->dbp;
+
+ /* Check the user's record number. */
+ if ((recno = *(db_recno_t *)key->data) == 0) {
+ __db_err(dbp->dbenv, "illegal record number of 0");
+ return (EINVAL);
+ }
+ if (rep != NULL)
+ *rep = recno;
+
+ /*
+ * Btree can neither create records nor read them in. Recno can
+ * do both, see if we can find the record.
+ */
+ return (dbc->dbtype == DB_RECNO ?
+ __ram_update(dbc, recno, can_create) : 0);
+}
+
+/*
+ * __ram_update --
+ * Ensure the tree has records up to and including the specified one.
+ */
+static int
+__ram_update(dbc, recno, can_create)
+ DBC *dbc;
+ db_recno_t recno;
+ int can_create;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ db_recno_t nrecs;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+
+ /*
+ * If we can't create records and we've read the entire backing input
+ * file, we're done.
+ */
+ if (!can_create && t->re_eof)
+ return (0);
+
+ /*
+ * If we haven't seen this record yet, try to get it from the original
+ * file.
+ */
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ if (!t->re_eof && recno > nrecs) {
+ if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __bam_nrecs(dbc, &nrecs)) != 0)
+ return (ret);
+ }
+
+ /*
+ * If we can create records, create empty ones up to the requested
+ * record.
+ */
+ if (!can_create || recno <= nrecs + 1)
+ return (0);
+
+ dbc->rdata.dlen = 0;
+ dbc->rdata.doff = 0;
+ dbc->rdata.flags = 0;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ if (dbc->rdata.ulen < t->re_len) {
+ if ((ret = __os_realloc(dbp->dbenv,
+ t->re_len, NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ }
+ dbc->rdata.ulen = t->re_len;
+ }
+ dbc->rdata.size = t->re_len;
+ memset(dbc->rdata.data, t->re_pad, t->re_len);
+ } else
+ dbc->rdata.size = 0;
+
+ while (recno > ++nrecs)
+ if ((ret = __ram_add(dbc,
+ &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0)
+ return (ret);
+ return (0);
+}
+
+/*
+ * __ram_source --
+ * Load information about the backing file.
+ */
+static int
+__ram_source(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ char *source;
+ int ret;
+
+ t = dbp->bt_internal;
+
+ /* Find the real name, and swap out the one we had before. */
+ if ((ret = __db_appname(dbp->dbenv,
+ DB_APP_DATA, NULL, t->re_source, 0, NULL, &source)) != 0)
+ return (ret);
+ __os_freestr(t->re_source);
+ t->re_source = source;
+
+ /*
+ * !!!
+ * It's possible that the backing source file is read-only. We don't
+ * much care other than we'll complain if there are any modifications
+ * when it comes time to write the database back to the source.
+ */
+ if ((t->re_fp = fopen(t->re_source, "r")) == NULL) {
+ ret = errno;
+ __db_err(dbp->dbenv, "%s: %s", t->re_source, db_strerror(ret));
+ return (ret);
+ }
+
+ t->re_eof = 0;
+ return (0);
+}
+
+/*
+ * __ram_writeback --
+ * Rewrite the backing file.
+ *
+ * PUBLIC: int __ram_writeback __P((DB *));
+ */
+int
+__ram_writeback(dbp)
+ DB *dbp;
+{
+ BTREE *t;
+ DB_ENV *dbenv;
+ DBC *dbc;
+ DBT key, data;
+ FILE *fp;
+ db_recno_t keyno;
+ int ret, t_ret;
+ u_int8_t delim, *pad;
+
+ t = dbp->bt_internal;
+ dbenv = dbp->dbenv;
+ fp = NULL;
+
+ /* If the file wasn't modified, we're done. */
+ if (!t->re_modified)
+ return (0);
+
+ /* If there's no backing source file, we're done. */
+ if (t->re_source == NULL) {
+ t->re_modified = 0;
+ return (0);
+ }
+
+ /* Allocate a cursor. */
+ if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
+ return (ret);
+
+ /*
+ * Read any remaining records into the tree.
+ *
+ * !!!
+ * This is why we can't support transactions when applications specify
+ * backing (re_source) files. At this point we have to read in the
+ * rest of the records from the file so that we can write all of the
+ * records back out again, which could modify a page for which we'd
+ * have to log changes and which we don't have locked. This could be
+ * partially fixed by taking a snapshot of the entire file during the
+ * DB->open as DB->open is transaction protected. But, if a checkpoint
+ * occurs then, the part of the log holding the copy of the file could
+ * be discarded, and that would make it impossible to recover in the
+ * face of disaster. This could all probably be fixed, but it would
+ * require transaction protecting the backing source file.
+ *
+ * XXX
+ * This could be made to work now that we have transactions protecting
+ * file operations. Margo has specifically asked for the privilege of
+ * doing this work.
+ */
+ if ((ret =
+ __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND)
+ return (ret);
+
+ /*
+ * Close any existing file handle and re-open the file, truncating it.
+ */
+ if (t->re_fp != NULL) {
+ if (fclose(t->re_fp) != 0) {
+ ret = errno;
+ goto err;
+ }
+ t->re_fp = NULL;
+ }
+ if ((fp = fopen(t->re_source, "w")) == NULL) {
+ ret = errno;
+ __db_err(dbenv, "%s: %s", t->re_source, db_strerror(ret));
+ goto err;
+ }
+
+ /*
+ * We step through the records, writing each one out. Use the record
+ * number and the dbp->get() function, instead of a cursor, so we find
+ * and write out "deleted" or non-existent records.
+ */
+ memset(&key, 0, sizeof(key));
+ memset(&data, 0, sizeof(data));
+ key.size = sizeof(db_recno_t);
+ key.data = &keyno;
+
+ /*
+ * We'll need the delimiter if we're doing variable-length records,
+ * and the pad character if we're doing fixed-length records.
+ */
+ delim = t->re_delim;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+ if ((ret = __os_malloc(dbenv, t->re_len, NULL, &pad)) != 0)
+ goto err;
+ memset(pad, t->re_pad, t->re_len);
+ } else
+ COMPQUIET(pad, NULL);
+ for (keyno = 1;; ++keyno) {
+ switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) {
+ case 0:
+ if (fwrite(data.data, 1, data.size, fp) != data.size)
+ goto write_err;
+ break;
+ case DB_KEYEMPTY:
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN) &&
+ fwrite(pad, 1, t->re_len, fp) != t->re_len)
+ goto write_err;
+ break;
+ case DB_NOTFOUND:
+ ret = 0;
+ goto done;
+ }
+ if (!F_ISSET(dbp, DB_RE_FIXEDLEN) &&
+ fwrite(&delim, 1, 1, fp) != 1) {
+write_err: ret = errno;
+ __db_err(dbp->dbenv,
+ "%s: write failed to backing file: %s",
+ t->re_source, strerror(ret));
+ goto err;
+ }
+ }
+
+err:
+done: /* Close the file descriptor. */
+ if (fp != NULL && fclose(fp) != 0) {
+ if (ret == 0)
+ ret = errno;
+ __db_err(dbenv, "%s: %s", t->re_source, db_strerror(errno));
+ }
+
+ /* Discard the cursor. */
+ if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ if (ret == 0)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_sread --
+ * Read records from a source file.
+ */
+static int
+__ram_sread(dbc, top)
+ DBC *dbc;
+ db_recno_t top;
+{
+ BTREE *t;
+ DB *dbp;
+ DBT data;
+ db_recno_t recno;
+ size_t len;
+ int ch, ret, was_modified;
+
+ t = dbc->dbp->bt_internal;
+ dbp = dbc->dbp;
+ was_modified = t->re_modified;
+
+ if ((ret = __bam_nrecs(dbc, &recno)) != 0)
+ return (ret);
+
+ /* Use the record data return memory, it's only a short-term use. */
+ len = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : 256;
+ if (dbc->rdata.ulen < len) {
+ if ((ret = __os_realloc(
+ dbp->dbenv, len, NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ }
+ dbc->rdata.ulen = len;
+ }
+
+ memset(&data, 0, sizeof(data));
+ while (recno < top) {
+ data.data = dbc->rdata.data;
+ data.size = 0;
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN))
+ for (len = t->re_len; len > 0; --len) {
+ if ((ch = getc(t->re_fp)) == EOF)
+ goto eof;
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ }
+ else
+ for (;;) {
+ if ((ch = getc(t->re_fp)) == EOF)
+ goto eof;
+ if (ch == t->re_delim)
+ break;
+
+ ((u_int8_t *)data.data)[data.size++] = ch;
+ if (data.size == dbc->rdata.ulen) {
+ if ((ret = __os_realloc(dbp->dbenv,
+ dbc->rdata.ulen *= 2,
+ NULL, &dbc->rdata.data)) != 0) {
+ dbc->rdata.ulen = 0;
+ dbc->rdata.data = NULL;
+ return (ret);
+ } else
+ data.data = dbc->rdata.data;
+ }
+ }
+
+ /*
+ * Another process may have read this record from the input
+ * file and stored it into the database already, in which
+ * case we don't need to repeat that operation. We detect
+ * this by checking if the last record we've read is greater
+ * or equal to the number of records in the database.
+ */
+ if (t->re_last >= recno) {
+ ++recno;
+ if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0)
+ goto err;
+ }
+ ++t->re_last;
+ }
+
+ if (0) {
+eof: t->re_eof = 1;
+ ret = DB_NOTFOUND;
+ }
+err: if (!was_modified)
+ t->re_modified = 0;
+
+ return (ret);
+}
+
+/*
+ * __ram_add --
+ * Add records into the tree.
+ */
+static int
+__ram_add(dbc, recnop, data, flags, bi_flags)
+ DBC *dbc;
+ db_recno_t *recnop;
+ DBT *data;
+ u_int32_t flags, bi_flags;
+{
+ BKEYDATA *bk;
+ BTREE_CURSOR *cp;
+ int exact, ret, stack;
+
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+retry: /* Find the slot for insertion. */
+ if ((ret = __bam_rsearch(dbc, recnop,
+ S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0)
+ return (ret);
+ stack = 1;
+ cp->page = cp->csp->page;
+ cp->pgno = cp->csp->page->pgno;
+ cp->indx = cp->csp->indx;
+
+ /*
+ * The application may modify the data based on the selected record
+ * number.
+ */
+ if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL &&
+ (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0)
+ goto err;
+
+ /*
+ * If re-numbering records, the on-page deleted flag means this record
+ * was implicitly created. If not re-numbering records, the on-page
+ * deleted flag means this record was implicitly created, or, it was
+ * deleted at some time.
+ *
+ * If DB_NOOVERWRITE is set and the item already exists in the tree,
+ * return an error unless the item was either marked for deletion or
+ * only implicitly created.
+ */
+ if (exact) {
+ bk = GET_BKEYDATA(cp->page, cp->indx);
+ if (!B_DISSET(bk->type) && flags == DB_NOOVERWRITE) {
+ ret = DB_KEYEXIST;
+ goto err;
+ }
+ }
+
+ /*
+ * Select the arguments for __bam_iitem() and do the insert. If the
+ * key is an exact match, or we're replacing the data item with a
+ * new data item, replace the current item. If the key isn't an exact
+ * match, we're inserting a new key/data pair, before the search
+ * location.
+ */
+ switch (ret = __bam_iitem(dbc,
+ NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) {
+ case 0:
+ /*
+ * Don't adjust anything.
+ *
+ * If we inserted a record, no cursors need adjusting because
+ * the only new record it's possible to insert is at the very
+ * end of the tree. The necessary adjustments to the internal
+ * page counts were made by __bam_iitem().
+ *
+ * If we overwrote a record, no cursors need adjusting because
+ * future DBcursor->get calls will simply return the underlying
+ * record (there's no adjustment made for the DB_CURRENT flag
+ * when a cursor get operation immediately follows a cursor
+ * delete operation, and the normal adjustment for the DB_NEXT
+ * flag is still correct).
+ */
+ break;
+ case DB_NEEDSPLIT:
+ /* Discard the stack of pages and split the page. */
+ (void)__bam_stkrel(dbc, STK_CLRDBC);
+ stack = 0;
+
+ if ((ret = __bam_split(dbc, recnop)) != 0)
+ goto err;
+
+ goto retry;
+ /* NOTREACHED */
+ default:
+ goto err;
+ }
+
+err: if (stack)
+ __bam_stkrel(dbc, STK_CLRDBC);
+
+ return (ret);
+}