summaryrefslogtreecommitdiff
path: root/db/btree/bt_rsearch.c
diff options
context:
space:
mode:
Diffstat (limited to 'db/btree/bt_rsearch.c')
-rw-r--r--db/btree/bt_rsearch.c429
1 files changed, 429 insertions, 0 deletions
diff --git a/db/btree/bt_rsearch.c b/db/btree/bt_rsearch.c
new file mode 100644
index 000000000..7102cd715
--- /dev/null
+++ b/db/btree/bt_rsearch.c
@@ -0,0 +1,429 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_rsearch.c,v 11.21 2000/03/28 21:50:04 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "db_shash.h"
+#include "lock.h"
+
+/*
+ * __bam_rsearch --
+ * Search a btree for a record number.
+ *
+ * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *));
+ */
+int
+__bam_rsearch(dbc, recnop, flags, stop, exactp)
+ DBC *dbc;
+ db_recno_t *recnop;
+ u_int32_t flags;
+ int stop, *exactp;
+{
+ BINTERNAL *bi;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ RINTERNAL *ri;
+ db_indx_t adjust, deloffset, indx, top;
+ db_lockmode_t lock_mode;
+ db_pgno_t pg;
+ db_recno_t recno, t_recno, total;
+ int ret, stack;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+
+ BT_STK_CLR(cp);
+
+ /*
+ * There are several ways we search a btree tree. The flags argument
+ * specifies if we're acquiring read or write locks and if we are
+ * locking pairs of pages. In addition, if we're adding or deleting
+ * an item, we have to lock the entire tree, regardless. See btree.h
+ * for more details.
+ *
+ * If write-locking pages, we need to know whether or not to acquire a
+ * write lock on a page before getting it. This depends on how deep it
+ * is in tree, which we don't know until we acquire the root page. So,
+ * if we need to lock the root page we may have to upgrade it later,
+ * because we won't get the correct lock initially.
+ *
+ * Retrieve the root page.
+ */
+ pg = cp->root;
+ stack = LF_ISSET(S_STACK);
+ lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+
+ /*
+ * Decide if we need to save this page; if we do, write lock it.
+ * We deliberately don't lock-couple on this call. If the tree
+ * is tiny, i.e., one page, and two threads are busily updating
+ * the root page, we're almost guaranteed deadlocks galore, as
+ * each one gets a read lock and then blocks the other's attempt
+ * for a write lock.
+ */
+ if (!stack &&
+ ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
+ (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__LPUT(dbc, lock);
+ lock_mode = DB_LOCK_WRITE;
+ if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) {
+ /* Did not read it, so we can release the lock */
+ (void)__LPUT(dbc, lock);
+ return (ret);
+ }
+ stack = 1;
+ }
+
+ /*
+ * If appending to the tree, set the record number now -- we have the
+ * root page locked.
+ *
+ * Delete only deletes exact matches, read only returns exact matches.
+ * Note, this is different from __bam_search(), which returns non-exact
+ * matches for read.
+ *
+ * The record may not exist. We can only return the correct location
+ * for the record immediately after the last record in the tree, so do
+ * a fast check now.
+ */
+ total = RE_NREC(h);
+ if (LF_ISSET(S_APPEND)) {
+ *exactp = 0;
+ *recnop = recno = total + 1;
+ } else {
+ recno = *recnop;
+ if (recno <= total)
+ *exactp = 1;
+ else {
+ *exactp = 0;
+ if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) {
+ /*
+ * Keep the page locked for serializability.
+ *
+ * XXX
+ * This leaves the root page locked, which will
+ * eliminate any concurrency. A possible fix
+ * would be to lock the last leaf page instead.
+ */
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__TLPUT(dbc, lock);
+ return (DB_NOTFOUND);
+ }
+ }
+ }
+
+ /*
+ * !!!
+ * Record numbers in the tree are 0-based, but the recno is
+ * 1-based. All of the calculations below have to take this
+ * into account.
+ */
+ for (total = 0;;) {
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ case P_LDUP:
+ recno -= total;
+ /*
+ * There may be logically deleted records on the page.
+ * If there are enough, the record may not exist.
+ */
+ if (TYPE(h) == P_LBTREE) {
+ adjust = P_INDX;
+ deloffset = O_INDX;
+ } else {
+ adjust = O_INDX;
+ deloffset = 0;
+ }
+ for (t_recno = 0, indx = 0;; indx += adjust) {
+ if (indx >= NUM_ENT(h)) {
+ *exactp = 0;
+ if (!LF_ISSET(S_PAST_EOF) ||
+ recno > t_recno + 1) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
+ }
+ if (!B_DISSET(
+ GET_BKEYDATA(h, indx + deloffset)->type) &&
+ ++t_recno == recno)
+ break;
+ }
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ case P_IBTREE:
+ for (indx = 0, top = NUM_ENT(h);;) {
+ bi = GET_BINTERNAL(h, indx);
+ if (++indx == top || total + bi->nrecs >= recno)
+ break;
+ total += bi->nrecs;
+ }
+ pg = bi->pgno;
+ break;
+ case P_LRECNO:
+ recno -= total;
+
+ /* Correct from 1-based to 0-based for a page offset. */
+ --recno;
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, recno, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ case P_IRECNO:
+ for (indx = 0, top = NUM_ENT(h);;) {
+ ri = GET_RINTERNAL(h, indx);
+ if (++indx == top || total + ri->nrecs >= recno)
+ break;
+ total += ri->nrecs;
+ }
+ pg = ri->pgno;
+ break;
+ default:
+ return (__db_pgfmt(dbp, h->pgno));
+ }
+ --indx;
+
+ if (stack) {
+ /* Return if this is the lowest page wanted. */
+ if (LF_ISSET(S_PARENT) && stop == h->level) {
+ BT_STK_ENTER(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+ return (0);
+ }
+ BT_STK_PUSH(dbp->dbenv,
+ cp, h, indx, lock, lock_mode, ret);
+ if (ret != 0)
+ goto err;
+
+ lock_mode = DB_LOCK_WRITE;
+ if ((ret =
+ __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+ goto err;
+ } else {
+ /*
+ * Decide if we want to return a pointer to the next
+ * page in the stack. If we do, write lock it and
+ * never unlock it.
+ */
+ if ((LF_ISSET(S_PARENT) &&
+ (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
+ (h->level - 1) == LEAFLEVEL)
+ stack = 1;
+
+ (void)memp_fput(dbp->mpf, h, 0);
+
+ lock_mode = stack &&
+ LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+ if ((ret = __db_lget(dbc,
+ LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) {
+ /*
+ * If we fail, discard the lock we held. This
+ * is OK because this only happens when we are
+ * descending the tree holding read-locks.
+ */
+ __LPUT(dbc, lock);
+ goto err;
+ }
+ }
+
+ if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0)
+ goto err;
+ }
+ /* NOTREACHED */
+
+err: BT_STK_POP(cp);
+ __bam_stkrel(dbc, 0);
+ return (ret);
+}
+
+/*
+ * __bam_adjust --
+ * Adjust the tree after adding or deleting a record.
+ *
+ * PUBLIC: int __bam_adjust __P((DBC *, int32_t));
+ */
+int
+__bam_adjust(dbc, adjust)
+ DBC *dbc;
+ int32_t adjust;
+{
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ EPG *epg;
+ PAGE *h;
+ db_pgno_t root_pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ root_pgno = cp->root;
+
+ /* Update the record counts for the tree. */
+ for (epg = cp->sp; epg <= cp->csp; ++epg) {
+ h = epg->page;
+ if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) {
+ if (DB_LOGGING(dbc) &&
+ (ret = __bam_cadjust_log(dbp->dbenv,
+ dbc->txn, &LSN(h), 0, dbp->log_fileid,
+ PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust,
+ PGNO(h) == root_pgno ? CAD_UPDATEROOT : 0)) != 0)
+ return (ret);
+
+ if (TYPE(h) == P_IBTREE)
+ GET_BINTERNAL(h, epg->indx)->nrecs += adjust;
+ else
+ GET_RINTERNAL(h, epg->indx)->nrecs += adjust;
+
+ if (PGNO(h) == root_pgno)
+ RE_NREC_ADJ(h, adjust);
+
+ if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
+ return (ret);
+ }
+ }
+ return (0);
+}
+
+/*
+ * __bam_nrecs --
+ * Return the number of records in the tree.
+ *
+ * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *));
+ */
+int
+__bam_nrecs(dbc, rep)
+ DBC *dbc;
+ db_recno_t *rep;
+{
+ DB *dbp;
+ DB_LOCK lock;
+ PAGE *h;
+ db_pgno_t pgno;
+ int ret;
+
+ dbp = dbc->dbp;
+
+ pgno = dbc->internal->root;
+ if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0)
+ return (ret);
+ if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0)
+ return (ret);
+
+ *rep = RE_NREC(h);
+
+ (void)memp_fput(dbp->mpf, h, 0);
+ (void)__TLPUT(dbc, lock);
+
+ return (0);
+}
+
+/*
+ * __bam_total --
+ * Return the number of records below a page.
+ *
+ * PUBLIC: db_recno_t __bam_total __P((PAGE *));
+ */
+db_recno_t
+__bam_total(h)
+ PAGE *h;
+{
+ db_recno_t nrecs;
+ db_indx_t indx, top;
+
+ nrecs = 0;
+ top = NUM_ENT(h);
+
+ switch (TYPE(h)) {
+ case P_LBTREE:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += P_INDX)
+ if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type))
+ ++nrecs;
+ break;
+ case P_LDUP:
+ /* Check for logically deleted records. */
+ for (indx = 0; indx < top; indx += O_INDX)
+ if (!B_DISSET(GET_BKEYDATA(h, indx)->type))
+ ++nrecs;
+ break;
+ case P_IBTREE:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_BINTERNAL(h, indx)->nrecs;
+ break;
+ case P_LRECNO:
+ nrecs = NUM_ENT(h);
+ break;
+ case P_IRECNO:
+ for (indx = 0; indx < top; indx += O_INDX)
+ nrecs += GET_RINTERNAL(h, indx)->nrecs;
+ break;
+ }
+
+ return (nrecs);
+}