diff options
Diffstat (limited to 'db/btree/bt_rsearch.c')
-rw-r--r-- | db/btree/bt_rsearch.c | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/db/btree/bt_rsearch.c b/db/btree/bt_rsearch.c new file mode 100644 index 000000000..7102cd715 --- /dev/null +++ b/db/btree/bt_rsearch.c @@ -0,0 +1,429 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_rsearch.c,v 11.21 2000/03/28 21:50:04 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "db_shash.h" +#include "lock.h" + +/* + * __bam_rsearch -- + * Search a btree for a record number. + * + * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); + */ +int +__bam_rsearch(dbc, recnop, flags, stop, exactp) + DBC *dbc; + db_recno_t *recnop; + u_int32_t flags; + int stop, *exactp; +{ + BINTERNAL *bi; + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock; + PAGE *h; + RINTERNAL *ri; + db_indx_t adjust, deloffset, indx, top; + db_lockmode_t lock_mode; + db_pgno_t pg; + db_recno_t recno, t_recno, total; + int ret, stack; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + BT_STK_CLR(cp); + + /* + * There are several ways we search a btree tree. The flags argument + * specifies if we're acquiring read or write locks and if we are + * locking pairs of pages. In addition, if we're adding or deleting + * an item, we have to lock the entire tree, regardless. See btree.h + * for more details. + * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + pg = cp->root; + stack = LF_ISSET(S_STACK); + lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + + /* + * Decide if we need to save this page; if we do, write lock it. + * We deliberately don't lock-couple on this call. If the tree + * is tiny, i.e., one page, and two threads are busily updating + * the root page, we're almost guaranteed deadlocks galore, as + * each one gets a read lock and then blocks the other's attempt + * for a write lock. + */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__LPUT(dbc, lock); + lock_mode = DB_LOCK_WRITE; + if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + stack = 1; + } + + /* + * If appending to the tree, set the record number now -- we have the + * root page locked. + * + * Delete only deletes exact matches, read only returns exact matches. + * Note, this is different from __bam_search(), which returns non-exact + * matches for read. + * + * The record may not exist. We can only return the correct location + * for the record immediately after the last record in the tree, so do + * a fast check now. + */ + total = RE_NREC(h); + if (LF_ISSET(S_APPEND)) { + *exactp = 0; + *recnop = recno = total + 1; + } else { + recno = *recnop; + if (recno <= total) + *exactp = 1; + else { + *exactp = 0; + if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) { + /* + * Keep the page locked for serializability. + * + * XXX + * This leaves the root page locked, which will + * eliminate any concurrency. A possible fix + * would be to lock the last leaf page instead. + */ + (void)memp_fput(dbp->mpf, h, 0); + (void)__TLPUT(dbc, lock); + return (DB_NOTFOUND); + } + } + } + + /* + * !!! + * Record numbers in the tree are 0-based, but the recno is + * 1-based. All of the calculations below have to take this + * into account. + */ + for (total = 0;;) { + switch (TYPE(h)) { + case P_LBTREE: + case P_LDUP: + recno -= total; + /* + * There may be logically deleted records on the page. + * If there are enough, the record may not exist. + */ + if (TYPE(h) == P_LBTREE) { + adjust = P_INDX; + deloffset = O_INDX; + } else { + adjust = O_INDX; + deloffset = 0; + } + for (t_recno = 0, indx = 0;; indx += adjust) { + if (indx >= NUM_ENT(h)) { + *exactp = 0; + if (!LF_ISSET(S_PAST_EOF) || + recno > t_recno + 1) { + ret = DB_NOTFOUND; + goto err; + } + } + if (!B_DISSET( + GET_BKEYDATA(h, indx + deloffset)->type) && + ++t_recno == recno) + break; + } + + /* Correct from 1-based to 0-based for a page offset. */ + BT_STK_ENTER(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + case P_IBTREE: + for (indx = 0, top = NUM_ENT(h);;) { + bi = GET_BINTERNAL(h, indx); + if (++indx == top || total + bi->nrecs >= recno) + break; + total += bi->nrecs; + } + pg = bi->pgno; + break; + case P_LRECNO: + recno -= total; + + /* Correct from 1-based to 0-based for a page offset. */ + --recno; + BT_STK_ENTER(dbp->dbenv, + cp, h, recno, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + case P_IRECNO: + for (indx = 0, top = NUM_ENT(h);;) { + ri = GET_RINTERNAL(h, indx); + if (++indx == top || total + ri->nrecs >= recno) + break; + total += ri->nrecs; + } + pg = ri->pgno; + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + --indx; + + if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + } + BT_STK_PUSH(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + + lock_mode = DB_LOCK_WRITE; + if ((ret = + __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + goto err; + } else { + /* + * Decide if we want to return a pointer to the next + * page in the stack. If we do, write lock it and + * never unlock it. + */ + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) + stack = 1; + + (void)memp_fput(dbp->mpf, h, 0); + + lock_mode = stack && + LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __db_lget(dbc, + LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + /* + * If we fail, discard the lock we held. This + * is OK because this only happens when we are + * descending the tree holding read-locks. + */ + __LPUT(dbc, lock); + goto err; + } + } + + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) + goto err; + } + /* NOTREACHED */ + +err: BT_STK_POP(cp); + __bam_stkrel(dbc, 0); + return (ret); +} + +/* + * __bam_adjust -- + * Adjust the tree after adding or deleting a record. + * + * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); + */ +int +__bam_adjust(dbc, adjust) + DBC *dbc; + int32_t adjust; +{ + BTREE_CURSOR *cp; + DB *dbp; + EPG *epg; + PAGE *h; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; + + /* Update the record counts for the tree. */ + for (epg = cp->sp; epg <= cp->csp; ++epg) { + h = epg->page; + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { + if (DB_LOGGING(dbc) && + (ret = __bam_cadjust_log(dbp->dbenv, + dbc->txn, &LSN(h), 0, dbp->log_fileid, + PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust, + PGNO(h) == root_pgno ? CAD_UPDATEROOT : 0)) != 0) + return (ret); + + if (TYPE(h) == P_IBTREE) + GET_BINTERNAL(h, epg->indx)->nrecs += adjust; + else + GET_RINTERNAL(h, epg->indx)->nrecs += adjust; + + if (PGNO(h) == root_pgno) + RE_NREC_ADJ(h, adjust); + + if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + } + } + return (0); +} + +/* + * __bam_nrecs -- + * Return the number of records in the tree. + * + * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); + */ +int +__bam_nrecs(dbc, rep) + DBC *dbc; + db_recno_t *rep; +{ + DB *dbp; + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + int ret; + + dbp = dbc->dbp; + + pgno = dbc->internal->root; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + return (ret); + + *rep = RE_NREC(h); + + (void)memp_fput(dbp->mpf, h, 0); + (void)__TLPUT(dbc, lock); + + return (0); +} + +/* + * __bam_total -- + * Return the number of records below a page. + * + * PUBLIC: db_recno_t __bam_total __P((PAGE *)); + */ +db_recno_t +__bam_total(h) + PAGE *h; +{ + db_recno_t nrecs; + db_indx_t indx, top; + + nrecs = 0; + top = NUM_ENT(h); + + switch (TYPE(h)) { + case P_LBTREE: + /* Check for logically deleted records. */ + for (indx = 0; indx < top; indx += P_INDX) + if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) + ++nrecs; + break; + case P_LDUP: + /* Check for logically deleted records. */ + for (indx = 0; indx < top; indx += O_INDX) + if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) + ++nrecs; + break; + case P_IBTREE: + for (indx = 0; indx < top; indx += O_INDX) + nrecs += GET_BINTERNAL(h, indx)->nrecs; + break; + case P_LRECNO: + nrecs = NUM_ENT(h); + break; + case P_IRECNO: + for (indx = 0; indx < top; indx += O_INDX) + nrecs += GET_RINTERNAL(h, indx)->nrecs; + break; + } + + return (nrecs); +} |