Diffstat (limited to 'db/btree')
-rw-r--r-- | db/btree/bt_compare.c | 211
-rw-r--r-- | db/btree/bt_conv.c | 98
-rw-r--r-- | db/btree/bt_curadj.c | 573
-rw-r--r-- | db/btree/bt_cursor.c | 2131
-rw-r--r-- | db/btree/bt_delete.c | 530
-rw-r--r-- | db/btree/bt_method.c | 387
-rw-r--r-- | db/btree/bt_open.c | 468
-rw-r--r-- | db/btree/bt_put.c | 859
-rw-r--r-- | db/btree/bt_rec.c | 1219
-rw-r--r-- | db/btree/bt_reclaim.c | 53
-rw-r--r-- | db/btree/bt_recno.c | 1369
-rw-r--r-- | db/btree/bt_rsearch.c | 429
-rw-r--r-- | db/btree/bt_search.c | 471
-rw-r--r-- | db/btree/bt_split.c | 1126
-rw-r--r-- | db/btree/bt_stat.c | 480
-rw-r--r-- | db/btree/bt_upgrade.c | 164
-rw-r--r-- | db/btree/bt_verify.c | 2237
-rw-r--r-- | db/btree/btree.src | 296
-rw-r--r-- | db/btree/btree_auto.c | 2284
19 files changed, 15385 insertions, 0 deletions
diff --git a/db/btree/bt_compare.c b/db/btree/bt_compare.c new file mode 100644 index 000000000..91481c313 --- /dev/null +++ b/db/btree/bt_compare.c @@ -0,0 +1,211 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_compare.c,v 11.12 2000/10/26 19:00:28 krinsky Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" + +/* + * __bam_cmp -- + * Compare a key to a given record. + * + * PUBLIC: int __bam_cmp __P((DB *, const DBT *, PAGE *, + * PUBLIC: u_int32_t, int (*)(DB *, const DBT *, const DBT *), int *)); + */ +int +__bam_cmp(dbp, dbt, h, indx, func, cmpp) + DB *dbp; + const DBT *dbt; + PAGE *h; + u_int32_t indx; + int (*func)__P((DB *, const DBT *, const DBT *)); + int *cmpp; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + DBT pg_dbt; + + /* + * Returns: + * < 0 if dbt is < page record + * = 0 if dbt is = page record + * > 0 if dbt is > page record + * + * !!! + * We do not clear the pg_dbt DBT even though it's likely to contain + * random bits. That should be okay, because the app's comparison + * routine had better not be looking at fields other than data/size. + * We don't clear it because we go through this path a lot and it's + * expensive. 
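+ *
+ * For example, a user-supplied comparison callback must confine itself
+ * to the data and size fields, along the lines of this hypothetical
+ * sketch (my_compare is illustrative, not part of this file):
+ *
+ *	int my_compare(DB *dbp, const DBT *a, const DBT *b) {
+ *		size_t len = a->size < b->size ? a->size : b->size;
+ *		int cmp = memcmp(a->data, b->data, len);
+ *		return (cmp != 0 ? cmp : (long)a->size - (long)b->size);
+ *	}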
+ */ + switch (TYPE(h)) { + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + bk = GET_BKEYDATA(h, indx); + if (B_TYPE(bk->type) == B_OVERFLOW) + bo = (BOVERFLOW *)bk; + else { + pg_dbt.data = bk->data; + pg_dbt.size = bk->len; + *cmpp = func(dbp, dbt, &pg_dbt); + return (0); + } + break; + case P_IBTREE: + /* + * The following code guarantees that the left-most key on an + * internal page at any place in the tree sorts less than any + * user-specified key. The reason is that if we have reached + * this internal page, we know the user key must sort greater + * than the key we're storing for this page in any internal + * pages at levels above us in the tree. It then follows that + * any user-specified key cannot sort less than the first page + * which we reference, and so there's no reason to call the + * comparison routine. While this may save us a comparison + * routine call or two, the real reason for this is because + * we don't maintain a copy of the smallest key in the tree, + * so that we don't have to update all the levels of the tree + * should the application store a new smallest key. And, so, + * we may not have a key to compare, which makes doing the + * comparison difficult and error prone. + */ + if (indx == 0) { + *cmpp = 1; + return (0); + } + + bi = GET_BINTERNAL(h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW) + bo = (BOVERFLOW *)(bi->data); + else { + pg_dbt.data = bi->data; + pg_dbt.size = bi->len; + *cmpp = func(dbp, dbt, &pg_dbt); + return (0); + } + break; + default: + return (__db_pgfmt(dbp, PGNO(h))); + } + + /* + * Overflow. + */ + return (__db_moff(dbp, dbt, + bo->pgno, bo->tlen, func == __bam_defcmp ? NULL : func, cmpp)); +} + +/* + * __bam_defcmp -- + * Default comparison routine. + * + * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *)); + */ +int +__bam_defcmp(dbp, a, b) + DB *dbp; + const DBT *a, *b; +{ + size_t len; + u_int8_t *p1, *p2; + + COMPQUIET(dbp, NULL); + + /* + * Returns: + * < 0 if a is < b + * = 0 if a is = b + * > 0 if a is > b + * + * XXX + * If a size_t doesn't fit into a long, or if the difference between + * any two characters doesn't fit into an int, this routine can lose. + * What we need is a signed integral type that's guaranteed to be at + * least as large as a size_t, and there is no such thing. + */ + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) + if (*p1 != *p2) + return ((long)*p1 - (long)*p2); + return ((long)a->size - (long)b->size); +} + +/* + * __bam_defpfx -- + * Default prefix routine. + * + * PUBLIC: size_t __bam_defpfx __P((DB *, const DBT *, const DBT *)); + */ +size_t +__bam_defpfx(dbp, a, b) + DB *dbp; + const DBT *a, *b; +{ + size_t cnt, len; + u_int8_t *p1, *p2; + + COMPQUIET(dbp, NULL); + + cnt = 1; + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) + if (*p1 != *p2) + return (cnt); + + /* + * We know that a->size must be <= b->size, or they wouldn't be + * in this order. + */ + return (a->size < b->size ? a->size + 1 : a->size); +} diff --git a/db/btree/bt_conv.c b/db/btree/bt_conv.c new file mode 100644 index 000000000..fd30f375f --- /dev/null +++ b/db/btree/bt_conv.c @@ -0,0 +1,98 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_conv.c,v 11.6 2000/03/31 00:30:26 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "btree.h" + +/* + * __bam_pgin -- + * Convert host-specific page layout from the host-independent format + * stored on disk. + * + * PUBLIC: int __bam_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *)); + */ +int +__bam_pgin(dbenv, pg, pp, cookie) + DB_ENV *dbenv; + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + PAGE *h; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + + h = pp; + return (TYPE(h) == P_BTREEMETA ? __bam_mswap(pp) : + __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 1)); +} + +/* + * __bam_pgout -- + * Convert host-specific page layout to the host-independent format + * stored on disk. + * + * PUBLIC: int __bam_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *)); + */ +int +__bam_pgout(dbenv, pg, pp, cookie) + DB_ENV *dbenv; + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + PAGE *h; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + + h = pp; + return (TYPE(h) == P_BTREEMETA ? __bam_mswap(pp) : + __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 0)); +} + +/* + * __bam_mswap -- + * Swap the bytes on the btree metadata page. + * + * PUBLIC: int __bam_mswap __P((PAGE *)); + */ +int +__bam_mswap(pg) + PAGE *pg; +{ + u_int8_t *p; + + __db_metaswap(pg); + + p = (u_int8_t *)pg + sizeof(DBMETA); + + SWAP32(p); /* maxkey */ + SWAP32(p); /* minkey */ + SWAP32(p); /* re_len */ + SWAP32(p); /* re_pad */ + SWAP32(p); /* root */ + + return (0); +} diff --git a/db/btree/bt_curadj.c b/db/btree/bt_curadj.c new file mode 100644 index 000000000..011acd2f4 --- /dev/null +++ b/db/btree/bt_curadj.c @@ -0,0 +1,573 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_curadj.c,v 11.20 2001/01/17 16:15:49 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "txn.h" + +static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t)); + +#ifdef DEBUG +/* + * __bam_cprint -- + * Display the current internal cursor. + * + * PUBLIC: void __bam_cprint __P((DBC *)); + */ +void +__bam_cprint(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + + cp = (BTREE_CURSOR *)dbc->internal; + + fprintf(stderr, "\tinternal: ovflsize: %lu", (u_long)cp->ovflsize); + if (dbc->dbtype == DB_RECNO) + fprintf(stderr, " recno: %lu", (u_long)cp->recno); + if (F_ISSET(cp, C_DELETED)) + fprintf(stderr, " (deleted)"); + fprintf(stderr, "\n"); +} +#endif + +/* + * Cursor adjustments are logged if they are for subtransactions. This is + * because it's possible for a subtransaction to adjust cursors which will + * still be active after the subtransaction aborts, and so which must be + * restored to their previous locations. 
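+ * For example, if a child transaction splits a page on which one of
+ * its parent's cursors is positioned, the adjustment is logged so that
+ * aborting the child can move the parent's cursor back where it was.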
Cursors that can be both affected + * by our cursor adjustments and active after our transaction aborts can + * only be found in our parent transaction -- cursors in other transactions, + * including other child transactions of our parent, must have conflicting + * locker IDs, and so cannot be affected by adjustments in this transaction. + */ + +/* + * __bam_ca_delete -- + * Update the cursors when items are deleted and when already deleted + * items are overwritten. Return the number of relevant cursors found. + * + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int)); + */ +int +__bam_ca_delete(dbp, pgno, indx, delete) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int delete; +{ + BTREE_CURSOR *cp; + DB *ldbp; + DB_ENV *dbenv; + DBC *dbc; + int count; /* !!!: Has to contain max number of cursors. */ + + dbenv = dbp->dbenv; + + /* + * Adjust the cursors. We have the page write locked, so the + * only other cursors that can be pointing at a page are + * those in the same thread of control. Unfortunately, we don't + * know that they're using the same DB handle, so traverse + * all matching DB handles in the same DB_ENV, then all cursors + * on each matching DB handle. + * + * Each cursor is single-threaded, so we only need to lock the + * list of DBs and then the list of cursors in each DB. + */ + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (count = 0, ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (BTREE_CURSOR *)dbc->internal; + if (cp->pgno == pgno && cp->indx == indx) { + if (delete) + F_SET(cp, C_DELETED); + else + F_CLR(cp, C_DELETED); + ++count; + } + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + return (count); +} + +/* + * __ram_ca_delete -- + * Return the number of relevant cursors. + * + * PUBLIC: int __ram_ca_delete __P((DB *, db_pgno_t)); + */ +int +__ram_ca_delete(dbp, root_pgno) + DB *dbp; + db_pgno_t root_pgno; +{ + DB *ldbp; + DBC *dbc; + DB_ENV *dbenv; + int found; + + found = 0; + dbenv = dbp->dbenv; + + /* + * Review the cursors. See the comment in __bam_ca_delete(). + */ + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + found == 0 && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + found == 0 && dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) + if (dbc->internal->root == root_pgno) + found = 1; + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + return (found); +} + +/* + * __bam_ca_di -- + * Adjust the cursors during a delete or insert. + * + * PUBLIC: int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int)); + */ +int +__bam_ca_di(my_dbc, pgno, indx, adjust) + DBC *my_dbc; + db_pgno_t pgno; + u_int32_t indx; + int adjust; +{ + DB *dbp, *ldbp; + DB_ENV *dbenv; + DB_LSN lsn; + DB_TXN *my_txn; + DBC *dbc; + DBC_INTERNAL *cp; + int found, ret; + + dbp = my_dbc->dbp; + dbenv = dbp->dbenv; + + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). 
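+ * The adjustment itself is simple arithmetic: any cursor at or past
+ * the affected index slides by "adjust", so inserting one item at
+ * indx 2 moves a cursor sitting at indx 5 to indx 6.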
+ */ + found = 0; + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + if (dbc->dbtype == DB_RECNO) + continue; + cp = dbc->internal; + if (cp->pgno == pgno && cp->indx >= indx) { + /* Cursor indices should never be negative. */ + DB_ASSERT(cp->indx != 0 || adjust > 0); + + cp->indx += adjust; + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + } + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (found != 0 && DB_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbenv, + my_dbc->txn, &lsn, 0, dbp->log_fileid, + DB_CA_DI, pgno, 0, 0, adjust, indx, 0)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_opd_cursor -- create a new opd cursor. + */ +static int +__bam_opd_cursor(dbp, dbc, first, tpgno, ti) + DB *dbp; + DBC *dbc; + db_pgno_t tpgno; + u_int32_t first, ti; +{ + BTREE_CURSOR *cp, *orig_cp; + DBC *dbc_nopd; + int ret; + + orig_cp = (BTREE_CURSOR *)dbc->internal; + dbc_nopd = NULL; + + /* + * Allocate a new cursor and create the stack. If duplicates + * are sorted, we've just created an off-page duplicate Btree. + * If duplicates aren't sorted, we've just created a Recno tree. + */ + if ((ret = __db_c_newopd(dbc, tpgno, &dbc_nopd)) != 0) + return (ret); + + cp = (BTREE_CURSOR *)dbc_nopd->internal; + cp->pgno = tpgno; + cp->indx = ti; + + if (dbp->dup_compare == NULL) { + /* + * Converting to off-page Recno trees is tricky. The + * record number for the cursor is the index + 1 (to + * convert to 1-based record numbers). + */ + cp->recno = ti + 1; + } + + /* + * Transfer the deleted flag from the top-level cursor to the + * created one. + */ + if (F_ISSET(orig_cp, C_DELETED)) { + F_SET(cp, C_DELETED); + F_CLR(orig_cp, C_DELETED); + } + + /* Stack the cursors and reset the initial cursor's index. */ + orig_cp->opd = dbc_nopd; + orig_cp->indx = first; + return (0); +} + +/* + * __bam_ca_dup -- + * Adjust the cursors when moving items from a leaf page to a duplicates + * page. + * + * PUBLIC: int __bam_ca_dup __P((DBC *, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +int +__bam_ca_dup(my_dbc, first, fpgno, fi, tpgno, ti) + DBC *my_dbc; + db_pgno_t fpgno, tpgno; + u_int32_t first, fi, ti; +{ + BTREE_CURSOR *orig_cp; + DB *dbp, *ldbp; + DBC *dbc; + DB_ENV *dbenv; + DB_LSN lsn; + DB_TXN *my_txn; + int found, ret; + + dbp = my_dbc->dbp; + dbenv = dbp->dbenv; + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + found = 0; + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { +loop: MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + /* Find cursors pointing to this record. */ + orig_cp = (BTREE_CURSOR *)dbc->internal; + if (orig_cp->pgno != fpgno || orig_cp->indx != fi) + continue; + + /* + * Since we rescan the list see if this is already + * converted. 
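+ * (Creating the opd cursor below releases the mutex and jumps back
+ * to loop, so a later pass can encounter cursors converted earlier.)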
+ */ + if (orig_cp->opd != NULL) + continue; + + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + if ((ret = __bam_opd_cursor(dbp, + dbc, first, tpgno, ti)) !=0) + return (ret); + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + /* We released the MUTEX to get a cursor, start over. */ + goto loop; + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (found != 0 && DB_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbenv, + my_dbc->txn, &lsn, 0, dbp->log_fileid, + DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0) + return (ret); + } + return (0); +} + +/* + * __bam_ca_undodup -- + * Adjust the cursors when returning items to a leaf page + * from a duplicate page. + * Called only during undo processing. + * + * PUBLIC: int __bam_ca_undodup __P((DB *, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, u_int32_t)); + */ +int +__bam_ca_undodup(dbp, first, fpgno, fi, ti) + DB *dbp; + db_pgno_t fpgno; + u_int32_t first, fi, ti; +{ + BTREE_CURSOR *orig_cp; + DB *ldbp; + DBC *dbc; + DB_ENV *dbenv; + int ret; + + dbenv = dbp->dbenv; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { +loop: MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + orig_cp = (BTREE_CURSOR *)dbc->internal; + + if (orig_cp->pgno != fpgno || + orig_cp->indx != first || + ((BTREE_CURSOR *)orig_cp->opd->internal)->indx + != ti) + continue; + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + if ((ret = orig_cp->opd->c_close(orig_cp->opd)) != 0) + return (ret); + orig_cp->opd = NULL; + orig_cp->indx = fi; + /* + * We released the MUTEX to free a cursor, + * start over. + */ + goto loop; + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + return (0); +} + +/* + * __bam_ca_rsplit -- + * Adjust the cursors when doing reverse splits. + * + * PUBLIC: int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t)); + */ +int +__bam_ca_rsplit(my_dbc, fpgno, tpgno) + DBC* my_dbc; + db_pgno_t fpgno, tpgno; +{ + DB *dbp, *ldbp; + DBC *dbc; + DB_ENV *dbenv; + DB_LSN lsn; + DB_TXN *my_txn; + int found, ret; + + dbp = my_dbc->dbp; + dbenv = dbp->dbenv; + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + found = 0; + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + if (dbc->dbtype == DB_RECNO) + continue; + if (dbc->internal->pgno == fpgno) { + dbc->internal->pgno = tpgno; + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + } + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (found != 0 && DB_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbenv, + my_dbc->txn, &lsn, 0, dbp->log_fileid, + DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0) + return (ret); + } + return (0); +} + +/* + * __bam_ca_split -- + * Adjust the cursors when splitting a page. 
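+ * For example, with split_indx 5 a cursor at indx 7 on the original
+ * page moves to the new right-hand page at indx 2, while a cursor at
+ * indx 3 keeps its index (moving to the left page only when cleft is
+ * set).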
+ * + * PUBLIC: int __bam_ca_split __P((DBC *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); + */ +int +__bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft) + DBC *my_dbc; + db_pgno_t ppgno, lpgno, rpgno; + u_int32_t split_indx; + int cleft; +{ + DB *dbp, *ldbp; + DBC *dbc; + DBC_INTERNAL *cp; + DB_ENV *dbenv; + DB_LSN lsn; + DB_TXN *my_txn; + int found, ret; + + dbp = my_dbc->dbp; + dbenv = dbp->dbenv; + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If splitting the page that a cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. Most + * of the time we don't adjust pointers to the left page, because + * we're going to copy its contents back over the original page. If + * the cursor is on the right page, it is decremented by the number of + * records split to the left page. + */ + found = 0; + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + if (dbc->dbtype == DB_RECNO) + continue; + cp = dbc->internal; + if (cp->pgno == ppgno) { + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + if (cp->indx < split_indx) { + if (cleft) + cp->pgno = lpgno; + } else { + cp->pgno = rpgno; + cp->indx -= split_indx; + } + } + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (found != 0 && DB_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbenv, my_dbc->txn, + &lsn, 0, dbp->log_fileid, DB_CA_SPLIT, ppgno, rpgno, + cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_ca_undosplit -- + * Adjust the cursors when undoing a split of a page. + * If we grew a level we will execute this for both the + * left and the right pages. + * Called only during undo processing. + * + * PUBLIC: void __bam_ca_undosplit __P((DB *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t)); + */ +void +__bam_ca_undosplit(dbp, frompgno, topgno, lpgno, split_indx) + DB *dbp; + db_pgno_t frompgno, topgno, lpgno; + u_int32_t split_indx; +{ + DB *ldbp; + DBC *dbc; + DB_ENV *dbenv; + DBC_INTERNAL *cp; + + dbenv = dbp->dbenv; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * When backing out a split, we move the cursor back + * to the original offset and bump it by the split_indx. + */ + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + if (dbc->dbtype == DB_RECNO) + continue; + cp = dbc->internal; + if (cp->pgno == topgno) { + cp->pgno = frompgno; + cp->indx += split_indx; + } else if (cp->pgno == lpgno) + cp->pgno = frompgno; + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); +} diff --git a/db/btree/bt_cursor.c b/db/btree/bt_cursor.c new file mode 100644 index 000000000..84ab7c807 --- /dev/null +++ b/db/btree/bt_cursor.c @@ -0,0 +1,2131 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_cursor.c,v 11.88 2001/01/11 18:19:49 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "btree.h" +#include "lock.h" +#include "qam.h" +#include "common_ext.h" + +static int __bam_c_close __P((DBC *, db_pgno_t, int *)); +static int __bam_c_del __P((DBC *)); +static int __bam_c_destroy __P((DBC *)); +static int __bam_c_first __P((DBC *)); +static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __bam_c_getstack __P((DBC *)); +static int __bam_c_last __P((DBC *)); +static int __bam_c_next __P((DBC *, int)); +static int __bam_c_physdel __P((DBC *)); +static int __bam_c_prev __P((DBC *)); +static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static void __bam_c_reset __P((BTREE_CURSOR *)); +static int __bam_c_search __P((DBC *, const DBT *, u_int32_t, int *)); +static int __bam_c_writelock __P((DBC *)); +static int __bam_getboth_finddatum __P((DBC *, DBT *)); +static int __bam_getbothc __P((DBC *, DBT *)); +static int __bam_isopd __P((DBC *, db_pgno_t *)); + +/* + * Acquire a new page/lock. If we hold a page/lock, discard the page, and + * lock-couple the lock. + * + * !!! + * We have to handle both where we have a lock to lock-couple and where we + * don't -- we don't duplicate locks when we duplicate cursors if we are + * running in a transaction environment as there's no point if locks are + * never discarded. This means that the cursor may or may not hold a lock. + */ +#undef ACQUIRE +#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, ret) {\ + if ((pagep) != NULL) { \ + ret = memp_fput((dbc)->dbp->mpf, pagep, 0); \ + pagep = NULL; \ + } else \ + ret = 0; \ + if ((ret) == 0 && STD_LOCKING(dbc)) \ + ret = __db_lget(dbc, \ + (lock).off == LOCK_INVALID ? 0 : LCK_COUPLE, \ + lpgno, mode, 0, &lock); \ + else \ + (lock).off = LOCK_INVALID; \ + if ((ret) == 0) \ + ret = memp_fget((dbc)->dbp->mpf, &(fpgno), 0, &(pagep));\ +} + +/* Acquire a new page/lock for a cursor. */ +#undef ACQUIRE_CUR +#define ACQUIRE_CUR(dbc, mode, ret) { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + ACQUIRE(dbc, mode, \ + __cp->pgno, __cp->lock, __cp->pgno, __cp->page, ret); \ + if ((ret) == 0) \ + __cp->lock_mode = (mode); \ +} + +/* + * Acquire a new page/lock for a cursor, and move the cursor on success. + * The reason that this is a separate macro is because we don't want to + * set the pgno/indx fields in the cursor until we actually have the lock, + * otherwise the cursor adjust routines will adjust the cursor even though + * we're not really on the page. + */ +#undef ACQUIRE_CUR_SET +#define ACQUIRE_CUR_SET(dbc, mode, p, ret) { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, ret); \ + if ((ret) == 0) { \ + __cp->pgno = p; \ + __cp->indx = 0; \ + __cp->lock_mode = (mode); \ + } \ +} + +/* + * Acquire a write lock if we don't already have one. + * + * !!! + * See ACQUIRE macro on why we handle cursors that don't have locks. 
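+ * As in ACQUIRE, an existing read lock is traded for the write lock
+ * with LCK_COUPLE, so the page is never left unlocked in between.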
+ */ +#undef ACQUIRE_WRITE_LOCK +#define ACQUIRE_WRITE_LOCK(dbc, ret) { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + ret = 0; \ + if (STD_LOCKING(dbc) && \ + __cp->lock_mode != DB_LOCK_WRITE && \ + ((ret) = __db_lget(dbc, \ + __cp->lock.off == LOCK_INVALID ? 0 : LCK_COUPLE, \ + __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) == 0) \ + __cp->lock_mode = DB_LOCK_WRITE; \ +} + +/* Discard the current page/lock. */ +#undef DISCARD +#define DISCARD(dbc, ldiscard, lock, pagep, ret) { \ + int __t_ret; \ + if ((pagep) != NULL) { \ + ret = memp_fput((dbc)->dbp->mpf, pagep, 0); \ + pagep = NULL; \ + } else \ + ret = 0; \ + if ((lock).off != LOCK_INVALID) { \ + __t_ret = ldiscard ? \ + __LPUT((dbc), lock): __TLPUT((dbc), lock); \ + if (__t_ret != 0 && (ret) == 0) \ + ret = __t_ret; \ + (lock).off = LOCK_INVALID; \ + } \ +} + +/* Discard the current page/lock for a cursor. */ +#undef DISCARD_CUR +#define DISCARD_CUR(dbc, ret) { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + DISCARD(dbc, 0, __cp->lock, __cp->page, ret); \ + if ((ret) == 0) \ + __cp->lock_mode = DB_LOCK_NG; \ +} + +/* If on-page item is a deleted record. */ +#undef IS_DELETED +#define IS_DELETED(page, indx) \ + B_DISSET(GET_BKEYDATA(page, \ + (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type) +#undef IS_CUR_DELETED +#define IS_CUR_DELETED(dbc) \ + IS_DELETED((dbc)->internal->page, (dbc)->internal->indx) + +/* + * Test to see if two cursors could point to duplicates of the same key. + * In the case of off-page duplicates they are they same, as the cursors + * will be in the same off-page duplicate tree. In the case of on-page + * duplicates, the key index offsets must be the same. For the last test, + * as the original cursor may not have a valid page pointer, we use the + * current cursor's. + */ +#undef IS_DUPLICATE +#define IS_DUPLICATE(dbc, i1, i2) \ + (((PAGE *)(dbc)->internal->page)->inp[i1] == \ + ((PAGE *)(dbc)->internal->page)->inp[i2]) +#undef IS_CUR_DUPLICATE +#define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \ + (F_ISSET(dbc, DBC_OPD) || \ + (orig_pgno == (dbc)->internal->pgno && \ + IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx))) + +/* + * __bam_c_reset -- + * Initialize internal cursor structure. + */ +static void +__bam_c_reset(cp) + BTREE_CURSOR *cp; +{ + cp->csp = cp->sp; + cp->lock.off = LOCK_INVALID; + cp->lock_mode = DB_LOCK_NG; + cp->recno = RECNO_OOB; + cp->order = INVALID_ORDER; + cp->flags = 0; +} + +/* + * __bam_c_init -- + * Initialize the access private portion of a cursor + * + * PUBLIC: int __bam_c_init __P((DBC *, DBTYPE)); + */ +int +__bam_c_init(dbc, dbtype) + DBC *dbc; + DBTYPE dbtype; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + int ret; + u_int32_t minkey; + + dbp = dbc->dbp; + + /* Allocate/initialize the internal structure. */ + if (dbc->internal == NULL) { + if ((ret = __os_malloc(dbp->dbenv, + sizeof(BTREE_CURSOR), NULL, &cp)) != 0) + return (ret); + dbc->internal = (DBC_INTERNAL *)cp; + + cp->sp = cp->csp = cp->stack; + cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); + } else + cp = (BTREE_CURSOR *)dbc->internal; + __bam_c_reset(cp); + + /* Initialize methods. 
*/ + dbc->c_close = __db_c_close; + dbc->c_count = __db_c_count; + dbc->c_del = __db_c_del; + dbc->c_dup = __db_c_dup; + dbc->c_get = __db_c_get; + dbc->c_put = __db_c_put; + if (dbtype == DB_BTREE) { + dbc->c_am_close = __bam_c_close; + dbc->c_am_del = __bam_c_del; + dbc->c_am_destroy = __bam_c_destroy; + dbc->c_am_get = __bam_c_get; + dbc->c_am_put = __bam_c_put; + dbc->c_am_writelock = __bam_c_writelock; + } else { + dbc->c_am_close = __bam_c_close; + dbc->c_am_del = __ram_c_del; + dbc->c_am_destroy = __bam_c_destroy; + dbc->c_am_get = __ram_c_get; + dbc->c_am_put = __ram_c_put; + dbc->c_am_writelock = __bam_c_writelock; + } + + /* + * The btree leaf page data structures require that two key/data pairs + * (or four items) fit on a page, but other than that there's no fixed + * requirement. The btree off-page duplicates only require two items, + * to be exact, but requiring four for them as well seems reasonable. + * + * Recno uses the btree bt_ovflsize value -- it's close enough. + */ + t = dbp->bt_internal; + minkey = F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey; + cp->ovflsize = B_MINKEY_TO_OVFLSIZE(minkey, dbp->pgsize); + + return (0); +} + +/* + * __bam_c_refresh + * Set things up properly for cursor re-use. + * + * PUBLIC: int __bam_c_refresh __P((DBC *)); + */ +int +__bam_c_refresh(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + __bam_c_reset(cp); + + /* + * If our caller set the root page number, it's because the root was + * known. This is always the case for off page dup cursors. Else, + * pull it out of our internal information. + */ + if (cp->root == PGNO_INVALID) + cp->root = ((BTREE *)dbp->bt_internal)->bt_root; + + /* Initialize for record numbers. */ + if (F_ISSET(dbc, DBC_OPD) || + dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { + F_SET(cp, C_RECNUM); + + /* + * All btrees that support record numbers, optionally standard + * recno trees, and all off-page duplicate recno trees have + * mutable record numbers. + */ + if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) || + F_ISSET(dbp, DB_BT_RECNUM | DB_RE_RENUMBER)) + F_SET(cp, C_RENUMBER); + } + + return (0); +} + +/* + * __bam_c_close -- + * Close down the cursor. + */ +static int +__bam_c_close(dbc, root_pgno, rmroot) + DBC *dbc; + db_pgno_t root_pgno; + int *rmroot; +{ + BTREE_CURSOR *cp, *cp_opd, *cp_c; + DB *dbp; + DBC *dbc_opd, *dbc_c; + PAGE *h; + u_int32_t num; + int cdb_lock, ret, t_ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + cp_opd = (dbc_opd = cp->opd) == NULL ? + NULL : (BTREE_CURSOR *)dbc_opd->internal; + cdb_lock = ret = 0; + + /* + * There are 3 ways this function is called: + * + * 1. Closing a primary cursor: we get called with a pointer to a + * primary cursor that has a NULL opd field. This happens when + * closing a btree/recno database cursor without an associated + * off-page duplicate tree. + * + * 2. Closing a primary and an off-page duplicate cursor stack: we + * get called with a pointer to the primary cursor which has a + * non-NULL opd field. This happens when closing a btree cursor + * into database with an associated off-page btree/recno duplicate + * tree. (It can't be a primary recno database, recno databases + * don't support duplicates.) + * + * 3. Closing an off-page duplicate cursor stack: we get called with + * a pointer to the off-page duplicate cursor. 
This happens when + * closing a non-btree database that has an associated off-page + * btree/recno duplicate tree or for a btree database when the + * opd tree is not empty (root_pgno == PGNO_INVALID). + * + * If either the primary or off-page duplicate cursor deleted a btree + * key/data pair, check to see if the item is still referenced by a + * different cursor. If it is, confirm that cursor's delete flag is + * set and leave it to that cursor to do the delete. + * + * NB: The test for == 0 below is correct. Our caller already removed + * our cursor argument from the active queue, we won't find it when we + * search the queue in __bam_ca_delete(). + * NB: It can't be true that both the primary and off-page duplicate + * cursors have deleted a btree key/data pair. Either the primary + * cursor may have deleted an item and there's no off-page duplicate + * cursor, or there's an off-page duplicate cursor and it may have + * deleted an item. + * + * Primary recno databases aren't an issue here. Recno keys are either + * deleted immediately or never deleted, and do not have to be handled + * here. + * + * Off-page duplicate recno databases are an issue here, cases #2 and + * #3 above can both be off-page recno databases. The problem is the + * same as the final problem for off-page duplicate btree databases. + * If we no longer need the off-page duplicate tree, we want to remove + * it. For off-page duplicate btrees, we are done with the tree when + * we delete the last item it contains, i.e., there can be no further + * references to it when it's empty. For off-page duplicate recnos, + * we remove items from the tree as the application calls the remove + * function, so we are done with the tree when we close the last cursor + * that references it. + * + * We optionally take the root page number from our caller. If the + * primary database is a btree, we can get it ourselves because dbc + * is the primary cursor. If the primary database is not a btree, + * the problem is that we may be dealing with a stack of pages. The + * cursor we're using to do the delete points at the bottom of that + * stack and we need the top of the stack. + */ + if (F_ISSET(cp, C_DELETED)) { + dbc_c = dbc; + switch (dbc->dbtype) { + case DB_BTREE: /* Case #1, #3. */ + if (__bam_ca_delete(dbp, cp->pgno, cp->indx, 1) == 0) + goto lock; + goto done; + case DB_RECNO: + if (!F_ISSET(dbc, DBC_OPD)) /* Case #1. */ + goto done; + /* Case #3. */ + if (__ram_ca_delete(dbp, cp->root) == 0) + goto lock; + goto done; + default: + return (__db_unknown_type(dbp->dbenv, + "__bam_c_close", dbc->dbtype)); + } + } + + if (dbc_opd == NULL) + goto done; + + if (F_ISSET(cp_opd, C_DELETED)) { /* Case #2. */ + /* + * We will not have been provided a root page number. Acquire + * one from the primary database. + */ + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) + goto err; + root_pgno = GET_BOVERFLOW(h, cp->indx + O_INDX)->pgno; + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + goto err; + + dbc_c = dbc_opd; + switch (dbc_opd->dbtype) { + case DB_BTREE: + if (__bam_ca_delete( + dbp, cp_opd->pgno, cp_opd->indx, 1) == 0) + goto lock; + goto done; + case DB_RECNO: + if (__ram_ca_delete(dbp, cp_opd->root) == 0) + goto lock; + goto done; + default: + return (__db_unknown_type(dbp->dbenv, + "__bam_c_close", dbc->dbtype)); + } + } + goto done; + +lock: cp_c = (BTREE_CURSOR *)dbc_c->internal; + + /* + * If this is CDB, upgrade the lock if necessary. 
While we acquired + * the write lock to logically delete the record, we released it when + * we returned from that call, and so may not be holding a write lock + * at the moment. NB: to get here in CDB we must either be holding a + * write lock or be the only cursor that is permitted to acquire write + * locks. The reason is that there can never be more than a single CDB + * write cursor (that cursor cannot be dup'd), and so that cursor must + * be closed and the item therefore deleted before any other cursor + * could acquire a reference to this item. + * + * Note that dbc may be an off-page dup cursor; this is the sole + * instance in which an OPD cursor does any locking, but it's necessary + * because we may be closed by ourselves without a parent cursor + * handy, and we have to do a lock upgrade on behalf of somebody. + * If this is the case, the OPD has been given the parent's locking + * info in __db_c_get--the OPD is also a WRITEDUP. + */ + if (CDB_LOCKING(dbp->dbenv)) { + DB_ASSERT(!F_ISSET(dbc, DBC_OPD) || F_ISSET(dbc, DBC_WRITEDUP)); + if (!F_ISSET(dbc, DBC_WRITER)) { + if ((ret = + lock_get(dbp->dbenv, dbc->locker, DB_LOCK_UPGRADE, + &dbc->lock_dbt, DB_LOCK_WRITE, &dbc->mylock)) != 0) + goto err; + cdb_lock = 1; + } + + cp_c->lock.off = LOCK_INVALID; + if ((ret = + memp_fget(dbp->mpf, &cp_c->pgno, 0, &cp_c->page)) != 0) + goto err; + + goto delete; + } + + /* + * The variable dbc_c has been initialized to reference the cursor in + * which we're going to do the delete. Initialize the cursor's page + * and lock structures as necessary. + * + * First, we may not need to acquire any locks. If we're in case #3, + * that is, the primary database isn't a btree database, our caller + * is responsible for acquiring any necessary locks before calling us. + */ + if (F_ISSET(dbc, DBC_OPD)) { + cp_c->lock.off = LOCK_INVALID; + if ((ret = + memp_fget(dbp->mpf, &cp_c->pgno, 0, &cp_c->page)) != 0) + goto err; + goto delete; + } + + /* + * Otherwise, acquire a write lock. If the cursor that did the initial + * logical deletion (and which had a write lock) is not the same as the + * cursor doing the physical deletion (which may have only ever had a + * read lock on the item), we need to upgrade. The confusion comes as + * follows: + * + * C1 created, acquires item read lock + * C2 dup C1, create C2, also has item read lock. + * C1 acquire write lock, delete item + * C1 close + * C2 close, needs a write lock to physically delete item. + * + * If we're in a TXN, we know that C2 will be able to acquire the write + * lock, because no locker other than the one shared by C1 and C2 can + * acquire a write lock -- the original write lock C1 acquire was never + * discarded. + * + * If we're not in a TXN, it's nastier. Other cursors might acquire + * read locks on the item after C1 closed, discarding its write lock, + * and such locks would prevent C2 from acquiring a read lock. That's + * OK, though, we'll simply wait until we can acquire a read lock, or + * we'll deadlock. (Which better not happen, since we're not in a TXN.) + * + * Lock the primary database page, regardless of whether we're deleting + * an item on a primary database page or an off-page duplicates page. + */ + ACQUIRE(dbc, DB_LOCK_WRITE, + cp->pgno, cp_c->lock, cp_c->pgno, cp_c->page, ret); + if (ret != 0) + goto err; + +delete: /* + * If the delete occurred in a btree, delete the on-page physical item + * referenced by the cursor. 
+ */ + if (dbc_c->dbtype == DB_BTREE && (ret = __bam_c_physdel(dbc_c)) != 0) + goto err; + + /* + * If we're not working in an off-page duplicate tree, then we're + * done. + */ + if (!F_ISSET(dbc_c, DBC_OPD) || root_pgno == PGNO_INVALID) + goto done; + + /* + * We may have just deleted the last element in the off-page duplicate + * tree, and closed the last cursor in the tree. For an off-page btree + * there are no other cursors in the tree by definition, if the tree is + * empty. For an off-page recno we know we have closed the last cursor + * in the tree because the __ram_ca_delete call above returned 0 only + * in that case. So, if the off-page duplicate tree is empty at this + * point, we want to remove it. + */ + if ((ret = memp_fget(dbp->mpf, &root_pgno, 0, &h)) != 0) + goto err; + if ((num = NUM_ENT(h)) == 0) { + if ((ret = __db_free(dbc, h)) != 0) + goto err; + } else { + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + goto err; + goto done; + } + + /* + * When removing the tree, we have to do one of two things. If this is + * case #2, that is, the primary tree is a btree, delete the key that's + * associated with the tree from the btree leaf page. We know we are + * the only reference to it and we already have the correct lock. We + * detect this case because the cursor that was passed to us references + * an off-page duplicate cursor. + * + * If this is case #3, that is, the primary tree isn't a btree, pass + * the information back to our caller, it's their job to do cleanup on + * the primary page. + */ + if (dbc_opd != NULL) { + cp->lock.off = LOCK_INVALID; + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + goto err; + if ((ret = __bam_c_physdel(dbc)) != 0) + goto err; + } else + *rmroot = 1; +err: +done: /* + * Discard the page references and locks, and confirm that the stack + * has been emptied. + */ + if (dbc_opd != NULL) { + DISCARD_CUR(dbc_opd, t_ret); + if (t_ret != 0 && ret == 0) + ret = t_ret; + } + DISCARD_CUR(dbc, t_ret); + if (t_ret != 0 && ret == 0) + ret = t_ret; + + /* Downgrade any CDB lock we acquired. */ + if (cdb_lock) + (void)__lock_downgrade( + dbp->dbenv, &dbc->mylock, DB_LOCK_IWRITE, 0); + + return (ret); +} + +/* + * __bam_c_destroy -- + * Close a single cursor -- internal version. + */ +static int +__bam_c_destroy(dbc) + DBC *dbc; +{ + /* Discard the structures. */ + __os_free(dbc->internal, sizeof(BTREE_CURSOR)); + + return (0); +} + +/* + * __bam_c_count -- + * Return a count of on and off-page duplicates. + * + * PUBLIC: int __bam_c_count __P((DBC *, db_recno_t *)); + */ +int +__bam_c_count(dbc, recnop) + DBC *dbc; + db_recno_t *recnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_indx_t indx, top; + db_recno_t recno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Called with the top-level cursor that may reference an off-page + * duplicates page. If it's a set of on-page duplicates, get the + * page and count. Otherwise, get the root page of the off-page + * duplicate tree, and use the count. We don't have to acquire any + * new locks, we have to have a read lock to even get here. + */ + if (cp->opd == NULL) { + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + return (ret); + + /* + * Move back to the beginning of the set of duplicates and + * then count forward. 
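+ * For example, with duplicates of one key occupying indx 4, 6 and 8,
+ * a cursor anywhere in that run backs up to indx 4 and then counts
+ * three records walking forward.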
+ */ + for (indx = cp->indx;; indx -= P_INDX) + if (indx == 0 || + !IS_DUPLICATE(dbc, indx, indx - P_INDX)) + break; + for (recno = 1, top = NUM_ENT(cp->page) - P_INDX; + indx < top; ++recno, indx += P_INDX) + if (!IS_DUPLICATE(dbc, indx, indx + P_INDX)) + break; + *recnop = recno; + } else { + if ((ret = memp_fget(dbp->mpf, + &cp->opd->internal->root, 0, &cp->page)) != 0) + return (ret); + + *recnop = RE_NREC(cp->page); + } + + ret = memp_fput(dbp->mpf, cp->page, 0); + cp->page = NULL; + + return (ret); +} + +/* + * __bam_c_del -- + * Delete using a cursor. + */ +static int +__bam_c_del(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + int ret, t_ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* If the item was already deleted, return failure. */ + if (F_ISSET(cp, C_DELETED)) + return (DB_KEYEMPTY); + + /* + * This code is always called with a page lock but no page. + */ + DB_ASSERT(cp->page == NULL); + + /* + * We don't physically delete the record until the cursor moves, so + * we have to have a long-lived write lock on the page instead of a + * a long-lived read lock. Note, we have to have a read lock to even + * get here. + * + * If we're maintaining record numbers, we lock the entire tree, else + * we lock the single page. + */ + if (F_ISSET(cp, C_RECNUM)) { + if ((ret = __bam_c_getstack(dbc)) != 0) + goto err; + cp->page = cp->csp->page; + } else { + ACQUIRE_CUR(dbc, DB_LOCK_WRITE, ret); + if (ret != 0) + goto err; + } + + /* Log the change. */ + if (DB_LOGGING(dbc) && + (ret = __bam_cdel_log(dbp->dbenv, dbc->txn, &LSN(cp->page), 0, + dbp->log_fileid, PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0) + goto err; + + /* Set the intent-to-delete flag on the page. */ + if (TYPE(cp->page) == P_LBTREE) + B_DSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type); + else + B_DSET(GET_BKEYDATA(cp->page, cp->indx)->type); + + /* Mark the page dirty. */ + ret = memp_fset(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + +err: /* + * If we've been successful so far and the tree has record numbers, + * adjust the record counts. Either way, release acquired page(s). + */ + if (F_ISSET(cp, C_RECNUM)) { + if (ret == 0) + ret = __bam_adjust(dbc, -1); + (void)__bam_stkrel(dbc, 0); + } else + if (cp->page != NULL && + (t_ret = memp_fput(dbp->mpf, cp->page, 0)) != 0 && ret == 0) + ret = t_ret; + + cp->page = NULL; + + /* Update the cursors last, after all chance of failure is past. */ + if (ret == 0) + (void)__bam_ca_delete(dbp, cp->pgno, cp->indx, 1); + + return (ret); +} + +/* + * __bam_c_dup -- + * Duplicate a btree cursor, such that the new one holds appropriate + * locks for the position of the original. + * + * PUBLIC: int __bam_c_dup __P((DBC *, DBC *)); + */ +int +__bam_c_dup(orig_dbc, new_dbc) + DBC *orig_dbc, *new_dbc; +{ + BTREE_CURSOR *orig, *new; + int ret; + + orig = (BTREE_CURSOR *)orig_dbc->internal; + new = (BTREE_CURSOR *)new_dbc->internal; + + /* + * If we're holding a lock we need to acquire a copy of it, unless + * we're in a transaction. We don't need to copy any lock we're + * holding inside a transaction because all the locks are retained + * until the transaction commits or aborts. + */ + if (orig->lock.off != LOCK_INVALID && orig_dbc->txn == NULL) { + if ((ret = __db_lget(new_dbc, + 0, new->pgno, new->lock_mode, 0, &new->lock)) != 0) + return (ret); + } + new->ovflsize = orig->ovflsize; + new->recno = orig->recno; + new->flags = orig->flags; + + return (0); +} + +/* + * __bam_c_get -- + * Get using a cursor (btree). 
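+ * Reached through the public cursor method; a typical iteration looks
+ * something like the following sketch (error handling omitted):
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	while ((ret = dbc->c_get(dbc, &key, &data, DB_NEXT)) == 0)
+ *		...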
+ */ +static int +__bam_c_get(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_pgno_t orig_pgno; + db_indx_t orig_indx; + int exact, newopd, ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + orig_pgno = cp->pgno; + orig_indx = cp->indx; + + newopd = 0; + switch (flags) { + case DB_CURRENT: + /* It's not possible to return a deleted record. */ + if (F_ISSET(cp, C_DELETED)) { + ret = DB_KEYEMPTY; + goto err; + } + + /* + * Acquire the current page. We have at least a read-lock + * already. The caller may have set DB_RMW asking for a + * write lock, but upgrading to a write lock has no better + * chance of succeeding now instead of later, so don't try. + */ + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + goto err; + break; + case DB_FIRST: + newopd = 1; + if ((ret = __bam_c_first(dbc)) != 0) + goto err; + break; + case DB_GET_BOTH: + /* + * There are two ways to get here based on DBcursor->c_get + * with the DB_GET_BOTH flag set: + * + * 1. Searching a sorted off-page duplicate tree: do a tree + * search. + * + * 2. Searching btree: do a tree search. If it returns a + * reference to off-page duplicate tree, return immediately + * and let our caller deal with it. If the search doesn't + * return a reference to off-page duplicate tree, start an + * on-page search. + */ + if (F_ISSET(dbc, DBC_OPD)) { + if ((ret = __bam_c_search( + dbc, data, DB_GET_BOTH, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + } else { + if ((ret = __bam_c_search( + dbc, key, DB_GET_BOTH, &exact)) != 0) + return (ret); + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + + if (pgnop != NULL && __bam_isopd(dbc, pgnop)) { + newopd = 1; + break; + } + if ((ret = __bam_getboth_finddatum(dbc, data)) != 0) + goto err; + } + break; + case DB_GET_BOTHC: + if ((ret = __bam_getbothc(dbc, data)) != 0) + goto err; + break; + case DB_LAST: + newopd = 1; + if ((ret = __bam_c_last(dbc)) != 0) + goto err; + break; + case DB_NEXT: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bam_c_first(dbc)) != 0) + goto err; + } else + if ((ret = __bam_c_next(dbc, 1)) != 0) + goto err; + break; + case DB_NEXT_DUP: + if ((ret = __bam_c_next(dbc, 1)) != 0) + goto err; + if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { + ret = DB_NOTFOUND; + goto err; + } + break; + case DB_NEXT_NODUP: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bam_c_first(dbc)) != 0) + goto err; + } else + do { + if ((ret = __bam_c_next(dbc, 1)) != 0) + goto err; + } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)); + break; + case DB_PREV: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bam_c_last(dbc)) != 0) + goto err; + } else + if ((ret = __bam_c_prev(dbc)) != 0) + goto err; + break; + case DB_PREV_NODUP: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bam_c_last(dbc)) != 0) + goto err; + } else + do { + if ((ret = __bam_c_prev(dbc)) != 0) + goto err; + } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)); + break; + case DB_SET: + case DB_SET_RECNO: + newopd = 1; + if ((ret = __bam_c_search(dbc, key, flags, &exact)) != 0) + goto err; + break; + case DB_SET_RANGE: + newopd = 1; + if ((ret = __bam_c_search(dbc, key, flags, &exact)) != 0) + goto err; + + /* + * As we didn't require an exact match, the search function + * may have returned an entry past the end of the page. Or, + * we may be referencing a deleted record. If so, move to + * the next entry. 
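+ * For example, a DB_SET_RANGE search for "m" in a tree holding only
+ * "a" and "z" leaves the cursor on "z"; a search for anything greater
+ * than "z" walks off the page and returns DB_NOTFOUND.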
+ */ + if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) + if ((ret = __bam_c_next(dbc, 0)) != 0) + goto err; + break; + default: + ret = __db_unknown_flag(dbp->dbenv, "__bam_c_get", flags); + goto err; + } + + /* + * We may have moved to an off-page duplicate tree. Return that + * information to our caller. + */ + if (newopd && pgnop != NULL) + (void)__bam_isopd(dbc, pgnop); + + /* Don't return the key, it was passed to us */ + if (flags == DB_SET) + F_SET(key, DB_DBT_ISSET); + +err: /* + * Regardless of whether we were successful or not, if the cursor + * moved, clear the delete flag, DBcursor->c_get never references + * a deleted key, if it moved at all. + */ + if (F_ISSET(cp, C_DELETED) + && (cp->pgno != orig_pgno || cp->indx != orig_indx)) + F_CLR(cp, C_DELETED); + + return (ret); +} + +/* + * __bam_getbothc -- + * Search for a matching data item on a join. + */ +static int +__bam_getbothc(dbc, data) + DBC *dbc; + DBT *data; +{ + BTREE_CURSOR *cp; + DB *dbp; + int cmp, exact, ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Acquire the current page. We have at least a read-lock + * already. The caller may have set DB_RMW asking for a + * write lock, but upgrading to a write lock has no better + * chance of succeeding now instead of later, so don't try. + */ + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + return (ret); + + /* + * An off-page duplicate cursor. Search the remaining duplicates + * for one which matches (do a normal btree search, then verify + * that the retrieved record is greater than the original one). + */ + if (F_ISSET(dbc, DBC_OPD)) { + /* + * Check to make sure the desired item comes strictly after + * the current position; if it doesn't, return DB_NOTFOUND. + */ + if ((ret = __bam_cmp(dbp, data, cp->page, cp->indx, + dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare, + &cmp)) != 0) + return (ret); + + if (cmp <= 0) + return (DB_NOTFOUND); + + /* Discard the current page, we're going to do a full search. */ + if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) + return (ret); + cp->page = NULL; + + return (__bam_c_search(dbc, data, DB_GET_BOTH, &exact)); + } + + /* + * We're doing a DBC->c_get(DB_GET_BOTHC) and we're already searching + * a set of on-page duplicates (either sorted or unsorted). Continue + * a linear search from after the current position. + * + * (Note that we could have just finished a "set" of one duplicate, + * i.e. not a duplicate at all, but the following check will always + * return DB_NOTFOUND in this case, which is the desired behavior.) + */ + if (cp->indx + P_INDX >= NUM_ENT(cp->page) || + !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) + return (DB_NOTFOUND); + cp->indx += P_INDX; + + return (__bam_getboth_finddatum(dbc, data)); +} + +/* + * __bam_getboth_finddatum -- + * Find a matching on-page data item. + */ +static int +__bam_getboth_finddatum(dbc, data) + DBC *dbc; + DBT *data; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_indx_t base, lim, top; + int cmp, ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Called (sometimes indirectly) from DBC->get to search on-page data + * item(s) for a matching value. If the original flag was DB_GET_BOTH, + * the cursor argument is set to the first data item for the key. If + * the original flag was DB_GET_BOTHC, the cursor argument is set to + * the first data item that we can potentially return. In both cases, + * there may or may not be additional duplicate data items to search. 
+ * + * If the duplicates are not sorted, do a linear search. + * + * If the duplicates are sorted, do a binary search. The reason for + * this is that large pages and small key/data pairs result in large + * numbers of on-page duplicates before they get pushed off-page. + */ + if (dbp->dup_compare == NULL) { + for (;; cp->indx += P_INDX) { + if (!IS_CUR_DELETED(dbc) && + (ret = __bam_cmp(dbp, data, cp->page, + cp->indx + O_INDX, __bam_defcmp, &cmp)) != 0) + return (ret); + if (cmp == 0) + return (0); + + if (cp->indx + P_INDX >= NUM_ENT(cp->page) || + !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) + break; + } + } else { + /* + * Find the top and bottom of the duplicate set. Binary search + * requires at least two items, don't loop if there's only one. + */ + for (base = top = cp->indx; + top < NUM_ENT(cp->page); top += P_INDX) + if (!IS_DUPLICATE(dbc, cp->indx, top)) + break; + if (base == (top - P_INDX)) { + if ((ret = __bam_cmp(dbp, data, + cp->page, cp->indx + O_INDX, + dbp->dup_compare, &cmp)) != 0) + return (ret); + return (cmp == 0 ? 0 : DB_NOTFOUND); + } + + for (lim = + (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) { + cp->indx = base + ((lim >> 1) * P_INDX); + if ((ret = __bam_cmp(dbp, data, cp->page, + cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + return (ret); + if (cmp == 0) { + if (!IS_CUR_DELETED(dbc)) + return (0); + break; + } + if (cmp > 0) { + base = cp->indx + P_INDX; + --lim; + } + } + } + return (DB_NOTFOUND); +} + +/* + * __bam_c_put -- + * Put using a cursor. + */ +static int +__bam_c_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT dbt; + u_int32_t iiop; + int cmp, exact, needkey, ret, stack; + void *arg; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + +split: needkey = ret = stack = 0; + switch (flags) { + case DB_AFTER: + case DB_BEFORE: + case DB_CURRENT: + needkey = 1; + iiop = flags; + + /* + * If the Btree has record numbers (and we're not replacing an + * existing record), we need a complete stack so that we can + * adjust the record counts. The check for flags == DB_CURRENT + * is superfluous but left in for clarity. (If C_RECNUM is set + * we know that flags must be DB_CURRENT, as DB_AFTER/DB_BEFORE + * are illegal in a Btree unless it's configured for duplicates + * and you cannot configure a Btree for both record renumbering + * and duplicates.) + */ + if (flags == DB_CURRENT && + F_ISSET(cp, C_RECNUM) && F_ISSET(cp, C_DELETED)) { + if ((ret = __bam_c_getstack(dbc)) != 0) + goto err; + /* + * Initialize the cursor from the stack. Don't take + * the page number or page index, they should already + * be set. + */ + cp->page = cp->csp->page; + cp->lock = cp->csp->lock; + cp->lock_mode = cp->csp->lock_mode; + + stack = 1; + break; + } + + /* Acquire the current page with a write lock. */ + ACQUIRE_WRITE_LOCK(dbc, ret); + if (ret != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + goto err; + break; + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_NODUPDATA: + /* + * Searching off-page, sorted duplicate tree: do a tree search + * for the correct item; __bam_c_search returns the smallest + * slot greater than the key, use it. + */ + if (F_ISSET(dbc, DBC_OPD)) { + if ((ret = + __bam_c_search(dbc, data, flags, &exact)) != 0) + goto err; + stack = 1; + + /* Disallow "sorted" duplicate duplicates. 
*/
+ if (exact) {
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Searching a btree. */
+ if ((ret = __bam_c_search(dbc, key,
+ flags == DB_KEYFIRST || dbp->dup_compare != NULL ?
+ DB_KEYFIRST : DB_KEYLAST, &exact)) != 0)
+ goto err;
+ stack = 1;
+
+ /*
+ * If we don't have an exact match, __bam_c_search returned
+ * the smallest slot greater than the key, use it.
+ */
+ if (!exact) {
+ iiop = DB_KEYFIRST;
+ break;
+ }
+
+ /*
+ * If duplicates aren't supported, replace the current item.
+ * (If implementing the DB->put function, our caller already
+ * checked the DB_NOOVERWRITE flag.)
+ */
+ if (!F_ISSET(dbp, DB_AM_DUP)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+
+ /*
+ * If we find a matching entry, it may be an off-page duplicate
+ * tree. Return the page number to our caller, we need a new
+ * cursor.
+ */
+ if (pgnop != NULL && __bam_isopd(dbc, pgnop))
+ goto done;
+
+ /* If the duplicates aren't sorted, move to the right slot. */
+ if (dbp->dup_compare == NULL) {
+ if (flags == DB_KEYFIRST)
+ iiop = DB_BEFORE;
+ else
+ for (;; cp->indx += P_INDX)
+ if (cp->indx + P_INDX >=
+ NUM_ENT(cp->page) ||
+ !IS_DUPLICATE(dbc, cp->indx,
+ cp->indx + P_INDX)) {
+ iiop = DB_AFTER;
+ break;
+ }
+ break;
+ }
+
+ /*
+ * We know that we're looking at the first of a set of sorted
+ * on-page duplicates. Walk the list to find the right slot.
+ */
+ for (;; cp->indx += P_INDX) {
+ if ((ret = __bam_cmp(dbp, data, cp->page,
+ cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ if (cmp < 0) {
+ iiop = DB_BEFORE;
+ break;
+ }
+
+ /* Disallow "sorted" duplicate duplicates. */
+ if (cmp == 0) {
+ if (IS_DELETED(cp->page, cp->indx)) {
+ iiop = DB_CURRENT;
+ break;
+ }
+ ret = __db_duperr(dbp, flags);
+ goto err;
+ }
+
+ if (cp->indx + P_INDX >= NUM_ENT(cp->page) ||
+ ((PAGE *)cp->page)->inp[cp->indx] !=
+ ((PAGE *)cp->page)->inp[cp->indx + P_INDX]) {
+ iiop = DB_AFTER;
+ break;
+ }
+ }
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->dbenv, "__bam_c_put", flags);
+ goto err;
+ }
+
+ switch (ret = __bam_iitem(dbc, key, data, iiop, 0)) {
+ case 0:
+ break;
+ case DB_NEEDSPLIT:
+ /*
+ * To split, we need a key for the page. Either use the key
+ * argument or get a copy of the key from the page.
+ */
+ if (flags == DB_AFTER ||
+ flags == DB_BEFORE || flags == DB_CURRENT) {
+ memset(&dbt, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbp, cp->page, 0, &dbt,
+ &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
+ goto err;
+ arg = &dbt;
+ } else
+ arg = F_ISSET(dbc, DBC_OPD) ? data : key;
+
+ /*
+ * Discard any locks and pinned pages (the locks are discarded
+ * even if we're running with transactions, as they lock pages
+ * that we're sorry we ever acquired). If stack is set and the
+ * cursor entries are valid, they point to the same entries as
+ * the stack, don't free them twice.
+ */
+ if (stack)
+ ret = __bam_stkrel(dbc, STK_CLRDBC | STK_NOLOCK);
+ else
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ goto err;
+
+ /* Split the tree. */
+ if ((ret = __bam_split(dbc, arg)) != 0)
+ return (ret);
+
+ goto split;
+ default:
+ goto err;
+ }
+
+err:
+done: /*
+ * Discard any pages pinned in the tree and their locks, except for
+ * the leaf page. Note, the leaf page participated in any stack we
+ * acquired, and so we have to adjust the stack as necessary. If
+ * there was only a single page on the stack, we don't have to free
+ * further stack pages. 
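+ *
+ * A sketch of the cursor stack assumed here (a hypothetical
+ * three-level tree; not code from this file):
+ *
+ *	cp->sp  --> { root page, lock }
+ *	            { internal page, lock }
+ *	cp->csp --> { leaf page, lock }  <-- the cursor's own page
+ *
+ * BT_STK_POP() steps csp back one entry, returning NULL when the
+ * leaf was the only entry, in which case there is nothing above
+ * it left to release.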
+ */ + if (stack && BT_STK_POP(cp) != NULL) + (void)__bam_stkrel(dbc, 0); + + /* + * Regardless of whether we were successful or not, clear the delete + * flag. If we're successful, we either moved the cursor or the item + * is no longer deleted. If we're not successful, then we're just a + * copy, no need to have the flag set. + */ + F_CLR(cp, C_DELETED); + + return (ret); +} + +/* + * __bam_c_rget -- + * Return the record number for a cursor. + * + * PUBLIC: int __bam_c_rget __P((DBC *, DBT *, u_int32_t)); + */ +int +__bam_c_rget(dbc, data, flags) + DBC *dbc; + DBT *data; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT dbt; + db_recno_t recno; + int exact, ret; + + COMPQUIET(flags, 0); + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Get the page with the current item on it. + * Get a copy of the key. + * Release the page, making sure we don't release it twice. + */ + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + return (ret); + memset(&dbt, 0, sizeof(DBT)); + if ((ret = __db_ret(dbp, cp->page, + cp->indx, &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + goto err; + ret = memp_fput(dbp->mpf, cp->page, 0); + cp->page = NULL; + if (ret != 0) + return (ret); + + if ((ret = __bam_search(dbc, &dbt, + F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, + 1, &recno, &exact)) != 0) + goto err; + + ret = __db_retcopy(dbp, data, + &recno, sizeof(recno), &dbc->rdata.data, &dbc->rdata.ulen); + + /* Release the stack. */ +err: __bam_stkrel(dbc, 0); + + return (ret); +} + +/* + * __bam_c_writelock -- + * Upgrade the cursor to a write lock. + */ +static int +__bam_c_writelock(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + int ret; + + cp = (BTREE_CURSOR *)dbc->internal; + + if (cp->lock_mode == DB_LOCK_WRITE) + return (0); + + /* + * When writing to an off-page duplicate tree, we need to have the + * appropriate page in the primary tree locked. The general DBC + * code calls us first with the primary cursor so we can acquire the + * appropriate lock. + */ + ACQUIRE_WRITE_LOCK(dbc, ret); + return (ret); +} + +/* + * __bam_c_first -- + * Return the first record. + */ +static int +__bam_c_first(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_pgno_t pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* Walk down the left-hand side of the tree. */ + for (pgno = cp->root;;) { + ACQUIRE_CUR_SET(dbc, DB_LOCK_READ, pgno, ret); + if (ret != 0) + return (ret); + + /* If we find a leaf page, we're done. */ + if (ISLEAF(cp->page)) + break; + + pgno = GET_BINTERNAL(cp->page, 0)->pgno; + } + + /* If we want a write lock instead of a read lock, get it now. */ + if (F_ISSET(dbc, DBC_RMW)) { + ACQUIRE_WRITE_LOCK(dbc, ret); + if (ret != 0) + return (ret); + } + + /* If on an empty page or a deleted record, move to the next one. */ + if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)) + if ((ret = __bam_c_next(dbc, 0)) != 0) + return (ret); + + return (0); +} + +/* + * __bam_c_last -- + * Return the last record. + */ +static int +__bam_c_last(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_pgno_t pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* Walk down the right-hand side of the tree. */ + for (pgno = cp->root;;) { + ACQUIRE_CUR_SET(dbc, DB_LOCK_READ, pgno, ret); + if (ret != 0) + return (ret); + + /* If we find a leaf page, we're done. 
*/ + if (ISLEAF(cp->page)) + break; + + pgno = + GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; + } + + /* If we want a write lock instead of a read lock, get it now. */ + if (F_ISSET(dbc, DBC_RMW)) { + ACQUIRE_WRITE_LOCK(dbc, ret); + if (ret != 0) + return (ret); + } + + cp->indx = NUM_ENT(cp->page) == 0 ? 0 : + NUM_ENT(cp->page) - + (TYPE(cp->page) == P_LBTREE ? P_INDX : O_INDX); + + /* If on an empty page or a deleted record, move to the previous one. */ + if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)) + if ((ret = __bam_c_prev(dbc)) != 0) + return (ret); + + return (0); +} + +/* + * __bam_c_next -- + * Move to the next record. + */ +static int +__bam_c_next(dbc, initial_move) + DBC *dbc; + int initial_move; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_indx_t adjust; + db_lockmode_t lock_mode; + db_pgno_t pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + if (F_ISSET(dbc, DBC_OPD)) { + adjust = O_INDX; + lock_mode = DB_LOCK_NG; + } else { + adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX; + lock_mode = + F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ; + } + if (cp->page == NULL) { + ACQUIRE_CUR(dbc, lock_mode, ret); + if (ret != 0) + return (ret); + } + + if (initial_move) + cp->indx += adjust; + + for (;;) { + /* + * If at the end of the page, move to a subsequent page. + * + * !!! + * Check for >= NUM_ENT. If the original search landed us on + * NUM_ENT, we may have incremented indx before the test. + */ + if (cp->indx >= NUM_ENT(cp->page)) { + if ((pgno + = NEXT_PGNO(cp->page)) == PGNO_INVALID) + return (DB_NOTFOUND); + + ACQUIRE_CUR_SET(dbc, lock_mode, pgno, ret); + if (ret != 0) + return (ret); + continue; + } + if (IS_CUR_DELETED(dbc)) { + cp->indx += adjust; + continue; + } + break; + } + return (0); +} + +/* + * __bam_c_prev -- + * Move to the previous record. + */ +static int +__bam_c_prev(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_indx_t adjust; + db_lockmode_t lock_mode; + db_pgno_t pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + if (F_ISSET(dbc, DBC_OPD)) { + adjust = O_INDX; + lock_mode = DB_LOCK_NG; + } else { + adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX; + lock_mode = + F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ; + } + if (cp->page == NULL) { + ACQUIRE_CUR(dbc, lock_mode, ret); + if (ret != 0) + return (ret); + } + + for (;;) { + /* If at the beginning of the page, move to a previous one. */ + if (cp->indx == 0) { + if ((pgno = + PREV_PGNO(cp->page)) == PGNO_INVALID) + return (DB_NOTFOUND); + + ACQUIRE_CUR_SET(dbc, lock_mode, pgno, ret); + if (ret != 0) + return (ret); + + if ((cp->indx = NUM_ENT(cp->page)) == 0) + continue; + } + + /* Ignore deleted records. */ + cp->indx -= adjust; + if (IS_CUR_DELETED(dbc)) + continue; + + break; + } + return (0); +} + +/* + * __bam_c_search -- + * Move to a specified record. 
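+ *
+ *	An illustrative caller-level sketch (the key bytes are
+ *	hypothetical) of a request that funnels into this routine:
+ *
+ *		memset(&key, 0, sizeof(key));
+ *		key.data = "ba";
+ *		key.size = 2;
+ *		ret = dbc->c_get(dbc, &key, &data, DB_SET_RANGE);
+ *
+ *	On success the cursor is left on the smallest key greater
+ *	than or equal to "ba".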
+ */
+static int
+__bam_c_search(dbc, key, flags, exactp)
+ DBC *dbc;
+ const DBT *key;
+ u_int32_t flags;
+ int *exactp;
+{
+ BTREE *t;
+ BTREE_CURSOR *cp;
+ DB *dbp;
+ PAGE *h;
+ db_indx_t indx;
+ db_pgno_t bt_lpgno;
+ db_recno_t recno;
+ u_int32_t sflags;
+ int cmp, ret;
+
+ dbp = dbc->dbp;
+ cp = (BTREE_CURSOR *)dbc->internal;
+ t = dbp->bt_internal;
+ ret = 0;
+
+ /*
+ * Find an entry in the database. Discard any lock we currently hold,
+ * we're going to search the tree.
+ */
+ DISCARD_CUR(dbc, ret);
+ if (ret != 0)
+ return (ret);
+
+ switch (flags) {
+ case DB_SET_RECNO:
+ if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0)
+ return (ret);
+ sflags = (F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND) | S_EXACT;
+ if ((ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp)) != 0)
+ return (ret);
+ break;
+ case DB_SET:
+ case DB_GET_BOTH:
+ sflags = (F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND) | S_EXACT;
+ goto search;
+ case DB_SET_RANGE:
+ sflags =
+ (F_ISSET(dbc, DBC_RMW) ? S_WRITE : S_READ) | S_DUPFIRST;
+ goto search;
+ case DB_KEYFIRST:
+ sflags = S_KEYFIRST;
+ goto fast_search;
+ case DB_KEYLAST:
+ case DB_NODUPDATA:
+ sflags = S_KEYLAST;
+fast_search: /*
+ * If the application has a history of inserting into the first
+ * or last pages of the database, we check those pages first to
+ * avoid doing a full search.
+ *
+ * If the tree has record numbers, we need a complete stack so
+ * that we can adjust the record counts, so fast_search isn't
+ * possible.
+ */
+ if (F_ISSET(cp, C_RECNUM))
+ goto search;
+
+ /*
+ * !!!
+ * We do not mutex protect the t->bt_lpgno field, which means
+ * that it can only be used in an advisory manner. If we find
+ * a page we can use, great. If we don't, we don't care, we do
+ * it the slow way instead. Regardless, copy it into a local
+ * variable, otherwise we might acquire a lock for a page and
+ * then read a different page because it changed underfoot.
+ */
+ bt_lpgno = t->bt_lpgno;
+
+ /*
+ * If the tree has no history of insertion, do it the slow way.
+ */
+ if (bt_lpgno == PGNO_INVALID)
+ goto search;
+
+ /* Lock and retrieve the page on which we last inserted. */
+ h = NULL;
+ ACQUIRE(dbc,
+ DB_LOCK_WRITE, bt_lpgno, cp->lock, bt_lpgno, h, ret);
+ if (ret != 0)
+ goto fast_miss;
+
+ /*
+ * It's okay if the page type isn't right or it's empty, it
+ * just means that the world changed.
+ */
+ if (TYPE(h) != P_LBTREE || NUM_ENT(h) == 0)
+ goto fast_miss;
+
+ /*
+ * What we do here is test to see if we're at the beginning or
+ * end of the tree and if the new item sorts before/after the
+ * first/last page entry. We don't try and catch inserts into
+ * the middle of the tree (although we could, as long as there
+ * were two keys on the page and we saved both the index and
+ * the page number of the last insert).
+ */
+ if (h->next_pgno == PGNO_INVALID) {
+ indx = NUM_ENT(h) - P_INDX;
+ if ((ret = __bam_cmp(dbp,
+ key, h, indx, t->bt_compare, &cmp)) != 0)
+ return (ret);
+
+ if (cmp < 0)
+ goto try_begin;
+ if (cmp > 0) {
+ indx += P_INDX;
+ goto fast_hit;
+ }
+
+ /*
+ * Found a duplicate. If doing DB_KEYLAST, we're at
+ * the correct position, otherwise, move to the first
+ * of the duplicates. If we're looking at off-page
+ * duplicates, duplicate duplicates aren't permitted,
+ * so we're done. 
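+ *
+ * (The fast path matters most for append-style loads. As a
+ * sketch, assuming keys that ascend under t->bt_compare:
+ *
+ *	for (i = 0; i < nrecs; ++i)
+ *		if ((ret = dbp->put(dbp,
+ *		    NULL, &key[i], &data[i], 0)) != 0)
+ *			break;
+ *
+ * every put after the first hits this last-page test and skips
+ * the full root-to-leaf descent.)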
+ */ + if (flags == DB_KEYLAST) + goto fast_hit; + for (; + indx > 0 && h->inp[indx - P_INDX] == h->inp[indx]; + indx -= P_INDX) + ; + goto fast_hit; + } +try_begin: if (h->prev_pgno == PGNO_INVALID) { + indx = 0; + if ((ret = __bam_cmp(dbp, + key, h, indx, t->bt_compare, &cmp)) != 0) + return (ret); + + if (cmp > 0) + goto fast_miss; + if (cmp < 0) + goto fast_hit; + + /* + * Found a duplicate. If doing DB_KEYFIRST, we're at + * the correct position, otherwise, move to the last + * of the duplicates. If we're looking at off-page + * duplicates, duplicate duplicates aren't permitted, + * so we're done. + */ + if (flags == DB_KEYFIRST) + goto fast_hit; + for (; + indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]; + indx += P_INDX) + ; + goto fast_hit; + } + goto fast_miss; + +fast_hit: /* Set the exact match flag, we may have found a duplicate. */ + *exactp = cmp == 0; + + /* + * Insert the entry in the stack. (Our caller is likely to + * call __bam_stkrel() after our return.) + */ + BT_STK_CLR(cp); + BT_STK_ENTER(dbp->dbenv, + cp, h, indx, cp->lock, cp->lock_mode, ret); + if (ret != 0) + return (ret); + break; + +fast_miss: /* + * This was not the right page, so we do not need to retain + * the lock even in the presence of transactions. + */ + DISCARD(dbc, 1, cp->lock, h, ret); + if (ret != 0) + return (ret); + +search: if ((ret = + __bam_search(dbc, key, sflags, 1, NULL, exactp)) != 0) + return (ret); + break; + default: + return (__db_unknown_flag(dbp->dbenv, "__bam_c_search", flags)); + } + + /* Initialize the cursor from the stack. */ + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + cp->lock = cp->csp->lock; + cp->lock_mode = cp->csp->lock_mode; + + /* + * If we inserted a key into the first or last slot of the tree, + * remember where it was so we can do it more quickly next time. + */ + if (TYPE(cp->page) == P_LBTREE && + (flags == DB_KEYFIRST || flags == DB_KEYLAST)) + t->bt_lpgno = + (NEXT_PGNO(cp->page) == PGNO_INVALID && + cp->indx >= NUM_ENT(cp->page)) || + (PREV_PGNO(cp->page) == PGNO_INVALID && + cp->indx == 0) ? cp->pgno : PGNO_INVALID; + return (0); +} + +/* + * __bam_c_physdel -- + * Physically remove an item from the page. + */ +static int +__bam_c_physdel(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT key; + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + int delete_page, empty_page, exact, level, ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + delete_page = empty_page = ret = 0; + + /* If the page is going to be emptied, consider deleting it. */ + delete_page = empty_page = + NUM_ENT(cp->page) == (TYPE(cp->page) == P_LBTREE ? 2 : 1); + + /* + * Check if the application turned off reverse splits. Applications + * can't turn off reverse splits in off-page duplicate trees, that + * space will never be reused unless the exact same key is specified. + */ + if (delete_page && + !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_BT_REVSPLIT)) + delete_page = 0; + + /* + * We never delete the last leaf page. (Not really true -- we delete + * the last leaf page of off-page duplicate trees, but that's handled + * by our caller, not down here.) + */ + if (delete_page && cp->pgno == cp->root) + delete_page = 0; + + /* + * To delete a leaf page other than an empty root page, we need a + * copy of a key from the page. Use the 0th page index since it's + * the last key the page held. 
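+ * (The copy matters because the code below re-descends the tree by
+ * key: once the page's items have been deleted, this saved key is
+ * the only way to find the same leaf and its parents again.)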
+ */
+ if (delete_page) {
+ memset(&key, 0, sizeof(DBT));
+ if ((ret = __db_ret(dbp, cp->page,
+ 0, &key, &dbc->rkey.data, &dbc->rkey.ulen)) != 0)
+ return (ret);
+ }
+
+ /*
+ * Delete the items. If the page isn't empty, we adjust the cursors.
+ *
+ * !!!
+ * The following operations to delete a page may deadlock. The easy
+ * scenario is if we're deleting an item because we're closing cursors
+ * because we've already deadlocked and want to call txn_abort(). If
+ * we fail due to deadlock, we'll leave a locked, possibly empty page
+ * in the tree, which won't be empty long because we'll undo the delete
+ * when we undo the transaction's modifications.
+ *
+ * !!!
+ * Delete the key item first, otherwise the on-page duplicate checks
+ * in __bam_ditem() won't work!
+ */
+ if (TYPE(cp->page) == P_LBTREE) {
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ return (ret);
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc,
+ PGNO(cp->page), cp->indx, -1)) != 0)
+ return (ret);
+ }
+ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+ return (ret);
+ if (!empty_page)
+ if ((ret = __bam_ca_di(dbc, PGNO(cp->page), cp->indx, -1)) != 0)
+ return (ret);
+
+ /* If we're not going to try and delete the page, we're done. */
+ if (!delete_page)
+ return (0);
+
+ /*
+ * Call __bam_search to reacquire the empty leaf page, but this time
+ * get both the leaf page and its parent, locked. Jump back up the
+ * tree, until we have the top pair of pages that we want to delete.
+ * Once we have the top page that we want to delete locked, lock the
+ * underlying pages and check to make sure they're still empty. If
+ * they are, delete them.
+ */
+ for (level = LEAFLEVEL;; ++level) {
+ /* Acquire a page and its parent, locked. */
+ if ((ret = __bam_search(
+ dbc, &key, S_WRPAIR, level, NULL, &exact)) != 0)
+ return (ret);
+
+ /*
+ * If we reach the root or the parent page isn't going to be
+ * empty when we delete one record, stop.
+ */
+ h = cp->csp[-1].page;
+ if (h->pgno == cp->root || NUM_ENT(h) != 1)
+ break;
+
+ /* Discard the stack, retaining no locks. */
+ (void)__bam_stkrel(dbc, STK_NOLOCK);
+ }
+
+ /*
+ * Move the stack pointer one after the last entry, we may be about
+ * to push more items onto the page stack.
+ */
+ ++cp->csp;
+
+ /*
+ * cp->csp[-2].page is now the parent page, which we may or may not be
+ * going to delete, and cp->csp[-1].page is the first page we know we
+ * are going to delete. Walk down the chain of pages, acquiring pages
+ * until we've acquired a leaf page. Generally, this shouldn't happen;
+ * we should only see a single internal page with one item and a single
+ * leaf page with no items. The scenario where we could see something
+ * else is if reverse splits were turned off for awhile and then turned
+ * back on. That could result in all sorts of strangeness, e.g., empty
+ * pages in the tree, trees that looked like linked lists, and so on.
+ *
+ * !!!
+ * Sheer paranoia: if we find any pages that aren't going to be emptied
+ * by the delete, someone else added an item while we were walking the
+ * tree, and we discontinue the delete. Shouldn't be possible, but we
+ * check regardless.
+ */
+ for (h = cp->csp[-1].page;;) {
+ if (ISLEAF(h)) {
+ if (NUM_ENT(h) != 0)
+ break;
+ break;
+ } else
+ if (NUM_ENT(h) != 1)
+ break;
+
+ /*
+ * Get the next page, write lock it and push it onto the stack.
+ * We know it's index 0, because it can only have one element. 
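+ * (The switch below reads that one element's child page number;
+ * B-internal and R-internal entries lay the field out differently,
+ * hence the two page-type cases.)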
+ */ + switch (TYPE(h)) { + case P_IBTREE: + pgno = GET_BINTERNAL(h, 0)->pgno; + break; + case P_IRECNO: + pgno = GET_RINTERNAL(h, 0)->pgno; + break; + default: + return (__db_pgfmt(dbp, PGNO(h))); + } + + if ((ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &lock)) != 0) + break; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + break; + BT_STK_PUSH(dbp->dbenv, cp, h, 0, lock, DB_LOCK_WRITE, ret); + if (ret != 0) + break; + } + + /* Adjust the cursor stack to reference the last page on the stack. */ + BT_STK_POP(cp); + + /* + * If everything worked, delete the stack, otherwise, release the + * stack and page locks without further damage. + */ + if (ret == 0) + ret = __bam_dpages(dbc, cp->sp); + else + (void)__bam_stkrel(dbc, 0); + + return (ret); +} + +/* + * __bam_c_getstack -- + * Acquire a full stack for a cursor. + */ +static int +__bam_c_getstack(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT dbt; + PAGE *h; + int exact, ret, t_ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Get the page with the current item on it. The caller of this + * routine has to already hold a read lock on the page, so there + * is no additional lock to acquire. + */ + if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) + return (ret); + + /* Get a copy of a key from the page. */ + memset(&dbt, 0, sizeof(DBT)); + if ((ret = __db_ret(dbp, + h, 0, &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + goto err; + + /* Get a write-locked stack for the page. */ + exact = 0; + ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact); + +err: /* Discard the key and the page. */ + if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bam_isopd -- + * Return if the cursor references an off-page duplicate tree via its + * page number. + */ +static int +__bam_isopd(dbc, pgnop) + DBC *dbc; + db_pgno_t *pgnop; +{ + BOVERFLOW *bo; + + if (TYPE(dbc->internal->page) != P_LBTREE) + return (0); + + bo = GET_BOVERFLOW(dbc->internal->page, dbc->internal->indx + O_INDX); + if (B_TYPE(bo->type) == B_DUPLICATE) { + *pgnop = bo->pgno; + return (1); + } + return (0); +} diff --git a/db/btree/bt_delete.c b/db/btree/bt_delete.c new file mode 100644 index 000000000..972588788 --- /dev/null +++ b/db/btree/bt_delete.c @@ -0,0 +1,530 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_delete.c,v 11.31 2001/01/17 18:48:46 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "btree.h" +#include "lock.h" + +/* + * __bam_delete -- + * Delete the items referenced by a key. + * + * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); + */ +int +__bam_delete(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DBC *dbc; + DBT lkey; + DBT data; + u_int32_t f_init, f_next; + int ret, t_ret; + + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del"); + DB_CHECK_TXN(dbp, txn); + + /* Check for invalid flags. */ + if ((ret = + __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + return (ret); + + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags); + + /* + * Walk a cursor through the key/data pairs, deleting as we go. Set + * the DB_DBT_USERMEM flag, as this might be a threaded application + * and the flags checking will catch us. We don't actually want the + * keys or data, so request a partial of length 0. + */ + memset(&lkey, 0, sizeof(lkey)); + F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL); + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* + * If locking (and we haven't already acquired CDB locks), set the + * read-modify-write flag. + */ + f_init = DB_SET; + f_next = DB_NEXT_DUP; + if (STD_LOCKING(dbc)) { + f_init |= DB_RMW; + f_next |= DB_RMW; + } + + /* Walk through the set of key/data pairs, deleting as we go. */ + if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0) + goto err; + for (;;) { + if ((ret = dbc->c_del(dbc, 0)) != 0) + goto err; + if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } + goto err; + } + } + +err: /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bam_ditem -- + * Delete one or more entries from a page. 
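+ *
+ *	(On P_LBTREE leaf pages a logical record occupies two index
+ *	slots, key then data, and duplicates share a key by pointing
+ *	several index slots at one key item; that is why the duplicate
+ *	checks below can drop just the index entry.)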
+ * + * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t)); + */ +int +__bam_ditem(dbc, h, indx) + DBC *dbc; + PAGE *h; + u_int32_t indx; +{ + BINTERNAL *bi; + BKEYDATA *bk; + DB *dbp; + u_int32_t nbytes; + int ret; + + dbp = dbc->dbp; + + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(h, indx); + switch (B_TYPE(bi->type)) { + case B_DUPLICATE: + case B_KEYDATA: + nbytes = BINTERNAL_SIZE(bi->len); + break; + case B_OVERFLOW: + nbytes = BINTERNAL_SIZE(bi->len); + if ((ret = + __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, PGNO(h))); + } + break; + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + case P_LBTREE: + /* + * If it's a duplicate key, discard the index and don't touch + * the actual page item. + * + * !!! + * This works because no data item can have an index matching + * any other index so even if the data item is in a key "slot", + * it won't match any other index. + */ + if ((indx % 2) == 0) { + /* + * Check for a duplicate after us on the page. NOTE: + * we have to delete the key item before deleting the + * data item, otherwise the "indx + P_INDX" calculation + * won't work! + */ + if (indx + P_INDX < (u_int32_t)NUM_ENT(h) && + h->inp[indx] == h->inp[indx + P_INDX]) + return (__bam_adjindx(dbc, + h, indx, indx + O_INDX, 0)); + /* + * Check for a duplicate before us on the page. It + * doesn't matter if we delete the key item before or + * after the data item for the purposes of this one. + */ + if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) + return (__bam_adjindx(dbc, + h, indx, indx - P_INDX, 0)); + } + /* FALLTHROUGH */ + case P_LDUP: + case P_LRECNO: + bk = GET_BKEYDATA(h, indx); + switch (B_TYPE(bk->type)) { + case B_DUPLICATE: + nbytes = BOVERFLOW_SIZE; + break; + case B_OVERFLOW: + nbytes = BOVERFLOW_SIZE; + if ((ret = __db_doff( + dbc, (GET_BOVERFLOW(h, indx))->pgno)) != 0) + return (ret); + break; + case B_KEYDATA: + nbytes = BKEYDATA_SIZE(bk->len); + break; + default: + return (__db_pgfmt(dbp, PGNO(h))); + } + break; + default: + return (__db_pgfmt(dbp, PGNO(h))); + } + + /* Delete the item and mark the page dirty. */ + if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0) + return (ret); + if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + + return (0); +} + +/* + * __bam_adjindx -- + * Adjust an index on the page. + * + * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int)); + */ +int +__bam_adjindx(dbc, h, indx, indx_copy, is_insert) + DBC *dbc; + PAGE *h; + u_int32_t indx, indx_copy; + int is_insert; +{ + DB *dbp; + db_indx_t copy; + int ret; + + dbp = dbc->dbp; + + /* Log the change. */ + if (DB_LOGGING(dbc) && + (ret = __bam_adj_log(dbp->dbenv, dbc->txn, &LSN(h), + 0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy, + (u_int32_t)is_insert)) != 0) + return (ret); + + /* Shuffle the indices and mark the page dirty. */ + if (is_insert) { + copy = h->inp[indx_copy]; + if (indx != NUM_ENT(h)) + memmove(&h->inp[indx + O_INDX], &h->inp[indx], + sizeof(db_indx_t) * (NUM_ENT(h) - indx)); + h->inp[indx] = copy; + ++NUM_ENT(h); + } else { + --NUM_ENT(h); + if (indx != NUM_ENT(h)) + memmove(&h->inp[indx], &h->inp[indx + O_INDX], + sizeof(db_indx_t) * (NUM_ENT(h) - indx)); + } + if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + + return (0); +} + +/* + * __bam_dpages -- + * Delete a set of locked pages. 
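+ *
+ *	(A sketch of the calling contract: stack entries before
+ *	stack_epg are simply released, the page at stack_epg has one
+ *	item deleted, and every page after it on the stack is emptied
+ *	and freed.)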
+ * + * PUBLIC: int __bam_dpages __P((DBC *, EPG *)); + */ +int +__bam_dpages(dbc, stack_epg) + DBC *dbc; + EPG *stack_epg; +{ + BTREE_CURSOR *cp; + BINTERNAL *bi; + DB *dbp; + DBT a, b; + DB_LOCK c_lock, p_lock; + EPG *epg; + PAGE *child, *parent; + db_indx_t nitems; + db_pgno_t pgno, root_pgno; + db_recno_t rcnt; + int done, ret, t_ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * We have the entire stack of deletable pages locked. + * + * Btree calls us with a pointer to the beginning of a stack, where + * the first page in the stack is to have a single item deleted, and + * the rest of the pages are to be removed. + * + * Recno calls us with a pointer into the middle of the stack, where + * the referenced page is to have a single item deleted, and pages + * after the stack reference are to be removed. + * + * First, discard any pages that we don't care about. + */ + ret = 0; + for (epg = cp->sp; epg < stack_epg; ++epg) { + if ((t_ret = + memp_fput(dbp->mpf, epg->page, 0)) != 0 && ret == 0) + ret = t_ret; + (void)__TLPUT(dbc, epg->lock); + } + if (ret != 0) + goto err; + + /* + * !!! + * There is an interesting deadlock situation here. We have to relink + * the leaf page chain around the leaf page being deleted. Consider + * a cursor walking through the leaf pages, that has the previous page + * read-locked and is waiting on a lock for the page we're deleting. + * It will deadlock here. Before we unlink the subtree, we relink the + * leaf page chain. + */ + if ((ret = __db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0) + goto err; + + /* + * Delete the last item that references the underlying pages that are + * to be deleted, and adjust cursors that reference that page. Then, + * save that page's page number and item count and release it. If + * the application isn't retaining locks because it's running without + * transactions, this lets the rest of the tree get back to business + * immediately. + */ + if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) + goto err; + if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0) + goto err; + + pgno = PGNO(epg->page); + nitems = NUM_ENT(epg->page); + + if ((ret = memp_fput(dbp->mpf, epg->page, 0)) != 0) + goto err_inc; + (void)__TLPUT(dbc, epg->lock); + + /* Free the rest of the pages in the stack. */ + while (++epg <= cp->csp) { + /* + * Delete page entries so they will be restored as part of + * recovery. We don't need to do cursor adjustment here as + * the pages are being emptied by definition and so cannot + * be referenced by a cursor. + */ + if (NUM_ENT(epg->page) != 0) { + DB_ASSERT(NUM_ENT(epg->page) == 1); + + if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) + goto err; + } + + if ((ret = __db_free(dbc, epg->page)) != 0) { + epg->page = NULL; + goto err_inc; + } + (void)__TLPUT(dbc, epg->lock); + } + + if (0) { +err_inc: ++epg; +err: for (; epg <= cp->csp; ++epg) { + if (epg->page != NULL) + (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__TLPUT(dbc, epg->lock); + } + BT_STK_CLR(cp); + return (ret); + } + BT_STK_CLR(cp); + + /* + * If we just deleted the next-to-last item from the root page, the + * tree can collapse one or more levels. While there remains only a + * single item on the root page, write lock the last page referenced + * by the root page and copy it over the root page. + */ + root_pgno = cp->root; + if (pgno != root_pgno || nitems != 1) + return (0); + + for (done = 0; !done;) { + /* Initialize. 
*/ + parent = child = NULL; + p_lock.off = c_lock.off = LOCK_INVALID; + + /* Lock the root. */ + pgno = root_pgno; + if ((ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0) + goto stop; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0) + goto stop; + + if (NUM_ENT(parent) != 1) + goto stop; + + switch (TYPE(parent)) { + case P_IBTREE: + /* + * If this is overflow, then try to delete it. + * The child may or may not still point at it. + */ + bi = GET_BINTERNAL(parent, 0); + if (B_TYPE(bi->type) == B_OVERFLOW) + if ((ret = __db_doff(dbc, + ((BOVERFLOW *)bi->data)->pgno)) != 0) + goto stop; + pgno = bi->pgno; + break; + case P_IRECNO: + pgno = GET_RINTERNAL(parent, 0)->pgno; + break; + default: + goto stop; + } + + /* Lock the child page. */ + if ((ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0) + goto stop; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &child)) != 0) + goto stop; + + /* Log the change. */ + if (DB_LOGGING(dbc)) { + memset(&a, 0, sizeof(a)); + a.data = child; + a.size = dbp->pgsize; + memset(&b, 0, sizeof(b)); + b.data = P_ENTRY(parent, 0); + b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE : + BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); + if ((ret = + __bam_rsplit_log(dbp->dbenv, dbc->txn, &child->lsn, + 0, dbp->log_fileid, PGNO(child), &a, PGNO(parent), + RE_NREC(parent), &b, &parent->lsn)) != 0) + goto stop; + } + + /* + * Make the switch. + * + * One fixup -- internal pages below the top level do not store + * a record count, so we have to preserve it if we're not + * converting to a leaf page. Note also that we are about to + * overwrite the parent page, including its LSN. This is OK + * because the log message we wrote describing this update + * stores its LSN on the child page. When the child is copied + * onto the parent, the correct LSN is copied into place. + */ + COMPQUIET(rcnt, 0); + if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL) + rcnt = RE_NREC(parent); + memcpy(parent, child, dbp->pgsize); + PGNO(parent) = root_pgno; + if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL) + RE_NREC_SET(parent, rcnt); + + /* Mark the pages dirty. */ + if ((ret = memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY)) != 0) + goto stop; + if ((ret = memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY)) != 0) + goto stop; + + /* Adjust the cursors. */ + if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0) + goto stop; + + /* + * Free the page copied onto the root page and discard its + * lock. (The call to __db_free() discards our reference + * to the page.) + */ + if ((ret = __db_free(dbc, child)) != 0) { + child = NULL; + goto stop; + } + child = NULL; + + if (0) { +stop: done = 1; + } + if (p_lock.off != LOCK_INVALID) + (void)__TLPUT(dbc, p_lock); + if (parent != NULL && + (t_ret = memp_fput(dbp->mpf, parent, 0)) != 0 && ret == 0) + ret = t_ret; + if (c_lock.off != LOCK_INVALID) + (void)__TLPUT(dbc, c_lock); + if (child != NULL && + (t_ret = memp_fput(dbp->mpf, child, 0)) != 0 && ret == 0) + ret = t_ret; + } + + return (ret); +} diff --git a/db/btree/bt_method.c b/db/btree/bt_method.c new file mode 100644 index 000000000..5e3af27d0 --- /dev/null +++ b/db/btree/bt_method.c @@ -0,0 +1,387 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. 
+ */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_method.c,v 11.20 2000/11/30 00:58:28 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "qam.h" + +static int __bam_set_bt_compare + __P((DB *, int (*)(DB *, const DBT *, const DBT *))); +static int __bam_set_bt_maxkey __P((DB *, u_int32_t)); +static int __bam_set_bt_minkey __P((DB *, u_int32_t)); +static int __bam_set_bt_prefix + __P((DB *, size_t(*)(DB *, const DBT *, const DBT *))); +static int __ram_set_re_delim __P((DB *, int)); +static int __ram_set_re_len __P((DB *, u_int32_t)); +static int __ram_set_re_pad __P((DB *, int)); +static int __ram_set_re_source __P((DB *, const char *)); + +/* + * __bam_db_create -- + * Btree specific initialization of the DB structure. + * + * PUBLIC: int __bam_db_create __P((DB *)); + */ +int +__bam_db_create(dbp) + DB *dbp; +{ + BTREE *t; + int ret; + + /* Allocate and initialize the private btree structure. */ + if ((ret = __os_calloc(dbp->dbenv, 1, sizeof(BTREE), &t)) != 0) + return (ret); + dbp->bt_internal = t; + + t->bt_minkey = DEFMINKEYPAGE; /* Btree */ + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; + + dbp->set_bt_compare = __bam_set_bt_compare; + dbp->set_bt_maxkey = __bam_set_bt_maxkey; + dbp->set_bt_minkey = __bam_set_bt_minkey; + dbp->set_bt_prefix = __bam_set_bt_prefix; + + t->re_pad = ' '; /* Recno */ + t->re_delim = '\n'; + t->re_eof = 1; + + dbp->set_re_delim = __ram_set_re_delim; + dbp->set_re_len = __ram_set_re_len; + dbp->set_re_pad = __ram_set_re_pad; + dbp->set_re_source = __ram_set_re_source; + + return (0); +} + +/* + * __bam_db_close -- + * Btree specific discard of the DB structure. + * + * PUBLIC: int __bam_db_close __P((DB *)); + */ +int +__bam_db_close(dbp) + DB *dbp; +{ + BTREE *t; + + t = dbp->bt_internal; + /* Recno */ + /* Close any backing source file descriptor. */ + if (t->re_fp != NULL) + (void)fclose(t->re_fp); + + /* Free any backing source file name. */ + if (t->re_source != NULL) + __os_freestr(t->re_source); + + __os_free(t, sizeof(BTREE)); + dbp->bt_internal = NULL; + + return (0); +} + +/* + * __bam_set_flags -- + * Set Btree specific flags. + * + * PUBLIC: int __bam_set_flags __P((DB *, u_int32_t *flagsp)); + */ +int +__bam_set_flags(dbp, flagsp) + DB *dbp; + u_int32_t *flagsp; +{ + u_int32_t flags; + + flags = *flagsp; + if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF)) { + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags"); + + /* + * The DB_DUP and DB_DUPSORT flags are shared by the Hash + * and Btree access methods. + */ + if (LF_ISSET(DB_DUP | DB_DUPSORT)) + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); + + if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF)) + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + if (LF_ISSET(DB_DUP | DB_DUPSORT)) { + /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */ + if (F_ISSET(dbp, DB_BT_RECNUM)) + goto incompat; + + if (LF_ISSET(DB_DUPSORT)) { + if (dbp->dup_compare == NULL) + dbp->dup_compare = __bam_defcmp; + F_SET(dbp, DB_AM_DUPSORT); + } + + F_SET(dbp, DB_AM_DUP); + LF_CLR(DB_DUP | DB_DUPSORT); + } + + if (LF_ISSET(DB_RECNUM)) { + /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. 
*/
+ if (F_ISSET(dbp, DB_AM_DUP))
+ goto incompat;
+
+ F_SET(dbp, DB_BT_RECNUM);
+ LF_CLR(DB_RECNUM);
+ }
+
+ if (LF_ISSET(DB_REVSPLITOFF)) {
+ F_SET(dbp, DB_BT_REVSPLIT);
+ LF_CLR(DB_REVSPLITOFF);
+ }
+
+ *flagsp = flags;
+ }
+ return (0);
+
+incompat:
+ return (__db_ferr(dbp->dbenv, "DB->set_flags", 1));
+}
+
+/*
+ * __bam_set_bt_compare --
+ *	Set the comparison function.
+ */
+static int
+__bam_set_bt_compare(dbp, func)
+ DB *dbp;
+ int (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_compare");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ /*
+ * Can't default the prefix routine if the user supplies a comparison
+ * routine; shortening the keys can break their comparison algorithm.
+ */
+ t->bt_compare = func;
+ if (t->bt_prefix == __bam_defpfx)
+ t->bt_prefix = NULL;
+
+ return (0);
+}
+
+/*
+ * __bam_set_bt_maxkey --
+ *	Set the maximum keys per page.
+ */
+static int
+__bam_set_bt_maxkey(dbp, bt_maxkey)
+ DB *dbp;
+ u_int32_t bt_maxkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_maxkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_maxkey < 1) {
+ __db_err(dbp->dbenv, "minimum bt_maxkey value is 1");
+ return (EINVAL);
+ }
+
+ t->bt_maxkey = bt_maxkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_minkey --
+ *	Set the minimum keys per page.
+ */
+static int
+__bam_set_bt_minkey(dbp, bt_minkey)
+ DB *dbp;
+ u_int32_t bt_minkey;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_minkey");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ if (bt_minkey < 2) {
+ __db_err(dbp->dbenv, "minimum bt_minkey value is 2");
+ return (EINVAL);
+ }
+
+ t->bt_minkey = bt_minkey;
+ return (0);
+}
+
+/*
+ * __bam_set_bt_prefix --
+ *	Set the prefix function.
+ */
+static int
+__bam_set_bt_prefix(dbp, func)
+ DB *dbp;
+ size_t (*func) __P((DB *, const DBT *, const DBT *));
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_prefix");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE);
+
+ t = dbp->bt_internal;
+
+ t->bt_prefix = func;
+ return (0);
+}
+
+/*
+ * __ram_set_flags --
+ *	Set Recno specific flags.
+ *
+ * PUBLIC: int __ram_set_flags __P((DB *, u_int32_t *flagsp));
+ */
+int
+__ram_set_flags(dbp, flagsp)
+ DB *dbp;
+ u_int32_t *flagsp;
+{
+ u_int32_t flags;
+
+ flags = *flagsp;
+ if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) {
+ DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags");
+
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ if (LF_ISSET(DB_RENUMBER)) {
+ F_SET(dbp, DB_RE_RENUMBER);
+ LF_CLR(DB_RENUMBER);
+ }
+
+ if (LF_ISSET(DB_SNAPSHOT)) {
+ F_SET(dbp, DB_RE_SNAPSHOT);
+ LF_CLR(DB_SNAPSHOT);
+ }
+
+ *flagsp = flags;
+ }
+ return (0);
+}
+
+/*
+ * __ram_set_re_delim --
+ *	Set the variable-length input record delimiter.
+ */
+static int
+__ram_set_re_delim(dbp, re_delim)
+ DB *dbp;
+ int re_delim;
+{
+ BTREE *t;
+
+ DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_delim");
+ DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO);
+
+ t = dbp->bt_internal;
+
+ t->re_delim = re_delim;
+ F_SET(dbp, DB_RE_DELIMITER);
+
+ return (0);
+}
+
+/*
+ * __ram_set_re_len --
+ *	Set the fixed-length input record length. 
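+ *
+ *	An illustrative configuration sketch (the values below are
+ *	hypothetical):
+ *
+ *		dbp->set_re_len(dbp, 128);	-- 128-byte records
+ *		dbp->set_re_pad(dbp, ' ');	-- pad short input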
+ */ +static int +__ram_set_re_len(dbp, re_len) + DB *dbp; + u_int32_t re_len; +{ + BTREE *t; + QUEUE *q; + + DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_len"); + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + t = dbp->bt_internal; + t->re_len = re_len; + + q = dbp->q_internal; + q->re_len = re_len; + + F_SET(dbp, DB_RE_FIXEDLEN); + + return (0); +} + +/* + * __ram_set_re_pad -- + * Set the fixed-length record pad character. + */ +static int +__ram_set_re_pad(dbp, re_pad) + DB *dbp; + int re_pad; +{ + BTREE *t; + QUEUE *q; + + DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_pad"); + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + t = dbp->bt_internal; + t->re_pad = re_pad; + + q = dbp->q_internal; + q->re_pad = re_pad; + + F_SET(dbp, DB_RE_PAD); + + return (0); +} + +/* + * __ram_set_re_source -- + * Set the backing source file name. + */ +static int +__ram_set_re_source(dbp, re_source) + DB *dbp; + const char *re_source; +{ + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_source"); + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + + t = dbp->bt_internal; + + return (__os_strdup(dbp->dbenv, re_source, &t->re_source)); +} diff --git a/db/btree/bt_open.c b/db/btree/bt_open.c new file mode 100644 index 000000000..405c1880f --- /dev/null +++ b/db/btree/bt_open.c @@ -0,0 +1,468 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_open.c,v 11.42 2000/11/30 00:58:28 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <limits.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "btree.h" +#include "db_shash.h" +#include "lock.h" +#include "log.h" +#include "mp.h" + +/* + * __bam_open -- + * Open a btree. + * + * PUBLIC: int __bam_open __P((DB *, const char *, db_pgno_t, u_int32_t)); + */ +int +__bam_open(dbp, name, base_pgno, flags) + DB *dbp; + const char *name; + db_pgno_t base_pgno; + u_int32_t flags; +{ + BTREE *t; + + t = dbp->bt_internal; + + /* Initialize the remaining fields/methods of the DB. */ + dbp->del = __bam_delete; + dbp->key_range = __bam_key_range; + dbp->stat = __bam_stat; + + /* + * We don't permit the user to specify a prefix routine if they didn't + * also specify a comparison routine, they can't know enough about our + * comparison routine to get it right. + */ + if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) { + __db_err(dbp->dbenv, +"prefix comparison may not be specified for default comparison routine"); + return (EINVAL); + } + + /* + * Verify that the bt_minkey value specified won't cause the + * calculation of ovflsize to underflow [#2406] for this pagesize. + */ + if (B_MINKEY_TO_OVFLSIZE(t->bt_minkey, dbp->pgsize) > + B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) { + __db_err(dbp->dbenv, + "bt_minkey value of %lu too high for page size of %lu", + (u_long)t->bt_minkey, (u_long)dbp->pgsize); + return (EINVAL); + } + + /* Start up the tree. */ + return (__bam_read_root(dbp, name, base_pgno, flags)); +} + +/* + * __bam_metachk -- + * + * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *)); + */ +int +__bam_metachk(dbp, name, btm) + DB *dbp; + const char *name; + BTMETA *btm; +{ + DB_ENV *dbenv; + u_int32_t vers; + int ret; + + dbenv = dbp->dbenv; + + /* + * At this point, all we know is that the magic number is for a Btree. + * Check the version, the database may be out of date. + */ + vers = btm->dbmeta.version; + if (F_ISSET(dbp, DB_AM_SWAP)) + M_32_SWAP(vers); + switch (vers) { + case 6: + case 7: + __db_err(dbenv, + "%s: btree version %lu requires a version upgrade", + name, (u_long)vers); + return (DB_OLD_VERSION); + case 8: + break; + default: + __db_err(dbenv, + "%s: unsupported btree version: %lu", name, (u_long)vers); + return (EINVAL); + } + + /* Swap the page if we need to. */ + if (F_ISSET(dbp, DB_AM_SWAP) && (ret = __bam_mswap((PAGE *)btm)) != 0) + return (ret); + + /* + * Check application info against metadata info, and set info, flags, + * and type based on metadata info. 
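+ * (Each check below follows one pattern: a flag set in the
+ * metadata requires or sets the matching in-memory flag, while an
+ * application flag the database lacks is an EINVAL error.)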
+ */ + if ((ret = + __db_fchk(dbenv, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0) + return (ret); + + if (F_ISSET(&btm->dbmeta, BTM_RECNO)) { + if (dbp->type == DB_BTREE) + goto wrong_type; + dbp->type = DB_RECNO; + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + } else { + if (dbp->type == DB_RECNO) + goto wrong_type; + dbp->type = DB_BTREE; + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + } + + if (F_ISSET(&btm->dbmeta, BTM_DUP)) + F_SET(dbp, DB_AM_DUP); + else + if (F_ISSET(dbp, DB_AM_DUP)) { + __db_err(dbenv, + "%s: DB_DUP specified to open method but not set in database", + name); + return (EINVAL); + } + + if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) { + if (dbp->type != DB_BTREE) + goto wrong_type; + F_SET(dbp, DB_BT_RECNUM); + + if ((ret = __db_fcchk(dbenv, + "DB->open", dbp->flags, DB_AM_DUP, DB_BT_RECNUM)) != 0) + return (ret); + } else + if (F_ISSET(dbp, DB_BT_RECNUM)) { + __db_err(dbenv, + "%s: DB_RECNUM specified to open method but not set in database", + name); + return (EINVAL); + } + + if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) { + if (dbp->type != DB_RECNO) + goto wrong_type; + F_SET(dbp, DB_RE_FIXEDLEN); + } else + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + __db_err(dbenv, + "%s: DB_FIXEDLEN specified to open method but not set in database", + name); + return (EINVAL); + } + + if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) { + if (dbp->type != DB_RECNO) + goto wrong_type; + F_SET(dbp, DB_RE_RENUMBER); + } else + if (F_ISSET(dbp, DB_RE_RENUMBER)) { + __db_err(dbenv, + "%s: DB_RENUMBER specified to open method but not set in database", + name); + return (EINVAL); + } + + if (F_ISSET(&btm->dbmeta, BTM_SUBDB)) + F_SET(dbp, DB_AM_SUBDB); + else + if (F_ISSET(dbp, DB_AM_SUBDB)) { + __db_err(dbenv, + "%s: multiple databases specified but not supported by file", + name); + return (EINVAL); + } + + if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) { + if (dbp->dup_compare == NULL) + dbp->dup_compare = __bam_defcmp; + F_SET(dbp, DB_AM_DUPSORT); + } else + if (dbp->dup_compare != NULL) { + __db_err(dbenv, + "%s: duplicate sort specified but not supported in database", + name); + return (EINVAL); + } + + /* Set the page size. */ + dbp->pgsize = btm->dbmeta.pagesize; + + /* Copy the file's ID. */ + memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN); + + return (0); + +wrong_type: + if (dbp->type == DB_BTREE) + __db_err(dbenv, + "open method type is Btree, database type is Recno"); + else + __db_err(dbenv, + "open method type is Recno, database type is Btree"); + return (EINVAL); +} + +/* + * __bam_read_root -- + * Check (and optionally create) a tree. + * + * PUBLIC: int __bam_read_root __P((DB *, const char *, db_pgno_t, u_int32_t)); + */ +int +__bam_read_root(dbp, name, base_pgno, flags) + DB *dbp; + const char *name; + db_pgno_t base_pgno; + u_int32_t flags; +{ + BTMETA *meta; + BTREE *t; + DBC *dbc; + DB_LSN orig_lsn; + DB_LOCK metalock; + PAGE *root; + int locked, ret, t_ret; + + ret = 0; + t = dbp->bt_internal; + meta = NULL; + root = NULL; + locked = 0; + + /* + * Get a cursor. If DB_CREATE is specified, we may be creating + * the root page, and to do that safely in CDB we need a write + * cursor. In STD_LOCKING mode, we'll synchronize using the + * meta page lock instead. + */ + if ((ret = dbp->cursor(dbp, dbp->open_txn, + &dbc, LF_ISSET(DB_CREATE) && CDB_LOCKING(dbp->dbenv) ? + DB_WRITECURSOR : 0)) != 0) + return (ret); + + /* Get, and optionally create the metadata page. 
*/
+ if ((ret =
+ __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = memp_fget(
+ dbp->mpf, &base_pgno, DB_MPOOL_CREATE, (PAGE **)&meta)) != 0)
+ goto err;
+
+ /*
+ * If the magic number is correct, we're not creating the tree.
+ * Correct any fields that may not be right. Note, all of the
+ * local flags were set by DB->open.
+ */
+again: if (meta->dbmeta.magic != 0) {
+ t->bt_maxkey = meta->maxkey;
+ t->bt_minkey = meta->minkey;
+ t->re_pad = meta->re_pad;
+ t->re_len = meta->re_len;
+
+ t->bt_meta = base_pgno;
+ t->bt_root = meta->root;
+
+ (void)memp_fput(dbp->mpf, meta, 0);
+ meta = NULL;
+ goto done;
+ }
+
+ /* In recovery, if it's not there, it will be created elsewhere. */
+ if (IS_RECOVERING(dbp->dbenv))
+ goto done;
+
+ /* If we're doing CDB, we now have to get the write lock. */
+ if (CDB_LOCKING(dbp->dbenv)) {
+ /*
+ * We'd better have DB_CREATE set if we're actually doing
+ * the create.
+ */
+ DB_ASSERT(LF_ISSET(DB_CREATE));
+ if ((ret = lock_get(dbp->dbenv, dbc->locker, DB_LOCK_UPGRADE,
+ &dbc->lock_dbt, DB_LOCK_WRITE, &dbc->mylock)) != 0)
+ goto err;
+ }
+
+ /*
+ * If we are doing locking, release the read lock and get a write lock.
+ * We want to avoid deadlock.
+ */
+ if (locked == 0 && STD_LOCKING(dbc)) {
+ if ((ret = __LPUT(dbc, metalock)) != 0)
+ goto err;
+ if ((ret = __db_lget(dbc,
+ 0, base_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ locked = 1;
+ goto again;
+ }
+
+ /* Initialize the tree structure metadata information. */
+ orig_lsn = meta->dbmeta.lsn;
+ memset(meta, 0, sizeof(BTMETA));
+ meta->dbmeta.lsn = orig_lsn;
+ meta->dbmeta.pgno = base_pgno;
+ meta->dbmeta.magic = DB_BTREEMAGIC;
+ meta->dbmeta.version = DB_BTREEVERSION;
+ meta->dbmeta.pagesize = dbp->pgsize;
+ meta->dbmeta.type = P_BTREEMETA;
+ meta->dbmeta.free = PGNO_INVALID;
+ if (F_ISSET(dbp, DB_AM_DUP))
+ F_SET(&meta->dbmeta, BTM_DUP);
+ if (F_ISSET(dbp, DB_RE_FIXEDLEN))
+ F_SET(&meta->dbmeta, BTM_FIXEDLEN);
+ if (F_ISSET(dbp, DB_BT_RECNUM))
+ F_SET(&meta->dbmeta, BTM_RECNUM);
+ if (F_ISSET(dbp, DB_RE_RENUMBER))
+ F_SET(&meta->dbmeta, BTM_RENUMBER);
+ if (F_ISSET(dbp, DB_AM_SUBDB))
+ F_SET(&meta->dbmeta, BTM_SUBDB);
+ if (dbp->dup_compare != NULL)
+ F_SET(&meta->dbmeta, BTM_DUPSORT);
+ if (dbp->type == DB_RECNO)
+ F_SET(&meta->dbmeta, BTM_RECNO);
+ memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN);
+
+ meta->maxkey = t->bt_maxkey;
+ meta->minkey = t->bt_minkey;
+ meta->re_len = t->re_len;
+ meta->re_pad = t->re_pad;
+
+ /* If necessary, log the meta-data and root page creates. */
+ if ((ret = __db_log_page(dbp,
+ name, &orig_lsn, base_pgno, (PAGE *)meta)) != 0)
+ goto err;
+
+ /* Create and initialize a root page. */
+ if ((ret = __db_new(dbc,
+ dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, &root)) != 0)
+ goto err;
+ root->level = LEAFLEVEL;
+
+ if (dbp->open_txn != NULL && (ret = __bam_root_log(dbp->dbenv,
+ dbp->open_txn, &meta->dbmeta.lsn, 0, dbp->log_fileid,
+ meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0)
+ goto err;
+
+ meta->root = root->pgno;
+
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name);
+ if ((ret = __db_log_page(dbp,
+ name, &root->lsn, root->pgno, root)) != 0)
+ goto err;
+ DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, name);
+
+ t->bt_meta = base_pgno;
+ t->bt_root = root->pgno;
+
+ /* Release the metadata and root pages. 
*/ + if ((ret = memp_fput(dbp->mpf, meta, DB_MPOOL_DIRTY)) != 0) + goto err; + meta = NULL; + if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0) + goto err; + root = NULL; + + /* + * Flush the metadata and root pages to disk. + * + * !!! + * It's not useful to return not-yet-flushed here -- convert it to + * an error. + */ + if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) { + __db_err(dbp->dbenv, "Metapage flush failed"); + ret = EINVAL; + } + DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name); + +done: /* + * !!! + * We already did an insert and so the last-page-inserted has been + * set. I'm not sure where the *right* place to clear this value + * is, it's not intuitively obvious that it belongs here. + */ + t->bt_lpgno = PGNO_INVALID; + +err: +DB_TEST_RECOVERY_LABEL + /* Put any remaining pages back. */ + if (meta != NULL) + if ((t_ret = memp_fput(dbp->mpf, meta, 0)) != 0 && + ret == 0) + ret = t_ret; + if (root != NULL) + if ((t_ret = memp_fput(dbp->mpf, root, 0)) != 0 && + ret == 0) + ret = t_ret; + + /* We can release the metapage lock when we are done. */ + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} diff --git a/db/btree/bt_put.c b/db/btree/bt_put.c new file mode 100644 index 000000000..19a04526d --- /dev/null +++ b/db/btree/bt_put.c @@ -0,0 +1,859 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_put.c,v 11.46 2001/01/17 18:48:46 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+
+static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t));
+static int __bam_ovput
+    __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
+
+/*
+ * __bam_iitem --
+ *	Insert an item into the tree.
+ *
+ * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
+ */
+int
+__bam_iitem(dbc, key, data, op, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t op, flags;
+{
+	BKEYDATA *bk, bk_tmp;
+	BTREE *t;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DBT bk_hdr, tdbt;
+	PAGE *h;
+	db_indx_t indx;
+	u_int32_t data_size, have_bytes, need_bytes, needed;
+	int cmp, bigkey, bigdata, dupadjust, padrec, replace, ret, was_deleted;
+
+	COMPQUIET(bk, NULL);
+
+	dbp = dbc->dbp;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	t = dbp->bt_internal;
+	h = cp->page;
+	indx = cp->indx;
+	dupadjust = replace = was_deleted = 0;
+
+	/*
+	 * Fixed-length records with partial puts: it's an error to specify
+	 * anything other than a simple overwrite.
+	 */
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN) &&
+	    F_ISSET(data, DB_DBT_PARTIAL) && data->dlen != data->size) {
+		data_size = data->size;
+		goto len_err;
+	}
+
+	/*
+	 * Figure out how much space the data will take, including if it's a
+	 * partial record.
+	 *
+	 * Fixed-length records: it's an error to specify a record that's
+	 * longer than the fixed-length, and we never require less than
+	 * the fixed-length record size.
+	 */
+	data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+	    __bam_partsize(op, data, h, indx) : data->size;
+	padrec = 0;
+	if (F_ISSET(dbp, DB_RE_FIXEDLEN)) {
+		if (data_size > t->re_len) {
+len_err:		__db_err(dbp->dbenv,
+			    "Length improper for fixed length record %lu",
+			    (u_long)data_size);
+			return (EINVAL);
+		}
+		if (data_size < t->re_len) {
+			padrec = 1;
+			data_size = t->re_len;
+		}
+	}
+
+	/*
+	 * Handle partial puts or short fixed-length records: build the
+	 * real record.
+	 */
+	if (padrec || F_ISSET(data, DB_DBT_PARTIAL)) {
+		tdbt = *data;
+		if ((ret =
+		    __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
+			return (ret);
+		data = &tdbt;
+	}
+
+	/*
+	 * If the user has specified a duplicate comparison function, return
+	 * an error if DB_CURRENT was specified and the replacement data
+	 * doesn't compare equal to the current data.  This stops apps from
+	 * screwing up the duplicate sort order.  We have to do this after
+	 * we build the real record so that we're comparing the real items.
+	 */
+	if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+		if ((ret = __bam_cmp(dbp, data, h,
+		    indx + (TYPE(h) == P_LBTREE ? O_INDX : 0),
+		    dbp->dup_compare, &cmp)) != 0)
+			return (ret);
+		if (cmp != 0) {
+			__db_err(dbp->dbenv,
+			    "Current data differs from put data");
+			return (EINVAL);
+		}
+	}
+
+	/*
+	 * If the key or data item won't fit on a page, we'll have to store
+	 * them on overflow pages.
+	 */
+	needed = 0;
+	bigdata = data_size > cp->ovflsize;
+	switch (op) {
+	case DB_KEYFIRST:
+		/*
+		 * We're adding a new key and data pair.
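+		 * For example, if both items fit on-page, the cost is
+		 * BKEYDATA_PSIZE(key->size) + BKEYDATA_PSIZE(data_size);
+		 * an item larger than cp->ovflsize goes off-page instead
+		 * and costs a fixed BOVERFLOW_PSIZE on this page.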
*/ + bigkey = key->size > cp->ovflsize; + if (bigkey) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(key->size); + if (bigdata) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(data_size); + break; + case DB_AFTER: + case DB_BEFORE: + case DB_CURRENT: + /* + * We're either overwriting the data item of a key/data pair + * or we're creating a new on-page duplicate and only adding + * a data item. + * + * !!! + * We're not currently correcting for space reclaimed from + * already deleted items, but I don't think it's worth the + * complexity. + */ + bigkey = 0; + if (op == DB_CURRENT) { + bk = GET_BKEYDATA(h, + indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (B_TYPE(bk->type) == B_KEYDATA) + have_bytes = BKEYDATA_PSIZE(bk->len); + else + have_bytes = BOVERFLOW_PSIZE; + need_bytes = 0; + } else { + have_bytes = 0; + need_bytes = sizeof(db_indx_t); + } + if (bigdata) + need_bytes += BOVERFLOW_PSIZE; + else + need_bytes += BKEYDATA_PSIZE(data_size); + + if (have_bytes < need_bytes) + needed += need_bytes - have_bytes; + break; + default: + return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op)); + } + + /* + * If there's not enough room, or the user has put a ceiling on the + * number of keys permitted in the page, split the page. + * + * XXX + * The t->bt_maxkey test here may be insufficient -- do we have to + * check in the btree split code, so we don't undo it there!?!? + */ + if (P_FREESPACE(h) < needed || + (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) + return (DB_NEEDSPLIT); + + /* + * The code breaks it up into five cases: + * + * 1. Insert a new key/data pair. + * 2. Append a new data item (a new duplicate). + * 3. Insert a new data item (a new duplicate). + * 4. Delete and re-add the data item (overflow item). + * 5. Overwrite the data item. + */ + switch (op) { + case DB_KEYFIRST: /* 1. Insert a new key/data pair. */ + if (bigkey) { + if ((ret = __bam_ovput(dbc, + B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0) + return (ret); + } else + if ((ret = __db_pitem(dbc, h, indx, + BKEYDATA_SIZE(key->size), NULL, key)) != 0) + return (ret); + + if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0) + return (ret); + ++indx; + break; + case DB_AFTER: /* 2. Append a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* Copy the key for the duplicate and adjust cursors. */ + if ((ret = + __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0) + return (ret); + if ((ret = + __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0) + return (ret); + + indx += 3; + dupadjust = 1; + + cp->indx += 2; + } else { + ++indx; + cp->indx += 1; + } + break; + case DB_BEFORE: /* 3. Insert a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* Copy the key for the duplicate and adjust cursors. */ + if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0) + return (ret); + if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0) + return (ret); + + ++indx; + dupadjust = 1; + } + break; + case DB_CURRENT: + /* + * Clear the cursor's deleted flag. The problem is that if + * we deadlock or fail while deleting the overflow item or + * replacing the non-overflow item, a subsequent cursor close + * will try and remove the item because the cursor's delete + * flag is set + */ + (void)__bam_ca_delete(dbp, PGNO(h), indx, 0); + + if (TYPE(h) == P_LBTREE) { + ++indx; + dupadjust = 1; + + /* + * In a Btree deleted records aren't counted (deleted + * records are counted in a Recno because all accesses + * are based on record number). 
+			 * If it's a Btree and it's a DB_CURRENT operation
+			 * overwriting a previously deleted record, increment
+			 * the record count.
+			 */
+			was_deleted = B_DISSET(bk->type);
+		}
+
+		/*
+		 * 4. Delete and re-add the data item.
+		 *
+		 * If we're changing the type of the on-page structure, or we
+		 * are referencing offpage items, we have to delete and then
+		 * re-add the item.  We do not do any cursor adjustments here
+		 * because we're going to immediately re-add the item into the
+		 * same slot.
+		 */
+		if (bigdata || B_TYPE(bk->type) != B_KEYDATA) {
+			if ((ret = __bam_ditem(dbc, h, indx)) != 0)
+				return (ret);
+			break;
+		}
+
+		/* 5. Overwrite the data item. */
+		replace = 1;
+		break;
+	default:
+		return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op));
+	}
+
+	/* Add the data. */
+	if (bigdata) {
+		if ((ret = __bam_ovput(dbc,
+		    B_OVERFLOW, PGNO_INVALID, h, indx, data)) != 0)
+			return (ret);
+	} else {
+		if (LF_ISSET(BI_DELETED)) {
+			B_TSET(bk_tmp.type, B_KEYDATA, 1);
+			bk_tmp.len = data->size;
+			bk_hdr.data = &bk_tmp;
+			bk_hdr.size = SSZA(BKEYDATA, data);
+			ret = __db_pitem(dbc, h, indx,
+			    BKEYDATA_SIZE(data->size), &bk_hdr, data);
+		} else if (replace)
+			ret = __bam_ritem(dbc, h, indx, data);
+		else
+			ret = __db_pitem(dbc, h, indx,
+			    BKEYDATA_SIZE(data->size), NULL, data);
+		if (ret != 0)
+			return (ret);
+	}
+	if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0)
+		return (ret);
+
+	/*
+	 * Re-position the cursors if necessary and reset the current cursor
+	 * to point to the new item.
+	 */
+	if (op != DB_CURRENT) {
+		if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0)
+			return (ret);
+		cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx;
+	}
+
+	/*
+	 * If we've changed the record count, update the tree.  There's no
+	 * need to adjust the count if the operation was not performed on
+	 * the current record, or if the current record was previously
+	 * deleted.
+	 */
+	if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted))
+		if ((ret = __bam_adjust(dbc, 1)) != 0)
+			return (ret);
+
+	/*
+	 * If a Btree leaf page is at least 50% full and we may have added or
+	 * modified a duplicate data item, see if the set of duplicates takes
+	 * up at least 25% of the space on the page.  If it does, move it onto
+	 * its own page.
+	 */
+	if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) {
+		if ((ret = __bam_dup_convert(dbc, h, indx - O_INDX)) != 0)
+			return (ret);
+	}
+
+	/* If we've modified a recno file, set the flag. */
+	if (dbc->dbtype == DB_RECNO)
+		t->re_modified = 1;
+
+	return (ret);
+}
+
+/*
+ * __bam_partsize --
+ *	Figure out how much space a partial data item is in total.
+ *
+ * PUBLIC: u_int32_t __bam_partsize __P((u_int32_t, DBT *, PAGE *, u_int32_t));
+ */
+u_int32_t
+__bam_partsize(op, data, h, indx)
+	u_int32_t op, indx;
+	DBT *data;
+	PAGE *h;
+{
+	BKEYDATA *bk;
+	u_int32_t nbytes;
+
+	/*
+	 * If the record doesn't already exist, it's simply the data we're
+	 * provided.
+	 */
+	if (op != DB_CURRENT)
+		return (data->doff + data->size);
+
+	/*
+	 * Otherwise, it's the data provided plus any already existing data
+	 * that we're not replacing.
+	 */
+	bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+	nbytes =
+	    B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len;
+
+	/*
+	 * There are really two cases here:
+	 *
+	 * Case 1: We are replacing some bytes that do not exist (i.e., they
+	 * are past the end of the record).  In this case the number of bytes
+	 * we are replacing is irrelevant and all we care about is how many
+	 * bytes we are going to add from offset.
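+	 * (Illustration: replacing 10 bytes at offset 20 of a 15-byte
+	 * record -- none of the bytes being replaced exist, so dlen is
+	 * irrelevant and the result is doff + size = 30 bytes.)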
+	 * So, the new record length is going to be the size of the new
+	 * bytes (size) plus wherever those new bytes begin (doff).
+	 *
+	 * Case 2: All the bytes we are replacing exist.  Therefore, the new
+	 * size is the oldsize (nbytes) minus the bytes we are replacing (dlen)
+	 * plus the bytes we are adding (size).
+	 */
+	if (nbytes < data->doff + data->dlen)		/* Case 1 */
+		return (data->doff + data->size);
+
+	return (nbytes + data->size - data->dlen);	/* Case 2 */
+}
+
+/*
+ * __bam_build --
+ *	Build the real record for a partial put, or short fixed-length record.
+ *
+ * PUBLIC: int __bam_build __P((DBC *, u_int32_t,
+ * PUBLIC:     DBT *, PAGE *, u_int32_t, u_int32_t));
+ */
+int
+__bam_build(dbc, op, dbt, h, indx, nbytes)
+	DBC *dbc;
+	u_int32_t op, indx, nbytes;
+	DBT *dbt;
+	PAGE *h;
+{
+	BKEYDATA *bk, tbk;
+	BOVERFLOW *bo;
+	BTREE *t;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DBT copy;
+	u_int32_t len, tlen;
+	u_int8_t *p;
+	int ret;
+
+	COMPQUIET(bo, NULL);
+
+	dbp = dbc->dbp;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	t = dbp->bt_internal;
+
+	/* We use the record data return memory, it's only a short-term use. */
+	if (dbc->rdata.ulen < nbytes) {
+		if ((ret = __os_realloc(dbp->dbenv,
+		    nbytes, NULL, &dbc->rdata.data)) != 0) {
+			dbc->rdata.ulen = 0;
+			dbc->rdata.data = NULL;
+			return (ret);
+		}
+		dbc->rdata.ulen = nbytes;
+	}
+
+	/*
+	 * We use nul or pad bytes for any part of the record that isn't
+	 * specified; get it over with.
+	 */
+	memset(dbc->rdata.data,
+	    F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_pad : 0, nbytes);
+
+	/*
+	 * In the next clauses, we need to do three things: a) set p to point
+	 * to the place at which to copy the user's data, b) set tlen to the
+	 * total length of the record, not including the bytes contributed by
+	 * the user, and c) copy any valid data from an existing record.  If
+	 * it's not a partial put (this code is called for both partial puts
+	 * and fixed-length record padding) or it's a new key, we can cut to
+	 * the chase.
+	 */
+	if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) {
+		p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+		tlen = dbt->doff;
+		goto user_copy;
+	}
+
+	/* Find the current record. */
+	if (indx < NUM_ENT(h)) {
+		bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+		bo = (BOVERFLOW *)bk;
+	} else {
+		bk = &tbk;
+		B_TSET(bk->type, B_KEYDATA, 0);
+		bk->len = 0;
+	}
+	if (B_TYPE(bk->type) == B_OVERFLOW) {
+		/*
+		 * In the case of an overflow record, we shift things around
+		 * in the current record rather than allocate a separate copy.
+		 */
+		memset(&copy, 0, sizeof(copy));
+		if ((ret = __db_goff(dbp, &copy, bo->tlen,
+		    bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0)
+			return (ret);
+
+		/* Skip any leading data from the original record. */
+		tlen = dbt->doff;
+		p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+
+		/*
+		 * Copy in any trailing data from the original record.
+		 *
+		 * If the original record was larger than the original offset
+		 * plus the bytes being deleted, there is trailing data in the
+		 * original record we need to preserve.  If we aren't deleting
+		 * the same number of bytes as we're inserting, copy it up or
+		 * down, into place.
+		 *
+		 * Use memmove(), the regions may overlap.
+		 */
+		if (bo->tlen > dbt->doff + dbt->dlen) {
+			len = bo->tlen - (dbt->doff + dbt->dlen);
+			if (dbt->dlen != dbt->size)
+				memmove(p + dbt->size, p + dbt->dlen, len);
+			tlen += len;
+		}
+	} else {
+		/* Copy in any leading data from the original record. */
+		memcpy(dbc->rdata.data,
+		    bk->data, dbt->doff > bk->len ? bk->len : dbt->doff);
+		tlen = dbt->doff;
+		p = (u_int8_t *)dbc->rdata.data + dbt->doff;
+
+		/* Copy in any trailing data from the original record. */
+		len = dbt->doff + dbt->dlen;
+		if (bk->len > len) {
+			memcpy(p + dbt->size, bk->data + len, bk->len - len);
+			tlen += bk->len - len;
+		}
+	}
+
+user_copy:
+	/*
+	 * Copy in the application provided data -- p and tlen must have been
+	 * initialized above.
+	 */
+	memcpy(p, dbt->data, dbt->size);
+	tlen += dbt->size;
+
+	/* Set the DBT to reference our new record. */
+	dbc->rdata.size = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : tlen;
+	dbc->rdata.dlen = 0;
+	dbc->rdata.doff = 0;
+	dbc->rdata.flags = 0;
+	*dbt = dbc->rdata;
+	return (0);
+}
+
+/*
+ * __bam_ritem --
+ *	Replace an item on a page.
+ *
+ * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *));
+ */
+int
+__bam_ritem(dbc, h, indx, data)
+	DBC *dbc;
+	PAGE *h;
+	u_int32_t indx;
+	DBT *data;
+{
+	BKEYDATA *bk;
+	DB *dbp;
+	DBT orig, repl;
+	db_indx_t cnt, lo, ln, min, off, prefix, suffix;
+	int32_t nbytes;
+	int ret;
+	u_int8_t *p, *t;
+
+	dbp = dbc->dbp;
+
+	/*
+	 * Replace a single item onto a page.  The logic figuring out where
+	 * to insert and whether it fits is handled in the caller.  All we do
+	 * here is manage the page shuffling.
+	 */
+	bk = GET_BKEYDATA(h, indx);
+
+	/* Log the change. */
+	if (DB_LOGGING(dbc)) {
+		/*
+		 * We might as well check to see if the two data items share
+		 * a common prefix and suffix -- it can save us a lot of log
+		 * space if they're large.
+		 */
+		min = data->size < bk->len ? data->size : bk->len;
+		for (prefix = 0,
+		    p = bk->data, t = data->data;
+		    prefix < min && *p == *t; ++prefix, ++p, ++t)
+			;
+
+		min -= prefix;
+		for (suffix = 0,
+		    p = (u_int8_t *)bk->data + bk->len - 1,
+		    t = (u_int8_t *)data->data + data->size - 1;
+		    suffix < min && *p == *t; ++suffix, --p, --t)
+			;
+
+		/* We only log the parts of the keys that have changed. */
+		orig.data = (u_int8_t *)bk->data + prefix;
+		orig.size = bk->len - (prefix + suffix);
+		repl.data = (u_int8_t *)data->data + prefix;
+		repl.size = data->size - (prefix + suffix);
+		if ((ret = __bam_repl_log(dbp->dbenv, dbc->txn,
+		    &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h),
+		    (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type),
+		    &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0)
+			return (ret);
+	}
+
+	/*
+	 * Set references to the first in-use byte on the page and the
+	 * first byte of the item being replaced.
+	 */
+	p = (u_int8_t *)h + HOFFSET(h);
+	t = (u_int8_t *)bk;
+
+	/*
+	 * If the entry is growing in size, shift the beginning of the data
+	 * part of the page down.  If the entry is shrinking in size, shift
+	 * the beginning of the data part of the page up.  Use memmove(3),
+	 * the regions overlap.
+	 */
+	lo = BKEYDATA_SIZE(bk->len);
+	ln = BKEYDATA_SIZE(data->size);
+	if (lo != ln) {
+		nbytes = lo - ln;		/* Signed difference. */
+		if (p == t)			/* First index is fast. */
+			h->inp[indx] += nbytes;
+		else {				/* Else, shift the page. */
+			memmove(p + nbytes, p, t - p);
+
+			/* Adjust the indices' offsets. */
+			off = h->inp[indx];
+			for (cnt = 0; cnt < NUM_ENT(h); ++cnt)
+				if (h->inp[cnt] <= off)
+					h->inp[cnt] += nbytes;
+		}
+
+		/* Clean up the page and adjust the item's reference. */
+		HOFFSET(h) += nbytes;
+		t += nbytes;
+	}
+
+	/*
+	 * Copy the new item onto the page.
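+	 * (bk now overlays the item's final location -- t was adjusted
+	 * above if the page contents were shifted -- so only the type,
+	 * length and data bytes need to be rewritten.)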
+	 */
+	bk = (BKEYDATA *)t;
+	B_TSET(bk->type, B_KEYDATA, 0);
+	bk->len = data->size;
+	memcpy(bk->data, data->data, data->size);
+
+	return (0);
+}
+
+/*
+ * __bam_dup_convert --
+ *	Check to see if the duplicate set at indx should have its own page.
+ *	If it should, create it.
+ */
+static int
+__bam_dup_convert(dbc, h, indx)
+	DBC *dbc;
+	PAGE *h;
+	u_int32_t indx;
+{
+	BTREE_CURSOR *cp;
+	BKEYDATA *bk;
+	DB *dbp;
+	DBT hdr;
+	PAGE *dp;
+	db_indx_t cnt, cpindx, dindx, first, sz;
+	int ret;
+
+	dbp = dbc->dbp;
+	cp = (BTREE_CURSOR *)dbc->internal;
+
+	/*
+	 * Count the duplicate records and calculate how much room they're
+	 * using on the page.
+	 */
+	while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX])
+		indx -= P_INDX;
+	for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) {
+		if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx])
+			break;
+		bk = GET_BKEYDATA(h, indx);
+		sz += B_TYPE(bk->type) == B_KEYDATA ?
+		    BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+		bk = GET_BKEYDATA(h, indx + O_INDX);
+		sz += B_TYPE(bk->type) == B_KEYDATA ?
+		    BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE;
+	}
+
+	/*
+	 * We have to do these checks when the user is replacing the cursor's
+	 * data item -- if the application replaces a duplicate item with a
+	 * larger data item, it can increase the amount of space used by the
+	 * duplicates, requiring this check.  But that means we may have done
+	 * this check when it wasn't a duplicate item after all.
+	 */
+	if (cnt == 1)
+		return (0);
+
+	/*
+	 * If this set of duplicates is using more than 25% of the page, move
+	 * them off.  The choice of 25% is a WAG, but the value must be small
+	 * enough that we can always split a page without putting duplicates
+	 * on two different pages.
+	 */
+	if (sz < dbp->pgsize / 4)
+		return (0);
+
+	/* Get a new page. */
+	if ((ret = __db_new(dbc,
+	    dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &dp)) != 0)
+		return (ret);
+	P_INIT(dp, dbp->pgsize, dp->pgno,
+	    PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+	/*
+	 * Move this set of duplicates off the page.  First points to the first
+	 * key of the first duplicate key/data pair, cnt is the number of pairs
+	 * we're dealing with.
+	 */
+	memset(&hdr, 0, sizeof(hdr));
+	dindx = first;
+	indx = first;
+	cpindx = 0;
+	do {
+		/* Move cursors referencing the old entry to the new entry. */
+		if ((ret = __bam_ca_dup(dbc, first,
+		    PGNO(h), indx, PGNO(dp), cpindx)) != 0)
+			goto err;
+
+		/*
+		 * Copy the entry to the new page.  If the off-duplicate page
+		 * is a Btree page (i.e., dup_compare will be non-NULL; we use
+		 * Btree pages for sorted dups, and Recno pages for unsorted
+		 * dups), move all entries normally, even deleted ones.  If
+		 * it's a Recno page, deleted entries are discarded (if the
+		 * deleted entry is overflow, then free up those pages).
+		 */
+		bk = GET_BKEYDATA(h, dindx + 1);
+		hdr.data = bk;
+		hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
+		    BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+		if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
+			/*
+			 * Unsorted dups, i.e. recno page, and we have
+			 * a deleted entry, don't move it, but if it was
+			 * an overflow entry, we need to free those pages.
+			 */
+			if (B_TYPE(bk->type) == B_OVERFLOW &&
+			    (ret = __db_doff(dbc,
+			    (GET_BOVERFLOW(h, dindx + 1))->pgno)) != 0)
+				goto err;
+		} else {
+			if ((ret = __db_pitem(
+			    dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
+				goto err;
+			++cpindx;
+		}
+		/*
+		 * Delete all but the last reference to the key.
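+		 * (On the final pass cnt == 1: the key itself stays on the
+		 * page and dindx simply steps past it, so only the data
+		 * item is removed; the surviving key is re-pointed at the
+		 * new off-page tree by the __bam_ovput() call below.)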
+		 */
+		if (cnt != 1) {
+			if ((ret = __bam_adjindx(dbc,
+			    h, dindx, first + 1, 0)) != 0)
+				goto err;
+		} else
+			dindx++;
+
+		/* Delete the data item. */
+		if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
+			goto err;
+		indx += P_INDX;
+	} while (--cnt);
+
+	/* Put in a new data item that points to the duplicates page. */
+	if ((ret = __bam_ovput(dbc,
+	    B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
+		goto err;
+
+	/* Adjust cursors for all the above movements. */
+	if ((ret = __bam_ca_di(dbc,
+	    PGNO(h), first + P_INDX, first + P_INDX - indx)) != 0)
+		goto err;
+
+	return (memp_fput(dbp->mpf, dp, DB_MPOOL_DIRTY));
+
+err:	(void)__db_free(dbc, dp);
+	return (ret);
+}
+
+/*
+ * __bam_ovput --
+ *	Build an item for an off-page duplicates page or overflow page and
+ *	insert it on the page.
+ */
+static int
+__bam_ovput(dbc, type, pgno, h, indx, item)
+	DBC *dbc;
+	u_int32_t type, indx;
+	db_pgno_t pgno;
+	PAGE *h;
+	DBT *item;
+{
+	BOVERFLOW bo;
+	DBT hdr;
+	int ret;
+
+	UMRW_SET(bo.unused1);
+	B_TSET(bo.type, type, 0);
+	UMRW_SET(bo.unused2);
+
+	/*
+	 * If we're creating an overflow item, do so and acquire the page
+	 * number for it.  If we're creating an off-page duplicates tree,
+	 * we are given the page number as an argument.
+	 */
+	if (type == B_OVERFLOW) {
+		if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0)
+			return (ret);
+		bo.tlen = item->size;
+	} else {
+		bo.pgno = pgno;
+		bo.tlen = 0;
+	}
+
+	/* Store the new record on the page. */
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.data = &bo;
+	hdr.size = BOVERFLOW_SIZE;
+	return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL));
+}
diff --git a/db/btree/bt_rec.c b/db/btree/bt_rec.c
new file mode 100644
index 000000000..24dc9bc6a
--- /dev/null
+++ b/db/btree/bt_rec.c
@@ -0,0 +1,1219 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_rec.c,v 11.35 2001/01/10 16:24:47 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "hash.h"
+#include "btree.h"
+#include "log.h"
+
+#define	IS_BTREE_PAGE(pagep)						\
+	(TYPE(pagep) == P_IBTREE ||					\
+	    TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP)
+
+/*
+ * __bam_pg_alloc_recover --
+ *	Recovery function for pg_alloc.
+ *
+ * PUBLIC: int __bam_pg_alloc_recover
+ * PUBLIC:   __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_pg_alloc_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_pg_alloc_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, level, modified, ret;
+
+	REC_PRINT(__bam_pg_alloc_print);
+	REC_INTRO(__bam_pg_alloc_read, 0);
+
+	/*
+	 * Fix up the allocated page.  If we're redoing the operation, we have
+	 * to get the page (creating it if it doesn't exist), and update its
+	 * LSN.  If we're undoing the operation, we have to reset the page's
+	 * LSN and put it on the free list.
+	 *
+	 * Fix up the metadata page.  If we're redoing the operation, we have
+	 * to get the metadata page and update its LSN and its free pointer.
+	 * If we're undoing the operation and the page was ever created, we put
+	 * it on the freelist.
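+	 * (In short: redo re-initializes the page with the logged type and
+	 * stamps it with this record's LSN; undo re-initializes it as an
+	 * invalid page linked to the free list through argp->next.)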
+	 */
+	pgno = PGNO_BASE_MD;
+	meta = NULL;
+	if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) {
+		/* The metadata page must always exist on redo. */
+		if (DB_REDO(op)) {
+			(void)__db_pgerr(file_dbp, pgno);
+			goto out;
+		} else
+			goto done;
+	}
+	if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) {
+		/*
+		 * We specify creation and check for it later, because this
+		 * operation was supposed to create the page, and even in
+		 * the undo case it's going to get linked onto the freelist
+		 * which we're also fixing up.
+		 */
+		(void)__db_pgerr(file_dbp, argp->pgno);
+		goto err;
+	}
+
+	/* Fix up the allocated page. */
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->page_lsn);
+
+	/*
+	 * If an initial allocation is aborted and then reallocated during
+	 * an archival restore, the log record will have an LSN for the
+	 * page but the page will be empty.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)))
+		cmp_p = 0;
+	CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->page_lsn);
+	/*
+	 * If we rolled back this allocation previously during an
+	 * archive restore, the page may have the LSN of the meta page
+	 * at the point of the roll back.  This will be no more
+	 * than the LSN of the metadata page at the time of this allocation.
+	 */
+	if (DB_REDO(op) &&
+	    (cmp_p == 0 ||
+	    (IS_ZERO_LSN(argp->page_lsn) &&
+	    log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+		/* Need to redo update described. */
+		switch (argp->ptype) {
+		case P_LBTREE:
+		case P_LRECNO:
+		case P_LDUP:
+			level = LEAFLEVEL;
+			break;
+		default:
+			level = 0;
+			break;
+		}
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);
+
+		pagep->lsn = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/*
+		 * Undo the allocation, reinitialize the page and
+		 * link its next pointer to the free list.
+		 */
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+
+		pagep->lsn = argp->page_lsn;
+		modified = 1;
+	}
+
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) {
+		goto err;
+	}
+
+	/*
+	 * If the page was newly created, put it on the limbo list.
+	 */
+	if (IS_ZERO_LSN(LSN(pagep)) &&
+	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
+		/* Put the page in limbo. */
+		if ((ret = __db_add_limbo(dbenv,
+		    info, argp->fileid, argp->pgno, 1)) != 0)
+			goto err;
+	}
+
+	/* Fix up the metadata page. */
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(meta));
+	cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		LSN(meta) = *lsnp;
+		meta->free = argp->next;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		LSN(meta) = argp->meta_lsn;
+
+		/*
+		 * If the page has a zero LSN then it's newly created
+		 * and will go into limbo rather than directly on the
+		 * free list.
+		 */
+		if (!IS_ZERO_LSN(argp->page_lsn))
+			meta->free = argp->pgno;
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+	/*
+	 * This could be the metapage from a subdb which is read from disk
+	 * to recover its creation.
+	 */
+	if (F_ISSET(file_dbp, DB_AM_SUBDB))
+		switch (argp->type) {
+		case P_BTREEMETA:
+		case P_HASHMETA:
+		case P_QAMMETA:
+			file_dbp->sync(file_dbp, 0);
+			break;
+		}
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	if (0) {
+err:
+		if (meta != NULL)
+			(void)memp_fput(mpf, meta, 0);
+	}
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_pg_free_recover --
+ *	Recovery function for pg_free.
+ *
+ * PUBLIC: int __bam_pg_free_recover
+ * PUBLIC:   __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_pg_free_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_pg_free_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DBMETA *meta;
+	DB_LSN copy_lsn;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	db_pgno_t pgno;
+	int cmp_n, cmp_p, modified, ret;
+
+	COMPQUIET(info, NULL);
+	REC_PRINT(__bam_pg_free_print);
+	REC_INTRO(__bam_pg_free_read, 1);
+
+	/*
+	 * Fix up the freed page.  If we're redoing the operation we get the
+	 * page and explicitly discard its contents, then update its LSN.  If
+	 * we're undoing the operation, we get the page and restore its header.
+	 * Create the page if necessary, we may be freeing an aborted
+	 * create.
+	 */
+	if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+		goto out;
+	modified = 0;
+	__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &copy_lsn);
+	CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn);
+	if (DB_REDO(op) &&
+	    (cmp_p == 0 ||
+	    (IS_ZERO_LSN(copy_lsn) &&
+	    log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) {
+		/* Need to redo update described. */
+		P_INIT(pagep, file_dbp->pgsize,
+		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
+		pagep->lsn = *lsnp;
+
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		memcpy(pagep, argp->header.data, argp->header.size);
+
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+	/*
+	 * Fix up the metadata page.  If we're redoing or undoing the operation
+	 * we get the page and update its LSN and free pointer.
+	 */
+	pgno = PGNO_BASE_MD;
+	if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) {
+		/* The metadata page must always exist. */
+		(void)__db_pgerr(file_dbp, pgno);
+		goto out;
+	}
+
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(meta));
+	cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo the deallocation. */
+		meta->free = argp->pgno;
+		LSN(meta) = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo the deallocation. */
+		meta->free = argp->next;
+		LSN(meta) = argp->meta_lsn;
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_split_recover --
+ *	Recovery function for split.
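+ *	Handles both root splits, where the root is rewritten as an
+ *	internal page over two new children, and ordinary splits; the
+ *	log record carries the full pre-split page image.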
+ * + * PUBLIC: int __bam_split_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_split_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_split_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; + db_pgno_t pgno, root_pgno; + u_int32_t ptype; + int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret; + + COMPQUIET(info, NULL); + REC_PRINT(__bam_split_print); + + mpf = NULL; + _lp = lp = np = pp = _rp = rp = NULL; + sp = NULL; + + REC_INTRO(__bam_split_read, 1); + + /* + * There are two kinds of splits that we have to recover from. The + * first is a root-page split, where the root page is split from a + * leaf page into an internal page and two new leaf pages are created. + * The second is where a page is split into two pages, and a new key + * is inserted into the parent page. + * + * DBTs are not aligned in log records, so we need to copy the page + * so that we can access fields within it throughout this routine. + * Although we could hardcode the unaligned copies in this routine, + * we will be calling into regular btree functions with this page, + * so it's got to be aligned. Copying it into allocated memory is + * the only way to guarantee this. + */ + if ((ret = __os_malloc(dbenv, argp->pg.size, NULL, &sp)) != 0) + goto out; + memcpy(sp, argp->pg.data, argp->pg.size); + + pgno = PGNO(sp); + root_pgno = argp->root_pgno; + rootsplit = pgno == root_pgno; + if (memp_fget(mpf, &argp->left, 0, &lp) != 0) + lp = NULL; + if (memp_fget(mpf, &argp->right, 0, &rp) != 0) + rp = NULL; + + if (DB_REDO(op)) { + l_update = r_update = p_update = 0; + /* + * Decide if we need to resplit the page. + * + * If this is a root split, then the root has to exist, it's + * the page we're splitting and it gets modified. If this is + * not a root split, then the left page has to exist, for the + * same reason. + */ + if (rootsplit) { + if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + (void)__db_pgerr(file_dbp, pgno); + pp = NULL; + goto out; + } + cmp = log_compare(&LSN(pp), &LSN(argp->pg.data)); + CHECK_LSN(op, cmp, &LSN(pp), &LSN(argp->pg.data)); + p_update = cmp == 0; + } else if (lp == NULL) { + (void)__db_pgerr(file_dbp, argp->left); + goto out; + } + + if (lp != NULL) { + cmp = log_compare(&LSN(lp), &argp->llsn); + CHECK_LSN(op, cmp, &LSN(lp), &argp->llsn); + if (cmp == 0) + l_update = 1; + } else + l_update = 1; + + if (rp != NULL) { + cmp = log_compare(&LSN(rp), &argp->rlsn); + CHECK_LSN(op, cmp, &LSN(rp), &argp->rlsn); + if (cmp == 0) + r_update = 1; + } else + r_update = 1; + if (!p_update && !l_update && !r_update) + goto check_next; + + /* Allocate and initialize new left/right child pages. */ + if ((ret = + __os_malloc(dbenv, file_dbp->pgsize, NULL, &_lp)) != 0 + || (ret = + __os_malloc(dbenv, file_dbp->pgsize, NULL, &_rp)) != 0) + goto out; + if (rootsplit) { + P_INIT(_lp, file_dbp->pgsize, argp->left, + PGNO_INVALID, + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : argp->left, + PGNO_INVALID, LEVEL(sp), TYPE(sp)); + } else { + P_INIT(_lp, file_dbp->pgsize, PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno, + ISINTERNAL(sp) ? 
PGNO_INVALID : NEXT_PGNO(sp), + LEVEL(sp), TYPE(sp)); + } + + /* Split the page. */ + if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 || + (ret = __bam_copy(file_dbp, sp, _rp, argp->indx, + NUM_ENT(sp))) != 0) + goto out; + + /* If the left child is wrong, update it. */ + if (lp == NULL && (ret = + memp_fget(mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) { + (void)__db_pgerr(file_dbp, argp->left); + lp = NULL; + goto out; + } + if (l_update) { + memcpy(lp, _lp, file_dbp->pgsize); + lp->lsn = *lsnp; + if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + goto out; + lp = NULL; + } + + /* If the right child is wrong, update it. */ + if (rp == NULL && (ret = memp_fget(mpf, + &argp->right, DB_MPOOL_CREATE, &rp)) != 0) { + (void)__db_pgerr(file_dbp, argp->right); + rp = NULL; + goto out; + } + if (r_update) { + memcpy(rp, _rp, file_dbp->pgsize); + rp->lsn = *lsnp; + if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + goto out; + rp = NULL; + } + + /* + * If the parent page is wrong, update it. This is of interest + * only if it was a root split, since root splits create parent + * pages. All other splits modify a parent page, but those are + * separately logged and recovered. + */ + if (rootsplit && p_update) { + if (IS_BTREE_PAGE(sp)) { + ptype = P_IBTREE; + rc = argp->opflags & SPL_NRECS ? 1 : 0; + } else { + ptype = P_IRECNO; + rc = 1; + } + + P_INIT(pp, file_dbp->pgsize, root_pgno, + PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype); + RE_NREC_SET(pp, + rc ? __bam_total(_lp) + __bam_total(_rp) : 0); + + pp->lsn = *lsnp; + if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto out; + pp = NULL; + } + +check_next: /* + * Finally, redo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. The next + * page must exist because we're redoing the operation. + */ + if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { + if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + (void)__db_pgerr(file_dbp, argp->npgno); + np = NULL; + goto out; + } + cmp = log_compare(&LSN(np), &argp->nlsn); + CHECK_LSN(op, cmp, &LSN(np), &argp->nlsn); + if (cmp == 0) { + PREV_PGNO(np) = argp->right; + np->lsn = *lsnp; + if ((ret = + memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0) + goto out; + np = NULL; + } + } + } else { + /* + * If the split page is wrong, replace its contents with the + * logged page contents. If the page doesn't exist, it means + * that the create of the page never happened, nor did any of + * the adds onto the page that caused the split, and there's + * really no undo-ing to be done. + */ + if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + pp = NULL; + goto lrundo; + } + if (log_compare(lsnp, &LSN(pp)) == 0) { + memcpy(pp, argp->pg.data, argp->pg.size); + if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + goto out; + pp = NULL; + } + + /* + * If it's a root split and the left child ever existed, update + * its LSN. (If it's not a root split, we've updated the left + * page already -- it's the same as the split page.) If the + * right child ever existed, root split or not, update its LSN. + * The undo of the page allocation(s) will restore them to the + * free list. 
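+		 * (Only the LSNs are reset here; the free-list linkage
+		 * itself is repaired by the pg_alloc undo.)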
+ */ +lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { + if (rootsplit && lp != NULL && + log_compare(lsnp, &LSN(lp)) == 0) { + lp->lsn = argp->llsn; + if ((ret = + memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + goto out; + lp = NULL; + } + if (rp != NULL && + log_compare(lsnp, &LSN(rp)) == 0) { + rp->lsn = argp->rlsn; + if ((ret = + memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + goto out; + rp = NULL; + } + } + + /* + * Finally, undo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. Since it's + * possible that the next-page never existed, we ignore it as + * if there's nothing to undo. + */ + if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { + if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + np = NULL; + goto done; + } + if (log_compare(lsnp, &LSN(np)) == 0) { + PREV_PGNO(np) = argp->left; + np->lsn = argp->nlsn; + if (memp_fput(mpf, np, DB_MPOOL_DIRTY)) + goto out; + np = NULL; + } + } + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: /* Free any pages that weren't dirtied. */ + if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0) + ret = t_ret; + if (lp != NULL && (t_ret = memp_fput(mpf, lp, 0)) != 0 && ret == 0) + ret = t_ret; + if (np != NULL && (t_ret = memp_fput(mpf, np, 0)) != 0 && ret == 0) + ret = t_ret; + if (rp != NULL && (t_ret = memp_fput(mpf, rp, 0)) != 0 && ret == 0) + ret = t_ret; + + /* Free any allocated space. */ + if (_lp != NULL) + __os_free(_lp, file_dbp->pgsize); + if (_rp != NULL) + __os_free(_rp, file_dbp->pgsize); + if (sp != NULL) + __os_free(sp, argp->pg.size); + + REC_CLOSE; +} + +/* + * __bam_rsplit_recover -- + * Recovery function for a reverse split. + * + * PUBLIC: int __bam_rsplit_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_rsplit_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_rsplit_args *argp; + DB *file_dbp; + DBC *dbc; + DB_LSN copy_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno, root_pgno; + int cmp_n, cmp_p, modified, ret; + + COMPQUIET(info, NULL); + REC_PRINT(__bam_rsplit_print); + REC_INTRO(__bam_rsplit_read, 1); + + /* Fix the root page. */ + pgno = root_pgno = argp->root_pgno; + if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) { + /* The root page must always exist if we are going forward. */ + if (DB_REDO(op)) { + __db_pgerr(file_dbp, pgno); + goto out; + } + /* This must be the root of an OPD tree. */ + DB_ASSERT(root_pgno != + ((BTREE *)file_dbp->bt_internal)->bt_root); + ret = 0; + goto done; + } + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->rootlsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->rootlsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + pagep->pgno = root_pgno; + pagep->lsn = *lsnp; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + P_INIT(pagep, file_dbp->pgsize, root_pgno, + argp->nrec, PGNO_INVALID, pagep->level + 1, + IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO); + if ((ret = __db_pitem(dbc, pagep, 0, + argp->rootent.size, &argp->rootent, NULL)) != 0) + goto out; + pagep->lsn = argp->rootlsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+	/*
+	 * Fix the page copied over the root page.  It's possible that the
+	 * page never made it to disk, so if we're undo-ing and the page
+	 * doesn't exist, it's okay and there's nothing further to do.
+	 */
+	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+		if (DB_UNDO(op))
+			goto done;
+		(void)__db_pgerr(file_dbp, argp->pgno);
+		goto out;
+	}
+	modified = 0;
+	__ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN));
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &copy_lsn);
+	CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		pagep->lsn = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size);
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_adj_recover --
+ *	Recovery function for adj.
+ *
+ * PUBLIC: int __bam_adj_recover
+ * PUBLIC:   __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_adj_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_adj_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	COMPQUIET(info, NULL);
+	REC_PRINT(__bam_adj_print);
+	REC_INTRO(__bam_adj_read, 1);
+
+	/* Get the page; if it never existed and we're undoing, we're done. */
+	if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) {
+		if (DB_UNDO(op))
+			goto done;
+		(void)__db_pgerr(file_dbp, argp->pgno);
+		goto out;
+	}
+
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(pagep));
+	cmp_p = log_compare(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		if ((ret = __bam_adjindx(dbc,
+		    pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0)
+			goto err;
+
+		LSN(pagep) = *lsnp;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Need to undo update described. */
+		if ((ret = __bam_adjindx(dbc,
+		    pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0)
+			goto err;
+
+		LSN(pagep) = argp->lsn;
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+	if (0) {
+err:	(void)memp_fput(mpf, pagep, 0);
+	}
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_cadjust_recover --
+ *	Recovery function for the adjust of a count change in an internal
+ *	page.
+ *
+ * PUBLIC: int __bam_cadjust_recover
+ * PUBLIC:   __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_cadjust_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_cadjust_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	COMPQUIET(info, NULL);
+	REC_PRINT(__bam_cadjust_print);
+	REC_INTRO(__bam_cadjust_read, 1);
+
+	/*
+	 * Get the page; if it never existed and we're undoing, we're done.
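+	 * (A missing page here just means the allocation that would have
+	 * created it was itself rolled back, so there is nothing to adjust.)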
*/ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) + goto done; + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + if (IS_BTREE_PAGE(pagep)) { + GET_BINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } else { + GET_RINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + if (IS_BTREE_PAGE(pagep)) { + GET_BINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, -(argp->adjust)); + } else { + GET_RINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, -(argp->adjust)); + } + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_cdel_recover -- + * Recovery function for the intent-to-delete of a cursor record. + * + * PUBLIC: int __bam_cdel_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_cdel_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_cdel_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + u_int32_t indx; + int cmp_n, cmp_p, modified, ret; + + COMPQUIET(info, NULL); + REC_PRINT(__bam_cdel_print); + REC_INTRO(__bam_cdel_read, 1); + + /* Get the page; if it never existed and we're undoing, we're done. */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) + goto done; + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0); + B_DSET(GET_BKEYDATA(pagep, indx)->type); + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0); + B_DCLR(GET_BKEYDATA(pagep, indx)->type); + + (void)__bam_ca_delete(file_dbp, argp->pgno, argp->indx, 0); + + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_repl_recover -- + * Recovery function for page item replacement. 
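+ *	The log record stores only the bytes that changed plus the lengths
+ *	of the shared prefix and suffix, so redo and undo both rebuild the
+ *	complete item around the unchanged bytes.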
+ * + * PUBLIC: int __bam_repl_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_repl_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_repl_args *argp; + BKEYDATA *bk; + DB *file_dbp; + DBC *dbc; + DBT dbt; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + u_int8_t *p; + + COMPQUIET(info, NULL); + REC_PRINT(__bam_repl_print); + REC_INTRO(__bam_repl_read, 1); + + /* Get the page; if it never existed and we're undoing, we're done. */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) + goto done; + (void)__db_pgerr(file_dbp, argp->pgno); + goto out; + } + bk = GET_BKEYDATA(pagep, argp->indx); + + modified = 0; + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->lsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* + * Need to redo update described. + * + * Re-build the replacement item. + */ + memset(&dbt, 0, sizeof(dbt)); + dbt.size = argp->prefix + argp->suffix + argp->repl.size; + if ((ret = __os_malloc(dbenv, dbt.size, NULL, &dbt.data)) != 0) + goto err; + p = dbt.data; + memcpy(p, bk->data, argp->prefix); + p += argp->prefix; + memcpy(p, argp->repl.data, argp->repl.size); + p += argp->repl.size; + memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); + + ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); + __os_free(dbt.data, dbt.size); + if (ret != 0) + goto err; + + LSN(pagep) = *lsnp; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* + * Need to undo update described. + * + * Re-build the original item. + */ + memset(&dbt, 0, sizeof(dbt)); + dbt.size = argp->prefix + argp->suffix + argp->orig.size; + if ((ret = __os_malloc(dbenv, dbt.size, NULL, &dbt.data)) != 0) + goto err; + p = dbt.data; + memcpy(p, bk->data, argp->prefix); + p += argp->prefix; + memcpy(p, argp->orig.data, argp->orig.size); + p += argp->orig.size; + memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); + + ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); + __os_free(dbt.data, dbt.size); + if (ret != 0) + goto err; + + /* Reset the deleted flag, if necessary. */ + if (argp->isdeleted) + B_DSET(GET_BKEYDATA(pagep, argp->indx)->type); + + LSN(pagep) = argp->lsn; + modified = 1; + } + if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + + if (0) { +err: (void)memp_fput(mpf, pagep, 0); + } +out: REC_CLOSE; +} + +/* + * __bam_root_recover -- + * Recovery function for setting the root page on the meta-data page. + * + * PUBLIC: int __bam_root_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_root_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_root_args *argp; + BTMETA *meta; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + int cmp_n, cmp_p, modified, ret; + + COMPQUIET(info, NULL); + REC_PRINT(__bam_root_print); + REC_INTRO(__bam_root_read, 0); + + if ((ret = memp_fget(mpf, &argp->meta_pgno, 0, &meta)) != 0) { + /* The metadata page must always exist on redo. 
 */
+		if (DB_REDO(op)) {
+			(void)__db_pgerr(file_dbp, argp->meta_pgno);
+			goto out;
+		} else
+			goto done;
+	}
+
+	modified = 0;
+	cmp_n = log_compare(lsnp, &LSN(meta));
+	cmp_p = log_compare(&LSN(meta), &argp->meta_lsn);
+	CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn);
+	if (cmp_p == 0 && DB_REDO(op)) {
+		/* Need to redo update described. */
+		meta->root = argp->root_pgno;
+		meta->dbmeta.lsn = *lsnp;
+		((BTREE *)file_dbp->bt_internal)->bt_root = meta->root;
+		modified = 1;
+	} else if (cmp_n == 0 && DB_UNDO(op)) {
+		/* Nothing to undo except lsn. */
+		meta->dbmeta.lsn = argp->meta_lsn;
+		modified = 1;
+	}
+	if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0)
+		goto out;
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_curadj_recover --
+ *	Transaction abort function to undo cursor adjustments.
+ *	This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_curadj_recover
+ * PUBLIC:   __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_curadj_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_curadj_args *argp;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	int ret;
+
+	COMPQUIET(info, NULL);
+
+	REC_PRINT(__bam_curadj_print);
+	REC_INTRO(__bam_curadj_read, 0);
+
+	ret = 0;
+	if (op != DB_TXN_ABORT)
+		goto done;
+
+	switch (argp->mode) {
+	case DB_CA_DI:
+		if ((ret = __bam_ca_di(dbc, argp->from_pgno,
+		    argp->from_indx, -(int)argp->first_indx)) != 0)
+			goto out;
+		break;
+	case DB_CA_DUP:
+		if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
+		    argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
+			goto out;
+		break;
+
+	case DB_CA_RSPLIT:
+		if ((ret =
+		    __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
+			goto out;
+		break;
+
+	case DB_CA_SPLIT:
+		__bam_ca_undosplit(file_dbp, argp->from_pgno,
+		    argp->to_pgno, argp->left_pgno, argp->from_indx);
+		break;
+	}
+
+done:	*lsnp = argp->prev_lsn;
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_rcuradj_recover --
+ *	Transaction abort function to undo cursor adjustments in rrecno.
+ *	This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_rcuradj_recover
+ * PUBLIC:   __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rcuradj_recover(dbenv, dbtp, lsnp, op, info)
+	DB_ENV *dbenv;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_rcuradj_args *argp;
+	BTREE_CURSOR *cp;
+	DB *file_dbp;
+	DBC *dbc, *rdbc;
+	DB_MPOOLFILE *mpf;
+	int ret, t_ret;
+
+	COMPQUIET(info, NULL);
+	rdbc = NULL;
+
+	REC_PRINT(__bam_rcuradj_print);
+	REC_INTRO(__bam_rcuradj_read, 0);
+
+	ret = t_ret = 0;
+
+	if (op != DB_TXN_ABORT)
+		goto done;
+
+	/*
+	 * We don't know whether we're in an offpage dup set, and thus don't
+	 * know whether the DBC that REC_INTRO has handed us is of a
+	 * reasonable type.  It's certainly unset, so if this is an offpage
+	 * dup set, we don't have an OPD cursor.  The simplest solution is
+	 * just to allocate a whole new cursor for our use; we're only really
+	 * using it to pass some state into __ram_ca, and this way we don't
+	 * need to make this function know anything about how offpage dups
+	 * work.
+	 */
+	if ((ret =
+	    __db_icursor(file_dbp, NULL, DB_RECNO, argp->root, 0, &rdbc)) != 0)
+		goto out;
+
+	cp = (BTREE_CURSOR *)rdbc->internal;
+	F_SET(cp, C_RENUMBER);
+	cp->recno = argp->recno;
+
+	switch (argp->mode) {
+	case CA_DELETE:
+		/*
+		 * The way to undo a delete is with an insert.
+		 * Since we're undoing it, the delete flag must be set.
+		 */
+		F_SET(cp, C_DELETED);
+		F_SET(cp, C_RENUMBER);	/* Just in case. */
+		cp->order = argp->order;
+		__ram_ca(rdbc, CA_ICURRENT);
+		break;
+	case CA_IAFTER:
+	case CA_IBEFORE:
+	case CA_ICURRENT:
+		/*
+		 * The way to undo an insert is with a delete.  The delete
+		 * flag is unset to start with.
+		 */
+		F_CLR(cp, C_DELETED);
+		cp->order = INVALID_ORDER;
+		__ram_ca(rdbc, CA_DELETE);
+		break;
+	}
+
+done:	*lsnp = argp->prev_lsn;
+out:	if (rdbc != NULL && (t_ret = rdbc->c_close(rdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	REC_CLOSE;
+}
diff --git a/db/btree/bt_reclaim.c b/db/btree/bt_reclaim.c
new file mode 100644
index 000000000..538d837c2
--- /dev/null
+++ b/db/btree/bt_reclaim.c
@@ -0,0 +1,53 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_reclaim.c,v 11.5 2000/03/22 04:21:01 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "btree.h"
+
+/*
+ * __bam_reclaim --
+ *	Free a database.
+ *
+ * PUBLIC: int __bam_reclaim __P((DB *, DB_TXN *));
+ */
+int
+__bam_reclaim(dbp, txn)
+	DB *dbp;
+	DB_TXN *txn;
+{
+	DBC *dbc;
+	int ret, t_ret;
+
+	/* Acquire a cursor. */
+	if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
+		return (ret);
+
+	/* Walk the tree, freeing pages. */
+	ret = __bam_traverse(dbc,
+	    DB_LOCK_WRITE, dbc->internal->root, __db_reclaim_callback, dbc);
+
+	/* Discard the cursor. */
+	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
diff --git a/db/btree/bt_recno.c b/db/btree/bt_recno.c
new file mode 100644
index 000000000..6ac0cac35
--- /dev/null
+++ b/db/btree/bt_recno.c
@@ -0,0 +1,1369 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_recno.c,v 11.65 2001/01/18 14:33:22 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "btree.h"
+#include "db_ext.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
+#include "qam.h"
+#include "txn.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag.  If we're
+ * re-numbering records, it means the record was implicitly created.  We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
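+ * (Example: in a re-numbering recno with records {1, 2, 3}, deleting
+ * record 2 renumbers the old record 3 to 2.  A cursor left on the
+ * deleted record keeps an "order" value so that later operations can
+ * still position it correctly relative to other cursors.)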
+ * + * If we're re-numbering records, then we have to detect in the cursor that + * a record was deleted, and adjust the cursor as necessary on the next get. + * If we're not re-numbering records, then we can detect that a record has + * been deleted by looking at the actual on-page record, so we completely + * ignore the cursor's delete flag. This is different from the B+tree + * code, which maintains in the cursor whether the cursor references a + * deleted record, and doesn't always check the on-page value. + */ +#define CD_SET(cp) { \ + if (F_ISSET(cp, C_RENUMBER)) \ + F_SET(cp, C_DELETED); \ +} +#define CD_CLR(cp) { \ + if (F_ISSET(cp, C_RENUMBER)) { \ + F_CLR(cp, C_DELETED); \ + cp->order = INVALID_ORDER; \ + } \ +} +#define CD_ISSET(cp) \ + (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED)) + +/* + * Macros for comparing the ordering of two cursors. + * cp1 comes before cp2 iff one of the following holds: + * cp1's recno is less than cp2's recno + * recnos are equal, both deleted, and cp1's order is less than cp2's + * recnos are equal, cp1 deleted, and cp2 not deleted + */ +#define C_LESSTHAN(cp1, cp2) \ + (((cp1)->recno < (cp2)->recno) || \ + (((cp1)->recno == (cp2)->recno) && \ + ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \ + (CD_ISSET((cp1)) && !CD_ISSET((cp2)))))) + +/* + * cp1 is equal to cp2 iff their recnos and delete flags are identical, + * and if the delete flag is set their orders are also identical. + */ +#define C_EQUAL(cp1, cp2) \ + ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \ + (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order)) + +/* + * Do we need to log the current cursor adjustment? + */ +#define CURADJ_LOG(dbc) \ + (DB_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL) + +/* + * __ram_open -- + * Recno open function. + * + * PUBLIC: int __ram_open __P((DB *, const char *, db_pgno_t, u_int32_t)); + */ +int +__ram_open(dbp, name, base_pgno, flags) + DB *dbp; + const char *name; + db_pgno_t base_pgno; + u_int32_t flags; +{ + BTREE *t; + DBC *dbc; + int ret, t_ret; + + t = dbp->bt_internal; + + /* Initialize the remaining fields/methods of the DB. */ + dbp->del = __ram_delete; + dbp->put = __ram_put; + dbp->stat = __bam_stat; + + /* Start up the tree. */ + if ((ret = __bam_read_root(dbp, name, base_pgno, flags)) != 0) + return (ret); + + /* + * If the user specified a source tree, open it and map it in. + * + * !!! + * We don't complain if the user specified transactions or threads. + * It's possible to make it work, but you'd better know what you're + * doing! + */ + if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0) + return (ret); + + /* If we're snapshotting an underlying source file, do it now. */ + if (F_ISSET(dbp, DB_RE_SNAPSHOT)) { + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + + /* Do the snapshot. */ + if ((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND) + ret = 0; + + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + } + + return (ret); +} + +/* + * __ram_delete -- + * Recno db->del function. + */ +static int +__ram_delete(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DBC *dbc; + db_recno_t recno; + int ret, t_ret; + + PANIC_CHECK(dbp->dbenv); + + /* Check for invalid flags.
*/ + if ((ret = __db_delchk(dbp, + key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + return (ret); + + /* Acquire a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags); + + /* Check the user's record number and fill in as necessary. */ + if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) + goto err; + + /* Do the delete. */ + cp = (BTREE_CURSOR *)dbc->internal; + cp->recno = recno; + + ret = __ram_c_del(dbc); + + /* Release the cursor. */ +err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __ram_put -- + * Recno db->put function. + */ +static int +__ram_put(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + db_recno_t recno; + int ret, t_ret; + + PANIC_CHECK(dbp->dbenv); + + /* Check for invalid flags. */ + if ((ret = __db_putchk(dbp, + key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0) + return (ret); + + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags); + + /* + * If we're appending to the tree, make sure we've read in all of + * the backing source file. Otherwise, check the user's record + * number and fill in as necessary. If we found the record or it + * simply didn't exist, add the user's record. + */ + if (flags == DB_APPEND) + ret = __ram_update(dbc, DB_MAX_RECORDS, 0); + else + ret = __ram_getno(dbc, key, &recno, 1); + if (ret == 0 || ret == DB_NOTFOUND) + ret = __ram_add(dbc, &recno, data, flags, 0); + + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Return the record number if we're appending to the tree. */ + if (ret == 0 && flags == DB_APPEND) + ret = __db_retcopy(dbp, key, &recno, sizeof(recno), + &dbc->rkey.data, &dbc->rkey.ulen); + + return (ret); +} + +/* + * __ram_c_del -- + * Recno cursor->c_del function. + * + * PUBLIC: int __ram_c_del __P((DBC *)); + */ +int +__ram_c_del(dbc) + DBC *dbc; +{ + BKEYDATA bk; + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DB_LSN lsn; + DBT hdr, data; + EPG *epg; + int exact, ret, stack; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + t = dbp->bt_internal; + stack = 0; + + /* + * The semantics of cursors during delete are as follows: in + * non-renumbering recnos, records are replaced with a marker + * containing a delete flag. If the record referenced by this cursor + * has already been deleted, we will detect that as part of the delete + * operation, and fail. + * + * In renumbering recnos, cursors which represent deleted items + * are flagged with the C_DELETED flag, and it is an error to + * call c_del a second time without an intervening cursor motion. + */ + if (CD_ISSET(cp)) + return (DB_KEYEMPTY); + + /* Search the tree for the key; delete only deletes exact matches. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, S_DELETE, 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + stack = 1; + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + + /* + * If re-numbering records, the on-page deleted flag can only mean + * that this record was implicitly created. Applications aren't + * permitted to delete records they never created, return an error. + * + * If not re-numbering records, the on-page deleted flag means that + * this record was implicitly created, or, was deleted at some time. 
+ * The former is an error because applications aren't permitted to + * delete records they never created; the latter is an error because + * if the record was "deleted", we could never have found it. + */ + if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type)) { + ret = DB_KEYEMPTY; + goto err; + } + + if (F_ISSET(cp, C_RENUMBER)) { + /* Delete the item, adjust the counts, adjust the cursors. */ + if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0) + goto err; + if ((ret = __bam_adjust(dbc, -1)) != 0) + goto err; + if (__ram_ca(dbc, CA_DELETE) > 0 && + CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv, + dbc->txn, &lsn, 0, dbp->log_fileid, CA_DELETE, + cp->root, cp->recno, cp->order)) != 0) + goto err; + + /* + * If the page is empty, delete it. + * + * We never delete a root page. First, root pages of primary + * databases never go away, recno or otherwise. However, if + * it's the root page of an off-page duplicates database, then + * it can be deleted. We don't delete it here because we have + * no way of telling the primary database page holder (e.g., + * the hash access method) that its page element should be + * cleaned up because the underlying tree is gone. So, we keep + * the page around until the last cursor referencing the empty + * tree is closed, and then clean it up. + */ + if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) { + /* + * We already have a locked stack of pages. However, + * there are likely entries in the stack that aren't + * going to be emptied by removing the single reference + * to the emptied page (or one of its parents). + */ + for (epg = cp->sp; epg <= cp->csp; ++epg) + if (NUM_ENT(epg->page) <= 1) + break; + + /* + * We want to delete a single item out of the last page + * that we're not deleting; back up to that page. + */ + ret = __bam_dpages(dbc, --epg); + + /* + * Regardless of the return from __bam_dpages, it will + * discard our stack and pinned page. + */ + stack = 0; + cp->page = NULL; + } + } else { + /* Use a delete/put pair to replace the record with a marker. */ + if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0) + goto err; + + B_TSET(bk.type, B_KEYDATA, 1); + bk.len = 0; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bk; + hdr.size = SSZA(BKEYDATA, data); + memset(&data, 0, sizeof(data)); + data.data = (void *)""; + data.size = 0; + if ((ret = __db_pitem(dbc, + cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) + goto err; + } + + t->re_modified = 1; + +err: if (stack) + __bam_stkrel(dbc, STK_CLRDBC); + + return (ret); +} + +/* + * __ram_c_get -- + * Recno cursor->c_get function. + * + * PUBLIC: int __ram_c_get + * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); + */ +int +__ram_c_get(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + int cmp, exact, ret; + + COMPQUIET(pgnop, NULL); + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + +retry: switch (flags) { + case DB_CURRENT: + /* + * If we're using mutable records and the deleted flag is + * set, the cursor is pointing at a nonexistent record; + * return an error. + */ + if (CD_ISSET(cp)) + return (DB_KEYEMPTY); + break; + case DB_NEXT_DUP: + /* + * If we're not in an off-page dup set, we know there's no + * next duplicate since recnos don't have them. If we + * are in an off-page dup set, the next item assuredly is + * a dup, so we set flags to DB_NEXT and keep going.
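+ * + * For example (an illustrative sketch, not code from this file), with + * an ordinary recno cursor that is not an off-page dup cursor: + * + *	ret = dbc->c_get(dbc, &key, &data, DB_NEXT_DUP); + *		-- ret == DB_NOTFOUND, a recno database has no duplicates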
+ */ + if (!F_ISSET(dbc, DBC_OPD)) + return (DB_NOTFOUND); + /* FALLTHROUGH */ + case DB_NEXT_NODUP: + /* + * Recno databases don't have duplicates, set flags to DB_NEXT + * and keep going. + */ + /* FALLTHROUGH */ + case DB_NEXT: + flags = DB_NEXT; + /* + * If record numbers are mutable: if we just deleted a record, + * we have to avoid incrementing the record number so that we + * return the right record by virtue of renumbering the tree. + */ + if (CD_ISSET(cp)) + break; + + if (cp->recno != RECNO_OOB) { + ++cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + flags = DB_NEXT; + cp->recno = 1; + break; + case DB_PREV_NODUP: + /* + * Recno databases don't have duplicates, set flags to DB_PREV + * and keep going. + */ + /* FALLTHROUGH */ + case DB_PREV: + flags = DB_PREV; + if (cp->recno != RECNO_OOB) { + if (cp->recno == 1) { + ret = DB_NOTFOUND; + goto err; + } + --cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_LAST: + flags = DB_PREV; + if (((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND) + goto err; + if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0) + goto err; + if (cp->recno == 0) { + ret = DB_NOTFOUND; + goto err; + } + break; + case DB_GET_BOTHC: + /* + * If we're doing a join and these are offpage dups, + * we want to keep searching forward from after the + * current cursor position. Increment the recno by 1, + * then proceed as for a DB_SET. + * + * Otherwise, we know there are no additional matching + * data, as recnos don't have dups. return DB_NOTFOUND. + */ + if (F_ISSET(dbc, DBC_OPD)) { + cp->recno++; + break; + } + ret = DB_NOTFOUND; + goto err; + /* NOTREACHED */ + case DB_GET_BOTH: + /* + * If we're searching a set of off-page dups, we start + * a new linear search from the first record. Otherwise, + * we compare the single data item associated with the + * requested record for a match. + */ + if (F_ISSET(dbc, DBC_OPD)) { + cp->recno = 1; + break; + } + /* FALLTHROUGH */ + case DB_SET: + case DB_SET_RANGE: + if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0) + goto err; + break; + default: + ret = __db_unknown_flag(dbp->dbenv, "__ram_c_get", flags); + goto err; + } + + /* + * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already + * called __ram_update() to make sure sufficient records have been + * read from the backing source file. Do it now for DB_CURRENT (if + * the current record was deleted we may need more records from the + * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT. + */ + if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret = + __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND) + goto err; + + for (;; ++cp->recno) { + /* Search the tree for the record. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, + F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, + 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + + /* + * Copy the page into the cursor, discarding any lock we + * are currently holding. + */ + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + (void)__TLPUT(dbc, cp->lock); + cp->lock = cp->csp->lock; + cp->lock_mode = cp->csp->lock_mode; + + /* + * If re-numbering records, the on-page deleted flag means this + * record was implicitly created. If not re-numbering records, + * the on-page deleted flag means this record was implicitly + * created, or, it was deleted at some time. 
Regardless, we + * skip such records if doing cursor next/prev operations or + * walking through off-page duplicates, and fail if they were + * requested explicitly by the application. + */ + if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type)) + switch (flags) { + case DB_NEXT: + case DB_PREV: + (void)__bam_stkrel(dbc, STK_CLRDBC); + goto retry; + case DB_GET_BOTH: + (void)__bam_stkrel(dbc, STK_CLRDBC); + continue; + default: + ret = DB_KEYEMPTY; + goto err; + } + + if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC) { + if ((ret = __bam_cmp(dbp, data, + cp->page, cp->indx, __bam_defcmp, &cmp)) != 0) + return (ret); + if (cmp == 0) + break; + if (!F_ISSET(dbc, DBC_OPD)) { + ret = DB_NOTFOUND; + goto err; + } + (void)__bam_stkrel(dbc, STK_CLRDBC); + } else + break; + } + + /* Return the key if the user didn't give us one. */ + if (!F_ISSET(dbc, DBC_OPD)) { + if (flags != DB_SET && flags != DB_SET_RANGE) + ret = __db_retcopy(dbp, + key, &cp->recno, sizeof(cp->recno), + &dbc->rkey.data, &dbc->rkey.ulen); + F_SET(key, DB_DBT_ISSET); + } + + /* The cursor was reset, no further delete adjustment is necessary. */ +err: CD_CLR(cp); + + return (ret); +} + +/* + * __ram_c_put -- + * Recno cursor->c_put function. + * + * PUBLIC: int __ram_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); + */ +int +__ram_c_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_LSN lsn; + int exact, nc, ret, t_ret; + u_int32_t iiflags; + void *arg; + + COMPQUIET(pgnop, NULL); + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * DB_KEYFIRST and DB_KEYLAST will only be set if we're dealing with + * an off-page duplicate tree, they can't be specified at user level. + * Translate them into something else. + */ + switch (flags) { + case DB_KEYFIRST: + cp->recno = 1; + flags = DB_BEFORE; + break; + case DB_KEYLAST: + if ((ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0)) != 0) + return (ret); + if (CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv, + dbc->txn, &lsn, 0, dbp->log_fileid, CA_ICURRENT, + cp->root, cp->recno, cp->order))) + return (ret); + return (0); + } + + /* + * If we're putting with a cursor that's marked C_DELETED, we need to + * take special care; the cursor doesn't "really" reference the item + * corresponding to its current recno, but instead is "between" that + * record and the current one. Translate the actual insert into + * DB_BEFORE, and let the __ram_ca work out the gory details of what + * should wind up pointing where. + */ + if (CD_ISSET(cp)) + iiflags = DB_BEFORE; + else + iiflags = flags; + +split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) + goto err; + /* + * An inexact match is okay; it just means we're one record past the + * end, which is reasonable if we're marked deleted. + */ + DB_ASSERT(exact || CD_ISSET(cp)); + + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + + ret = __bam_iitem(dbc, key, data, iiflags, 0); + t_ret = __bam_stkrel(dbc, STK_CLRDBC); + + if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT)) + ret = t_ret; + else if (ret == DB_NEEDSPLIT) { + arg = &cp->recno; + if ((ret = __bam_split(dbc, arg)) != 0) + goto err; + goto split; + } + if (ret != 0) + goto err; + + switch (flags) { /* Adjust the cursors. 
*/ + case DB_AFTER: + nc = __ram_ca(dbc, CA_IAFTER); + + /* + * We only need to adjust this cursor forward if we truly added + * the item after the current recno, rather than remapping it + * to DB_BEFORE. + */ + if (iiflags == DB_AFTER) + ++cp->recno; + + /* Only log if __ram_ca found any relevant cursors. */ + if (nc > 0 && CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp->dbenv, + dbc->txn, &lsn, 0, dbp->log_fileid, CA_IAFTER, + cp->root, cp->recno, cp->order)) != 0) + goto err; + break; + case DB_BEFORE: + nc = __ram_ca(dbc, CA_IBEFORE); + --cp->recno; + + /* Only log if __ram_ca found any relevant cursors. */ + if (nc > 0 && CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp->dbenv, + dbc->txn, &lsn, 0, dbp->log_fileid, CA_IBEFORE, + cp->root, cp->recno, cp->order)) != 0) + goto err; + break; + case DB_CURRENT: + /* + * We only need to do an adjustment if we actually + * added an item, which we only would have done if the + * cursor was marked deleted. + * + * Only log if __ram_ca found any relevant cursors. + */ + if (CD_ISSET(cp) && __ram_ca(dbc, CA_ICURRENT) > 0 && + CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log( + dbp->dbenv, dbc->txn, &lsn, 0, dbp->log_fileid, + CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0) + goto err; + break; + } + + /* Return the key if we've created a new record. */ + if (!F_ISSET(dbc, DBC_OPD) && (flags == DB_AFTER || flags == DB_BEFORE)) + ret = __db_retcopy(dbp, key, &cp->recno, + sizeof(cp->recno), &dbc->rkey.data, &dbc->rkey.ulen); + + /* The cursor was reset, no further delete adjustment is necessary. */ +err: CD_CLR(cp); + + return (ret); +} + +/* + * __ram_ca -- + * Adjust cursors. Returns the number of relevant cursors. + * + * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg)); + */ +int +__ram_ca(dbc_arg, op) + DBC *dbc_arg; + ca_recno_arg op; +{ + BTREE_CURSOR *cp, *cp_arg; + DB *dbp, *ldbp; + DB_ENV *dbenv; + DBC *dbc; + db_recno_t recno; + int adjusted, found; + u_int32_t order; + + dbp = dbc_arg->dbp; + dbenv = dbp->dbenv; + cp_arg = (BTREE_CURSOR *)dbc_arg->internal; + recno = cp_arg->recno; + + found = 0; + + /* + * It only makes sense to adjust cursors if we're a renumbering + * recno; we should only be called if this is one. + */ + DB_ASSERT(F_ISSET(cp_arg, C_RENUMBER)); + + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + /* + * If we're doing a delete, we need to find the highest + * order of any cursor currently pointing at this item, + * so we can assign a higher order to the newly deleted + * cursor. Unfortunately, this requires a second pass through + * the cursor list. + */ + if (op == CA_DELETE) { + order = 1; + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (BTREE_CURSOR *)dbc->internal; + if (cp_arg->root == cp->root && + recno == cp->recno && CD_ISSET(cp) && + order <= cp->order) + order = cp->order + 1; + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + } else + order = INVALID_ORDER; + + /* Now go through and do the actual adjustments. 
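+ * + * For example (hypothetical cursors): if the calling cursor deletes + * record 3 in a renumbering recno tree, a cursor positioned at record + * 5 is renumbered to record 4, a second cursor also at record 3 is + * marked C_DELETED and assigned the order computed above, and a + * cursor at record 2 is untouched.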
*/ + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&ldbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (BTREE_CURSOR *)dbc->internal; + if (cp_arg->root != cp->root) + continue; + ++found; + adjusted = 0; + switch (op) { + case CA_DELETE: + if (recno < cp->recno) { + --cp->recno; + /* + * If the adjustment made them equal, + * we have to merge the orders. + */ + if (recno == cp->recno && CD_ISSET(cp)) + cp->order += order; + } else if (recno == cp->recno && + !CD_ISSET(cp)) { + CD_SET(cp); + cp->order = order; + } + break; + case CA_IBEFORE: + /* + * IBEFORE is just like IAFTER, except that we + * adjust cursors on the current record too. + */ + if (C_EQUAL(cp_arg, cp)) { + ++cp->recno; + adjusted = 1; + } + goto iafter; + case CA_ICURRENT: + + /* + * If the original cursor wasn't deleted, we + * just did a replacement and so there's no + * need to adjust anything--we shouldn't have + * gotten this far. Otherwise, we behave + * much like an IAFTER, except that all + * cursors pointing to the current item get + * marked undeleted and point to the new + * item. + */ + DB_ASSERT(CD_ISSET(cp_arg)); + if (C_EQUAL(cp_arg, cp)) { + CD_CLR(cp); + break; + } + /* FALLTHROUGH */ + case CA_IAFTER: +iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) { + ++cp->recno; + adjusted = 1; + } + if (recno == cp->recno && adjusted) + /* + * If we've moved this cursor's recno, + * split its order number--i.e., + * decrement it by enough so that + * the lowest cursor moved has order 1. + * cp_arg->order is the split point, + * so decrement by one less than that. + */ + cp->order -= (cp_arg->order - 1); + break; + } + } + MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + return (found); +} + +/* + * __ram_getno -- + * Check the user's record number, and make sure we've seen it. + * + * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int)); + */ +int +__ram_getno(dbc, key, rep, can_create) + DBC *dbc; + const DBT *key; + db_recno_t *rep; + int can_create; +{ + DB *dbp; + db_recno_t recno; + + dbp = dbc->dbp; + + /* Check the user's record number. */ + if ((recno = *(db_recno_t *)key->data) == 0) { + __db_err(dbp->dbenv, "illegal record number of 0"); + return (EINVAL); + } + if (rep != NULL) + *rep = recno; + + /* + * Btree can neither create records nor read them in. Recno can + * do both, see if we can find the record. + */ + return (dbc->dbtype == DB_RECNO ? + __ram_update(dbc, recno, can_create) : 0); +} + +/* + * __ram_update -- + * Ensure the tree has records up to and including the specified one. + */ +static int +__ram_update(dbc, recno, can_create) + DBC *dbc; + db_recno_t recno; + int can_create; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + db_recno_t nrecs; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + t = dbp->bt_internal; + + /* + * If we can't create records and we've read the entire backing input + * file, we're done. + */ + if (!can_create && t->re_eof) + return (0); + + /* + * If we haven't seen this record yet, try to get it from the original + * file. 
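+ * + * A hypothetical example of the flow below: with 5 records in the + * tree and more data in the source file, __ram_update(dbc, 9, 1) + * first calls __ram_sread() to try to read records 6 through 9 from + * the file; if the file hits EOF after record 7, the loop at the end + * of this function adds record 8 as an empty (BI_DELETED) record, + * leaving record 9 itself for the caller to add.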
+ */ + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) + return (ret); + if (!t->re_eof && recno > nrecs) { + if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND) + return (ret); + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) + return (ret); + } + + /* + * If we can create records, create empty ones up to the requested + * record. + */ + if (!can_create || recno <= nrecs + 1) + return (0); + + dbc->rdata.dlen = 0; + dbc->rdata.doff = 0; + dbc->rdata.flags = 0; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if (dbc->rdata.ulen < t->re_len) { + if ((ret = __os_realloc(dbp->dbenv, + t->re_len, NULL, &dbc->rdata.data)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); + } + dbc->rdata.ulen = t->re_len; + } + dbc->rdata.size = t->re_len; + memset(dbc->rdata.data, t->re_pad, t->re_len); + } else + dbc->rdata.size = 0; + + while (recno > ++nrecs) + if ((ret = __ram_add(dbc, + &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0) + return (ret); + return (0); +} + +/* + * __ram_source -- + * Load information about the backing file. + */ +static int +__ram_source(dbp) + DB *dbp; +{ + BTREE *t; + char *source; + int ret; + + t = dbp->bt_internal; + + /* Find the real name, and swap out the one we had before. */ + if ((ret = __db_appname(dbp->dbenv, + DB_APP_DATA, NULL, t->re_source, 0, NULL, &source)) != 0) + return (ret); + __os_freestr(t->re_source); + t->re_source = source; + + /* + * !!! + * It's possible that the backing source file is read-only. We don't + * much care other than we'll complain if there are any modifications + * when it comes time to write the database back to the source. + */ + if ((t->re_fp = fopen(t->re_source, "r")) == NULL) { + ret = errno; + __db_err(dbp->dbenv, "%s: %s", t->re_source, db_strerror(ret)); + return (ret); + } + + t->re_eof = 0; + return (0); +} + +/* + * __ram_writeback -- + * Rewrite the backing file. + * + * PUBLIC: int __ram_writeback __P((DB *)); + */ +int +__ram_writeback(dbp) + DB *dbp; +{ + BTREE *t; + DB_ENV *dbenv; + DBC *dbc; + DBT key, data; + FILE *fp; + db_recno_t keyno; + int ret, t_ret; + u_int8_t delim, *pad; + + t = dbp->bt_internal; + dbenv = dbp->dbenv; + fp = NULL; + + /* If the file wasn't modified, we're done. */ + if (!t->re_modified) + return (0); + + /* If there's no backing source file, we're done. */ + if (t->re_source == NULL) { + t->re_modified = 0; + return (0); + } + + /* Allocate a cursor. */ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + + /* + * Read any remaining records into the tree. + * + * !!! + * This is why we can't support transactions when applications specify + * backing (re_source) files. At this point we have to read in the + * rest of the records from the file so that we can write all of the + * records back out again, which could modify a page for which we'd + * have to log changes and which we don't have locked. This could be + * partially fixed by taking a snapshot of the entire file during the + * DB->open as DB->open is transaction protected. But, if a checkpoint + * occurs then, the part of the log holding the copy of the file could + * be discarded, and that would make it impossible to recover in the + * face of disaster. This could all probably be fixed, but it would + * require transaction protecting the backing source file. + * + * XXX + * This could be made to work now that we have transactions protecting + * file operations. Margo has specifically asked for the privilege of + * doing this work. 
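+ * + * For reference (an illustration, not code from this file): records + * are written back in one of two formats. Variable-length records + * are each followed by the re_delim byte, and fixed-length records + * are written as exactly re_len bytes, deleted records being replaced + * by re_len copies of the re_pad byte. Three variable-length records + * "a", "bb" and "ccc" with a newline delimiter are thus written as: + * + *	a\nbb\nccc\n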
+ */ + if ((ret = + __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND) + return (ret); + + /* + * Close any existing file handle and re-open the file, truncating it. + */ + if (t->re_fp != NULL) { + if (fclose(t->re_fp) != 0) { + ret = errno; + goto err; + } + t->re_fp = NULL; + } + if ((fp = fopen(t->re_source, "w")) == NULL) { + ret = errno; + __db_err(dbenv, "%s: %s", t->re_source, db_strerror(ret)); + goto err; + } + + /* + * We step through the records, writing each one out. Use the record + * number and the dbp->get() function, instead of a cursor, so we find + * and write out "deleted" or non-existent records. + */ + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + key.size = sizeof(db_recno_t); + key.data = &keyno; + + /* + * We'll need the delimiter if we're doing variable-length records, + * and the pad character if we're doing fixed-length records. + */ + delim = t->re_delim; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if ((ret = __os_malloc(dbenv, t->re_len, NULL, &pad)) != 0) + goto err; + memset(pad, t->re_pad, t->re_len); + } else + COMPQUIET(pad, NULL); + for (keyno = 1;; ++keyno) { + switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) { + case 0: + if (fwrite(data.data, 1, data.size, fp) != data.size) + goto write_err; + break; + case DB_KEYEMPTY: + if (F_ISSET(dbp, DB_RE_FIXEDLEN) && + fwrite(pad, 1, t->re_len, fp) != t->re_len) + goto write_err; + break; + case DB_NOTFOUND: + ret = 0; + goto done; + } + if (!F_ISSET(dbp, DB_RE_FIXEDLEN) && + fwrite(&delim, 1, 1, fp) != 1) { +write_err: ret = errno; + __db_err(dbp->dbenv, + "%s: write failed to backing file: %s", + t->re_source, strerror(ret)); + goto err; + } + } + +err: +done: /* Close the file descriptor. */ + if (fp != NULL && fclose(fp) != 0) { + if (ret == 0) + ret = errno; + __db_err(dbenv, "%s: %s", t->re_source, db_strerror(errno)); + } + + /* Discard the cursor. */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + if (ret == 0) + t->re_modified = 0; + + return (ret); +} + +/* + * __ram_sread -- + * Read records from a source file. + */ +static int +__ram_sread(dbc, top) + DBC *dbc; + db_recno_t top; +{ + BTREE *t; + DB *dbp; + DBT data; + db_recno_t recno; + size_t len; + int ch, ret, was_modified; + + t = dbc->dbp->bt_internal; + dbp = dbc->dbp; + was_modified = t->re_modified; + + if ((ret = __bam_nrecs(dbc, &recno)) != 0) + return (ret); + + /* Use the record data return memory, it's only a short-term use. */ + len = F_ISSET(dbp, DB_RE_FIXEDLEN) ? 
t->re_len : 256; + if (dbc->rdata.ulen < len) { + if ((ret = __os_realloc( + dbp->dbenv, len, NULL, &dbc->rdata.data)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); + } + dbc->rdata.ulen = len; + } + + memset(&data, 0, sizeof(data)); + while (recno < top) { + data.data = dbc->rdata.data; + data.size = 0; + if (F_ISSET(dbp, DB_RE_FIXEDLEN)) + for (len = t->re_len; len > 0; --len) { + if ((ch = getc(t->re_fp)) == EOF) + goto eof; + ((u_int8_t *)data.data)[data.size++] = ch; + } + else + for (;;) { + if ((ch = getc(t->re_fp)) == EOF) + goto eof; + if (ch == t->re_delim) + break; + + ((u_int8_t *)data.data)[data.size++] = ch; + if (data.size == dbc->rdata.ulen) { + if ((ret = __os_realloc(dbp->dbenv, + dbc->rdata.ulen *= 2, + NULL, &dbc->rdata.data)) != 0) { + dbc->rdata.ulen = 0; + dbc->rdata.data = NULL; + return (ret); + } else + data.data = dbc->rdata.data; + } + } + + /* + * Another process may have read this record from the input + * file and stored it into the database already, in which + * case we don't need to repeat that operation. We detect + * this by checking if the last record we've read is greater + * or equal to the number of records in the database. + */ + if (t->re_last >= recno) { + ++recno; + if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0) + goto err; + } + ++t->re_last; + } + + if (0) { +eof: t->re_eof = 1; + ret = DB_NOTFOUND; + } +err: if (!was_modified) + t->re_modified = 0; + + return (ret); +} + +/* + * __ram_add -- + * Add records into the tree. + */ +static int +__ram_add(dbc, recnop, data, flags, bi_flags) + DBC *dbc; + db_recno_t *recnop; + DBT *data; + u_int32_t flags, bi_flags; +{ + BKEYDATA *bk; + BTREE_CURSOR *cp; + int exact, ret, stack; + + cp = (BTREE_CURSOR *)dbc->internal; + +retry: /* Find the slot for insertion. */ + if ((ret = __bam_rsearch(dbc, recnop, + S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0) + return (ret); + stack = 1; + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + + /* + * The application may modify the data based on the selected record + * number. + */ + if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL && + (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0) + goto err; + + /* + * If re-numbering records, the on-page deleted flag means this record + * was implicitly created. If not re-numbering records, the on-page + * deleted flag means this record was implicitly created, or, it was + * deleted at some time. + * + * If DB_NOOVERWRITE is set and the item already exists in the tree, + * return an error unless the item was either marked for deletion or + * only implicitly created. + */ + if (exact) { + bk = GET_BKEYDATA(cp->page, cp->indx); + if (!B_DISSET(bk->type) && flags == DB_NOOVERWRITE) { + ret = DB_KEYEXIST; + goto err; + } + } + + /* + * Select the arguments for __bam_iitem() and do the insert. If the + * key is an exact match, or we're replacing the data item with a + * new data item, replace the current item. If the key isn't an exact + * match, we're inserting a new key/data pair, before the search + * location. + */ + switch (ret = __bam_iitem(dbc, + NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) { + case 0: + /* + * Don't adjust anything. + * + * If we inserted a record, no cursors need adjusting because + * the only new record it's possible to insert is at the very + * end of the tree. The necessary adjustments to the internal + * page counts were made by __bam_iitem(). 
+ * + * If we overwrote a record, no cursors need adjusting because + * future DBcursor->get calls will simply return the underlying + * record (there's no adjustment made for the DB_CURRENT flag + * when a cursor get operation immediately follows a cursor + * delete operation, and the normal adjustment for the DB_NEXT + * flag is still correct). + */ + break; + case DB_NEEDSPLIT: + /* Discard the stack of pages and split the page. */ + (void)__bam_stkrel(dbc, STK_CLRDBC); + stack = 0; + + if ((ret = __bam_split(dbc, recnop)) != 0) + goto err; + + goto retry; + /* NOTREACHED */ + default: + goto err; + } + +err: if (stack) + __bam_stkrel(dbc, STK_CLRDBC); + + return (ret); +} diff --git a/db/btree/bt_rsearch.c b/db/btree/bt_rsearch.c new file mode 100644 index 000000000..7102cd715 --- /dev/null +++ b/db/btree/bt_rsearch.c @@ -0,0 +1,429 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_rsearch.c,v 11.21 2000/03/28 21:50:04 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "btree.h" +#include "db_shash.h" +#include "lock.h" + +/* + * __bam_rsearch -- + * Search a btree for a record number. 
+ * + * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); + */ +int +__bam_rsearch(dbc, recnop, flags, stop, exactp) + DBC *dbc; + db_recno_t *recnop; + u_int32_t flags; + int stop, *exactp; +{ + BINTERNAL *bi; + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock; + PAGE *h; + RINTERNAL *ri; + db_indx_t adjust, deloffset, indx, top; + db_lockmode_t lock_mode; + db_pgno_t pg; + db_recno_t recno, t_recno, total; + int ret, stack; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + BT_STK_CLR(cp); + + /* + * There are several ways we search a btree. The flags argument + * specifies if we're acquiring read or write locks and if we are + * locking pairs of pages. In addition, if we're adding or deleting + * an item, we have to lock the entire tree, regardless. See btree.h + * for more details. + * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in the tree, which we don't know until we acquire the root page. + * So, if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + pg = cp->root; + stack = LF_ISSET(S_STACK); + lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + + /* + * Decide if we need to save this page; if we do, write lock it. + * We deliberately don't lock-couple on this call. If the tree + * is tiny, i.e., one page, and two threads are busily updating + * the root page, we're almost guaranteed deadlocks galore, as + * each one gets a read lock and then blocks the other's attempt + * for a write lock. + */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__LPUT(dbc, lock); + lock_mode = DB_LOCK_WRITE; + if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + stack = 1; + } + + /* + * If appending to the tree, set the record number now -- we have the + * root page locked. + * + * Delete only deletes exact matches, read only returns exact matches. + * Note, this is different from __bam_search(), which returns non-exact + * matches for read. + * + * The record may not exist. We can only return the correct location + * for the record immediately after the last record in the tree, so do + * a fast check now. + */ + total = RE_NREC(h); + if (LF_ISSET(S_APPEND)) { + *exactp = 0; + *recnop = recno = total + 1; + } else { + recno = *recnop; + if (recno <= total) + *exactp = 1; + else { + *exactp = 0; + if (!LF_ISSET(S_PAST_EOF) || recno > total + 1) { + /* + * Keep the page locked for serializability. + * + * XXX + * This leaves the root page locked, which will + * eliminate any concurrency. A possible fix + * would be to lock the last leaf page instead. + */ + (void)memp_fput(dbp->mpf, h, 0); + (void)__TLPUT(dbc, lock); + return (DB_NOTFOUND); + } + } + } + + /* + * !!! + * Record numbers in the tree are 0-based, but the recno is + * 1-based. All of the calculations below have to take this + * into account.
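+ * + * A hypothetical example of the descent arithmetic: searching for + * recno 12 through a P_IRECNO page whose three children hold 5, 4 + * and 6 records, the loop below accumulates total = 5, then 9, and + * descends into the third child because 9 + 6 >= 12; when the leaf + * is reached, recno -= total leaves record 3 of that leaf page as + * the target.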
+ */ + for (total = 0;;) { + switch (TYPE(h)) { + case P_LBTREE: + case P_LDUP: + recno -= total; + /* + * There may be logically deleted records on the page. + * If there are enough, the record may not exist. + */ + if (TYPE(h) == P_LBTREE) { + adjust = P_INDX; + deloffset = O_INDX; + } else { + adjust = O_INDX; + deloffset = 0; + } + for (t_recno = 0, indx = 0;; indx += adjust) { + if (indx >= NUM_ENT(h)) { + *exactp = 0; + if (!LF_ISSET(S_PAST_EOF) || + recno > t_recno + 1) { + ret = DB_NOTFOUND; + goto err; + } + } + if (!B_DISSET( + GET_BKEYDATA(h, indx + deloffset)->type) && + ++t_recno == recno) + break; + } + + /* Correct from 1-based to 0-based for a page offset. */ + BT_STK_ENTER(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + case P_IBTREE: + for (indx = 0, top = NUM_ENT(h);;) { + bi = GET_BINTERNAL(h, indx); + if (++indx == top || total + bi->nrecs >= recno) + break; + total += bi->nrecs; + } + pg = bi->pgno; + break; + case P_LRECNO: + recno -= total; + + /* Correct from 1-based to 0-based for a page offset. */ + --recno; + BT_STK_ENTER(dbp->dbenv, + cp, h, recno, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + case P_IRECNO: + for (indx = 0, top = NUM_ENT(h);;) { + ri = GET_RINTERNAL(h, indx); + if (++indx == top || total + ri->nrecs >= recno) + break; + total += ri->nrecs; + } + pg = ri->pgno; + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + --indx; + + if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + } + BT_STK_PUSH(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + + lock_mode = DB_LOCK_WRITE; + if ((ret = + __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + goto err; + } else { + /* + * Decide if we want to return a pointer to the next + * page in the stack. If we do, write lock it and + * never unlock it. + */ + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) + stack = 1; + + (void)memp_fput(dbp->mpf, h, 0); + + lock_mode = stack && + LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __db_lget(dbc, + LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + /* + * If we fail, discard the lock we held. This + * is OK because this only happens when we are + * descending the tree holding read-locks. + */ + __LPUT(dbc, lock); + goto err; + } + } + + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) + goto err; + } + /* NOTREACHED */ + +err: BT_STK_POP(cp); + __bam_stkrel(dbc, 0); + return (ret); +} + +/* + * __bam_adjust -- + * Adjust the tree after adding or deleting a record. + * + * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); + */ +int +__bam_adjust(dbc, adjust) + DBC *dbc; + int32_t adjust; +{ + BTREE_CURSOR *cp; + DB *dbp; + EPG *epg; + PAGE *h; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; + + /* Update the record counts for the tree. */ + for (epg = cp->sp; epg <= cp->csp; ++epg) { + h = epg->page; + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { + if (DB_LOGGING(dbc) && + (ret = __bam_cadjust_log(dbp->dbenv, + dbc->txn, &LSN(h), 0, dbp->log_fileid, + PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust, + PGNO(h) == root_pgno ? 
CAD_UPDATEROOT : 0)) != 0) + return (ret); + + if (TYPE(h) == P_IBTREE) + GET_BINTERNAL(h, epg->indx)->nrecs += adjust; + else + GET_RINTERNAL(h, epg->indx)->nrecs += adjust; + + if (PGNO(h) == root_pgno) + RE_NREC_ADJ(h, adjust); + + if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + } + } + return (0); +} + +/* + * __bam_nrecs -- + * Return the number of records in the tree. + * + * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); + */ +int +__bam_nrecs(dbc, rep) + DBC *dbc; + db_recno_t *rep; +{ + DB *dbp; + DB_LOCK lock; + PAGE *h; + db_pgno_t pgno; + int ret; + + dbp = dbc->dbp; + + pgno = dbc->internal->root; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + return (ret); + + *rep = RE_NREC(h); + + (void)memp_fput(dbp->mpf, h, 0); + (void)__TLPUT(dbc, lock); + + return (0); +} + +/* + * __bam_total -- + * Return the number of records below a page. + * + * PUBLIC: db_recno_t __bam_total __P((PAGE *)); + */ +db_recno_t +__bam_total(h) + PAGE *h; +{ + db_recno_t nrecs; + db_indx_t indx, top; + + nrecs = 0; + top = NUM_ENT(h); + + switch (TYPE(h)) { + case P_LBTREE: + /* Check for logically deleted records. */ + for (indx = 0; indx < top; indx += P_INDX) + if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) + ++nrecs; + break; + case P_LDUP: + /* Check for logically deleted records. */ + for (indx = 0; indx < top; indx += O_INDX) + if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) + ++nrecs; + break; + case P_IBTREE: + for (indx = 0; indx < top; indx += O_INDX) + nrecs += GET_BINTERNAL(h, indx)->nrecs; + break; + case P_LRECNO: + nrecs = NUM_ENT(h); + break; + case P_IRECNO: + for (indx = 0; indx < top; indx += O_INDX) + nrecs += GET_RINTERNAL(h, indx)->nrecs; + break; + } + + return (nrecs); +} diff --git a/db/btree/bt_search.c b/db/btree/bt_search.c new file mode 100644 index 000000000..d822198f2 --- /dev/null +++ b/db/btree/bt_search.c @@ -0,0 +1,471 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_search.c,v 11.32 2001/01/17 20:19:46 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "btree.h" +#include "lock.h" + +/* + * __bam_search -- + * Search a btree for a key. + * + * PUBLIC: int __bam_search __P((DBC *, + * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); + */ +int +__bam_search(dbc, key, flags, stop, recnop, exactp) + DBC *dbc; + const DBT *key; + u_int32_t flags; + int stop, *exactp; + db_recno_t *recnop; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock; + PAGE *h; + db_indx_t base, i, indx, lim; + db_lockmode_t lock_mode; + db_pgno_t pg; + db_recno_t recno; + int adjust, cmp, deloffset, ret, stack; + int (*func) __P((DB *, const DBT *, const DBT *)); + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + t = dbp->bt_internal; + recno = 0; + + BT_STK_CLR(cp); + + /* + * There are several ways we search a btree. The flags argument + * specifies if we're acquiring read or write locks, if we position + * to the first or last item in a set of duplicates, if we return + * deleted items, and if we are locking pairs of pages. In addition, + * if we're modifying record numbers, we have to lock the entire tree + * regardless. See btree.h for more details. + * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in the tree, which we don't know until we acquire the root page. + * So, if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ +try_again: + pg = cp->root; + stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM); + lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + + /* + * Decide if we need to save this page; if we do, write lock it. + * We deliberately don't lock-couple on this call. If the tree + * is tiny, i.e., one page, and two threads are busily updating + * the root page, we're almost guaranteed deadlocks galore, as + * each one gets a read lock and then blocks the other's attempt + * for a write lock.
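+ * + * A sketch of the deadlock being avoided (hypothetical threads): in a + * one-page tree, threads A and B each acquire a read lock on the root + * as they "descend", then each requests a write lock for its update; + * neither request can be granted while the other thread's read lock + * remains outstanding, so both block. Dropping the page and the read + * lock before requesting the write lock avoids this.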
+ */ + if (!stack && + ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + (void)memp_fput(dbp->mpf, h, 0); + (void)__LPUT(dbc, lock); + lock_mode = DB_LOCK_WRITE; + if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + if (!((LF_ISSET(S_PARENT) + && (u_int8_t)(stop + 1) >= h->level) || + (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { + /* Someone else split the root, start over. */ + (void)memp_fput(dbp->mpf, h, 0); + (void)__LPUT(dbc, lock); + goto try_again; + } + stack = 1; + } + + /* Choose a comparison function. */ + func = F_ISSET(dbc, DBC_OPD) ? + (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) : + t->bt_compare; + + for (;;) { + /* + * Do a binary search on the current page. If we're searching + * a Btree leaf page, we have to walk the indices in groups of + * two. If we're searching an internal page or an off-page dup + * page, there's one index per page item. If we find an exact + * match on a leaf page, we're done. + */ + adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX; + for (base = 0, + lim = NUM_ENT(h) / (db_indx_t)adjust; lim != 0; lim >>= 1) { + indx = base + ((lim >> 1) * adjust); + if ((ret = + __bam_cmp(dbp, key, h, indx, func, &cmp)) != 0) + goto err; + if (cmp == 0) { + if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) + goto found; + goto next; + } + if (cmp > 0) { + base = indx + adjust; + --lim; + } + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + O_INDX index. + * + * If it's a leaf page, return base as the "found" value. + * Delete only deletes exact matches. + */ + if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) { + *exactp = 0; + + if (LF_ISSET(S_EXACT)) + goto notfound; + + if (LF_ISSET(S_STK_ONLY)) { + BT_STK_NUM(dbp->dbenv, cp, h, base, ret); + __LPUT(dbc, lock); + (void)memp_fput(dbp->mpf, h, 0); + return (ret); + } + + /* + * !!! + * Possibly returning a deleted record -- DB_SET_RANGE, + * DB_KEYFIRST and DB_KEYLAST don't require an exact + * match, and we don't want to walk multiple pages here + * to find an undeleted record. This is handled by the + * calling routine. + */ + BT_STK_ENTER(dbp->dbenv, + cp, h, base, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + } + + /* + * If it's not a leaf page, record the internal page (which is + * a parent page for the key). Decrement the base by 1 if it's + * non-zero so that if a split later occurs, the inserted page + * will be to the right of the saved page. + */ + indx = base > 0 ? base - O_INDX : base; + + /* + * If we're trying to calculate the record number, sum up + * all the record numbers on this page up to the indx point. + */ +next: if (recnop != NULL) + for (i = 0; i < indx; ++i) + recno += GET_BINTERNAL(h, i)->nrecs; + + pg = GET_BINTERNAL(h, indx)->pgno; + + if (LF_ISSET(S_STK_ONLY)) { + if (stop == h->level) { + BT_STK_NUM(dbp->dbenv, cp, h, indx, ret); + __LPUT(dbc, lock); + (void)memp_fput(dbp->mpf, h, 0); + return (ret); + } + BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret); + (void)memp_fput(dbp->mpf, h, 0); + if ((ret = __db_lget(dbc, + LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + /* + * Discard our lock and return on failure. This + * is OK because it only happens when descending + * the tree holding read-locks.
+ */ + __LPUT(dbc, lock); + return (ret); + } + } else if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(S_PARENT) && stop == h->level) { + BT_STK_ENTER(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + return (0); + } + BT_STK_PUSH(dbp->dbenv, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + + lock_mode = DB_LOCK_WRITE; + if ((ret = + __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + goto err; + } else { + /* + * Decide if we want to return a reference to the next + * page in the return stack. If so, lock it and never + * unlock it. + */ + if ((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || + (h->level - 1) == LEAFLEVEL) + stack = 1; + + (void)memp_fput(dbp->mpf, h, 0); + + lock_mode = stack && + LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; + if ((ret = __db_lget(dbc, + LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + /* + * If we fail, discard the lock we held. This + * is OK because this only happens when we are + * descending the tree holding read-locks. + */ + __LPUT(dbc, lock); + goto err; + } + } + if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) + goto err; + } + /* NOTREACHED */ + +found: *exactp = 1; + + /* + * If we're trying to calculate the record number, add in the + * offset on this page and correct for the fact that records + * in the tree are 0-based. + */ + if (recnop != NULL) + *recnop = recno + (indx / P_INDX) + 1; + + /* + * If we got here, we know that we have a Btree leaf or off-page + * duplicates page. If it's a Btree leaf page, we have to handle + * on-page duplicates. + * + * If there are duplicates, go to the first/last one. This is + * safe because we know that we're not going to leave the page, + * all duplicate sets that are not on overflow pages exist on a + * single leaf page. + */ + if (TYPE(h) == P_LBTREE) { + if (LF_ISSET(S_DUPLAST)) + while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + h->inp[indx] == h->inp[indx + P_INDX]) + indx += P_INDX; + else + while (indx > 0 && + h->inp[indx] == h->inp[indx - P_INDX]) + indx -= P_INDX; + } + + /* + * Now check if we are allowed to return deleted items; if not, then + * find the next (or previous) non-deleted duplicate entry. (We do + * not move from the original found key on the basis of the S_DELNO + * flag.) + */ + if (LF_ISSET(S_DELNO)) { + deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0; + if (LF_ISSET(S_DUPLAST)) + while (B_DISSET(GET_BKEYDATA( + h, indx + deloffset)->type) && indx > 0 && + h->inp[indx] == h->inp[indx - adjust]) + indx -= adjust; + else + while (B_DISSET(GET_BKEYDATA( + h, indx + deloffset)->type) && + indx < (db_indx_t)(NUM_ENT(h) - adjust) && + h->inp[indx] == h->inp[indx + adjust]) + indx += adjust; + + /* + * If we weren't able to find a non-deleted duplicate, return + * DB_NOTFOUND. + */ + if (B_DISSET(GET_BKEYDATA(h, indx + deloffset)->type)) + goto notfound; + } + + if (LF_ISSET(S_STK_ONLY)) { + BT_STK_NUM(dbp->dbenv, cp, h, indx, ret); + __LPUT(dbc, lock); + (void)memp_fput(dbp->mpf, h, 0); + } else { + BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + } + return (0); + +notfound: + /* Keep the page locked for serializability. */ + (void)memp_fput(dbp->mpf, h, 0); + (void)__TLPUT(dbc, lock); + ret = DB_NOTFOUND; + +err: BT_STK_POP(cp); + __bam_stkrel(dbc, 0); + return (ret); +} + +/* + * __bam_stkrel -- + * Release all pages currently held in the stack. 
+ * + * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t)); + */ +int +__bam_stkrel(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DB *dbp; + EPG *epg; + int ret, t_ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Release inner pages first. + * + * The caller must be sure that setting STK_NOLOCK will not affect + * either serializability or recoverability. + */ + for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) { + if (epg->page != NULL) { + if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) { + cp->page = NULL; + cp->lock.off = LOCK_INVALID; + } + if ((t_ret = memp_fput( + dbp->mpf, epg->page, 0)) != 0 && ret == 0) + ret = t_ret; + /* + * XXX + * Temporary fix for #3243 -- under certain deadlock + * conditions we call here again and re-free the page. + * The correct fix is to never release a stack that + * doesn't hold items. + */ + epg->page = NULL; + } + if (epg->lock.off != LOCK_INVALID) { + if (LF_ISSET(STK_NOLOCK)) + (void)__LPUT(dbc, epg->lock); + else + (void)__TLPUT(dbc, epg->lock); + } + } + + /* Clear the stack; all pages have been released. */ + BT_STK_CLR(cp); + + return (ret); +} + +/* + * __bam_stkgrow -- + * Grow the stack. + * + * PUBLIC: int __bam_stkgrow __P((DB_ENV *, BTREE_CURSOR *)); + */ +int +__bam_stkgrow(dbenv, cp) + DB_ENV *dbenv; + BTREE_CURSOR *cp; +{ + EPG *p; + size_t entries; + int ret; + + entries = cp->esp - cp->sp; + + if ((ret = __os_calloc(dbenv, entries * 2, sizeof(EPG), &p)) != 0) + return (ret); + memcpy(p, cp->sp, entries * sizeof(EPG)); + if (cp->sp != cp->stack) + __os_free(cp->sp, entries * sizeof(EPG)); + cp->sp = p; + cp->csp = p + entries; + cp->esp = p + entries * 2; + return (0); +} diff --git a/db/btree/bt_split.c b/db/btree/bt_split.c new file mode 100644 index 000000000..f76337b19 --- /dev/null +++ b/db/btree/bt_split.c @@ -0,0 +1,1126 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_split.c,v 11.31 2000/12/22 19:08:27 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <limits.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "lock.h" +#include "btree.h" + +static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *)); +static int __bam_page __P((DBC *, EPG *, EPG *)); +static int __bam_pinsert __P((DBC *, EPG *, PAGE *, PAGE *, int)); +static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *)); +static int __bam_root __P((DBC *, EPG *)); +static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *)); + +/* + * __bam_split -- + * Split a page. + * + * PUBLIC: int __bam_split __P((DBC *, void *)); + */ +int +__bam_split(dbc, arg) + DBC *dbc; + void *arg; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + enum { UP, DOWN } dir; + db_pgno_t root_pgno; + int exact, level, ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; + + /* + * The locking protocol we use to avoid deadlock to acquire locks by + * walking down the tree, but we do it as lazily as possible, locking + * the root only as a last resort. We expect all stack pages to have + * been discarded before we're called; we discard all short-term locks. + * + * When __bam_split is first called, we know that a leaf page was too + * full for an insert. We don't know what leaf page it was, but we + * have the key/recno that caused the problem. We call XX_search to + * reacquire the leaf page, but this time get both the leaf page and + * its parent, locked. We then split the leaf page and see if the new + * internal key will fit into the parent page. If it will, we're done. + * + * If it won't, we discard our current locks and repeat the process, + * only this time acquiring the parent page and its parent, locked. + * This process repeats until we succeed in the split, splitting the + * root page as the final resort. The entire process then repeats, + * as necessary, until we split a leaf page. + * + * XXX + * A traditional method of speeding this up is to maintain a stack of + * the pages traversed in the original search. You can detect if the + * stack is correct by storing the page's LSN when it was searched and + * comparing that LSN with the current one when it's locked during the + * split. This would be an easy change for this code, but I have no + * numbers that indicate it's worthwhile. + */ + t = dbp->bt_internal; + for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { + /* + * Acquire a page and its parent, locked. + */ + if ((ret = (dbc->dbtype == DB_BTREE ? + __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) : + __bam_rsearch(dbc, + (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) + return (ret); + + /* + * Split the page if it still needs it (it's possible another + * thread of control has already split the page). 
If we are + * guaranteed that two items will fit on the page, the split + * is no longer necessary. + */ + if (2 * B_MAXSIZEONPAGE(cp->ovflsize) + <= (db_indx_t)P_FREESPACE(cp->csp[0].page)) { + __bam_stkrel(dbc, STK_NOLOCK); + return (0); + } + ret = cp->csp[0].page->pgno == root_pgno ? + __bam_root(dbc, &cp->csp[0]) : + __bam_page(dbc, &cp->csp[-1], &cp->csp[0]); + BT_STK_CLR(cp); + + switch (ret) { + case 0: + /* Once we've split the leaf page, we're done. */ + if (level == LEAFLEVEL) + return (0); + + /* Switch directions. */ + if (dir == UP) + dir = DOWN; + break; + case DB_NEEDSPLIT: + /* + * It's possible to fail to split repeatedly, as other + * threads may be modifying the tree, or the page usage + * is sufficiently bad that we don't get enough space + * the first time. + */ + if (dir == DOWN) + dir = UP; + break; + default: + return (ret); + } + } + /* NOTREACHED */ +} + +/* + * __bam_root -- + * Split the root page of a btree. + */ +static int +__bam_root(dbc, cp) + DBC *dbc; + EPG *cp; +{ + DB *dbp; + DBT log_dbt; + DB_LSN log_lsn; + PAGE *lp, *rp; + db_indx_t split; + u_int32_t opflags; + int ret; + + dbp = dbc->dbp; + + /* Yeah, right. */ + if (cp->page->level >= MAXBTREELEVEL) { + __db_err(dbp->dbenv, + "Too many btree levels: %d", cp->page->level); + ret = ENOSPC; + goto err; + } + + /* Create new left and right pages for the split. */ + lp = rp = NULL; + if ((ret = __db_new(dbc, TYPE(cp->page), &lp)) != 0 || + (ret = __db_new(dbc, TYPE(cp->page), &rp)) != 0) + goto err; + P_INIT(lp, dbp->pgsize, lp->pgno, + PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, + cp->page->level, TYPE(cp->page)); + P_INIT(rp, dbp->pgsize, rp->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID, + cp->page->level, TYPE(cp->page)); + + /* Split the page. */ + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) + goto err; + + /* Log the change. */ + if (DB_LOGGING(dbc)) { + memset(&log_dbt, 0, sizeof(log_dbt)); + log_dbt.data = cp->page; + log_dbt.size = dbp->pgsize; + ZERO_LSN(log_lsn); + opflags = F_ISSET( + (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0; + if ((ret = __bam_split_log(dbp->dbenv, dbc->txn, + &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), + PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &log_lsn, + dbc->internal->root, &log_dbt, opflags)) != 0) + goto err; + LSN(lp) = LSN(cp->page); + LSN(rp) = LSN(cp->page); + } + + /* Clean up the new root page. */ + if ((ret = (dbc->dbtype == DB_RECNO ? + __ram_root(dbc, cp->page, lp, rp) : + __bam_broot(dbc, cp->page, lp, rp))) != 0) + goto err; + + /* Adjust any cursors. */ + if ((ret = __bam_ca_split(dbc, + cp->page->pgno, lp->pgno, rp->pgno, split, 1)) != 0) + goto err; + + /* Success -- write the real pages back to the store. */ + (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + (void)__TLPUT(dbc, cp->lock); + (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); + (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); + + return (0); + +err: if (lp != NULL) + (void)__db_free(dbc, lp); + if (rp != NULL) + (void)__db_free(dbc, rp); + (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__TLPUT(dbc, cp->lock); + return (ret); +} + +/* + * __bam_page -- + * Split the non-root page of a btree. 
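+ *
+ *	(Editor's illustration, not in the original source: __bam_root
+ *	above grows the tree by one level -- the root keeps its page
+ *	number but becomes internal, its old contents moving into two
+ *	new children:
+ *
+ *		[root, full]   =>      [root]
+ *		                       /    \
+ *		                    [lp]    [rp]
+ *
+ *	__bam_page, by contrast, splits sideways and promotes one new
+ *	internal key into the existing parent.)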
+ */
+static int
+__bam_page(dbc, pp, cp)
+	DBC *dbc;
+	EPG *pp, *cp;
+{
+	BTREE_CURSOR *bc;
+	DBT log_dbt;
+	DB_LSN log_lsn;
+	DB *dbp;
+	DB_LOCK tplock;
+	DB_LSN save_lsn;
+	PAGE *lp, *rp, *alloc_rp, *tp;
+	db_indx_t split;
+	u_int32_t opflags;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	alloc_rp = lp = rp = tp = NULL;
+	tplock.off = LOCK_INVALID;
+	ret = -1;
+
+	/*
+	 * Create a new right page for the split, and fill in everything
+	 * except its LSN and page number.
+	 *
+	 * We malloc space for both the left and right pages, so we don't get
+	 * a new page from the underlying buffer pool until we know the split
+	 * is going to succeed.  The reason is that we can't release locks
+	 * acquired during the get-a-new-page process because metadata page
+	 * locks can't be discarded on failure since we may have modified the
+	 * free list.  So, if you assume that we're holding a write lock on the
+	 * leaf page which ran out of space and started this split (e.g., we
+	 * have already written records to the page, or we retrieved a record
+	 * from it with the DB_RMW flag set), failing in a split with both a
+	 * leaf page locked and the metadata page locked can potentially lock
+	 * up the tree badly, because we've violated the rule of always locking
+	 * down the tree, and never up.
+	 */
+	if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &rp)) != 0)
+		goto err;
+	P_INIT(rp, dbp->pgsize, 0,
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page),
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page),
+	    cp->page->level, TYPE(cp->page));
+
+	/*
+	 * Create new left page for the split, and fill in everything
+	 * except its LSN and next-page page number.
+	 */
+	if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &lp)) != 0)
+		goto err;
+	P_INIT(lp, dbp->pgsize, PGNO(cp->page),
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page),
+	    ISINTERNAL(cp->page) ? PGNO_INVALID : 0,
+	    cp->page->level, TYPE(cp->page));
+
+	/*
+	 * Split right.
+	 *
+	 * Only the indices are sorted on the page, i.e., the key/data pairs
+	 * aren't, so it's simpler to copy the data from the split page onto
+	 * two new pages instead of copying half the data to a new right page
+	 * and compacting the left page in place.  Since the left page can't
+	 * change, we swap the original and the allocated left page after the
+	 * split.
+	 */
+	if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0)
+		goto err;
+
+	/*
+	 * Test to see if we are going to be able to insert the new pages into
+	 * the parent page.  The interesting failure here is that the parent
+	 * page can't hold the new keys, and has to be split in turn, in which
+	 * case we want to release all the locks we can.
+	 */
+	if ((ret = __bam_pinsert(dbc, pp, lp, rp, 1)) != 0)
+		goto err;
+
+	/*
+	 * Fix up the previous pointer of any leaf page following the split
+	 * page.
+	 *
+	 * There are interesting deadlock situations here as we try to
+	 * write-lock a page that's not in our direct ancestry.  Consider a
+	 * cursor walking backward through the leaf pages, that has our
+	 * following page locked, and is waiting on a lock for the page we're
+	 * splitting.  In that case we're going to deadlock here.  It's
+	 * probably OK, stepping backward through the tree isn't a common
+	 * operation.
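+	 *
+	 * (Editor's note, not in the original source: the cycle is the
+	 * classic two-party deadlock -- we hold the page being split and
+	 * want its successor, while the backward-walking cursor holds
+	 * the successor and waits for the page being split.)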
+	 */
+	if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID) {
+		if ((ret = __db_lget(dbc,
+		    0, NEXT_PGNO(cp->page), DB_LOCK_WRITE, 0, &tplock)) != 0)
+			goto err;
+		if ((ret =
+		    memp_fget(dbp->mpf, &NEXT_PGNO(cp->page), 0, &tp)) != 0)
+			goto err;
+	}
+
+	/*
+	 * We've got everything locked down we need, and we know the split
+	 * is going to succeed.  Go and get the additional page we'll need.
+	 */
+	if ((ret = __db_new(dbc, TYPE(cp->page), &alloc_rp)) != 0)
+		goto err;
+
+	/*
+	 * Fix up the page numbers we didn't have before.  We have to do this
+	 * before calling __bam_pinsert because it may copy a page number onto
+	 * the parent page and it takes the page number from its page argument.
+	 */
+	PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp);
+
+	/* Actually update the parent page. */
+	if ((ret = __bam_pinsert(dbc, pp, lp, rp, 0)) != 0)
+		goto err;
+
+	bc = (BTREE_CURSOR *)dbc->internal;
+	/* Log the change. */
+	if (DB_LOGGING(dbc)) {
+		memset(&log_dbt, 0, sizeof(log_dbt));
+		log_dbt.data = cp->page;
+		log_dbt.size = dbp->pgsize;
+		if (tp == NULL)
+			ZERO_LSN(log_lsn);
+		opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0;
+		if ((ret = __bam_split_log(dbp->dbenv, dbc->txn,
+		    &LSN(cp->page), 0, dbp->log_fileid, PGNO(cp->page),
+		    &LSN(cp->page), PGNO(alloc_rp), &LSN(alloc_rp),
+		    (u_int32_t)NUM_ENT(lp),
+		    tp == NULL ? 0 : PGNO(tp),
+		    tp == NULL ? &log_lsn : &LSN(tp),
+		    bc->root, &log_dbt, opflags)) != 0)
+			goto err;
+
+		/* Update the LSNs for all involved pages. */
+		LSN(alloc_rp) = LSN(cp->page);
+		LSN(lp) = LSN(cp->page);
+		LSN(rp) = LSN(cp->page);
+		if (tp != NULL)
+			LSN(tp) = LSN(cp->page);
+	}
+
+	/*
+	 * Copy the left and right pages into place.  There are two paths
+	 * through here: if we are logging, the LSNs were set correctly in
+	 * the logging code above; if we are not logging, lp and rp do not
+	 * have valid LSNs, and the correct ones to use are the LSN on the
+	 * page we got from __db_new and the LSN originally on cp->page.  In
+	 * both cases, we save the LSN from the real database page (not a
+	 * malloc'd one) and reapply it after we do the copy.
+	 */
+	save_lsn = alloc_rp->lsn;
+	memcpy(alloc_rp, rp, LOFFSET(rp));
+	memcpy((u_int8_t *)alloc_rp + HOFFSET(rp),
+	    (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp));
+	alloc_rp->lsn = save_lsn;
+
+	save_lsn = cp->page->lsn;
+	memcpy(cp->page, lp, LOFFSET(lp));
+	memcpy((u_int8_t *)cp->page + HOFFSET(lp),
+	    (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp));
+	cp->page->lsn = save_lsn;
+
+	/* Fix up the next-page link. */
+	if (tp != NULL)
+		PREV_PGNO(tp) = PGNO(rp);
+
+	/* Adjust any cursors. */
+	if ((ret = __bam_ca_split(dbc,
+	    PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0)
+		goto err;
+
+	__os_free(lp, dbp->pgsize);
+	__os_free(rp, dbp->pgsize);
+
+	/*
+	 * Success -- write the real pages back to the store.  As we never
+	 * acquired any sort of lock on the new page, we release it before
+	 * releasing locks on the pages that reference it.  We're finished
+	 * modifying the page so it's not really necessary, but it's neater.
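+	 *
+	 * (Editor's note, not in the original source: "not really
+	 * necessary" because no other thread can reach alloc_rp yet; we
+	 * never locked it, and its page number became visible only when
+	 * it was stored into lp and rp above.)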
+ */ + if ((t_ret = + memp_fput(dbp->mpf, alloc_rp, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = + memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + (void)__TLPUT(dbc, pp->lock); + if ((t_ret = + memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + (void)__TLPUT(dbc, cp->lock); + if (tp != NULL) { + if ((t_ret = + memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + (void)__TLPUT(dbc, tplock); + } + return (ret); + +err: if (lp != NULL) + __os_free(lp, dbp->pgsize); + if (rp != NULL) + __os_free(rp, dbp->pgsize); + if (alloc_rp != NULL) + (void)__db_free(dbc, alloc_rp); + + if (tp != NULL) + (void)memp_fput(dbp->mpf, tp, 0); + if (tplock.off != LOCK_INVALID) + /* We never updated the next page, we can release it. */ + (void)__LPUT(dbc, tplock); + + (void)memp_fput(dbp->mpf, pp->page, 0); + if (ret == DB_NEEDSPLIT) + (void)__LPUT(dbc, pp->lock); + else + (void)__TLPUT(dbc, pp->lock); + + (void)memp_fput(dbp->mpf, cp->page, 0); + if (ret == DB_NEEDSPLIT) + (void)__LPUT(dbc, cp->lock); + else + (void)__TLPUT(dbc, cp->lock); + + return (ret); +} + +/* + * __bam_broot -- + * Fix up the btree root page after it has been split. + */ +static int +__bam_broot(dbc, rootp, lp, rp) + DBC *dbc; + PAGE *rootp, *lp, *rp; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk; + BTREE_CURSOR *cp; + DB *dbp; + DBT hdr, data; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * If the root page was a leaf page, change it into an internal page. + * We copy the key we split on (but not the key's data, in the case of + * a leaf page) to the new root page. + */ + root_pgno = cp->root; + P_INIT(rootp, dbp->pgsize, + root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE); + + memset(&data, 0, sizeof(data)); + memset(&hdr, 0, sizeof(hdr)); + + /* + * The btree comparison code guarantees that the left-most key on any + * internal btree page is never used, so it doesn't need to be filled + * in. Set the record count if necessary. + */ + memset(&bi, 0, sizeof(bi)); + bi.len = 0; + B_TSET(bi.type, B_KEYDATA, 0); + bi.pgno = lp->pgno; + if (F_ISSET(cp, C_RECNUM)) { + bi.nrecs = __bam_total(lp); + RE_NREC_SET(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + if ((ret = + __db_pitem(dbc, rootp, 0, BINTERNAL_SIZE(0), &hdr, NULL)) != 0) + return (ret); + + switch (TYPE(rp)) { + case P_IBTREE: + /* Copy the first key of the child page onto the root page. */ + child_bi = GET_BINTERNAL(rp, 0); + + bi.len = child_bi->len; + B_TSET(bi.type, child_bi->type, 0); + bi.pgno = rp->pgno; + if (F_ISSET(cp, C_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bi->data; + data.size = child_bi->len; + if ((ret = __db_pitem(dbc, rootp, 1, + BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (B_TYPE(child_bi->type) == B_OVERFLOW) + if ((ret = __db_ovref(dbc, + ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) + return (ret); + break; + case P_LDUP: + case P_LBTREE: + /* Copy the first key of the child page onto the root page. 
*/ + child_bk = GET_BKEYDATA(rp, 0); + switch (B_TYPE(child_bk->type)) { + case B_KEYDATA: + bi.len = child_bk->len; + B_TSET(bi.type, child_bk->type, 0); + bi.pgno = rp->pgno; + if (F_ISSET(cp, C_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bk->data; + data.size = child_bk->len; + if ((ret = __db_pitem(dbc, rootp, 1, + BINTERNAL_SIZE(child_bk->len), &hdr, &data)) != 0) + return (ret); + break; + case B_DUPLICATE: + case B_OVERFLOW: + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, child_bk->type, 0); + bi.pgno = rp->pgno; + if (F_ISSET(cp, C_RECNUM)) { + bi.nrecs = __bam_total(rp); + RE_NREC_ADJ(rootp, bi.nrecs); + } + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + data.data = child_bk; + data.size = BOVERFLOW_SIZE; + if ((ret = __db_pitem(dbc, rootp, 1, + BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (B_TYPE(child_bk->type) == B_OVERFLOW) + if ((ret = __db_ovref(dbc, + ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rp->pgno)); + } + break; + default: + return (__db_pgfmt(dbp, rp->pgno)); + } + return (0); +} + +/* + * __ram_root -- + * Fix up the recno root page after it has been split. + */ +static int +__ram_root(dbc, rootp, lp, rp) + DBC *dbc; + PAGE *rootp, *lp, *rp; +{ + DB *dbp; + DBT hdr; + RINTERNAL ri; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + root_pgno = dbc->internal->root; + + /* Initialize the page. */ + P_INIT(rootp, dbp->pgsize, + root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); + + /* Initialize the header. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &ri; + hdr.size = RINTERNAL_SIZE; + + /* Insert the left and right keys, set the header information. */ + ri.pgno = lp->pgno; + ri.nrecs = __bam_total(lp); + if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_SET(rootp, ri.nrecs); + ri.pgno = rp->pgno; + ri.nrecs = __bam_total(rp); + if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_ADJ(rootp, ri.nrecs); + return (0); +} + +/* + * __bam_pinsert -- + * Insert a new key into a parent page, completing the split. + */ +static int +__bam_pinsert(dbc, parent, lchild, rchild, space_check) + DBC *dbc; + EPG *parent; + PAGE *lchild, *rchild; + int space_check; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk, *tmp_bk; + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DBT a, b, hdr, data; + PAGE *ppage; + RINTERNAL ri; + db_indx_t off; + db_recno_t nrecs; + size_t (*func) __P((DB *, const DBT *, const DBT *)); + u_int32_t n, nbytes, nksize; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + t = dbp->bt_internal; + ppage = parent->page; + + /* If handling record numbers, count records split to the right page. */ + nrecs = F_ISSET(cp, C_RECNUM) && !space_check ? __bam_total(rchild) : 0; + + /* + * Now we insert the new page's first key into the parent page, which + * completes the split. The parent points to a PAGE and a page index + * offset, where the new key goes ONE AFTER the index, because we split + * to the right. + * + * XXX + * Some btree algorithms replace the key for the old page as well as + * the new page. 
We don't, as there's no reason to believe that the + * first key on the old page is any better than the key we have, and, + * in the case of a key being placed at index 0 causing the split, the + * key is unavailable. + */ + off = parent->indx + O_INDX; + + /* + * Calculate the space needed on the parent page. + * + * Prefix trees: space hack used when inserting into BINTERNAL pages. + * Retain only what's needed to distinguish between the new entry and + * the LAST entry on the page to its left. If the keys compare equal, + * retain the entire key. We ignore overflow keys, and the entire key + * must be retained for the next-to-leftmost key on the leftmost page + * of each level, or the search will fail. Applicable ONLY to internal + * pages that have leaf pages as children. Further reduction of the + * key between pairs of internal pages loses too much information. + */ + switch (TYPE(rchild)) { + case P_IBTREE: + child_bi = GET_BINTERNAL(rchild, 0); + nbytes = BINTERNAL_PSIZE(child_bi->len); + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + if (space_check) + return (0); + + /* Add a new record for the right page. */ + memset(&bi, 0, sizeof(bi)); + bi.len = child_bi->len; + B_TSET(bi.type, child_bi->type, 0); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bi->data; + data.size = child_bi->len; + if ((ret = __db_pitem(dbc, ppage, off, + BINTERNAL_SIZE(child_bi->len), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (B_TYPE(child_bi->type) == B_OVERFLOW) + if ((ret = __db_ovref(dbc, + ((BOVERFLOW *)(child_bi->data))->pgno, 1)) != 0) + return (ret); + break; + case P_LDUP: + case P_LBTREE: + child_bk = GET_BKEYDATA(rchild, 0); + switch (B_TYPE(child_bk->type)) { + case B_KEYDATA: + /* + * We set t->bt_prefix to NULL if we have a comparison + * callback but no prefix compression callback. But, + * if we're splitting in an off-page duplicates tree, + * we still have to do some checking. If using the + * default off-page duplicates comparison routine we + * can use the default prefix compression callback. If + * not using the default off-page duplicates comparison + * routine, we can't do any kind of prefix compression + * as there's no way for an application to specify a + * prefix compression callback that corresponds to its + * comparison callback. + */ + if (F_ISSET(dbc, DBC_OPD)) { + if (dbp->dup_compare == __bam_defcmp) + func = __bam_defpfx; + else + func = NULL; + } else + func = t->bt_prefix; + + nbytes = BINTERNAL_PSIZE(child_bk->len); + nksize = child_bk->len; + if (func == NULL) + goto noprefix; + if (ppage->prev_pgno == PGNO_INVALID && off <= 1) + goto noprefix; + tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - + (TYPE(lchild) == P_LDUP ? 
O_INDX : P_INDX)); + if (B_TYPE(tmp_bk->type) != B_KEYDATA) + goto noprefix; + memset(&a, 0, sizeof(a)); + a.size = tmp_bk->len; + a.data = tmp_bk->data; + memset(&b, 0, sizeof(b)); + b.size = child_bk->len; + b.data = child_bk->data; + nksize = func(dbp, &a, &b); + if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) + nbytes = n; + else +noprefix: nksize = child_bk->len; + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + if (space_check) + return (0); + + memset(&bi, 0, sizeof(bi)); + bi.len = nksize; + B_TSET(bi.type, child_bk->type, 0); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bk->data; + data.size = nksize; + if ((ret = __db_pitem(dbc, ppage, off, + BINTERNAL_SIZE(nksize), &hdr, &data)) != 0) + return (ret); + break; + case B_DUPLICATE: + case B_OVERFLOW: + nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + if (space_check) + return (0); + + memset(&bi, 0, sizeof(bi)); + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, child_bk->type, 0); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + memset(&data, 0, sizeof(data)); + data.data = child_bk; + data.size = BOVERFLOW_SIZE; + if ((ret = __db_pitem(dbc, ppage, off, + BINTERNAL_SIZE(BOVERFLOW_SIZE), &hdr, &data)) != 0) + return (ret); + + /* Increment the overflow ref count. */ + if (B_TYPE(child_bk->type) == B_OVERFLOW) + if ((ret = __db_ovref(dbc, + ((BOVERFLOW *)child_bk)->pgno, 1)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rchild->pgno)); + } + break; + case P_IRECNO: + case P_LRECNO: + nbytes = RINTERNAL_PSIZE; + + if (P_FREESPACE(ppage) < nbytes) + return (DB_NEEDSPLIT); + if (space_check) + return (0); + + /* Add a new record for the right page. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &ri; + hdr.size = RINTERNAL_SIZE; + ri.pgno = rchild->pgno; + ri.nrecs = nrecs; + if ((ret = __db_pitem(dbc, + ppage, off, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp, rchild->pgno)); + } + + /* + * If a Recno or Btree with record numbers AM page, or an off-page + * duplicates tree, adjust the parent page's left page record count. + */ + if (F_ISSET(cp, C_RECNUM)) { + /* Log the change. */ + if (DB_LOGGING(dbc) && + (ret = __bam_cadjust_log(dbp->dbenv, dbc->txn, + &LSN(ppage), 0, dbp->log_fileid, PGNO(ppage), + &LSN(ppage), parent->indx, -(int32_t)nrecs, 0)) != 0) + return (ret); + + /* Update the left page count. */ + if (dbc->dbtype == DB_RECNO) + GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + else + GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + } + + return (0); +} + +/* + * __bam_psplit -- + * Do the real work of splitting the page. + */ +static int +__bam_psplit(dbc, cp, lp, rp, splitret) + DBC *dbc; + EPG *cp; + PAGE *lp, *rp; + db_indx_t *splitret; +{ + DB *dbp; + PAGE *pp; + db_indx_t half, nbytes, off, splitp, top; + int adjust, cnt, iflag, isbigkey, ret; + + dbp = dbc->dbp; + pp = cp->page; + adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; + + /* + * If we're splitting the first (last) page on a level because we're + * inserting (appending) a key to it, it's likely that the data is + * sorted. Moving a single item to the new page is less work and can + * push the fill factor higher than normal. 
If we're wrong it's not
+	 * a big deal, we'll just do the split the right way next time.
+	 */
+	off = 0;
+	if (NEXT_PGNO(pp) == PGNO_INVALID &&
+	    ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) ||
+	    (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page))))
+		off = NUM_ENT(cp->page) - adjust;
+	else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0)
+		off = adjust;
+
+	if (off != 0)
+		goto sort;
+
+	/*
+	 * Split the data to the left and right pages.  Try not to split on
+	 * an overflow key.  (Overflow keys on internal pages will slow down
+	 * searches.)  Refuse to split in the middle of a set of duplicates.
+	 *
+	 * First, find the optimum place to split.
+	 *
+	 * It's possible to try and split past the last record on the page if
+	 * there's a very large record at the end of the page.  Make sure this
+	 * doesn't happen by bounding the check at the next-to-last entry on
+	 * the page.
+	 *
+	 * Note that we try to split half the data present on the page.  This
+	 * is because another process may have already split the page and left
+	 * it half empty.  We don't try to skip the split -- we don't know
+	 * how much space we're going to need on the page, and we may need up
+	 * to half the page for a big item, so there's no easy test to decide
+	 * if we need to split or not.  Besides, if two threads are inserting
+	 * data into the same place in the database, we're probably going to
+	 * need more space soon anyway.
+	 */
+	top = NUM_ENT(pp) - adjust;
+	half = (dbp->pgsize - HOFFSET(pp)) / 2;
+	for (nbytes = 0, off = 0; off < top && nbytes < half; ++off)
+		switch (TYPE(pp)) {
+		case P_IBTREE:
+			if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA)
+				nbytes +=
+				   BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len);
+			else
+				nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE);
+			break;
+		case P_LBTREE:
+			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
+				nbytes +=
+				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+			else
+				nbytes += BOVERFLOW_SIZE;
+
+			++off;
+			/* FALLTHROUGH */
+		case P_LDUP:
+		case P_LRECNO:
+			if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)
+				nbytes +=
+				    BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len);
+			else
+				nbytes += BOVERFLOW_SIZE;
+			break;
+		case P_IRECNO:
+			nbytes += RINTERNAL_SIZE;
+			break;
+		default:
+			return (__db_pgfmt(dbp, pp->pgno));
+		}
+sort:	splitp = off;
+
+	/*
+	 * Splitp is either at or just past the optimum split point.  If the
+	 * tree type is such that we're going to promote a key to an internal
+	 * page, and our current choice is an overflow key, look for something
+	 * close by that's smaller.
+	 */
+	switch (TYPE(pp)) {
+	case P_IBTREE:
+		iflag = 1;
+		isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA;
+		break;
+	case P_LBTREE:
+	case P_LDUP:
+		iflag = 0;
+		isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA;
+		break;
+	default:
+		iflag = isbigkey = 0;
+	}
+	if (isbigkey)
+		for (cnt = 1; cnt <= 3; ++cnt) {
+			off = splitp + cnt * adjust;
+			if (off < (db_indx_t)NUM_ENT(pp) &&
+			    ((iflag &&
+			    B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) ||
+			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) {
+				splitp = off;
+				break;
+			}
+			if (splitp <= (db_indx_t)(cnt * adjust))
+				continue;
+			off = splitp - cnt * adjust;
+			if (iflag ?
+			    B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA :
+			    B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) {
+				splitp = off;
+				break;
+			}
+		}
+
+	/*
+	 * We can't split in the middle of a set of duplicates.  We know that
+	 * no duplicate set can take up more than about 25% of the page,
+	 * because that's the point where we push it off onto a duplicate
+	 * page set. 
So, this loop can't be unbounded. + */ + if (TYPE(pp) == P_LBTREE && + pp->inp[splitp] == pp->inp[splitp - adjust]) + for (cnt = 1;; ++cnt) { + off = splitp + cnt * adjust; + if (off < NUM_ENT(pp) && + pp->inp[splitp] != pp->inp[off]) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (pp->inp[splitp] != pp->inp[off]) { + splitp = off + adjust; + break; + } + } + + /* We're going to split at splitp. */ + if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0) + return (ret); + if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) + return (ret); + + *splitret = splitp; + return (0); +} + +/* + * __bam_copy -- + * Copy a set of records from one page to another. + * + * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); + */ +int +__bam_copy(dbp, pp, cp, nxt, stop) + DB *dbp; + PAGE *pp, *cp; + u_int32_t nxt, stop; +{ + db_indx_t nbytes, off; + + /* + * Copy the rest of the data to the right page. Nxt is the next + * offset placed on the target page. + */ + for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { + switch (TYPE(pp)) { + case P_IBTREE: + if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA) + nbytes = + BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len); + else + nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + /* + * If we're on a key and it's a duplicate, just copy + * the offset. + */ + if (off != 0 && (nxt % P_INDX) == 0 && + pp->inp[nxt] == pp->inp[nxt - P_INDX]) { + cp->inp[off] = cp->inp[off - P_INDX]; + continue; + } + /* FALLTHROUGH */ + case P_LDUP: + case P_LRECNO: + if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA) + nbytes = + BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len); + else + nbytes = BOVERFLOW_SIZE; + break; + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + default: + return (__db_pgfmt(dbp, pp->pgno)); + } + cp->inp[off] = HOFFSET(cp) -= nbytes; + memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes); + } + return (0); +} diff --git a/db/btree/bt_stat.c b/db/btree/bt_stat.c new file mode 100644 index 000000000..349bb40cf --- /dev/null +++ b/db/btree/bt_stat.c @@ -0,0 +1,480 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_stat.c,v 11.29 2000/11/28 21:42:27 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "lock.h" +#include "btree.h" + +/* + * __bam_stat -- + * Gather/print the btree statistics + * + * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); + */ +int +__bam_stat(dbp, spp, db_malloc, flags) + DB *dbp; + void *spp; + void *(*db_malloc) __P((size_t)); + u_int32_t flags; +{ + BTMETA *meta; + BTREE *t; + BTREE_CURSOR *cp; + DBC *dbc; + DB_BTREE_STAT *sp; + DB_LOCK lock, metalock; + PAGE *h; + db_pgno_t pgno; + int ret, t_ret; + + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat"); + + meta = NULL; + t = dbp->bt_internal; + sp = NULL; + metalock.off = lock.off = LOCK_INVALID; + h = NULL; + ret = 0; + + /* Check for invalid flags. */ + if ((ret = __db_statchk(dbp, flags)) != 0) + return (ret); + + /* Acquire a cursor. 
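+	 *
+	 * (Editor's sketch, not in the original source: the application
+	 * entry point this serves is DB->stat, e.g.
+	 *
+	 *	DB_BTREE_STAT *sp;
+	 *	if ((ret = dbp->stat(dbp, &sp, NULL, 0)) == 0)
+	 *		printf("keys: %lu\n", (u_long)sp->bt_nkeys);
+	 *
+	 * where the NULL db_malloc selects the default allocator, exactly
+	 * as it is passed through to __os_malloc below.)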
*/ + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + cp = (BTREE_CURSOR *)dbc->internal; + + DEBUG_LWRITE(dbc, NULL, "bam_stat", NULL, NULL, flags); + + /* Allocate and clear the structure. */ + if ((ret = __os_malloc(dbp->dbenv, sizeof(*sp), db_malloc, &sp)) != 0) + goto err; + memset(sp, 0, sizeof(*sp)); + + /* If the app just wants the record count, make it fast. */ + if (flags == DB_RECORDCOUNT) { + if ((ret = __db_lget(dbc, 0, + cp->root, DB_LOCK_READ, 0, &lock)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, + &cp->root, 0, (PAGE **)&h)) != 0) + goto err; + + sp->bt_nkeys = RE_NREC(h); + + goto done; + } + if (flags == DB_CACHED_COUNTS) { + if ((ret = __db_lget(dbc, + 0, t->bt_meta, DB_LOCK_READ, 0, &lock)) != 0) + goto err; + if ((ret = + memp_fget(dbp->mpf, &t->bt_meta, 0, (PAGE **)&meta)) != 0) + goto err; + sp->bt_nkeys = meta->dbmeta.key_count; + sp->bt_ndata = meta->dbmeta.record_count; + + goto done; + } + + /* Get the metadata page for the entire database. */ + pgno = PGNO_BASE_MD; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) + goto err; + + /* Walk the metadata free list, counting pages. */ + for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) { + ++sp->bt_free; + + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + + pgno = h->next_pgno; + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + goto err; + h = NULL; + } + + /* Get the root page. */ + pgno = cp->root; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + + /* Get the levels from the root page. */ + sp->bt_levels = h->level; + + /* Discard the root page. */ + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + goto err; + h = NULL; + __LPUT(dbc, lock); + + /* Walk the tree. */ + if ((ret = __bam_traverse(dbc, + DB_LOCK_READ, cp->root, __bam_stat_callback, sp)) != 0) + goto err; + + /* + * Get the subdatabase metadata page if it's not the same as the + * one we already have. + */ + if (t->bt_meta != PGNO_BASE_MD || !F_ISSET(dbp, DB_AM_RDONLY)) { + if ((ret = memp_fput(dbp->mpf, meta, 0)) != 0) + goto err; + meta = NULL; + __LPUT(dbc, metalock); + + if ((ret = __db_lget(dbc, + 0, t->bt_meta, F_ISSET(dbp, DB_AM_RDONLY) ? + DB_LOCK_READ : DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + if ((ret = + memp_fget(dbp->mpf, &t->bt_meta, 0, (PAGE **)&meta)) != 0) + goto err; + } + + /* Get metadata page statistics. */ + sp->bt_metaflags = meta->dbmeta.flags; + sp->bt_maxkey = meta->maxkey; + sp->bt_minkey = meta->minkey; + sp->bt_re_len = meta->re_len; + sp->bt_re_pad = meta->re_pad; + sp->bt_pagesize = meta->dbmeta.pagesize; + sp->bt_magic = meta->dbmeta.magic; + sp->bt_version = meta->dbmeta.version; + if (!F_ISSET(dbp, DB_AM_RDONLY)) { + meta->dbmeta.key_count = sp->bt_nkeys; + meta->dbmeta.record_count = sp->bt_ndata; + } + + /* Discard the metadata page. */ + if ((ret = memp_fput(dbp->mpf, + meta, F_ISSET(dbp, DB_AM_RDONLY) ? 
0 : DB_MPOOL_DIRTY)) != 0) + goto err; + meta = NULL; + __LPUT(dbc, metalock); + +done: *(DB_BTREE_STAT **)spp = sp; + + if (0) { +err: if (sp != NULL) + __os_free(sp, sizeof(*sp)); + } + + if (h != NULL && + (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) + ret = t_ret; + + if (meta != NULL && + (t_ret = memp_fput(dbp->mpf, meta, 0)) != 0 && ret == 0) + ret = t_ret; + + if (lock.off != LOCK_INVALID) + __LPUT(dbc, lock); + + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bam_traverse -- + * Walk a Btree database. + * + * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t, + * PUBLIC: db_pgno_t, int (*)(DB *, PAGE *, void *, int *), void *)); + */ +int +__bam_traverse(dbc, mode, root_pgno, callback, cookie) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t root_pgno; + int (*callback)__P((DB *, PAGE *, void *, int *)); + void *cookie; +{ + BINTERNAL *bi; + BKEYDATA *bk; + DB *dbp; + DB_LOCK lock; + PAGE *h; + RINTERNAL *ri; + db_indx_t indx; + int already_put, ret, t_ret; + + dbp = dbc->dbp; + + if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0) + return (ret); + if ((ret = memp_fget(dbp->mpf, &root_pgno, 0, &h)) != 0) + goto err; + + switch (TYPE(h)) { + case P_IBTREE: + for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { + bi = GET_BINTERNAL(h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbp, + ((BOVERFLOW *)bi->data)->pgno, + callback, cookie)) != 0) + goto err; + if ((ret = __bam_traverse( + dbc, mode, bi->pgno, callback, cookie)) != 0) + break; + } + break; + case P_IRECNO: + for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { + ri = GET_RINTERNAL(h, indx); + if ((ret = __bam_traverse( + dbc, mode, ri->pgno, callback, cookie)) != 0) + break; + } + break; + case P_LBTREE: + for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) { + bk = GET_BKEYDATA(h, indx); + if (B_TYPE(bk->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbp, + GET_BOVERFLOW(h, indx)->pgno, + callback, cookie)) != 0) + goto err; + bk = GET_BKEYDATA(h, indx + O_INDX); + if (B_TYPE(bk->type) == B_DUPLICATE && + (ret = __bam_traverse(dbc, mode, + GET_BOVERFLOW(h, indx + O_INDX)->pgno, + callback, cookie)) != 0) + goto err; + if (B_TYPE(bk->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbp, + GET_BOVERFLOW(h, indx + O_INDX)->pgno, + callback, cookie)) != 0) + goto err; + } + break; + case P_LDUP: + case P_LRECNO: + for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { + bk = GET_BKEYDATA(h, indx); + if (B_TYPE(bk->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbp, + GET_BOVERFLOW(h, indx)->pgno, + callback, cookie)) != 0) + goto err; + } + break; + } + + already_put = 0; + if ((ret = callback(dbp, h, cookie, &already_put)) != 0) + goto err; + +err: if (!already_put && + (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret != 0) + ret = t_ret; + __LPUT(dbc, lock); + + return (ret); +} + +/* + * __bam_stat_callback -- + * Statistics callback. + * + * PUBLIC: int __bam_stat_callback __P((DB *, PAGE *, void *, int *)); + */ +int +__bam_stat_callback(dbp, h, cookie, putp) + DB *dbp; + PAGE *h; + void *cookie; + int *putp; +{ + DB_BTREE_STAT *sp; + db_indx_t indx, top; + u_int8_t type; + + sp = cookie; + *putp = 0; + top = NUM_ENT(h); + + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + ++sp->bt_int_pg; + sp->bt_int_pgfree += P_FREESPACE(h); + break; + case P_LBTREE: + /* Correct for on-page duplicates and deleted items. 
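+		 *
+		 * (Editor's note, not in the original source: adjacent
+		 * equal inp[] offsets are one key stored once, so a key
+		 * is counted only at its final duplicate; deleted items
+		 * and B_DUPLICATE references are kept out of bt_ndata.)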
*/ + for (indx = 0; indx < top; indx += P_INDX) { + if (indx + P_INDX >= top || + h->inp[indx] != h->inp[indx + P_INDX]) + ++sp->bt_nkeys; + + type = GET_BKEYDATA(h, indx + O_INDX)->type; + if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE) + ++sp->bt_ndata; + } + + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += P_FREESPACE(h); + break; + case P_LRECNO: + /* + * If walking a recno tree, then each of these items is a key. + * Otherwise, we're walking an off-page duplicate set. + */ + if (dbp->type == DB_RECNO) { + sp->bt_nkeys += top; + + /* + * Correct for deleted items in non-renumbering + * Recno databases. + */ + if (F_ISSET(dbp, DB_RE_RENUMBER)) + sp->bt_ndata += top; + else + for (indx = 0; indx < top; indx += O_INDX) { + type = GET_BKEYDATA(h, indx)->type; + if (!B_DISSET(type)) + ++sp->bt_ndata; + } + + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += P_FREESPACE(h); + } else { + sp->bt_ndata += top; + + ++sp->bt_dup_pg; + sp->bt_dup_pgfree += P_FREESPACE(h); + } + break; + case P_LDUP: + /* Correct for deleted items. */ + for (indx = 0; indx < top; indx += O_INDX) + if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) + ++sp->bt_ndata; + + ++sp->bt_dup_pg; + sp->bt_dup_pgfree += P_FREESPACE(h); + break; + case P_OVERFLOW: + ++sp->bt_over_pg; + sp->bt_over_pgfree += P_OVFLSPACE(dbp->pgsize, h); + break; + default: + return (__db_pgfmt(dbp, h->pgno)); + } + return (0); +} + +/* + * __bam_key_range -- + * Return proportion of keys relative to given key. The numbers are + * slightly skewed due to on page duplicates. + * + * PUBLIC: int __bam_key_range __P((DB *, + * PUBLIC: DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t)); + */ +int +__bam_key_range(dbp, txn, dbt, kp, flags) + DB *dbp; + DB_TXN *txn; + DBT *dbt; + DB_KEY_RANGE *kp; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DBC *dbc; + EPG *sp; + double factor; + int exact, ret, t_ret; + + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range"); + + if (flags != 0) + return (__db_ferr(dbp->dbenv, "DB->key_range", 0)); + + /* Acquire a cursor. */ + if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0); + + if ((ret = __bam_search(dbc, dbt, S_STK_ONLY, 1, NULL, &exact)) != 0) + goto err; + + cp = (BTREE_CURSOR *)dbc->internal; + kp->less = kp->greater = 0.0; + + factor = 1.0; + /* Correct the leaf page. */ + cp->csp->entries /= 2; + cp->csp->indx /= 2; + for (sp = cp->sp; sp <= cp->csp; ++sp) { + /* + * At each level we know that pages greater than indx contain + * keys greater than what we are looking for and those less + * than indx are less than. The one pointed to by indx may + * have some less, some greater or even equal. If indx is + * equal to the number of entries, then the key is out of range + * and everything is less. + */ + if (sp->indx == 0) + kp->greater += factor * (sp->entries - 1)/sp->entries; + else if (sp->indx == sp->entries) + kp->less += factor; + else { + kp->less += factor * sp->indx / sp->entries; + kp->greater += factor * + (sp->entries - sp->indx - 1) / sp->entries; + } + factor *= 1.0/sp->entries; + } + + /* + * If there was an exact match then assign 1 n'th to the key itself. + * Otherwise that factor belongs to those greater than the key, unless + * the key was out of range. 
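+	 *
+	 * (Editor's worked example, not in the original source: in a
+	 * two-level tree with 10 root entries, root indx 4, and -- after
+	 * the halving above -- 20 leaf entries with leaf indx 10:
+	 *
+	 *	less    = 4/10 + (1/10)(10/20) = 0.45
+	 *	greater = 5/10 + (1/10)(9/20)  = 0.545
+	 *	equal   = (1/10)(1/20)         = 0.005 on an exact match,
+	 *
+	 * and the three proportions sum to 1.)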
+	 */
+	if (exact)
+		kp->equal = factor;
+	else {
+		if (kp->less != 1)
+			kp->greater += factor;
+		kp->equal = 0;
+	}
+
+	BT_STK_CLR(cp);
+
+err:	if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
diff --git a/db/btree/bt_upgrade.c b/db/btree/bt_upgrade.c
new file mode 100644
index 000000000..4032dba3b
--- /dev/null
+++ b/db/btree/bt_upgrade.c
@@ -0,0 +1,164 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ */
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: bt_upgrade.c,v 11.19 2000/11/30 00:58:29 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <limits.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_swap.h"
+#include "btree.h"
+#include "db_am.h"
+#include "db_upgrade.h"
+
+/*
+ * __bam_30_btreemeta --
+ *	Upgrade the metadata pages from version 6 to version 7.
+ *
+ * PUBLIC: int __bam_30_btreemeta __P((DB *, char *, u_int8_t *));
+ */
+int
+__bam_30_btreemeta(dbp, real_name, buf)
+	DB *dbp;
+	char *real_name;
+	u_int8_t *buf;
+{
+	BTMETA30 *newmeta;
+	BTMETA2X *oldmeta;
+	DB_ENV *dbenv;
+	int ret;
+
+	dbenv = dbp->dbenv;
+
+	newmeta = (BTMETA30 *)buf;
+	oldmeta = (BTMETA2X *)buf;
+
+	/*
+	 * Move things from the end up, so we do not overwrite things.
+	 * We are going to create a new uid, so we can move the stuff
+	 * at the end of the structure first, overwriting the uid.
+	 */
+
+	newmeta->re_pad = oldmeta->re_pad;
+	newmeta->re_len = oldmeta->re_len;
+	newmeta->minkey = oldmeta->minkey;
+	newmeta->maxkey = oldmeta->maxkey;
+	newmeta->dbmeta.free = oldmeta->free;
+	newmeta->dbmeta.flags = oldmeta->flags;
+	newmeta->dbmeta.type = P_BTREEMETA;
+
+	newmeta->dbmeta.version = 7;
+	/* Replace the unique ID. */
+	if ((ret = __os_fileid(dbenv, real_name, 1, buf + 36)) != 0)
+		return (ret);
+
+	newmeta->root = 1;
+
+	return (0);
+}
+
+/*
+ * __bam_31_btreemeta --
+ *	Upgrade the database from version 7 to version 8.
+ *
+ * PUBLIC: int __bam_31_btreemeta
+ * PUBLIC:      __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *));
+ */
+int
+__bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp)
+	DB *dbp;
+	char *real_name;
+	u_int32_t flags;
+	DB_FH *fhp;
+	PAGE *h;
+	int *dirtyp;
+{
+	BTMETA31 *newmeta;
+	BTMETA30 *oldmeta;
+
+	COMPQUIET(dbp, NULL);
+	COMPQUIET(real_name, NULL);
+	COMPQUIET(fhp, NULL);
+
+	newmeta = (BTMETA31 *)h;
+	oldmeta = (BTMETA30 *)h;
+
+	/*
+	 * Copy the affected fields down the page.
+	 * The fields may overlap each other, so we
+	 * start at the bottom and use memmove.
+	 */
+	newmeta->root = oldmeta->root;
+	newmeta->re_pad = oldmeta->re_pad;
+	newmeta->re_len = oldmeta->re_len;
+	newmeta->minkey = oldmeta->minkey;
+	newmeta->maxkey = oldmeta->maxkey;
+	memmove(newmeta->dbmeta.uid,
+	    oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid));
+	newmeta->dbmeta.flags = oldmeta->dbmeta.flags;
+	newmeta->dbmeta.record_count = 0;
+	newmeta->dbmeta.key_count = 0;
+	ZERO_LSN(newmeta->dbmeta.unused3);
+
+	/* Set the version number. */
+	newmeta->dbmeta.version = 8;
+
+	/* Upgrade the flags. */
+	if (LF_ISSET(DB_DUPSORT))
+		F_SET(&newmeta->dbmeta, BTM_DUPSORT);
+
+	*dirtyp = 1;
+	return (0);
+}
+
+/*
+ * __bam_31_lbtree --
+ *	Upgrade the database btree leaf pages. 
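+ *
+ *	(Editor's note, not in the original source: on a P_LBTREE leaf,
+ *	keys sit at even inp[] slots and data items at odd ones, which
+ *	is why the loop below starts at O_INDX and steps by P_INDX --
+ *	it visits only the data items that might be B_DUPLICATE
+ *	references.)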
+ * + * PUBLIC: int __bam_31_lbtree + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BKEYDATA *bk; + db_pgno_t pgno; + db_indx_t indx; + int ret; + + ret = 0; + for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) { + bk = GET_BKEYDATA(h, indx); + if (B_TYPE(bk->type) == B_DUPLICATE) { + pgno = GET_BOVERFLOW(h, indx)->pgno; + if ((ret = __db_31_offdup(dbp, real_name, fhp, + LF_ISSET(DB_DUPSORT) ? 1 : 0, &pgno)) != 0) + break; + if (pgno != GET_BOVERFLOW(h, indx)->pgno) { + *dirtyp = 1; + GET_BOVERFLOW(h, indx)->pgno = pgno; + } + } + } + + return (ret); +} diff --git a/db/btree/bt_verify.c b/db/btree/bt_verify.c new file mode 100644 index 000000000..9f8647e7e --- /dev/null +++ b/db/btree/bt_verify.c @@ -0,0 +1,2237 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. + * + * $Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $ + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_verify.h" +#include "btree.h" + +static int __bam_safe_getdata __P((DB *, PAGE *, u_int32_t, int, DBT *, int *)); +static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + db_indx_t *, u_int32_t)); +static int __bam_vrfy_treeorder __P((DB *, db_pgno_t, PAGE *, BINTERNAL *, + BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), u_int32_t)); +static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + db_indx_t *, u_int32_t)); + +#define OKFLAGS (DB_AGGRESSIVE | DB_NOORDERCHK | DB_SALVAGE) + +/* + * __bam_vrfy_meta -- + * Verify the btree-specific part of a metadata page. + * + * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, + * PUBLIC: db_pgno_t, u_int32_t)); + */ +int +__bam_vrfy_meta(dbp, vdp, meta, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + BTMETA *meta; + db_pgno_t pgno; + u_int32_t flags; +{ + VRFY_PAGEINFO *pip; + int isbad, t_ret, ret; + db_indx_t ovflsize; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + isbad = 0; + + /* + * If VRFY_INCOMPLETE is not set, then we didn't come through + * __db_vrfy_pagezero and didn't incompletely + * check this page--we haven't checked it at all. + * Thus we need to call __db_vrfy_meta and check the common fields. + * + * If VRFY_INCOMPLETE is set, we've already done all the same work + * in __db_vrfy_pagezero, so skip the check. + */ + if (!F_ISSET(pip, VRFY_INCOMPLETE) && + (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* bt_minkey: must be >= 2; must produce sensible ovflsize */ + + /* avoid division by zero */ + ovflsize = meta->minkey > 0 ? + B_MINKEY_TO_OVFLSIZE(meta->minkey, dbp->pgsize) : 0; + + if (meta->minkey < 2 || + ovflsize > B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) { + pip->bt_minkey = 0; + isbad = 1; + EPRINT((dbp->dbenv, + "Nonsensical bt_minkey value %lu on metadata page %lu", + (u_long)meta->minkey, (u_long)pgno)); + } else + pip->bt_minkey = meta->minkey; + + /* bt_maxkey: no constraints (XXX: right?) 
*/ + pip->bt_maxkey = meta->maxkey; + + /* re_len: no constraints on this (may be zero or huge--we make rope) */ + pip->re_len = meta->re_len; + + /* + * The root must not be current page or 0 and it must be within + * database. If this metadata page is the master meta data page + * of the file, then the root page had better be page 1. + */ + pip->root = 0; + if (meta->root == PGNO_INVALID + || meta->root == pgno || !IS_VALID_PGNO(meta->root) || + (pgno == PGNO_BASE_MD && meta->root != 1)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Nonsensical root page %lu on metadata page %lu", + (u_long)meta->root, (u_long)vdp->last_pgno)); + } else + pip->root = meta->root; + + /* Flags. */ + if (F_ISSET(&meta->dbmeta, BTM_RENUMBER)) + F_SET(pip, VRFY_IS_RRECNO); + + if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) { + /* + * If this is a master db meta page, it had better not have + * duplicates. + */ + if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) { + isbad = 1; + EPRINT((dbp->dbenv, + "Btree metadata page %lu has both duplicates and multiple databases", + (u_long)pgno)); + } + F_SET(pip, VRFY_HAS_SUBDBS); + } + + if (F_ISSET(&meta->dbmeta, BTM_DUP)) + F_SET(pip, VRFY_HAS_DUPS); + if (F_ISSET(&meta->dbmeta, BTM_DUPSORT)) + F_SET(pip, VRFY_HAS_DUPSORT); + if (F_ISSET(&meta->dbmeta, BTM_RECNUM)) + F_SET(pip, VRFY_HAS_RECNUMS); + if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Btree metadata page %lu illegally has both recnums and dups", + (u_long)pgno)); + isbad = 1; + } + + if (F_ISSET(&meta->dbmeta, BTM_RECNO)) { + F_SET(pip, VRFY_IS_RECNO); + dbp->type = DB_RECNO; + } else if (F_ISSET(pip, VRFY_IS_RRECNO)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Metadata page %lu has renumber flag set but is not recno", + (u_long)pgno)); + } + + if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Recno metadata page %lu specifies duplicates", + (u_long)pgno)); + isbad = 1; + } + + if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN)) + F_SET(pip, VRFY_IS_FIXEDLEN); + else if (pip->re_len > 0) { + /* + * It's wrong to have an re_len if it's not a fixed-length + * database + */ + isbad = 1; + EPRINT((dbp->dbenv, + "re_len of %lu in non-fixed-length database", + (u_long)pip->re_len)); + } + + /* + * We do not check that the rest of the page is 0, because it may + * not be and may still be correct. + */ + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ram_vrfy_leaf -- + * Verify a recno leaf page. + * + * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__ram_vrfy_leaf(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + BKEYDATA *bk; + VRFY_PAGEINFO *pip; + db_indx_t i; + int ret, t_ret, isbad; + u_int32_t re_len_guess, len; + + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if ((ret = __db_fchk(dbp->dbenv, + "__ram_vrfy_leaf", flags, OKFLAGS)) != 0) + goto err; + + if (TYPE(h) != P_LRECNO) { + /* We should not have been called. */ + TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_leaf", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* + * Verify (and, if relevant, save off) page fields common to + * all PAGEs. 
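+	 *
+	 * (Editor's note, not in the original source: throughout the
+	 * verifier, DB_VERIFY_BAD records damage but lets the scan
+	 * continue via the isbad flag, while any other nonzero return
+	 * is fatal and takes the err path at once.)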
+ */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD; + * further checks are dangerous. + */ + if ((ret = __bam_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) + goto err; + + if (F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Recno database has dups on page %lu", (u_long)pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Walk through inp and see if the lengths of all the records are the + * same--if so, this may be a fixed-length database, and we want to + * save off this value. We know inp to be safe if we've gotten this + * far. + */ + re_len_guess = 0; + for (i = 0; i < NUM_ENT(h); i++) { + bk = GET_BKEYDATA(h, i); + /* KEYEMPTY. Go on. */ + if (B_DISSET(bk->type)) + continue; + if (bk->type == B_OVERFLOW) + len = ((BOVERFLOW *)bk)->tlen; + else if (bk->type == B_KEYDATA) + len = bk->len; + else { + isbad = 1; + EPRINT((dbp->dbenv, + "Nonsensical type for item %lu, page %lu", + (u_long)i, (u_long)pgno)); + continue; + } + if (re_len_guess == 0) + re_len_guess = len; + + /* + * Is this item's len the same as the last one's? If not, + * reset to 0 and break--we don't have a single re_len. + * Otherwise, go on to the next item. + */ + if (re_len_guess != len) { + re_len_guess = 0; + break; + } + } + pip->re_len = re_len_guess; + + /* Save off record count. */ + pip->rec_cnt = NUM_ENT(h); + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0); +} + +/* + * __bam_vrfy -- + * Verify a btree leaf or internal page. + * + * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__bam_vrfy(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + VRFY_PAGEINFO *pip; + int ret, t_ret, isbad; + + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LDUP: + break; + default: + TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* + * Verify (and, if relevant, save off) page fields common to + * all PAGEs. + */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * The record count is, on internal pages, stored in an overloaded + * next_pgno field. Save it off; we'll verify it when we check + * overall database structure. We could overload the field + * in VRFY_PAGEINFO, too, but this seems gross, and space + * is not at such a premium. + */ + pip->rec_cnt = RE_NREC(h); + + /* + * Verify inp[]. + */ + if (TYPE(h) == P_IRECNO) { + if ((ret = __ram_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) + goto err; + } else if ((ret = __bam_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + EPRINT((dbp->dbenv, + "item order check on page %lu unsafe: skipping", + (u_long)pgno)); + } else if (!LF_ISSET(DB_NOORDERCHK) && (ret = + __bam_vrfy_itemorder(dbp, vdp, h, pgno, 0, 0, 0, flags)) != 0) { + /* + * We know that the elements of inp are reasonable. + * + * Check that elements fall in the proper order. 
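+		 *
+		 * (Editor's note, not in the original source: this pass
+		 * runs only when the caller did not set DB_NOORDERCHK,
+		 * and only when __bam_vrfy_inp found inp[] sound --
+		 * ordering checks over a damaged inp[] are the "unsafe"
+		 * case reported above.)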
+ */ + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0); +} + +/* + * __ram_vrfy_inp -- + * Verify that all entries in a P_IRECNO inp[] array are reasonable, + * and count them. Note that P_LRECNO uses __bam_vrfy_inp; + * P_IRECNOs are a special, and simpler, case, since they have + * RINTERNALs rather than BKEYDATA/BINTERNALs. + */ +static int +__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + db_indx_t *nentriesp; + u_int32_t flags; +{ + RINTERNAL *ri; + VRFY_CHILDINFO child; + VRFY_PAGEINFO *pip; + int ret, t_ret, isbad; + u_int32_t himark, i, offset, nentries; + u_int8_t *pagelayout, *p; + + isbad = 0; + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + nentries = 0; + pagelayout = NULL; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if (TYPE(h) != P_IRECNO) { + TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_inp", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + himark = dbp->pgsize; + if ((ret = + __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pagelayout)) != 0) + goto err; + memset(pagelayout, 0, dbp->pgsize); + for (i = 0; i < NUM_ENT(h); i++) { + if ((u_int8_t *)h->inp + i >= (u_int8_t *)h + himark) { + EPRINT((dbp->dbenv, + "Page %lu entries listing %lu overlaps data", + (u_long)pgno, (u_long)i)); + ret = DB_VERIFY_BAD; + goto err; + } + offset = h->inp[i]; + /* + * Check that the item offset is reasonable: it points + * somewhere after the inp array and before the end of the + * page. + */ + if (offset <= (u_int32_t)((u_int8_t *)h->inp + i - + (u_int8_t *)h) || + offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Bad offset %lu at page %lu index %lu", + (u_long)offset, (u_long)pgno, (u_long)i)); + continue; + } + + /* Update the high-water mark (what HOFFSET should be) */ + if (offset < himark) + himark = offset; + + nentries++; + + /* Make sure this RINTERNAL is not multiply referenced. */ + ri = GET_RINTERNAL(h, i); + if (pagelayout[offset] == 0) { + pagelayout[offset] = 1; + child.pgno = ri->pgno; + child.type = V_RECNO; + child.nrecs = ri->nrecs; + if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) + goto err; + } else { + EPRINT((dbp->dbenv, + "RINTERNAL structure at offset %lu, page %lu referenced twice", + (u_long)offset, (u_long)pgno)); + isbad = 1; + } + } + + for (p = pagelayout + himark; + p < pagelayout + dbp->pgsize; + p += RINTERNAL_SIZE) + if (*p != 1) { + EPRINT((dbp->dbenv, + "Gap between items at offset %lu, page %lu", + (u_long)(p - pagelayout), (u_long)pgno)); + isbad = 1; + } + + if ((db_indx_t)himark != HOFFSET(h)) { + EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu", + (u_long)(HOFFSET(h)), (u_long)himark)); + isbad = 1; + } + + *nentriesp = nentries; + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (pagelayout != NULL) + __os_free(pagelayout, dbp->pgsize); + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_inp -- + * Verify that all entries in inp[] array are reasonable; + * count them. 
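+ *
+ *	(Editor's note, not in the original source: the technique below
+ *	is a shadow map -- a pgsize-byte array is zeroed, each item's
+ *	first and last bytes are marked ITEM_BEGIN/ITEM_END as they are
+ *	visited, and a final scan of the map exposes overlaps and gaps.)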
+ */ +static int +__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + db_indx_t *nentriesp; + u_int32_t flags; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + VRFY_CHILDINFO child; + VRFY_PAGEINFO *pip; + int isbad, initem, isdupitem, ret, t_ret; + u_int32_t himark, offset; /* These would be db_indx_ts but for algnmt.*/ + u_int32_t i, endoff, nentries; + u_int8_t *pagelayout; + + isbad = isdupitem = 0; + nentries = 0; + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (TYPE(h)) { + case P_IBTREE: + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + break; + default: + /* + * In the salvager, we might call this from a page which + * we merely suspect is a btree page. Otherwise, it + * shouldn't get called--if it is, that's a verifier bug. + */ + if (LF_ISSET(DB_SALVAGE)) + break; + TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_inp", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* + * Loop through inp[], the array of items, until we either + * run out of entries or collide with the data. Keep track + * of h_offset in himark. + * + * For each element in inp[i], make sure it references a region + * that starts after the end of the inp array (as defined by + * NUM_ENT(h)), ends before the beginning of the page, doesn't + * overlap any other regions, and doesn't have a gap between + * it and the region immediately after it. + */ + himark = dbp->pgsize; + if ((ret = __os_malloc(dbp->dbenv, + dbp->pgsize, NULL, &pagelayout)) != 0) + goto err; + memset(pagelayout, 0, dbp->pgsize); + for (i = 0; i < NUM_ENT(h); i++) { + + ret = __db_vrfy_inpitem(dbp, + h, pgno, i, 1, flags, &himark, &offset); + if (ret == DB_VERIFY_BAD) { + isbad = 1; + continue; + } else if (ret == DB_VERIFY_FATAL) { + isbad = 1; + goto err; + } else if (ret != 0) + DB_ASSERT(0); + + /* + * We now have a plausible beginning for the item, and we know + * its length is safe. + * + * Mark the beginning and end in pagelayout so we can make sure + * items have no overlaps or gaps. + */ + bk = GET_BKEYDATA(h, i); +#define ITEM_BEGIN 1 +#define ITEM_END 2 + if (pagelayout[offset] == 0) + pagelayout[offset] = ITEM_BEGIN; + else if (pagelayout[offset] == ITEM_BEGIN) { + /* + * Having two inp entries that point at the same patch + * of page is legal if and only if the page is + * a btree leaf and they're onpage duplicate keys-- + * that is, if (i % P_INDX) == 0. + */ + if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) { + /* Flag for later. */ + F_SET(pip, VRFY_HAS_DUPS); + + /* Bump up nentries so we don't undercount. */ + nentries++; + + /* + * We'll check to make sure the end is + * equal, too. + */ + isdupitem = 1; + } else { + isbad = 1; + EPRINT((dbp->dbenv, + "Duplicated item %lu on page %lu", + (u_long)i, (u_long)pgno)); + } + } + + /* + * Mark the end. Its location varies with the page type + * and the item type. + * + * If the end already has a sign other than 0, do nothing-- + * it's an overlap that we'll catch later. + */ + switch(B_TYPE(bk->type)) { + case B_KEYDATA: + if (TYPE(h) == P_IBTREE) + /* It's a BINTERNAL. */ + endoff = offset + BINTERNAL_SIZE(bk->len) - 1; + else + endoff = offset + BKEYDATA_SIZE(bk->len) - 1; + break; + case B_DUPLICATE: + /* + * Flag that we have dups; we'll check whether + * that's okay during the structure check. 
+			 */
+			F_SET(pip, VRFY_HAS_DUPS);
+			/* FALLTHROUGH */
+		case B_OVERFLOW:
+			/*
+			 * Overflow entries on internal pages are stored
+			 * as the _data_ of a BINTERNAL; overflow entries
+			 * on leaf pages are stored as the entire entry.
+			 */
+			endoff = offset +
+			    ((TYPE(h) == P_IBTREE) ?
+			    BINTERNAL_SIZE(BOVERFLOW_SIZE) :
+			    BOVERFLOW_SIZE) - 1;
+			break;
+		default:
+			/*
+			 * We'll complain later; for now, just mark
+			 * a minimum.
+			 */
+			endoff = offset + BKEYDATA_SIZE(0) - 1;
+			break;
+		}
+
+		/*
+		 * If this is an onpage duplicate key we've seen before,
+		 * the end had better coincide too.
+		 */
+		if (isdupitem && pagelayout[endoff] != ITEM_END) {
+			EPRINT((dbp->dbenv,
+			    "Duplicated item %lu on page %lu",
+			    (u_long)i, (u_long)pgno));
+			isbad = 1;
+		} else if (pagelayout[endoff] == 0)
+			pagelayout[endoff] = ITEM_END;
+		isdupitem = 0;
+
+		/*
+		 * There should be no deleted items in a quiescent tree,
+		 * except in recno.
+		 */
+		if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) {
+			isbad = 1;
+			EPRINT((dbp->dbenv,
+			    "Item %lu on page %lu marked deleted",
+			    (u_long)i, (u_long)pgno));
+		}
+
+		/*
+		 * Check the type and such of bk--make sure it's reasonable
+		 * for the pagetype.
+		 */
+		switch (B_TYPE(bk->type)) {
+		case B_KEYDATA:
+			/*
+			 * This is a normal, non-overflow BKEYDATA or BINTERNAL.
+			 * The only thing to check is the len, and that's
+			 * already been done.
+			 */
+			break;
+		case B_DUPLICATE:
+			if (TYPE(h) == P_IBTREE) {
+				isbad = 1;
+				EPRINT((dbp->dbenv,
+				    "Duplicate page referenced by internal btree page %lu at item %lu",
+				    (u_long)pgno, (u_long)i));
+				break;
+			} else if (TYPE(h) == P_LRECNO) {
+				isbad = 1;
+				EPRINT((dbp->dbenv,
+				    "Duplicate page referenced by recno page %lu at item %lu",
+				    (u_long)pgno, (u_long)i));
+				break;
+			}
+			/* FALLTHROUGH */
+		case B_OVERFLOW:
+			bo = (TYPE(h) == P_IBTREE) ?
+			    (BOVERFLOW *)(((BINTERNAL *)bk)->data) :
+			    (BOVERFLOW *)bk;
+
+			if (B_TYPE(bk->type) == B_OVERFLOW)
+				/* Make sure tlen is reasonable. */
+				if (bo->tlen > dbp->pgsize * vdp->last_pgno) {
+					isbad = 1;
+					EPRINT((dbp->dbenv,
+					    "Impossible tlen %lu, item %lu, page %lu",
+					    (u_long)bo->tlen, (u_long)i,
+					    (u_long)pgno));
+					/* Don't save as a child. */
+					break;
+				}
+
+			if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno ||
+			    bo->pgno == PGNO_INVALID) {
+				isbad = 1;
+				EPRINT((dbp->dbenv,
+				    "Offpage item %lu, page %lu has bad pgno",
+				    (u_long)i, (u_long)pgno));
+				/* Don't save as a child. */
+				break;
+			}
+
+			child.pgno = bo->pgno;
+			child.type = (B_TYPE(bk->type) == B_OVERFLOW ?
+			    V_OVERFLOW : V_DUPLICATE);
+			child.tlen = bo->tlen;
+			if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0)
+				goto err;
+			break;
+		default:
+			isbad = 1;
+			EPRINT((dbp->dbenv,
+			    "Item %lu on page %lu of invalid type %lu",
+			    (u_long)i, (u_long)pgno,
+			    (u_long)B_TYPE(bk->type)));
+			break;
+		}
+	}
+
+	/*
+	 * Now, loop through and make sure the items are contiguous and
+	 * non-overlapping.
+	 */
+	initem = 0;
+	for (i = himark; i < dbp->pgsize; i++)
+		if (initem == 0)
+			switch (pagelayout[i]) {
+			case 0:
+				/* May be just for alignment. */
+				if (i != ALIGN(i, sizeof(u_int32_t)))
+					continue;
+
+				isbad = 1;
+				EPRINT((dbp->dbenv,
+				    "Gap between items, page %lu offset %lu",
+				    (u_long)pgno, (u_long)i));
+				/* Find the end of the gap. */
+				for ( ; pagelayout[i + 1] == 0 &&
+				    (size_t)(i + 1) < dbp->pgsize; i++)
+					;
+				break;
+			case ITEM_BEGIN:
+				/* We've found an item. Check its alignment.
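+				 * [Editor's note: assuming the usual
+				 * round-up ALIGN macro, ALIGN(i,
+				 * sizeof(u_int32_t)) rounds i up to the next
+				 * u_int32_t boundary (4 bytes on common
+				 * platforms), so i == 13 fails the test
+				 * below, ALIGN giving 16, while i == 16
+				 * passes.]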
*/ + if (i != ALIGN(i, sizeof(u_int32_t))) { + isbad = 1; + EPRINT((dbp->dbenv, + "Offset %lu page %lu unaligned", + (u_long)i, (u_long)pgno)); + } + initem = 1; + nentries++; + break; + case ITEM_END: + /* + * We've hit the end of an item even though + * we don't think we're in one; must + * be an overlap. + */ + isbad = 1; + EPRINT((dbp->dbenv, + "Overlapping items, page %lu offset %lu", + (u_long)pgno, (u_long)i)); + break; + default: + /* Should be impossible. */ + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + else + switch (pagelayout[i]) { + case 0: + /* In the middle of an item somewhere. Okay. */ + break; + case ITEM_END: + /* End of an item; switch to out-of-item mode.*/ + initem = 0; + break; + case ITEM_BEGIN: + /* + * Hit a second item beginning without an + * end. Overlap. + */ + isbad = 1; + EPRINT((dbp->dbenv, + "Overlapping items, page %lu offset %lu", + (u_long)pgno, (u_long)i)); + break; + } + + (void)__os_free(pagelayout, dbp->pgsize); + + /* Verify HOFFSET. */ + if ((db_indx_t)himark != HOFFSET(h)) { + EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu", + (u_long)HOFFSET(h), (u_long)himark)); + isbad = 1; + } + +err: if (nentriesp != NULL) + *nentriesp = nentries; + + if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + + return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_itemorder -- + * Make sure the items on a page sort correctly. + * + * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are + * reasonable; be sure that __bam_vrfy_inp has been called first. + * + * If ovflok is set, it also assumes that overflow page chains + * hanging off the current page have been sanity-checked, and so we + * can use __bam_cmp to verify their ordering. If it is not set, + * and we run into an overflow page, carp and return DB_VERIFY_BAD; + * we shouldn't be called if any exist. + * + * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, PAGE *, + * PUBLIC: db_pgno_t, u_int32_t, int, int, u_int32_t)); + */ +int +__bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t nentries; + int ovflok, hasdups; + u_int32_t flags; +{ + DBT dbta, dbtb, dup1, dup2, *p1, *p2, *tmp; + BTREE *bt; + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + VRFY_PAGEINFO *pip; + db_indx_t i; + int cmp, freedup1, freedup2, isbad, ret, t_ret; + int (*dupfunc) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *)); + void *buf1, *buf2, *tmpbuf; + + /* + * We need to work in the ORDERCHKONLY environment where we might + * not have a pip, but we also may need to work in contexts where + * NUM_ENT isn't safe. + */ + if (vdp != NULL) { + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + nentries = pip->entries; + } else + pip = NULL; + + ret = isbad = 0; + bo = NULL; /* Shut up compiler. */ + + memset(&dbta, 0, sizeof(DBT)); + F_SET(&dbta, DB_DBT_REALLOC); + + memset(&dbtb, 0, sizeof(DBT)); + F_SET(&dbtb, DB_DBT_REALLOC); + + buf1 = buf2 = NULL; + + DB_ASSERT(!LF_ISSET(DB_NOORDERCHK)); + + dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; + if (TYPE(h) == P_LDUP) + func = dupfunc; + else { + func = __bam_defcmp; + if (dbp->bt_internal != NULL) { + bt = (BTREE *)dbp->bt_internal; + if (bt->bt_compare != NULL) + func = bt->bt_compare; + } + } + + /* + * We alternate our use of dbta and dbtb so that we can walk + * through the page key-by-key without copying a dbt twice. 
+ * p1 is always the dbt for index i - 1, and p2 for index i. + */ + p1 = &dbta; + p2 = &dbtb; + + /* + * Loop through the entries. nentries ought to contain the + * actual count, and so is a safe way to terminate the loop; whether + * we inc. by one or two depends on whether we're a leaf page-- + * on a leaf page, we care only about keys. On internal pages + * and LDUP pages, we want to check the order of all entries. + * + * Note that on IBTREE pages, we start with item 1, since item + * 0 doesn't get looked at by __bam_cmp. + */ + for (i = (TYPE(h) == P_IBTREE) ? 1 : 0; i < nentries; + i += (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX) { + /* + * Put key i-1, now in p2, into p1, by swapping DBTs and bufs. + */ + tmp = p1; + p1 = p2; + p2 = tmp; + tmpbuf = buf1; + buf1 = buf2; + buf2 = tmpbuf; + + /* + * Get key i into p2. + */ + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(h, i); + if (B_TYPE(bi->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)(bi->data); + goto overflow; + } else { + p2->data = bi->data; + p2->size = bi->len; + } + + /* + * The leftmost key on an internal page must be + * len 0, since it's just a placeholder and + * automatically sorts less than all keys. + * + * XXX + * This criterion does not currently hold! + * See todo list item #1686. Meanwhile, it's harmless + * to just not check for it. + */ +#if 0 + if (i == 0 && bi->len != 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Lowest key on internal page %lu of nonzero length", + (u_long)pgno)); + } +#endif + break; + case P_LBTREE: + case P_LDUP: + bk = GET_BKEYDATA(h, i); + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + goto overflow; + } else { + p2->data = bk->data; + p2->size = bk->len; + } + break; + default: + /* + * This means our caller screwed up and sent us + * an inappropriate page. + */ + TYPE_ERR_PRINT(dbp->dbenv, + "__bam_vrfy_itemorder", pgno, TYPE(h)) + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + if (0) { + /* + * If ovflok != 1, we can't safely go chasing + * overflow pages with the normal routines now; + * they might be unsafe or nonexistent. Mark this + * page as incomplete and return. + * + * Note that we don't need to worry about freeing + * buffers, since they can't have been allocated + * if overflow items are unsafe. + */ +overflow: if (!ovflok) { + F_SET(pip, VRFY_INCOMPLETE); + goto err; + } + + /* + * Overflow items are safe to chase. Do so. + * Fetch the overflow item into p2->data, + * NULLing it or reallocing it as appropriate. + * + * (We set p2->data to buf2 before the call + * so we're sure to realloc if we can and if p2 + * was just pointing at a non-overflow item.) + */ + p2->data = buf2; + if ((ret = __db_goff(dbp, + p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Error %lu in fetching overflow item %lu, page %lu", + (u_long)ret, (u_long)i, (u_long)pgno)); + } + /* In case it got realloc'ed and thus changed. */ + buf2 = p2->data; + } + + /* Compare with the last key. */ + if (p1->data != NULL && p2->data != NULL) { + cmp = func(dbp, p1, p2); + + /* comparison succeeded */ + if (cmp > 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Out-of-order key, page %lu item %lu", + (u_long)pgno, (u_long)i)); + /* proceed */ + } else if (cmp == 0) { + /* + * If they compared equally, this + * had better be a (sub)database with dups. + * Mark it so we can check during the + * structure check. 
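+				 *
+				 * [Editor's note: func follows memcmp-style
+				 * sign conventions, so with lexicographic
+				 * ordering a key pair like ("b", "a") yields
+				 * cmp > 0 and the complaint above, while
+				 * ("a", "a") yields cmp == 0 and lands here.]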
+ */ + if (pip != NULL) + F_SET(pip, VRFY_HAS_DUPS); + else if (hasdups == 0) { + isbad = 1; + EPRINT((dbp->dbenv, + "Database with no duplicates has duplicated keys on page %lu", + (u_long)pgno)); + } + + /* + * If we're a btree leaf, check to see + * if the data items of these on-page dups are + * in sorted order. If not, flag this, so + * that we can make sure during the + * structure checks that the DUPSORT flag + * is unset. + * + * At this point i points to a duplicate key. + * Compare the datum before it (same key) + * to the datum after it, i.e. i-1 to i+1. + */ + if (TYPE(h) == P_LBTREE) { + /* + * Unsafe; continue and we'll pick + * up the bogus nentries later. + */ + if (i + 1 >= (db_indx_t)nentries) + continue; + + /* + * We don't bother with clever memory + * management with on-page dups, + * as it's only really a big win + * in the overflow case, and overflow + * dups are probably (?) rare. + */ + if (((ret = __bam_safe_getdata(dbp, + h, i - 1, ovflok, &dup1, + &freedup1)) != 0) || + ((ret = __bam_safe_getdata(dbp, + h, i + 1, ovflok, &dup2, + &freedup2)) != 0)) + goto err; + + /* + * If either of the data are NULL, + * it's because they're overflows and + * it's not safe to chase them now. + * Mark an incomplete and return. + */ + if (dup1.data == NULL || + dup2.data == NULL) { + DB_ASSERT(!ovflok); + F_SET(pip, VRFY_INCOMPLETE); + goto err; + } + + /* + * If the dups are out of order, + * flag this. It's not an error + * until we do the structure check + * and see whether DUPSORT is set. + */ + if (dupfunc(dbp, &dup1, &dup2) > 0) + F_SET(pip, VRFY_DUPS_UNSORTED); + + if (freedup1) + __os_free(dup1.data, 0); + if (freedup2) + __os_free(dup2.data, 0); + } + } + } + } + +err: if (pip != NULL && + ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) && ret == 0) + ret = t_ret; + + if (buf1 != NULL) + __os_free(buf1, 0); + if (buf2 != NULL) + __os_free(buf2, 0); + + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_structure -- + * Verify the tree structure of a btree database (including the master + * database containing subdbs). 
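+ *
+ * [Editor's sketch -- not part of the original source.  The "observed
+ * twice" test below is exactly-once accounting over a per-page visit
+ * counter; mark_once and its arguments are hypothetical names.]
+ */
+#if 0	/* Illustration only; never compiled. */
+static int
+mark_once(counts, pgno)
+	int *counts;
+	db_pgno_t pgno;
+{
+	/* A second reference to the same page means corruption. */
+	if (counts[pgno] != 0)
+		return (DB_VERIFY_BAD);
+	counts[pgno]++;
+	return (0);
+}
+#endif
+
+/*
+ * __bam_vrfy_structure (see above) --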
+ *
+ * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t,
+ * PUBLIC:     u_int32_t));
+ */
+int
+__bam_vrfy_structure(dbp, vdp, meta_pgno, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t meta_pgno;
+	u_int32_t flags;
+{
+	DB *pgset;
+	VRFY_PAGEINFO *mip, *rip;
+	db_pgno_t root, p;
+	int t_ret, ret;
+	u_int32_t nrecs, level, relen, stflags;
+
+	mip = rip = NULL;
+	pgset = vdp->pgset;
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0)
+		return (ret);
+
+	if ((ret = __db_vrfy_pgset_get(pgset, meta_pgno, (int *)&p)) != 0)
+		goto err;
+	if (p != 0) {
+		EPRINT((dbp->dbenv,
+		    "Btree metadata page number %lu observed twice",
+		    (u_long)meta_pgno));
+		ret = DB_VERIFY_BAD;
+		goto err;
+	}
+	if ((ret = __db_vrfy_pgset_inc(pgset, meta_pgno)) != 0)
+		goto err;
+
+	root = mip->root;
+
+	if (root == 0) {
+		EPRINT((dbp->dbenv,
+		    "Btree metadata page %lu has no root", (u_long)meta_pgno));
+		ret = DB_VERIFY_BAD;
+		goto err;
+	}
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0)
+		goto err;
+
+	switch (rip->type) {
+	case P_IBTREE:
+	case P_LBTREE:
+		stflags = flags | ST_TOPLEVEL;
+		if (F_ISSET(mip, VRFY_HAS_DUPS))
+			stflags |= ST_DUPOK;
+		if (F_ISSET(mip, VRFY_HAS_DUPSORT))
+			stflags |= ST_DUPSORT;
+		if (F_ISSET(mip, VRFY_HAS_RECNUMS))
+			stflags |= ST_RECNUM;
+		ret = __bam_vrfy_subtree(dbp,
+		    vdp, root, NULL, NULL, stflags, NULL, NULL, NULL);
+		break;
+	case P_IRECNO:
+	case P_LRECNO:
+		stflags = flags | ST_RECNUM | ST_IS_RECNO | ST_TOPLEVEL;
+		if (mip->re_len > 0)
+			stflags |= ST_RELEN;
+		if ((ret = __bam_vrfy_subtree(dbp, vdp,
+		    root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0)
+			goto err;
+		/*
+		 * Even if mip->re_len > 0, re_len may come back zero if the
+		 * tree is empty.  It should be okay to just skip the check in
+		 * this case, as if there are any non-deleted keys at all,
+		 * that should never happen.
+		 */
+		if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) {
+			EPRINT((dbp->dbenv,
+			    "Recno database with meta page %lu has bad re_len %lu",
+			    (u_long)meta_pgno, (u_long)relen));
+			ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		ret = 0;
+		break;
+	case P_LDUP:
+		EPRINT((dbp->dbenv,
+		    "Duplicate tree referenced from metadata page %lu",
+		    (u_long)meta_pgno));
+		ret = DB_VERIFY_BAD;
+		break;
+	default:
+		EPRINT((dbp->dbenv,
+		    "Btree root of incorrect type %lu on meta page %lu",
+		    (u_long)rip->type, (u_long)meta_pgno));
+		ret = DB_VERIFY_BAD;
+		break;
+	}
+
+err:	if (mip != NULL &&
+	    ((t_ret = __db_vrfy_putpageinfo(vdp, mip)) != 0) && ret == 0)
+		ret = t_ret;
+	if (rip != NULL &&
+	    ((t_ret = __db_vrfy_putpageinfo(vdp, rip)) != 0) && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __bam_vrfy_subtree --
+ *	Verify a subtree (or an entire btree) with the specified root.
+ *
+ *	Note that this is public because it must be called to verify
+ * offpage dup trees, including from hash.
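+ *
+ * [Editor's sketch -- not part of the original source.  Cases 4 and 5
+ * below enforce two recursion invariants, shown here in isolation;
+ * check_internal and its arguments are hypothetical names.]
+ */
+#if 0	/* Illustration only; never compiled. */
+static int
+check_internal(level, nrecs, child_level, child_nrecs, nchildren)
+	u_int32_t level, nrecs, *child_level;
+	db_recno_t *child_nrecs;
+	int nchildren;
+{
+	db_recno_t sum;
+	int i;
+
+	for (sum = 0, i = 0; i < nchildren; i++) {
+		/* Every child sits exactly one level below its parent. */
+		if (child_level[i] != level - 1)
+			return (DB_VERIFY_BAD);
+		sum += child_nrecs[i];
+	}
+	/* A parent's record count is the sum over its children. */
+	return (sum == nrecs ? 0 : DB_VERIFY_BAD);
+}
+#endif
+
+/*
+ * __bam_vrfy_subtree (see above) --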
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC:     void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp,
+    vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	void *l, *r;
+	u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+	BINTERNAL *li, *ri, *lp, *rp;
+	DB *pgset;
+	DBC *cc;
+	PAGE *h;
+	VRFY_CHILDINFO *child;
+	VRFY_PAGEINFO *pip;
+	db_recno_t nrecs, child_nrecs;
+	db_indx_t i;
+	int ret, t_ret, isbad, toplevel, p;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+	u_int32_t level, child_level, stflags, child_relen, relen;
+
+	ret = isbad = 0;
+	nrecs = 0;
+	h = NULL;
+	relen = 0;
+	rp = (BINTERNAL *)r;
+	lp = (BINTERNAL *)l;
+
+	/* Provide feedback on our progress to the application. */
+	if (!LF_ISSET(DB_SALVAGE))
+		__db_vrfy_struct_feedback(dbp, vdp);
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	cc = NULL;
+	level = pip->bt_level;
+
+	toplevel = LF_ISSET(ST_TOPLEVEL);
+	LF_CLR(ST_TOPLEVEL);
+
+	/*
+	 * We are recursively descending a btree, starting from the root
+	 * and working our way out to the leaves.
+	 *
+	 * There are five cases we need to deal with:
+	 *	1. pgno is a recno leaf page.  Any children are overflows.
+	 *	2. pgno is a duplicate leaf page.  Any children
+	 *	   are overflow pages;  traverse them, and then return
+	 *	   level and nrecs.
+	 *	3. pgno is an ordinary leaf page.  Check whether dups are
+	 *	   allowed, and if so, traverse any off-page dups or
+	 *	   overflows.  Then return nrecs and level.
+	 *	4. pgno is a recno internal page.  Recursively check any
+	 *	   child pages, making sure their levels are one lower
+	 *	   and their nrecs sum to ours.
+	 *	5. pgno is a btree internal page.  Same as #4, plus we
+	 *	   must verify that for each pair of BINTERNAL entries
+	 *	   N and N+1, the leftmost item on N's child sorts
+	 *	   greater than N, and the rightmost item on N's child
+	 *	   sorts less than N+1.
+	 *
+	 * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+	 * we need to verify the internal sort order is correct if,
+	 * due to overflow items, we were not able to do so earlier.
+	 */
+	switch (pip->type) {
+	case P_LRECNO:
+	case P_LDUP:
+	case P_LBTREE:
+		/*
+		 * Cases 1, 2 and 3 (overflow pages are common to all three);
+		 * traverse child list, looking for overflows.
+		 */
+		if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0)
+			goto err;
+		for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0;
+		    ret = __db_vrfy_ccnext(cc, &child))
+			if (child->type == V_OVERFLOW &&
+			    (ret = __db_vrfy_ovfl_structure(dbp, vdp,
+			    child->pgno, child->tlen,
+			    flags | ST_OVFL_LEAF)) != 0) {
+				if (ret == DB_VERIFY_BAD)
+					isbad = 1;
+				else
+					goto done;
+			}
+
+		if ((ret = __db_vrfy_ccclose(cc)) != 0)
+			goto err;
+		cc = NULL;
+
+		/* Case 1 */
+		if (pip->type == P_LRECNO) {
+			if (!LF_ISSET(ST_IS_RECNO) &&
+			    !(LF_ISSET(ST_DUPOK) && !LF_ISSET(ST_DUPSORT))) {
+				isbad = 1;
+				EPRINT((dbp->dbenv,
+				    "Recno leaf page %lu in non-recno tree",
+				    (u_long)pgno));
+				goto done;
+			}
+			goto leaf;
+		} else if (LF_ISSET(ST_IS_RECNO)) {
+			/*
+			 * It's a non-recno leaf.  Had better not be a recno
+			 * subtree.
+			 */
+			isbad = 1;
+			EPRINT((dbp->dbenv,
+			    "Non-recno leaf page %lu in recno tree",
+			    (u_long)pgno));
+			goto done;
+		}
+
+		/* Case 2--no more work. */
+		if (pip->type == P_LDUP)
+			goto leaf;
+
+		/* Case 3 */
+
+		/* Check if we have any dups. */
+		if (F_ISSET(pip, VRFY_HAS_DUPS)) {
+			/* If dups aren't allowed in this btree, trouble.
*/ + if (!LF_ISSET(ST_DUPOK)) { + isbad = 1; + EPRINT((dbp->dbenv, + "Duplicates on page %lu in non-dup btree", + (u_long)pgno)); + } else { + /* + * We correctly have dups. If any are off-page, + * traverse those btrees recursively. + */ + if ((ret = + __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); + ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) { + stflags = flags | ST_RECNUM | ST_DUPSET; + /* Skip any overflow entries. */ + if (child->type == V_DUPLICATE) { + if ((ret = __db_vrfy_duptype( + dbp, vdp, child->pgno, + stflags)) != 0) { + isbad = 1; + /* Next child. */ + continue; + } + if ((ret = __bam_vrfy_subtree( + dbp, vdp, child->pgno, NULL, + NULL, stflags, NULL, NULL, + NULL)) != 0) { + if (ret != + DB_VERIFY_BAD) + goto err; + else + isbad = 1; + } + } + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* + * If VRFY_DUPS_UNSORTED is set, + * ST_DUPSORT had better not be. + */ + if (F_ISSET(pip, VRFY_DUPS_UNSORTED) && + LF_ISSET(ST_DUPSORT)) { + EPRINT((dbp->dbenv, + "Unsorted duplicate set at page %lu in sorted-dup database", + (u_long)pgno)); + isbad = 1; + } + } + } + goto leaf; + break; + case P_IBTREE: + case P_IRECNO: + /* We handle these below. */ + break; + default: + /* + * If a P_IBTREE or P_IRECNO contains a reference to an + * invalid page, we'll wind up here; handle it gracefully. + * Note that the code at the "done" label assumes that the + * current page is a btree/recno one of some sort; this + * is not the case here, so we goto err. + */ + EPRINT((dbp->dbenv, + "Page %lu is of inappropriate type %lu", + (u_long)pgno, (u_long)pip->type)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Cases 4 & 5: This is a btree or recno internal page. For each child, + * recurse, keeping a running count of nrecs and making sure the level + * is always reasonable. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) + if (child->type == V_RECNO) { + if (pip->type != P_IRECNO) { + TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_subtree", + pgno, pip->type); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno, + NULL, NULL, flags, &child_level, &child_nrecs, + &child_relen)) != 0) { + if (ret != DB_VERIFY_BAD) + goto done; + else + isbad = 1; + } + + if (LF_ISSET(ST_RELEN)) { + if (relen == 0) + relen = child_relen; + /* + * child_relen may be zero if the child subtree + * is empty. + */ + else if (child_relen > 0 && + relen != child_relen) { + isbad = 1; + EPRINT((dbp->dbenv, + "Recno page %lu returned bad re_len", + (u_long)child->pgno)); + } + if (relenp) + *relenp = relen; + } + if (LF_ISSET(ST_RECNUM)) + nrecs += child_nrecs; + if (level != child_level + 1) { + isbad = 1; + EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu", + "Recno level incorrect on page ", + (u_long)child->pgno, ": got ", + (u_long)child_level, ", expected ", + (u_long)(level - 1))); + } + } else if (child->type == V_OVERFLOW && + (ret = __db_vrfy_ovfl_structure(dbp, vdp, + child->pgno, child->tlen, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* We're done with case 4. */ + if (pip->type == P_IRECNO) + goto done; + + /* + * Case 5. Btree internal pages. 
+ * As described above, we need to iterate through all the + * items on the page and make sure that our children sort appropriately + * with respect to them. + * + * For each entry, li will be the "left-hand" key for the entry + * itself, which must sort lower than all entries on its child; + * ri will be the key to its right, which must sort greater. + */ + if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + for (i = 0; i < pip->entries; i += O_INDX) { + li = GET_BINTERNAL(h, i); + ri = (i + O_INDX < pip->entries) ? + GET_BINTERNAL(h, i + O_INDX) : NULL; + + /* + * The leftmost key is forcibly sorted less than all entries, + * so don't bother passing it. + */ + if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno, + i == 0 ? NULL : li, ri, flags, &child_level, + &child_nrecs, NULL)) != 0) { + if (ret != DB_VERIFY_BAD) + goto done; + else + isbad = 1; + } + + if (LF_ISSET(ST_RECNUM)) { + /* + * Keep a running tally on the actual record count so + * we can return it to our parent (if we have one) or + * compare it to the NRECS field if we're a root page. + */ + nrecs += child_nrecs; + + /* + * Make sure the actual record count of the child + * is equal to the value in the BINTERNAL structure. + */ + if (li->nrecs != child_nrecs) { + isbad = 1; + EPRINT((dbp->dbenv, + "Item %lu page %lu has incorrect record count of %lu, should be %lu", + (u_long)i, (u_long)pgno, (u_long)li->nrecs, + (u_long)child_nrecs)); + } + } + + if (level != child_level + 1) { + isbad = 1; + EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu", + "Btree level incorrect on page ", (u_long)li->pgno, + ": got ", (u_long)child_level, ", expected ", + (u_long)(level - 1))); + } + } + + if (0) { +leaf: level = LEAFLEVEL; + if (LF_ISSET(ST_RECNUM)) + nrecs = pip->rec_cnt; + + /* XXX + * We should verify that the record count on a leaf page + * is the sum of the number of keys and the number of + * records in its off-page dups. This requires looking + * at the page again, however, and it may all be changing + * soon, so for now we don't bother. + */ + + if (LF_ISSET(ST_RELEN) && relenp) + *relenp = pip->re_len; + } +done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { + /* + * During the page-by-page pass, item order verification was + * not finished due to the presence of overflow items. If + * isbad == 0, though, it's now safe to do so, as we've + * traversed any child overflow pages. Do it. + */ + if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + if ((ret = __bam_vrfy_itemorder(dbp, + vdp, h, pgno, 0, 1, 0, flags)) != 0) + goto err; + F_CLR(pip, VRFY_INCOMPLETE); + } + + /* + * Our parent has sent us BINTERNAL pointers to parent records + * so that we can verify our place with respect to them. If it's + * appropriate--we have a default sort function--verify this. + */ + if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && lp != NULL) { + if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + + /* + * __bam_vrfy_treeorder needs to know what comparison function + * to use. If ST_DUPSET is set, we're in a duplicate tree + * and we use the duplicate comparison function; otherwise, + * use the btree one. If unset, use the default, of course. + */ + func = LF_ISSET(ST_DUPSET) ? 
dbp->dup_compare :
+		    ((BTREE *)dbp->bt_internal)->bt_compare;
+		if (func == NULL)
+			func = __bam_defcmp;
+
+		if ((ret = __bam_vrfy_treeorder(
+		    dbp, pgno, h, lp, rp, func, flags)) != 0) {
+			if (ret == DB_VERIFY_BAD)
+				isbad = 1;
+			else
+				goto err;
+		}
+	}
+
+	/*
+	 * This is guaranteed to succeed for leaf pages, but no harm done.
+	 *
+	 * Internal pages below the top level do not store their own
+	 * record numbers, so we skip them.
+	 */
+	if (LF_ISSET(ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) {
+		isbad = 1;
+		EPRINT((dbp->dbenv,
+		    "Bad record count on page %lu: got %lu, expected %lu",
+		    (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt));
+	}
+
+	if (levelp)
+		*levelp = level;
+	if (nrecsp)
+		*nrecsp = nrecs;
+
+	pgset = vdp->pgset;
+	if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0)
+		goto err;
+	if (p != 0) {
+		isbad = 1;
+		EPRINT((dbp->dbenv, "Page %lu linked twice", (u_long)pgno));
+	} else if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0)
+		goto err;
+
+err:	if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0)
+		ret = t_ret;
+	if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0)
+		ret = t_ret;
+	if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0)
+		ret = t_ret;
+	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_treeorder --
+ *	Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ *	If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys.  No need to check it.
+ *
+ *	If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags)
+	DB *dbp;
+	db_pgno_t pgno;
+	PAGE *h;
+	BINTERNAL *lp, *rp;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+	u_int32_t flags;
+{
+	BOVERFLOW *bo;
+	DBT dbt;
+	db_indx_t last;
+	int ret, cmp;
+
+	memset(&dbt, 0, sizeof(DBT));
+	F_SET(&dbt, DB_DBT_MALLOC);
+	ret = 0;
+
+	switch (TYPE(h)) {
+	case P_IBTREE:
+	case P_LDUP:
+		last = NUM_ENT(h) - O_INDX;
+		break;
+	case P_LBTREE:
+		last = NUM_ENT(h) - P_INDX;
+		break;
+	default:
+		TYPE_ERR_PRINT(dbp->dbenv,
+		    "__bam_vrfy_treeorder", pgno, TYPE(h));
+		DB_ASSERT(0);
+		return (EINVAL);
+	}
+
+	/*
+	 * The key on page h, the child page, is more likely to be
+	 * an overflow page, so we pass its offset, rather than lp/rp's,
+	 * into __bam_cmp.  This will take advantage of __db_moff.
+	 */
+
+	/*
+	 * Skip first-item check if we're an internal page--the first
+	 * entry on an internal page is treated specially by __bam_cmp,
+	 * so what's on the page shouldn't matter.  (Plus, since we're
+	 * passing our page and item 0 to __bam_cmp, we'll sort before our
+	 * parent and falsely report a failure.)
+	 */
+	if (lp != NULL && TYPE(h) != P_IBTREE) {
+		if (lp->type == B_KEYDATA) {
+			dbt.data = lp->data;
+			dbt.size = lp->len;
+		} else if (lp->type == B_OVERFLOW) {
+			bo = (BOVERFLOW *)lp->data;
+			if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno,
+			    NULL, NULL)) != 0)
+				return (ret);
+		} else {
+			DB_ASSERT(0);
+			EPRINT((dbp->dbenv,
+			    "Unknown type for internal record"));
+			return (EINVAL);
+		}
+
+		/* On error, fall through, free if needed, and return.
*/
+		if ((ret = __bam_cmp(dbp, &dbt, h, 0, func, &cmp)) == 0) {
+			if (cmp > 0) {
+				EPRINT((dbp->dbenv,
+				    "First item on page %lu sorted greater than parent entry",
+				    (u_long)PGNO(h)));
+				ret = DB_VERIFY_BAD;
+			}
+		} else
+			EPRINT((dbp->dbenv,
+			    "First item on page %lu had comparison error",
+			    (u_long)PGNO(h)));
+
+		if (dbt.data != lp->data)
+			__os_free(dbt.data, 0);
+		if (ret != 0)
+			return (ret);
+	}
+
+	if (rp != NULL) {
+		if (rp->type == B_KEYDATA) {
+			dbt.data = rp->data;
+			dbt.size = rp->len;
+		} else if (rp->type == B_OVERFLOW) {
+			bo = (BOVERFLOW *)rp->data;
+			if ((ret = __db_goff(dbp, &dbt, bo->tlen, bo->pgno,
+			    NULL, NULL)) != 0)
+				return (ret);
+		} else {
+			DB_ASSERT(0);
+			EPRINT((dbp->dbenv,
+			    "Unknown type for internal record"));
+			return (EINVAL);
+		}
+
+		/* On error, fall through, free if needed, and return. */
+		if ((ret = __bam_cmp(dbp, &dbt, h, last, func, &cmp)) == 0) {
+			if (cmp < 0) {
+				EPRINT((dbp->dbenv,
+				    "Last item on page %lu sorted greater than parent entry",
+				    (u_long)PGNO(h)));
+				ret = DB_VERIFY_BAD;
+			}
+		} else
+			EPRINT((dbp->dbenv,
+			    "Last item on page %lu had comparison error",
+			    (u_long)PGNO(h)));
+
+		if (dbt.data != rp->data)
+			__os_free(dbt.data, 0);
+	}
+
+	return (ret);
+}
+
+/*
+ * __bam_salvage --
+ *	Safely dump out anything that looks like a key on an alleged
+ * btree leaf page.
+ *
+ * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t,
+ * PUBLIC:     PAGE *, void *, int (*)(void *, const void *), DBT *,
+ * PUBLIC:     u_int32_t));
+ */
+int
+__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	u_int32_t pgtype;
+	PAGE *h;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	DBT *key;
+	u_int32_t flags;
+{
+	DBT dbt, unkdbt;
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+	db_indx_t i, beg, end;
+	u_int32_t himark;
+	u_int8_t *pgmap;
+	void *ovflbuf;
+	int t_ret, ret, err_ret;
+
+	/* Shut up lint. */
+	COMPQUIET(end, 0);
+
+	ovflbuf = pgmap = NULL;
+	err_ret = ret = 0;
+
+	memset(&dbt, 0, sizeof(DBT));
+	dbt.flags = DB_DBT_REALLOC;
+
+	memset(&unkdbt, 0, sizeof(DBT));
+	unkdbt.size = strlen("UNKNOWN") + 1;
+	unkdbt.data = "UNKNOWN";
+
+	/*
+	 * Allocate a buffer for overflow items.  Start at one page;
+	 * __db_safe_goff will realloc as needed.
+	 */
+	if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &ovflbuf)) != 0)
+		return (ret);
+
+	if (LF_ISSET(DB_AGGRESSIVE)) {
+		if ((ret =
+		    __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pgmap)) != 0)
+			goto err;
+		memset(pgmap, 0, dbp->pgsize);
+	}
+
+	/*
+	 * Loop through the inp array, spitting out key/data pairs.
+	 *
+	 * If we're salvaging normally, loop from 0 through NUM_ENT(h).
+	 * If we're being aggressive, loop until we hit the end of the page--
+	 * NUM_ENT() may be bogus.
+	 */
+	himark = dbp->pgsize;
+	for (i = 0;; i += O_INDX) {
+		/* If we're not aggressive, break when we hit NUM_ENT(h). */
+		if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h))
+			break;
+
+		/* Verify the current item. */
+		ret = __db_vrfy_inpitem(dbp,
+		    h, pgno, i, 1, flags, &himark, NULL);
+		/* If this returned a fatality, it's time to break. */
+		if (ret == DB_VERIFY_FATAL) {
+			/*
+			 * Don't return DB_VERIFY_FATAL;  it's private
+			 * and means only that we can't go on with this
+			 * page, not with the whole database.  It's
+			 * not even an error if we've run into it
+			 * after NUM_ENT(h).
+			 */
+			ret = (i < NUM_ENT(h)) ? DB_VERIFY_BAD : 0;
+			break;
+		}
+
+		/*
+		 * If this returned 0, it's safe to print or (carefully)
+		 * try to fetch.
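+		 *
+		 * [Editor's note: "print" means handing DBTs to the caller's
+		 * callback through __db_prdbt; on a dup page the enclosing
+		 * key is re-emitted before each datum, except for entry 0
+		 * when SA_SKIPFIRSTKEY is set, as the code below shows.]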
+ */ + if (ret == 0) { + /* + * We only want to print deleted items if + * DB_AGGRESSIVE is set. + */ + bk = GET_BKEYDATA(h, i); + if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type)) + continue; + + /* + * We're going to go try to print the next item. If + * key is non-NULL, we're a dup page, so we've got to + * print the key first, unless SA_SKIPFIRSTKEY is set + * and we're on the first entry. + */ + if (key != NULL && + (i != 0 || !LF_ISSET(SA_SKIPFIRSTKEY))) + if ((ret = __db_prdbt(key, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + + beg = h->inp[i]; + switch (B_TYPE(bk->type)) { + case B_DUPLICATE: + end = beg + BOVERFLOW_SIZE - 1; + /* + * If we're not on a normal btree leaf page, + * there shouldn't be off-page + * dup sets. Something's confused; just + * drop it, and the code to pick up unlinked + * offpage dup sets will print it out + * with key "UNKNOWN" later. + */ + if (pgtype != P_LBTREE) + break; + + bo = (BOVERFLOW *)bk; + + /* + * If the page number is unreasonable, or + * if this is supposed to be a key item, + * just spit out "UNKNOWN"--the best we + * can do is run into the data items in the + * unlinked offpage dup pass. + */ + if (!IS_VALID_PGNO(bo->pgno) || + (i % P_INDX == 0)) { + /* Not much to do on failure. */ + if ((ret = __db_prdbt(&unkdbt, 0, " ", + handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + } + + if ((ret = __db_salvage_duptree(dbp, + vdp, bo->pgno, &dbt, handle, callback, + flags | SA_SKIPFIRSTKEY)) != 0) + err_ret = ret; + + break; + case B_KEYDATA: + end = ALIGN(beg + bk->len, sizeof(u_int32_t)) - 1; + dbt.data = bk->data; + dbt.size = bk->len; + if ((ret = __db_prdbt(&dbt, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + case B_OVERFLOW: + end = beg + BOVERFLOW_SIZE - 1; + bo = (BOVERFLOW *)bk; + if ((ret = __db_safe_goff(dbp, vdp, + bo->pgno, &dbt, &ovflbuf, flags)) != 0) { + err_ret = ret; + /* We care about err_ret more. */ + (void)__db_prdbt(&unkdbt, 0, " ", + handle, callback, 0, NULL); + break; + } + if ((ret = __db_prdbt(&dbt, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + default: + /* + * We should never get here; __db_vrfy_inpitem + * should not be returning 0 if bk->type + * is unrecognizable. + */ + DB_ASSERT(0); + return (EINVAL); + } + + /* + * If we're being aggressive, mark the beginning + * and end of the item; we'll come back and print + * whatever "junk" is in the gaps in case we had + * any bogus inp elements and thereby missed stuff. + */ + if (LF_ISSET(DB_AGGRESSIVE)) { + pgmap[beg] = ITEM_BEGIN; + pgmap[end] = ITEM_END; + } + } + } + + /* + * If i is odd and this is a btree leaf, we've printed out a key but not + * a datum; fix this imbalance by printing an "UNKNOWN". + */ + if (pgtype == P_LBTREE && (i % P_INDX == 1) && ((ret = + __db_prdbt(&unkdbt, 0, " ", handle, callback, 0, NULL)) != 0)) + err_ret = ret; + +err: if (pgmap != NULL) + __os_free(pgmap, 0); + __os_free(ovflbuf, 0); + + /* Mark this page as done. */ + if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0) + return (t_ret); + + return ((err_ret != 0) ? err_ret : ret); +} + +/* + * __bam_salvage_walkdupint -- + * Walk a known-good btree or recno internal page which is part of + * a dup tree, calling __db_salvage_duptree on each child page. 
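+ *
+ * [Editor's sketch -- not part of the original source.  The one subtle
+ * point below is that SA_SKIPFIRSTKEY may apply only to the leftmost
+ * child, so the flag is stripped after the first iteration;
+ * walk_children and salvage_child are hypothetical names.]
+ */
+#if 0	/* Illustration only; never compiled. */
+static void
+walk_children(children, nchildren, flags)
+	db_pgno_t *children;
+	int nchildren;
+	u_int32_t flags;
+{
+	int i;
+
+	for (i = 0; i < nchildren; i++) {
+		(void)salvage_child(children[i], flags);
+		flags &= ~SA_SKIPFIRSTKEY;	/* Only child 0 inherits it. */
+	}
+}
+#endif
+
+/*
+ * __bam_salvage_walkdupint (see above) --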
+ *
+ * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *,
+ * PUBLIC:     DBT *, void *, int (*)(void *, const void *), u_int32_t));
+ */
+int
+__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	PAGE *h;
+	DBT *key;
+	void *handle;
+	int (*callback) __P((void *, const void *));
+	u_int32_t flags;
+{
+	RINTERNAL *ri;
+	BINTERNAL *bi;
+	int ret, t_ret;
+	db_indx_t i;
+
+	ret = 0;
+	for (i = 0; i < NUM_ENT(h); i++) {
+		switch (TYPE(h)) {
+		case P_IBTREE:
+			bi = GET_BINTERNAL(h, i);
+			if ((t_ret = __db_salvage_duptree(dbp,
+			    vdp, bi->pgno, key, handle, callback, flags)) != 0)
+				ret = t_ret;
+			break;
+		case P_IRECNO:
+			ri = GET_RINTERNAL(h, i);
+			if ((t_ret = __db_salvage_duptree(dbp,
+			    vdp, ri->pgno, key, handle, callback, flags)) != 0)
+				ret = t_ret;
+			break;
+		default:
+			__db_err(dbp->dbenv,
+			    "__bam_salvage_walkdupint called on non-internal page");
+			DB_ASSERT(0);
+			return (EINVAL);
+		}
+		/* Pass SA_SKIPFIRSTKEY, if set, on to the 0th child only. */
+		flags &= ~LF_ISSET(SA_SKIPFIRSTKEY);
+	}
+
+	return (ret);
+}
+
+/*
+ * __bam_meta2pgset --
+ *	Given a known-good meta page, return in pgsetp a 0-terminated list of
+ * db_pgno_t's corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly:  we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones), then we
+ * consolidate our scratch array into a nice list, and return.  This
+ * avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC:     u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	BTMETA *btmeta;
+	u_int32_t flags;
+	DB *pgset;
+{
+	BINTERNAL *bi;
+	PAGE *h;
+	RINTERNAL *ri;
+	db_pgno_t current, p;
+	int err_ret, ret;
+
+	h = NULL;
+	ret = err_ret = 0;
+	DB_ASSERT(pgset != NULL);
+	for (current = btmeta->root;;) {
+		if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+			err_ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		if ((ret = memp_fget(dbp->mpf, &current, 0, &h)) != 0) {
+			err_ret = ret;
+			goto err;
+		}
+
+		switch (TYPE(h)) {
+		case P_IBTREE:
+		case P_IRECNO:
+			if ((ret = __bam_vrfy(dbp,
+			    vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+				err_ret = ret;
+				goto err;
+			}
+			if (TYPE(h) == P_IBTREE) {
+				bi = GET_BINTERNAL(h, 0);
+				current = bi->pgno;
+			} else {	/* P_IRECNO */
+				ri = GET_RINTERNAL(h, 0);
+				current = ri->pgno;
+			}
+			break;
+		case P_LBTREE:
+		case P_LRECNO:
+			goto traverse;
+		default:
+			err_ret = DB_VERIFY_BAD;
+			goto err;
+		}
+
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			err_ret = ret;
+		h = NULL;
+	}
+
+	/*
+	 * At this point, current is the pgno of leaf page h, the 0th in the
+	 * tree we're concerned with.
+	 */
+traverse:
+	while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+		if (h == NULL &&
+		    (ret = memp_fget(dbp->mpf, &current, 0, &h)) != 0) {
+			err_ret = ret;
+			break;
+		}
+
+		if ((ret = __db_vrfy_pgset_get(pgset, current, (int *)&p)) != 0)
+			goto err;
+
+		if (p != 0) {
+			/*
+			 * We've found a cycle.  Return success anyway--
+			 * our caller may as well use however much of
+			 * the pgset we've come up with.
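+			 *
+			 * [Editor's example: if the NEXT_PGNO chain were
+			 * 2 -> 5 -> 7 -> 5, the second visit to page 5
+			 * finds its counter already set and we stop here,
+			 * still returning the set {2, 5, 7}.]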
+			 */
+			break;
+		}
+		if ((ret = __db_vrfy_pgset_inc(pgset, current)) != 0)
+			goto err;
+
+		current = NEXT_PGNO(h);
+		if ((ret = memp_fput(dbp->mpf, h, 0)) != 0)
+			err_ret = ret;
+		h = NULL;
+	}
+
+err:	if (h != NULL)
+		(void)memp_fput(dbp->mpf, h, 0);
+
+	return (ret == 0 ? err_ret : ret);
+}
+
+/*
+ * __bam_safe_getdata --
+ *
+ *	Utility function for __bam_vrfy_itemorder.  Safely gets the datum at
+ * index i, page h, and sticks it in DBT dbt.  If ovflok is 1 and i is an
+ * overflow item, we do a safe_goff to get the item and signal that we need
+ * to free dbt->data; if ovflok is 0, we leave the DBT zeroed.
+ */
+static int
+__bam_safe_getdata(dbp, h, i, ovflok, dbt, freedbtp)
+	DB *dbp;
+	PAGE *h;
+	u_int32_t i;
+	int ovflok;
+	DBT *dbt;
+	int *freedbtp;
+{
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+
+	memset(dbt, 0, sizeof(DBT));
+	*freedbtp = 0;
+
+	bk = GET_BKEYDATA(h, i);
+	if (B_TYPE(bk->type) == B_OVERFLOW) {
+		if (!ovflok)
+			return (0);
+
+		bo = (BOVERFLOW *)bk;
+		F_SET(dbt, DB_DBT_MALLOC);
+
+		*freedbtp = 1;
+		return (__db_goff(dbp, dbt, bo->tlen, bo->pgno, NULL, NULL));
+	} else {
+		dbt->data = bk->data;
+		dbt->size = bk->len;
+	}
+
+	return (0);
+}
diff --git a/db/btree/btree.src b/db/btree/btree.src
new file mode 100644
index 000000000..a1eba7d7f
--- /dev/null
+++ b/db/btree/btree.src
@@ -0,0 +1,296 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ *	Sleepycat Software.  All rights reserved.
+ *
+ * $Id: btree.src,v 10.26 2000/12/12 17:40:23 bostic Exp $
+ */
+
+PREFIX bam
+
+INCLUDE #include "db_config.h"
+INCLUDE
+INCLUDE #ifndef NO_SYSTEM_INCLUDES
+INCLUDE #include <sys/types.h>
+INCLUDE
+INCLUDE #include <ctype.h>
+INCLUDE #include <errno.h>
+INCLUDE #include <string.h>
+INCLUDE #endif
+INCLUDE
+INCLUDE #include "db_int.h"
+INCLUDE #include "db_page.h"
+INCLUDE #include "db_dispatch.h"
+INCLUDE #include "db_am.h"
+INCLUDE #include "btree.h"
+INCLUDE #include "txn.h"
+INCLUDE
+
+/*
+ * BTREE-pg_alloc: used to record allocating a new page.
+ *
+ * meta_lsn: the meta-data page's original lsn.
+ * page_lsn: the allocated page's original lsn.
+ * pgno: the page allocated.
+ * ptype: the type of the newly allocated page.
+ * next: the next page on the free list.
+ */
+BEGIN pg_alloc 51
+ARG fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+DEPRECATED pg_alloc1 60
+ARG fileid int32_t ld
+POINTER meta_lsn DB_LSN * lu
+POINTER alloc_lsn DB_LSN * lu
+POINTER page_lsn DB_LSN * lu
+ARG pgno db_pgno_t lu
+ARG ptype u_int32_t lu
+ARG next db_pgno_t lu
+END
+
+/*
+ * BTREE-pg_free: used to record freeing a page.
+ *
+ * pgno: the page being freed.
+ * meta_lsn: the meta-data page's original lsn.
+ * header: the header from the free'd page.
+ * next: the previous next pointer on the metadata page.
+ */
+BEGIN pg_free 52
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+DBT header DBT s
+ARG next db_pgno_t lu
+END
+
+DEPRECATED pg_free1 61
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER meta_lsn DB_LSN * lu
+POINTER alloc_lsn DB_LSN * lu
+DBT header DBT s
+ARG next db_pgno_t lu
+END
+
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * pg: the split page's contents before the split.
+ */
+DEPRECATED split1 53
+ARG fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+DBT pg DBT s
+END
+
+/*
+ * BTREE-split: used to log a page split.
+ *
+ * left: the page number for the low-order contents.
+ * llsn: the left page's original LSN.
+ * right: the page number for the high-order contents.
+ * rlsn: the right page's original LSN.
+ * indx: the number of entries that went to the left page.
+ * npgno: the next page number
+ * nlsn: the next page's original LSN (or 0 if no next page).
+ * root_pgno: the root page number
+ * pg: the split page's contents before the split.
+ * opflags: SPL_NRECS: if splitting a tree that maintains a record count.
+ */
+BEGIN split 62
+ARG fileid int32_t ld
+ARG left db_pgno_t lu
+POINTER llsn DB_LSN * lu
+ARG right db_pgno_t lu
+POINTER rlsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG npgno db_pgno_t lu
+POINTER nlsn DB_LSN * lu
+ARG root_pgno db_pgno_t lu
+DBT pg DBT s
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse-split.
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied on the root page.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+DEPRECATED rsplit1 54
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+DBT pgdbt DBT s
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-rsplit: used to log a reverse-split.
+ *
+ * pgno: the page number of the page copied over the root.
+ * pgdbt: the page being copied on the root page.
+ * root_pgno: the root page number.
+ * nrec: the tree's record count.
+ * rootent: last entry on the root page.
+ * rootlsn: the root page's original lsn.
+ */
+BEGIN rsplit 63
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+DBT pgdbt DBT s
+ARG root_pgno db_pgno_t lu
+ARG nrec db_pgno_t lu
+DBT rootent DBT s
+POINTER rootlsn DB_LSN * lu
+END
+
+/*
+ * BTREE-adj: used to log the adjustment of an index.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index adjusted.
+ * indx_copy: the index to copy if inserting.
+ * is_insert: 0 if a delete, 1 if an insert.
+ */
+BEGIN adj 55
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG indx_copy u_int32_t lu
+ARG is_insert u_int32_t lu
+END
+
+/*
+ * BTREE-cadjust: used to adjust the count change in an internal page.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be adjusted.
+ * adjust: the signed adjustment.
+ * opflags: CAD_UPDATEROOT: if root page count was adjusted.
+ */
+BEGIN cadjust 56
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+ARG adjust int32_t ld
+ARG opflags u_int32_t lu
+END
+
+/*
+ * BTREE-cdel: used to log the intent-to-delete of a cursor record.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * indx: the index to be deleted.
+ */
+BEGIN cdel 57
+ARG fileid int32_t ld
+ARG pgno db_pgno_t lu
+POINTER lsn DB_LSN * lu
+ARG indx u_int32_t lu
+END
+
+/*
+ * BTREE-repl: used to log the replacement of an item.
+ *
+ * pgno: the page modified.
+ * lsn: the page's original lsn.
+ * orig: the original data. + * new: the replacement data. + * duplicate: the prefix of the replacement that matches the original. + */ +BEGIN repl 58 +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG isdeleted u_int32_t lu +DBT orig DBT s +DBT repl DBT s +ARG prefix u_int32_t lu +ARG suffix u_int32_t lu +END + +/* + * BTREE-root: log the assignment of a root btree page. + */ +BEGIN root 59 +ARG fileid int32_t ld +ARG meta_pgno db_pgno_t lu +ARG root_pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +END + +/* + * BTREE-curadj: undo cursor adjustments on txn abort. + * Should only be processed during DB_TXN_ABORT. + * NOTE: the first_indx field gets used to hold + * signed index adjustment in one case. + * care should be taken if its size is changed. + */ +BEGIN curadj 64 +/* Fileid of db affected. */ +ARG fileid int32_t ld +/* Which adjustment. */ +ARG mode db_ca_mode ld +/* Page entry is from. */ +ARG from_pgno db_pgno_t lu +/* Page entry went to. */ +ARG to_pgno db_pgno_t lu +/* Left page of root split. */ +ARG left_pgno db_pgno_t lu +/* First index of dup set. Also used as adjustment. */ +ARG first_indx u_int32_t lu +/* Index entry is from. */ +ARG from_indx u_int32_t lu +/* Index where entry went. */ +ARG to_indx u_int32_t lu +END + +/* + * BTREE-rcuradj: undo cursor adjustments on txn abort in + * renumbering recno trees. + * Should only be processed during DB_TXN_ABORT. + */ +BEGIN rcuradj 65 +/* Fileid of db affected. */ +ARG fileid int32_t ld +/* Which adjustment. */ +ARG mode ca_recno_arg ld +/* Root page number. */ +ARG root db_pgno_t ld +/* Recno of the adjustment. */ +ARG recno db_recno_t ld +/* Order number of the adjustment. */ +ARG order u_int32_t ld +END diff --git a/db/btree/btree_auto.c b/db/btree/btree_auto.c new file mode 100644 index 000000000..fdb27b7d2 --- /dev/null +++ b/db/btree/btree_auto.c @@ -0,0 +1,2284 @@ +/* Do not edit: automatically built by gen_rec.awk. */ +#include "db_config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "btree.h" +#include "txn.h" + +int +__bam_pg_alloc_log(dbenv, txnid, ret_lsnp, flags, + fileid, meta_lsn, page_lsn, pgno, ptype, next) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + DB_LSN * meta_lsn; + DB_LSN * page_lsn; + db_pgno_t pgno; + u_int32_t ptype; + db_pgno_t next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_pg_alloc; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(*meta_lsn) + + sizeof(*page_lsn) + + sizeof(pgno) + + sizeof(ptype) + + sizeof(next); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + if (page_lsn != NULL) + memcpy(bp, page_lsn, sizeof(*page_lsn)); + else + memset(bp, 0, sizeof(*page_lsn)); + bp += sizeof(*page_lsn); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ptype, sizeof(ptype)); + bp += sizeof(ptype); + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_pg_alloc_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_pg_alloc_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_pg_alloc_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_alloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\tpage_lsn: [%lu][%lu]\n", + (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tptype: %lu\n", (u_long)argp->ptype); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_pg_alloc_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_pg_alloc_args **argpp; +{ + __bam_pg_alloc_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_pg_alloc_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn)); + bp += sizeof(argp->page_lsn); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ptype, bp, sizeof(argp->ptype)); + bp += sizeof(argp->ptype); + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +int 
+__bam_pg_alloc1_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_pg_alloc1_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_pg_alloc1_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_alloc1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\talloc_lsn: [%lu][%lu]\n", + (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset); + printf("\tpage_lsn: [%lu][%lu]\n", + (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tptype: %lu\n", (u_long)argp->ptype); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_pg_alloc1_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_pg_alloc1_args **argpp; +{ + __bam_pg_alloc1_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_pg_alloc1_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn)); + bp += sizeof(argp->alloc_lsn); + memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn)); + bp += sizeof(argp->page_lsn); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ptype, bp, sizeof(argp->ptype)); + bp += sizeof(argp->ptype); + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +int +__bam_pg_free_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, meta_lsn, header, next) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + DB_LSN * meta_lsn; + const DBT *header; + db_pgno_t next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_pg_free; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + (header == NULL ? 
0 : header->size) + + sizeof(next); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + if (header == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &header->size, sizeof(header->size)); + bp += sizeof(header->size); + memcpy(bp, header->data, header->size); + bp += header->size; + } + memcpy(bp, &next, sizeof(next)); + bp += sizeof(next); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_pg_free_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_pg_free_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_pg_free_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_free: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_pg_free_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_pg_free_args **argpp; +{ + __bam_pg_free_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_pg_free_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memset(&argp->header, 0, sizeof(argp->header)); + memcpy(&argp->header.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +int +__bam_pg_free1_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + 
__bam_pg_free1_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_pg_free1_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_pg_free1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\talloc_lsn: [%lu][%lu]\n", + (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset); + printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tnext: %lu\n", (u_long)argp->next); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_pg_free1_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_pg_free1_args **argpp; +{ + __bam_pg_free1_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_pg_free1_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn)); + bp += sizeof(argp->alloc_lsn); + memset(&argp->header, 0, sizeof(argp->header)); + memcpy(&argp->header.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + memcpy(&argp->next, bp, sizeof(argp->next)); + bp += sizeof(argp->next); + *argpp = argp; + return (0); +} + +int +__bam_split1_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_split1_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_split1_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_split1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tleft: %lu\n", (u_long)argp->left); + printf("\tllsn: [%lu][%lu]\n", + (u_long)argp->llsn.file, (u_long)argp->llsn.offset); + printf("\tright: %lu\n", (u_long)argp->right); + printf("\trlsn: [%lu][%lu]\n", + (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tnpgno: %lu\n", (u_long)argp->npgno); + printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + printf("\tpg: "); + for (i = 0; i < argp->pg.size; i++) { + ch = ((u_int8_t *)argp->pg.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + 
printf("%#x ", ch); + } + printf("\n"); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_split1_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_split1_args **argpp; +{ + __bam_split1_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_split1_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->left, bp, sizeof(argp->left)); + bp += sizeof(argp->left); + memcpy(&argp->llsn, bp, sizeof(argp->llsn)); + bp += sizeof(argp->llsn); + memcpy(&argp->right, bp, sizeof(argp->right)); + bp += sizeof(argp->right); + memcpy(&argp->rlsn, bp, sizeof(argp->rlsn)); + bp += sizeof(argp->rlsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->npgno, bp, sizeof(argp->npgno)); + bp += sizeof(argp->npgno); + memcpy(&argp->nlsn, bp, sizeof(argp->nlsn)); + bp += sizeof(argp->nlsn); + memset(&argp->pg, 0, sizeof(argp->pg)); + memcpy(&argp->pg.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pg.data = bp; + bp += argp->pg.size; + *argpp = argp; + return (0); +} + +int +__bam_split_log(dbenv, txnid, ret_lsnp, flags, + fileid, left, llsn, right, rlsn, indx, + npgno, nlsn, root_pgno, pg, opflags) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t left; + DB_LSN * llsn; + db_pgno_t right; + DB_LSN * rlsn; + u_int32_t indx; + db_pgno_t npgno; + DB_LSN * nlsn; + db_pgno_t root_pgno; + const DBT *pg; + u_int32_t opflags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_split; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(left) + + sizeof(*llsn) + + sizeof(right) + + sizeof(*rlsn) + + sizeof(indx) + + sizeof(npgno) + + sizeof(*nlsn) + + sizeof(root_pgno) + + sizeof(u_int32_t) + (pg == NULL ? 
0 : pg->size) + + sizeof(opflags); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &left, sizeof(left)); + bp += sizeof(left); + if (llsn != NULL) + memcpy(bp, llsn, sizeof(*llsn)); + else + memset(bp, 0, sizeof(*llsn)); + bp += sizeof(*llsn); + memcpy(bp, &right, sizeof(right)); + bp += sizeof(right); + if (rlsn != NULL) + memcpy(bp, rlsn, sizeof(*rlsn)); + else + memset(bp, 0, sizeof(*rlsn)); + bp += sizeof(*rlsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &npgno, sizeof(npgno)); + bp += sizeof(npgno); + if (nlsn != NULL) + memcpy(bp, nlsn, sizeof(*nlsn)); + else + memset(bp, 0, sizeof(*nlsn)); + bp += sizeof(*nlsn); + memcpy(bp, &root_pgno, sizeof(root_pgno)); + bp += sizeof(root_pgno); + if (pg == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pg->size, sizeof(pg->size)); + bp += sizeof(pg->size); + memcpy(bp, pg->data, pg->size); + bp += pg->size; + } + memcpy(bp, &opflags, sizeof(opflags)); + bp += sizeof(opflags); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_split_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_split_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_split_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tleft: %lu\n", (u_long)argp->left); + printf("\tllsn: [%lu][%lu]\n", + (u_long)argp->llsn.file, (u_long)argp->llsn.offset); + printf("\tright: %lu\n", (u_long)argp->right); + printf("\trlsn: [%lu][%lu]\n", + (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tnpgno: %lu\n", (u_long)argp->npgno); + printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + printf("\tpg: "); + for (i = 0; i < argp->pg.size; i++) { + ch = ((u_int8_t *)argp->pg.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\topflags: %lu\n", (u_long)argp->opflags); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_split_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_split_args **argpp; +{ + __bam_split_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_split_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += 
sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->left, bp, sizeof(argp->left)); + bp += sizeof(argp->left); + memcpy(&argp->llsn, bp, sizeof(argp->llsn)); + bp += sizeof(argp->llsn); + memcpy(&argp->right, bp, sizeof(argp->right)); + bp += sizeof(argp->right); + memcpy(&argp->rlsn, bp, sizeof(argp->rlsn)); + bp += sizeof(argp->rlsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->npgno, bp, sizeof(argp->npgno)); + bp += sizeof(argp->npgno); + memcpy(&argp->nlsn, bp, sizeof(argp->nlsn)); + bp += sizeof(argp->nlsn); + memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno)); + bp += sizeof(argp->root_pgno); + memset(&argp->pg, 0, sizeof(argp->pg)); + memcpy(&argp->pg.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pg.data = bp; + bp += argp->pg.size; + memcpy(&argp->opflags, bp, sizeof(argp->opflags)); + bp += sizeof(argp->opflags); + *argpp = argp; + return (0); +} + +int +__bam_rsplit1_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_rsplit1_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_rsplit1_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_rsplit1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpgdbt: "); + for (i = 0; i < argp->pgdbt.size; i++) { + ch = ((u_int8_t *)argp->pgdbt.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tnrec: %lu\n", (u_long)argp->nrec); + printf("\trootent: "); + for (i = 0; i < argp->rootent.size; i++) { + ch = ((u_int8_t *)argp->rootent.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\trootlsn: [%lu][%lu]\n", + (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_rsplit1_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_rsplit1_args **argpp; +{ + __bam_rsplit1_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_rsplit1_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memset(&argp->pgdbt, 0, sizeof(argp->pgdbt)); + memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pgdbt.data = bp; + bp += argp->pgdbt.size; + memcpy(&argp->nrec, bp, sizeof(argp->nrec)); + bp += sizeof(argp->nrec); + memset(&argp->rootent, 0, sizeof(argp->rootent)); + memcpy(&argp->rootent.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->rootent.data = 
bp; + bp += argp->rootent.size; + memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn)); + bp += sizeof(argp->rootlsn); + *argpp = argp; + return (0); +} + +int +__bam_rsplit_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, pgdbt, root_pgno, nrec, rootent, + rootlsn) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + const DBT *pgdbt; + db_pgno_t root_pgno; + db_pgno_t nrec; + const DBT *rootent; + DB_LSN * rootlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_rsplit; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size) + + sizeof(root_pgno) + + sizeof(nrec) + + sizeof(u_int32_t) + (rootent == NULL ? 0 : rootent->size) + + sizeof(*rootlsn); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pgdbt == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pgdbt->size, sizeof(pgdbt->size)); + bp += sizeof(pgdbt->size); + memcpy(bp, pgdbt->data, pgdbt->size); + bp += pgdbt->size; + } + memcpy(bp, &root_pgno, sizeof(root_pgno)); + bp += sizeof(root_pgno); + memcpy(bp, &nrec, sizeof(nrec)); + bp += sizeof(nrec); + if (rootent == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &rootent->size, sizeof(rootent->size)); + bp += sizeof(rootent->size); + memcpy(bp, rootent->data, rootent->size); + bp += rootent->size; + } + if (rootlsn != NULL) + memcpy(bp, rootlsn, sizeof(*rootlsn)); + else + memset(bp, 0, sizeof(*rootlsn)); + bp += sizeof(*rootlsn); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_rsplit_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_rsplit_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_rsplit_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_rsplit: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpgdbt: "); + for (i = 0; i < argp->pgdbt.size; i++) { + ch = ((u_int8_t *)argp->pgdbt.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + 
printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + printf("\tnrec: %lu\n", (u_long)argp->nrec); + printf("\trootent: "); + for (i = 0; i < argp->rootent.size; i++) { + ch = ((u_int8_t *)argp->rootent.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\trootlsn: [%lu][%lu]\n", + (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_rsplit_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_rsplit_args **argpp; +{ + __bam_rsplit_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_rsplit_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memset(&argp->pgdbt, 0, sizeof(argp->pgdbt)); + memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pgdbt.data = bp; + bp += argp->pgdbt.size; + memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno)); + bp += sizeof(argp->root_pgno); + memcpy(&argp->nrec, bp, sizeof(argp->nrec)); + bp += sizeof(argp->nrec); + memset(&argp->rootent, 0, sizeof(argp->rootent)); + memcpy(&argp->rootent.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->rootent.data = bp; + bp += argp->rootent.size; + memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn)); + bp += sizeof(argp->rootlsn); + *argpp = argp; + return (0); +} + +int +__bam_adj_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, indx_copy, is_insert) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + u_int32_t indx_copy; + u_int32_t is_insert; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_adj; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(indx_copy) + + sizeof(is_insert); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &indx_copy, sizeof(indx_copy)); + bp += sizeof(indx_copy); + memcpy(bp, &is_insert, sizeof(is_insert)); + bp += sizeof(is_insert); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_adj_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_adj_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_adj_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_adj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); + printf("\tis_insert: %lu\n", (u_long)argp->is_insert); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_adj_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_adj_args **argpp; +{ + __bam_adj_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_adj_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->indx_copy, bp, sizeof(argp->indx_copy)); + bp += sizeof(argp->indx_copy); + memcpy(&argp->is_insert, bp, sizeof(argp->is_insert)); + bp += sizeof(argp->is_insert); + *argpp = argp; + return (0); +} + +int +__bam_cadjust_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, adjust, opflags) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + 
db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + int32_t adjust; + u_int32_t opflags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_cadjust; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(adjust) + + sizeof(opflags); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &adjust, sizeof(adjust)); + bp += sizeof(adjust); + memcpy(bp, &opflags, sizeof(opflags)); + bp += sizeof(opflags); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_cadjust_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_cadjust_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_cadjust_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_cadjust: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tadjust: %ld\n", (long)argp->adjust); + printf("\topflags: %lu\n", (u_long)argp->opflags); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_cadjust_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_cadjust_args **argpp; +{ + __bam_cadjust_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_cadjust_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->adjust, bp, 
sizeof(argp->adjust)); + bp += sizeof(argp->adjust); + memcpy(&argp->opflags, bp, sizeof(argp->opflags)); + bp += sizeof(argp->opflags); + *argpp = argp; + return (0); +} + +int +__bam_cdel_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_cdel; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_cdel_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_cdel_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_cdel_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_cdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_cdel_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_cdel_args **argpp; +{ + __bam_cdel_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_cdel_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + *argpp = argp; + 
return (0); +} + +int +__bam_repl_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, lsn, indx, isdeleted, orig, + repl, prefix, suffix) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + u_int32_t isdeleted; + const DBT *orig; + const DBT *repl; + u_int32_t prefix; + u_int32_t suffix; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_repl; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*lsn) + + sizeof(indx) + + sizeof(isdeleted) + + sizeof(u_int32_t) + (orig == NULL ? 0 : orig->size) + + sizeof(u_int32_t) + (repl == NULL ? 0 : repl->size) + + sizeof(prefix) + + sizeof(suffix); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (lsn != NULL) + memcpy(bp, lsn, sizeof(*lsn)); + else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &isdeleted, sizeof(isdeleted)); + bp += sizeof(isdeleted); + if (orig == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &orig->size, sizeof(orig->size)); + bp += sizeof(orig->size); + memcpy(bp, orig->data, orig->size); + bp += orig->size; + } + if (repl == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &repl->size, sizeof(repl->size)); + bp += sizeof(repl->size); + memcpy(bp, repl->data, repl->size); + bp += repl->size; + } + memcpy(bp, &prefix, sizeof(prefix)); + bp += sizeof(prefix); + memcpy(bp, &suffix, sizeof(suffix)); + bp += sizeof(suffix); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_repl_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_repl_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_repl_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_repl: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tisdeleted: %lu\n", (u_long)argp->isdeleted); + printf("\torig: "); + for 
(i = 0; i < argp->orig.size; i++) { + ch = ((u_int8_t *)argp->orig.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\trepl: "); + for (i = 0; i < argp->repl.size; i++) { + ch = ((u_int8_t *)argp->repl.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tprefix: %lu\n", (u_long)argp->prefix); + printf("\tsuffix: %lu\n", (u_long)argp->suffix); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_repl_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_repl_args **argpp; +{ + __bam_repl_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_repl_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); + bp += sizeof(argp->lsn); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->isdeleted, bp, sizeof(argp->isdeleted)); + bp += sizeof(argp->isdeleted); + memset(&argp->orig, 0, sizeof(argp->orig)); + memcpy(&argp->orig.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->orig.data = bp; + bp += argp->orig.size; + memset(&argp->repl, 0, sizeof(argp->repl)); + memcpy(&argp->repl.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->repl.data = bp; + bp += argp->repl.size; + memcpy(&argp->prefix, bp, sizeof(argp->prefix)); + bp += sizeof(argp->prefix); + memcpy(&argp->suffix, bp, sizeof(argp->suffix)); + bp += sizeof(argp->suffix); + *argpp = argp; + return (0); +} + +int +__bam_root_log(dbenv, txnid, ret_lsnp, flags, + fileid, meta_pgno, root_pgno, meta_lsn) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t meta_pgno; + db_pgno_t root_pgno; + DB_LSN * meta_lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_root; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(meta_pgno) + + sizeof(root_pgno) + + sizeof(*meta_lsn); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &meta_pgno, sizeof(meta_pgno)); + bp += sizeof(meta_pgno); + memcpy(bp, &root_pgno, sizeof(root_pgno)); + bp += sizeof(root_pgno); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_root_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_root_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_root_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_root: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_root_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_root_args **argpp; +{ + __bam_root_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_root_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_pgno, bp, sizeof(argp->meta_pgno)); + bp += sizeof(argp->meta_pgno); + memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno)); + bp += sizeof(argp->root_pgno); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + *argpp = argp; + return (0); +} + +int +__bam_curadj_log(dbenv, txnid, ret_lsnp, flags, + fileid, mode, from_pgno, to_pgno, left_pgno, first_indx, + from_indx, to_indx) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_ca_mode mode; + db_pgno_t from_pgno; + db_pgno_t to_pgno; + db_pgno_t left_pgno; + u_int32_t first_indx; + u_int32_t from_indx; + u_int32_t to_indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_curadj; + if (txnid 
!= NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(mode) + + sizeof(from_pgno) + + sizeof(to_pgno) + + sizeof(left_pgno) + + sizeof(first_indx) + + sizeof(from_indx) + + sizeof(to_indx); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &mode, sizeof(mode)); + bp += sizeof(mode); + memcpy(bp, &from_pgno, sizeof(from_pgno)); + bp += sizeof(from_pgno); + memcpy(bp, &to_pgno, sizeof(to_pgno)); + bp += sizeof(to_pgno); + memcpy(bp, &left_pgno, sizeof(left_pgno)); + bp += sizeof(left_pgno); + memcpy(bp, &first_indx, sizeof(first_indx)); + bp += sizeof(first_indx); + memcpy(bp, &from_indx, sizeof(from_indx)); + bp += sizeof(from_indx); + memcpy(bp, &to_indx, sizeof(to_indx)); + bp += sizeof(to_indx); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_curadj_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_curadj_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_curadj_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_curadj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmode: %ld\n", (long)argp->mode); + printf("\tfrom_pgno: %lu\n", (u_long)argp->from_pgno); + printf("\tto_pgno: %lu\n", (u_long)argp->to_pgno); + printf("\tleft_pgno: %lu\n", (u_long)argp->left_pgno); + printf("\tfirst_indx: %lu\n", (u_long)argp->first_indx); + printf("\tfrom_indx: %lu\n", (u_long)argp->from_indx); + printf("\tto_indx: %lu\n", (u_long)argp->to_indx); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_curadj_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_curadj_args **argpp; +{ + __bam_curadj_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_curadj_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->mode, bp, sizeof(argp->mode)); + bp += sizeof(argp->mode); + memcpy(&argp->from_pgno, bp, sizeof(argp->from_pgno)); + bp += sizeof(argp->from_pgno); + memcpy(&argp->to_pgno, bp, 
sizeof(argp->to_pgno)); + bp += sizeof(argp->to_pgno); + memcpy(&argp->left_pgno, bp, sizeof(argp->left_pgno)); + bp += sizeof(argp->left_pgno); + memcpy(&argp->first_indx, bp, sizeof(argp->first_indx)); + bp += sizeof(argp->first_indx); + memcpy(&argp->from_indx, bp, sizeof(argp->from_indx)); + bp += sizeof(argp->from_indx); + memcpy(&argp->to_indx, bp, sizeof(argp->to_indx)); + bp += sizeof(argp->to_indx); + *argpp = argp; + return (0); +} + +int +__bam_rcuradj_log(dbenv, txnid, ret_lsnp, flags, + fileid, mode, root, recno, order) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + ca_recno_arg mode; + db_pgno_t root; + db_recno_t recno; + u_int32_t order; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_bam_rcuradj; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(mode) + + sizeof(root) + + sizeof(recno) + + sizeof(order); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &mode, sizeof(mode)); + bp += sizeof(mode); + memcpy(bp, &root, sizeof(root)); + bp += sizeof(root); + memcpy(bp, &recno, sizeof(recno)); + bp += sizeof(recno); + memcpy(bp, &order, sizeof(order)); + bp += sizeof(order); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__bam_rcuradj_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_rcuradj_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __bam_rcuradj_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]bam_rcuradj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmode: %ld\n", (long)argp->mode); + printf("\troot: %ld\n", (long)argp->root); + printf("\trecno: %ld\n", (long)argp->recno); + printf("\torder: %ld\n", (long)argp->order); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__bam_rcuradj_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __bam_rcuradj_args **argpp; +{ + __bam_rcuradj_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__bam_rcuradj_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, 
bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->mode, bp, sizeof(argp->mode)); + bp += sizeof(argp->mode); + memcpy(&argp->root, bp, sizeof(argp->root)); + bp += sizeof(argp->root); + memcpy(&argp->recno, bp, sizeof(argp->recno)); + bp += sizeof(argp->recno); + memcpy(&argp->order, bp, sizeof(argp->order)); + bp += sizeof(argp->order); + *argpp = argp; + return (0); +} + +int +__bam_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc_print, DB_bam_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc1_print, DB_bam_pg_alloc1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free_print, DB_bam_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free1_print, DB_bam_pg_free1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split1_print, DB_bam_split1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split_print, DB_bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit1_print, DB_bam_rsplit1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit_print, DB_bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_adj_print, DB_bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cadjust_print, DB_bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cdel_print, DB_bam_cdel)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_repl_print, DB_bam_repl)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_root_print, DB_bam_root)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_curadj_print, DB_bam_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rcuradj_print, DB_bam_rcuradj)) != 0) + return (ret); + return (0); +} + +int +__bam_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __bam_pg_alloc_recover, DB_bam_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_bam_pg_alloc1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_pg_free_recover, DB_bam_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_bam_pg_free1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_bam_split1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_split_recover, DB_bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_bam_rsplit1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rsplit_recover, DB_bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_adj_recover, DB_bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cadjust_recover, DB_bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_cdel_recover, DB_bam_cdel)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_repl_recover, DB_bam_repl)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_root_recover, DB_bam_root)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_curadj_recover, DB_bam_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __bam_rcuradj_recover, 
DB_bam_rcuradj)) != 0)
+		return (ret);
+	return (0);
+}
+
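
The marshaling convention in the generated functions above is uniform: every record begins with a rectype / txn_num / prev-LSN header, followed by the operation's fields in declaration order; pointer-to-LSN arguments are zero-filled when NULL, and DBT arguments are written as a u_int32_t length prefix followed by the raw bytes (a zero prefix stands in for a NULL DBT). The matching __bam_*_read routine walks the identical layout in the identical order, which is why both sides are generated from the single record description in btree.src rather than written by hand. Below is a minimal standalone sketch of that round trip; the names (my_args, my_log_write, my_log_read) and types are invented for illustration and are not Berkeley DB's API.

/*
 * Hypothetical sketch of the __bam_*_log / __bam_*_read wire format:
 * fixed header, fixed fields in order, one length-prefixed DBT field.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct my_lsn { uint32_t file, offset; };

struct my_args {			/* stand-in for one record type */
	uint32_t type;			/* rectype */
	uint32_t txnid;			/* txn_num */
	struct my_lsn prev_lsn;		/* last LSN of the transaction */
	int32_t fileid;			/* first operation field */
	uint32_t hdr_size;		/* DBT field: length prefix... */
	uint8_t *hdr_data;		/* ...then hdr_size raw bytes */
};

static size_t
my_log_size(const struct my_args *a)
{
	return (sizeof(a->type) + sizeof(a->txnid) + sizeof(a->prev_lsn) +
	    sizeof(a->fileid) + sizeof(uint32_t) + a->hdr_size);
}

/* Mirror of a generated _log body: memcpy each field, advance bp. */
static void
my_log_write(const struct my_args *a, uint8_t *bp)
{
	memcpy(bp, &a->type, sizeof(a->type));		bp += sizeof(a->type);
	memcpy(bp, &a->txnid, sizeof(a->txnid));	bp += sizeof(a->txnid);
	memcpy(bp, &a->prev_lsn, sizeof(a->prev_lsn));	bp += sizeof(a->prev_lsn);
	memcpy(bp, &a->fileid, sizeof(a->fileid));	bp += sizeof(a->fileid);
	memcpy(bp, &a->hdr_size, sizeof(uint32_t));	bp += sizeof(uint32_t);
	memcpy(bp, a->hdr_data, a->hdr_size);
}

/* Mirror of the matching _read body: same fields, same order. */
static void
my_log_read(uint8_t *bp, struct my_args *a)
{
	memcpy(&a->type, bp, sizeof(a->type));		bp += sizeof(a->type);
	memcpy(&a->txnid, bp, sizeof(a->txnid));	bp += sizeof(a->txnid);
	memcpy(&a->prev_lsn, bp, sizeof(a->prev_lsn));	bp += sizeof(a->prev_lsn);
	memcpy(&a->fileid, bp, sizeof(a->fileid));	bp += sizeof(a->fileid);
	memcpy(&a->hdr_size, bp, sizeof(uint32_t));	bp += sizeof(uint32_t);
	a->hdr_data = bp;	/* points into the buffer, as _read does */
}

int
main(void)
{
	struct my_args in, out;
	uint8_t hdr[4] = { 0xde, 0xad, 0xbe, 0xef }, *buf;

	in.type = 51;			/* hypothetical rectype value */
	in.txnid = 0x80000001;
	in.prev_lsn.file = 1;
	in.prev_lsn.offset = 28;
	in.fileid = 3;
	in.hdr_size = sizeof(hdr);
	in.hdr_data = hdr;

	buf = malloc(my_log_size(&in));
	my_log_write(&in, buf);
	my_log_read(buf, &out);

	assert(out.fileid == in.fileid && out.hdr_size == in.hdr_size);
	assert(memcmp(out.hdr_data, hdr, sizeof(hdr)) == 0);
	printf("round-trip ok: type %u txnid %x\n",
	    (unsigned)out.type, (unsigned)out.txnid);
	free(buf);
	return (0);
}

Note that the real __bam_*_read routines likewise return DBT fields (header, pg, orig, repl, rootent) as pointers into the caller's record buffer rather than copies, so the buffer must outlive the args structure; afterward only the structure itself is freed, via __os_free(argp, 0).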
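__bam_init_print and __bam_init_recover register one callback per record type with __db_add_recovery; at recovery (or log-dump) time the correct function is selected by the rectype stored at the front of each record, and superseded formats such as DB_bam_pg_alloc1, DB_bam_pg_free1, DB_bam_split1, and DB_bam_rsplit1 are routed to __deprecated_recover instead of a live recovery function. The following is a hedged sketch of that dispatch-table pattern only; my_add_recovery, my_dispatch, and MY_MAX_RECTYPE are invented names, not __db_add_recovery's real signature.

/*
 * Hypothetical sketch: a rectype-indexed dispatch table, in the spirit of
 * the __bam_init_print / __bam_init_recover registration loops above.
 */
#include <stdio.h>

#define MY_MAX_RECTYPE	64

typedef int (*my_recfn)(void *rec);

static my_recfn my_dispatch[MY_MAX_RECTYPE];

static int
my_add_recovery(my_recfn fn, unsigned rectype)
{
	if (rectype >= MY_MAX_RECTYPE)
		return (1);
	my_dispatch[rectype] = fn;	/* one slot per record type */
	return (0);
}

static int
my_pg_alloc_recover(void *rec)
{
	(void)rec;
	printf("pg_alloc record\n");
	return (0);
}

int
main(void)
{
	unsigned rectype = 51;	/* hypothetical DB_bam_pg_alloc value */

	my_add_recovery(my_pg_alloc_recover, rectype);
	/* Recovery loop: read rectype from the record header, dispatch. */
	if (my_dispatch[rectype] != NULL)
		return (my_dispatch[rectype](NULL));
	return (0);
}

The same table serves both uses: db_printlog-style tools register the *_print functions against the rectypes, while recovery registers the *_recover functions, so the record format and its consumers stay in sync.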