From 8960e3895f7af91126465368dff8fbb36ab4e853 Mon Sep 17 00:00:00 2001 From: jbj Date: Mon, 15 Dec 2003 21:42:09 +0000 Subject: - upgrade to db-4.2.52. CVS patchset: 6972 CVS date: 2003/12/15 21:42:09 --- db/btree/bt_compare.c | 14 +- db/btree/bt_conv.c | 30 +- db/btree/bt_curadj.c | 87 +- db/btree/bt_cursor.c | 1332 ++++++++++++++----- db/btree/bt_delete.c | 187 +-- db/btree/bt_method.c | 274 ++-- db/btree/bt_open.c | 432 ++++-- db/btree/bt_put.c | 233 ++-- db/btree/bt_rec.c | 510 ++----- db/btree/bt_reclaim.c | 40 +- db/btree/bt_recno.c | 471 +++---- db/btree/bt_rsearch.c | 88 +- db/btree/bt_search.c | 120 +- db/btree/bt_split.c | 321 +++-- db/btree/bt_stat.c | 259 ++-- db/btree/bt_upgrade.c | 25 +- db/btree/bt_verify.c | 733 ++++++---- db/btree/btree.src | 142 +- db/btree/btree_auto.c | 3528 ++++++++++++++++++++++++++++++------------------- 19 files changed, 5247 insertions(+), 3579 deletions(-) (limited to 'db/btree') diff --git a/db/btree/bt_compare.c b/db/btree/bt_compare.c index 91481c313..a329d8044 100644 --- a/db/btree/bt_compare.c +++ b/db/btree/bt_compare.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ /* @@ -43,7 +43,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_compare.c,v 11.12 2000/10/26 19:00:28 krinsky Exp $"; +static const char revid[] = "$Id: bt_compare.c,v 11.18 2003/01/08 04:00:56 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -51,8 +51,8 @@ static const char revid[] = "$Id: bt_compare.c,v 11.12 2000/10/26 19:00:28 krins #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" /* * __bam_cmp -- @@ -92,7 +92,7 @@ __bam_cmp(dbp, dbt, h, indx, func, cmpp) case P_LBTREE: case P_LDUP: case P_LRECNO: - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_OVERFLOW) bo = (BOVERFLOW *)bk; else { @@ -125,7 +125,7 @@ __bam_cmp(dbp, dbt, h, indx, func, cmpp) return (0); } - bi = GET_BINTERNAL(h, indx); + bi = GET_BINTERNAL(dbp, h, indx); if (B_TYPE(bi->type) == B_OVERFLOW) bo = (BOVERFLOW *)(bi->data); else { @@ -136,7 +136,7 @@ __bam_cmp(dbp, dbt, h, indx, func, cmpp) } break; default: - return (__db_pgfmt(dbp, PGNO(h))); + return (__db_pgfmt(dbp->dbenv, PGNO(h))); } /* diff --git a/db/btree/bt_conv.c b/db/btree/bt_conv.c index fd30f375f..fd80d8a4c 100644 --- a/db/btree/bt_conv.c +++ b/db/btree/bt_conv.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_conv.c,v 11.6 2000/03/31 00:30:26 ubell Exp $"; +static const char revid[] = "$Id: bt_conv.c,v 11.14 2003/01/08 04:00:56 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,20 +16,21 @@ static const char revid[] = "$Id: bt_conv.c,v 11.6 2000/03/31 00:30:26 ubell Exp #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" /* * __bam_pgin -- * Convert host-specific page layout from the host-independent format * stored on disk. * - * PUBLIC: int __bam_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *)); + * PUBLIC: int __bam_pgin __P((DB_ENV *, DB *, db_pgno_t, void *, DBT *)); */ int -__bam_pgin(dbenv, pg, pp, cookie) +__bam_pgin(dbenv, dummydbp, pg, pp, cookie) DB_ENV *dbenv; + DB *dummydbp; db_pgno_t pg; void *pp; DBT *cookie; @@ -38,12 +39,12 @@ __bam_pgin(dbenv, pg, pp, cookie) PAGE *h; pginfo = (DB_PGINFO *)cookie->data; - if (!pginfo->needswap) + if (!F_ISSET(pginfo, DB_AM_SWAP)) return (0); h = pp; return (TYPE(h) == P_BTREEMETA ? __bam_mswap(pp) : - __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 1)); + __db_byteswap(dbenv, dummydbp, pg, pp, pginfo->db_pagesize, 1)); } /* @@ -51,11 +52,12 @@ __bam_pgin(dbenv, pg, pp, cookie) * Convert host-specific page layout to the host-independent format * stored on disk. * - * PUBLIC: int __bam_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *)); + * PUBLIC: int __bam_pgout __P((DB_ENV *, DB *, db_pgno_t, void *, DBT *)); */ int -__bam_pgout(dbenv, pg, pp, cookie) +__bam_pgout(dbenv, dummydbp, pg, pp, cookie) DB_ENV *dbenv; + DB *dummydbp; db_pgno_t pg; void *pp; DBT *cookie; @@ -64,12 +66,12 @@ __bam_pgout(dbenv, pg, pp, cookie) PAGE *h; pginfo = (DB_PGINFO *)cookie->data; - if (!pginfo->needswap) + if (!F_ISSET(pginfo, DB_AM_SWAP)) return (0); h = pp; return (TYPE(h) == P_BTREEMETA ? 
__bam_mswap(pp) : - __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 0)); + __db_byteswap(dbenv, dummydbp, pg, pp, pginfo->db_pagesize, 0)); } /* @@ -93,6 +95,8 @@ __bam_mswap(pg) SWAP32(p); /* re_len */ SWAP32(p); /* re_pad */ SWAP32(p); /* root */ + p += 92 * sizeof(u_int32_t); /* unused */ + SWAP32(p); /* crypto_magic */ return (0); } diff --git a/db/btree/bt_curadj.c b/db/btree/bt_curadj.c index 011acd2f4..3da200c27 100644 --- a/db/btree/bt_curadj.c +++ b/db/btree/bt_curadj.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_curadj.c,v 11.20 2001/01/17 16:15:49 bostic Exp $"; +static const char revid[] = "$Id: bt_curadj.c,v 11.34 2003/07/09 02:32:24 margo Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,9 +16,8 @@ static const char revid[] = "$Id: bt_curadj.c,v 11.20 2001/01/17 16:15:49 bostic #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" -#include "txn.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t)); @@ -99,6 +98,19 @@ __bam_ca_delete(dbp, pgno, indx, delete) dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { cp = (BTREE_CURSOR *)dbc->internal; if (cp->pgno == pgno && cp->indx == indx) { + /* + * [#8032] This assert is checking + * for possible race conditions where we + * hold a cursor position without a lock. + * Unfortunately, there are paths in the + * Btree code that do not satisfy these + * conditions. None of them are known to + * be a problem, but this assert should + * be re-activated when the Btree stack + * code is re-written. 
+ DB_ASSERT(!STD_LOCKING(dbc) || + cp->lock_mode != DB_LOCK_NG); + */ if (delete) F_SET(cp, C_DELETED); else @@ -193,7 +205,10 @@ __bam_ca_di(my_dbc, pgno, indx, adjust) if (cp->pgno == pgno && cp->indx >= indx) { /* Cursor indices should never be negative. */ DB_ASSERT(cp->indx != 0 || adjust > 0); - + /* [#8032] + DB_ASSERT(!STD_LOCKING(dbc) || + cp->lock_mode != DB_LOCK_NG); + */ cp->indx += adjust; if (my_txn != NULL && dbc->txn != my_txn) found = 1; @@ -203,10 +218,9 @@ __bam_ca_di(my_dbc, pgno, indx, adjust) } MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); - if (found != 0 && DB_LOGGING(my_dbc)) { - if ((ret = __bam_curadj_log(dbenv, - my_dbc->txn, &lsn, 0, dbp->log_fileid, - DB_CA_DI, pgno, 0, 0, adjust, indx, 0)) != 0) + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, my_dbc->txn, + &lsn, 0, DB_CA_DI, pgno, 0, 0, adjust, indx, 0)) != 0) return (ret); } @@ -234,8 +248,13 @@ __bam_opd_cursor(dbp, dbc, first, tpgno, ti) * Allocate a new cursor and create the stack. If duplicates * are sorted, we've just created an off-page duplicate Btree. * If duplicates aren't sorted, we've just created a Recno tree. + * + * Note that in order to get here at all, there shouldn't be + * an old off-page dup cursor--to augment the checking db_c_newopd + * will do, assert this. */ - if ((ret = __db_c_newopd(dbc, tpgno, &dbc_nopd)) != 0) + DB_ASSERT(orig_cp->opd == NULL); + if ((ret = __db_c_newopd(dbc, tpgno, orig_cp->opd, &dbc_nopd)) != 0) return (ret); cp = (BTREE_CURSOR *)dbc_nopd->internal; @@ -316,22 +335,25 @@ loop: MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); continue; MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + /* [#8032] + DB_ASSERT(!STD_LOCKING(dbc) || + orig_cp->lock_mode != DB_LOCK_NG); + */ if ((ret = __bam_opd_cursor(dbp, dbc, first, tpgno, ti)) !=0) return (ret); if (my_txn != NULL && dbc->txn != my_txn) found = 1; - /* We released the MUTEX to get a cursor, start over. */ + /* We released the mutex to get a cursor, start over. 
*/ goto loop; } MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); } MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); - if (found != 0 && DB_LOGGING(my_dbc)) { - if ((ret = __bam_curadj_log(dbenv, - my_dbc->txn, &lsn, 0, dbp->log_fileid, - DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0) + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, my_dbc->txn, + &lsn, 0, DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0) return (ret); } return (0); @@ -372,18 +394,26 @@ loop: MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { orig_cp = (BTREE_CURSOR *)dbc->internal; + /* + * A note on the orig_cp->opd != NULL requirement here: + * it's possible that there's a cursor that refers to + * the same duplicate set, but which has no opd cursor, + * because it refers to a different item and we took + * care of it while processing a previous record. + */ if (orig_cp->pgno != fpgno || orig_cp->indx != first || + orig_cp->opd == NULL || ((BTREE_CURSOR *)orig_cp->opd->internal)->indx != ti) continue; MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); - if ((ret = orig_cp->opd->c_close(orig_cp->opd)) != 0) + if ((ret = __db_c_close(orig_cp->opd)) != 0) return (ret); orig_cp->opd = NULL; orig_cp->indx = fi; /* - * We released the MUTEX to free a cursor, + * We released the mutex to free a cursor, * start over. 
*/ goto loop; @@ -432,6 +462,10 @@ __bam_ca_rsplit(my_dbc, fpgno, tpgno) continue; if (dbc->internal->pgno == fpgno) { dbc->internal->pgno = tpgno; + /* [#8032] + DB_ASSERT(!STD_LOCKING(dbc) || + dbc->internal->lock_mode != DB_LOCK_NG); + */ if (my_txn != NULL && dbc->txn != my_txn) found = 1; } @@ -440,10 +474,9 @@ __bam_ca_rsplit(my_dbc, fpgno, tpgno) } MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); - if (found != 0 && DB_LOGGING(my_dbc)) { - if ((ret = __bam_curadj_log(dbenv, - my_dbc->txn, &lsn, 0, dbp->log_fileid, - DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0) + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, my_dbc->txn, + &lsn, 0, DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0) return (ret); } return (0); @@ -497,6 +530,10 @@ __bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft) continue; cp = dbc->internal; if (cp->pgno == ppgno) { + /* [#8032] + DB_ASSERT(!STD_LOCKING(dbc) || + cp->lock_mode != DB_LOCK_NG); + */ if (my_txn != NULL && dbc->txn != my_txn) found = 1; if (cp->indx < split_indx) { @@ -512,9 +549,9 @@ __bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft) } MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); - if (found != 0 && DB_LOGGING(my_dbc)) { - if ((ret = __bam_curadj_log(dbenv, my_dbc->txn, - &lsn, 0, dbp->log_fileid, DB_CA_SPLIT, ppgno, rpgno, + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, + my_dbc->txn, &lsn, 0, DB_CA_SPLIT, ppgno, rpgno, cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0) return (ret); } diff --git a/db/btree/bt_cursor.c b/db/btree/bt_cursor.c index 84ab7c807..067da53be 100644 --- a/db/btree/bt_cursor.c +++ b/db/btree/bt_cursor.c @@ -1,31 +1,30 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_cursor.c,v 11.88 2001/01/11 18:19:49 bostic Exp $"; +static const char revid[] = "$Id: bt_cursor.c,v 11.169 2003/11/19 18:41:06 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include -#include #include #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "btree.h" -#include "lock.h" -#include "qam.h" -#include "common_ext.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +static int __bam_bulk __P((DBC *, DBT *, u_int32_t)); static int __bam_c_close __P((DBC *, db_pgno_t, int *)); static int __bam_c_del __P((DBC *)); static int __bam_c_destroy __P((DBC *)); @@ -33,15 +32,16 @@ static int __bam_c_first __P((DBC *)); static int __bam_c_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); static int __bam_c_getstack __P((DBC *)); static int __bam_c_last __P((DBC *)); -static int __bam_c_next __P((DBC *, int)); +static int __bam_c_next __P((DBC *, int, int)); static int __bam_c_physdel __P((DBC *)); static int __bam_c_prev __P((DBC *)); static int __bam_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); -static void __bam_c_reset __P((BTREE_CURSOR *)); -static int __bam_c_search __P((DBC *, const DBT *, u_int32_t, int *)); +static int __bam_c_search __P((DBC *, + db_pgno_t, const DBT *, u_int32_t, int *)); static int __bam_c_writelock __P((DBC *)); -static int __bam_getboth_finddatum __P((DBC *, DBT *)); +static int __bam_getboth_finddatum __P((DBC *, DBT *, u_int32_t)); static int __bam_getbothc __P((DBC *, DBT *)); +static int __bam_get_prev __P((DBC *)); static int __bam_isopd __P((DBC *, db_pgno_t *)); /* @@ -53,48 +53,64 @@ static int __bam_isopd __P((DBC *, db_pgno_t *)); * don't -- we don't duplicate locks when we duplicate cursors if we are * running in a transaction environment as there's no point if locks are * never discarded. 
This means that the cursor may or may not hold a lock. + * In the case where we are decending the tree we always want to + * unlock the held interior page so we use ACQUIRE_COUPLE. */ #undef ACQUIRE -#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, ret) {\ +#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, ret) { \ + DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \ if ((pagep) != NULL) { \ - ret = memp_fput((dbc)->dbp->mpf, pagep, 0); \ + ret = __memp_fput(__mpf, pagep, 0); \ + pagep = NULL; \ + } else \ + ret = 0; \ + if ((ret) == 0 && STD_LOCKING(dbc)) \ + ret = __db_lget(dbc, LCK_COUPLE, lpgno, mode, 0, &(lock));\ + if ((ret) == 0) \ + ret = __memp_fget(__mpf, &(fpgno), 0, &(pagep)); \ +} + +#undef ACQUIRE_COUPLE +#define ACQUIRE_COUPLE(dbc, mode, lpgno, lock, fpgno, pagep, ret) { \ + DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \ + if ((pagep) != NULL) { \ + ret = __memp_fput(__mpf, pagep, 0); \ pagep = NULL; \ } else \ ret = 0; \ if ((ret) == 0 && STD_LOCKING(dbc)) \ ret = __db_lget(dbc, \ - (lock).off == LOCK_INVALID ? 0 : LCK_COUPLE, \ - lpgno, mode, 0, &lock); \ - else \ - (lock).off = LOCK_INVALID; \ + LCK_COUPLE_ALWAYS, lpgno, mode, 0, &(lock)); \ if ((ret) == 0) \ - ret = memp_fget((dbc)->dbp->mpf, &(fpgno), 0, &(pagep));\ + ret = __memp_fget(__mpf, &(fpgno), 0, &(pagep)); \ } /* Acquire a new page/lock for a cursor. */ #undef ACQUIRE_CUR -#define ACQUIRE_CUR(dbc, mode, ret) { \ +#define ACQUIRE_CUR(dbc, mode, p, ret) { \ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ - ACQUIRE(dbc, mode, \ - __cp->pgno, __cp->lock, __cp->pgno, __cp->page, ret); \ - if ((ret) == 0) \ + if (p != __cp->pgno) \ + __cp->pgno = PGNO_INVALID; \ + ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, ret); \ + if ((ret) == 0) { \ + __cp->pgno = p; \ __cp->lock_mode = (mode); \ + } \ } /* - * Acquire a new page/lock for a cursor, and move the cursor on success. 
- * The reason that this is a separate macro is because we don't want to - * set the pgno/indx fields in the cursor until we actually have the lock, - * otherwise the cursor adjust routines will adjust the cursor even though - * we're not really on the page. + * Acquire a new page/lock for a cursor and release the previous. + * This is typically used when decending a tree and we do not + * want to hold the interior nodes locked. */ -#undef ACQUIRE_CUR_SET -#define ACQUIRE_CUR_SET(dbc, mode, p, ret) { \ +#undef ACQUIRE_CUR_COUPLE +#define ACQUIRE_CUR_COUPLE(dbc, mode, p, ret) { \ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ - ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, ret); \ + if (p != __cp->pgno) \ + __cp->pgno = PGNO_INVALID; \ + ACQUIRE_COUPLE(dbc, mode, p, __cp->lock, p, __cp->page, ret); \ if ((ret) == 0) { \ - __cp->pgno = p; \ - __cp->indx = 0; \ + __cp->pgno = p; \ __cp->lock_mode = (mode); \ } \ } @@ -112,46 +128,37 @@ static int __bam_isopd __P((DBC *, db_pgno_t *)); if (STD_LOCKING(dbc) && \ __cp->lock_mode != DB_LOCK_WRITE && \ ((ret) = __db_lget(dbc, \ - __cp->lock.off == LOCK_INVALID ? 0 : LCK_COUPLE, \ + LOCK_ISSET(__cp->lock) ? LCK_COUPLE : 0, \ __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) == 0) \ __cp->lock_mode = DB_LOCK_WRITE; \ } -/* Discard the current page/lock. */ -#undef DISCARD -#define DISCARD(dbc, ldiscard, lock, pagep, ret) { \ - int __t_ret; \ - if ((pagep) != NULL) { \ - ret = memp_fput((dbc)->dbp->mpf, pagep, 0); \ - pagep = NULL; \ - } else \ - ret = 0; \ - if ((lock).off != LOCK_INVALID) { \ - __t_ret = ldiscard ? \ - __LPUT((dbc), lock): __TLPUT((dbc), lock); \ - if (__t_ret != 0 && (ret) == 0) \ - ret = __t_ret; \ - (lock).off = LOCK_INVALID; \ - } \ -} - /* Discard the current page/lock for a cursor. 
*/ #undef DISCARD_CUR #define DISCARD_CUR(dbc, ret) { \ BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ - DISCARD(dbc, 0, __cp->lock, __cp->page, ret); \ - if ((ret) == 0) \ + DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \ + int __t_ret; \ + if ((__cp->page) != NULL) { \ + ret = __memp_fput(__mpf, __cp->page, 0); \ + __cp->page = NULL; \ + } else \ + ret = 0; \ + __t_ret = __TLPUT((dbc), __cp->lock); \ + if (__t_ret != 0 && (ret) == 0) \ + ret = __t_ret; \ + if ((ret) == 0 && !LOCK_ISSET(__cp->lock)) \ __cp->lock_mode = DB_LOCK_NG; \ } /* If on-page item is a deleted record. */ #undef IS_DELETED -#define IS_DELETED(page, indx) \ - B_DISSET(GET_BKEYDATA(page, \ +#define IS_DELETED(dbp, page, indx) \ + B_DISSET(GET_BKEYDATA(dbp, page, \ (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type) #undef IS_CUR_DELETED #define IS_CUR_DELETED(dbc) \ - IS_DELETED((dbc)->internal->page, (dbc)->internal->indx) + IS_DELETED((dbc)->dbp, (dbc)->internal->page, (dbc)->internal->indx) /* * Test to see if two cursors could point to duplicates of the same key. @@ -163,30 +170,14 @@ static int __bam_isopd __P((DBC *, db_pgno_t *)); */ #undef IS_DUPLICATE #define IS_DUPLICATE(dbc, i1, i2) \ - (((PAGE *)(dbc)->internal->page)->inp[i1] == \ - ((PAGE *)(dbc)->internal->page)->inp[i2]) + (P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i1] == \ + P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i2]) #undef IS_CUR_DUPLICATE #define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \ (F_ISSET(dbc, DBC_OPD) || \ (orig_pgno == (dbc)->internal->pgno && \ IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx))) -/* - * __bam_c_reset -- - * Initialize internal cursor structure. 
- */ -static void -__bam_c_reset(cp) - BTREE_CURSOR *cp; -{ - cp->csp = cp->sp; - cp->lock.off = LOCK_INVALID; - cp->lock_mode = DB_LOCK_NG; - cp->recno = RECNO_OOB; - cp->order = INVALID_ORDER; - cp->flags = 0; -} - /* * __bam_c_init -- * Initialize the access private portion of a cursor @@ -198,35 +189,26 @@ __bam_c_init(dbc, dbtype) DBC *dbc; DBTYPE dbtype; { - BTREE *t; - BTREE_CURSOR *cp; - DB *dbp; + DB_ENV *dbenv; int ret; - u_int32_t minkey; - dbp = dbc->dbp; + dbenv = dbc->dbp->dbenv; /* Allocate/initialize the internal structure. */ - if (dbc->internal == NULL) { - if ((ret = __os_malloc(dbp->dbenv, - sizeof(BTREE_CURSOR), NULL, &cp)) != 0) - return (ret); - dbc->internal = (DBC_INTERNAL *)cp; - - cp->sp = cp->csp = cp->stack; - cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); - } else - cp = (BTREE_CURSOR *)dbc->internal; - __bam_c_reset(cp); + if (dbc->internal == NULL && (ret = + __os_malloc(dbenv, sizeof(BTREE_CURSOR), &dbc->internal)) != 0) + return (ret); /* Initialize methods. 
*/ dbc->c_close = __db_c_close; - dbc->c_count = __db_c_count; - dbc->c_del = __db_c_del; - dbc->c_dup = __db_c_dup; - dbc->c_get = __db_c_get; - dbc->c_put = __db_c_put; + dbc->c_count = __db_c_count_pp; + dbc->c_del = __db_c_del_pp; + dbc->c_dup = __db_c_dup_pp; + dbc->c_get = __db_c_get_pp; + dbc->c_pget = __db_c_pget_pp; + dbc->c_put = __db_c_put_pp; if (dbtype == DB_BTREE) { + dbc->c_am_bulk = __bam_bulk; dbc->c_am_close = __bam_c_close; dbc->c_am_del = __bam_c_del; dbc->c_am_destroy = __bam_c_destroy; @@ -234,6 +216,7 @@ __bam_c_init(dbc, dbtype) dbc->c_am_put = __bam_c_put; dbc->c_am_writelock = __bam_c_writelock; } else { + dbc->c_am_bulk = __bam_bulk; dbc->c_am_close = __bam_c_close; dbc->c_am_del = __ram_c_del; dbc->c_am_destroy = __bam_c_destroy; @@ -242,18 +225,6 @@ __bam_c_init(dbc, dbtype) dbc->c_am_writelock = __bam_c_writelock; } - /* - * The btree leaf page data structures require that two key/data pairs - * (or four items) fit on a page, but other than that there's no fixed - * requirement. The btree off-page duplicates only require two items, - * to be exact, but requiring four for them as well seems reasonable. - * - * Recno uses the btree bt_ovflsize value -- it's close enough. - */ - t = dbp->bt_internal; - minkey = F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey; - cp->ovflsize = B_MINKEY_TO_OVFLSIZE(minkey, dbp->pgsize); - return (0); } @@ -267,12 +238,13 @@ int __bam_c_refresh(dbc) DBC *dbc; { + BTREE *t; BTREE_CURSOR *cp; DB *dbp; dbp = dbc->dbp; + t = dbp->bt_internal; cp = (BTREE_CURSOR *)dbc->internal; - __bam_c_reset(cp); /* * If our caller set the root page number, it's because the root was @@ -280,11 +252,32 @@ __bam_c_refresh(dbc) * pull it out of our internal information. 
*/ if (cp->root == PGNO_INVALID) - cp->root = ((BTREE *)dbp->bt_internal)->bt_root; + cp->root = t->bt_root; + + LOCK_INIT(cp->lock); + cp->lock_mode = DB_LOCK_NG; + + cp->sp = cp->csp = cp->stack; + cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); + + /* + * The btree leaf page data structures require that two key/data pairs + * (or four items) fit on a page, but other than that there's no fixed + * requirement. The btree off-page duplicates only require two items, + * to be exact, but requiring four for them as well seems reasonable. + * + * Recno uses the btree bt_ovflsize value -- it's close enough. + */ + cp->ovflsize = B_MINKEY_TO_OVFLSIZE( + dbp, F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey, dbp->pgsize); + + cp->recno = RECNO_OOB; + cp->order = INVALID_ORDER; + cp->flags = 0; /* Initialize for record numbers. */ if (F_ISSET(dbc, DBC_OPD) || - dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_BT_RECNUM)) { + dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_AM_RECNUM)) { F_SET(cp, C_RECNUM); /* @@ -293,7 +286,7 @@ __bam_c_refresh(dbc) * mutable record numbers. */ if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) || - F_ISSET(dbp, DB_BT_RECNUM | DB_RE_RENUMBER)) + F_ISSET(dbp, DB_AM_RECNUM | DB_AM_RENUMBER)) F_SET(cp, C_RENUMBER); } @@ -313,11 +306,12 @@ __bam_c_close(dbc, root_pgno, rmroot) BTREE_CURSOR *cp, *cp_opd, *cp_c; DB *dbp; DBC *dbc_opd, *dbc_c; + DB_MPOOLFILE *mpf; PAGE *h; - u_int32_t num; int cdb_lock, ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; cp_opd = (dbc_opd = cp->opd) == NULL ? NULL : (BTREE_CURSOR *)dbc_opd->internal; @@ -394,6 +388,9 @@ __bam_c_close(dbc, root_pgno, rmroot) if (__ram_ca_delete(dbp, cp->root) == 0) goto lock; goto done; + case DB_HASH: + case DB_QUEUE: + case DB_UNKNOWN: default: return (__db_unknown_type(dbp->dbenv, "__bam_c_close", dbc->dbtype)); @@ -408,10 +405,10 @@ __bam_c_close(dbc, root_pgno, rmroot) * We will not have been provided a root page number. 
Acquire * one from the primary database. */ - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &h)) != 0) goto err; - root_pgno = GET_BOVERFLOW(h, cp->indx + O_INDX)->pgno; - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + root_pgno = GET_BOVERFLOW(dbp, h, cp->indx + O_INDX)->pgno; + if ((ret = __memp_fput(mpf, h, 0)) != 0) goto err; dbc_c = dbc_opd; @@ -425,6 +422,9 @@ __bam_c_close(dbc, root_pgno, rmroot) if (__ram_ca_delete(dbp, cp_opd->root) == 0) goto lock; goto done; + case DB_HASH: + case DB_QUEUE: + case DB_UNKNOWN: default: return (__db_unknown_type(dbp->dbenv, "__bam_c_close", dbc->dbtype)); @@ -438,35 +438,18 @@ lock: cp_c = (BTREE_CURSOR *)dbc_c->internal; * If this is CDB, upgrade the lock if necessary. While we acquired * the write lock to logically delete the record, we released it when * we returned from that call, and so may not be holding a write lock - * at the moment. NB: to get here in CDB we must either be holding a - * write lock or be the only cursor that is permitted to acquire write - * locks. The reason is that there can never be more than a single CDB - * write cursor (that cursor cannot be dup'd), and so that cursor must - * be closed and the item therefore deleted before any other cursor - * could acquire a reference to this item. - * - * Note that dbc may be an off-page dup cursor; this is the sole - * instance in which an OPD cursor does any locking, but it's necessary - * because we may be closed by ourselves without a parent cursor - * handy, and we have to do a lock upgrade on behalf of somebody. - * If this is the case, the OPD has been given the parent's locking - * info in __db_c_get--the OPD is also a WRITEDUP. + * at the moment. 
*/ if (CDB_LOCKING(dbp->dbenv)) { - DB_ASSERT(!F_ISSET(dbc, DBC_OPD) || F_ISSET(dbc, DBC_WRITEDUP)); - if (!F_ISSET(dbc, DBC_WRITER)) { - if ((ret = - lock_get(dbp->dbenv, dbc->locker, DB_LOCK_UPGRADE, - &dbc->lock_dbt, DB_LOCK_WRITE, &dbc->mylock)) != 0) + if (F_ISSET(dbc, DBC_WRITECURSOR)) { + if ((ret = __lock_get(dbp->dbenv, + dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, + DB_LOCK_WRITE, &dbc->mylock)) != 0) goto err; cdb_lock = 1; } - - cp_c->lock.off = LOCK_INVALID; - if ((ret = - memp_fget(dbp->mpf, &cp_c->pgno, 0, &cp_c->page)) != 0) + if ((ret = __memp_fget(mpf, &cp_c->pgno, 0, &cp_c->page)) != 0) goto err; - goto delete; } @@ -480,9 +463,7 @@ lock: cp_c = (BTREE_CURSOR *)dbc_c->internal; * is responsible for acquiring any necessary locks before calling us. */ if (F_ISSET(dbc, DBC_OPD)) { - cp_c->lock.off = LOCK_INVALID; - if ((ret = - memp_fget(dbp->mpf, &cp_c->pgno, 0, &cp_c->page)) != 0) + if ((ret = __memp_fget(mpf, &cp_c->pgno, 0, &cp_c->page)) != 0) goto err; goto delete; } @@ -542,13 +523,13 @@ delete: /* * in that case. So, if the off-page duplicate tree is empty at this * point, we want to remove it. */ - if ((ret = memp_fget(dbp->mpf, &root_pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &root_pgno, 0, &h)) != 0) goto err; - if ((num = NUM_ENT(h)) == 0) { + if (NUM_ENT(h) == 0) { if ((ret = __db_free(dbc, h)) != 0) goto err; } else { - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = __memp_fput(mpf, h, 0)) != 0) goto err; goto done; } @@ -566,8 +547,7 @@ delete: /* * the primary page. */ if (dbc_opd != NULL) { - cp->lock.off = LOCK_INVALID; - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &cp->page)) != 0) goto err; if ((ret = __bam_c_physdel(dbc)) != 0) goto err; @@ -604,7 +584,7 @@ __bam_c_destroy(dbc) DBC *dbc; { /* Discard the structures. 
*/ - __os_free(dbc->internal, sizeof(BTREE_CURSOR)); + __os_free(dbc->dbp->dbenv, dbc->internal); return (0); } @@ -622,22 +602,25 @@ __bam_c_count(dbc, recnop) { BTREE_CURSOR *cp; DB *dbp; + DB_MPOOLFILE *mpf; db_indx_t indx, top; db_recno_t recno; int ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; /* * Called with the top-level cursor that may reference an off-page - * duplicates page. If it's a set of on-page duplicates, get the - * page and count. Otherwise, get the root page of the off-page - * duplicate tree, and use the count. We don't have to acquire any - * new locks, we have to have a read lock to even get here. + * duplicates tree. We don't have to acquire any new locks, we have + * to have a read lock to even get here. */ if (cp->opd == NULL) { - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + /* + * On-page duplicates, get the page and count. + */ + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &cp->page)) != 0) return (ret); /* @@ -648,20 +631,47 @@ __bam_c_count(dbc, recnop) if (indx == 0 || !IS_DUPLICATE(dbc, indx, indx - P_INDX)) break; - for (recno = 1, top = NUM_ENT(cp->page) - P_INDX; - indx < top; ++recno, indx += P_INDX) - if (!IS_DUPLICATE(dbc, indx, indx + P_INDX)) + for (recno = 0, + top = NUM_ENT(cp->page) - P_INDX;; indx += P_INDX) { + if (!IS_DELETED(dbp, cp->page, indx)) + ++recno; + if (indx == top || + !IS_DUPLICATE(dbc, indx, indx + P_INDX)) break; - *recnop = recno; + } } else { - if ((ret = memp_fget(dbp->mpf, - &cp->opd->internal->root, 0, &cp->page)) != 0) + /* + * Off-page duplicates tree, get the root page of the off-page + * duplicate tree. + */ + if ((ret = __memp_fget( + mpf, &cp->opd->internal->root, 0, &cp->page)) != 0) return (ret); - *recnop = RE_NREC(cp->page); + /* + * If the page is an internal page use the page's count as it's + * up-to-date and reflects the status of cursors in the tree. 
+ * If the page is a leaf page for unsorted duplicates, use the + * page's count as cursors don't mark items deleted on the page + * and wait, cursor delete items immediately. + * If the page is a leaf page for sorted duplicates, there may + * be cursors on the page marking deleted items -- count. + */ + if (TYPE(cp->page) == P_LDUP) + for (recno = 0, indx = 0, + top = NUM_ENT(cp->page) - O_INDX;; indx += O_INDX) { + if (!IS_DELETED(dbp, cp->page, indx)) + ++recno; + if (indx == top) + break; + } + else + recno = RE_NREC(cp->page); } - ret = memp_fput(dbp->mpf, cp->page, 0); + *recnop = recno; + + ret = __memp_fput(mpf, cp->page, 0); cp->page = NULL; return (ret); @@ -677,9 +687,11 @@ __bam_c_del(dbc) { BTREE_CURSOR *cp; DB *dbp; + DB_MPOOLFILE *mpf; int ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; ret = 0; @@ -706,25 +718,27 @@ __bam_c_del(dbc) goto err; cp->page = cp->csp->page; } else { - ACQUIRE_CUR(dbc, DB_LOCK_WRITE, ret); + ACQUIRE_CUR(dbc, DB_LOCK_WRITE, cp->pgno, ret); if (ret != 0) goto err; } /* Log the change. */ - if (DB_LOGGING(dbc) && - (ret = __bam_cdel_log(dbp->dbenv, dbc->txn, &LSN(cp->page), 0, - dbp->log_fileid, PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0) - goto err; + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_cdel_log(dbp, dbc->txn, &LSN(cp->page), 0, + PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(cp->page)); /* Set the intent-to-delete flag on the page. */ if (TYPE(cp->page) == P_LBTREE) - B_DSET(GET_BKEYDATA(cp->page, cp->indx + O_INDX)->type); + B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx + O_INDX)->type); else - B_DSET(GET_BKEYDATA(cp->page, cp->indx)->type); + B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type); /* Mark the page dirty. 
*/ - ret = memp_fset(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + ret = __memp_fset(mpf, cp->page, DB_MPOOL_DIRTY); err: /* * If we've been successful so far and the tree has record numbers, @@ -736,7 +750,7 @@ err: /* (void)__bam_stkrel(dbc, 0); } else if (cp->page != NULL && - (t_ret = memp_fput(dbp->mpf, cp->page, 0)) != 0 && ret == 0) + (t_ret = __memp_fput(mpf, cp->page, 0)) != 0 && ret == 0) ret = t_ret; cp->page = NULL; @@ -771,7 +785,7 @@ __bam_c_dup(orig_dbc, new_dbc) * holding inside a transaction because all the locks are retained * until the transaction commits or aborts. */ - if (orig->lock.off != LOCK_INVALID && orig_dbc->txn == NULL) { + if (LOCK_ISSET(orig->lock) && orig_dbc->txn == NULL) { if ((ret = __db_lget(new_dbc, 0, new->pgno, new->lock_mode, 0, &new->lock)) != 0) return (ret); @@ -796,11 +810,13 @@ __bam_c_get(dbc, key, data, flags, pgnop) { BTREE_CURSOR *cp; DB *dbp; + DB_MPOOLFILE *mpf; db_pgno_t orig_pgno; db_indx_t orig_indx; int exact, newopd, ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; orig_pgno = cp->pgno; orig_indx = cp->indx; @@ -820,7 +836,7 @@ __bam_c_get(dbc, key, data, flags, pgnop) * write lock, but upgrading to a write lock has no better * chance of succeeding now instead of later, so don't try. */ - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &cp->page)) != 0) goto err; break; case DB_FIRST: @@ -829,9 +845,10 @@ __bam_c_get(dbc, key, data, flags, pgnop) goto err; break; case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: /* * There are two ways to get here based on DBcursor->c_get - * with the DB_GET_BOTH flag set: + * with the DB_GET_BOTH/DB_GET_BOTH_RANGE flags set: * * 1. Searching a sorted off-page duplicate tree: do a tree * search. @@ -839,20 +856,34 @@ __bam_c_get(dbc, key, data, flags, pgnop) * 2. Searching btree: do a tree search. If it returns a * reference to off-page duplicate tree, return immediately * and let our caller deal with it. 
If the search doesn't - * return a reference to off-page duplicate tree, start an - * on-page search. + * return a reference to off-page duplicate tree, continue + * with an on-page search. */ if (F_ISSET(dbc, DBC_OPD)) { if ((ret = __bam_c_search( - dbc, data, DB_GET_BOTH, &exact)) != 0) - goto err; - if (!exact) { - ret = DB_NOTFOUND; + dbc, PGNO_INVALID, data, flags, &exact)) != 0) goto err; + if (flags == DB_GET_BOTH) { + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + break; } + + /* + * We didn't require an exact match, so the search may + * may have returned an entry past the end of the page, + * or we may be referencing a deleted record. If so, + * move to the next entry. + */ + if ((cp->indx == NUM_ENT(cp->page) || + IS_CUR_DELETED(dbc)) && + (ret = __bam_c_next(dbc, 1, 0)) != 0) + goto err; } else { if ((ret = __bam_c_search( - dbc, key, DB_GET_BOTH, &exact)) != 0) + dbc, PGNO_INVALID, key, flags, &exact)) != 0) return (ret); if (!exact) { ret = DB_NOTFOUND; @@ -863,7 +894,8 @@ __bam_c_get(dbc, key, data, flags, pgnop) newopd = 1; break; } - if ((ret = __bam_getboth_finddatum(dbc, data)) != 0) + if ((ret = + __bam_getboth_finddatum(dbc, data, flags)) != 0) goto err; } break; @@ -882,11 +914,11 @@ __bam_c_get(dbc, key, data, flags, pgnop) if ((ret = __bam_c_first(dbc)) != 0) goto err; } else - if ((ret = __bam_c_next(dbc, 1)) != 0) + if ((ret = __bam_c_next(dbc, 1, 0)) != 0) goto err; break; case DB_NEXT_DUP: - if ((ret = __bam_c_next(dbc, 1)) != 0) + if ((ret = __bam_c_next(dbc, 1, 0)) != 0) goto err; if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { ret = DB_NOTFOUND; @@ -900,7 +932,7 @@ __bam_c_get(dbc, key, data, flags, pgnop) goto err; } else do { - if ((ret = __bam_c_next(dbc, 1)) != 0) + if ((ret = __bam_c_next(dbc, 1, 0)) != 0) goto err; } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)); break; @@ -927,12 +959,14 @@ __bam_c_get(dbc, key, data, flags, pgnop) case DB_SET: case DB_SET_RECNO: newopd = 1; - if ((ret = __bam_c_search(dbc, key, 
flags, &exact)) != 0) + if ((ret = __bam_c_search(dbc, + PGNO_INVALID, key, flags, &exact)) != 0) goto err; break; case DB_SET_RANGE: newopd = 1; - if ((ret = __bam_c_search(dbc, key, flags, &exact)) != 0) + if ((ret = __bam_c_search(dbc, + PGNO_INVALID, key, flags, &exact)) != 0) goto err; /* @@ -942,7 +976,7 @@ __bam_c_get(dbc, key, data, flags, pgnop) * the next entry. */ if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) - if ((ret = __bam_c_next(dbc, 0)) != 0) + if ((ret = __bam_c_next(dbc, 0, 0)) != 0) goto err; break; default: @@ -957,8 +991,15 @@ __bam_c_get(dbc, key, data, flags, pgnop) if (newopd && pgnop != NULL) (void)__bam_isopd(dbc, pgnop); - /* Don't return the key, it was passed to us */ - if (flags == DB_SET) + /* + * Don't return the key, it was passed to us (this is true even if the + * application defines a compare function returning equality for more + * than one key value, since in that case which actual value we store + * in the database is undefined -- and particularly true in the case of + * duplicates where we only store one key value). + */ + if (flags == DB_GET_BOTH || + flags == DB_GET_BOTH_RANGE || flags == DB_SET) F_SET(key, DB_DBT_ISSET); err: /* @@ -966,13 +1007,595 @@ err: /* * moved, clear the delete flag, DBcursor->c_get never references * a deleted key, if it moved at all. 
*/ - if (F_ISSET(cp, C_DELETED) - && (cp->pgno != orig_pgno || cp->indx != orig_indx)) + if (F_ISSET(cp, C_DELETED) && + (cp->pgno != orig_pgno || cp->indx != orig_indx)) F_CLR(cp, C_DELETED); return (ret); } +static int +__bam_get_prev(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DBT key, data; + db_pgno_t pgno; + int ret; + + if ((ret = __bam_c_prev(dbc)) != 0) + return (ret); + + if (__bam_isopd(dbc, &pgno)) { + cp = (BTREE_CURSOR *)dbc->internal; + if ((ret = __db_c_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0) + return (ret); + if ((ret = cp->opd->c_am_get(cp->opd, + &key, &data, DB_LAST, NULL)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_bulk -- Return bulk data from a btree. + */ +static int +__bam_bulk(dbc, data, flags) + DBC *dbc; + DBT *data; + u_int32_t flags; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE_CURSOR *cp; + PAGE *pg; + db_indx_t *inp, indx, pg_keyoff; + int32_t *endp, key_off, *offp, *saveoffp; + u_int8_t *dbuf, *dp, *np; + u_int32_t key_size, pagesize, size, space; + int adj, is_key, need_pg, next_key, no_dup, rec_key, ret; + + ret = 0; + key_off = 0; + size = 0; + pagesize = dbc->dbp->pgsize; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * dp tracks the beginging of the page in the buffer. + * np is the next place to copy things into the buffer. + * dbuf always stays at the beging of the buffer. + */ + dbuf = data->data; + np = dp = dbuf; + + /* Keep track of space that is left. There is a termination entry */ + space = data->ulen; + space -= sizeof(*offp); + + /* Build the offset/size table from the end up. */ + endp = (int32_t *)((u_int8_t *)dbuf + data->ulen); + endp--; + offp = endp; + + key_size = 0; + + /* + * Distinguish between BTREE and RECNO. + * There are no keys in RECNO. If MULTIPLE_KEY is specified + * then we return the record numbers. + * is_key indicates that multiple btree keys are returned. + * rec_key is set if we are returning record numbers. 
+ * next_key is set if we are going after the next key rather than dup. + */ + if (dbc->dbtype == DB_BTREE) { + is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1: 0; + rec_key = 0; + next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP; + adj = 2; + } else { + is_key = 0; + rec_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0; + next_key = LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP; + adj = 1; + } + no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP; + +next_pg: + indx = cp->indx; + pg = cp->page; + + inp = P_INP(dbc->dbp, pg); + /* The current page is not yet in the buffer. */ + need_pg = 1; + + /* + * Keep track of the offset of the current key on the page. + * If we are returning keys, set it to 0 first so we force + * the copy of the key to the buffer. + */ + pg_keyoff = 0; + if (is_key == 0) + pg_keyoff = inp[indx]; + + do { + if (IS_DELETED(dbc->dbp, pg, indx)) { + if (dbc->dbtype != DB_RECNO) + continue; + + cp->recno++; + /* + * If we are not returning recnos then we + * need to fill in every slot so the user + * can calculate the record numbers. + */ + if (rec_key != 0) + continue; + + space -= 2 * sizeof(*offp); + /* Check if space as underflowed. */ + if (space > data->ulen) + goto back_up; + + /* Just mark the empty recno slots. */ + *offp-- = 0; + *offp-- = 0; + continue; + } + + /* + * Check to see if we have a new key. + * If so, then see if we need to put the + * key on the page. If its already there + * then we just point to it. + */ + if (is_key && pg_keyoff != inp[indx]) { + bk = GET_BKEYDATA(dbc->dbp, pg, indx); + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + size = key_size = bo->tlen; + if (key_size > space) + goto get_key_space; + if ((ret = __bam_bulk_overflow(dbc, + bo->tlen, bo->pgno, np)) != 0) + return (ret); + space -= key_size; + key_off = (int32_t)(np - dbuf); + np += key_size; + } else { + if (need_pg) { + dp = np; + size = pagesize - HOFFSET(pg); + if (space < size) { +get_key_space: + /* Nothing added, then error. 
*/ + if (offp == endp) { + data->size = + ALIGN(size + + pagesize, 1024); + return (ENOMEM); + } + /* + * We need to back up to the + * last record put into the + * buffer so that it is + * CURRENT. + */ + if (indx != 0) + indx -= P_INDX; + else { + if ((ret = + __bam_get_prev( + dbc)) != 0) + return (ret); + indx = cp->indx; + pg = cp->page; + } + break; + } + /* + * Move the data part of the page + * to the buffer. + */ + memcpy(dp, + (u_int8_t *)pg + HOFFSET(pg), size); + need_pg = 0; + space -= size; + np += size; + } + key_size = bk->len; + key_off = (int32_t)((inp[indx] - HOFFSET(pg)) + + (dp - dbuf) + SSZA(BKEYDATA, data)); + pg_keyoff = inp[indx]; + } + } + + /* + * Reserve space for the pointers and sizes. + * Either key/data pair or just for a data item. + */ + space -= (is_key ? 4 : 2) * sizeof(*offp); + if (rec_key) + space -= sizeof(*offp); + + /* Check to see if space has underflowed. */ + if (space > data->ulen) + goto back_up; + + /* + * Determine if the next record is in the + * buffer already or if it needs to be copied in. + * If we have an off page dup, then copy as many + * as will fit into the buffer. + */ + bk = GET_BKEYDATA(dbc->dbp, pg, indx + adj - 1); + if (B_TYPE(bk->type) == B_DUPLICATE) { + bo = (BOVERFLOW *)bk; + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } + /* + * We pass the offset of the current key. + * On return we check to see if offp has + * moved to see if any data fit. + */ + saveoffp = offp; + if ((ret = __bam_bulk_duplicates(dbc, bo->pgno, + dbuf, is_key ? offp + P_INDX : NULL, + &offp, &np, &space, no_dup)) != 0) { + if (ret == ENOMEM) { + size = space; + space = 0; + /* If nothing was added, then error. 
*/ + if (offp == saveoffp) { + offp += 2; + goto back_up; + } + goto get_space; + } + return (ret); + } + } else if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + size = bo->tlen; + if (size > space) + goto back_up; + if ((ret = + __bam_bulk_overflow(dbc, + bo->tlen, bo->pgno, np)) != 0) + return (ret); + space -= size; + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } else if (rec_key) + *offp-- = (int32_t)cp->recno; + *offp-- = (int32_t)(np - dbuf); + np += size; + *offp-- = (int32_t)size; + } else { + if (need_pg) { + dp = np; + size = pagesize - HOFFSET(pg); + if (space < size) { +back_up: + /* + * Back up the index so that the + * last record in the buffer is CURRENT + */ + if (indx >= adj) + indx -= adj; + else { + if ((ret = + __bam_get_prev(dbc)) != 0 && + ret != DB_NOTFOUND) + return (ret); + indx = cp->indx; + pg = cp->page; + } + if (dbc->dbtype == DB_RECNO) + cp->recno--; +get_space: + /* + * See if we put anything in the + * buffer or if we are doing a DBP->get + * did we get all of the data. + */ + if (offp >= + (is_key ? &endp[-1] : endp) || + F_ISSET(dbc, DBC_TRANSIENT)) { + data->size = ALIGN(size + + data->ulen - space, 1024); + return (ENOMEM); + } + break; + } + memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size); + need_pg = 0; + space -= size; + np += size; + } + /* + * Add the offsets and sizes to the end of the buffer. + * First add the key info then the data info. + */ + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } else if (rec_key) + *offp-- = (int32_t)cp->recno; + *offp-- = (int32_t)((inp[indx + adj - 1] - HOFFSET(pg)) + + (dp - dbuf) + SSZA(BKEYDATA, data)); + *offp-- = bk->len; + } + if (dbc->dbtype == DB_RECNO) + cp->recno++; + else if (no_dup) { + while (indx + adj < NUM_ENT(pg) && + pg_keyoff == inp[indx + adj]) + indx += adj; + } + /* + * Stop when we either run off the page or we + * move to the next key and we are not returning mulitple keys. 
+ */ + } while ((indx += adj) < NUM_ENT(pg) && + (next_key || pg_keyoff == inp[indx])); + + /* If we are off the page then try to the next page. */ + if (ret == 0 && next_key && indx >= NUM_ENT(pg)) { + cp->indx = indx; + ret = __bam_c_next(dbc, 0, 1); + if (ret == 0) + goto next_pg; + if (ret != DB_NOTFOUND) + return (ret); + } + + /* + * If we did a DBP->get we must error if we did not return + * all the data for the current key because there is + * no way to know if we did not get it all, nor any + * interface to fetch the balance. + */ + + if (ret == 0 && indx < pg->entries && + F_ISSET(dbc, DBC_TRANSIENT) && pg_keyoff == inp[indx]) { + data->size = (data->ulen - space) + size; + return (ENOMEM); + } + /* + * Must leave the index pointing at the last record fetched. + * If we are not fetching keys, we may have stepped to the + * next key. + */ + if (ret == ENOMEM || next_key || pg_keyoff == inp[indx]) + cp->indx = indx; + else + cp->indx = indx - P_INDX; + + if (rec_key == 1) + *offp = RECNO_OOB; + else + *offp = -1; + return (0); +} + +/* + * __bam_bulk_overflow -- + * Dump overflow record into the buffer. + * The space requirements have already been checked. + * PUBLIC: int __bam_bulk_overflow + * PUBLIC: __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *)); + */ +int +__bam_bulk_overflow(dbc, len, pgno, dp) + DBC *dbc; + u_int32_t len; + db_pgno_t pgno; + u_int8_t *dp; +{ + DBT dbt; + + memset(&dbt, 0, sizeof(dbt)); + F_SET(&dbt, DB_DBT_USERMEM); + dbt.ulen = len; + dbt.data = (void *)dp; + return (__db_goff(dbc->dbp, &dbt, len, pgno, NULL, NULL)); +} + +/* + * __bam_bulk_duplicates -- + * Put as many off page duplicates as will fit into the buffer. + * This routine will adjust the cursor to reflect the position in + * the overflow tree. 
+ * PUBLIC: int __bam_bulk_duplicates __P((DBC *, + * PUBLIC: db_pgno_t, u_int8_t *, int32_t *, + * PUBLIC: int32_t **, u_int8_t **, u_int32_t *, int)); + */ +int +__bam_bulk_duplicates(dbc, pgno, dbuf, keyoff, offpp, dpp, spacep, no_dup) + DBC *dbc; + db_pgno_t pgno; + u_int8_t *dbuf; + int32_t *keyoff, **offpp; + u_int8_t **dpp; + u_int32_t *spacep; + int no_dup; +{ + DB *dbp; + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE_CURSOR *cp; + DBC *opd; + DBT key, data; + PAGE *pg; + db_indx_t indx, *inp; + int32_t *offp; + u_int32_t pagesize, size, space; + u_int8_t *dp, *np; + int first, need_pg, ret, t_ret; + + ret = 0; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + opd = cp->opd; + + if (opd == NULL) { + if ((ret = __db_c_newopd(dbc, pgno, NULL, &opd)) != 0) + return (ret); + cp->opd = opd; + if ((ret = opd->c_am_get(opd, + &key, &data, DB_FIRST, NULL)) != 0) + goto close_opd; + } + + pagesize = opd->dbp->pgsize; + cp = (BTREE_CURSOR *)opd->internal; + space = *spacep; + /* Get current offset slot. */ + offp = *offpp; + + /* + * np is the next place to put data. + * dp is the begining of the current page in the buffer. + */ + np = dp = *dpp; + first = 1; + indx = cp->indx; + + do { + /* Fetch the current record. No initial move. */ + if ((ret = __bam_c_next(opd, 0, 0)) != 0) + break; + pg = cp->page; + indx = cp->indx; + inp = P_INP(dbp, pg); + /* We need to copy the page to the buffer. */ + need_pg = 1; + + do { + if (IS_DELETED(dbp, pg, indx)) + goto contin; + bk = GET_BKEYDATA(dbp, pg, indx); + space -= 2 * sizeof(*offp); + /* Allocate space for key if needed. */ + if (first == 0 && keyoff != NULL) + space -= 2 * sizeof(*offp); + + /* Did space underflow? */ + if (space > *spacep) { + ret = ENOMEM; + if (first == 1) { + /* Get the absolute value. 
*/ + space = -(int32_t)space; + space = *spacep + space; + if (need_pg) + space += pagesize - HOFFSET(pg); + } + break; + } + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + size = bo->tlen; + if (size > space) { + ret = ENOMEM; + space = *spacep + size; + break; + } + if (first == 0 && keyoff != NULL) { + *offp-- = keyoff[0]; + *offp-- = keyoff[-1]; + } + if ((ret = __bam_bulk_overflow(dbc, + bo->tlen, bo->pgno, np)) != 0) + return (ret); + space -= size; + *offp-- = (int32_t)(np - dbuf); + np += size; + } else { + if (need_pg) { + dp = np; + size = pagesize - HOFFSET(pg); + if (space < size) { + ret = ENOMEM; + /* Return space required. */ + space = *spacep + size; + break; + } + memcpy(dp, + (u_int8_t *)pg + HOFFSET(pg), size); + need_pg = 0; + space -= size; + np += size; + } + if (first == 0 && keyoff != NULL) { + *offp-- = keyoff[0]; + *offp-- = keyoff[-1]; + } + size = bk->len; + *offp-- = (int32_t)((inp[indx] - HOFFSET(pg)) + + (dp - dbuf) + SSZA(BKEYDATA, data)); + } + *offp-- = (int32_t)size; + first = 0; + if (no_dup) + break; +contin: + indx++; + if (opd->dbtype == DB_RECNO) + cp->recno++; + } while (indx < NUM_ENT(pg)); + if (no_dup) + break; + cp->indx = indx; + + } while (ret == 0); + + /* Return the updated information. */ + *spacep = space; + *offpp = offp; + *dpp = np; + + /* + * If we ran out of space back up the pointer. + * If we did not return any dups or reached the end, close the opd. 
+ */ + if (ret == ENOMEM) { + if (opd->dbtype == DB_RECNO) { + if (--cp->recno == 0) + goto close_opd; + } else if (indx != 0) + cp->indx--; + else { + t_ret = __bam_c_prev(opd); + if (t_ret == DB_NOTFOUND) + goto close_opd; + if (t_ret != 0) + ret = t_ret; + } + } else if (keyoff == NULL && ret == DB_NOTFOUND) { + cp->indx--; + if (opd->dbtype == DB_RECNO) + --cp->recno; + } else if (indx == 0 || ret == DB_NOTFOUND) { +close_opd: + if (ret == DB_NOTFOUND) + ret = 0; + if ((t_ret = __db_c_close(opd)) != 0 && ret == 0) + ret = t_ret; + ((BTREE_CURSOR *)dbc->internal)->opd = NULL; + } + if (ret == DB_NOTFOUND) + ret = 0; + + return (ret); +} + /* * __bam_getbothc -- * Search for a matching data item on a join. @@ -984,9 +1607,11 @@ __bam_getbothc(dbc, data) { BTREE_CURSOR *cp; DB *dbp; + DB_MPOOLFILE *mpf; int cmp, exact, ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; /* @@ -995,7 +1620,7 @@ __bam_getbothc(dbc, data) * write lock, but upgrading to a write lock has no better * chance of succeeding now instead of later, so don't try. */ - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &cp->page)) != 0) return (ret); /* @@ -1017,11 +1642,12 @@ __bam_getbothc(dbc, data) return (DB_NOTFOUND); /* Discard the current page, we're going to do a full search. */ - if ((ret = memp_fput(dbp->mpf, cp->page, 0)) != 0) + if ((ret = __memp_fput(mpf, cp->page, 0)) != 0) return (ret); cp->page = NULL; - return (__bam_c_search(dbc, data, DB_GET_BOTH, &exact)); + return (__bam_c_search(dbc, + PGNO_INVALID, data, DB_GET_BOTH, &exact)); } /* @@ -1038,7 +1664,7 @@ __bam_getbothc(dbc, data) return (DB_NOTFOUND); cp->indx += P_INDX; - return (__bam_getboth_finddatum(dbc, data)); + return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH)); } /* @@ -1046,31 +1672,31 @@ __bam_getbothc(dbc, data) * Find a matching on-page data item. 
*/ static int -__bam_getboth_finddatum(dbc, data) +__bam_getboth_finddatum(dbc, data, flags) DBC *dbc; DBT *data; + u_int32_t flags; { BTREE_CURSOR *cp; DB *dbp; db_indx_t base, lim, top; int cmp, ret; + COMPQUIET(cmp, 0); + dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; /* * Called (sometimes indirectly) from DBC->get to search on-page data - * item(s) for a matching value. If the original flag was DB_GET_BOTH, - * the cursor argument is set to the first data item for the key. If - * the original flag was DB_GET_BOTHC, the cursor argument is set to - * the first data item that we can potentially return. In both cases, - * there may or may not be additional duplicate data items to search. + * item(s) for a matching value. If the original flag was DB_GET_BOTH + * or DB_GET_BOTH_RANGE, the cursor is set to the first undeleted data + * item for the key. If the original flag was DB_GET_BOTHC, the cursor + * argument is set to the first data item we can potentially return. + * In both cases, there may or may not be additional duplicate data + * items to search. * * If the duplicates are not sorted, do a linear search. - * - * If the duplicates are sorted, do a binary search. The reason for - * this is that large pages and small key/data pairs result in large - * numbers of on-page duplicates before they get pushed off-page. */ if (dbp->dup_compare == NULL) { for (;; cp->indx += P_INDX) { @@ -1085,41 +1711,62 @@ __bam_getboth_finddatum(dbc, data) !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) break; } - } else { - /* - * Find the top and bottom of the duplicate set. Binary search - * requires at least two items, don't loop if there's only one. - */ - for (base = top = cp->indx; - top < NUM_ENT(cp->page); top += P_INDX) - if (!IS_DUPLICATE(dbc, cp->indx, top)) - break; - if (base == (top - P_INDX)) { - if ((ret = __bam_cmp(dbp, data, - cp->page, cp->indx + O_INDX, - dbp->dup_compare, &cmp)) != 0) - return (ret); - return (cmp == 0 ? 
0 : DB_NOTFOUND); - } + return (DB_NOTFOUND); + } - for (lim = - (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) { - cp->indx = base + ((lim >> 1) * P_INDX); - if ((ret = __bam_cmp(dbp, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) - return (ret); - if (cmp == 0) { - if (!IS_CUR_DELETED(dbc)) - return (0); - break; - } - if (cmp > 0) { - base = cp->indx + P_INDX; - --lim; - } + /* + * If the duplicates are sorted, do a binary search. The reason for + * this is that large pages and small key/data pairs result in large + * numbers of on-page duplicates before they get pushed off-page. + * + * Find the top and bottom of the duplicate set. Binary search + * requires at least two items, don't loop if there's only one. + */ + for (base = top = cp->indx; top < NUM_ENT(cp->page); top += P_INDX) + if (!IS_DUPLICATE(dbc, cp->indx, top)) + break; + if (base == (top - P_INDX)) { + if ((ret = __bam_cmp(dbp, data, + cp->page, cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + return (ret); + return (cmp == 0 || + (cmp < 0 && flags == DB_GET_BOTH_RANGE) ? 0 : DB_NOTFOUND); + } + + for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) { + cp->indx = base + ((lim >> 1) * P_INDX); + if ((ret = __bam_cmp(dbp, data, cp->page, + cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + return (ret); + if (cmp == 0) { + /* + * XXX + * No duplicate duplicates in sorted duplicate sets, + * so there can be only one. + */ + if (!IS_CUR_DELETED(dbc)) + return (0); + break; + } + if (cmp > 0) { + base = cp->indx + P_INDX; + --lim; } } - return (DB_NOTFOUND); + + /* No match found; if we're looking for an exact match, we're done. */ + if (flags == DB_GET_BOTH) + return (DB_NOTFOUND); + + /* + * Base is the smallest index greater than the data item, may be zero + * or a last + O_INDX index, and may be deleted. Find an undeleted + * item. + */ + cp->indx = base; + while (cp->indx < top && IS_CUR_DELETED(dbc)) + cp->indx += P_INDX; + return (cp->indx < top ? 
0 : DB_NOTFOUND); } /* @@ -1136,20 +1783,24 @@ __bam_c_put(dbc, key, data, flags, pgnop) BTREE_CURSOR *cp; DB *dbp; DBT dbt; + DB_MPOOLFILE *mpf; + db_pgno_t root_pgno; u_int32_t iiop; - int cmp, exact, needkey, ret, stack; + int cmp, exact, own, ret, stack; void *arg; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; -split: needkey = ret = stack = 0; +split: ret = stack = 0; switch (flags) { case DB_AFTER: case DB_BEFORE: case DB_CURRENT: - needkey = 1; iiop = flags; + own = 1; /* * If the Btree has record numbers (and we're not replacing an @@ -1182,25 +1833,33 @@ split: needkey = ret = stack = 0; ACQUIRE_WRITE_LOCK(dbc, ret); if (ret != 0) goto err; - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &cp->page)) != 0) goto err; break; case DB_KEYFIRST: case DB_KEYLAST: case DB_NODUPDATA: + own = 0; /* * Searching off-page, sorted duplicate tree: do a tree search * for the correct item; __bam_c_search returns the smallest * slot greater than the key, use it. + * + * See comment below regarding where we can start the search. */ if (F_ISSET(dbc, DBC_OPD)) { - if ((ret = - __bam_c_search(dbc, data, flags, &exact)) != 0) + if ((ret = __bam_c_search(dbc, + F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, + data, flags, &exact)) != 0) goto err; stack = 1; /* Disallow "sorted" duplicate duplicates. */ if (exact) { + if (IS_DELETED(dbp, cp->page, cp->indx)) { + iiop = DB_CURRENT; + break; + } ret = __db_duperr(dbp, flags); goto err; } @@ -1208,8 +1867,17 @@ split: needkey = ret = stack = 0; break; } - /* Searching a btree. */ - if ((ret = __bam_c_search(dbc, key, + /* + * Searching a btree. + * + * If we've done a split, we can start the search from the + * parent of the split page, which __bam_split returned + * for us in root_pgno, unless we're in a Btree with record + * numbering. In that case, we'll need the true root page + * in order to adjust the record count. 
+ */ + if ((ret = __bam_c_search(dbc, + F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, key, flags == DB_KEYFIRST || dbp->dup_compare != NULL ? DB_KEYFIRST : DB_KEYLAST, &exact)) != 0) goto err; @@ -1264,8 +1932,8 @@ split: needkey = ret = stack = 0; */ for (;; cp->indx += P_INDX) { if ((ret = __bam_cmp(dbp, data, cp->page, - cp->indx + O_INDX, dbp->dup_compare, &cmp)) !=0) - return (ret); + cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + goto err; if (cmp < 0) { iiop = DB_BEFORE; break; @@ -1273,7 +1941,7 @@ split: needkey = ret = stack = 0; /* Disallow "sorted" duplicate duplicates. */ if (cmp == 0) { - if (IS_DELETED(cp->page, cp->indx)) { + if (IS_DELETED(dbp, cp->page, cp->indx)) { iiop = DB_CURRENT; break; } @@ -1282,8 +1950,8 @@ split: needkey = ret = stack = 0; } if (cp->indx + P_INDX >= NUM_ENT(cp->page) || - ((PAGE *)cp->page)->inp[cp->indx] != - ((PAGE *)cp->page)->inp[cp->indx + P_INDX]) { + P_INP(dbp, ((PAGE *)cp->page))[cp->indx] != + P_INP(dbp, ((PAGE *)cp->page))[cp->indx + P_INDX]) { iiop = DB_AFTER; break; } @@ -1306,7 +1974,7 @@ split: needkey = ret = stack = 0; flags == DB_BEFORE || flags == DB_CURRENT) { memset(&dbt, 0, sizeof(DBT)); if ((ret = __db_ret(dbp, cp->page, 0, &dbt, - &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) goto err; arg = &dbt; } else @@ -1326,8 +1994,22 @@ split: needkey = ret = stack = 0; if (ret != 0) goto err; + /* + * SR [#6059] + * If we do not own a lock on the page anymore then + * clear the cursor so we don't point at it. + * Even if we call __bam_stkrel above we still + * may have entered the routine with the cursor + * posistioned to a particular record. This + * is in the case where C_RECNUM is set. + */ + if (own == 0) { + cp->pgno = PGNO_INVALID; + cp->indx = 0; + } + /* Split the tree. */ - if ((ret = __bam_split(dbc, arg)) != 0) + if ((ret = __bam_split(dbc, arg, &root_pgno)) != 0) return (ret); goto split; @@ -1351,8 +2033,15 @@ done: /* * flag. 
If we're successful, we either moved the cursor or the item * is no longer deleted. If we're not successful, then we're just a * copy, no need to have the flag set. + * + * We may have instantiated off-page duplicate cursors during the put, + * so clear the deleted bit from the off-page duplicate cursor as well. */ F_CLR(cp, C_DELETED); + if (cp->opd != NULL) { + cp = (BTREE_CURSOR *)cp->opd->internal; + F_CLR(cp, C_DELETED); + } return (ret); } @@ -1361,22 +2050,22 @@ done: /* * __bam_c_rget -- * Return the record number for a cursor. * - * PUBLIC: int __bam_c_rget __P((DBC *, DBT *, u_int32_t)); + * PUBLIC: int __bam_c_rget __P((DBC *, DBT *)); */ int -__bam_c_rget(dbc, data, flags) +__bam_c_rget(dbc, data) DBC *dbc; DBT *data; - u_int32_t flags; { BTREE_CURSOR *cp; DB *dbp; DBT dbt; + DB_MPOOLFILE *mpf; db_recno_t recno; - int exact, ret; + int exact, ret, t_ret; - COMPQUIET(flags, 0); dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; /* @@ -1384,27 +2073,28 @@ __bam_c_rget(dbc, data, flags) * Get a copy of the key. * Release the page, making sure we don't release it twice. */ - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &cp->page)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &cp->page)) != 0) return (ret); memset(&dbt, 0, sizeof(DBT)); if ((ret = __db_ret(dbp, cp->page, - cp->indx, &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + cp->indx, &dbt, &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) goto err; - ret = memp_fput(dbp->mpf, cp->page, 0); + ret = __memp_fput(mpf, cp->page, 0); cp->page = NULL; if (ret != 0) return (ret); - if ((ret = __bam_search(dbc, &dbt, + if ((ret = __bam_search(dbc, PGNO_INVALID, &dbt, F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND, 1, &recno, &exact)) != 0) goto err; - ret = __db_retcopy(dbp, data, - &recno, sizeof(recno), &dbc->rdata.data, &dbc->rdata.ulen); + ret = __db_retcopy(dbp->dbenv, data, + &recno, sizeof(recno), &dbc->rdata->data, &dbc->rdata->ulen); /* Release the stack. 
*/ -err: __bam_stkrel(dbc, 0); +err: if ((t_ret = __bam_stkrel(dbc, 0)) != 0 && ret == 0) + ret = t_ret; return (ret); } @@ -1444,17 +2134,15 @@ __bam_c_first(dbc) DBC *dbc; { BTREE_CURSOR *cp; - DB *dbp; db_pgno_t pgno; int ret; - dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; ret = 0; /* Walk down the left-hand side of the tree. */ for (pgno = cp->root;;) { - ACQUIRE_CUR_SET(dbc, DB_LOCK_READ, pgno, ret); + ACQUIRE_CUR_COUPLE(dbc, DB_LOCK_READ, pgno, ret); if (ret != 0) return (ret); @@ -1462,7 +2150,7 @@ __bam_c_first(dbc) if (ISLEAF(cp->page)) break; - pgno = GET_BINTERNAL(cp->page, 0)->pgno; + pgno = GET_BINTERNAL(dbc->dbp, cp->page, 0)->pgno; } /* If we want a write lock instead of a read lock, get it now. */ @@ -1472,9 +2160,11 @@ __bam_c_first(dbc) return (ret); } + cp->indx = 0; + /* If on an empty page or a deleted record, move to the next one. */ if (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc)) - if ((ret = __bam_c_next(dbc, 0)) != 0) + if ((ret = __bam_c_next(dbc, 0, 0)) != 0) return (ret); return (0); @@ -1489,17 +2179,15 @@ __bam_c_last(dbc) DBC *dbc; { BTREE_CURSOR *cp; - DB *dbp; db_pgno_t pgno; int ret; - dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; ret = 0; /* Walk down the right-hand side of the tree. */ for (pgno = cp->root;;) { - ACQUIRE_CUR_SET(dbc, DB_LOCK_READ, pgno, ret); + ACQUIRE_CUR_COUPLE(dbc, DB_LOCK_READ, pgno, ret); if (ret != 0) return (ret); @@ -1507,8 +2195,8 @@ __bam_c_last(dbc) if (ISLEAF(cp->page)) break; - pgno = - GET_BINTERNAL(cp->page, NUM_ENT(cp->page) - O_INDX)->pgno; + pgno = GET_BINTERNAL(dbc->dbp, cp->page, + NUM_ENT(cp->page) - O_INDX)->pgno; } /* If we want a write lock instead of a read lock, get it now. */ @@ -1535,18 +2223,16 @@ __bam_c_last(dbc) * Move to the next record. 
*/ static int -__bam_c_next(dbc, initial_move) +__bam_c_next(dbc, initial_move, deleted_okay) DBC *dbc; - int initial_move; + int initial_move, deleted_okay; { BTREE_CURSOR *cp; - DB *dbp; db_indx_t adjust; db_lockmode_t lock_mode; db_pgno_t pgno; int ret; - dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; ret = 0; @@ -1566,7 +2252,7 @@ __bam_c_next(dbc, initial_move) F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ; } if (cp->page == NULL) { - ACQUIRE_CUR(dbc, lock_mode, ret); + ACQUIRE_CUR(dbc, lock_mode, cp->pgno, ret); if (ret != 0) return (ret); } @@ -1587,12 +2273,13 @@ __bam_c_next(dbc, initial_move) = NEXT_PGNO(cp->page)) == PGNO_INVALID) return (DB_NOTFOUND); - ACQUIRE_CUR_SET(dbc, lock_mode, pgno, ret); + ACQUIRE_CUR(dbc, lock_mode, pgno, ret); if (ret != 0) return (ret); + cp->indx = 0; continue; } - if (IS_CUR_DELETED(dbc)) { + if (!deleted_okay && IS_CUR_DELETED(dbc)) { cp->indx += adjust; continue; } @@ -1610,13 +2297,11 @@ __bam_c_prev(dbc) DBC *dbc; { BTREE_CURSOR *cp; - DB *dbp; db_indx_t adjust; db_lockmode_t lock_mode; db_pgno_t pgno; int ret; - dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; ret = 0; @@ -1636,7 +2321,7 @@ __bam_c_prev(dbc) F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ; } if (cp->page == NULL) { - ACQUIRE_CUR(dbc, lock_mode, ret); + ACQUIRE_CUR(dbc, lock_mode, cp->pgno, ret); if (ret != 0) return (ret); } @@ -1648,7 +2333,7 @@ __bam_c_prev(dbc) PREV_PGNO(cp->page)) == PGNO_INVALID) return (DB_NOTFOUND); - ACQUIRE_CUR_SET(dbc, lock_mode, pgno, ret); + ACQUIRE_CUR(dbc, lock_mode, pgno, ret); if (ret != 0) return (ret); @@ -1671,8 +2356,9 @@ __bam_c_prev(dbc) * Move to a specified record. 
*/ static int -__bam_c_search(dbc, key, flags, exactp) +__bam_c_search(dbc, root_pgno, key, flags, exactp) DBC *dbc; + db_pgno_t root_pgno; const DBT *key; u_int32_t flags; int *exactp; @@ -1681,7 +2367,7 @@ __bam_c_search(dbc, key, flags, exactp) BTREE_CURSOR *cp; DB *dbp; PAGE *h; - db_indx_t indx; + db_indx_t indx, *inp; db_pgno_t bt_lpgno; db_recno_t recno; u_int32_t sflags; @@ -1712,6 +2398,9 @@ __bam_c_search(dbc, key, flags, exactp) case DB_GET_BOTH: sflags = (F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND) | S_EXACT; goto search; + case DB_GET_BOTH_RANGE: + sflags = (F_ISSET(dbc, DBC_RMW) ? S_FIND_WR : S_FIND); + goto search; case DB_SET_RANGE: sflags = (F_ISSET(dbc, DBC_RMW) ? S_WRITE : S_READ) | S_DUPFIRST; @@ -1753,11 +2442,12 @@ fast_search: /* /* Lock and retrieve the page on which we last inserted. */ h = NULL; - ACQUIRE(dbc, - DB_LOCK_WRITE, bt_lpgno, cp->lock, bt_lpgno, h, ret); + ACQUIRE_CUR(dbc, DB_LOCK_WRITE, bt_lpgno, ret); if (ret != 0) goto fast_miss; + h = cp->page; + inp = P_INP(dbp, h); /* * It's okay if the page type isn't right or it's empty, it * just means that the world changed. @@ -1796,7 +2486,7 @@ fast_search: /* if (flags == DB_KEYLAST) goto fast_hit; for (; - indx > 0 && h->inp[indx - P_INDX] == h->inp[indx]; + indx > 0 && inp[indx - P_INDX] == inp[indx]; indx -= P_INDX) ; goto fast_hit; @@ -1823,7 +2513,7 @@ try_begin: if (h->prev_pgno == PGNO_INVALID) { goto fast_hit; for (; indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && - h->inp[indx] == h->inp[indx + P_INDX]; + inp[indx] == inp[indx + P_INDX]; indx += P_INDX) ; goto fast_hit; @@ -1848,12 +2538,14 @@ fast_miss: /* * This was not the right page, so we do not need to retain * the lock even in the presence of transactions. 
*/ - DISCARD(dbc, 1, cp->lock, h, ret); + DISCARD_CUR(dbc, ret); + cp->pgno = PGNO_INVALID; + (void)__LPUT(dbc, cp->lock); if (ret != 0) return (ret); -search: if ((ret = - __bam_search(dbc, key, sflags, 1, NULL, exactp)) != 0) +search: if ((ret = __bam_search(dbc, root_pgno, + key, sflags, 1, NULL, exactp)) != 0) return (ret); break; default: @@ -1870,12 +2562,15 @@ search: if ((ret = /* * If we inserted a key into the first or last slot of the tree, * remember where it was so we can do it more quickly next time. + * If there are duplicates and we are inserting into the last slot, + * the cursor will point _to_ the last item, not after it, which + * is why we subtract P_INDX below. */ if (TYPE(cp->page) == P_LBTREE && (flags == DB_KEYFIRST || flags == DB_KEYLAST)) t->bt_lpgno = (NEXT_PGNO(cp->page) == PGNO_INVALID && - cp->indx >= NUM_ENT(cp->page)) || + cp->indx >= NUM_ENT(cp->page) - P_INDX) || (PREV_PGNO(cp->page) == PGNO_INVALID && cp->indx == 0) ? cp->pgno : PGNO_INVALID; return (0); @@ -1893,11 +2588,14 @@ __bam_c_physdel(dbc) DB *dbp; DBT key; DB_LOCK lock; + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t pgno; int delete_page, empty_page, exact, level, ret; dbp = dbc->dbp; + memset(&key, 0, sizeof(DBT)); + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; delete_page = empty_page = ret = 0; @@ -1911,7 +2609,7 @@ __bam_c_physdel(dbc) * space will never be reused unless the exact same key is specified. */ if (delete_page && - !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_BT_REVSPLIT)) + !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_AM_REVSPLITOFF)) delete_page = 0; /* @@ -1926,13 +2624,17 @@ __bam_c_physdel(dbc) * To delete a leaf page other than an empty root page, we need a * copy of a key from the page. Use the 0th page index since it's * the last key the page held. + * + * !!! + * Note that because __bam_c_physdel is always called from a cursor + * close, it should be safe to use the cursor's own "my_rkey" memory + * to temporarily hold this key. 
We shouldn't own any returned-data + * memory of interest--if we do, we're in trouble anyway. */ - if (delete_page) { - memset(&key, 0, sizeof(DBT)); + if (delete_page) if ((ret = __db_ret(dbp, cp->page, - 0, &key, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + 0, &key, &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) return (ret); - } /* * Delete the items. If page isn't empty, we adjust the cursors. @@ -1940,7 +2642,7 @@ __bam_c_physdel(dbc) * !!! * The following operations to delete a page may deadlock. The easy * scenario is if we're deleting an item because we're closing cursors - * because we've already deadlocked and want to call txn_abort(). If + * because we've already deadlocked and want to call txn->abort. If * we fail due to deadlock, we'll leave a locked, possibly empty page * in the tree, which won't be empty long because we'll undo the delete * when we undo the transaction's modifications. @@ -1977,8 +2679,8 @@ __bam_c_physdel(dbc) */ for (level = LEAFLEVEL;; ++level) { /* Acquire a page and its parent, locked. 
*/ - if ((ret = __bam_search( - dbc, &key, S_WRPAIR, level, NULL, &exact)) != 0) + if ((ret = __bam_search(dbc, PGNO_INVALID, + &key, S_WRPAIR, level, NULL, &exact)) != 0) return (ret); /* @@ -2031,19 +2733,19 @@ __bam_c_physdel(dbc) */ switch (TYPE(h)) { case P_IBTREE: - pgno = GET_BINTERNAL(h, 0)->pgno; + pgno = GET_BINTERNAL(dbp, h, 0)->pgno; break; case P_IRECNO: - pgno = GET_RINTERNAL(h, 0)->pgno; + pgno = GET_RINTERNAL(dbp, h, 0)->pgno; break; default: - return (__db_pgfmt(dbp, PGNO(h))); + return (__db_pgfmt(dbp->dbenv, PGNO(h))); } if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &lock)) != 0) break; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) break; BT_STK_PUSH(dbp->dbenv, cp, h, 0, lock, DB_LOCK_WRITE, ret); if (ret != 0) @@ -2076,10 +2778,12 @@ __bam_c_getstack(dbc) BTREE_CURSOR *cp; DB *dbp; DBT dbt; + DB_MPOOLFILE *mpf; PAGE *h; int exact, ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; /* @@ -2087,21 +2791,22 @@ __bam_c_getstack(dbc) * routine has to already hold a read lock on the page, so there * is no additional lock to acquire. */ - if ((ret = memp_fget(dbp->mpf, &cp->pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &cp->pgno, 0, &h)) != 0) return (ret); /* Get a copy of a key from the page. */ memset(&dbt, 0, sizeof(DBT)); if ((ret = __db_ret(dbp, - h, 0, &dbt, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + h, 0, &dbt, &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) goto err; /* Get a write-locked stack for the page. */ exact = 0; - ret = __bam_search(dbc, &dbt, S_KEYFIRST, 1, NULL, &exact); + ret = __bam_search(dbc, PGNO_INVALID, + &dbt, S_KEYFIRST, 1, NULL, &exact); err: /* Discard the key and the page. 
*/ - if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) + if ((t_ret = __memp_fput(mpf, h, 0)) != 0 && ret == 0) ret = t_ret; return (ret); @@ -2122,7 +2827,8 @@ __bam_isopd(dbc, pgnop) if (TYPE(dbc->internal->page) != P_LBTREE) return (0); - bo = GET_BOVERFLOW(dbc->internal->page, dbc->internal->indx + O_INDX); + bo = GET_BOVERFLOW(dbc->dbp, + dbc->internal->page, dbc->internal->indx + O_INDX); if (B_TYPE(bo->type) == B_DUPLICATE) { *pgnop = bo->pgno; return (1); diff --git a/db/btree/bt_delete.c b/db/btree/bt_delete.c index 972588788..ef6e34caf 100644 --- a/db/btree/bt_delete.c +++ b/db/btree/bt_delete.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. */ /* @@ -43,7 +43,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_delete.c,v 11.31 2001/01/17 18:48:46 bostic Exp $"; +static const char revid[] = "$Id: bt_delete.c,v 11.46 2003/06/30 17:19:29 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -53,88 +53,11 @@ static const char revid[] = "$Id: bt_delete.c,v 11.31 2001/01/17 18:48:46 bostic #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "btree.h" -#include "lock.h" - -/* - * __bam_delete -- - * Delete the items referenced by a key. - * - * PUBLIC: int __bam_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); - */ -int -__bam_delete(dbp, txn, key, flags) - DB *dbp; - DB_TXN *txn; - DBT *key; - u_int32_t flags; -{ - DBC *dbc; - DBT lkey; - DBT data; - u_int32_t f_init, f_next; - int ret, t_ret; - - PANIC_CHECK(dbp->dbenv); - DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del"); - DB_CHECK_TXN(dbp, txn); - - /* Check for invalid flags. */ - if ((ret = - __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) - return (ret); - - /* Allocate a cursor. 
*/ - if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) - return (ret); - - DEBUG_LWRITE(dbc, txn, "bam_delete", key, NULL, flags); - - /* - * Walk a cursor through the key/data pairs, deleting as we go. Set - * the DB_DBT_USERMEM flag, as this might be a threaded application - * and the flags checking will catch us. We don't actually want the - * keys or data, so request a partial of length 0. - */ - memset(&lkey, 0, sizeof(lkey)); - F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL); - memset(&data, 0, sizeof(data)); - F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); - - /* - * If locking (and we haven't already acquired CDB locks), set the - * read-modify-write flag. - */ - f_init = DB_SET; - f_next = DB_NEXT_DUP; - if (STD_LOCKING(dbc)) { - f_init |= DB_RMW; - f_next |= DB_RMW; - } - - /* Walk through the set of key/data pairs, deleting as we go. */ - if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0) - goto err; - for (;;) { - if ((ret = dbc->c_del(dbc, 0)) != 0) - goto err; - if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) { - if (ret == DB_NOTFOUND) { - ret = 0; - break; - } - goto err; - } - } - -err: /* Discard the cursor. 
*/ - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; - - return (ret); -} +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" /* * __bam_ditem -- @@ -151,14 +74,18 @@ __bam_ditem(dbc, h, indx) BINTERNAL *bi; BKEYDATA *bk; DB *dbp; + DB_MPOOLFILE *mpf; u_int32_t nbytes; int ret; + db_indx_t *inp; dbp = dbc->dbp; + mpf = dbp->mpf; + inp = P_INP(dbp, h); switch (TYPE(h)) { case P_IBTREE: - bi = GET_BINTERNAL(h, indx); + bi = GET_BINTERNAL(dbp, h, indx); switch (B_TYPE(bi->type)) { case B_DUPLICATE: case B_KEYDATA: @@ -171,7 +98,7 @@ __bam_ditem(dbc, h, indx) return (ret); break; default: - return (__db_pgfmt(dbp, PGNO(h))); + return (__db_pgfmt(dbp->dbenv, PGNO(h))); } break; case P_IRECNO: @@ -195,7 +122,7 @@ __bam_ditem(dbc, h, indx) * won't work! */ if (indx + P_INDX < (u_int32_t)NUM_ENT(h) && - h->inp[indx] == h->inp[indx + P_INDX]) + inp[indx] == inp[indx + P_INDX]) return (__bam_adjindx(dbc, h, indx, indx + O_INDX, 0)); /* @@ -203,14 +130,14 @@ __bam_ditem(dbc, h, indx) * doesn't matter if we delete the key item before or * after the data item for the purposes of this one. 
*/ - if (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) + if (indx > 0 && inp[indx] == inp[indx - P_INDX]) return (__bam_adjindx(dbc, h, indx, indx - P_INDX, 0)); } /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); switch (B_TYPE(bk->type)) { case B_DUPLICATE: nbytes = BOVERFLOW_SIZE; @@ -218,24 +145,24 @@ __bam_ditem(dbc, h, indx) case B_OVERFLOW: nbytes = BOVERFLOW_SIZE; if ((ret = __db_doff( - dbc, (GET_BOVERFLOW(h, indx))->pgno)) != 0) + dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0) return (ret); break; case B_KEYDATA: nbytes = BKEYDATA_SIZE(bk->len); break; default: - return (__db_pgfmt(dbp, PGNO(h))); + return (__db_pgfmt(dbp->dbenv, PGNO(h))); } break; default: - return (__db_pgfmt(dbp, PGNO(h))); + return (__db_pgfmt(dbp->dbenv, PGNO(h))); } /* Delete the item and mark the page dirty. */ if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0) return (ret); - if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fset(mpf, h, DB_MPOOL_DIRTY)) != 0) return (ret); return (0); @@ -255,33 +182,37 @@ __bam_adjindx(dbc, h, indx, indx_copy, is_insert) int is_insert; { DB *dbp; - db_indx_t copy; + DB_MPOOLFILE *mpf; + db_indx_t copy, *inp; int ret; dbp = dbc->dbp; + mpf = dbp->mpf; + inp = P_INP(dbp, h); /* Log the change. */ - if (DB_LOGGING(dbc) && - (ret = __bam_adj_log(dbp->dbenv, dbc->txn, &LSN(h), - 0, dbp->log_fileid, PGNO(h), &LSN(h), indx, indx_copy, - (u_int32_t)is_insert)) != 0) - return (ret); + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0, + PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(h)); /* Shuffle the indices and mark the page dirty. 
*/ if (is_insert) { - copy = h->inp[indx_copy]; + copy = inp[indx_copy]; if (indx != NUM_ENT(h)) - memmove(&h->inp[indx + O_INDX], &h->inp[indx], + memmove(&inp[indx + O_INDX], &inp[indx], sizeof(db_indx_t) * (NUM_ENT(h) - indx)); - h->inp[indx] = copy; + inp[indx] = copy; ++NUM_ENT(h); } else { --NUM_ENT(h); if (indx != NUM_ENT(h)) - memmove(&h->inp[indx], &h->inp[indx + O_INDX], + memmove(&inp[indx], &inp[indx + O_INDX], sizeof(db_indx_t) * (NUM_ENT(h) - indx)); } - if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fset(mpf, h, DB_MPOOL_DIRTY)) != 0) return (ret); return (0); @@ -303,6 +234,7 @@ __bam_dpages(dbc, stack_epg) DB *dbp; DBT a, b; DB_LOCK c_lock, p_lock; + DB_MPOOLFILE *mpf; EPG *epg; PAGE *child, *parent; db_indx_t nitems; @@ -311,6 +243,7 @@ __bam_dpages(dbc, stack_epg) int done, ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; /* @@ -328,8 +261,7 @@ __bam_dpages(dbc, stack_epg) */ ret = 0; for (epg = cp->sp; epg < stack_epg; ++epg) { - if ((t_ret = - memp_fput(dbp->mpf, epg->page, 0)) != 0 && ret == 0) + if ((t_ret = __memp_fput(mpf, epg->page, 0)) != 0 && ret == 0) ret = t_ret; (void)__TLPUT(dbc, epg->lock); } @@ -364,7 +296,7 @@ __bam_dpages(dbc, stack_epg) pgno = PGNO(epg->page); nitems = NUM_ENT(epg->page); - if ((ret = memp_fput(dbp->mpf, epg->page, 0)) != 0) + if ((ret = __memp_fput(mpf, epg->page, 0)) != 0) goto err_inc; (void)__TLPUT(dbc, epg->lock); @@ -394,7 +326,7 @@ __bam_dpages(dbc, stack_epg) err_inc: ++epg; err: for (; epg <= cp->csp; ++epg) { if (epg->page != NULL) - (void)memp_fput(dbp->mpf, epg->page, 0); + (void)__memp_fput(mpf, epg->page, 0); (void)__TLPUT(dbc, epg->lock); } BT_STK_CLR(cp); @@ -415,14 +347,15 @@ err: for (; epg <= cp->csp; ++epg) { for (done = 0; !done;) { /* Initialize. */ parent = child = NULL; - p_lock.off = c_lock.off = LOCK_INVALID; + LOCK_INIT(p_lock); + LOCK_INIT(c_lock); /* Lock the root. 
*/ pgno = root_pgno; if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0) goto stop; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &parent)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &parent)) != 0) goto stop; if (NUM_ENT(parent) != 1) @@ -434,7 +367,7 @@ err: for (; epg <= cp->csp; ++epg) { * If this is overflow, then try to delete it. * The child may or may not still point at it. */ - bi = GET_BINTERNAL(parent, 0); + bi = GET_BINTERNAL(dbp, parent, 0); if (B_TYPE(bi->type) == B_OVERFLOW) if ((ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) @@ -442,7 +375,7 @@ err: for (; epg <= cp->csp; ++epg) { pgno = bi->pgno; break; case P_IRECNO: - pgno = GET_RINTERNAL(parent, 0)->pgno; + pgno = GET_RINTERNAL(dbp, parent, 0)->pgno; break; default: goto stop; @@ -452,24 +385,24 @@ err: for (; epg <= cp->csp; ++epg) { if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0) goto stop; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &child)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &child)) != 0) goto stop; /* Log the change. */ - if (DB_LOGGING(dbc)) { + if (DBC_LOGGING(dbc)) { memset(&a, 0, sizeof(a)); a.data = child; a.size = dbp->pgsize; memset(&b, 0, sizeof(b)); - b.data = P_ENTRY(parent, 0); + b.data = P_ENTRY(dbp, parent, 0); b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE : BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); - if ((ret = - __bam_rsplit_log(dbp->dbenv, dbc->txn, &child->lsn, - 0, dbp->log_fileid, PGNO(child), &a, PGNO(parent), - RE_NREC(parent), &b, &parent->lsn)) != 0) + if ((ret = __bam_rsplit_log(dbp, dbc->txn, + &child->lsn, 0, PGNO(child), &a, PGNO(parent), + RE_NREC(parent), &b, &parent->lsn)) != 0) goto stop; - } + } else + LSN_NOT_LOGGED(child->lsn); /* * Make the switch. @@ -491,9 +424,9 @@ err: for (; epg <= cp->csp; ++epg) { RE_NREC_SET(parent, rcnt); /* Mark the pages dirty. 
*/ - if ((ret = memp_fset(dbp->mpf, parent, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fset(mpf, parent, DB_MPOOL_DIRTY)) != 0) goto stop; - if ((ret = memp_fset(dbp->mpf, child, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fset(mpf, child, DB_MPOOL_DIRTY)) != 0) goto stop; /* Adjust the cursors. */ @@ -514,15 +447,13 @@ err: for (; epg <= cp->csp; ++epg) { if (0) { stop: done = 1; } - if (p_lock.off != LOCK_INVALID) - (void)__TLPUT(dbc, p_lock); + (void)__TLPUT(dbc, p_lock); if (parent != NULL && - (t_ret = memp_fput(dbp->mpf, parent, 0)) != 0 && ret == 0) + (t_ret = __memp_fput(mpf, parent, 0)) != 0 && ret == 0) ret = t_ret; - if (c_lock.off != LOCK_INVALID) - (void)__TLPUT(dbc, c_lock); + (void)__TLPUT(dbc, c_lock); if (child != NULL && - (t_ret = memp_fput(dbp->mpf, child, 0)) != 0 && ret == 0) + (t_ret = __memp_fput(mpf, child, 0)) != 0 && ret == 0) ret = t_ret; } diff --git a/db/btree/bt_method.c b/db/btree/bt_method.c index 5e3af27d0..84abe96a2 100644 --- a/db/btree/bt_method.c +++ b/db/btree/bt_method.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2000 + * Copyright (c) 1999-2003 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_method.c,v 11.20 2000/11/30 00:58:28 ubell Exp $"; +static const char revid[] = "$Id: bt_method.c,v 11.34 2003/06/30 17:19:32 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -16,19 +16,22 @@ static const char revid[] = "$Id: bt_method.c,v 11.20 2000/11/30 00:58:28 ubell #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" -#include "qam.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/qam.h" -static int __bam_set_bt_compare - __P((DB *, int (*)(DB *, const DBT *, const DBT *))); static int __bam_set_bt_maxkey __P((DB *, u_int32_t)); +static int __bam_get_bt_minkey __P((DB *, u_int32_t *)); static int __bam_set_bt_minkey __P((DB *, u_int32_t)); static int __bam_set_bt_prefix __P((DB *, size_t(*)(DB *, const DBT *, const DBT *))); +static int __ram_get_re_delim __P((DB *, int *)); static int __ram_set_re_delim __P((DB *, int)); +static int __ram_get_re_len __P((DB *, u_int32_t *)); static int __ram_set_re_len __P((DB *, u_int32_t)); +static int __ram_get_re_pad __P((DB *, int *)); static int __ram_set_re_pad __P((DB *, int)); +static int __ram_get_re_source __P((DB *, const char **)); static int __ram_set_re_source __P((DB *, const char *)); /* @@ -55,6 +58,7 @@ __bam_db_create(dbp) dbp->set_bt_compare = __bam_set_bt_compare; dbp->set_bt_maxkey = __bam_set_bt_maxkey; + dbp->get_bt_minkey = __bam_get_bt_minkey; dbp->set_bt_minkey = __bam_set_bt_minkey; dbp->set_bt_prefix = __bam_set_bt_prefix; @@ -62,9 +66,13 @@ __bam_db_create(dbp) t->re_delim = '\n'; t->re_eof = 1; + dbp->get_re_delim = __ram_get_re_delim; dbp->set_re_delim = __ram_set_re_delim; + dbp->get_re_len = __ram_get_re_len; dbp->set_re_len = __ram_set_re_len; + dbp->get_re_pad = __ram_get_re_pad; dbp->set_re_pad = __ram_set_re_pad; + dbp->get_re_source = __ram_get_re_source; dbp->set_re_source = __ram_set_re_source; return (0); @@ -82,7 +90,8 @@ __bam_db_close(dbp) { BTREE 
*t; - t = dbp->bt_internal; + if ((t = dbp->bt_internal) == NULL) + return (0); /* Recno */ /* Close any backing source file descriptor. */ if (t->re_fp != NULL) @@ -90,14 +99,45 @@ __bam_db_close(dbp) /* Free any backing source file name. */ if (t->re_source != NULL) - __os_freestr(t->re_source); + __os_free(dbp->dbenv, t->re_source); - __os_free(t, sizeof(BTREE)); + __os_free(dbp->dbenv, t); dbp->bt_internal = NULL; return (0); } +/* + * __bam_map_flags -- + * Map Btree specific flags from public to the internal values. + * + * PUBLIC: void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *)); + */ +void +__bam_map_flags(dbp, inflagsp, outflagsp) + DB *dbp; + u_int32_t *inflagsp, *outflagsp; +{ + COMPQUIET(dbp, NULL); + + if (FLD_ISSET(*inflagsp, DB_DUP)) { + FLD_SET(*outflagsp, DB_AM_DUP); + FLD_CLR(*inflagsp, DB_DUP); + } + if (FLD_ISSET(*inflagsp, DB_DUPSORT)) { + FLD_SET(*outflagsp, DB_AM_DUP | DB_AM_DUPSORT); + FLD_CLR(*inflagsp, DB_DUPSORT); + } + if (FLD_ISSET(*inflagsp, DB_RECNUM)) { + FLD_SET(*outflagsp, DB_AM_RECNUM); + FLD_CLR(*inflagsp, DB_RECNUM); + } + if (FLD_ISSET(*inflagsp, DB_REVSPLITOFF)) { + FLD_SET(*outflagsp, DB_AM_REVSPLITOFF); + FLD_CLR(*inflagsp, DB_REVSPLITOFF); + } +} + /* * __bam_set_flags -- * Set Btree specific flags. @@ -112,50 +152,31 @@ __bam_set_flags(dbp, flagsp) u_int32_t flags; flags = *flagsp; - if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF)) { + if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF)) DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags"); - /* - * The DB_DUP and DB_DUPSORT flags are shared by the Hash - * and Btree access methods. - */ - if (LF_ISSET(DB_DUP | DB_DUPSORT)) - DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); - - if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF)) - DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); - - if (LF_ISSET(DB_DUP | DB_DUPSORT)) { - /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. 
*/ - if (F_ISSET(dbp, DB_BT_RECNUM)) - goto incompat; - - if (LF_ISSET(DB_DUPSORT)) { - if (dbp->dup_compare == NULL) - dbp->dup_compare = __bam_defcmp; - F_SET(dbp, DB_AM_DUPSORT); - } - - F_SET(dbp, DB_AM_DUP); - LF_CLR(DB_DUP | DB_DUPSORT); - } - - if (LF_ISSET(DB_RECNUM)) { - /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */ - if (F_ISSET(dbp, DB_AM_DUP)) - goto incompat; - - F_SET(dbp, DB_BT_RECNUM); - LF_CLR(DB_RECNUM); - } - - if (LF_ISSET(DB_REVSPLITOFF)) { - F_SET(dbp, DB_BT_REVSPLIT); - LF_CLR(DB_REVSPLITOFF); - } - - *flagsp = flags; - } + /* + * The DB_DUP and DB_DUPSORT flags are shared by the Hash + * and Btree access methods. + */ + if (LF_ISSET(DB_DUP | DB_DUPSORT)) + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); + + if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF)) + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */ + if (LF_ISSET(DB_DUP | DB_DUPSORT) && F_ISSET(dbp, DB_AM_RECNUM)) + goto incompat; + + /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */ + if (LF_ISSET(DB_RECNUM) && F_ISSET(dbp, DB_AM_DUP)) + goto incompat; + + if (LF_ISSET(DB_DUPSORT) && dbp->dup_compare == NULL) + dbp->dup_compare = __bam_defcmp; + + __bam_map_flags(dbp, flagsp, &dbp->flags); return (0); incompat: @@ -165,15 +186,18 @@ incompat: /* * __bam_set_bt_compare -- * Set the comparison function. 
+ * + * PUBLIC: int __bam_set_bt_compare + * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *))); */ -static int +int __bam_set_bt_compare(dbp, func) DB *dbp; int (*func) __P((DB *, const DBT *, const DBT *)); { BTREE *t; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_compare"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compare"); DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); t = dbp->bt_internal; @@ -200,7 +224,7 @@ __bam_set_bt_maxkey(dbp, bt_maxkey) { BTREE *t; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_maxkey"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_maxkey"); DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); t = dbp->bt_internal; @@ -214,6 +238,24 @@ __bam_set_bt_maxkey(dbp, bt_maxkey) return (0); } +/* + * __db_get_bt_minkey -- + * Get the minimum keys per page. + */ +static int +__bam_get_bt_minkey(dbp, bt_minkeyp) + DB *dbp; + u_int32_t *bt_minkeyp; +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + *bt_minkeyp = t->bt_minkey; + return (0); +} + /* * __bam_set_bt_minkey -- * Set the minimum keys per page. @@ -225,7 +267,7 @@ __bam_set_bt_minkey(dbp, bt_minkey) { BTREE *t; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_minkey"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_minkey"); DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); t = dbp->bt_internal; @@ -250,7 +292,7 @@ __bam_set_bt_prefix(dbp, func) { BTREE *t; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_bt_prefix"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_prefix"); DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); t = dbp->bt_internal; @@ -259,6 +301,29 @@ __bam_set_bt_prefix(dbp, func) return (0); } +/* + * __ram_map_flags -- + * Map Recno specific flags from public to the internal values. 
+ * + * PUBLIC: void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *)); + */ +void +__ram_map_flags(dbp, inflagsp, outflagsp) + DB *dbp; + u_int32_t *inflagsp, *outflagsp; +{ + COMPQUIET(dbp, NULL); + + if (FLD_ISSET(*inflagsp, DB_RENUMBER)) { + FLD_SET(*outflagsp, DB_AM_RENUMBER); + FLD_CLR(*inflagsp, DB_RENUMBER); + } + if (FLD_ISSET(*inflagsp, DB_SNAPSHOT)) { + FLD_SET(*outflagsp, DB_AM_SNAPSHOT); + FLD_CLR(*inflagsp, DB_SNAPSHOT); + } +} + /* * __ram_set_flags -- * Set Recno specific flags. @@ -275,21 +340,27 @@ __ram_set_flags(dbp, flagsp) flags = *flagsp; if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) { DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags"); - DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + } - if (LF_ISSET(DB_RENUMBER)) { - F_SET(dbp, DB_RE_RENUMBER); - LF_CLR(DB_RENUMBER); - } + __ram_map_flags(dbp, flagsp, &dbp->flags); + return (0); +} - if (LF_ISSET(DB_SNAPSHOT)) { - F_SET(dbp, DB_RE_SNAPSHOT); - LF_CLR(DB_SNAPSHOT); - } +/* + * __db_get_re_delim -- + * Get the variable-length input record delimiter. + */ +static int +__ram_get_re_delim(dbp, re_delimp) + DB *dbp; + int *re_delimp; +{ + BTREE *t; - *flagsp = flags; - } + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + t = dbp->bt_internal; + *re_delimp = t->re_delim; return (0); } @@ -304,17 +375,34 @@ __ram_set_re_delim(dbp, re_delim) { BTREE *t; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_delim"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_delim"); DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); t = dbp->bt_internal; t->re_delim = re_delim; - F_SET(dbp, DB_RE_DELIMITER); + F_SET(dbp, DB_AM_DELIMITER); return (0); } +/* + * __db_get_re_len -- + * Get the variable-length input record length. + */ +static int +__ram_get_re_len(dbp, re_lenp) + DB *dbp; + u_int32_t *re_lenp; +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + t = dbp->bt_internal; + *re_lenp = t->re_len; + return (0); +} + /* * __ram_set_re_len -- * Set the variable-length input record length. 
@@ -327,7 +415,7 @@ __ram_set_re_len(dbp, re_len) BTREE *t; QUEUE *q; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_len"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_len"); DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); t = dbp->bt_internal; @@ -336,11 +424,29 @@ __ram_set_re_len(dbp, re_len) q = dbp->q_internal; q->re_len = re_len; - F_SET(dbp, DB_RE_FIXEDLEN); + F_SET(dbp, DB_AM_FIXEDLEN); return (0); } +/* + * __db_get_re_pad -- + * Get the fixed-length record pad character. + */ +static int +__ram_get_re_pad(dbp, re_padp) + DB *dbp; + int *re_padp; +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + t = dbp->bt_internal; + *re_padp = t->re_pad; + return (0); +} + /* * __ram_set_re_pad -- * Set the fixed-length record pad character. @@ -353,7 +459,7 @@ __ram_set_re_pad(dbp, re_pad) BTREE *t; QUEUE *q; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_pad"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_pad"); DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); t = dbp->bt_internal; @@ -362,8 +468,26 @@ __ram_set_re_pad(dbp, re_pad) q = dbp->q_internal; q->re_pad = re_pad; - F_SET(dbp, DB_RE_PAD); + F_SET(dbp, DB_AM_PAD); + + return (0); +} + +/* + * __db_get_re_source -- + * Get the backing source file name. + */ +static int +__ram_get_re_source(dbp, re_sourcep) + DB *dbp; + const char **re_sourcep; +{ + BTREE *t; + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + + t = dbp->bt_internal; + *re_sourcep = t->re_source; return (0); } @@ -378,7 +502,7 @@ __ram_set_re_source(dbp, re_source) { BTREE *t; - DB_ILLEGAL_AFTER_OPEN(dbp, "set_re_source"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_source"); DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); t = dbp->bt_internal; diff --git a/db/btree/bt_open.c b/db/btree/bt_open.c index 405c1880f..20f594fe5 100644 --- a/db/btree/bt_open.c +++ b/db/btree/bt_open.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. 
All rights reserved. */ /* @@ -43,47 +43,48 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_open.c,v 11.42 2000/11/30 00:58:28 ubell Exp $"; +static const char revid[] = "$Id: bt_open.c,v 11.87 2003/07/17 01:39:09 margo Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include -#include #include #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "btree.h" -#include "db_shash.h" -#include "lock.h" -#include "log.h" -#include "mp.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/fop.h" + +static void __bam_init_meta __P((DB *, BTMETA *, db_pgno_t, DB_LSN *)); /* * __bam_open -- * Open a btree. * - * PUBLIC: int __bam_open __P((DB *, const char *, db_pgno_t, u_int32_t)); + * PUBLIC: int __bam_open __P((DB *, + * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t)); */ int -__bam_open(dbp, name, base_pgno, flags) +__bam_open(dbp, txn, name, base_pgno, flags) DB *dbp; + DB_TXN *txn; const char *name; db_pgno_t base_pgno; u_int32_t flags; { BTREE *t; + COMPQUIET(name, NULL); t = dbp->bt_internal; - /* Initialize the remaining fields/methods of the DB. */ - dbp->del = __bam_delete; - dbp->key_range = __bam_key_range; - dbp->stat = __bam_stat; - /* * We don't permit the user to specify a prefix routine if they didn't * also specify a comparison routine, they can't know enough about our @@ -99,8 +100,8 @@ __bam_open(dbp, name, base_pgno, flags) * Verify that the bt_minkey value specified won't cause the * calculation of ovflsize to underflow [#2406] for this pagesize. 
*/ - if (B_MINKEY_TO_OVFLSIZE(t->bt_minkey, dbp->pgsize) > - B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) { + if (B_MINKEY_TO_OVFLSIZE(dbp, t->bt_minkey, dbp->pgsize) > + B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) { __db_err(dbp->dbenv, "bt_minkey value of %lu too high for page size of %lu", (u_long)t->bt_minkey, (u_long)dbp->pgsize); @@ -108,7 +109,7 @@ __bam_open(dbp, name, base_pgno, flags) } /* Start up the tree. */ - return (__bam_read_root(dbp, name, base_pgno, flags)); + return (__bam_read_root(dbp, txn, base_pgno, flags)); } /* @@ -143,6 +144,7 @@ __bam_metachk(dbp, name, btm) name, (u_long)vers); return (DB_OLD_VERSION); case 8: + case 9: break; default: __db_err(dbenv, @@ -187,13 +189,13 @@ __bam_metachk(dbp, name, btm) if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) { if (dbp->type != DB_BTREE) goto wrong_type; - F_SET(dbp, DB_BT_RECNUM); + F_SET(dbp, DB_AM_RECNUM); if ((ret = __db_fcchk(dbenv, - "DB->open", dbp->flags, DB_AM_DUP, DB_BT_RECNUM)) != 0) + "DB->open", dbp->flags, DB_AM_DUP, DB_AM_RECNUM)) != 0) return (ret); } else - if (F_ISSET(dbp, DB_BT_RECNUM)) { + if (F_ISSET(dbp, DB_AM_RECNUM)) { __db_err(dbenv, "%s: DB_RECNUM specified to open method but not set in database", name); @@ -203,9 +205,9 @@ __bam_metachk(dbp, name, btm) if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) { if (dbp->type != DB_RECNO) goto wrong_type; - F_SET(dbp, DB_RE_FIXEDLEN); + F_SET(dbp, DB_AM_FIXEDLEN); } else - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { __db_err(dbenv, "%s: DB_FIXEDLEN specified to open method but not set in database", name); @@ -215,9 +217,9 @@ __bam_metachk(dbp, name, btm) if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) { if (dbp->type != DB_RECNO) goto wrong_type; - F_SET(dbp, DB_RE_RENUMBER); + F_SET(dbp, DB_AM_RENUMBER); } else - if (F_ISSET(dbp, DB_RE_RENUMBER)) { + if (F_ISSET(dbp, DB_AM_RENUMBER)) { __db_err(dbenv, "%s: DB_RENUMBER specified to open method but not set in database", name); @@ -266,116 +268,137 @@ 
wrong_type: /* * __bam_read_root -- - * Check (and optionally create) a tree. + * Read the root page and check a tree. * - * PUBLIC: int __bam_read_root __P((DB *, const char *, db_pgno_t, u_int32_t)); + * PUBLIC: int __bam_read_root __P((DB *, DB_TXN *, db_pgno_t, u_int32_t)); */ int -__bam_read_root(dbp, name, base_pgno, flags) +__bam_read_root(dbp, txn, base_pgno, flags) DB *dbp; - const char *name; + DB_TXN *txn; db_pgno_t base_pgno; u_int32_t flags; { BTMETA *meta; BTREE *t; DBC *dbc; - DB_LSN orig_lsn; DB_LOCK metalock; - PAGE *root; - int locked, ret, t_ret; + DB_MPOOLFILE *mpf; + int ret, t_ret; - ret = 0; - t = dbp->bt_internal; meta = NULL; - root = NULL; - locked = 0; + t = dbp->bt_internal; + LOCK_INIT(metalock); + mpf = dbp->mpf; + ret = 0; - /* - * Get a cursor. If DB_CREATE is specified, we may be creating - * the root page, and to do that safely in CDB we need a write - * cursor. In STD_LOCKING mode, we'll synchronize using the - * meta page lock instead. - */ - if ((ret = dbp->cursor(dbp, dbp->open_txn, - &dbc, LF_ISSET(DB_CREATE) && CDB_LOCKING(dbp->dbenv) ? - DB_WRITECURSOR : 0)) != 0) + /* Get a cursor. */ + if ((ret = __db_cursor(dbp, txn, &dbc, 0)) != 0) return (ret); - /* Get, and optionally create the metadata page. */ + /* Get the metadata page. */ if ((ret = __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0) goto err; - if ((ret = memp_fget( - dbp->mpf, &base_pgno, DB_MPOOL_CREATE, (PAGE **)&meta)) != 0) + if ((ret = __memp_fget(mpf, &base_pgno, 0, &meta)) != 0) goto err; /* - * If the magic number is correct, we're not creating the tree. - * Correct any fields that may not be right. Note, all of the - * local flags were set by DB->open. + * If the magic number is set, the tree has been created. Correct + * any fields that may not be right. Note, all of the local flags + * were set by DB->open. + * + * Otherwise, we'd better be in recovery or abort, in which case the + * metadata page will be created/initialized elsewhere. 
*/ -again: if (meta->dbmeta.magic != 0) { + if (meta->dbmeta.magic == DB_BTREEMAGIC) { t->bt_maxkey = meta->maxkey; t->bt_minkey = meta->minkey; - t->re_pad = meta->re_pad; + t->re_pad = (int)meta->re_pad; t->re_len = meta->re_len; t->bt_meta = base_pgno; t->bt_root = meta->root; - - (void)memp_fput(dbp->mpf, meta, 0); - meta = NULL; - goto done; + } else { + DB_ASSERT(IS_RECOVERING(dbp->dbenv) || + F_ISSET(dbp, DB_AM_RECOVER)); } - /* In recovery if it's not there it will be created elsewhere.*/ - if (IS_RECOVERING(dbp->dbenv)) - goto done; - - /* If we're doing CDB; we now have to get the write lock. */ - if (CDB_LOCKING(dbp->dbenv)) { - /* - * We'd better have DB_CREATE set if we're actually doing - * the create. - */ - DB_ASSERT(LF_ISSET(DB_CREATE)); - if ((ret = lock_get(dbp->dbenv, dbc->locker, DB_LOCK_UPGRADE, - &dbc->lock_dbt, DB_LOCK_WRITE, &dbc->mylock)) != 0) - goto err; - } + /* + * !!! + * If creating a subdatabase, we've already done an insert when + * we put the subdatabase's entry into the master database, so + * our last-page-inserted value is wrongly initialized for the + * master database, not the subdatabase we're creating. I'm not + * sure where the *right* place to clear this value is, it's not + * intuitively obvious that it belongs here. + */ + t->bt_lpgno = PGNO_INVALID; /* - * If we are doing locking, relase the read lock and get a write lock. - * We want to avoid deadlock. + * We must initialize last_pgno, it could be stale. + * We update this without holding the meta page write + * locked. This is ok since two threads in the code + * must be setting it to the same value. SR #7159. 
*/ - if (locked == 0 && STD_LOCKING(dbc)) { - if ((ret = __LPUT(dbc, metalock)) != 0) - goto err; - if ((ret = __db_lget(dbc, - 0, base_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) - goto err; - locked = 1; - goto again; - } + if (!LF_ISSET(DB_RDONLY) && dbp->meta_pgno == PGNO_BASE_MD) { + __memp_last_pgno(mpf, &meta->dbmeta.last_pgno); + ret = __memp_fput(mpf, meta, DB_MPOOL_DIRTY); + } else + ret = __memp_fput(mpf, meta, 0); + meta = NULL; + +err: /* Put the metadata page back. */ + if (meta != NULL && + (t_ret = __memp_fput(mpf, meta, 0)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __bam_init_meta -- + * + * Initialize a btree meta-data page. The following fields may need + * to be updated later: last_pgno, root. + */ +static void +__bam_init_meta(dbp, meta, pgno, lsnp) + DB *dbp; + BTMETA *meta; + db_pgno_t pgno; + DB_LSN *lsnp; +{ + BTREE *t; - /* Initialize the tree structure metadata information. 
*/ - orig_lsn = meta->dbmeta.lsn; memset(meta, 0, sizeof(BTMETA)); - meta->dbmeta.lsn = orig_lsn; - meta->dbmeta.pgno = base_pgno; + meta->dbmeta.lsn = *lsnp; + meta->dbmeta.pgno = pgno; meta->dbmeta.magic = DB_BTREEMAGIC; meta->dbmeta.version = DB_BTREEVERSION; meta->dbmeta.pagesize = dbp->pgsize; + if (F_ISSET(dbp, DB_AM_CHKSUM)) + FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM); + if (F_ISSET(dbp, DB_AM_ENCRYPT)) { + meta->dbmeta.encrypt_alg = + ((DB_CIPHER *)dbp->dbenv->crypto_handle)->alg; + DB_ASSERT(meta->dbmeta.encrypt_alg != 0); + meta->crypto_magic = meta->dbmeta.magic; + } meta->dbmeta.type = P_BTREEMETA; meta->dbmeta.free = PGNO_INVALID; + meta->dbmeta.last_pgno = pgno; if (F_ISSET(dbp, DB_AM_DUP)) F_SET(&meta->dbmeta, BTM_DUP); - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) F_SET(&meta->dbmeta, BTM_FIXEDLEN); - if (F_ISSET(dbp, DB_BT_RECNUM)) + if (F_ISSET(dbp, DB_AM_RECNUM)) F_SET(&meta->dbmeta, BTM_RECNUM); - if (F_ISSET(dbp, DB_RE_RENUMBER)) + if (F_ISSET(dbp, DB_AM_RENUMBER)) F_SET(&meta->dbmeta, BTM_RENUMBER); if (F_ISSET(dbp, DB_AM_SUBDB)) F_SET(&meta->dbmeta, BTM_SUBDB); @@ -385,14 +408,170 @@ again: if (meta->dbmeta.magic != 0) { F_SET(&meta->dbmeta, BTM_RECNO); memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN); + t = dbp->bt_internal; meta->maxkey = t->bt_maxkey; meta->minkey = t->bt_minkey; meta->re_len = t->re_len; - meta->re_pad = t->re_pad; + meta->re_pad = (u_int32_t)t->re_pad; +} + +/* + * __bam_new_file -- + * Create the necessary pages to begin a new database file. + * + * This code appears more complex than it is because of the two cases (named + * and unnamed). The way to read the code is that for each page being created, + * there are three parts: 1) a "get page" chunk (which either uses malloc'd + * memory or calls __memp_fget), 2) the initialization, and 3) the "put page" + * chunk which either does a fop write or an __memp_fput. 
+ * + * PUBLIC: int __bam_new_file __P((DB *, DB_TXN *, DB_FH *, const char *)); + */ +int +__bam_new_file(dbp, txn, fhp, name) + DB *dbp; + DB_TXN *txn; + DB_FH *fhp; + const char *name; +{ + BTMETA *meta; + DB_ENV *dbenv; + DB_LSN lsn; + DB_MPOOLFILE *mpf; + DB_PGINFO pginfo; + DBT pdbt; + PAGE *root; + db_pgno_t pgno; + int ret; + void *buf; - /* If necessary, log the meta-data and root page creates. */ - if ((ret = __db_log_page(dbp, - name, &orig_lsn, base_pgno, (PAGE *)meta)) != 0) + dbenv = dbp->dbenv; + mpf = dbp->mpf; + root = NULL; + meta = NULL; + memset(&pdbt, 0, sizeof(pdbt)); + buf = NULL; + + /* Build meta-data page. */ + + if (name == NULL) { + pgno = PGNO_BASE_MD; + ret = __memp_fget(mpf, &pgno, DB_MPOOL_CREATE, &meta); + } else { + pginfo.db_pagesize = dbp->pgsize; + pginfo.flags = + F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP)); + pginfo.type = dbp->type; + pdbt.data = &pginfo; + pdbt.size = sizeof(pginfo); + ret = __os_calloc(dbp->dbenv, 1, dbp->pgsize, &buf); + meta = (BTMETA *)buf; + } + if (ret != 0) + return (ret); + + LSN_NOT_LOGGED(lsn); + __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn); + meta->root = 1; + meta->dbmeta.last_pgno = 1; + + if (name == NULL) + ret = __memp_fput(mpf, meta, DB_MPOOL_DIRTY); + else { + if ((ret = __db_pgout(dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0) + goto err; + ret = __fop_write(dbenv, txn, name, + DB_APP_DATA, fhp, dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, + F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0); + } + if (ret != 0) + goto err; + meta = NULL; + + /* Now build root page. */ + if (name == NULL) { + pgno = 1; + if ((ret = + __memp_fget(mpf, &pgno, DB_MPOOL_CREATE, &root)) != 0) + goto err; + } else { +#ifdef DIAGNOSTIC + memset(buf, CLEAR_BYTE, dbp->pgsize); +#endif + root = (PAGE *)buf; + } + + P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, + LEAFLEVEL, dbp->type == DB_RECNO ? 
P_LRECNO : P_LBTREE); + LSN_NOT_LOGGED(root->lsn); + + if (name == NULL) + ret = __memp_fput(mpf, root, DB_MPOOL_DIRTY); + else { + if ((ret = __db_pgout(dbenv, root->pgno, root, &pdbt)) != 0) + goto err; + ret = __fop_write(dbenv, txn, name, + DB_APP_DATA, fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, + F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0); + } + if (ret != 0) + goto err; + root = NULL; + +err: if (buf != NULL) + __os_free(dbenv, buf); + else { + if (meta != NULL) + (void)__memp_fput(mpf, meta, 0); + if (root != NULL) + (void)__memp_fput(mpf, root, 0); + } + return (ret); +} + +/* + * __bam_new_subdb -- + * Create a metadata page and a root page for a new btree. + * + * PUBLIC: int __bam_new_subdb __P((DB *, DB *, DB_TXN *)); + */ +int +__bam_new_subdb(mdbp, dbp, txn) + DB *mdbp, *dbp; + DB_TXN *txn; +{ + BTMETA *meta; + DBC *dbc; + DB_ENV *dbenv; + DB_LOCK metalock; + DB_LSN lsn; + DB_MPOOLFILE *mpf; + PAGE *root; + int ret, t_ret; + + dbenv = mdbp->dbenv; + mpf = mdbp->mpf; + dbc = NULL; + meta = NULL; + root = NULL; + + if ((ret = __db_cursor(mdbp, txn, + &dbc, CDB_LOCKING(dbenv) ? DB_WRITECURSOR : 0)) != 0) + return (ret); + + /* Get, and optionally create the metadata page. */ + if ((ret = __db_lget(dbc, + 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + if ((ret = + __memp_fget(mpf, &dbp->meta_pgno, DB_MPOOL_CREATE, &meta)) != 0) + goto err; + + /* Build meta-data page. */ + lsn = meta->dbmeta.lsn; + __bam_init_meta(dbp, meta, dbp->meta_pgno, &lsn); + if ((ret = __db_log_page(mdbp, + txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0) goto err; /* Create and initialize a root page. 
*/ @@ -401,68 +580,35 @@ again: if (meta->dbmeta.magic != 0) { goto err; root->level = LEAFLEVEL; - if (dbp->open_txn != NULL && (ret = __bam_root_log(dbp->dbenv, - dbp->open_txn, &meta->dbmeta.lsn, 0, dbp->log_fileid, + if (DBENV_LOGGING(dbenv) && + (ret = __bam_root_log(mdbp, txn, &meta->dbmeta.lsn, 0, meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0) goto err; meta->root = root->pgno; - - DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name); - if ((ret = __db_log_page(dbp, - name, &root->lsn, root->pgno, root)) != 0) + if ((ret = + __db_log_page(mdbp, txn, &root->lsn, root->pgno, root)) != 0) goto err; - DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, name); - - t->bt_meta = base_pgno; - t->bt_root = root->pgno; /* Release the metadata and root pages. */ - if ((ret = memp_fput(dbp->mpf, meta, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fput(mpf, meta, DB_MPOOL_DIRTY)) != 0) goto err; meta = NULL; - if ((ret = memp_fput(dbp->mpf, root, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fput(mpf, root, DB_MPOOL_DIRTY)) != 0) goto err; root = NULL; - - /* - * Flush the metadata and root pages to disk. - * - * !!! - * It's not useful to return not-yet-flushed here -- convert it to - * an error. - */ - if ((ret = memp_fsync(dbp->mpf)) == DB_INCOMPLETE) { - __db_err(dbp->dbenv, "Metapage flush failed"); - ret = EINVAL; - } - DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name); - -done: /* - * !!! - * We already did an insert and so the last-page-inserted has been - * set. I'm not sure where the *right* place to clear this value - * is, it's not intuitively obvious that it belongs here. - */ - t->bt_lpgno = PGNO_INVALID; - err: -DB_TEST_RECOVERY_LABEL - /* Put any remaining pages back. 
*/ if (meta != NULL) - if ((t_ret = memp_fput(dbp->mpf, meta, 0)) != 0 && - ret == 0) + if ((t_ret = __memp_fput(mpf, meta, 0)) != 0 && ret == 0) ret = t_ret; if (root != NULL) - if ((t_ret = memp_fput(dbp->mpf, root, 0)) != 0 && - ret == 0) + if ((t_ret = __memp_fput(mpf, root, 0)) != 0 && ret == 0) + ret = t_ret; + if (LOCK_ISSET(metalock)) + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if (dbc != NULL) + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) ret = t_ret; - - /* We can release the metapage lock when we are done. */ - if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) - ret = t_ret; - - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; return (ret); } diff --git a/db/btree/bt_put.c b/db/btree/bt_put.c index 19a04526d..b98c6c579 100644 --- a/db/btree/bt_put.c +++ b/db/btree/bt_put.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ /* @@ -43,7 +43,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_put.c,v 11.46 2001/01/17 18:48:46 bostic Exp $"; +static const char revid[] = "$Id: bt_put.c,v 11.78 2003/10/31 15:07:40 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -53,12 +53,18 @@ static const char revid[] = "$Id: bt_put.c,v 11.46 2001/01/17 18:48:46 bostic Ex #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/mp.h" +static int __bam_build + __P((DBC *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t)); static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t)); static int __bam_ovput __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *)); +static u_int32_t + __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t)); /* * __bam_iitem -- @@ -72,11 +78,13 @@ __bam_iitem(dbc, key, data, op, flags) DBT *key, *data; u_int32_t op, flags; { + DB_ENV *dbenv; BKEYDATA *bk, bk_tmp; BTREE *t; BTREE_CURSOR *cp; DB *dbp; DBT bk_hdr, tdbt; + DB_MPOOLFILE *mpf; PAGE *h; db_indx_t indx; u_int32_t data_size, have_bytes, need_bytes, needed; @@ -85,6 +93,8 @@ __bam_iitem(dbc, key, data, op, flags) COMPQUIET(bk, NULL); dbp = dbc->dbp; + dbenv = dbp->dbenv; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; t = dbp->bt_internal; h = cp->page; @@ -95,11 +105,9 @@ __bam_iitem(dbc, key, data, op, flags) * Fixed-length records with partial puts: it's an error to specify * anything other simple overwrite. 
*/ - if (F_ISSET(dbp, DB_RE_FIXEDLEN) && - F_ISSET(data, DB_DBT_PARTIAL) && data->dlen != data->size) { - data_size = data->size; - goto len_err; - } + if (F_ISSET(dbp, DB_AM_FIXEDLEN) && + F_ISSET(data, DB_DBT_PARTIAL) && data->size != data->dlen) + return (__db_rec_repl(dbenv, data->size, data->dlen)); /* * Figure out how much space the data will take, including if it's a @@ -110,16 +118,14 @@ __bam_iitem(dbc, key, data, op, flags) * the fixed-length record size. */ data_size = F_ISSET(data, DB_DBT_PARTIAL) ? - __bam_partsize(op, data, h, indx) : data->size; + __bam_partsize(dbp, op, data, h, indx) : data->size; padrec = 0; - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if (data_size > t->re_len) { -len_err: __db_err(dbp->dbenv, - "Length improper for fixed length record %lu", - (u_long)data_size); - return (EINVAL); - } - if (data_size < t->re_len) { + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { + if (data_size > t->re_len) + return (__db_rec_toobig(dbenv, data_size, t->re_len)); + + /* Records that are deleted anyway needn't be padded out. */ + if (!LF_ISSET(BI_DELETED) && data_size < t->re_len) { padrec = 1; data_size = t->re_len; } @@ -146,12 +152,12 @@ len_err: __db_err(dbp->dbenv, */ if (op == DB_CURRENT && dbp->dup_compare != NULL) { if ((ret = __bam_cmp(dbp, data, h, - indx + (TYPE(h) == P_LBTREE ? O_INDX : 0), - dbp->dup_compare, &cmp)) != 0) + indx + (TYPE(h) == P_LBTREE ? O_INDX : 0), + dbp->dup_compare, &cmp)) != 0) return (ret); if (cmp != 0) { - __db_err(dbp->dbenv, - "Current data differs from put data"); + __db_err(dbenv, + "Existing data sorts differently from put data"); return (EINVAL); } } @@ -190,7 +196,7 @@ len_err: __db_err(dbp->dbenv, */ bigkey = 0; if (op == DB_CURRENT) { - bk = GET_BKEYDATA(h, + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? 
O_INDX : 0)); if (B_TYPE(bk->type) == B_KEYDATA) have_bytes = BKEYDATA_PSIZE(bk->len); @@ -210,19 +216,14 @@ len_err: __db_err(dbp->dbenv, needed += need_bytes - have_bytes; break; default: - return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op)); + return (__db_unknown_flag(dbenv, "DB->put", op)); } /* * If there's not enough room, or the user has put a ceiling on the * number of keys permitted in the page, split the page. - * - * XXX - * The t->bt_maxkey test here may be insufficient -- do we have to - * check in the btree split code, so we don't undo it there!?!? */ - if (P_FREESPACE(h) < needed || - (t->bt_maxkey != 0 && NUM_ENT(h) > t->bt_maxkey)) + if (P_FREESPACE(dbp, h) < needed) return (DB_NEEDSPLIT); /* @@ -286,23 +287,24 @@ len_err: __db_err(dbp->dbenv, * we deadlock or fail while deleting the overflow item or * replacing the non-overflow item, a subsequent cursor close * will try and remove the item because the cursor's delete - * flag is set + * flag is set. */ (void)__bam_ca_delete(dbp, PGNO(h), indx, 0); if (TYPE(h) == P_LBTREE) { ++indx; dupadjust = 1; + } - /* - * In a Btree deleted records aren't counted (deleted - * records are counted in a Recno because all accesses - * are based on record number). If it's a Btree and - * it's a DB_CURRENT operation overwriting a previously - * deleted record, increment the record count. - */ + /* + * In a Btree deleted records aren't counted (deleted records + * are counted in a Recno because all accesses are based on + * record number). If it's a Btree and it's a DB_CURRENT + * operation overwriting a previously deleted record, increment + * the record count. + */ + if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) was_deleted = B_DISSET(bk->type); - } /* * 4. Delete and re-add the data item. @@ -323,11 +325,16 @@ len_err: __db_err(dbp->dbenv, replace = 1; break; default: - return (__db_unknown_flag(dbp->dbenv, "__bam_iitem", op)); + return (__db_unknown_flag(dbenv, "DB->put", op)); } /* Add the data. 
*/ if (bigdata) { + /* + * We do not have to handle deleted (BI_DELETED) records + * in this case; the actual records should never be created. + */ + DB_ASSERT(!LF_ISSET(BI_DELETED)); if ((ret = __bam_ovput(dbc, B_OVERFLOW, PGNO_INVALID, h, indx, data)) != 0) return (ret); @@ -347,7 +354,7 @@ len_err: __db_err(dbp->dbenv, if (ret != 0) return (ret); } - if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fset(mpf, h, DB_MPOOL_DIRTY)) != 0) return (ret); /* @@ -375,7 +382,7 @@ len_err: __db_err(dbp->dbenv, * up at least 25% of the space on the page. If it does, move it onto * its own page. */ - if (dupadjust && P_FREESPACE(h) <= dbp->pgsize / 2) { + if (dupadjust && P_FREESPACE(dbp, h) <= dbp->pgsize / 2) { if ((ret = __bam_dup_convert(dbc, h, indx - O_INDX)) != 0) return (ret); } @@ -390,11 +397,10 @@ len_err: __db_err(dbp->dbenv, /* * __bam_partsize -- * Figure out how much space a partial data item is in total. - * - * PUBLIC: u_int32_t __bam_partsize __P((u_int32_t, DBT *, PAGE *, u_int32_t)); */ -u_int32_t -__bam_partsize(op, data, h, indx) +static u_int32_t +__bam_partsize(dbp, op, data, h, indx) + DB *dbp; u_int32_t op, indx; DBT *data; PAGE *h; @@ -413,38 +419,18 @@ __bam_partsize(op, data, h, indx) * Otherwise, it's the data provided plus any already existing data * that we're not replacing. */ - bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); nbytes = B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len; - /* - * There are really two cases here: - * - * Case 1: We are replacing some bytes that do not exist (i.e., they - * are past the end of the record). In this case the number of bytes - * we are replacing is irrelevant and all we care about is how many - * bytes we are going to add from offset. So, the new record length - * is going to be the size of the new bytes (size) plus wherever those - * new bytes begin (doff). 
- * - * Case 2: All the bytes we are replacing exist. Therefore, the new - * size is the oldsize (nbytes) minus the bytes we are replacing (dlen) - * plus the bytes we are adding (size). - */ - if (nbytes < data->doff + data->dlen) /* Case 1 */ - return (data->doff + data->size); - - return (nbytes + data->size - data->dlen); /* Case 2 */ + return (__db_partsize(nbytes, data)); } /* * __bam_build -- * Build the real record for a partial put, or short fixed-length record. - * - * PUBLIC: int __bam_build __P((DBC *, u_int32_t, - * PUBLIC: DBT *, PAGE *, u_int32_t, u_int32_t)); */ -int +static int __bam_build(dbc, op, dbt, h, indx, nbytes) DBC *dbc; u_int32_t op, indx, nbytes; @@ -454,9 +440,8 @@ __bam_build(dbc, op, dbt, h, indx, nbytes) BKEYDATA *bk, tbk; BOVERFLOW *bo; BTREE *t; - BTREE_CURSOR *cp; DB *dbp; - DBT copy; + DBT copy, *rdata; u_int32_t len, tlen; u_int8_t *p; int ret; @@ -464,26 +449,26 @@ __bam_build(dbc, op, dbt, h, indx, nbytes) COMPQUIET(bo, NULL); dbp = dbc->dbp; - cp = (BTREE_CURSOR *) dbc->internal; t = dbp->bt_internal; /* We use the record data return memory, it's only a short-term use. */ - if (dbc->rdata.ulen < nbytes) { + rdata = &dbc->my_rdata; + if (rdata->ulen < nbytes) { if ((ret = __os_realloc(dbp->dbenv, - nbytes, NULL, &dbc->rdata.data)) != 0) { - dbc->rdata.ulen = 0; - dbc->rdata.data = NULL; + nbytes, &rdata->data)) != 0) { + rdata->ulen = 0; + rdata->data = NULL; return (ret); } - dbc->rdata.ulen = nbytes; + rdata->ulen = nbytes; } /* * We use nul or pad bytes for any part of the record that isn't * specified; get it over with. */ - memset(dbc->rdata.data, - F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_pad : 0, nbytes); + memset(rdata->data, + F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_pad : 0, nbytes); /* * In the next clauses, we need to do three things: a) set p to point @@ -495,14 +480,15 @@ __bam_build(dbc, op, dbt, h, indx, nbytes) * the chase. 
*/ if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) { - p = (u_int8_t *)dbc->rdata.data + dbt->doff; + p = (u_int8_t *)rdata->data + dbt->doff; tlen = dbt->doff; goto user_copy; } /* Find the current record. */ if (indx < NUM_ENT(h)) { - bk = GET_BKEYDATA(h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? + O_INDX : 0)); bo = (BOVERFLOW *)bk; } else { bk = &tbk; @@ -516,12 +502,12 @@ __bam_build(dbc, op, dbt, h, indx, nbytes) */ memset(©, 0, sizeof(copy)); if ((ret = __db_goff(dbp, ©, bo->tlen, - bo->pgno, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) + bo->pgno, &rdata->data, &rdata->ulen)) != 0) return (ret); /* Skip any leading data from the original record. */ tlen = dbt->doff; - p = (u_int8_t *)dbc->rdata.data + dbt->doff; + p = (u_int8_t *)rdata->data + dbt->doff; /* * Copy in any trailing data from the original record. @@ -542,10 +528,10 @@ __bam_build(dbc, op, dbt, h, indx, nbytes) } } else { /* Copy in any leading data from the original record. */ - memcpy(dbc->rdata.data, + memcpy(rdata->data, bk->data, dbt->doff > bk->len ? bk->len : dbt->doff); tlen = dbt->doff; - p = (u_int8_t *)dbc->rdata.data + dbt->doff; + p = (u_int8_t *)rdata->data + dbt->doff; /* Copy in any trailing data from the original record. */ len = dbt->doff + dbt->dlen; @@ -564,11 +550,11 @@ user_copy: tlen += dbt->size; /* Set the DBT to reference our new record. */ - dbc->rdata.size = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : tlen; - dbc->rdata.dlen = 0; - dbc->rdata.doff = 0; - dbc->rdata.flags = 0; - *dbt = dbc->rdata; + rdata->size = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 
t->re_len : tlen; + rdata->dlen = 0; + rdata->doff = 0; + rdata->flags = 0; + *dbt = *rdata; return (0); } @@ -591,6 +577,7 @@ __bam_ritem(dbc, h, indx, data) db_indx_t cnt, lo, ln, min, off, prefix, suffix; int32_t nbytes; int ret; + db_indx_t *inp; u_int8_t *p, *t; dbp = dbc->dbp; @@ -600,10 +587,10 @@ __bam_ritem(dbc, h, indx, data) * to insert and whether it fits is handled in the caller. All we do * here is manage the page shuffling. */ - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); /* Log the change. */ - if (DB_LOGGING(dbc)) { + if (DBC_LOGGING(dbc)) { /* * We might as well check to see if the two data items share * a common prefix and suffix -- it can save us a lot of log @@ -627,17 +614,18 @@ __bam_ritem(dbc, h, indx, data) orig.size = bk->len - (prefix + suffix); repl.data = (u_int8_t *)data->data + prefix; repl.size = data->size - (prefix + suffix); - if ((ret = __bam_repl_log(dbp->dbenv, dbc->txn, - &LSN(h), 0, dbp->log_fileid, PGNO(h), &LSN(h), - (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type), + if ((ret = __bam_repl_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h), + &LSN(h), (u_int32_t)indx, (u_int32_t)B_DISSET(bk->type), &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0) return (ret); - } + } else + LSN_NOT_LOGGED(LSN(h)); /* * Set references to the first in-use byte on the page and the * first byte of the item being replaced. */ + inp = P_INP(dbp, h); p = (u_int8_t *)h + HOFFSET(h); t = (u_int8_t *)bk; @@ -648,19 +636,19 @@ __bam_ritem(dbc, h, indx, data) * the regions overlap. */ lo = BKEYDATA_SIZE(bk->len); - ln = BKEYDATA_SIZE(data->size); + ln = (db_indx_t)BKEYDATA_SIZE(data->size); if (lo != ln) { nbytes = lo - ln; /* Signed difference. */ if (p == t) /* First index is fast. */ - h->inp[indx] += nbytes; + inp[indx] += nbytes; else { /* Else, shift the page. */ - memmove(p + nbytes, p, t - p); + memmove(p + nbytes, p, (size_t)(t - p)); /* Adjust the indices' offsets. 
*/ - off = h->inp[indx]; + off = inp[indx]; for (cnt = 0; cnt < NUM_ENT(h); ++cnt) - if (h->inp[cnt] <= off) - h->inp[cnt] += nbytes; + if (inp[cnt] <= off) + inp[cnt] += nbytes; } /* Clean up the page and adjust the item's reference. */ @@ -688,30 +676,35 @@ __bam_dup_convert(dbc, h, indx) PAGE *h; u_int32_t indx; { - BTREE_CURSOR *cp; BKEYDATA *bk; DB *dbp; DBT hdr; + DB_MPOOLFILE *mpf; PAGE *dp; - db_indx_t cnt, cpindx, dindx, first, sz; + db_indx_t cnt, cpindx, dindx, first, *inp, sz; int ret; dbp = dbc->dbp; - cp = (BTREE_CURSOR *)dbc->internal; + mpf = dbp->mpf; + inp = P_INP(dbp, h); /* * Count the duplicate records and calculate how much room they're * using on the page. */ - while (indx > 0 && h->inp[indx] == h->inp[indx - P_INDX]) + while (indx > 0 && inp[indx] == inp[indx - P_INDX]) indx -= P_INDX; - for (cnt = 0, sz = 0, first = indx;; ++cnt, indx += P_INDX) { - if (indx >= NUM_ENT(h) || h->inp[first] != h->inp[indx]) - break; - bk = GET_BKEYDATA(h, indx); - sz += B_TYPE(bk->type) == B_KEYDATA ? - BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; - bk = GET_BKEYDATA(h, indx + O_INDX); + + /* Count the key once. */ + bk = GET_BKEYDATA(dbp, h, indx); + sz = B_TYPE(bk->type) == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; + + /* Sum up all the data items. */ + for (cnt = 0, first = indx; + inp[first] == inp[indx] && indx < NUM_ENT(h); + ++cnt, indx += P_INDX) { + bk = GET_BKEYDATA(dbp, h, indx + O_INDX); sz += B_TYPE(bk->type) == B_KEYDATA ? BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; } @@ -766,7 +759,7 @@ __bam_dup_convert(dbc, h, indx) * deleted entries are discarded (if the deleted entry is * overflow, then free up those pages). */ - bk = GET_BKEYDATA(h, dindx + 1); + bk = GET_BKEYDATA(dbp, h, dindx + 1); hdr.data = bk; hdr.size = B_TYPE(bk->type) == B_KEYDATA ? 
BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE; @@ -778,7 +771,7 @@ __bam_dup_convert(dbc, h, indx) */ if (B_TYPE(bk->type) == B_OVERFLOW && (ret = __db_doff(dbc, - (GET_BOVERFLOW(h, dindx + 1))->pgno)) != 0) + (GET_BOVERFLOW(dbp, h, dindx + 1))->pgno)) != 0) goto err; } else { if ((ret = __db_pitem( @@ -802,17 +795,17 @@ __bam_dup_convert(dbc, h, indx) /* Put in a new data item that points to the duplicates page. */ if ((ret = __bam_ovput(dbc, - B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0) + B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0) goto err; - /* Adjust cursors for all the above movments. */ + /* Adjust cursors for all the above movements. */ if ((ret = __bam_ca_di(dbc, - PGNO(h), first + P_INDX, first + P_INDX - indx)) != 0) + PGNO(h), first + P_INDX, (int)(first + P_INDX - indx))) != 0) goto err; - return (memp_fput(dbp->mpf, dp, DB_MPOOL_DIRTY)); + return (__memp_fput(mpf, dp, DB_MPOOL_DIRTY)); -err: (void)__db_free(dbc, dp); +err: (void)__memp_fput(mpf, dp, 0); return (ret); } diff --git a/db/btree/bt_rec.c b/db/btree/bt_rec.c index 24dc9bc6a..1587028b3 100644 --- a/db/btree/bt_rec.c +++ b/db/btree/bt_rec.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_rec.c,v 11.35 2001/01/10 16:24:47 ubell Exp $"; +static const char revid[] = "$Id: bt_rec.c,v 11.64 2003/09/13 18:48:58 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,286 +18,17 @@ static const char revid[] = "$Id: bt_rec.c,v 11.35 2001/01/10 16:24:47 ubell Exp #endif #include "db_int.h" -#include "db_page.h" -#include "hash.h" -#include "btree.h" -#include "log.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" #define IS_BTREE_PAGE(pagep) \ (TYPE(pagep) == P_IBTREE || \ TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP) -/* - * __bam_pg_alloc_recover -- - * Recovery function for pg_alloc. - * - * PUBLIC: int __bam_pg_alloc_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__bam_pg_alloc_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - __bam_pg_alloc_args *argp; - DB *file_dbp; - DBC *dbc; - DBMETA *meta; - DB_MPOOLFILE *mpf; - PAGE *pagep; - db_pgno_t pgno; - int cmp_n, cmp_p, level, modified, ret; - - REC_PRINT(__bam_pg_alloc_print); - REC_INTRO(__bam_pg_alloc_read, 0); - - /* - * Fix up the allocated page. If we're redoing the operation, we have - * to get the page (creating it if it doesn't exist), and update its - * LSN. If we're undoing the operation, we have to reset the page's - * LSN and put it on the free list. - * - * Fix up the metadata page. If we're redoing the operation, we have - * to get the metadata page and update its LSN and its free pointer. - * If we're undoing the operation and the page was ever created, we put - * it on the freelist. - */ - pgno = PGNO_BASE_MD; - meta = NULL; - if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) { - /* The metadata page must always exist on redo. 
*/ - if (DB_REDO(op)) { - (void)__db_pgerr(file_dbp, pgno); - goto out; - } else - goto done; - } - if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) { - /* - * We specify creation and check for it later, because this - * operation was supposed to create the page, and even in - * the undo case it's going to get linked onto the freelist - * which we're also fixing up. - */ - (void)__db_pgerr(file_dbp, argp->pgno); - goto err; - } - - /* Fix up the allocated page. */ - modified = 0; - cmp_n = log_compare(lsnp, &LSN(pagep)); - cmp_p = log_compare(&LSN(pagep), &argp->page_lsn); - - /* - * If an inital allocation is aborted and then reallocated - * during an archival restore the log record will have - * an LSN for the page but the page will be empty. - */ - if (IS_ZERO_LSN(LSN(pagep))) - cmp_p = 0; - CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->page_lsn); - /* - * If we we rolled back this allocation previously during an - * archive restore, the page may have the LSN of the meta page - * at the point of the roll back. This will be no more - * than the LSN of the metadata page at the time of this allocation. - */ - if (DB_REDO(op) && - (cmp_p == 0 || - (IS_ZERO_LSN(argp->page_lsn) && - log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) { - /* Need to redo update described. */ - switch (argp->ptype) { - case P_LBTREE: - case P_LRECNO: - case P_LDUP: - level = LEAFLEVEL; - break; - default: - level = 0; - break; - } - P_INIT(pagep, file_dbp->pgsize, - argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype); - - pagep->lsn = *lsnp; - modified = 1; - } else if (cmp_n == 0 && DB_UNDO(op)) { - /* - * Undo the allocation, reinitialize the page and - * link its next pointer to the free list. - */ - P_INIT(pagep, file_dbp->pgsize, - argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); - - pagep->lsn = argp->page_lsn; - modified = 1; - } - - if ((ret = memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) { - goto err; - } - - /* - * If the page was newly created, put it on the limbo list. - */ - if (IS_ZERO_LSN(LSN(pagep)) && - IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) { - /* Put the page in limbo.*/ - if ((ret = __db_add_limbo(dbenv, - info, argp->fileid, argp->pgno, 1)) != 0) - goto err; - } - - /* Fix up the metadata page. */ - modified = 0; - cmp_n = log_compare(lsnp, &LSN(meta)); - cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); - CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn); - if (cmp_p == 0 && DB_REDO(op)) { - /* Need to redo update described. */ - LSN(meta) = *lsnp; - meta->free = argp->next; - modified = 1; - } else if (cmp_n == 0 && DB_UNDO(op)) { - /* Need to undo update described. */ - LSN(meta) = argp->meta_lsn; - - /* - * If the page has a zero LSN then its newly created - * and will go into limbo rather than directly on the - * free list. - */ - if (!IS_ZERO_LSN(argp->page_lsn)) - meta->free = argp->pgno; - modified = 1; - } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) - goto out; - /* - * This could be the metapage from a subdb which is read from disk - * to recover its creation. - */ - if (F_ISSET(file_dbp, DB_AM_SUBDB)) - switch (argp->type) { - case P_BTREEMETA: - case P_HASHMETA: - case P_QAMMETA: - file_dbp->sync(file_dbp, 0); - break; - } - -done: *lsnp = argp->prev_lsn; - ret = 0; - - if (0) { -err: - if (meta != NULL) - (void)memp_fput(mpf, meta, 0); - } -out: REC_CLOSE; -} - -/* - * __bam_pg_free_recover -- - * Recovery function for pg_free. 
- * - * PUBLIC: int __bam_pg_free_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__bam_pg_free_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - __bam_pg_free_args *argp; - DB *file_dbp; - DBC *dbc; - DBMETA *meta; - DB_LSN copy_lsn; - DB_MPOOLFILE *mpf; - PAGE *pagep; - db_pgno_t pgno; - int cmp_n, cmp_p, modified, ret; - - COMPQUIET(info, NULL); - REC_PRINT(__bam_pg_free_print); - REC_INTRO(__bam_pg_free_read, 1); - - /* - * Fix up the freed page. If we're redoing the operation we get the - * page and explicitly discard its contents, then update its LSN. If - * we're undoing the operation, we get the page and restore its header. - * Create the page if necessary, we may be freeing an aborted - * create. - */ - if ((ret = memp_fget(mpf, &argp->pgno, DB_MPOOL_CREATE, &pagep)) != 0) - goto out; - modified = 0; - __ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN)); - cmp_n = log_compare(lsnp, &LSN(pagep)); - cmp_p = log_compare(&LSN(pagep), &copy_lsn); - CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn); - if (DB_REDO(op) && - (cmp_p == 0 || - (IS_ZERO_LSN(copy_lsn) && - log_compare(&LSN(pagep), &argp->meta_lsn) <= 0))) { - /* Need to redo update described. */ - P_INIT(pagep, file_dbp->pgsize, - argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); - pagep->lsn = *lsnp; - - modified = 1; - } else if (cmp_n == 0 && DB_UNDO(op)) { - /* Need to undo update described. */ - memcpy(pagep, argp->header.data, argp->header.size); - - modified = 1; - } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) - goto out; - - /* - * Fix up the metadata page. If we're redoing or undoing the operation - * we get the page and update its LSN and free pointer. - */ - pgno = PGNO_BASE_MD; - if ((ret = memp_fget(mpf, &pgno, 0, &meta)) != 0) { - /* The metadata page must always exist. 
*/ - (void)__db_pgerr(file_dbp, pgno); - goto out; - } - - modified = 0; - cmp_n = log_compare(lsnp, &LSN(meta)); - cmp_p = log_compare(&LSN(meta), &argp->meta_lsn); - CHECK_LSN(op, cmp_p, &LSN(meta), &argp->meta_lsn); - if (cmp_p == 0 && DB_REDO(op)) { - /* Need to redo the deallocation. */ - meta->free = argp->pgno; - LSN(meta) = *lsnp; - modified = 1; - } else if (cmp_n == 0 && DB_UNDO(op)) { - /* Need to undo the deallocation. */ - meta->free = argp->next; - LSN(meta) = argp->meta_lsn; - modified = 1; - } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) - goto out; - -done: *lsnp = argp->prev_lsn; - ret = 0; - -out: REC_CLOSE; -} - /* * __bam_split_recover -- * Recovery function for split. @@ -320,7 +51,7 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; db_pgno_t pgno, root_pgno; u_int32_t ptype; - int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret; + int cmp, l_update, p_update, r_update, rc, ret, ret_l, rootsplit, t_ret; COMPQUIET(info, NULL); REC_PRINT(__bam_split_print); @@ -345,16 +76,16 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) * so it's got to be aligned. Copying it into allocated memory is * the only way to guarantee this. */ - if ((ret = __os_malloc(dbenv, argp->pg.size, NULL, &sp)) != 0) + if ((ret = __os_malloc(dbenv, argp->pg.size, &sp)) != 0) goto out; memcpy(sp, argp->pg.data, argp->pg.size); pgno = PGNO(sp); root_pgno = argp->root_pgno; - rootsplit = pgno == root_pgno; - if (memp_fget(mpf, &argp->left, 0, &lp) != 0) + rootsplit = root_pgno != PGNO_INVALID; + if ((ret_l = __memp_fget(mpf, &argp->left, 0, &lp)) != 0) lp = NULL; - if (memp_fget(mpf, &argp->right, 0, &rp) != 0) + if (__memp_fget(mpf, &argp->right, 0, &rp) != 0) rp = NULL; if (DB_REDO(op)) { @@ -368,8 +99,8 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) * same reason. 
*/ if (rootsplit) { - if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { - (void)__db_pgerr(file_dbp, pgno); + if ((ret = __memp_fget(mpf, &pgno, 0, &pp)) != 0) { + ret = __db_pgerr(file_dbp, pgno, ret); pp = NULL; goto out; } @@ -377,7 +108,7 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) CHECK_LSN(op, cmp, &LSN(pp), &LSN(argp->pg.data)); p_update = cmp == 0; } else if (lp == NULL) { - (void)__db_pgerr(file_dbp, argp->left); + ret = __db_pgerr(file_dbp, argp->left, ret_l); goto out; } @@ -400,10 +131,8 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) goto check_next; /* Allocate and initialize new left/right child pages. */ - if ((ret = - __os_malloc(dbenv, file_dbp->pgsize, NULL, &_lp)) != 0 - || (ret = - __os_malloc(dbenv, file_dbp->pgsize, NULL, &_rp)) != 0) + if ((ret = __os_malloc(dbenv, file_dbp->pgsize, &_lp)) != 0 || + (ret = __os_malloc(dbenv, file_dbp->pgsize, &_rp)) != 0) goto out; if (rootsplit) { P_INIT(_lp, file_dbp->pgsize, argp->left, @@ -431,31 +160,31 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) goto out; /* If the left child is wrong, update it. */ - if (lp == NULL && (ret = - memp_fget(mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) { - (void)__db_pgerr(file_dbp, argp->left); + if (lp == NULL && (ret = __memp_fget( + mpf, &argp->left, DB_MPOOL_CREATE, &lp)) != 0) { + ret = __db_pgerr(file_dbp, argp->left, ret); lp = NULL; goto out; } if (l_update) { memcpy(lp, _lp, file_dbp->pgsize); lp->lsn = *lsnp; - if ((ret = memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) goto out; lp = NULL; } /* If the right child is wrong, update it. 
*/ - if (rp == NULL && (ret = memp_fget(mpf, - &argp->right, DB_MPOOL_CREATE, &rp)) != 0) { - (void)__db_pgerr(file_dbp, argp->right); + if (rp == NULL && (ret = __memp_fget( + mpf, &argp->right, DB_MPOOL_CREATE, &rp)) != 0) { + ret = __db_pgerr(file_dbp, argp->right, ret); rp = NULL; goto out; } if (r_update) { memcpy(rp, _rp, file_dbp->pgsize); rp->lsn = *lsnp; - if ((ret = memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) goto out; rp = NULL; } @@ -477,11 +206,11 @@ __bam_split_recover(dbenv, dbtp, lsnp, op, info) P_INIT(pp, file_dbp->pgsize, root_pgno, PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype); - RE_NREC_SET(pp, - rc ? __bam_total(_lp) + __bam_total(_rp) : 0); + RE_NREC_SET(pp, rc ? __bam_total(file_dbp, _lp) + + __bam_total(file_dbp, _rp) : 0); pp->lsn = *lsnp; - if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) goto out; pp = NULL; } @@ -494,8 +223,9 @@ check_next: /* * page must exist because we're redoing the operation. */ if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { - if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { - (void)__db_pgerr(file_dbp, argp->npgno); + if ((ret = + __memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + ret = __db_pgerr(file_dbp, argp->npgno, ret); np = NULL; goto out; } @@ -505,7 +235,7 @@ check_next: /* PREV_PGNO(np) = argp->right; np->lsn = *lsnp; if ((ret = - memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0) + __memp_fput(mpf, np, DB_MPOOL_DIRTY)) != 0) goto out; np = NULL; } @@ -518,13 +248,13 @@ check_next: /* * the adds onto the page that caused the split, and there's * really no undo-ing to be done. 
*/ - if ((ret = memp_fget(mpf, &pgno, 0, &pp)) != 0) { + if ((ret = __memp_fget(mpf, &pgno, 0, &pp)) != 0) { pp = NULL; goto lrundo; } if (log_compare(lsnp, &LSN(pp)) == 0) { memcpy(pp, argp->pg.data, argp->pg.size); - if ((ret = memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fput(mpf, pp, DB_MPOOL_DIRTY)) != 0) goto out; pp = NULL; } @@ -542,7 +272,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { log_compare(lsnp, &LSN(lp)) == 0) { lp->lsn = argp->llsn; if ((ret = - memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) + __memp_fput(mpf, lp, DB_MPOOL_DIRTY)) != 0) goto out; lp = NULL; } @@ -550,7 +280,7 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { log_compare(lsnp, &LSN(rp)) == 0) { rp->lsn = argp->rlsn; if ((ret = - memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) + __memp_fput(mpf, rp, DB_MPOOL_DIRTY)) != 0) goto out; rp = NULL; } @@ -565,14 +295,15 @@ lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { * if there's nothing to undo. */ if (!rootsplit && !IS_ZERO_LSN(argp->nlsn)) { - if ((ret = memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { + if ((ret = + __memp_fget(mpf, &argp->npgno, 0, &np)) != 0) { np = NULL; goto done; } if (log_compare(lsnp, &LSN(np)) == 0) { PREV_PGNO(np) = argp->left; np->lsn = argp->nlsn; - if (memp_fput(mpf, np, DB_MPOOL_DIRTY)) + if (__memp_fput(mpf, np, DB_MPOOL_DIRTY)) goto out; np = NULL; } @@ -583,22 +314,22 @@ done: *lsnp = argp->prev_lsn; ret = 0; out: /* Free any pages that weren't dirtied. 
*/ - if (pp != NULL && (t_ret = memp_fput(mpf, pp, 0)) != 0 && ret == 0) + if (pp != NULL && (t_ret = __memp_fput(mpf, pp, 0)) != 0 && ret == 0) ret = t_ret; - if (lp != NULL && (t_ret = memp_fput(mpf, lp, 0)) != 0 && ret == 0) + if (lp != NULL && (t_ret = __memp_fput(mpf, lp, 0)) != 0 && ret == 0) ret = t_ret; - if (np != NULL && (t_ret = memp_fput(mpf, np, 0)) != 0 && ret == 0) + if (np != NULL && (t_ret = __memp_fput(mpf, np, 0)) != 0 && ret == 0) ret = t_ret; - if (rp != NULL && (t_ret = memp_fput(mpf, rp, 0)) != 0 && ret == 0) + if (rp != NULL && (t_ret = __memp_fput(mpf, rp, 0)) != 0 && ret == 0) ret = t_ret; /* Free any allocated space. */ if (_lp != NULL) - __os_free(_lp, file_dbp->pgsize); + __os_free(dbenv, _lp); if (_rp != NULL) - __os_free(_rp, file_dbp->pgsize); + __os_free(dbenv, _rp); if (sp != NULL) - __os_free(sp, argp->pg.size); + __os_free(dbenv, sp); REC_CLOSE; } @@ -627,23 +358,24 @@ __bam_rsplit_recover(dbenv, dbtp, lsnp, op, info) db_pgno_t pgno, root_pgno; int cmp_n, cmp_p, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__bam_rsplit_print); REC_INTRO(__bam_rsplit_read, 1); /* Fix the root page. */ pgno = root_pgno = argp->root_pgno; - if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) != 0) { + if ((ret = __memp_fget(mpf, &pgno, 0, &pagep)) != 0) { /* The root page must always exist if we are going forward. */ if (DB_REDO(op)) { - __db_pgerr(file_dbp, pgno); + ret = __db_pgerr(file_dbp, pgno, ret); goto out; } /* This must be the root of an OPD tree. */ DB_ASSERT(root_pgno != ((BTREE *)file_dbp->bt_internal)->bt_root); ret = 0; - goto done; + goto do_page; } modified = 0; cmp_n = log_compare(lsnp, &LSN(pagep)); @@ -666,22 +398,23 @@ __bam_rsplit_recover(dbenv, dbtp, lsnp, op, info) pagep->lsn = argp->rootlsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, pagep, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; +do_page: /* * Fix the page copied over the root page. It's possible that the * page never made it to disk, so if we're undo-ing and the page * doesn't exist, it's okay and there's nothing further to do. */ - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = __memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) goto done; - (void)__db_pgerr(file_dbp, argp->pgno); + ret = __db_pgerr(file_dbp, argp->pgno, ret); goto out; } modified = 0; - __ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN)); + (void)__ua_memcpy(&copy_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN)); cmp_n = log_compare(lsnp, &LSN(pagep)); cmp_p = log_compare(&LSN(pagep), &copy_lsn); CHECK_LSN(op, cmp_p, &LSN(pagep), &copy_lsn); @@ -694,13 +427,16 @@ __bam_rsplit_recover(dbenv, dbtp, lsnp, op, info) memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)__memp_fput(mpf, pagep, 0); + REC_CLOSE; } /* @@ -725,15 +461,16 @@ __bam_adj_recover(dbenv, dbtp, lsnp, op, info) PAGE *pagep; int cmp_n, cmp_p, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__bam_adj_print); REC_INTRO(__bam_adj_read, 1); /* Get the page; if it never existed and we're undoing, we're done. */ - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = __memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) goto done; - (void)__db_pgerr(file_dbp, argp->pgno); + ret = __db_pgerr(file_dbp, argp->pgno, ret); goto out; } @@ -745,7 +482,7 @@ __bam_adj_recover(dbenv, dbtp, lsnp, op, info) /* Need to redo update described. 
*/ if ((ret = __bam_adjindx(dbc, pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0) - goto err; + goto out; LSN(pagep) = *lsnp; modified = 1; @@ -753,21 +490,21 @@ __bam_adj_recover(dbenv, dbtp, lsnp, op, info) /* Need to undo update described. */ if ((ret = __bam_adjindx(dbc, pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0) - goto err; + goto out; LSN(pagep) = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; - if (0) { -err: (void)memp_fput(mpf, pagep, 0); - } -out: REC_CLOSE; +out: if (pagep != NULL) + (void)__memp_fput(mpf, pagep, 0); + REC_CLOSE; } /* @@ -793,15 +530,16 @@ __bam_cadjust_recover(dbenv, dbtp, lsnp, op, info) PAGE *pagep; int cmp_n, cmp_p, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__bam_cadjust_print); REC_INTRO(__bam_cadjust_read, 1); /* Get the page; if it never existed and we're undoing, we're done. */ - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = __memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) goto done; - (void)__db_pgerr(file_dbp, argp->pgno); + ret = __db_pgerr(file_dbp, argp->pgno, ret); goto out; } @@ -812,11 +550,13 @@ __bam_cadjust_recover(dbenv, dbtp, lsnp, op, info) if (cmp_p == 0 && DB_REDO(op)) { /* Need to redo update described. 
*/ if (IS_BTREE_PAGE(pagep)) { - GET_BINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs += + argp->adjust; if (argp->opflags & CAD_UPDATEROOT) RE_NREC_ADJ(pagep, argp->adjust); } else { - GET_RINTERNAL(pagep, argp->indx)->nrecs += argp->adjust; + GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs += + argp->adjust; if (argp->opflags & CAD_UPDATEROOT) RE_NREC_ADJ(pagep, argp->adjust); } @@ -826,24 +566,29 @@ __bam_cadjust_recover(dbenv, dbtp, lsnp, op, info) } else if (cmp_n == 0 && DB_UNDO(op)) { /* Need to undo update described. */ if (IS_BTREE_PAGE(pagep)) { - GET_BINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs -= + argp->adjust; if (argp->opflags & CAD_UPDATEROOT) RE_NREC_ADJ(pagep, -(argp->adjust)); } else { - GET_RINTERNAL(pagep, argp->indx)->nrecs -= argp->adjust; + GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs -= + argp->adjust; if (argp->opflags & CAD_UPDATEROOT) RE_NREC_ADJ(pagep, -(argp->adjust)); } LSN(pagep) = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)__memp_fput(mpf, pagep, 0); + REC_CLOSE; } /* @@ -869,15 +614,16 @@ __bam_cdel_recover(dbenv, dbtp, lsnp, op, info) u_int32_t indx; int cmp_n, cmp_p, modified, ret; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__bam_cdel_print); REC_INTRO(__bam_cdel_read, 1); /* Get the page; if it never existed and we're undoing, we're done. 
*/ - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = __memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) goto done; - (void)__db_pgerr(file_dbp, argp->pgno); + ret = __db_pgerr(file_dbp, argp->pgno, ret); goto out; } @@ -888,27 +634,30 @@ __bam_cdel_recover(dbenv, dbtp, lsnp, op, info) if (cmp_p == 0 && DB_REDO(op)) { /* Need to redo update described. */ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0); - B_DSET(GET_BKEYDATA(pagep, indx)->type); + B_DSET(GET_BKEYDATA(file_dbp, pagep, indx)->type); LSN(pagep) = *lsnp; modified = 1; } else if (cmp_n == 0 && DB_UNDO(op)) { /* Need to undo update described. */ indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0); - B_DCLR(GET_BKEYDATA(pagep, indx)->type); + B_DCLR(GET_BKEYDATA(file_dbp, pagep, indx)->type); (void)__bam_ca_delete(file_dbp, argp->pgno, argp->indx, 0); LSN(pagep) = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (pagep != NULL) + (void)__memp_fput(mpf, pagep, 0); + REC_CLOSE; } /* @@ -936,18 +685,19 @@ __bam_repl_recover(dbenv, dbtp, lsnp, op, info) int cmp_n, cmp_p, modified, ret; u_int8_t *p; + pagep = NULL; COMPQUIET(info, NULL); REC_PRINT(__bam_repl_print); REC_INTRO(__bam_repl_read, 1); /* Get the page; if it never existed and we're undoing, we're done. 
*/ - if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if ((ret = __memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { if (DB_UNDO(op)) goto done; - (void)__db_pgerr(file_dbp, argp->pgno); + ret = __db_pgerr(file_dbp, argp->pgno, ret); goto out; } - bk = GET_BKEYDATA(pagep, argp->indx); + bk = GET_BKEYDATA(file_dbp, pagep, argp->indx); modified = 0; cmp_n = log_compare(lsnp, &LSN(pagep)); @@ -961,8 +711,8 @@ __bam_repl_recover(dbenv, dbtp, lsnp, op, info) */ memset(&dbt, 0, sizeof(dbt)); dbt.size = argp->prefix + argp->suffix + argp->repl.size; - if ((ret = __os_malloc(dbenv, dbt.size, NULL, &dbt.data)) != 0) - goto err; + if ((ret = __os_malloc(dbenv, dbt.size, &dbt.data)) != 0) + goto out; p = dbt.data; memcpy(p, bk->data, argp->prefix); p += argp->prefix; @@ -971,9 +721,9 @@ __bam_repl_recover(dbenv, dbtp, lsnp, op, info) memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); - __os_free(dbt.data, dbt.size); + __os_free(dbenv, dbt.data); if (ret != 0) - goto err; + goto out; LSN(pagep) = *lsnp; modified = 1; @@ -985,8 +735,8 @@ __bam_repl_recover(dbenv, dbtp, lsnp, op, info) */ memset(&dbt, 0, sizeof(dbt)); dbt.size = argp->prefix + argp->suffix + argp->orig.size; - if ((ret = __os_malloc(dbenv, dbt.size, NULL, &dbt.data)) != 0) - goto err; + if ((ret = __os_malloc(dbenv, dbt.size, &dbt.data)) != 0) + goto out; p = dbt.data; memcpy(p, bk->data, argp->prefix); p += argp->prefix; @@ -995,27 +745,27 @@ __bam_repl_recover(dbenv, dbtp, lsnp, op, info) memcpy(p, bk->data + (bk->len - argp->suffix), argp->suffix); ret = __bam_ritem(dbc, pagep, argp->indx, &dbt); - __os_free(dbt.data, dbt.size); + __os_free(dbenv, dbt.data); if (ret != 0) - goto err; + goto out; /* Reset the deleted flag, if necessary. 
*/ if (argp->isdeleted) - B_DSET(GET_BKEYDATA(pagep, argp->indx)->type); + B_DSET(GET_BKEYDATA(file_dbp, pagep, argp->indx)->type); LSN(pagep) = argp->lsn; modified = 1; } - if ((ret = memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, pagep, modified ? DB_MPOOL_DIRTY : 0)) != 0) goto out; + pagep = NULL; done: *lsnp = argp->prev_lsn; ret = 0; - if (0) { -err: (void)memp_fput(mpf, pagep, 0); - } -out: REC_CLOSE; +out: if (pagep != NULL) + (void)__memp_fput(mpf, pagep, 0); + REC_CLOSE; } /* @@ -1040,14 +790,15 @@ __bam_root_recover(dbenv, dbtp, lsnp, op, info) DB_MPOOLFILE *mpf; int cmp_n, cmp_p, modified, ret; + meta = NULL; COMPQUIET(info, NULL); REC_PRINT(__bam_root_print); REC_INTRO(__bam_root_read, 0); - if ((ret = memp_fget(mpf, &argp->meta_pgno, 0, &meta)) != 0) { + if ((ret = __memp_fget(mpf, &argp->meta_pgno, 0, &meta)) != 0) { /* The metadata page must always exist on redo. */ if (DB_REDO(op)) { - (void)__db_pgerr(file_dbp, argp->meta_pgno); + ret = __db_pgerr(file_dbp, argp->meta_pgno, ret); goto out; } else goto done; @@ -1068,13 +819,16 @@ __bam_root_recover(dbenv, dbtp, lsnp, op, info) meta->dbmeta.lsn = argp->meta_lsn; modified = 1; } - if ((ret = memp_fput(mpf, meta, modified ? DB_MPOOL_DIRTY : 0)) != 0) + if ((ret = __memp_fput(mpf, meta, modified ? 
DB_MPOOL_DIRTY : 0)) != 0) goto out; + meta = NULL; done: *lsnp = argp->prev_lsn; ret = 0; -out: REC_CLOSE; +out: if (meta != NULL) + (void)__memp_fput(mpf, meta, 0); + REC_CLOSE; } /* @@ -1100,6 +854,7 @@ __bam_curadj_recover(dbenv, dbtp, lsnp, op, info) int ret; COMPQUIET(info, NULL); + COMPQUIET(mpf, NULL); REC_PRINT(__bam_curadj_print); REC_INTRO(__bam_curadj_read, 0); @@ -1108,7 +863,7 @@ __bam_curadj_recover(dbenv, dbtp, lsnp, op, info) if (op != DB_TXN_ABORT) goto done; - switch(argp->mode) { + switch (argp->mode) { case DB_CA_DI: if ((ret = __bam_ca_di(dbc, argp->from_pgno, argp->from_indx, -(int)argp->first_indx)) != 0) @@ -1116,7 +871,7 @@ __bam_curadj_recover(dbenv, dbtp, lsnp, op, info) break; case DB_CA_DUP: if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx, - argp->from_pgno, argp->from_indx, argp->to_indx)) != 0) + argp->from_pgno, argp->from_indx, argp->to_indx)) != 0) goto out; break; @@ -1160,6 +915,7 @@ __bam_rcuradj_recover(dbenv, dbtp, lsnp, op, info) int ret, t_ret; COMPQUIET(info, NULL); + COMPQUIET(mpf, NULL); rdbc = NULL; REC_PRINT(__bam_rcuradj_print); @@ -1180,15 +936,15 @@ __bam_rcuradj_recover(dbenv, dbtp, lsnp, op, info) * state into __ram_ca, and this way we don't need to make * this function know anything about how offpage dups work. */ - if ((ret = - __db_icursor(file_dbp, NULL, DB_RECNO, argp->root, 0, &rdbc)) != 0) + if ((ret = __db_cursor_int(file_dbp, + NULL, DB_RECNO, argp->root, 0, DB_LOCK_INVALIDID, &rdbc)) != 0) goto out; cp = (BTREE_CURSOR *)rdbc->internal; F_SET(cp, C_RENUMBER); cp->recno = argp->recno; - switch(argp->mode) { + switch (argp->mode) { case CA_DELETE: /* * The way to undo a delete is with an insert. Since @@ -1197,7 +953,7 @@ __bam_rcuradj_recover(dbenv, dbtp, lsnp, op, info) F_SET(cp, C_DELETED); F_SET(cp, C_RENUMBER); /* Just in case. 
*/ cp->order = argp->order; - __ram_ca(rdbc, CA_ICURRENT); + (void)__ram_ca(rdbc, CA_ICURRENT); break; case CA_IAFTER: case CA_IBEFORE: @@ -1208,12 +964,12 @@ __bam_rcuradj_recover(dbenv, dbtp, lsnp, op, info) */ F_CLR(cp, C_DELETED); cp->order = INVALID_ORDER; - __ram_ca(rdbc, CA_DELETE); + (void)__ram_ca(rdbc, CA_DELETE); break; } done: *lsnp = argp->prev_lsn; -out: if (rdbc != NULL && (t_ret = rdbc->c_close(rdbc)) != 0 && ret == 0) +out: if (rdbc != NULL && (t_ret = __db_c_close(rdbc)) != 0 && ret == 0) ret = t_ret; REC_CLOSE; } diff --git a/db/btree/bt_reclaim.c b/db/btree/bt_reclaim.c index 538d837c2..bc85bd2d3 100644 --- a/db/btree/bt_reclaim.c +++ b/db/btree/bt_reclaim.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1998, 1999, 2000 + * Copyright (c) 1998-2003 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_reclaim.c,v 11.5 2000/03/22 04:21:01 ubell Exp $"; +static const char revid[] = "$Id: bt_reclaim.c,v 11.14 2003/06/30 17:19:33 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,10 +18,8 @@ static const char revid[] = "$Id: bt_reclaim.c,v 11.5 2000/03/22 04:21:01 ubell #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "lock.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" /* * __bam_reclaim -- @@ -38,7 +36,7 @@ __bam_reclaim(dbp, txn) int ret, t_ret; /* Acquire a cursor. */ - if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0) + if ((ret = __db_cursor(dbp, txn, &dbc, 0)) != 0) return (ret); /* Walk the tree, freeing pages. */ @@ -46,8 +44,34 @@ __bam_reclaim(dbp, txn) DB_LOCK_WRITE, dbc->internal->root, __db_reclaim_callback, dbc); /* Discard the cursor. */ - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) ret = t_ret; return (ret); } + +/* + * __bam_truncate -- + * Truncate a database. 
+ * + * PUBLIC: int __bam_truncate __P((DBC *, u_int32_t *)); + */ +int +__bam_truncate(dbc, countp) + DBC *dbc; + u_int32_t *countp; +{ + db_trunc_param trunc; + int ret; + + trunc.count = 0; + trunc.dbc = dbc; + + /* Walk the tree, freeing pages. */ + ret = __bam_traverse(dbc, + DB_LOCK_WRITE, dbc->internal->root, __db_truncate_callback, &trunc); + + *countp = trunc.count; + + return (ret); +} diff --git a/db/btree/bt_recno.c b/db/btree/bt_recno.c index 6ac0cac35..2098e4d94 100644 --- a/db/btree/bt_recno.c +++ b/db/btree/bt_recno.c @@ -1,36 +1,29 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1997, 1998, 1999, 2000 + * Copyright (c) 1997-2003 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_recno.c,v 11.65 2001/01/18 14:33:22 bostic Exp $"; +static const char revid[] = "$Id: bt_recno.c,v 11.113 2003/06/30 17:19:34 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include -#include #include #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" -#include "db_ext.h" -#include "db_shash.h" -#include "lock.h" -#include "lock_ext.h" -#include "qam.h" -#include "txn.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t)); -static int __ram_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); -static int __ram_put __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); static int __ram_source __P((DB *)); static int __ram_sread __P((DBC *, db_recno_t)); static int __ram_update __P((DBC *, db_recno_t, int)); @@ -63,7 +56,7 @@ static int __ram_update __P((DBC *, db_recno_t, int)); } \ } #define CD_ISSET(cp) \ - (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED)) + (F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0) /* * Macros for comparing the ordering of two cursors. 
@@ -90,17 +83,32 @@ static int __ram_update __P((DBC *, db_recno_t, int)); * Do we need to log the current cursor adjustment? */ #define CURADJ_LOG(dbc) \ - (DB_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL) + (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL) + +/* + * After a search, copy the found page into the cursor, discarding any + * currently held lock. + */ +#define STACK_TO_CURSOR(cp) { \ + (cp)->page = (cp)->csp->page; \ + (cp)->pgno = (cp)->csp->page->pgno; \ + (cp)->indx = (cp)->csp->indx; \ + (void)__TLPUT(dbc, (cp)->lock); \ + (cp)->lock = (cp)->csp->lock; \ + (cp)->lock_mode = (cp)->csp->lock_mode; \ +} /* * __ram_open -- * Recno open function. * - * PUBLIC: int __ram_open __P((DB *, const char *, db_pgno_t, u_int32_t)); + * PUBLIC: int __ram_open __P((DB *, + * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t)); */ int -__ram_open(dbp, name, base_pgno, flags) +__ram_open(dbp, txn, name, base_pgno, flags) DB *dbp; + DB_TXN *txn; const char *name; db_pgno_t base_pgno; u_int32_t flags; @@ -109,15 +117,11 @@ __ram_open(dbp, name, base_pgno, flags) DBC *dbc; int ret, t_ret; + COMPQUIET(name, NULL); t = dbp->bt_internal; - /* Initialize the remaining fields/methods of the DB. */ - dbp->del = __ram_delete; - dbp->put = __ram_put; - dbp->stat = __bam_stat; - /* Start up the tree. */ - if ((ret = __bam_read_root(dbp, name, base_pgno, flags)) != 0) + if ((ret = __bam_read_root(dbp, txn, base_pgno, flags)) != 0) return (ret); /* @@ -132,9 +136,9 @@ __ram_open(dbp, name, base_pgno, flags) return (ret); /* If we're snapshotting an underlying source file, do it now. */ - if (F_ISSET(dbp, DB_RE_SNAPSHOT)) { + if (F_ISSET(dbp, DB_AM_SNAPSHOT)) { /* Allocate a cursor. */ - if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0) return (ret); /* Do the snapshot. */ @@ -143,108 +147,42 @@ __ram_open(dbp, name, base_pgno, flags) ret = 0; /* Discard the cursor. 
*/ - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) ret = t_ret; } - return (0); -} - -/* - * __ram_delete -- - * Recno db->del function. - */ -static int -__ram_delete(dbp, txn, key, flags) - DB *dbp; - DB_TXN *txn; - DBT *key; - u_int32_t flags; -{ - BTREE_CURSOR *cp; - DBC *dbc; - db_recno_t recno; - int ret, t_ret; - - PANIC_CHECK(dbp->dbenv); - - /* Check for invalid flags. */ - if ((ret = __db_delchk(dbp, - key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) - return (ret); - - /* Acquire a cursor. */ - if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) - return (ret); - - DEBUG_LWRITE(dbc, txn, "ram_delete", key, NULL, flags); - - /* Check the user's record number and fill in as necessary. */ - if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) - goto err; - - /* Do the delete. */ - cp = (BTREE_CURSOR *)dbc->internal; - cp->recno = recno; - - ret = __ram_c_del(dbc); - - /* Release the cursor. */ -err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; - return (ret); } /* - * __ram_put -- - * Recno db->put function. + * __ram_append -- + * Recno append function. + * + * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *)); */ -static int -__ram_put(dbp, txn, key, data, flags) - DB *dbp; - DB_TXN *txn; +int +__ram_append(dbc, key, data) + DBC *dbc; DBT *key, *data; - u_int32_t flags; { - DBC *dbc; - db_recno_t recno; - int ret, t_ret; - - PANIC_CHECK(dbp->dbenv); - - /* Check for invalid flags. */ - if ((ret = __db_putchk(dbp, - key, data, flags, F_ISSET(dbp, DB_AM_RDONLY), 0)) != 0) - return (ret); - - /* Allocate a cursor. */ - if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) - return (ret); + BTREE_CURSOR *cp; + int ret; - DEBUG_LWRITE(dbc, txn, "ram_put", key, data, flags); + cp = (BTREE_CURSOR *)dbc->internal; /* - * If we're appending to the tree, make sure we've read in all of - * the backing source file. 
Otherwise, check the user's record - * number and fill in as necessary. If we found the record or it - * simply didn't exist, add the user's record. + * Make sure we've read in all of the backing source file. If + * we found the record or it simply didn't exist, add the + * user's record. */ - if (flags == DB_APPEND) - ret = __ram_update(dbc, DB_MAX_RECORDS, 0); - else - ret = __ram_getno(dbc, key, &recno, 1); + ret = __ram_update(dbc, DB_MAX_RECORDS, 0); if (ret == 0 || ret == DB_NOTFOUND) - ret = __ram_add(dbc, &recno, data, flags, 0); - - /* Discard the cursor. */ - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; + ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0); - /* Return the record number if we're appending to the tree. */ - if (ret == 0 && flags == DB_APPEND) - ret = __db_retcopy(dbp, key, &recno, sizeof(recno), - &dbc->rkey.data, &dbc->rkey.ulen); + /* Return the record number. */ + if (ret == 0) + ret = __db_retcopy(dbc->dbp->dbenv, key, &cp->recno, + sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen); return (ret); } @@ -266,7 +204,7 @@ __ram_c_del(dbc) DB_LSN lsn; DBT hdr, data; EPG *epg; - int exact, ret, stack; + int exact, ret, stack, t_ret; dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; @@ -295,9 +233,9 @@ __ram_c_del(dbc) goto err; } stack = 1; - cp->page = cp->csp->page; - cp->pgno = cp->csp->page->pgno; - cp->indx = cp->csp->indx; + + /* Copy the page into the cursor. */ + STACK_TO_CURSOR(cp); /* * If re-numbering records, the on-page deleted flag can only mean @@ -310,7 +248,7 @@ __ram_c_del(dbc) * delete records they never created, the latter is an error because * if the record was "deleted", we could never have found it. */ - if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type)) { + if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) { ret = DB_KEYEMPTY; goto err; } @@ -319,11 +257,11 @@ __ram_c_del(dbc) /* Delete the item, adjust the counts, adjust the cursors. 
*/ if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0) goto err; - __bam_adjust(dbc, -1); + if ((ret = __bam_adjust(dbc, -1)) != 0) + goto err; if (__ram_ca(dbc, CA_DELETE) > 0 && - CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv, - dbc->txn, &lsn, 0, dbp->log_fileid, CA_DELETE, - cp->root, cp->recno, cp->order)) != 0) + CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn, + &lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0) goto err; /* @@ -346,15 +284,15 @@ __ram_c_del(dbc) * going to be emptied by removing the single reference * to the emptied page (or one of its parents). */ - for (epg = cp->sp; epg <= cp->csp; ++epg) - if (NUM_ENT(epg->page) <= 1) + for (epg = cp->csp; epg >= cp->sp; --epg) + if (NUM_ENT(epg->page) > 1) break; /* * We want to delete a single item out of the last page - * that we're not deleting, back up to that page. + * that we're not deleting. */ - ret = __bam_dpages(dbc, --epg); + ret = __bam_dpages(dbc, epg); /* * Regardless of the return from __bam_dpages, it will @@ -383,8 +321,8 @@ __ram_c_del(dbc) t->re_modified = 1; -err: if (stack) - __bam_stkrel(dbc, STK_CLRDBC); +err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0) + ret = t_ret; return (ret); } @@ -412,6 +350,7 @@ __ram_c_get(dbc, key, data, flags, pgnop) dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; + LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY); retry: switch (flags) { case DB_CURRENT: /* @@ -445,8 +384,13 @@ retry: switch (flags) { * we have to avoid incrementing the record number so that we * return the right record by virtue of renumbering the tree. */ - if (CD_ISSET(cp)) + if (CD_ISSET(cp)) { + /* + * Clear the flag, we've moved off the deleted record. 
+ */ + CD_CLR(cp); break; + } if (cp->recno != RECNO_OOB) { ++cp->recno; @@ -504,6 +448,7 @@ retry: switch (flags) { goto err; /* NOTREACHED */ case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: /* * If we're searching a set of off-page dups, we start * a new linear search from the first record. Otherwise, @@ -531,6 +476,8 @@ retry: switch (flags) { * read from the backing source file. Do it now for DB_CURRENT (if * the current record was deleted we may need more records from the * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT. + * (We don't have to test for flags == DB_FIRST, because the switch + * statement above re-set flags to DB_NEXT in that case.) */ if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret = __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND) @@ -547,16 +494,8 @@ retry: switch (flags) { goto err; } - /* - * Copy the page into the cursor, discarding any lock we - * are currently holding. - */ - cp->page = cp->csp->page; - cp->pgno = cp->csp->page->pgno; - cp->indx = cp->csp->indx; - (void)__TLPUT(dbc, cp->lock); - cp->lock = cp->csp->lock; - cp->lock_mode = cp->csp->lock_mode; + /* Copy the page into the cursor. */ + STACK_TO_CURSOR(cp); /* * If re-numbering records, the on-page deleted flag means this @@ -567,21 +506,34 @@ retry: switch (flags) { * walking through off-page duplicates, and fail if they were * requested explicitly by the application. */ - if (B_DISSET(GET_BKEYDATA(cp->page, cp->indx)->type)) + if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) switch (flags) { case DB_NEXT: case DB_PREV: (void)__bam_stkrel(dbc, STK_CLRDBC); goto retry; case DB_GET_BOTH: - (void)__bam_stkrel(dbc, STK_CLRDBC); - continue; + case DB_GET_BOTH_RANGE: + /* + * If we're an OPD tree, we don't care about + * matching a record number on a DB_GET_BOTH + * -- everything belongs to the same tree. A + * normal recno should give up and return + * DB_NOTFOUND if the matching recno is deleted. 
+ */ + if (F_ISSET(dbc, DBC_OPD)) { + (void)__bam_stkrel(dbc, STK_CLRDBC); + continue; + } + ret = DB_NOTFOUND; + goto err; default: ret = DB_KEYEMPTY; goto err; } - if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC) { + if (flags == DB_GET_BOTH || + flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) { if ((ret = __bam_cmp(dbp, data, cp->page, cp->indx, __bam_defcmp, &cmp)) != 0) return (ret); @@ -598,10 +550,11 @@ retry: switch (flags) { /* Return the key if the user didn't give us one. */ if (!F_ISSET(dbc, DBC_OPD)) { - if (flags != DB_SET && flags != DB_SET_RANGE) - ret = __db_retcopy(dbp, - key, &cp->recno, sizeof(cp->recno), - &dbc->rkey.data, &dbc->rkey.ulen); + if (flags != DB_GET_BOTH && flags != DB_GET_BOTH_RANGE && + flags != DB_SET && flags != DB_SET_RANGE) + ret = __db_retcopy(dbp->dbenv, + key, &cp->recno, sizeof(cp->recno), + &dbc->rkey->data, &dbc->rkey->ulen); F_SET(key, DB_DBT_ISSET); } @@ -637,23 +590,45 @@ __ram_c_put(dbc, key, data, flags, pgnop) cp = (BTREE_CURSOR *)dbc->internal; /* - * DB_KEYFIRST and DB_KEYLAST will only be set if we're dealing with - * an off-page duplicate tree, they can't be specified at user level. - * Translate them into something else. + * DB_KEYFIRST and DB_KEYLAST mean different things if they're + * used in an off-page duplicate tree. If we're an off-page + * duplicate tree, they really mean "put at the beginning of the + * tree" and "put at the end of the tree" respectively, so translate + * them to something else. 
*/ - switch (flags) { - case DB_KEYFIRST: - cp->recno = 1; - flags = DB_BEFORE; - break; - case DB_KEYLAST: - if ((ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0)) != 0) - return (ret); - if (CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp->dbenv, - dbc->txn, &lsn, 0, dbp->log_fileid, CA_ICURRENT, - cp->root, cp->recno, cp->order))) - return (ret); - return (0); + if (F_ISSET(dbc, DBC_OPD)) + switch (flags) { + case DB_KEYFIRST: + cp->recno = 1; + flags = DB_BEFORE; + break; + case DB_KEYLAST: + if ((ret = __ram_add(dbc, + &cp->recno, data, DB_APPEND, 0)) != 0) + return (ret); + if (CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, + CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0) + return (ret); + return (0); + default: + break; + } + + /* + * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has + * no duplicates, these are identical and mean "put the given + * datum at the given recno". + * + * Note that the code here used to be in __ram_put; now, we + * go through the access-method-common __db_put function, which + * handles DB_NOOVERWRITE, so we and __ram_add don't have to. + */ + if (flags == DB_KEYFIRST || flags == DB_KEYLAST) { + ret = __ram_getno(dbc, key, &cp->recno, 1); + if (ret == 0 || ret == DB_NOTFOUND) + ret = __ram_add(dbc, &cp->recno, data, 0, 0); + return (ret); } /* @@ -677,9 +652,8 @@ split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) */ DB_ASSERT(exact || CD_ISSET(cp)); - cp->page = cp->csp->page; - cp->pgno = cp->csp->page->pgno; - cp->indx = cp->csp->indx; + /* Copy the page into the cursor. 
*/ + STACK_TO_CURSOR(cp); ret = __bam_iitem(dbc, key, data, iiflags, 0); t_ret = __bam_stkrel(dbc, STK_CLRDBC); @@ -688,7 +662,7 @@ split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) ret = t_ret; else if (ret == DB_NEEDSPLIT) { arg = &cp->recno; - if ((ret = __bam_split(dbc, arg)) != 0) + if ((ret = __bam_split(dbc, arg, NULL)) != 0) goto err; goto split; } @@ -709,8 +683,7 @@ split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) /* Only log if __ram_ca found any relevant cursors. */ if (nc > 0 && CURADJ_LOG(dbc) && - (ret = __bam_rcuradj_log(dbp->dbenv, - dbc->txn, &lsn, 0, dbp->log_fileid, CA_IAFTER, + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER, cp->root, cp->recno, cp->order)) != 0) goto err; break; @@ -720,8 +693,7 @@ split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) /* Only log if __ram_ca found any relevant cursors. */ if (nc > 0 && CURADJ_LOG(dbc) && - (ret = __bam_rcuradj_log(dbp->dbenv, - dbc->txn, &lsn, 0, dbp->log_fileid, CA_IBEFORE, + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE, cp->root, cp->recno, cp->order)) != 0) goto err; break; @@ -734,17 +706,19 @@ split: if ((ret = __bam_rsearch(dbc, &cp->recno, S_INSERT, 1, &exact)) != 0) * Only log if __ram_ca found any relevant cursors. */ if (CD_ISSET(cp) && __ram_ca(dbc, CA_ICURRENT) > 0 && - CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log( - dbp->dbenv, dbc->txn, &lsn, 0, dbp->log_fileid, + CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0) goto err; break; + default: + break; } /* Return the key if we've created a new record. 
*/ if (!F_ISSET(dbc, DBC_OPD) && (flags == DB_AFTER || flags == DB_BEFORE)) - ret = __db_retcopy(dbp, key, &cp->recno, - sizeof(cp->recno), &dbc->rkey.data, &dbc->rkey.ulen); + ret = __db_retcopy(dbp->dbenv, key, &cp->recno, + sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen); /* The cursor was reset, no further delete adjustment is necessary. */ err: CD_CLR(cp); @@ -940,13 +914,12 @@ __ram_update(dbc, recno, can_create) int can_create; { BTREE *t; - BTREE_CURSOR *cp; DB *dbp; + DBT *rdata; db_recno_t nrecs; int ret; dbp = dbc->dbp; - cp = (BTREE_CURSOR *)dbc->internal; t = dbp->bt_internal; /* @@ -976,27 +949,13 @@ __ram_update(dbc, recno, can_create) if (!can_create || recno <= nrecs + 1) return (0); - dbc->rdata.dlen = 0; - dbc->rdata.doff = 0; - dbc->rdata.flags = 0; - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if (dbc->rdata.ulen < t->re_len) { - if ((ret = __os_realloc(dbp->dbenv, - t->re_len, NULL, &dbc->rdata.data)) != 0) { - dbc->rdata.ulen = 0; - dbc->rdata.data = NULL; - return (ret); - } - dbc->rdata.ulen = t->re_len; - } - dbc->rdata.size = t->re_len; - memset(dbc->rdata.data, t->re_pad, t->re_len); - } else - dbc->rdata.size = 0; + rdata = &dbc->my_rdata; + rdata->flags = 0; + rdata->size = 0; while (recno > ++nrecs) if ((ret = __ram_add(dbc, - &nrecs, &dbc->rdata, 0, BI_DELETED)) != 0) + &nrecs, rdata, 0, BI_DELETED)) != 0) return (ret); return (0); } @@ -1017,9 +976,9 @@ __ram_source(dbp) /* Find the real name, and swap out the one we had before. */ if ((ret = __db_appname(dbp->dbenv, - DB_APP_DATA, NULL, t->re_source, 0, NULL, &source)) != 0) + DB_APP_DATA, t->re_source, 0, NULL, &source)) != 0) return (ret); - __os_freestr(t->re_source); + __os_free(dbp->dbenv, t->re_source); t->re_source = source; /* @@ -1060,6 +1019,7 @@ __ram_writeback(dbp) t = dbp->bt_internal; dbenv = dbp->dbenv; fp = NULL; + pad = NULL; /* If the file wasn't modified, we're done. */ if (!t->re_modified) @@ -1072,7 +1032,7 @@ __ram_writeback(dbp) } /* Allocate a cursor. 
*/ - if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + if ((ret = __db_cursor(dbp, NULL, &dbc, 0)) != 0) return (ret); /* @@ -1119,40 +1079,45 @@ __ram_writeback(dbp) /* * We step through the records, writing each one out. Use the record * number and the dbp->get() function, instead of a cursor, so we find - * and write out "deleted" or non-existent records. + * and write out "deleted" or non-existent records. The DB handle may + * be threaded, so allocate memory as we go. */ memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); key.size = sizeof(db_recno_t); key.data = &keyno; + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_REALLOC); /* * We'll need the delimiter if we're doing variable-length records, * and the pad character if we're doing fixed-length records. */ delim = t->re_delim; - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) { - if ((ret = __os_malloc(dbenv, t->re_len, NULL, &pad)) != 0) + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { + if ((ret = __os_malloc(dbenv, t->re_len, &pad)) != 0) goto err; memset(pad, t->re_pad, t->re_len); - } else - COMPQUIET(pad, NULL); + } for (keyno = 1;; ++keyno) { - switch (ret = dbp->get(dbp, NULL, &key, &data, 0)) { + switch (ret = __db_get(dbp, NULL, &key, &data, 0)) { case 0: - if (fwrite(data.data, 1, data.size, fp) != data.size) + if (data.size != 0 && (u_int32_t)fwrite( + data.data, 1, data.size, fp) != data.size) goto write_err; break; case DB_KEYEMPTY: - if (F_ISSET(dbp, DB_RE_FIXEDLEN) && - fwrite(pad, 1, t->re_len, fp) != t->re_len) + if (F_ISSET(dbp, DB_AM_FIXEDLEN) && + (u_int32_t)fwrite(pad, 1, t->re_len, fp) != + t->re_len) goto write_err; break; case DB_NOTFOUND: ret = 0; goto done; + default: + goto err; } - if (!F_ISSET(dbp, DB_RE_FIXEDLEN) && + if (!F_ISSET(dbp, DB_AM_FIXEDLEN) && fwrite(&delim, 1, 1, fp) != 1) { write_err: ret = errno; __db_err(dbp->dbenv, @@ -1171,9 +1136,15 @@ done: /* Close the file descriptor. */ } /* Discard the cursor. 
*/ - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0) ret = t_ret; + /* Discard memory allocated to hold the data items. */ + if (data.data != NULL) + __os_ufree(dbenv, data.data); + if (pad != NULL) + __os_free(dbenv, pad); + if (ret == 0) t->re_modified = 0; @@ -1191,7 +1162,7 @@ __ram_sread(dbc, top) { BTREE *t; DB *dbp; - DBT data; + DBT data, *rdata; db_recno_t recno; size_t len; int ch, ret, was_modified; @@ -1203,45 +1174,56 @@ __ram_sread(dbc, top) if ((ret = __bam_nrecs(dbc, &recno)) != 0) return (ret); - /* Use the record data return memory, it's only a short-term use. */ - len = F_ISSET(dbp, DB_RE_FIXEDLEN) ? t->re_len : 256; - if (dbc->rdata.ulen < len) { + /* + * Use the record key return memory, it's only a short-term use. + * The record data return memory is used by __bam_iitem, which + * we'll indirectly call, so use the key so as not to collide. + */ + len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : 256; + rdata = &dbc->my_rkey; + if (rdata->ulen < len) { if ((ret = __os_realloc( - dbp->dbenv, len, NULL, &dbc->rdata.data)) != 0) { - dbc->rdata.ulen = 0; - dbc->rdata.data = NULL; + dbp->dbenv, len, &rdata->data)) != 0) { + rdata->ulen = 0; + rdata->data = NULL; return (ret); } - dbc->rdata.ulen = len; + rdata->ulen = (u_int32_t)len; } memset(&data, 0, sizeof(data)); while (recno < top) { - data.data = dbc->rdata.data; + data.data = rdata->data; data.size = 0; - if (F_ISSET(dbp, DB_RE_FIXEDLEN)) + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) for (len = t->re_len; len > 0; --len) { - if ((ch = getc(t->re_fp)) == EOF) - goto eof; + if ((ch = getc(t->re_fp)) == EOF) { + if (data.size == 0) + goto eof; + break; + } ((u_int8_t *)data.data)[data.size++] = ch; } else for (;;) { - if ((ch = getc(t->re_fp)) == EOF) - goto eof; + if ((ch = getc(t->re_fp)) == EOF) { + if (data.size == 0) + goto eof; + break; + } if (ch == t->re_delim) break; ((u_int8_t *)data.data)[data.size++] = ch; - if (data.size == dbc->rdata.ulen) 
{ + if (data.size == rdata->ulen) { if ((ret = __os_realloc(dbp->dbenv, - dbc->rdata.ulen *= 2, - NULL, &dbc->rdata.data)) != 0) { - dbc->rdata.ulen = 0; - dbc->rdata.data = NULL; + rdata->ulen *= 2, + &rdata->data)) != 0) { + rdata->ulen = 0; + rdata->data = NULL; return (ret); } else - data.data = dbc->rdata.data; + data.data = rdata->data; } } @@ -1281,9 +1263,8 @@ __ram_add(dbc, recnop, data, flags, bi_flags) DBT *data; u_int32_t flags, bi_flags; { - BKEYDATA *bk; BTREE_CURSOR *cp; - int exact, ret, stack; + int exact, ret, stack, t_ret; cp = (BTREE_CURSOR *)dbc->internal; @@ -1292,9 +1273,9 @@ retry: /* Find the slot for insertion. */ S_INSERT | (flags == DB_APPEND ? S_APPEND : 0), 1, &exact)) != 0) return (ret); stack = 1; - cp->page = cp->csp->page; - cp->pgno = cp->csp->page->pgno; - cp->indx = cp->csp->indx; + + /* Copy the page into the cursor. */ + STACK_TO_CURSOR(cp); /* * The application may modify the data based on the selected record @@ -1304,24 +1285,6 @@ retry: /* Find the slot for insertion. */ (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0) goto err; - /* - * If re-numbering records, the on-page deleted flag means this record - * was implicitly created. If not re-numbering records, the on-page - * deleted flag means this record was implicitly created, or, it was - * deleted at some time. - * - * If DB_NOOVERWRITE is set and the item already exists in the tree, - * return an error unless the item was either marked for deletion or - * only implicitly created. - */ - if (exact) { - bk = GET_BKEYDATA(cp->page, cp->indx); - if (!B_DISSET(bk->type) && flags == DB_NOOVERWRITE) { - ret = DB_KEYEXIST; - goto err; - } - } - /* * Select the arguments for __bam_iitem() and do the insert. If the * key is an exact match, or we're replacing the data item with a @@ -1353,7 +1316,7 @@ retry: /* Find the slot for insertion. 
*/ (void)__bam_stkrel(dbc, STK_CLRDBC); stack = 0; - if ((ret = __bam_split(dbc, recnop)) != 0) + if ((ret = __bam_split(dbc, recnop, NULL)) != 0) goto err; goto retry; @@ -1362,8 +1325,8 @@ retry: /* Find the slot for insertion. */ goto err; } -err: if (stack) - __bam_stkrel(dbc, STK_CLRDBC); +err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0) + ret = t_ret; return (ret); } diff --git a/db/btree/bt_rsearch.c b/db/btree/bt_rsearch.c index 7102cd715..92eb82144 100644 --- a/db/btree/bt_rsearch.c +++ b/db/btree/bt_rsearch.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. */ /* @@ -40,7 +40,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_rsearch.c,v 11.21 2000/03/28 21:50:04 ubell Exp $"; +static const char revid[] = "$Id: bt_rsearch.c,v 11.37 2003/06/30 17:19:34 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -48,10 +48,11 @@ static const char revid[] = "$Id: bt_rsearch.c,v 11.21 2000/03/28 21:50:04 ubell #endif #include "db_int.h" -#include "db_page.h" -#include "btree.h" -#include "db_shash.h" -#include "lock.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" /* * __bam_rsearch -- @@ -70,6 +71,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) BTREE_CURSOR *cp; DB *dbp; DB_LOCK lock; + DB_MPOOLFILE *mpf; PAGE *h; RINTERNAL *ri; db_indx_t adjust, deloffset, indx, top; @@ -79,6 +81,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) int ret, stack; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; BT_STK_CLR(cp); @@ -99,11 +102,11 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) * Retrieve the root page. */ pg = cp->root; - stack = LF_ISSET(S_STACK); + stack = LF_ISSET(S_STACK) ? 1 : 0; lock_mode = stack ? 
DB_LOCK_WRITE : DB_LOCK_READ; if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) return (ret); - if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) { /* Did not read it, so we can release the lock */ (void)__LPUT(dbc, lock); return (ret); @@ -120,12 +123,12 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) if (!stack && ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); (void)__LPUT(dbc, lock); lock_mode = DB_LOCK_WRITE; if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) return (ret); - if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) { /* Did not read it, so we can release the lock */ (void)__LPUT(dbc, lock); return (ret); @@ -164,7 +167,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) * eliminate any concurrency. A possible fix * would be to lock the last leaf page instead. 
*/ - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); (void)__TLPUT(dbc, lock); return (DB_NOTFOUND); } @@ -198,12 +201,14 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) *exactp = 0; if (!LF_ISSET(S_PAST_EOF) || recno > t_recno + 1) { + (void)__memp_fput(mpf, h, 0); + (void)__TLPUT(dbc, lock); ret = DB_NOTFOUND; goto err; } } - if (!B_DISSET( - GET_BKEYDATA(h, indx + deloffset)->type) && + if (!B_DISSET(GET_BKEYDATA(dbp, h, + indx + deloffset)->type) && ++t_recno == recno) break; } @@ -216,7 +221,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) return (0); case P_IBTREE: for (indx = 0, top = NUM_ENT(h);;) { - bi = GET_BINTERNAL(h, indx); + bi = GET_BINTERNAL(dbp, h, indx); if (++indx == top || total + bi->nrecs >= recno) break; total += bi->nrecs; @@ -235,7 +240,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) return (0); case P_IRECNO: for (indx = 0, top = NUM_ENT(h);;) { - ri = GET_RINTERNAL(h, indx); + ri = GET_RINTERNAL(dbp, h, indx); if (++indx == top || total + ri->nrecs >= recno) break; total += ri->nrecs; @@ -243,7 +248,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) pg = ri->pgno; break; default: - return (__db_pgfmt(dbp, h->pgno)); + return (__db_pgfmt(dbp->dbenv, h->pgno)); } --indx; @@ -276,12 +281,12 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) (h->level - 1) == LEAFLEVEL) stack = 1; - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); lock_mode = stack && LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; if ((ret = __db_lget(dbc, - LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { /* * If we fail, discard the lock we held. 
This * is OK because this only happens when we are @@ -292,7 +297,7 @@ __bam_rsearch(dbc, recnop, flags, stop, exactp) } } - if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) goto err; } /* NOTREACHED */ @@ -315,12 +320,14 @@ __bam_adjust(dbc, adjust) { BTREE_CURSOR *cp; DB *dbp; + DB_MPOOLFILE *mpf; EPG *epg; PAGE *h; db_pgno_t root_pgno; int ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; root_pgno = cp->root; @@ -328,22 +335,27 @@ __bam_adjust(dbc, adjust) for (epg = cp->sp; epg <= cp->csp; ++epg) { h = epg->page; if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { - if (DB_LOGGING(dbc) && - (ret = __bam_cadjust_log(dbp->dbenv, - dbc->txn, &LSN(h), 0, dbp->log_fileid, - PGNO(h), &LSN(h), (u_int32_t)epg->indx, adjust, - PGNO(h) == root_pgno ? CAD_UPDATEROOT : 0)) != 0) - return (ret); + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_cadjust_log(dbp, dbc->txn, + &LSN(h), 0, PGNO(h), &LSN(h), + (u_int32_t)epg->indx, adjust, + PGNO(h) == root_pgno ? 
+ CAD_UPDATEROOT : 0)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(h)); if (TYPE(h) == P_IBTREE) - GET_BINTERNAL(h, epg->indx)->nrecs += adjust; + GET_BINTERNAL(dbp, h, epg->indx)->nrecs += + adjust; else - GET_RINTERNAL(h, epg->indx)->nrecs += adjust; + GET_RINTERNAL(dbp, h, epg->indx)->nrecs += + adjust; if (PGNO(h) == root_pgno) RE_NREC_ADJ(h, adjust); - if ((ret = memp_fset(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + if ((ret = __memp_fset(mpf, h, DB_MPOOL_DIRTY)) != 0) return (ret); } } @@ -363,21 +375,23 @@ __bam_nrecs(dbc, rep) { DB *dbp; DB_LOCK lock; + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t pgno; int ret; dbp = dbc->dbp; + mpf = dbp->mpf; pgno = dbc->internal->root; if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) return (ret); - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) return (ret); *rep = RE_NREC(h); - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); (void)__TLPUT(dbc, lock); return (0); @@ -387,10 +401,11 @@ __bam_nrecs(dbc, rep) * __bam_total -- * Return the number of records below a page. * - * PUBLIC: db_recno_t __bam_total __P((PAGE *)); + * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *)); */ db_recno_t -__bam_total(h) +__bam_total(dbp, h) + DB *dbp; PAGE *h; { db_recno_t nrecs; @@ -403,25 +418,26 @@ __bam_total(h) case P_LBTREE: /* Check for logically deleted records. */ for (indx = 0; indx < top; indx += P_INDX) - if (!B_DISSET(GET_BKEYDATA(h, indx + O_INDX)->type)) + if (!B_DISSET( + GET_BKEYDATA(dbp, h, indx + O_INDX)->type)) ++nrecs; break; case P_LDUP: /* Check for logically deleted records. 
*/ for (indx = 0; indx < top; indx += O_INDX) - if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) + if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) ++nrecs; break; case P_IBTREE: for (indx = 0; indx < top; indx += O_INDX) - nrecs += GET_BINTERNAL(h, indx)->nrecs; + nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs; break; case P_LRECNO: nrecs = NUM_ENT(h); break; case P_IRECNO: for (indx = 0; indx < top; indx += O_INDX) - nrecs += GET_RINTERNAL(h, indx)->nrecs; + nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs; break; } diff --git a/db/btree/bt_search.c b/db/btree/bt_search.c index d822198f2..dc35c7c68 100644 --- a/db/btree/bt_search.c +++ b/db/btree/bt_search.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. */ /* @@ -43,7 +43,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_search.c,v 11.32 2001/01/17 20:19:46 bostic Exp $"; +static const char revid[] = "$Id: bt_search.c,v 11.47 2003/06/30 17:19:35 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -53,21 +53,23 @@ static const char revid[] = "$Id: bt_search.c,v 11.32 2001/01/17 20:19:46 bostic #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "btree.h" -#include "lock.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" /* * __bam_search -- * Search a btree for a key. 
* - * PUBLIC: int __bam_search __P((DBC *, + * PUBLIC: int __bam_search __P((DBC *, db_pgno_t, * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); */ int -__bam_search(dbc, key, flags, stop, recnop, exactp) +__bam_search(dbc, root_pgno, key, flags, stop, recnop, exactp) DBC *dbc; + db_pgno_t root_pgno; const DBT *key; u_int32_t flags; int stop, *exactp; @@ -77,8 +79,9 @@ __bam_search(dbc, key, flags, stop, recnop, exactp) BTREE_CURSOR *cp; DB *dbp; DB_LOCK lock; + DB_MPOOLFILE *mpf; PAGE *h; - db_indx_t base, i, indx, lim; + db_indx_t base, i, indx, *inp, lim; db_lockmode_t lock_mode; db_pgno_t pg; db_recno_t recno; @@ -86,6 +89,7 @@ __bam_search(dbc, key, flags, stop, recnop, exactp) int (*func) __P((DB *, const DBT *, const DBT *)); dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; t = dbp->bt_internal; recno = 0; @@ -109,12 +113,12 @@ __bam_search(dbc, key, flags, stop, recnop, exactp) * Retrieve the root page. */ try_again: - pg = cp->root; + pg = root_pgno == PGNO_INVALID ? cp->root : root_pgno; stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM); lock_mode = stack ? 
DB_LOCK_WRITE : DB_LOCK_READ; if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) return (ret); - if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) { /* Did not read it, so we can release the lock */ (void)__LPUT(dbc, lock); return (ret); @@ -131,21 +135,21 @@ try_again: if (!stack && ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); (void)__LPUT(dbc, lock); lock_mode = DB_LOCK_WRITE; if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) return (ret); - if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) { + if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) { /* Did not read it, so we can release the lock */ (void)__LPUT(dbc, lock); return (ret); } - if (!((LF_ISSET(S_PARENT) - && (u_int8_t)(stop + 1) >= h->level) || + if (!((LF_ISSET(S_PARENT) && + (u_int8_t)(stop + 1) >= h->level) || (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { /* Someone else split the root, start over. */ - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); (void)__LPUT(dbc, lock); goto try_again; } @@ -158,6 +162,7 @@ try_again: t->bt_compare; for (;;) { + inp = P_INP(dbp, h); /* * Do a binary search on the current page. 
If we're searching * a Btree leaf page, we have to walk the indices in groups of @@ -199,7 +204,7 @@ try_again: if (LF_ISSET(S_STK_ONLY)) { BT_STK_NUM(dbp->dbenv, cp, h, base, ret); __LPUT(dbc, lock); - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); return (ret); } @@ -232,21 +237,21 @@ try_again: */ next: if (recnop != NULL) for (i = 0; i < indx; ++i) - recno += GET_BINTERNAL(h, i)->nrecs; + recno += GET_BINTERNAL(dbp, h, i)->nrecs; - pg = GET_BINTERNAL(h, indx)->pgno; + pg = GET_BINTERNAL(dbp, h, indx)->pgno; if (LF_ISSET(S_STK_ONLY)) { if (stop == h->level) { BT_STK_NUM(dbp->dbenv, cp, h, indx, ret); __LPUT(dbc, lock); - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); return (ret); } BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret); - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); if ((ret = __db_lget(dbc, - LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { /* * Discard our lock and return on failure. This * is OK because it only happens when descending @@ -284,12 +289,12 @@ next: if (recnop != NULL) (h->level - 1) == LEAFLEVEL) stack = 1; - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); lock_mode = stack && LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; if ((ret = __db_lget(dbc, - LCK_COUPLE, pg, lock_mode, 0, &lock)) != 0) { + LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { /* * If we fail, discard the lock we held. This * is OK because this only happens when we are @@ -299,21 +304,13 @@ next: if (recnop != NULL) goto err; } } - if ((ret = memp_fget(dbp->mpf, &pg, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &pg, 0, &h)) != 0) goto err; } /* NOTREACHED */ found: *exactp = 1; - /* - * If we're trying to calculate the record number, add in the - * offset on this page and correct for the fact that records - * in the tree are 0-based. 
- */ - if (recnop != NULL) - *recnop = recno + (indx / P_INDX) + 1; - /* * If we got here, we know that we have a Btree leaf or off-page * duplicates page. If it's a Btree leaf page, we have to handle @@ -327,11 +324,11 @@ found: *exactp = 1; if (TYPE(h) == P_LBTREE) { if (LF_ISSET(S_DUPLAST)) while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && - h->inp[indx] == h->inp[indx + P_INDX]) + inp[indx] == inp[indx + P_INDX]) indx += P_INDX; else while (indx > 0 && - h->inp[indx] == h->inp[indx - P_INDX]) + inp[indx] == inp[indx - P_INDX]) indx -= P_INDX; } @@ -341,32 +338,51 @@ found: *exactp = 1; * not move from the original found key on the basis of the S_DELNO * flag.) */ + DB_ASSERT(recnop == NULL || LF_ISSET(S_DELNO)); if (LF_ISSET(S_DELNO)) { deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0; if (LF_ISSET(S_DUPLAST)) - while (B_DISSET(GET_BKEYDATA( + while (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type) && indx > 0 && - h->inp[indx] == h->inp[indx - adjust]) + inp[indx] == inp[indx - adjust]) indx -= adjust; else - while (B_DISSET(GET_BKEYDATA( + while (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type) && indx < (db_indx_t)(NUM_ENT(h) - adjust) && - h->inp[indx] == h->inp[indx + adjust]) + inp[indx] == inp[indx + adjust]) indx += adjust; /* * If we weren't able to find a non-deleted duplicate, return * DB_NOTFOUND. */ - if (B_DISSET(GET_BKEYDATA(h, indx + deloffset)->type)) + if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) goto notfound; + + /* + * Increment the record counter to point to the found element. + * Ignore any deleted key/data pairs. There doesn't need to + * be any correction for duplicates, as Btree doesn't support + * duplicates and record numbers in the same tree. + */ + if (recnop != NULL) { + DB_ASSERT(TYPE(h) == P_LBTREE); + + for (i = 0; i < indx; i += P_INDX) + if (!B_DISSET( + GET_BKEYDATA(dbp, h, i + O_INDX)->type)) + ++recno; + + /* Correct the number for a 0-base. 
*/ + *recnop = recno + 1; + } } if (LF_ISSET(S_STK_ONLY)) { BT_STK_NUM(dbp->dbenv, cp, h, indx, ret); __LPUT(dbc, lock); - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); } else { BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret); if (ret != 0) @@ -376,7 +392,7 @@ found: *exactp = 1; notfound: /* Keep the page locked for serializability. */ - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); (void)__TLPUT(dbc, lock); ret = DB_NOTFOUND; @@ -398,10 +414,12 @@ __bam_stkrel(dbc, flags) { BTREE_CURSOR *cp; DB *dbp; + DB_MPOOLFILE *mpf; EPG *epg; int ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; cp = (BTREE_CURSOR *)dbc->internal; /* @@ -414,10 +432,10 @@ __bam_stkrel(dbc, flags) if (epg->page != NULL) { if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) { cp->page = NULL; - cp->lock.off = LOCK_INVALID; + LOCK_INIT(cp->lock); } - if ((t_ret = memp_fput( - dbp->mpf, epg->page, 0)) != 0 && ret == 0) + if ((t_ret = + __memp_fput(mpf, epg->page, 0)) != 0 && ret == 0) ret = t_ret; /* * XXX @@ -428,12 +446,10 @@ __bam_stkrel(dbc, flags) */ epg->page = NULL; } - if (epg->lock.off != LOCK_INVALID) { - if (LF_ISSET(STK_NOLOCK)) - (void)__LPUT(dbc, epg->lock); - else - (void)__TLPUT(dbc, epg->lock); - } + if (LF_ISSET(STK_NOLOCK)) + (void)__LPUT(dbc, epg->lock); + else + (void)__TLPUT(dbc, epg->lock); } /* Clear the stack, all pages have been released. */ @@ -463,7 +479,7 @@ __bam_stkgrow(dbenv, cp) return (ret); memcpy(p, cp->sp, entries * sizeof(EPG)); if (cp->sp != cp->stack) - __os_free(cp->sp, entries * sizeof(EPG)); + __os_free(dbenv, cp->sp); cp->sp = p; cp->csp = p + entries; cp->esp = p + entries * 2; diff --git a/db/btree/bt_split.c b/db/btree/bt_split.c index f76337b19..8c5066aed 100644 --- a/db/btree/bt_split.c +++ b/db/btree/bt_split.c @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ /* @@ -40,7 +40,7 @@ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_split.c,v 11.31 2000/12/22 19:08:27 bostic Exp $"; +static const char revid[] = "$Id: bt_split.c,v 11.60 2003/06/30 17:19:35 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -51,10 +51,11 @@ static const char revid[] = "$Id: bt_split.c,v 11.31 2000/12/22 19:08:27 bostic #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "lock.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/btree.h" static int __bam_broot __P((DBC *, PAGE *, PAGE *, PAGE *)); static int __bam_page __P((DBC *, EPG *, EPG *)); @@ -67,21 +68,19 @@ static int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *)); * __bam_split -- * Split a page. * - * PUBLIC: int __bam_split __P((DBC *, void *)); + * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *)); */ int -__bam_split(dbc, arg) +__bam_split(dbc, arg, root_pgnop) DBC *dbc; void *arg; + db_pgno_t *root_pgnop; { - BTREE *t; BTREE_CURSOR *cp; - DB *dbp; enum { UP, DOWN } dir; db_pgno_t root_pgno; int exact, level, ret; - dbp = dbc->dbp; cp = (BTREE_CURSOR *)dbc->internal; root_pgno = cp->root; @@ -112,17 +111,20 @@ __bam_split(dbc, arg) * split. This would be an easy change for this code, but I have no * numbers that indicate it's worthwhile. */ - t = dbp->bt_internal; for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { /* * Acquire a page and its parent, locked. */ if ((ret = (dbc->dbtype == DB_BTREE ? - __bam_search(dbc, arg, S_WRPAIR, level, NULL, &exact) : + __bam_search(dbc, PGNO_INVALID, + arg, S_WRPAIR, level, NULL, &exact) : __bam_rsearch(dbc, (db_recno_t *)arg, S_WRPAIR, level, &exact))) != 0) return (ret); + if (root_pgnop != NULL) + *root_pgnop = cp->csp[0].page->pgno == root_pgno ? 
+ root_pgno : cp->csp[-1].page->pgno; /* * Split the page if it still needs it (it's possible another * thread of control has already split the page). If we are @@ -130,7 +132,7 @@ __bam_split(dbc, arg) * is no longer necessary. */ if (2 * B_MAXSIZEONPAGE(cp->ovflsize) - <= (db_indx_t)P_FREESPACE(cp->csp[0].page)) { + <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) { __bam_stkrel(dbc, STK_NOLOCK); return (0); } @@ -178,12 +180,14 @@ __bam_root(dbc, cp) DB *dbp; DBT log_dbt; DB_LSN log_lsn; + DB_MPOOLFILE *mpf; PAGE *lp, *rp; db_indx_t split; u_int32_t opflags; int ret; dbp = dbc->dbp; + mpf = dbp->mpf; /* Yeah, right. */ if (cp->page->level >= MAXBTREELEVEL) { @@ -210,21 +214,22 @@ __bam_root(dbc, cp) goto err; /* Log the change. */ - if (DB_LOGGING(dbc)) { + if (DBC_LOGGING(dbc)) { memset(&log_dbt, 0, sizeof(log_dbt)); log_dbt.data = cp->page; log_dbt.size = dbp->pgsize; ZERO_LSN(log_lsn); opflags = F_ISSET( (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0; - if ((ret = __bam_split_log(dbp->dbenv, dbc->txn, - &LSN(cp->page), 0, dbp->log_fileid, PGNO(lp), &LSN(lp), - PGNO(rp), &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &log_lsn, + if ((ret = __bam_split_log(dbp, + dbc->txn, &LSN(cp->page), 0, PGNO(lp), &LSN(lp), PGNO(rp), + &LSN(rp), (u_int32_t)NUM_ENT(lp), 0, &log_lsn, dbc->internal->root, &log_dbt, opflags)) != 0) goto err; - LSN(lp) = LSN(cp->page); - LSN(rp) = LSN(cp->page); - } + } else + LSN_NOT_LOGGED(LSN(cp->page)); + LSN(lp) = LSN(cp->page); + LSN(rp) = LSN(cp->page); /* Clean up the new root page. */ if ((ret = (dbc->dbtype == DB_RECNO ? @@ -238,18 +243,18 @@ __bam_root(dbc, cp) goto err; /* Success -- write the real pages back to the store. 
*/ - (void)memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY); + (void)__memp_fput(mpf, cp->page, DB_MPOOL_DIRTY); (void)__TLPUT(dbc, cp->lock); - (void)memp_fput(dbp->mpf, lp, DB_MPOOL_DIRTY); - (void)memp_fput(dbp->mpf, rp, DB_MPOOL_DIRTY); + (void)__memp_fput(mpf, lp, DB_MPOOL_DIRTY); + (void)__memp_fput(mpf, rp, DB_MPOOL_DIRTY); return (0); err: if (lp != NULL) - (void)__db_free(dbc, lp); + (void)__memp_fput(mpf, lp, 0); if (rp != NULL) - (void)__db_free(dbc, rp); - (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__memp_fput(mpf, rp, 0); + (void)__memp_fput(mpf, cp->page, 0); (void)__TLPUT(dbc, cp->lock); return (ret); } @@ -267,7 +272,8 @@ __bam_page(dbc, pp, cp) DBT log_dbt; DB_LSN log_lsn; DB *dbp; - DB_LOCK tplock; + DB_LOCK rplock, tplock; + DB_MPOOLFILE *mpf; DB_LSN save_lsn; PAGE *lp, *rp, *alloc_rp, *tp; db_indx_t split; @@ -275,8 +281,10 @@ __bam_page(dbc, pp, cp) int ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; alloc_rp = lp = rp = tp = NULL; - tplock.off = LOCK_INVALID; + LOCK_INIT(rplock); + LOCK_INIT(tplock); ret = -1; /* @@ -296,7 +304,7 @@ __bam_page(dbc, pp, cp) * up the tree badly, because we've violated the rule of always locking * down the tree, and never up. */ - if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &rp)) != 0) + if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &rp)) != 0) goto err; P_INIT(rp, dbp->pgsize, 0, ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page), @@ -307,7 +315,7 @@ __bam_page(dbc, pp, cp) * Create new left page for the split, and fill in everything * except its LSN and next-page page number. */ - if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &lp)) != 0) + if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, &lp)) != 0) goto err; P_INIT(lp, dbp->pgsize, PGNO(cp->page), ISINTERNAL(cp->page) ? 
PGNO_INVALID : PREV_PGNO(cp->page), @@ -351,8 +359,7 @@ __bam_page(dbc, pp, cp) if ((ret = __db_lget(dbc, 0, NEXT_PGNO(cp->page), DB_LOCK_WRITE, 0, &tplock)) != 0) goto err; - if ((ret = - memp_fget(dbp->mpf, &NEXT_PGNO(cp->page), 0, &tp)) != 0) + if ((ret = __memp_fget(mpf, &NEXT_PGNO(cp->page), 0, &tp)) != 0) goto err; } @@ -363,6 +370,15 @@ __bam_page(dbc, pp, cp) if ((ret = __db_new(dbc, TYPE(cp->page), &alloc_rp)) != 0) goto err; + /* + * Lock the new page. We need to do this because someone + * could get here through bt_lpgno if this page was recently + * dealocated. They can't look at it before we commit. + */ + if ((ret = __db_lget(dbc, + 0, PGNO(alloc_rp), DB_LOCK_WRITE, 0, &rplock)) != 0) + goto err; + /* * Fix up the page numbers we didn't have before. We have to do this * before calling __bam_pinsert because it may copy a page number onto @@ -376,29 +392,30 @@ __bam_page(dbc, pp, cp) bc = (BTREE_CURSOR *)dbc->internal; /* Log the change. */ - if (DB_LOGGING(dbc)) { + if (DBC_LOGGING(dbc)) { memset(&log_dbt, 0, sizeof(log_dbt)); log_dbt.data = cp->page; log_dbt.size = dbp->pgsize; if (tp == NULL) ZERO_LSN(log_lsn); opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0; - if ((ret = __bam_split_log(dbp->dbenv, dbc->txn, - &LSN(cp->page), 0, dbp->log_fileid, PGNO(cp->page), - &LSN(cp->page), PGNO(alloc_rp), &LSN(alloc_rp), - (u_int32_t)NUM_ENT(lp), + if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0, + PGNO(cp->page), &LSN(cp->page), PGNO(alloc_rp), + &LSN(alloc_rp), (u_int32_t)NUM_ENT(lp), tp == NULL ? 0 : PGNO(tp), tp == NULL ? &log_lsn : &LSN(tp), - bc->root, &log_dbt, opflags)) != 0) + PGNO_INVALID, &log_dbt, opflags)) != 0) goto err; - /* Update the LSNs for all involved pages. */ - LSN(alloc_rp) = LSN(cp->page); - LSN(lp) = LSN(cp->page); - LSN(rp) = LSN(cp->page); - if (tp != NULL) - LSN(tp) = LSN(cp->page); - } + } else + LSN_NOT_LOGGED(LSN(cp->page)); + + /* Update the LSNs for all involved pages. 
*/ + LSN(alloc_rp) = LSN(cp->page); + LSN(lp) = LSN(cp->page); + LSN(rp) = LSN(cp->page); + if (tp != NULL) + LSN(tp) = LSN(cp->page); /* * Copy the left and right pages into place. There are two paths @@ -411,13 +428,13 @@ __bam_page(dbc, pp, cp) * do the copy. */ save_lsn = alloc_rp->lsn; - memcpy(alloc_rp, rp, LOFFSET(rp)); + memcpy(alloc_rp, rp, LOFFSET(dbp, rp)); memcpy((u_int8_t *)alloc_rp + HOFFSET(rp), (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp)); alloc_rp->lsn = save_lsn; save_lsn = cp->page->lsn; - memcpy(cp->page, lp, LOFFSET(lp)); + memcpy(cp->page, lp, LOFFSET(dbp, lp)); memcpy((u_int8_t *)cp->page + HOFFSET(lp), (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); cp->page->lsn = save_lsn; @@ -431,8 +448,8 @@ __bam_page(dbc, pp, cp) PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0) goto err; - __os_free(lp, dbp->pgsize); - __os_free(rp, dbp->pgsize); + __os_free(dbp->dbenv, lp); + __os_free(dbp->dbenv, rp); /* * Success -- write the real pages back to the store. As we never @@ -441,44 +458,45 @@ __bam_page(dbc, pp, cp) * modifying the page so it's not really necessary, but it's neater. 
*/ if ((t_ret = - memp_fput(dbp->mpf, alloc_rp, DB_MPOOL_DIRTY)) != 0 && ret == 0) + __memp_fput(mpf, alloc_rp, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; + (void)__TLPUT(dbc, rplock); if ((t_ret = - memp_fput(dbp->mpf, pp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) + __memp_fput(mpf, pp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; (void)__TLPUT(dbc, pp->lock); if ((t_ret = - memp_fput(dbp->mpf, cp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) + __memp_fput(mpf, cp->page, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; (void)__TLPUT(dbc, cp->lock); if (tp != NULL) { if ((t_ret = - memp_fput(dbp->mpf, tp, DB_MPOOL_DIRTY)) != 0 && ret == 0) + __memp_fput(mpf, tp, DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; (void)__TLPUT(dbc, tplock); } return (ret); err: if (lp != NULL) - __os_free(lp, dbp->pgsize); + __os_free(dbp->dbenv, lp); if (rp != NULL) - __os_free(rp, dbp->pgsize); + __os_free(dbp->dbenv, rp); if (alloc_rp != NULL) - (void)__db_free(dbc, alloc_rp); - + (void)__memp_fput(mpf, alloc_rp, 0); if (tp != NULL) - (void)memp_fput(dbp->mpf, tp, 0); - if (tplock.off != LOCK_INVALID) - /* We never updated the next page, we can release it. */ - (void)__LPUT(dbc, tplock); + (void)__memp_fput(mpf, tp, 0); + + /* We never updated the new or next pages, we can release them. 
*/ + (void)__LPUT(dbc, rplock); + (void)__LPUT(dbc, tplock); - (void)memp_fput(dbp->mpf, pp->page, 0); + (void)__memp_fput(mpf, pp->page, 0); if (ret == DB_NEEDSPLIT) (void)__LPUT(dbc, pp->lock); else (void)__TLPUT(dbc, pp->lock); - (void)memp_fput(dbp->mpf, cp->page, 0); + (void)__memp_fput(mpf, cp->page, 0); if (ret == DB_NEEDSPLIT) (void)__LPUT(dbc, cp->lock); else @@ -529,7 +547,7 @@ __bam_broot(dbc, rootp, lp, rp) B_TSET(bi.type, B_KEYDATA, 0); bi.pgno = lp->pgno; if (F_ISSET(cp, C_RECNUM)) { - bi.nrecs = __bam_total(lp); + bi.nrecs = __bam_total(dbp, lp); RE_NREC_SET(rootp, bi.nrecs); } hdr.data = &bi; @@ -541,13 +559,13 @@ __bam_broot(dbc, rootp, lp, rp) switch (TYPE(rp)) { case P_IBTREE: /* Copy the first key of the child page onto the root page. */ - child_bi = GET_BINTERNAL(rp, 0); + child_bi = GET_BINTERNAL(dbp, rp, 0); bi.len = child_bi->len; B_TSET(bi.type, child_bi->type, 0); bi.pgno = rp->pgno; if (F_ISSET(cp, C_RECNUM)) { - bi.nrecs = __bam_total(rp); + bi.nrecs = __bam_total(dbp, rp); RE_NREC_ADJ(rootp, bi.nrecs); } hdr.data = &bi; @@ -567,14 +585,14 @@ __bam_broot(dbc, rootp, lp, rp) case P_LDUP: case P_LBTREE: /* Copy the first key of the child page onto the root page. 
*/ - child_bk = GET_BKEYDATA(rp, 0); + child_bk = GET_BKEYDATA(dbp, rp, 0); switch (B_TYPE(child_bk->type)) { case B_KEYDATA: bi.len = child_bk->len; B_TSET(bi.type, child_bk->type, 0); bi.pgno = rp->pgno; if (F_ISSET(cp, C_RECNUM)) { - bi.nrecs = __bam_total(rp); + bi.nrecs = __bam_total(dbp, rp); RE_NREC_ADJ(rootp, bi.nrecs); } hdr.data = &bi; @@ -591,7 +609,7 @@ __bam_broot(dbc, rootp, lp, rp) B_TSET(bi.type, child_bk->type, 0); bi.pgno = rp->pgno; if (F_ISSET(cp, C_RECNUM)) { - bi.nrecs = __bam_total(rp); + bi.nrecs = __bam_total(dbp, rp); RE_NREC_ADJ(rootp, bi.nrecs); } hdr.data = &bi; @@ -609,11 +627,11 @@ __bam_broot(dbc, rootp, lp, rp) return (ret); break; default: - return (__db_pgfmt(dbp, rp->pgno)); + return (__db_pgfmt(dbp->dbenv, rp->pgno)); } break; default: - return (__db_pgfmt(dbp, rp->pgno)); + return (__db_pgfmt(dbp->dbenv, rp->pgno)); } return (0); } @@ -647,12 +665,12 @@ __ram_root(dbc, rootp, lp, rp) /* Insert the left and right keys, set the header information. */ ri.pgno = lp->pgno; - ri.nrecs = __bam_total(lp); + ri.nrecs = __bam_total(dbp, lp); if ((ret = __db_pitem(dbc, rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); RE_NREC_SET(rootp, ri.nrecs); ri.pgno = rp->pgno; - ri.nrecs = __bam_total(rp); + ri.nrecs = __bam_total(dbp, rp); if ((ret = __db_pitem(dbc, rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) return (ret); RE_NREC_ADJ(rootp, ri.nrecs); @@ -690,7 +708,8 @@ __bam_pinsert(dbc, parent, lchild, rchild, space_check) ppage = parent->page; /* If handling record numbers, count records split to the right page. */ - nrecs = F_ISSET(cp, C_RECNUM) && !space_check ? __bam_total(rchild) : 0; + nrecs = F_ISSET(cp, C_RECNUM) && + !space_check ? 
__bam_total(dbp, rchild) : 0; /* * Now we insert the new page's first key into the parent page, which @@ -721,10 +740,10 @@ __bam_pinsert(dbc, parent, lchild, rchild, space_check) */ switch (TYPE(rchild)) { case P_IBTREE: - child_bi = GET_BINTERNAL(rchild, 0); + child_bi = GET_BINTERNAL(dbp, rchild, 0); nbytes = BINTERNAL_PSIZE(child_bi->len); - if (P_FREESPACE(ppage) < nbytes) + if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); @@ -753,7 +772,7 @@ __bam_pinsert(dbc, parent, lchild, rchild, space_check) break; case P_LDUP: case P_LBTREE: - child_bk = GET_BKEYDATA(rchild, 0); + child_bk = GET_BKEYDATA(dbp, rchild, 0); switch (B_TYPE(child_bk->type)) { case B_KEYDATA: /* @@ -783,7 +802,7 @@ __bam_pinsert(dbc, parent, lchild, rchild, space_check) goto noprefix; if (ppage->prev_pgno == PGNO_INVALID && off <= 1) goto noprefix; - tmp_bk = GET_BKEYDATA(lchild, NUM_ENT(lchild) - + tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) - (TYPE(lchild) == P_LDUP ? 
O_INDX : P_INDX)); if (B_TYPE(tmp_bk->type) != B_KEYDATA) goto noprefix; @@ -793,13 +812,13 @@ __bam_pinsert(dbc, parent, lchild, rchild, space_check) memset(&b, 0, sizeof(b)); b.size = child_bk->len; b.data = child_bk->data; - nksize = func(dbp, &a, &b); + nksize = (u_int32_t)func(dbp, &a, &b); if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) nbytes = n; else noprefix: nksize = child_bk->len; - if (P_FREESPACE(ppage) < nbytes) + if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); @@ -823,7 +842,7 @@ noprefix: nksize = child_bk->len; case B_OVERFLOW: nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); - if (P_FREESPACE(ppage) < nbytes) + if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); @@ -850,14 +869,14 @@ noprefix: nksize = child_bk->len; return (ret); break; default: - return (__db_pgfmt(dbp, rchild->pgno)); + return (__db_pgfmt(dbp->dbenv, rchild->pgno)); } break; case P_IRECNO: case P_LRECNO: nbytes = RINTERNAL_PSIZE; - if (P_FREESPACE(ppage) < nbytes) + if (P_FREESPACE(dbp, ppage) < nbytes) return (DB_NEEDSPLIT); if (space_check) return (0); @@ -873,7 +892,7 @@ noprefix: nksize = child_bk->len; return (ret); break; default: - return (__db_pgfmt(dbp, rchild->pgno)); + return (__db_pgfmt(dbp->dbenv, rchild->pgno)); } /* @@ -882,17 +901,19 @@ noprefix: nksize = child_bk->len; */ if (F_ISSET(cp, C_RECNUM)) { /* Log the change. */ - if (DB_LOGGING(dbc) && - (ret = __bam_cadjust_log(dbp->dbenv, dbc->txn, - &LSN(ppage), 0, dbp->log_fileid, PGNO(ppage), + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_cadjust_log(dbp, dbc->txn, + &LSN(ppage), 0, PGNO(ppage), &LSN(ppage), parent->indx, -(int32_t)nrecs, 0)) != 0) return (ret); + } else + LSN_NOT_LOGGED(LSN(ppage)); /* Update the left page count. 
*/ if (dbc->dbtype == DB_RECNO) - GET_RINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs; else - GET_BINTERNAL(ppage, parent->indx)->nrecs -= nrecs; + GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs; } return (0); @@ -911,28 +932,52 @@ __bam_psplit(dbc, cp, lp, rp, splitret) { DB *dbp; PAGE *pp; - db_indx_t half, nbytes, off, splitp, top; + db_indx_t half, *inp, nbytes, off, splitp, top; int adjust, cnt, iflag, isbigkey, ret; dbp = dbc->dbp; pp = cp->page; + inp = P_INP(dbp, pp); adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; /* * If we're splitting the first (last) page on a level because we're * inserting (appending) a key to it, it's likely that the data is * sorted. Moving a single item to the new page is less work and can - * push the fill factor higher than normal. If we're wrong it's not - * a big deal, we'll just do the split the right way next time. + * push the fill factor higher than normal. This is trivial when we + * are splitting a new page before the beginning of the tree, all of + * the interesting tests are against values of 0. + * + * Catching appends to the tree is harder. In a simple append, we're + * inserting an item that sorts past the end of the tree; the cursor + * will point past the last element on the page. But, in trees with + * duplicates, the cursor may point to the last entry on the page -- + * in this case, the entry will also be the last element of a duplicate + * set (the last because the search call specified the S_DUPLAST flag). + * The only way to differentiate between an insert immediately before + * the last item in a tree or an append after a duplicate set which is + * also the last item in the tree is to call the comparison function. + * When splitting internal pages during an append, the search code + * guarantees the cursor always points to the largest page item less + * than the new internal entry. 
To summarize, we want to catch three + * possible index values: + * + * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree + * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT + * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set + * of duplicates + * + * two of which, (NUM_ENT(page) - O_INDX or P_INDX) might be an insert + * near the end of the tree, and not after the end of the tree at all. + * Do a simple test which might be wrong because calling the comparison + * functions is expensive. Regardless, it's not a big deal if we're + * wrong, we'll do the split the right way next time. */ off = 0; - if (NEXT_PGNO(pp) == PGNO_INVALID && - ((ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page) - 1) || - (!ISINTERNAL(pp) && cp->indx == NUM_ENT(cp->page)))) - off = NUM_ENT(cp->page) - adjust; + if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust) + off = NUM_ENT(pp) - adjust; else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) off = adjust; - if (off != 0) goto sort; @@ -962,16 +1007,18 @@ __bam_psplit(dbc, cp, lp, rp, splitret) for (nbytes = 0, off = 0; off < top && nbytes < half; ++off) switch (TYPE(pp)) { case P_IBTREE: - if (B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA) - nbytes += - BINTERNAL_SIZE(GET_BINTERNAL(pp, off)->len); + if (B_TYPE( + GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA) + nbytes += BINTERNAL_SIZE( + GET_BINTERNAL(dbp, pp, off)->len); else nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); break; case P_LBTREE: - if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) - nbytes += - BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA) + nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, + pp, off)->len); else nbytes += BOVERFLOW_SIZE; @@ -979,9 +1026,10 @@ __bam_psplit(dbc, cp, lp, rp, splitret) /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - if (B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) - nbytes += - BKEYDATA_SIZE(GET_BKEYDATA(pp, off)->len); + if 
(B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA) + nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, + pp, off)->len); else nbytes += BOVERFLOW_SIZE; break; @@ -989,7 +1037,7 @@ __bam_psplit(dbc, cp, lp, rp, splitret) nbytes += RINTERNAL_SIZE; break; default: - return (__db_pgfmt(dbp, pp->pgno)); + return (__db_pgfmt(dbp->dbenv, pp->pgno)); } sort: splitp = off; @@ -1002,12 +1050,14 @@ sort: splitp = off; switch (TYPE(pp)) { case P_IBTREE: iflag = 1; - isbigkey = B_TYPE(GET_BINTERNAL(pp, off)->type) != B_KEYDATA; + isbigkey = + B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA; break; case P_LBTREE: case P_LDUP: iflag = 0; - isbigkey = B_TYPE(GET_BKEYDATA(pp, off)->type) != B_KEYDATA; + isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) != + B_KEYDATA; break; default: iflag = isbigkey = 0; @@ -1016,18 +1066,20 @@ sort: splitp = off; for (cnt = 1; cnt <= 3; ++cnt) { off = splitp + cnt * adjust; if (off < (db_indx_t)NUM_ENT(pp) && - ((iflag && - B_TYPE(GET_BINTERNAL(pp,off)->type) == B_KEYDATA) || - B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA)) { + ((iflag && B_TYPE( + GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) || + B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA)) { splitp = off; break; } if (splitp <= (db_indx_t)(cnt * adjust)) continue; off = splitp - cnt * adjust; - if (iflag ? - B_TYPE(GET_BINTERNAL(pp, off)->type) == B_KEYDATA : - B_TYPE(GET_BKEYDATA(pp, off)->type) == B_KEYDATA) { + if (iflag ? B_TYPE( + GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA : + B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA) { splitp = off; break; } @@ -1040,18 +1092,18 @@ sort: splitp = off; * page set. So, this loop can't be unbounded. 
*/ if (TYPE(pp) == P_LBTREE && - pp->inp[splitp] == pp->inp[splitp - adjust]) + inp[splitp] == inp[splitp - adjust]) for (cnt = 1;; ++cnt) { off = splitp + cnt * adjust; if (off < NUM_ENT(pp) && - pp->inp[splitp] != pp->inp[off]) { + inp[splitp] != inp[off]) { splitp = off; break; } if (splitp <= (db_indx_t)(cnt * adjust)) continue; off = splitp - cnt * adjust; - if (pp->inp[splitp] != pp->inp[off]) { + if (inp[splitp] != inp[off]) { splitp = off + adjust; break; } @@ -1079,18 +1131,20 @@ __bam_copy(dbp, pp, cp, nxt, stop) PAGE *pp, *cp; u_int32_t nxt, stop; { - db_indx_t nbytes, off; + db_indx_t *cinp, nbytes, off, *pinp; + cinp = P_INP(dbp, cp); + pinp = P_INP(dbp, pp); /* - * Copy the rest of the data to the right page. Nxt is the next - * offset placed on the target page. + * Nxt is the offset of the next record to be placed on the target page. */ for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { switch (TYPE(pp)) { case P_IBTREE: - if (B_TYPE(GET_BINTERNAL(pp, nxt)->type) == B_KEYDATA) - nbytes = - BINTERNAL_SIZE(GET_BINTERNAL(pp, nxt)->len); + if (B_TYPE( + GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA) + nbytes = BINTERNAL_SIZE( + GET_BINTERNAL(dbp, pp, nxt)->len); else nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); break; @@ -1100,16 +1154,17 @@ __bam_copy(dbp, pp, cp, nxt, stop) * the offset. 
*/ if (off != 0 && (nxt % P_INDX) == 0 && - pp->inp[nxt] == pp->inp[nxt - P_INDX]) { - cp->inp[off] = cp->inp[off - P_INDX]; + pinp[nxt] == pinp[nxt - P_INDX]) { + cinp[off] = cinp[off - P_INDX]; continue; } /* FALLTHROUGH */ case P_LDUP: case P_LRECNO: - if (B_TYPE(GET_BKEYDATA(pp, nxt)->type) == B_KEYDATA) - nbytes = - BKEYDATA_SIZE(GET_BKEYDATA(pp, nxt)->len); + if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) == + B_KEYDATA) + nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp, + pp, nxt)->len); else nbytes = BOVERFLOW_SIZE; break; @@ -1117,10 +1172,10 @@ __bam_copy(dbp, pp, cp, nxt, stop) nbytes = RINTERNAL_SIZE; break; default: - return (__db_pgfmt(dbp, pp->pgno)); + return (__db_pgfmt(dbp->dbenv, pp->pgno)); } - cp->inp[off] = HOFFSET(cp) -= nbytes; - memcpy(P_ENTRY(cp, off), P_ENTRY(pp, nxt), nbytes); + cinp[off] = HOFFSET(cp) -= nbytes; + memcpy(P_ENTRY(dbp, cp, off), P_ENTRY(dbp, pp, nxt), nbytes); } return (0); } diff --git a/db/btree/bt_stat.c b/db/btree/bt_stat.c index 349bb40cf..0e8cff37f 100644 --- a/db/btree/bt_stat.c +++ b/db/btree/bt_stat.c @@ -1,14 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_stat.c,v 11.29 2000/11/28 21:42:27 bostic Exp $"; +static const char revid[] = "$Id: bt_stat.c,v 11.61 2003/09/13 18:52:21 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -18,102 +18,76 @@ static const char revid[] = "$Id: bt_stat.c,v 11.29 2000/11/28 21:42:27 bostic E #endif #include "db_int.h" -#include "db_page.h" -#include "db_shash.h" -#include "lock.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" /* * __bam_stat -- * Gather/print the btree statistics * - * PUBLIC: int __bam_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); + * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t)); */ int -__bam_stat(dbp, spp, db_malloc, flags) - DB *dbp; +__bam_stat(dbc, spp, flags) + DBC *dbc; void *spp; - void *(*db_malloc) __P((size_t)); u_int32_t flags; { BTMETA *meta; BTREE *t; BTREE_CURSOR *cp; - DBC *dbc; + DB *dbp; DB_BTREE_STAT *sp; + DB_ENV *dbenv; DB_LOCK lock, metalock; + DB_MPOOLFILE *mpf; PAGE *h; db_pgno_t pgno; - int ret, t_ret; + int ret, t_ret, write_meta; - PANIC_CHECK(dbp->dbenv); - DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat"); + dbp = dbc->dbp; + dbenv = dbp->dbenv; meta = NULL; t = dbp->bt_internal; sp = NULL; - metalock.off = lock.off = LOCK_INVALID; + LOCK_INIT(metalock); + LOCK_INIT(lock); + mpf = dbp->mpf; h = NULL; - ret = 0; - - /* Check for invalid flags. */ - if ((ret = __db_statchk(dbp, flags)) != 0) - return (ret); + ret = write_meta = 0; - /* Acquire a cursor. */ - if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) - return (ret); cp = (BTREE_CURSOR *)dbc->internal; - DEBUG_LWRITE(dbc, NULL, "bam_stat", NULL, NULL, flags); - /* Allocate and clear the structure. 
*/ - if ((ret = __os_malloc(dbp->dbenv, sizeof(*sp), db_malloc, &sp)) != 0) + if ((ret = __os_umalloc(dbenv, sizeof(*sp), &sp)) != 0) goto err; memset(sp, 0, sizeof(*sp)); - /* If the app just wants the record count, make it fast. */ - if (flags == DB_RECORDCOUNT) { - if ((ret = __db_lget(dbc, 0, - cp->root, DB_LOCK_READ, 0, &lock)) != 0) - goto err; - if ((ret = memp_fget(dbp->mpf, - &cp->root, 0, (PAGE **)&h)) != 0) - goto err; - - sp->bt_nkeys = RE_NREC(h); - - goto done; - } - if (flags == DB_CACHED_COUNTS) { - if ((ret = __db_lget(dbc, - 0, t->bt_meta, DB_LOCK_READ, 0, &lock)) != 0) - goto err; - if ((ret = - memp_fget(dbp->mpf, &t->bt_meta, 0, (PAGE **)&meta)) != 0) - goto err; - sp->bt_nkeys = meta->dbmeta.key_count; - sp->bt_ndata = meta->dbmeta.record_count; - - goto done; - } - /* Get the metadata page for the entire database. */ pgno = PGNO_BASE_MD; if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0) goto err; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, (PAGE **)&meta)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &meta)) != 0) goto err; + if (flags == DB_RECORDCOUNT || flags == DB_CACHED_COUNTS) + flags = DB_FAST_STAT; + if (flags == DB_FAST_STAT) + goto meta_only; + /* Walk the metadata free list, counting pages. */ for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) { ++sp->bt_free; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) goto err; pgno = h->next_pgno; - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = __memp_fput(mpf, h, 0)) != 0) goto err; h = NULL; } @@ -122,14 +96,14 @@ __bam_stat(dbp, spp, db_malloc, flags) pgno = cp->root; if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) goto err; - if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if ((ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) goto err; /* Get the levels from the root page. */ sp->bt_levels = h->level; /* Discard the root page. 
*/ - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = __memp_fput(mpf, h, 0)) != 0) goto err; h = NULL; __LPUT(dbc, lock); @@ -143,20 +117,36 @@ __bam_stat(dbp, spp, db_malloc, flags) * Get the subdatabase metadata page if it's not the same as the * one we already have. */ - if (t->bt_meta != PGNO_BASE_MD || !F_ISSET(dbp, DB_AM_RDONLY)) { - if ((ret = memp_fput(dbp->mpf, meta, 0)) != 0) + write_meta = !F_ISSET(dbp, DB_AM_RDONLY); +meta_only: + if (t->bt_meta != PGNO_BASE_MD || write_meta != 0) { + if ((ret = __memp_fput(mpf, meta, 0)) != 0) goto err; meta = NULL; __LPUT(dbc, metalock); if ((ret = __db_lget(dbc, - 0, t->bt_meta, F_ISSET(dbp, DB_AM_RDONLY) ? + 0, t->bt_meta, write_meta == 0 ? DB_LOCK_READ : DB_LOCK_WRITE, 0, &metalock)) != 0) goto err; - if ((ret = - memp_fget(dbp->mpf, &t->bt_meta, 0, (PAGE **)&meta)) != 0) + if ((ret = __memp_fget(mpf, &t->bt_meta, 0, &meta)) != 0) goto err; } + if (flags == DB_FAST_STAT) { + if (dbp->type == DB_RECNO || + (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) { + if ((ret = __db_lget(dbc, 0, + cp->root, DB_LOCK_READ, 0, &lock)) != 0) + goto err; + if ((ret = + __memp_fget(mpf, &cp->root, 0, (PAGE **)&h)) != 0) + goto err; + + sp->bt_nkeys = RE_NREC(h); + } else + sp->bt_nkeys = meta->dbmeta.key_count; + sp->bt_ndata = meta->dbmeta.record_count; + } /* Get metadata page statistics. */ sp->bt_metaflags = meta->dbmeta.flags; @@ -167,38 +157,29 @@ __bam_stat(dbp, spp, db_malloc, flags) sp->bt_pagesize = meta->dbmeta.pagesize; sp->bt_magic = meta->dbmeta.magic; sp->bt_version = meta->dbmeta.version; - if (!F_ISSET(dbp, DB_AM_RDONLY)) { + + if (write_meta != 0) { meta->dbmeta.key_count = sp->bt_nkeys; meta->dbmeta.record_count = sp->bt_ndata; } - /* Discard the metadata page. */ - if ((ret = memp_fput(dbp->mpf, - meta, F_ISSET(dbp, DB_AM_RDONLY) ? 
0 : DB_MPOOL_DIRTY)) != 0) - goto err; - meta = NULL; - __LPUT(dbc, metalock); - -done: *(DB_BTREE_STAT **)spp = sp; - - if (0) { -err: if (sp != NULL) - __os_free(sp, sizeof(*sp)); - } + *(DB_BTREE_STAT **)spp = sp; - if (h != NULL && - (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) +err: /* Discard the second page. */ + __LPUT(dbc, lock); + if (h != NULL && (t_ret = __memp_fput(mpf, h, 0)) != 0 && ret == 0) ret = t_ret; - if (meta != NULL && - (t_ret = memp_fput(dbp->mpf, meta, 0)) != 0 && ret == 0) + /* Discard the metadata page. */ + __LPUT(dbc, metalock); + if (meta != NULL && (t_ret = __memp_fput( + mpf, meta, write_meta == 0 ? 0 : DB_MPOOL_DIRTY)) != 0 && ret == 0) ret = t_ret; - if (lock.off != LOCK_INVALID) - __LPUT(dbc, lock); - - if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; + if (ret != 0 && sp != NULL) { + __os_ufree(dbenv, sp); + *(DB_BTREE_STAT **)spp = NULL; + } return (ret); } @@ -222,22 +203,27 @@ __bam_traverse(dbc, mode, root_pgno, callback, cookie) BKEYDATA *bk; DB *dbp; DB_LOCK lock; + DB_MPOOLFILE *mpf; PAGE *h; RINTERNAL *ri; db_indx_t indx; int already_put, ret, t_ret; dbp = dbc->dbp; + mpf = dbp->mpf; + already_put = 0; if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0) return (ret); - if ((ret = memp_fget(dbp->mpf, &root_pgno, 0, &h)) != 0) - goto err; + if ((ret = __memp_fget(mpf, &root_pgno, 0, &h)) != 0) { + __LPUT(dbc, lock); + return (ret); + } switch (TYPE(h)) { case P_IBTREE: for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { - bi = GET_BINTERNAL(h, indx); + bi = GET_BINTERNAL(dbp, h, indx); if (B_TYPE(bi->type) == B_OVERFLOW && (ret = __db_traverse_big(dbp, ((BOVERFLOW *)bi->data)->pgno, @@ -245,34 +231,34 @@ __bam_traverse(dbc, mode, root_pgno, callback, cookie) goto err; if ((ret = __bam_traverse( dbc, mode, bi->pgno, callback, cookie)) != 0) - break; + goto err; } break; case P_IRECNO: for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { - ri = GET_RINTERNAL(h, indx); + ri = 
GET_RINTERNAL(dbp, h, indx); if ((ret = __bam_traverse( dbc, mode, ri->pgno, callback, cookie)) != 0) - break; + goto err; } break; case P_LBTREE: for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) { - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_OVERFLOW && (ret = __db_traverse_big(dbp, - GET_BOVERFLOW(h, indx)->pgno, + GET_BOVERFLOW(dbp, h, indx)->pgno, callback, cookie)) != 0) goto err; - bk = GET_BKEYDATA(h, indx + O_INDX); + bk = GET_BKEYDATA(dbp, h, indx + O_INDX); if (B_TYPE(bk->type) == B_DUPLICATE && (ret = __bam_traverse(dbc, mode, - GET_BOVERFLOW(h, indx + O_INDX)->pgno, + GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno, callback, cookie)) != 0) goto err; if (B_TYPE(bk->type) == B_OVERFLOW && (ret = __db_traverse_big(dbp, - GET_BOVERFLOW(h, indx + O_INDX)->pgno, + GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno, callback, cookie)) != 0) goto err; } @@ -280,22 +266,21 @@ __bam_traverse(dbc, mode, root_pgno, callback, cookie) case P_LDUP: case P_LRECNO: for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_OVERFLOW && (ret = __db_traverse_big(dbp, - GET_BOVERFLOW(h, indx)->pgno, + GET_BOVERFLOW(dbp, h, indx)->pgno, callback, cookie)) != 0) goto err; } break; + default: + return (__db_pgfmt(dbp->dbenv, h->pgno)); } - already_put = 0; - if ((ret = callback(dbp, h, cookie, &already_put)) != 0) - goto err; + ret = callback(dbp, h, cookie, &already_put); -err: if (!already_put && - (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret != 0) +err: if (!already_put && (t_ret = __memp_fput(mpf, h, 0)) != 0 && ret != 0) ret = t_ret; __LPUT(dbc, lock); @@ -316,33 +301,40 @@ __bam_stat_callback(dbp, h, cookie, putp) int *putp; { DB_BTREE_STAT *sp; - db_indx_t indx, top; + db_indx_t indx, *inp, top; u_int8_t type; sp = cookie; *putp = 0; top = NUM_ENT(h); + inp = P_INP(dbp, h); switch (TYPE(h)) { case P_IBTREE: case P_IRECNO: ++sp->bt_int_pg; - 
sp->bt_int_pgfree += P_FREESPACE(h); + sp->bt_int_pgfree += P_FREESPACE(dbp, h); break; case P_LBTREE: /* Correct for on-page duplicates and deleted items. */ for (indx = 0; indx < top; indx += P_INDX) { + type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type; + /* Ignore deleted items. */ + if (B_DISSET(type)) + continue; + + /* Ignore duplicate keys. */ if (indx + P_INDX >= top || - h->inp[indx] != h->inp[indx + P_INDX]) + inp[indx] != inp[indx + P_INDX]) ++sp->bt_nkeys; - type = GET_BKEYDATA(h, indx + O_INDX)->type; - if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE) + /* Ignore off-page duplicates. */ + if (B_TYPE(type) != B_DUPLICATE) ++sp->bt_ndata; } ++sp->bt_leaf_pg; - sp->bt_leaf_pgfree += P_FREESPACE(h); + sp->bt_leaf_pgfree += P_FREESPACE(dbp, h); break; case P_LRECNO: /* @@ -356,39 +348,39 @@ __bam_stat_callback(dbp, h, cookie, putp) * Correct for deleted items in non-renumbering * Recno databases. */ - if (F_ISSET(dbp, DB_RE_RENUMBER)) + if (F_ISSET(dbp, DB_AM_RENUMBER)) sp->bt_ndata += top; else for (indx = 0; indx < top; indx += O_INDX) { - type = GET_BKEYDATA(h, indx)->type; + type = GET_BKEYDATA(dbp, h, indx)->type; if (!B_DISSET(type)) ++sp->bt_ndata; } ++sp->bt_leaf_pg; - sp->bt_leaf_pgfree += P_FREESPACE(h); + sp->bt_leaf_pgfree += P_FREESPACE(dbp, h); } else { sp->bt_ndata += top; ++sp->bt_dup_pg; - sp->bt_dup_pgfree += P_FREESPACE(h); + sp->bt_dup_pgfree += P_FREESPACE(dbp, h); } break; case P_LDUP: /* Correct for deleted items. 
*/ for (indx = 0; indx < top; indx += O_INDX) - if (!B_DISSET(GET_BKEYDATA(h, indx)->type)) + if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) ++sp->bt_ndata; ++sp->bt_dup_pg; - sp->bt_dup_pgfree += P_FREESPACE(h); + sp->bt_dup_pgfree += P_FREESPACE(dbp, h); break; case P_OVERFLOW: ++sp->bt_over_pg; - sp->bt_over_pgfree += P_OVFLSPACE(dbp->pgsize, h); + sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h); break; default: - return (__db_pgfmt(dbp, h->pgno)); + return (__db_pgfmt(dbp->dbenv, h->pgno)); } return (0); } @@ -398,38 +390,26 @@ __bam_stat_callback(dbp, h, cookie, putp) * Return proportion of keys relative to given key. The numbers are * slightly skewed due to on page duplicates. * - * PUBLIC: int __bam_key_range __P((DB *, - * PUBLIC: DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t)); + * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t)); */ int -__bam_key_range(dbp, txn, dbt, kp, flags) - DB *dbp; - DB_TXN *txn; +__bam_key_range(dbc, dbt, kp, flags) + DBC *dbc; DBT *dbt; DB_KEY_RANGE *kp; u_int32_t flags; { BTREE_CURSOR *cp; - DBC *dbc; EPG *sp; double factor; - int exact, ret, t_ret; + int exact, ret; - PANIC_CHECK(dbp->dbenv); - DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range"); + COMPQUIET(flags, 0); - if (flags != 0) - return (__db_ferr(dbp->dbenv, "DB->key_range", 0)); - - /* Acquire a cursor. 
*/ - if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0) + if ((ret = __bam_search(dbc, PGNO_INVALID, + dbt, S_STK_ONLY, 1, NULL, &exact)) != 0) return (ret); - DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0); - - if ((ret = __bam_search(dbc, dbt, S_STK_ONLY, 1, NULL, &exact)) != 0) - goto err; - cp = (BTREE_CURSOR *)dbc->internal; kp->less = kp->greater = 0.0; @@ -453,7 +433,7 @@ __bam_key_range(dbp, txn, dbt, kp, flags) else { kp->less += factor * sp->indx / sp->entries; kp->greater += factor * - (sp->entries - sp->indx - 1) / sp->entries; + ((sp->entries - sp->indx) - 1) / sp->entries; } factor *= 1.0/sp->entries; } @@ -473,8 +453,5 @@ __bam_key_range(dbp, txn, dbt, kp, flags) BT_STK_CLR(cp); -err: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) - ret = t_ret; - - return (ret); + return (0); } diff --git a/db/btree/bt_upgrade.c b/db/btree/bt_upgrade.c index 4032dba3b..71ee84222 100644 --- a/db/btree/bt_upgrade.c +++ b/db/btree/bt_upgrade.c @@ -1,28 +1,25 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_upgrade.c,v 11.19 2000/11/30 00:58:29 ubell Exp $"; +static const char revid[] = "$Id: bt_upgrade.c,v 11.29 2003/05/18 18:10:11 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include -#include #include #endif #include "db_int.h" -#include "db_page.h" -#include "db_swap.h" -#include "btree.h" -#include "db_am.h" -#include "db_upgrade.h" +#include "dbinc/db_page.h" +#include "dbinc/db_upgrade.h" +#include "dbinc/btree.h" /* * __bam_30_btreemeta -- @@ -107,7 +104,7 @@ __bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp) newmeta->minkey = oldmeta->minkey; newmeta->maxkey = oldmeta->maxkey; memmove(newmeta->dbmeta.uid, - oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid)); + oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid)); newmeta->dbmeta.flags = oldmeta->dbmeta.flags; newmeta->dbmeta.record_count = 0; newmeta->dbmeta.key_count = 0; @@ -126,7 +123,7 @@ __bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp) /* * __bam_31_lbtree -- - * Upgrade the database btree leaf pages. + * Upgrade the database btree leaf pages. * * PUBLIC: int __bam_31_lbtree * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); @@ -147,15 +144,15 @@ __bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp) ret = 0; for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) { - bk = GET_BKEYDATA(h, indx); + bk = GET_BKEYDATA(dbp, h, indx); if (B_TYPE(bk->type) == B_DUPLICATE) { - pgno = GET_BOVERFLOW(h, indx)->pgno; + pgno = GET_BOVERFLOW(dbp, h, indx)->pgno; if ((ret = __db_31_offdup(dbp, real_name, fhp, LF_ISSET(DB_DUPSORT) ? 
1 : 0, &pgno)) != 0) break; - if (pgno != GET_BOVERFLOW(h, indx)->pgno) { + if (pgno != GET_BOVERFLOW(dbp, h, indx)->pgno) { *dirtyp = 1; - GET_BOVERFLOW(h, indx)->pgno = pgno; + GET_BOVERFLOW(dbp, h, indx)->pgno = pgno; } } } diff --git a/db/btree/bt_verify.c b/db/btree/bt_verify.c index 9f8647e7e..cd8c57a4d 100644 --- a/db/btree/bt_verify.c +++ b/db/btree/bt_verify.c @@ -1,16 +1,16 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2000 + * Copyright (c) 1999-2003 * Sleepycat Software. All rights reserved. * - * $Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $ + * $Id: bt_verify.c,v 1.87 2003/10/06 14:09:23 bostic Exp $ */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell Exp $"; +static const char revid[] = "$Id: bt_verify.c,v 1.87 2003/10/06 14:09:23 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -20,9 +20,11 @@ static const char revid[] = "$Id: bt_verify.c,v 1.44 2000/12/06 19:55:44 ubell E #endif #include "db_int.h" -#include "db_page.h" -#include "db_verify.h" -#include "btree.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/db_verify.h" +#include "dbinc/btree.h" +#include "dbinc/mp.h" static int __bam_safe_getdata __P((DB *, PAGE *, u_int32_t, int, DBT *, int *)); static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, @@ -49,15 +51,17 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; VRFY_PAGEINFO *pip; int isbad, t_ret, ret; db_indx_t ovflsize; + dbenv = dbp->dbenv; + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); - isbad = 0; - /* * If VRFY_INCOMPLETE is not set, then we didn't come through * __db_vrfy_pagezero and didn't incompletely @@ -79,19 +83,19 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) /* avoid division by zero */ ovflsize = meta->minkey > 0 ? 
- B_MINKEY_TO_OVFLSIZE(meta->minkey, dbp->pgsize) : 0; + B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0; if (meta->minkey < 2 || - ovflsize > B_MINKEY_TO_OVFLSIZE(DEFMINKEYPAGE, dbp->pgsize)) { + ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) { pip->bt_minkey = 0; isbad = 1; - EPRINT((dbp->dbenv, - "Nonsensical bt_minkey value %lu on metadata page %lu", - (u_long)meta->minkey, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: nonsensical bt_minkey value %lu on metadata page", + (u_long)pgno, (u_long)meta->minkey)); } else pip->bt_minkey = meta->minkey; - /* bt_maxkey: no constraints (XXX: right?) */ + /* bt_maxkey: unsupported so no constraints. */ pip->bt_maxkey = meta->maxkey; /* re_len: no constraints on this (may be zero or huge--we make rope) */ @@ -103,13 +107,13 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) * of the file, then the root page had better be page 1. */ pip->root = 0; - if (meta->root == PGNO_INVALID - || meta->root == pgno || !IS_VALID_PGNO(meta->root) || + if (meta->root == PGNO_INVALID || + meta->root == pgno || !IS_VALID_PGNO(meta->root) || (pgno == PGNO_BASE_MD && meta->root != 1)) { isbad = 1; - EPRINT((dbp->dbenv, - "Nonsensical root page %lu on metadata page %lu", - (u_long)meta->root, (u_long)vdp->last_pgno)); + EPRINT((dbenv, + "Page %lu: nonsensical root page %lu on metadata page", + (u_long)pgno, (u_long)meta->root)); } else pip->root = meta->root; @@ -124,8 +128,8 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) */ if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) { isbad = 1; - EPRINT((dbp->dbenv, - "Btree metadata page %lu has both duplicates and multiple databases", + EPRINT((dbenv, +"Page %lu: Btree metadata page has both duplicates and multiple databases", (u_long)pgno)); } F_SET(pip, VRFY_HAS_SUBDBS); @@ -138,8 +142,8 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) if (F_ISSET(&meta->dbmeta, BTM_RECNUM)) F_SET(pip, VRFY_HAS_RECNUMS); if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, 
VRFY_HAS_DUPS)) { - EPRINT((dbp->dbenv, - "Btree metadata page %lu illegally has both recnums and dups", + EPRINT((dbenv, + "Page %lu: Btree metadata page illegally has both recnums and dups", (u_long)pgno)); isbad = 1; } @@ -149,14 +153,14 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) dbp->type = DB_RECNO; } else if (F_ISSET(pip, VRFY_IS_RRECNO)) { isbad = 1; - EPRINT((dbp->dbenv, - "Metadata page %lu has renumber flag set but is not recno", + EPRINT((dbenv, + "Page %lu: metadata page has renumber flag set but is not recno", (u_long)pgno)); } if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) { - EPRINT((dbp->dbenv, - "Recno metadata page %lu specifies duplicates", + EPRINT((dbenv, + "Page %lu: recno metadata page specifies duplicates", (u_long)pgno)); isbad = 1; } @@ -169,9 +173,9 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) * database */ isbad = 1; - EPRINT((dbp->dbenv, - "re_len of %lu in non-fixed-length database", - (u_long)pip->re_len)); + EPRINT((dbenv, + "Page %lu: re_len of %lu in non-fixed-length database", + (u_long)pgno, (u_long)pip->re_len)); } /* @@ -179,7 +183,7 @@ __bam_vrfy_meta(dbp, vdp, meta, pgno, flags) * not be and may still be correct. */ -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); } @@ -200,22 +204,24 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) u_int32_t flags; { BKEYDATA *bk; + DB_ENV *dbenv; VRFY_PAGEINFO *pip; db_indx_t i; int ret, t_ret, isbad; u_int32_t re_len_guess, len; + dbenv = dbp->dbenv; isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); - if ((ret = __db_fchk(dbp->dbenv, - "__ram_vrfy_leaf", flags, OKFLAGS)) != 0) + if ((ret = __db_fchk(dbenv, "__ram_vrfy_leaf", flags, OKFLAGS)) != 0) goto err; if (TYPE(h) != P_LRECNO) { /* We should not have been called. 
*/ - TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_leaf", pgno, TYPE(h)); + TYPE_ERR_PRINT(dbenv, "__ram_vrfy_leaf", pgno, TYPE(h)); DB_ASSERT(0); ret = EINVAL; goto err; @@ -241,8 +247,8 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) goto err; if (F_ISSET(pip, VRFY_HAS_DUPS)) { - EPRINT((dbp->dbenv, - "Recno database has dups on page %lu", (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: Recno database has dups", (u_long)pgno)); ret = DB_VERIFY_BAD; goto err; } @@ -255,7 +261,7 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) */ re_len_guess = 0; for (i = 0; i < NUM_ENT(h); i++) { - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); /* KEYEMPTY. Go on. */ if (B_DISSET(bk->type)) continue; @@ -265,9 +271,9 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) len = bk->len; else { isbad = 1; - EPRINT((dbp->dbenv, - "Nonsensical type for item %lu, page %lu", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: nonsensical type for item %lu", + (u_long)pgno, (u_long)i)); continue; } if (re_len_guess == 0) @@ -288,9 +294,9 @@ __ram_vrfy_leaf(dbp, vdp, h, pgno, flags) /* Save off record count. */ pip->rec_cnt = NUM_ENT(h); -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; - return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0); + return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret); } /* @@ -308,10 +314,13 @@ __bam_vrfy(dbp, vdp, h, pgno, flags) db_pgno_t pgno; u_int32_t flags; { + DB_ENV *dbenv; VRFY_PAGEINFO *pip; int ret, t_ret, isbad; + dbenv = dbp->dbenv; isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) return (ret); @@ -322,7 +331,7 @@ __bam_vrfy(dbp, vdp, h, pgno, flags) case P_LDUP: break; default: - TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy", pgno, TYPE(h)); + TYPE_ERR_PRINT(dbenv, "__bam_vrfy", pgno, TYPE(h)); DB_ASSERT(0); ret = EINVAL; goto err; @@ -361,8 +370,8 @@ __bam_vrfy(dbp, vdp, h, pgno, flags) isbad = 1; else goto err; - EPRINT((dbp->dbenv, - "item order check on page %lu unsafe: skipping", + EPRINT((dbenv, + "Page %lu: item order check unsafe: skipping", (u_long)pgno)); } else if (!LF_ISSET(DB_NOORDERCHK) && (ret = __bam_vrfy_itemorder(dbp, vdp, h, pgno, 0, 0, 0, flags)) != 0) { @@ -377,9 +386,9 @@ __bam_vrfy(dbp, vdp, h, pgno, flags) goto err; } -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; - return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : 0); + return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret); } /* @@ -398,13 +407,16 @@ __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) db_indx_t *nentriesp; u_int32_t flags; { + DB_ENV *dbenv; RINTERNAL *ri; VRFY_CHILDINFO child; VRFY_PAGEINFO *pip; int ret, t_ret, isbad; u_int32_t himark, i, offset, nentries; + db_indx_t *inp; u_int8_t *pagelayout, *p; + dbenv = dbp->dbenv; isbad = 0; memset(&child, 0, sizeof(VRFY_CHILDINFO)); nentries = 0; @@ -414,38 +426,38 @@ __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) return (ret); if (TYPE(h) != P_IRECNO) { - TYPE_ERR_PRINT(dbp->dbenv, "__ram_vrfy_inp", pgno, TYPE(h)); + TYPE_ERR_PRINT(dbenv, "__ram_vrfy_inp", pgno, TYPE(h)); DB_ASSERT(0); ret = EINVAL; goto err; } himark = dbp->pgsize; - if ((ret = - __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pagelayout)) != 0) + if ((ret = __os_malloc(dbenv, dbp->pgsize, &pagelayout)) != 0) goto err; memset(pagelayout, 0, dbp->pgsize); + inp = P_INP(dbp, h); for (i = 0; i < NUM_ENT(h); i++) { - if ((u_int8_t *)h->inp + i >= (u_int8_t *)h + himark) { - EPRINT((dbp->dbenv, - "Page %lu entries listing %lu overlaps data", + if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) { + EPRINT((dbenv, + "Page %lu: entries listing %lu overlaps data", (u_long)pgno, (u_long)i)); ret = DB_VERIFY_BAD; goto err; } - offset = h->inp[i]; + offset = inp[i]; /* * Check that the item offset is reasonable: it points * somewhere after the inp array and before the end of the * page. */ - if (offset <= (u_int32_t)((u_int8_t *)h->inp + i - + if (offset <= (u_int32_t)((u_int8_t *)inp + i - (u_int8_t *)h) || offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) { isbad = 1; - EPRINT((dbp->dbenv, - "Bad offset %lu at page %lu index %lu", - (u_long)offset, (u_long)pgno, (u_long)i)); + EPRINT((dbenv, + "Page %lu: bad offset %lu at index %lu", + (u_long)pgno, (u_long)offset, (u_long)i)); continue; } @@ -456,7 +468,7 @@ __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) nentries++; /* Make sure this RINTERNAL is not multiply referenced. 
*/ - ri = GET_RINTERNAL(h, i); + ri = GET_RINTERNAL(dbp, h, i); if (pagelayout[offset] == 0) { pagelayout[offset] = 1; child.pgno = ri->pgno; @@ -465,9 +477,9 @@ __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) goto err; } else { - EPRINT((dbp->dbenv, - "RINTERNAL structure at offset %lu, page %lu referenced twice", - (u_long)offset, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: RINTERNAL structure at offset %lu referenced twice", + (u_long)pgno, (u_long)offset)); isbad = 1; } } @@ -476,24 +488,25 @@ __ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) p < pagelayout + dbp->pgsize; p += RINTERNAL_SIZE) if (*p != 1) { - EPRINT((dbp->dbenv, - "Gap between items at offset %lu, page %lu", - (u_long)(p - pagelayout), (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: gap between items at offset %lu", + (u_long)pgno, (u_long)(p - pagelayout))); isbad = 1; } if ((db_indx_t)himark != HOFFSET(h)) { - EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu", - (u_long)(HOFFSET(h)), (u_long)himark)); + EPRINT((dbenv, + "Page %lu: bad HOFFSET %lu, appears to be %lu", + (u_long)pgno, (u_long)(HOFFSET(h)), (u_long)himark)); isbad = 1; } *nentriesp = nentries; -err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) +err: if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; if (pagelayout != NULL) - __os_free(pagelayout, dbp->pgsize); + __os_free(dbenv, pagelayout); return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret); } @@ -513,6 +526,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) { BKEYDATA *bk; BOVERFLOW *bo; + DB_ENV *dbenv; VRFY_CHILDINFO child; VRFY_PAGEINFO *pip; int isbad, initem, isdupitem, ret, t_ret; @@ -520,6 +534,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) u_int32_t i, endoff, nentries; u_int8_t *pagelayout; + dbenv = dbp->dbenv; isbad = isdupitem = 0; nentries = 0; memset(&child, 0, sizeof(VRFY_CHILDINFO)); @@ -540,7 +555,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) */ if (LF_ISSET(DB_SALVAGE)) break; - TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_inp", pgno, TYPE(h)); + TYPE_ERR_PRINT(dbenv, "__bam_vrfy_inp", pgno, TYPE(h)); DB_ASSERT(0); ret = EINVAL; goto err; @@ -558,22 +573,24 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * it and the region immediately after it. */ himark = dbp->pgsize; - if ((ret = __os_malloc(dbp->dbenv, - dbp->pgsize, NULL, &pagelayout)) != 0) + if ((ret = __os_malloc(dbenv, dbp->pgsize, &pagelayout)) != 0) goto err; memset(pagelayout, 0, dbp->pgsize); for (i = 0; i < NUM_ENT(h); i++) { - - ret = __db_vrfy_inpitem(dbp, - h, pgno, i, 1, flags, &himark, &offset); - if (ret == DB_VERIFY_BAD) { + switch (ret = __db_vrfy_inpitem(dbp, + h, pgno, i, 1, flags, &himark, &offset)) { + case 0: + break; + case DB_VERIFY_BAD: isbad = 1; continue; - } else if (ret == DB_VERIFY_FATAL) { + case DB_VERIFY_FATAL: isbad = 1; goto err; - } else if (ret != 0) - DB_ASSERT(0); + default: + DB_ASSERT(ret != 0); + break; + } /* * We now have a plausible beginning for the item, and we know @@ -582,7 +599,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * Mark the beginning and end in pagelayout so we can make sure * items have no overlaps or gaps. 
*/ - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); #define ITEM_BEGIN 1 #define ITEM_END 2 if (pagelayout[offset] == 0) @@ -608,9 +625,8 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) isdupitem = 1; } else { isbad = 1; - EPRINT((dbp->dbenv, - "Duplicated item %lu on page %lu", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, "Page %lu: duplicated item %lu", + (u_long)pgno, (u_long)i)); } } @@ -621,7 +637,7 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * If the end already has a sign other than 0, do nothing-- * it's an overlap that we'll catch later. */ - switch(B_TYPE(bk->type)) { + switch (B_TYPE(bk->type)) { case B_KEYDATA: if (TYPE(h) == P_IBTREE) /* It's a BINTERNAL. */ @@ -661,9 +677,8 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * the end had better coincide too. */ if (isdupitem && pagelayout[endoff] != ITEM_END) { - EPRINT((dbp->dbenv, - "Duplicated item %lu on page %lu", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, "Page %lu: duplicated item %lu", + (u_long)pgno, (u_long)i)); isbad = 1; } else if (pagelayout[endoff] == 0) pagelayout[endoff] = ITEM_END; @@ -675,9 +690,8 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) */ if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) { isbad = 1; - EPRINT((dbp->dbenv, - "Item %lu on page %lu marked deleted", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, "Page %lu: item %lu marked deleted", + (u_long)pgno, (u_long)i)); } /* @@ -695,14 +709,14 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) case B_DUPLICATE: if (TYPE(h) == P_IBTREE) { isbad = 1; - EPRINT((dbp->dbenv, - "Duplicate page referenced by internal btree page %lu at item %lu", + EPRINT((dbenv, + "Page %lu: duplicate page referenced by internal btree page at item %lu", (u_long)pgno, (u_long)i)); break; } else if (TYPE(h) == P_LRECNO) { isbad = 1; - EPRINT((dbp->dbenv, - "Duplicate page referenced by recno page %lu at item %lu", + EPRINT((dbenv, + "Page %lu: duplicate page referenced by recno page at item %lu", 
(u_long)pgno, (u_long)i)); break; } @@ -716,10 +730,10 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) /* Make sure tlen is reasonable. */ if (bo->tlen > dbp->pgsize * vdp->last_pgno) { isbad = 1; - EPRINT((dbp->dbenv, - "Impossible tlen %lu, item %lu, page %lu", - (u_long)bo->tlen, (u_long)i, - (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: impossible tlen %lu, item %lu", + (u_long)pgno, + (u_long)bo->tlen, (u_long)i)); /* Don't save as a child. */ break; } @@ -727,9 +741,9 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno || bo->pgno == PGNO_INVALID) { isbad = 1; - EPRINT((dbp->dbenv, - "Offpage item %lu, page %lu has bad pgno", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: offpage item %lu has bad pgno %lu", + (u_long)pgno, (u_long)i, (u_long)bo->pgno)); /* Don't save as a child. */ break; } @@ -743,9 +757,8 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) break; default: isbad = 1; - EPRINT((dbp->dbenv, - "Item %lu on page %lu of invalid type %lu", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, "Page %lu: item %lu of invalid type %lu", + (u_long)pgno, (u_long)i, (u_long)B_TYPE(bk->type))); break; } } @@ -764,8 +777,8 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) continue; isbad = 1; - EPRINT((dbp->dbenv, - "Gap between items, page %lu offset %lu", + EPRINT((dbenv, + "Page %lu: gap between items at offset %lu", (u_long)pgno, (u_long)i)); /* Find the end of the gap */ for ( ; pagelayout[i + 1] == 0 && @@ -776,9 +789,9 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) /* We've found an item. Check its alignment. */ if (i != ALIGN(i, sizeof(u_int32_t))) { isbad = 1; - EPRINT((dbp->dbenv, - "Offset %lu page %lu unaligned", - (u_long)i, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: offset %lu unaligned", + (u_long)pgno, (u_long)i)); } initem = 1; nentries++; @@ -790,8 +803,8 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * be an overlap. 
*/ isbad = 1; - EPRINT((dbp->dbenv, - "Overlapping items, page %lu offset %lu", + EPRINT((dbenv, + "Page %lu: overlapping items at offset %lu", (u_long)pgno, (u_long)i)); break; default: @@ -815,25 +828,25 @@ __bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) * end. Overlap. */ isbad = 1; - EPRINT((dbp->dbenv, - "Overlapping items, page %lu offset %lu", + EPRINT((dbenv, + "Page %lu: overlapping items at offset %lu", (u_long)pgno, (u_long)i)); break; } - (void)__os_free(pagelayout, dbp->pgsize); + __os_free(dbenv, pagelayout); /* Verify HOFFSET. */ if ((db_indx_t)himark != HOFFSET(h)) { - EPRINT((dbp->dbenv, "Bad HOFFSET %lu, appears to be %lu", - (u_long)HOFFSET(h), (u_long)himark)); + EPRINT((dbenv, "Page %lu: bad HOFFSET %lu, appears to be %lu", + (u_long)pgno, (u_long)HOFFSET(h), (u_long)himark)); isbad = 1; } err: if (nentriesp != NULL) *nentriesp = nentries; - if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); @@ -865,14 +878,15 @@ __bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) int ovflok, hasdups; u_int32_t flags; { - DBT dbta, dbtb, dup1, dup2, *p1, *p2, *tmp; - BTREE *bt; BINTERNAL *bi; BKEYDATA *bk; BOVERFLOW *bo; + BTREE *bt; + DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp; + DB_ENV *dbenv; VRFY_PAGEINFO *pip; db_indx_t i; - int cmp, freedup1, freedup2, isbad, ret, t_ret; + int cmp, freedup_1, freedup_2, isbad, ret, t_ret; int (*dupfunc) __P((DB *, const DBT *, const DBT *)); int (*func) __P((DB *, const DBT *, const DBT *)); void *buf1, *buf2, *tmpbuf; @@ -889,6 +903,7 @@ __bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) } else pip = NULL; + dbenv = dbp->dbenv; ret = isbad = 0; bo = NULL; /* Shut up compiler. 
*/ @@ -949,7 +964,7 @@ __bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) */ switch (TYPE(h)) { case P_IBTREE: - bi = GET_BINTERNAL(h, i); + bi = GET_BINTERNAL(dbp, h, i); if (B_TYPE(bi->type) == B_OVERFLOW) { bo = (BOVERFLOW *)(bi->data); goto overflow; @@ -971,15 +986,15 @@ __bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) #if 0 if (i == 0 && bi->len != 0) { isbad = 1; - EPRINT((dbp->dbenv, - "Lowest key on internal page %lu of nonzero length", + EPRINT((dbenv, + "Page %lu: lowest key on internal page of nonzero length", (u_long)pgno)); } #endif break; case P_LBTREE: case P_LDUP: - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); if (B_TYPE(bk->type) == B_OVERFLOW) { bo = (BOVERFLOW *)bk; goto overflow; @@ -993,7 +1008,7 @@ __bam_vrfy_itemorder(dbp, vdp, h, pgno, nentries, ovflok, hasdups, flags) * This means our caller screwed up and sent us * an inappropriate page. */ - TYPE_ERR_PRINT(dbp->dbenv, + TYPE_ERR_PRINT(dbenv, "__bam_vrfy_itemorder", pgno, TYPE(h)) DB_ASSERT(0); ret = EINVAL; @@ -1029,9 +1044,9 @@ overflow: if (!ovflok) { if ((ret = __db_goff(dbp, p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) { isbad = 1; - EPRINT((dbp->dbenv, - "Error %lu in fetching overflow item %lu, page %lu", - (u_long)ret, (u_long)i, (u_long)pgno)); + EPRINT((dbenv, + "Page %lu: error %lu in fetching overflow item %lu", + (u_long)pgno, (u_long)ret, (u_long)i)); } /* In case it got realloc'ed and thus changed. 
*/ buf2 = p2->data; @@ -1044,8 +1059,8 @@ overflow: if (!ovflok) { /* comparison succeeded */ if (cmp > 0) { isbad = 1; - EPRINT((dbp->dbenv, - "Out-of-order key, page %lu item %lu", + EPRINT((dbenv, + "Page %lu: out-of-order key at entry %lu", (u_long)pgno, (u_long)i)); /* proceed */ } else if (cmp == 0) { @@ -1059,8 +1074,8 @@ overflow: if (!ovflok) { F_SET(pip, VRFY_HAS_DUPS); else if (hasdups == 0) { isbad = 1; - EPRINT((dbp->dbenv, - "Database with no duplicates has duplicated keys on page %lu", + EPRINT((dbenv, + "Page %lu: database with no duplicates has duplicated keys", (u_long)pgno)); } @@ -1092,11 +1107,11 @@ overflow: if (!ovflok) { * dups are probably (?) rare. */ if (((ret = __bam_safe_getdata(dbp, - h, i - 1, ovflok, &dup1, - &freedup1)) != 0) || + h, i - 1, ovflok, &dup_1, + &freedup_1)) != 0) || ((ret = __bam_safe_getdata(dbp, - h, i + 1, ovflok, &dup2, - &freedup2)) != 0)) + h, i + 1, ovflok, &dup_2, + &freedup_2)) != 0)) goto err; /* @@ -1105,8 +1120,8 @@ overflow: if (!ovflok) { * it's not safe to chase them now. * Mark an incomplete and return. */ - if (dup1.data == NULL || - dup2.data == NULL) { + if (dup_1.data == NULL || + dup_2.data == NULL) { DB_ASSERT(!ovflok); F_SET(pip, VRFY_INCOMPLETE); goto err; @@ -1118,26 +1133,26 @@ overflow: if (!ovflok) { * until we do the structure check * and see whether DUPSORT is set. 
*/ - if (dupfunc(dbp, &dup1, &dup2) > 0) + if (dupfunc(dbp, &dup_1, &dup_2) > 0) F_SET(pip, VRFY_DUPS_UNSORTED); - if (freedup1) - __os_free(dup1.data, 0); - if (freedup2) - __os_free(dup2.data, 0); + if (freedup_1) + __os_ufree(dbenv, dup_1.data); + if (freedup_2) + __os_ufree(dbenv, dup_2.data); } } } } -err: if (pip != NULL && - ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) && ret == 0) +err: if (pip != NULL && ((t_ret = + __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0) && ret == 0) ret = t_ret; if (buf1 != NULL) - __os_free(buf1, 0); + __os_ufree(dbenv, buf1); if (buf2 != NULL) - __os_free(buf2, 0); + __os_ufree(dbenv, buf2); return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); } @@ -1158,11 +1173,13 @@ __bam_vrfy_structure(dbp, vdp, meta_pgno, flags) u_int32_t flags; { DB *pgset; + DB_ENV *dbenv; VRFY_PAGEINFO *mip, *rip; db_pgno_t root, p; int t_ret, ret; u_int32_t nrecs, level, relen, stflags; + dbenv = dbp->dbenv; mip = rip = 0; pgset = vdp->pgset; @@ -1172,8 +1189,8 @@ __bam_vrfy_structure(dbp, vdp, meta_pgno, flags) if ((ret = __db_vrfy_pgset_get(pgset, meta_pgno, (int *)&p)) != 0) goto err; if (p != 0) { - EPRINT((dbp->dbenv, - "Btree metadata page number %lu observed twice", + EPRINT((dbenv, + "Page %lu: btree metadata page observed twice", (u_long)meta_pgno)); ret = DB_VERIFY_BAD; goto err; @@ -1184,8 +1201,9 @@ __bam_vrfy_structure(dbp, vdp, meta_pgno, flags) root = mip->root; if (root == 0) { - EPRINT((dbp->dbenv, - "Btree metadata page %lu has no root", (u_long)meta_pgno)); + EPRINT((dbenv, + "Page %lu: btree metadata page has no root", + (u_long)meta_pgno)); ret = DB_VERIFY_BAD; goto err; } @@ -1221,8 +1239,8 @@ __bam_vrfy_structure(dbp, vdp, meta_pgno, flags) * that should never happen. 
*/ if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) { - EPRINT((dbp->dbenv, - "Recno database with meta page %lu has bad re_len %lu", + EPRINT((dbenv, + "Page %lu: recno database has bad re_len %lu", (u_long)meta_pgno, (u_long)relen)); ret = DB_VERIFY_BAD; goto err; @@ -1230,25 +1248,25 @@ __bam_vrfy_structure(dbp, vdp, meta_pgno, flags) ret = 0; break; case P_LDUP: - EPRINT((dbp->dbenv, - "Duplicate tree referenced from metadata page %lu", + EPRINT((dbenv, + "Page %lu: duplicate tree referenced from metadata page", (u_long)meta_pgno)); ret = DB_VERIFY_BAD; break; default: - EPRINT((dbp->dbenv, - "Btree root of incorrect type %lu on meta page %lu", - (u_long)rip->type, (u_long)meta_pgno)); + EPRINT((dbenv, + "Page %lu: btree root of incorrect type %lu on metadata page", + (u_long)meta_pgno, (u_long)rip->type)); ret = DB_VERIFY_BAD; break; } -err: if (mip != NULL && - ((t_ret = __db_vrfy_putpageinfo(vdp, mip)) != 0) && ret == 0) - t_ret = ret; - if (rip != NULL && - ((t_ret = __db_vrfy_putpageinfo(vdp, rip)) != 0) && ret == 0) - t_ret = ret; +err: if (mip != NULL && ((t_ret = + __db_vrfy_putpageinfo(dbenv, vdp, mip)) != 0) && ret == 0) + ret = t_ret; + if (rip != NULL && ((t_ret = + __db_vrfy_putpageinfo(dbenv, vdp, rip)) != 0) && ret == 0) + ret = t_ret; return (ret); } @@ -1263,8 +1281,7 @@ err: if (mip != NULL && * PUBLIC: void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *)); */ int -__bam_vrfy_subtree(dbp, - vdp, pgno, l, r, flags, levelp, nrecsp, relenp) +__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp) DB *dbp; VRFY_DBINFO *vdp; db_pgno_t pgno; @@ -1274,19 +1291,27 @@ __bam_vrfy_subtree(dbp, BINTERNAL *li, *ri, *lp, *rp; DB *pgset; DBC *cc; + DB_ENV *dbenv; + DB_MPOOLFILE *mpf; PAGE *h; VRFY_CHILDINFO *child; VRFY_PAGEINFO *pip; - db_recno_t nrecs, child_nrecs; db_indx_t i; - int ret, t_ret, isbad, toplevel, p; + db_pgno_t next_pgno, prev_pgno; + db_recno_t child_nrecs, nrecs; + u_int32_t child_level, child_relen, j, 
level, relen, stflags; + u_int8_t leaf_type; int (*func) __P((DB *, const DBT *, const DBT *)); - u_int32_t level, child_level, stflags, child_relen, relen; + int isbad, p, ret, t_ret, toplevel; + dbenv = dbp->dbenv; + mpf = dbp->mpf; ret = isbad = 0; nrecs = 0; h = NULL; relen = 0; + leaf_type = P_INVALID; + next_pgno = prev_pgno = PGNO_INVALID; rp = (BINTERNAL *)r; lp = (BINTERNAL *)l; @@ -1300,9 +1325,32 @@ __bam_vrfy_subtree(dbp, cc = NULL; level = pip->bt_level; - toplevel = LF_ISSET(ST_TOPLEVEL); + toplevel = LF_ISSET(ST_TOPLEVEL) ? 1 : 0; LF_CLR(ST_TOPLEVEL); + /* + * If this is the root, initialize the vdp's prev- and next-pgno + * accounting. + * + * For each leaf page we hit, we'll want to make sure that + * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is + * our page number. Then, we'll set vdp->next_pgno to pip->next_pgno + * and vdp->prev_pgno to our page number, and the next leaf page in + * line should be able to do the same verification. + */ + if (toplevel) { + /* + * Cache the values stored in the vdp so that if we're an + * auxiliary tree such as an off-page duplicate set, our + * caller's leaf page chain doesn't get lost. + */ + prev_pgno = vdp->prev_pgno; + next_pgno = vdp->next_pgno; + leaf_type = vdp->leaf_type; + vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID; + vdp->leaf_type = P_INVALID; + } + /* * We are recursively descending a btree, starting from the root * and working our way out to the leaves. @@ -1333,8 +1381,63 @@ __bam_vrfy_subtree(dbp, case P_LDUP: case P_LBTREE: /* - * Cases 1, 2 and 3 (overflow pages are common to all three); - * traverse child list, looking for overflows. + * Cases 1, 2 and 3. + * + * We're some sort of leaf page; verify + * that our linked list of leaves is consistent. + */ + if (vdp->leaf_type == P_INVALID) { + /* + * First leaf page. Set the type that all its + * successors should be, and verify that our prev_pgno + * is PGNO_INVALID. 
+ */ + vdp->leaf_type = pip->type; + if (pip->prev_pgno != PGNO_INVALID) + goto bad_prev; + } else { + /* + * Successor leaf page. Check our type, the previous + * page's next_pgno, and our prev_pgno. + */ + if (pip->type != vdp->leaf_type) { + EPRINT((dbenv, + "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)", + (u_long)pip->pgno, (u_long)pip->type, + (u_long)vdp->leaf_type)); + isbad = 1; + } + + /* + * Don't do the prev/next_pgno checks if we've lost + * leaf pages due to another corruption. + */ + if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) { + if (pip->pgno != vdp->next_pgno) { + EPRINT((dbenv, + "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)", + (u_long)vdp->prev_pgno, + (u_long)vdp->next_pgno, + (u_long)pip->pgno)); + isbad = 1; + } + if (pip->prev_pgno != vdp->prev_pgno) { +bad_prev: EPRINT((dbenv, + "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)", + (u_long)pip->pgno, + (u_long)pip->prev_pgno, + (u_long)vdp->prev_pgno)); + isbad = 1; + } + } + } + vdp->prev_pgno = pip->pgno; + vdp->next_pgno = pip->next_pgno; + F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN); + + /* + * Overflow pages are common to all three leaf types; + * traverse the child list, looking for overflows. */ if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) goto err; @@ -1359,8 +1462,8 @@ __bam_vrfy_subtree(dbp, if (!LF_ISSET(ST_IS_RECNO) && !(LF_ISSET(ST_DUPOK) && !LF_ISSET(ST_DUPSORT))) { isbad = 1; - EPRINT((dbp->dbenv, - "Recno leaf page %lu in non-recno tree", + EPRINT((dbenv, + "Page %lu: recno leaf page non-recno tree", (u_long)pgno)); goto done; } @@ -1371,8 +1474,8 @@ __bam_vrfy_subtree(dbp, * subtree. */ isbad = 1; - EPRINT((dbp->dbenv, - "Non-recno leaf page %lu in recno tree", + EPRINT((dbenv, + "Page %lu: non-recno leaf page in recno tree", (u_long)pgno)); goto done; } @@ -1388,8 +1491,8 @@ __bam_vrfy_subtree(dbp, /* If dups aren't allowed in this btree, trouble. 
*/ if (!LF_ISSET(ST_DUPOK)) { isbad = 1; - EPRINT((dbp->dbenv, - "Duplicates on page %lu in non-dup btree", + EPRINT((dbenv, + "Page %lu: duplicates in non-dup btree", (u_long)pgno)); } else { /* @@ -1414,8 +1517,8 @@ __bam_vrfy_subtree(dbp, } if ((ret = __bam_vrfy_subtree( dbp, vdp, child->pgno, NULL, - NULL, stflags, NULL, NULL, - NULL)) != 0) { + NULL, stflags | ST_TOPLEVEL, + NULL, NULL, NULL)) != 0) { if (ret != DB_VERIFY_BAD) goto err; @@ -1435,15 +1538,14 @@ __bam_vrfy_subtree(dbp, */ if (F_ISSET(pip, VRFY_DUPS_UNSORTED) && LF_ISSET(ST_DUPSORT)) { - EPRINT((dbp->dbenv, - "Unsorted duplicate set at page %lu in sorted-dup database", + EPRINT((dbenv, + "Page %lu: unsorted duplicate set in sorted-dup database", (u_long)pgno)); isbad = 1; } } } goto leaf; - break; case P_IBTREE: case P_IRECNO: /* We handle these below. */ @@ -1455,10 +1557,27 @@ __bam_vrfy_subtree(dbp, * Note that the code at the "done" label assumes that the * current page is a btree/recno one of some sort; this * is not the case here, so we goto err. + * + * If the page is entirely zeroed, its pip->type will be a lie + * (we assumed it was a hash page, as they're allowed to be + * zeroed); handle this case specially. */ - EPRINT((dbp->dbenv, - "Page %lu is of inappropriate type %lu", - (u_long)pgno, (u_long)pip->type)); + if (F_ISSET(pip, VRFY_IS_ALLZEROES)) + ZEROPG_ERR_PRINT(dbenv, pgno, "btree or recno page"); + else + EPRINT((dbenv, + "Page %lu: btree or recno page is of inappropriate type %lu", + (u_long)pgno, (u_long)pip->type)); + + /* + * We probably lost a leaf page (or more if this was an + * internal page) from our prev/next_pgno chain. Flag + * that this is expected; we don't want or need to + * spew error messages about erroneous prev/next_pgnos, + * since that's probably not the real problem. 
+ */ + F_SET(vdp, VRFY_LEAFCHAIN_BROKEN); + ret = DB_VERIFY_BAD; goto err; } @@ -1474,7 +1593,7 @@ __bam_vrfy_subtree(dbp, ret = __db_vrfy_ccnext(cc, &child)) if (child->type == V_RECNO) { if (pip->type != P_IRECNO) { - TYPE_ERR_PRINT(dbp->dbenv, "__bam_vrfy_subtree", + TYPE_ERR_PRINT(dbenv, "__bam_vrfy_subtree", pgno, pip->type); DB_ASSERT(0); ret = EINVAL; @@ -1499,30 +1618,64 @@ __bam_vrfy_subtree(dbp, else if (child_relen > 0 && relen != child_relen) { isbad = 1; - EPRINT((dbp->dbenv, - "Recno page %lu returned bad re_len", - (u_long)child->pgno)); + EPRINT((dbenv, + "Page %lu: recno page returned bad re_len %lu", + (u_long)child->pgno, + (u_long)child_relen)); } if (relenp) *relenp = relen; } if (LF_ISSET(ST_RECNUM)) nrecs += child_nrecs; - if (level != child_level + 1) { + if (isbad == 0 && level != child_level + 1) { isbad = 1; - EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu", - "Recno level incorrect on page ", - (u_long)child->pgno, ": got ", - (u_long)child_level, ", expected ", + EPRINT((dbenv, + "Page %lu: recno level incorrect: got %lu, expected %lu", + (u_long)child->pgno, (u_long)child_level, (u_long)(level - 1))); } - } else if (child->type == V_OVERFLOW && - (ret = __db_vrfy_ovfl_structure(dbp, vdp, - child->pgno, child->tlen, flags)) != 0) { - if (ret == DB_VERIFY_BAD) + } else if (child->type == V_OVERFLOW) { + /* + * It is possible for one internal page to reference + * a single overflow page twice, if all the items + * in the subtree referenced by slot 0 are deleted, + * then a similar number of items are put back + * before the key that formerly had been in slot 1. + * + * (Btree doesn't look at the key in slot 0, so the + * fact that the key formerly at slot 1 is the "wrong" + * parent of the stuff in the slot 0 subtree isn't + * really incorrect.) + * + * __db_vrfy_ovfl_structure is designed to be + * efficiently called multiple times for multiple + * references; call it here as many times as is + * appropriate. 
+ */ + + /* Otherwise, __db_vrfy_childput would be broken. */ + DB_ASSERT(child->refcnt >= 1); + + /* + * An overflow referenced more than twice here + * shouldn't happen. + */ + if (child->refcnt > 2) { + EPRINT((dbenv, + "Page %lu: overflow page %lu referenced more than twice from internal page", + (u_long)pgno, (u_long)child->pgno)); isbad = 1; - else - goto done; + } else + for (j = 0; j < child->refcnt; j++) + if ((ret = __db_vrfy_ovfl_structure(dbp, + vdp, child->pgno, child->tlen, + flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } } if ((ret = __db_vrfy_ccclose(cc)) != 0) @@ -1543,12 +1696,12 @@ __bam_vrfy_subtree(dbp, * itself, which must sort lower than all entries on its child; * ri will be the key to its right, which must sort greater. */ - if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if (h == NULL && (ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) goto err; for (i = 0; i < pip->entries; i += O_INDX) { - li = GET_BINTERNAL(h, i); + li = GET_BINTERNAL(dbp, h, i); ri = (i + O_INDX < pip->entries) ? 
- GET_BINTERNAL(h, i + O_INDX) : NULL; + GET_BINTERNAL(dbp, h, i + O_INDX) : NULL; /* * The leftmost key is forcibly sorted less than all entries, @@ -1577,19 +1730,19 @@ __bam_vrfy_subtree(dbp, */ if (li->nrecs != child_nrecs) { isbad = 1; - EPRINT((dbp->dbenv, - "Item %lu page %lu has incorrect record count of %lu, should be %lu", - (u_long)i, (u_long)pgno, (u_long)li->nrecs, + EPRINT((dbenv, + "Page %lu: item %lu has incorrect record count of %lu, should be %lu", + (u_long)pgno, (u_long)i, (u_long)li->nrecs, (u_long)child_nrecs)); } } if (level != child_level + 1) { isbad = 1; - EPRINT((dbp->dbenv, "%s%lu%s%lu%s%lu", - "Btree level incorrect on page ", (u_long)li->pgno, - ": got ", (u_long)child_level, ", expected ", - (u_long)(level - 1))); + EPRINT((dbenv, + "Page %lu: Btree level incorrect: got %lu, expected %lu", + (u_long)li->pgno, + (u_long)child_level, (u_long)(level - 1))); } } @@ -1616,7 +1769,7 @@ done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { * isbad == 0, though, it's now safe to do so, as we've * traversed any child overflow pages. Do it. */ - if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if (h == NULL && (ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) goto err; if ((ret = __bam_vrfy_itemorder(dbp, vdp, h, pgno, 0, 1, 0, flags)) != 0) @@ -1624,13 +1777,36 @@ done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { F_CLR(pip, VRFY_INCOMPLETE); } + /* + * It's possible to get to this point with a page that has no + * items, but without having detected any sort of failure yet. + * Having zero items is legal if it's a leaf--it may be the + * root page in an empty tree, or the tree may have been + * modified with the DB_REVSPLITOFF flag set (there's no way + * to tell from what's on disk). For an internal page, + * though, having no items is a problem (all internal pages + * must have children). 
+ */ + if (isbad == 0 && ret == 0) { + if (h == NULL && (ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) + goto err; + + if (NUM_ENT(h) == 0 && ISINTERNAL(h)) { + EPRINT((dbenv, + "Page %lu: internal page is empty and should not be", + (u_long)pgno)); + isbad = 1; + goto err; + } + } + /* * Our parent has sent us BINTERNAL pointers to parent records * so that we can verify our place with respect to them. If it's * appropriate--we have a default sort function--verify this. */ if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && lp != NULL) { - if (h == NULL && (ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + if (h == NULL && (ret = __memp_fget(mpf, &pgno, 0, &h)) != 0) goto err; /* @@ -1661,8 +1837,8 @@ done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { */ if (LF_ISSET(ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) { isbad = 1; - EPRINT((dbp->dbenv, - "Bad record count on page %lu: got %lu, expected %lu", + EPRINT((dbenv, + "Page %lu: bad record count: has %lu records, claims %lu", (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt)); } @@ -1676,13 +1852,31 @@ done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { goto err; if (p != 0) { isbad = 1; - EPRINT((dbp->dbenv, "Page %lu linked twice", (u_long)pgno)); + EPRINT((dbenv, "Page %lu: linked twice", (u_long)pgno)); } else if ((ret = __db_vrfy_pgset_inc(pgset, pgno)) != 0) goto err; -err: if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0 && ret == 0) + if (toplevel) + /* + * The last page's next_pgno in the leaf chain should have been + * PGNO_INVALID. + */ + if (vdp->next_pgno != PGNO_INVALID) { + EPRINT((dbenv, "Page %lu: unterminated leaf chain", + (u_long)vdp->prev_pgno)); + isbad = 1; + } + +err: if (toplevel) { + /* Restore our caller's settings. 
*/ + vdp->next_pgno = next_pgno; + vdp->prev_pgno = prev_pgno; + vdp->leaf_type = leaf_type; + } + + if (h != NULL && (t_ret = __memp_fput(mpf, h, 0)) != 0 && ret == 0) ret = t_ret; - if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + if ((t_ret = __db_vrfy_putpageinfo(dbenv, vdp, pip)) != 0 && ret == 0) ret = t_ret; if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0) ret = t_ret; @@ -1712,14 +1906,24 @@ __bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags) u_int32_t flags; { BOVERFLOW *bo; + DB_ENV *dbenv; DBT dbt; db_indx_t last; int ret, cmp; + dbenv = dbp->dbenv; memset(&dbt, 0, sizeof(DBT)); F_SET(&dbt, DB_DBT_MALLOC); ret = 0; + /* + * Empty pages are sorted correctly by definition. We check + * to see whether they ought to be empty elsewhere; leaf + * pages legally may be. + */ + if (NUM_ENT(h) == 0) + return (0); + switch (TYPE(h)) { case P_IBTREE: case P_LDUP: @@ -1729,8 +1933,7 @@ __bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags) last = NUM_ENT(h) - P_INDX; break; default: - TYPE_ERR_PRINT(dbp->dbenv, - "__bam_vrfy_treeorder", pgno, TYPE(h)); + TYPE_ERR_PRINT(dbenv, "__bam_vrfy_treeorder", pgno, TYPE(h)); DB_ASSERT(0); return (EINVAL); } @@ -1759,26 +1962,27 @@ __bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags) return (ret); } else { DB_ASSERT(0); - EPRINT((dbp->dbenv, - "Unknown type for internal record")); + EPRINT((dbenv, + "Page %lu: unknown type for internal record", + (u_long)PGNO(h))); return (EINVAL); } /* On error, fall through, free if neeeded, and return. 
*/ if ((ret = __bam_cmp(dbp, &dbt, h, 0, func, &cmp)) == 0) { if (cmp > 0) { - EPRINT((dbp->dbenv, - "First item on page %lu sorted greater than parent entry", + EPRINT((dbenv, + "Page %lu: first item on page sorted greater than parent entry", (u_long)PGNO(h))); ret = DB_VERIFY_BAD; } } else - EPRINT((dbp->dbenv, - "First item on page %lu had comparison error", + EPRINT((dbenv, + "Page %lu: first item on page had comparison error", (u_long)PGNO(h))); if (dbt.data != lp->data) - __os_free(dbt.data, 0); + __os_ufree(dbenv, dbt.data); if (ret != 0) return (ret); } @@ -1794,26 +1998,27 @@ __bam_vrfy_treeorder(dbp, pgno, h, lp, rp, func, flags) return (ret); } else { DB_ASSERT(0); - EPRINT((dbp->dbenv, - "Unknown type for internal record")); + EPRINT((dbenv, + "Page %lu: unknown type for internal record", + (u_long)PGNO(h))); return (EINVAL); } /* On error, fall through, free if neeeded, and return. */ if ((ret = __bam_cmp(dbp, &dbt, h, last, func, &cmp)) == 0) { if (cmp < 0) { - EPRINT((dbp->dbenv, - "Last item on page %lu sorted greater than parent entry", + EPRINT((dbenv, + "Page %lu: last item on page sorted greater than parent entry", (u_long)PGNO(h))); ret = DB_VERIFY_BAD; } } else - EPRINT((dbp->dbenv, - "Last item on page %lu had comparison error", + EPRINT((dbenv, + "Page %lu: last item on page had comparison error", (u_long)PGNO(h))); if (dbt.data != rp->data) - __os_free(dbt.data, 0); + __os_ufree(dbenv, dbt.data); } return (ret); @@ -1841,37 +2046,41 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) u_int32_t flags; { DBT dbt, unkdbt; + DB_ENV *dbenv; BKEYDATA *bk; BOVERFLOW *bo; - db_indx_t i, beg, end; + db_indx_t i, beg, end, *inp; u_int32_t himark; u_int8_t *pgmap; void *ovflbuf; int t_ret, ret, err_ret; + dbenv = dbp->dbenv; + /* Shut up lint. 
*/ COMPQUIET(end, 0); ovflbuf = pgmap = NULL; err_ret = ret = 0; + inp = P_INP(dbp, h); memset(&dbt, 0, sizeof(DBT)); dbt.flags = DB_DBT_REALLOC; memset(&unkdbt, 0, sizeof(DBT)); - unkdbt.size = strlen("UNKNOWN") + 1; + unkdbt.size = (u_int32_t)(strlen("UNKNOWN") + 1); unkdbt.data = "UNKNOWN"; /* * Allocate a buffer for overflow items. Start at one page; * __db_safe_goff will realloc as needed. */ - if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &ovflbuf)) != 0) + if ((ret = __os_malloc(dbenv, dbp->pgsize, &ovflbuf)) != 0) return (ret); if (LF_ISSET(DB_AGGRESSIVE)) { if ((ret = - __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &pgmap)) != 0) + __os_malloc(dbenv, dbp->pgsize, &pgmap)) != 0) goto err; memset(pgmap, 0, dbp->pgsize); } @@ -1914,7 +2123,7 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) * We only want to print deleted items if * DB_AGGRESSIVE is set. */ - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type)) continue; @@ -1927,10 +2136,10 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) if (key != NULL && (i != 0 || !LF_ISSET(SA_SKIPFIRSTKEY))) if ((ret = __db_prdbt(key, - 0, " ", handle, callback, 0, NULL)) != 0) + 0, " ", handle, callback, 0, vdp)) != 0) err_ret = ret; - beg = h->inp[i]; + beg = inp[i]; switch (B_TYPE(bk->type)) { case B_DUPLICATE: end = beg + BOVERFLOW_SIZE - 1; @@ -1958,23 +2167,24 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) (i % P_INDX == 0)) { /* Not much to do on failure. 
*/ if ((ret = __db_prdbt(&unkdbt, 0, " ", - handle, callback, 0, NULL)) != 0) + handle, callback, 0, vdp)) != 0) err_ret = ret; break; } if ((ret = __db_salvage_duptree(dbp, vdp, bo->pgno, &dbt, handle, callback, - flags | SA_SKIPFIRSTKEY)) != 0) + flags | SA_SKIPFIRSTKEY)) != 0) err_ret = ret; break; case B_KEYDATA: - end = ALIGN(beg + bk->len, sizeof(u_int32_t)) - 1; + end = + ALIGN(beg + bk->len, sizeof(u_int32_t)) - 1; dbt.data = bk->data; dbt.size = bk->len; if ((ret = __db_prdbt(&dbt, - 0, " ", handle, callback, 0, NULL)) != 0) + 0, " ", handle, callback, 0, vdp)) != 0) err_ret = ret; break; case B_OVERFLOW: @@ -1985,11 +2195,11 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) err_ret = ret; /* We care about err_ret more. */ (void)__db_prdbt(&unkdbt, 0, " ", - handle, callback, 0, NULL); + handle, callback, 0, vdp); break; } if ((ret = __db_prdbt(&dbt, - 0, " ", handle, callback, 0, NULL)) != 0) + 0, " ", handle, callback, 0, vdp)) != 0) err_ret = ret; break; default: @@ -2020,12 +2230,12 @@ __bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) * a datum; fix this imbalance by printing an "UNKNOWN". */ if (pgtype == P_LBTREE && (i % P_INDX == 1) && ((ret = - __db_prdbt(&unkdbt, 0, " ", handle, callback, 0, NULL)) != 0)) + __db_prdbt(&unkdbt, 0, " ", handle, callback, 0, vdp)) != 0)) err_ret = ret; err: if (pgmap != NULL) - __os_free(pgmap, 0); - __os_free(ovflbuf, 0); + __os_free(dbenv, pgmap); + __os_free(dbenv, ovflbuf); /* Mark this page as done. 
*/ if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0) @@ -2061,12 +2271,13 @@ __bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags) for (i = 0; i < NUM_ENT(h); i++) { switch (TYPE(h)) { case P_IBTREE: - bi = GET_BINTERNAL(h, i); + bi = GET_BINTERNAL(dbp, h, i); if ((t_ret = __db_salvage_duptree(dbp, vdp, bi->pgno, key, handle, callback, flags)) != 0) ret = t_ret; + break; case P_IRECNO: - ri = GET_RINTERNAL(h, i); + ri = GET_RINTERNAL(dbp, h, i); if ((t_ret = __db_salvage_duptree(dbp, vdp, ri->pgno, key, handle, callback, flags)) != 0) ret = t_ret; @@ -2110,11 +2321,13 @@ __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) DB *pgset; { BINTERNAL *bi; + DB_MPOOLFILE *mpf; PAGE *h; RINTERNAL *ri; db_pgno_t current, p; int err_ret, ret; + mpf = dbp->mpf; h = NULL; ret = err_ret = 0; DB_ASSERT(pgset != NULL); @@ -2123,7 +2336,7 @@ __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) err_ret = DB_VERIFY_BAD; goto err; } - if ((ret = memp_fget(dbp->mpf, ¤t, 0, &h)) != 0) { + if ((ret = __memp_fget(mpf, ¤t, 0, &h)) != 0) { err_ret = ret; goto err; } @@ -2137,10 +2350,10 @@ __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) goto err; } if (TYPE(h) == P_IBTREE) { - bi = GET_BINTERNAL(h, 0); + bi = GET_BINTERNAL(dbp, h, 0); current = bi->pgno; } else { /* P_IRECNO */ - ri = GET_RINTERNAL(h, 0); + ri = GET_RINTERNAL(dbp, h, 0); current = ri->pgno; } break; @@ -2152,7 +2365,7 @@ __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) goto err; } - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = __memp_fput(mpf, h, 0)) != 0) err_ret = ret; h = NULL; } @@ -2164,7 +2377,7 @@ __bam_meta2pgset(dbp, vdp, btmeta, flags, pgset) traverse: while (IS_VALID_PGNO(current) && current != PGNO_INVALID) { if (h == NULL && - (ret = memp_fget(dbp->mpf, ¤t, 0, &h) != 0)) { + (ret = __memp_fget(mpf, ¤t, 0, &h)) != 0) { err_ret = ret; break; } @@ -2184,13 +2397,13 @@ traverse: goto err; current = NEXT_PGNO(h); - if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + if ((ret = __memp_fput(mpf, h, 
0)) != 0) err_ret = ret; h = NULL; } err: if (h != NULL) - (void)memp_fput(dbp->mpf, h, 0); + (void)__memp_fput(mpf, h, 0); return (ret == 0 ? err_ret : ret); } @@ -2218,7 +2431,7 @@ __bam_safe_getdata(dbp, h, i, ovflok, dbt, freedbtp) memset(dbt, 0, sizeof(DBT)); *freedbtp = 0; - bk = GET_BKEYDATA(h, i); + bk = GET_BKEYDATA(dbp, h, i); if (B_TYPE(bk->type) == B_OVERFLOW) { if (!ovflok) return (0); diff --git a/db/btree/btree.src b/db/btree/btree.src index a1eba7d7f..85faff67f 100644 --- a/db/btree/btree.src +++ b/db/btree/btree.src @@ -1,13 +1,14 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2003 * Sleepycat Software. All rights reserved. * - * $Id: btree.src,v 10.26 2000/12/12 17:40:23 bostic Exp $ + * $Id: btree.src,v 10.39 2003/11/14 05:32:34 ubell Exp $ */ -PREFIX bam +PREFIX __bam +DBPRIVATE INCLUDE #include "db_config.h" INCLUDE @@ -15,93 +16,22 @@ INCLUDE #ifndef NO_SYSTEM_INCLUDES INCLUDE #include INCLUDE INCLUDE #include -INCLUDE #include INCLUDE #include INCLUDE #endif INCLUDE INCLUDE #include "db_int.h" -INCLUDE #include "db_page.h" -INCLUDE #include "db_dispatch.h" -INCLUDE #include "db_am.h" -INCLUDE #include "btree.h" -INCLUDE #include "txn.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_dispatch.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/btree.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/txn.h" INCLUDE /* - * BTREE-pg_alloc: used to record allocating a new page. - * - * meta_lsn: the meta-data page's original lsn. - * page_lsn: the allocated page's original lsn. - * pgno: the page allocated. - * next: the next page on the free list. 
- */ -BEGIN pg_alloc 51 -ARG fileid int32_t ld -POINTER meta_lsn DB_LSN * lu -POINTER page_lsn DB_LSN * lu -ARG pgno db_pgno_t lu -ARG ptype u_int32_t lu -ARG next db_pgno_t lu -END - -DEPRECATED pg_alloc1 60 -ARG fileid int32_t ld -POINTER meta_lsn DB_LSN * lu -POINTER alloc_lsn DB_LSN * lu -POINTER page_lsn DB_LSN * lu -ARG pgno db_pgno_t lu -ARG ptype u_int32_t lu -ARG next db_pgno_t lu -END - -/* - * BTREE-pg_free: used to record freeing a page. - * - * pgno: the page being freed. - * meta_lsn: the meta-data page's original lsn. - * header: the header from the free'd page. - * next: the previous next pointer on the metadata page. - */ -BEGIN pg_free 52 -ARG fileid int32_t ld -ARG pgno db_pgno_t lu -POINTER meta_lsn DB_LSN * lu -DBT header DBT s -ARG next db_pgno_t lu -END - -DEPRECATED pg_free1 61 -ARG fileid int32_t ld -ARG pgno db_pgno_t lu -POINTER meta_lsn DB_LSN * lu -POINTER alloc_lsn DB_LSN * lu -DBT header DBT s -ARG next db_pgno_t lu -END - -/* - * BTREE-split: used to log a page split. - * - * left: the page number for the low-order contents. - * llsn: the left page's original LSN. - * right: the page number for the high-order contents. - * rlsn: the right page's original LSN. - * indx: the number of entries that went to the left page. - * npgno: the next page number - * nlsn: the next page's original LSN (or 0 if no next page). - * pg: the split page's contents before the split. + * NOTE: pg_alloc and pg_free have been moved to db.src, where they belong. */ -DEPRECATED split1 53 -ARG fileid int32_t ld -ARG left db_pgno_t lu -POINTER llsn DB_LSN * lu -ARG right db_pgno_t lu -POINTER rlsn DB_LSN * lu -ARG indx u_int32_t lu -ARG npgno db_pgno_t lu -POINTER nlsn DB_LSN * lu -DBT pg DBT s -END /* * BTREE-split: used to log a page split. @@ -112,14 +42,13 @@ END * rlsn: the right page's original LSN. * indx: the number of entries that went to the left page. 
* npgno: the next page number - * npgno: the next page number * nlsn: the next page's original LSN (or 0 if no next page). * root_pgno: the root page number * pg: the split page's contents before the split. * opflags: SPL_NRECS: if splitting a tree that maintains a record count. */ BEGIN split 62 -ARG fileid int32_t ld +DB fileid int32_t ld ARG left db_pgno_t lu POINTER llsn DB_LSN * lu ARG right db_pgno_t lu @@ -128,28 +57,10 @@ ARG indx u_int32_t lu ARG npgno db_pgno_t lu POINTER nlsn DB_LSN * lu ARG root_pgno db_pgno_t lu -DBT pg DBT s +PGDBT pg DBT s ARG opflags u_int32_t lu END -/* - * BTREE-rsplit: used to log a reverse-split - * - * pgno: the page number of the page copied over the root. - * pgdbt: the page being copied on the root page. - * nrec: the tree's record count. - * rootent: last entry on the root page. - * rootlsn: the root page's original lsn. - */ -DEPRECATED rsplit1 54 -ARG fileid int32_t ld -ARG pgno db_pgno_t lu -DBT pgdbt DBT s -ARG nrec db_pgno_t lu -DBT rootent DBT s -POINTER rootlsn DB_LSN * lu -END - /* * BTREE-rsplit: used to log a reverse-split * @@ -161,9 +72,9 @@ END * rootlsn: the root page's original lsn. */ BEGIN rsplit 63 -ARG fileid int32_t ld +DB fileid int32_t ld ARG pgno db_pgno_t lu -DBT pgdbt DBT s +PGDBT pgdbt DBT s ARG root_pgno db_pgno_t lu ARG nrec db_pgno_t lu DBT rootent DBT s @@ -180,7 +91,7 @@ END * is_insert: 0 if a delete, 1 if an insert. */ BEGIN adj 55 -ARG fileid int32_t ld +DB fileid int32_t ld ARG pgno db_pgno_t lu POINTER lsn DB_LSN * lu ARG indx u_int32_t lu @@ -198,7 +109,7 @@ END * opflags: CAD_UPDATEROOT: if root page count was adjusted. */ BEGIN cadjust 56 -ARG fileid int32_t ld +DB fileid int32_t ld ARG pgno db_pgno_t lu POINTER lsn DB_LSN * lu ARG indx u_int32_t lu @@ -214,7 +125,7 @@ END * indx: the index to be deleted. 
*/ BEGIN cdel 57 -ARG fileid int32_t ld +DB fileid int32_t ld ARG pgno db_pgno_t lu POINTER lsn DB_LSN * lu ARG indx u_int32_t lu @@ -225,12 +136,15 @@ END * * pgno: the page modified. * lsn: the page's original lsn. + * indx: the index to be replaced. + * isdeleted: set if the record was previously deleted. * orig: the original data. - * new: the replacement data. - * duplicate: the prefix of the replacement that matches the original. + * repl: the replacement data. + * prefix: the prefix of the replacement that matches the original. + * suffix: the suffix of the replacement that matches the original. */ BEGIN repl 58 -ARG fileid int32_t ld +DB fileid int32_t ld ARG pgno db_pgno_t lu POINTER lsn DB_LSN * lu ARG indx u_int32_t lu @@ -245,7 +159,7 @@ END * BTREE-root: log the assignment of a root btree page. */ BEGIN root 59 -ARG fileid int32_t ld +DB fileid int32_t ld ARG meta_pgno db_pgno_t lu ARG root_pgno db_pgno_t lu POINTER meta_lsn DB_LSN * lu @@ -260,7 +174,7 @@ END */ BEGIN curadj 64 /* Fileid of db affected. */ -ARG fileid int32_t ld +DB fileid int32_t ld /* Which adjustment. */ ARG mode db_ca_mode ld /* Page entry is from. */ @@ -284,7 +198,7 @@ END */ BEGIN rcuradj 65 /* Fileid of db affected. */ -ARG fileid int32_t ld +DB fileid int32_t ld /* Which adjustment. */ ARG mode ca_recno_arg ld /* Root page number. 
*/ diff --git a/db/btree/btree_auto.c b/db/btree/btree_auto.c index fdb27b7d2..16ebbcad9 100644 --- a/db/btree/btree_auto.c +++ b/db/btree/btree_auto.c @@ -5,609 +5,30 @@ #include #include -#include #include #endif #include "db_int.h" -#include "db_page.h" -#include "db_dispatch.h" -#include "db_am.h" -#include "btree.h" -#include "txn.h" - +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/db_am.h" +#include "dbinc/btree.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __bam_split_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, u_int32_t, + * PUBLIC: db_pgno_t, DB_LSN *, db_pgno_t, const DBT *, u_int32_t)); + */ int -__bam_pg_alloc_log(dbenv, txnid, ret_lsnp, flags, - fileid, meta_lsn, page_lsn, pgno, ptype, next) - DB_ENV *dbenv; +__bam_split_log(dbp, txnid, ret_lsnp, flags, left, llsn, right, rlsn, indx, + npgno, nlsn, root_pgno, pg, opflags) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; - DB_LSN * meta_lsn; - DB_LSN * page_lsn; - db_pgno_t pgno; - u_int32_t ptype; - db_pgno_t next; -{ - DBT logrec; - DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; - u_int8_t *bp; - - rectype = DB_bam_pg_alloc; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; - if (txnid == NULL) { - ZERO_LSN(null_lsn); - lsnp = &null_lsn; - } else - lsnp = &txnid->last_lsn; - logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(*meta_lsn) - + sizeof(*page_lsn) - + sizeof(pgno) - + sizeof(ptype) - + sizeof(next); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); - - bp = logrec.data; - memcpy(bp, &rectype, sizeof(rectype)); - bp += sizeof(rectype); - memcpy(bp, &txn_num, sizeof(txn_num)); - bp += sizeof(txn_num); - memcpy(bp, lsnp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - if (meta_lsn != NULL) - memcpy(bp, meta_lsn, sizeof(*meta_lsn)); - else - memset(bp, 0, sizeof(*meta_lsn)); - bp += sizeof(*meta_lsn); - if (page_lsn != NULL) - memcpy(bp, page_lsn, sizeof(*page_lsn)); - else - memset(bp, 0, sizeof(*page_lsn)); - bp += sizeof(*page_lsn); - memcpy(bp, &pgno, sizeof(pgno)); - bp += sizeof(pgno); - memcpy(bp, &ptype, sizeof(ptype)); - bp += sizeof(ptype); - memcpy(bp, &next, sizeof(next)); - bp += sizeof(next); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); - return (ret); -} - -int -__bam_pg_alloc_print(dbenv, dbtp, lsnp, notused2, notused3) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops notused2; - void *notused3; -{ - __bam_pg_alloc_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused2 = DB_TXN_ABORT; - notused3 = NULL; - - if ((ret = __bam_pg_alloc_read(dbenv, dbtp->data, &argp)) != 0) - return (ret); - printf("[%lu][%lu]bam_pg_alloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", 
(long)argp->fileid); - printf("\tmeta_lsn: [%lu][%lu]\n", - (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); - printf("\tpage_lsn: [%lu][%lu]\n", - (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tptype: %lu\n", (u_long)argp->ptype); - printf("\tnext: %lu\n", (u_long)argp->next); - printf("\n"); - __os_free(argp, 0); - return (0); -} - -int -__bam_pg_alloc_read(dbenv, recbuf, argpp) - DB_ENV *dbenv; - void *recbuf; - __bam_pg_alloc_args **argpp; -{ - __bam_pg_alloc_args *argp; - u_int8_t *bp; - int ret; - - ret = __os_malloc(dbenv, sizeof(__bam_pg_alloc_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) - return (ret); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); - bp += sizeof(argp->meta_lsn); - memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn)); - bp += sizeof(argp->page_lsn); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); - memcpy(&argp->ptype, bp, sizeof(argp->ptype)); - bp += sizeof(argp->ptype); - memcpy(&argp->next, bp, sizeof(argp->next)); - bp += sizeof(argp->next); - *argpp = argp; - return (0); -} - -int -__bam_pg_alloc1_print(dbenv, dbtp, lsnp, notused2, notused3) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops notused2; - void *notused3; -{ - __bam_pg_alloc1_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused2 = DB_TXN_ABORT; - notused3 = NULL; - - if ((ret = __bam_pg_alloc1_read(dbenv, dbtp->data, &argp)) != 0) - return (ret); - printf("[%lu][%lu]bam_pg_alloc1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - 
(u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tmeta_lsn: [%lu][%lu]\n", - (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); - printf("\talloc_lsn: [%lu][%lu]\n", - (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset); - printf("\tpage_lsn: [%lu][%lu]\n", - (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tptype: %lu\n", (u_long)argp->ptype); - printf("\tnext: %lu\n", (u_long)argp->next); - printf("\n"); - __os_free(argp, 0); - return (0); -} - -int -__bam_pg_alloc1_read(dbenv, recbuf, argpp) - DB_ENV *dbenv; - void *recbuf; - __bam_pg_alloc1_args **argpp; -{ - __bam_pg_alloc1_args *argp; - u_int8_t *bp; - int ret; - - ret = __os_malloc(dbenv, sizeof(__bam_pg_alloc1_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) - return (ret); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); - bp += sizeof(argp->meta_lsn); - memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn)); - bp += sizeof(argp->alloc_lsn); - memcpy(&argp->page_lsn, bp, sizeof(argp->page_lsn)); - bp += sizeof(argp->page_lsn); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); - memcpy(&argp->ptype, bp, sizeof(argp->ptype)); - bp += sizeof(argp->ptype); - memcpy(&argp->next, bp, sizeof(argp->next)); - bp += sizeof(argp->next); - *argpp = argp; - return (0); -} - -int -__bam_pg_free_log(dbenv, txnid, ret_lsnp, flags, - fileid, pgno, meta_lsn, 
header, next) - DB_ENV *dbenv; - DB_TXN *txnid; - DB_LSN *ret_lsnp; - u_int32_t flags; - int32_t fileid; - db_pgno_t pgno; - DB_LSN * meta_lsn; - const DBT *header; - db_pgno_t next; -{ - DBT logrec; - DB_LSN *lsnp, null_lsn; - u_int32_t zero; - u_int32_t rectype, txn_num; - int ret; - u_int8_t *bp; - - rectype = DB_bam_pg_free; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 0 : txnid->txnid; - if (txnid == NULL) { - ZERO_LSN(null_lsn); - lsnp = &null_lsn; - } else - lsnp = &txnid->last_lsn; - logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) - + sizeof(*meta_lsn) - + sizeof(u_int32_t) + (header == NULL ? 0 : header->size) - + sizeof(next); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); - - bp = logrec.data; - memcpy(bp, &rectype, sizeof(rectype)); - bp += sizeof(rectype); - memcpy(bp, &txn_num, sizeof(txn_num)); - bp += sizeof(txn_num); - memcpy(bp, lsnp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += sizeof(pgno); - if (meta_lsn != NULL) - memcpy(bp, meta_lsn, sizeof(*meta_lsn)); - else - memset(bp, 0, sizeof(*meta_lsn)); - bp += sizeof(*meta_lsn); - if (header == NULL) { - zero = 0; - memcpy(bp, &zero, sizeof(u_int32_t)); - bp += sizeof(u_int32_t); - } else { - memcpy(bp, &header->size, sizeof(header->size)); - bp += sizeof(header->size); - memcpy(bp, header->data, header->size); - bp += header->size; - } - memcpy(bp, &next, sizeof(next)); - bp += sizeof(next); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); - return (ret); -} - -int -__bam_pg_free_print(dbenv, dbtp, lsnp, notused2, notused3) 
- DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops notused2; - void *notused3; -{ - __bam_pg_free_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused2 = DB_TXN_ABORT; - notused3 = NULL; - - if ((ret = __bam_pg_free_read(dbenv, dbtp->data, &argp)) != 0) - return (ret); - printf("[%lu][%lu]bam_pg_free: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tmeta_lsn: [%lu][%lu]\n", - (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); - printf("\theader: "); - for (i = 0; i < argp->header.size; i++) { - ch = ((u_int8_t *)argp->header.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\tnext: %lu\n", (u_long)argp->next); - printf("\n"); - __os_free(argp, 0); - return (0); -} - -int -__bam_pg_free_read(dbenv, recbuf, argpp) - DB_ENV *dbenv; - void *recbuf; - __bam_pg_free_args **argpp; -{ - __bam_pg_free_args *argp; - u_int8_t *bp; - int ret; - - ret = __os_malloc(dbenv, sizeof(__bam_pg_free_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) - return (ret); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); - memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); - bp += sizeof(argp->meta_lsn); - memset(&argp->header, 0, sizeof(argp->header)); - memcpy(&argp->header.size, bp, sizeof(u_int32_t)); - bp += 
sizeof(u_int32_t); - argp->header.data = bp; - bp += argp->header.size; - memcpy(&argp->next, bp, sizeof(argp->next)); - bp += sizeof(argp->next); - *argpp = argp; - return (0); -} - -int -__bam_pg_free1_print(dbenv, dbtp, lsnp, notused2, notused3) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops notused2; - void *notused3; -{ - __bam_pg_free1_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused2 = DB_TXN_ABORT; - notused3 = NULL; - - if ((ret = __bam_pg_free1_read(dbenv, dbtp->data, &argp)) != 0) - return (ret); - printf("[%lu][%lu]bam_pg_free1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tmeta_lsn: [%lu][%lu]\n", - (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); - printf("\talloc_lsn: [%lu][%lu]\n", - (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset); - printf("\theader: "); - for (i = 0; i < argp->header.size; i++) { - ch = ((u_int8_t *)argp->header.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\tnext: %lu\n", (u_long)argp->next); - printf("\n"); - __os_free(argp, 0); - return (0); -} - -int -__bam_pg_free1_read(dbenv, recbuf, argpp) - DB_ENV *dbenv; - void *recbuf; - __bam_pg_free1_args **argpp; -{ - __bam_pg_free1_args *argp; - u_int8_t *bp; - int ret; - - ret = __os_malloc(dbenv, sizeof(__bam_pg_free1_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) - return (ret); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - 
memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); - memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); - bp += sizeof(argp->meta_lsn); - memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn)); - bp += sizeof(argp->alloc_lsn); - memset(&argp->header, 0, sizeof(argp->header)); - memcpy(&argp->header.size, bp, sizeof(u_int32_t)); - bp += sizeof(u_int32_t); - argp->header.data = bp; - bp += argp->header.size; - memcpy(&argp->next, bp, sizeof(argp->next)); - bp += sizeof(argp->next); - *argpp = argp; - return (0); -} - -int -__bam_split1_print(dbenv, dbtp, lsnp, notused2, notused3) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops notused2; - void *notused3; -{ - __bam_split1_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused2 = DB_TXN_ABORT; - notused3 = NULL; - - if ((ret = __bam_split1_read(dbenv, dbtp->data, &argp)) != 0) - return (ret); - printf("[%lu][%lu]bam_split1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tleft: %lu\n", (u_long)argp->left); - printf("\tllsn: [%lu][%lu]\n", - (u_long)argp->llsn.file, (u_long)argp->llsn.offset); - printf("\tright: %lu\n", (u_long)argp->right); - printf("\trlsn: [%lu][%lu]\n", - (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); - printf("\tindx: %lu\n", (u_long)argp->indx); - printf("\tnpgno: %lu\n", (u_long)argp->npgno); - printf("\tnlsn: [%lu][%lu]\n", - (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); - printf("\tpg: "); - for (i = 0; i < argp->pg.size; i++) { - ch = ((u_int8_t *)argp->pg.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\n"); - __os_free(argp, 0); - return (0); -} - -int 
-__bam_split1_read(dbenv, recbuf, argpp) - DB_ENV *dbenv; - void *recbuf; - __bam_split1_args **argpp; -{ - __bam_split1_args *argp; - u_int8_t *bp; - int ret; - - ret = __os_malloc(dbenv, sizeof(__bam_split1_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) - return (ret); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->left, bp, sizeof(argp->left)); - bp += sizeof(argp->left); - memcpy(&argp->llsn, bp, sizeof(argp->llsn)); - bp += sizeof(argp->llsn); - memcpy(&argp->right, bp, sizeof(argp->right)); - bp += sizeof(argp->right); - memcpy(&argp->rlsn, bp, sizeof(argp->rlsn)); - bp += sizeof(argp->rlsn); - memcpy(&argp->indx, bp, sizeof(argp->indx)); - bp += sizeof(argp->indx); - memcpy(&argp->npgno, bp, sizeof(argp->npgno)); - bp += sizeof(argp->npgno); - memcpy(&argp->nlsn, bp, sizeof(argp->nlsn)); - bp += sizeof(argp->nlsn); - memset(&argp->pg, 0, sizeof(argp->pg)); - memcpy(&argp->pg.size, bp, sizeof(u_int32_t)); - bp += sizeof(u_int32_t); - argp->pg.data = bp; - bp += argp->pg.size; - *argpp = argp; - return (0); -} - -int -__bam_split_log(dbenv, txnid, ret_lsnp, flags, - fileid, left, llsn, right, rlsn, indx, - npgno, nlsn, root_pgno, pg, opflags) - DB_ENV *dbenv; - DB_TXN *txnid; - DB_LSN *ret_lsnp; - u_int32_t flags; - int32_t fileid; db_pgno_t left; DB_LSN * llsn; db_pgno_t right; @@ -620,72 +41,140 @@ __bam_split_log(dbenv, txnid, ret_lsnp, flags, u_int32_t opflags; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t zero; - u_int32_t rectype, txn_num; - int ret; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_split; - if (txnid != 
NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_split; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(left) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*llsn) - + sizeof(right) + + sizeof(u_int32_t) + sizeof(*rlsn) - + sizeof(indx) - + sizeof(npgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*nlsn) - + sizeof(root_pgno) + + sizeof(u_int32_t) + sizeof(u_int32_t) + (pg == NULL ? 
0 : pg->size) - + sizeof(opflags); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &left, sizeof(left)); - bp += sizeof(left); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)left; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (llsn != NULL) memcpy(bp, llsn, sizeof(*llsn)); else memset(bp, 0, sizeof(*llsn)); bp += sizeof(*llsn); - memcpy(bp, &right, sizeof(right)); - bp += sizeof(right); + + uinttmp = (u_int32_t)right; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (rlsn != NULL) memcpy(bp, rlsn, sizeof(*rlsn)); else memset(bp, 0, sizeof(*rlsn)); bp += sizeof(*rlsn); - memcpy(bp, &indx, sizeof(indx)); - bp += sizeof(indx); - memcpy(bp, &npgno, sizeof(npgno)); - bp += sizeof(npgno); + + uinttmp = 
(u_int32_t)indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)npgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (nlsn != NULL) memcpy(bp, nlsn, sizeof(*nlsn)); else memset(bp, 0, sizeof(*nlsn)); bp += sizeof(*nlsn); - memcpy(bp, &root_pgno, sizeof(root_pgno)); - bp += sizeof(root_pgno); + + uinttmp = (u_int32_t)root_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (pg == NULL) { zero = 0; memcpy(bp, &zero, sizeof(u_int32_t)); @@ -696,16 +185,96 @@ __bam_split_log(dbenv, txnid, ret_lsnp, flags, memcpy(bp, pg->data, pg->size); bp += pg->size; } - memcpy(bp, &opflags, sizeof(opflags)); - bp += sizeof(opflags); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + uinttmp = (u_int32_t)opflags; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_split_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_split_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_split_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_split_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_split_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -716,50 +285,51 @@ __bam_split_print(dbenv, dbtp, lsnp, notused2, notused3) { __bam_split_args *argp; u_int32_t i; - u_int ch; + int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_split_read(dbenv, dbtp->data, &argp)) != 0) return (ret); - 
printf("[%lu][%lu]bam_split: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_split%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tleft: %lu\n", (u_long)argp->left); - printf("\tllsn: [%lu][%lu]\n", + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tleft: %lu\n", (u_long)argp->left); + (void)printf("\tllsn: [%lu][%lu]\n", (u_long)argp->llsn.file, (u_long)argp->llsn.offset); - printf("\tright: %lu\n", (u_long)argp->right); - printf("\trlsn: [%lu][%lu]\n", + (void)printf("\tright: %lu\n", (u_long)argp->right); + (void)printf("\trlsn: [%lu][%lu]\n", (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); - printf("\tindx: %lu\n", (u_long)argp->indx); - printf("\tnpgno: %lu\n", (u_long)argp->npgno); - printf("\tnlsn: [%lu][%lu]\n", + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno); + (void)printf("\tnlsn: [%lu][%lu]\n", (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); - printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); - printf("\tpg: "); + (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + (void)printf("\tpg: "); for (i = 0; i < argp->pg.size; i++) { ch = ((u_int8_t *)argp->pg.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\topflags: %lu\n", (u_long)argp->opflags); - printf("\n"); - __os_free(argp, 0); + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\topflags: %lu\n", (u_long)argp->opflags); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_split_read __P((DB_ENV *, void *, __bam_split_args **)); + */ int __bam_split_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -767,158 +337,84 @@ __bam_split_read(dbenv, recbuf, argpp) __bam_split_args **argpp; { __bam_split_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_split_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_split_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->left, bp, sizeof(argp->left)); - bp += sizeof(argp->left); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->left = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->llsn, bp, sizeof(argp->llsn)); bp += sizeof(argp->llsn); - memcpy(&argp->right, bp, sizeof(argp->right)); - bp += sizeof(argp->right); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->right = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->rlsn, bp, sizeof(argp->rlsn)); bp += sizeof(argp->rlsn); - memcpy(&argp->indx, bp, sizeof(argp->indx)); - bp += sizeof(argp->indx); - memcpy(&argp->npgno, bp, sizeof(argp->npgno)); - bp += sizeof(argp->npgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->npgno = (db_pgno_t)uinttmp; + bp += 
sizeof(uinttmp); + memcpy(&argp->nlsn, bp, sizeof(argp->nlsn)); bp += sizeof(argp->nlsn); - memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno)); - bp += sizeof(argp->root_pgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->root_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memset(&argp->pg, 0, sizeof(argp->pg)); memcpy(&argp->pg.size, bp, sizeof(u_int32_t)); bp += sizeof(u_int32_t); argp->pg.data = bp; bp += argp->pg.size; - memcpy(&argp->opflags, bp, sizeof(argp->opflags)); - bp += sizeof(argp->opflags); - *argpp = argp; - return (0); -} - -int -__bam_rsplit1_print(dbenv, dbtp, lsnp, notused2, notused3) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops notused2; - void *notused3; -{ - __bam_rsplit1_args *argp; - u_int32_t i; - u_int ch; - int ret; - - i = 0; - ch = 0; - notused2 = DB_TXN_ABORT; - notused3 = NULL; - if ((ret = __bam_rsplit1_read(dbenv, dbtp->data, &argp)) != 0) - return (ret); - printf("[%lu][%lu]bam_rsplit1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", - (u_long)lsnp->file, - (u_long)lsnp->offset, - (u_long)argp->type, - (u_long)argp->txnid->txnid, - (u_long)argp->prev_lsn.file, - (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tpgdbt: "); - for (i = 0; i < argp->pgdbt.size; i++) { - ch = ((u_int8_t *)argp->pgdbt.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\tnrec: %lu\n", (u_long)argp->nrec); - printf("\trootent: "); - for (i = 0; i < argp->rootent.size; i++) { - ch = ((u_int8_t *)argp->rootent.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\trootlsn: [%lu][%lu]\n", - (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); - printf("\n"); - __os_free(argp, 0); - return (0); -} - -int -__bam_rsplit1_read(dbenv, recbuf, argpp) - DB_ENV *dbenv; - void *recbuf; - __bam_rsplit1_args **argpp; -{ - 
__bam_rsplit1_args *argp; - u_int8_t *bp; - int ret; + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->opflags = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); - ret = __os_malloc(dbenv, sizeof(__bam_rsplit1_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) - return (ret); - argp->txnid = (DB_TXN *)&argp[1]; - bp = recbuf; - memcpy(&argp->type, bp, sizeof(argp->type)); - bp += sizeof(argp->type); - memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); - bp += sizeof(argp->txnid->txnid); - memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); - bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); - memset(&argp->pgdbt, 0, sizeof(argp->pgdbt)); - memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t)); - bp += sizeof(u_int32_t); - argp->pgdbt.data = bp; - bp += argp->pgdbt.size; - memcpy(&argp->nrec, bp, sizeof(argp->nrec)); - bp += sizeof(argp->nrec); - memset(&argp->rootent, 0, sizeof(argp->rootent)); - memcpy(&argp->rootent.size, bp, sizeof(u_int32_t)); - bp += sizeof(u_int32_t); - argp->rootent.data = bp; - bp += argp->rootent.size; - memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn)); - bp += sizeof(argp->rootlsn); *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_rsplit_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, const DBT *, db_pgno_t, db_pgno_t, + * PUBLIC: const DBT *, DB_LSN *)); + */ int -__bam_rsplit_log(dbenv, txnid, ret_lsnp, flags, - fileid, pgno, pgdbt, root_pgno, nrec, rootent, - rootlsn) - DB_ENV *dbenv; +__bam_rsplit_log(dbp, txnid, ret_lsnp, flags, pgno, pgdbt, root_pgno, nrec, rootent, + rootlsn) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_pgno_t pgno; const DBT *pgdbt; db_pgno_t root_pgno; @@ -927,45 +423,102 @@ __bam_rsplit_log(dbenv, txnid, ret_lsnp, flags, DB_LSN * rootlsn; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t 
zero; - u_int32_t rectype, txn_num; - int ret; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_rsplit; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_rsplit; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size) - + sizeof(root_pgno) - + sizeof(nrec) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(u_int32_t) + (rootent == NULL ? 
0 : rootent->size) + sizeof(*rootlsn); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += sizeof(pgno); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (pgdbt == NULL) { zero = 0; memcpy(bp, &zero, sizeof(u_int32_t)); @@ -976,10 +529,15 @@ __bam_rsplit_log(dbenv, txnid, ret_lsnp, flags, memcpy(bp, pgdbt->data, pgdbt->size); bp += pgdbt->size; } - memcpy(bp, &root_pgno, sizeof(root_pgno)); - bp += sizeof(root_pgno); - memcpy(bp, &nrec, sizeof(nrec)); - bp += sizeof(nrec); + + uinttmp = (u_int32_t)root_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)nrec; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); 
+ if (rootent == NULL) { zero = 0; memcpy(bp, &zero, sizeof(u_int32_t)); @@ -990,19 +548,98 @@ __bam_rsplit_log(dbenv, txnid, ret_lsnp, flags, memcpy(bp, rootent->data, rootent->size); bp += rootent->size; } + if (rootlsn != NULL) memcpy(bp, rootlsn, sizeof(*rootlsn)); else memset(bp, 0, sizeof(*rootlsn)); bp += sizeof(*rootlsn); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_rsplit_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_rsplit_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_rsplit_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = 
__rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_rsplit_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_rsplit_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1013,52 +650,50 @@ __bam_rsplit_print(dbenv, dbtp, lsnp, notused2, notused3) { __bam_rsplit_args *argp; u_int32_t i; - u_int ch; + int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_rsplit_read(dbenv, dbtp->data, &argp)) != 0) return (ret); - printf("[%lu][%lu]bam_rsplit: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_rsplit%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tpgdbt: "); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tpgdbt: "); for (i = 0; i < argp->pgdbt.size; i++) { ch = ((u_int8_t *)argp->pgdbt.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); - printf("\tnrec: %lu\n", (u_long)argp->nrec); - printf("\trootent: "); + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + (void)printf("\tnrec: %lu\n", (u_long)argp->nrec); + (void)printf("\trootent: "); for (i = 0; i < argp->rootent.size; i++) { ch = ((u_int8_t *)argp->rootent.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); } - printf("\n"); - printf("\trootlsn: [%lu][%lu]\n", + (void)printf("\n"); + (void)printf("\trootlsn: [%lu][%lu]\n", (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); - printf("\n"); - __os_free(argp, 0); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_rsplit_read __P((DB_ENV *, void *, __bam_rsplit_args **)); + */ int __bam_rsplit_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -1066,53 +701,71 @@ __bam_rsplit_read(dbenv, recbuf, argpp) __bam_rsplit_args **argpp; { __bam_rsplit_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_rsplit_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_rsplit_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memset(&argp->pgdbt, 0, sizeof(argp->pgdbt)); memcpy(&argp->pgdbt.size, bp, sizeof(u_int32_t)); bp += sizeof(u_int32_t); argp->pgdbt.data = 
bp; bp += argp->pgdbt.size; - memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno)); - bp += sizeof(argp->root_pgno); - memcpy(&argp->nrec, bp, sizeof(argp->nrec)); - bp += sizeof(argp->nrec); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->root_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->nrec = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memset(&argp->rootent, 0, sizeof(argp->rootent)); memcpy(&argp->rootent.size, bp, sizeof(u_int32_t)); bp += sizeof(u_int32_t); argp->rootent.data = bp; bp += argp->rootent.size; + memcpy(&argp->rootlsn, bp, sizeof(argp->rootlsn)); bp += sizeof(argp->rootlsn); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_adj_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, u_int32_t, + * PUBLIC: u_int32_t)); + */ int -__bam_adj_log(dbenv, txnid, ret_lsnp, flags, - fileid, pgno, lsn, indx, indx_copy, is_insert) - DB_ENV *dbenv; +__bam_adj_log(dbp, txnid, ret_lsnp, flags, pgno, lsn, indx, indx_copy, is_insert) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_pgno_t pgno; DB_LSN * lsn; u_int32_t indx; @@ -1120,62 +773,204 @@ __bam_adj_log(dbenv, txnid, ret_lsnp, flags, u_int32_t is_insert; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_adj; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_adj; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*lsn) - + sizeof(indx) - + sizeof(indx_copy) - + sizeof(is_insert); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += 
sizeof(pgno); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (lsn != NULL) memcpy(bp, lsn, sizeof(*lsn)); else memset(bp, 0, sizeof(*lsn)); bp += sizeof(*lsn); - memcpy(bp, &indx, sizeof(indx)); - bp += sizeof(indx); - memcpy(bp, &indx_copy, sizeof(indx_copy)); - bp += sizeof(indx_copy); - memcpy(bp, &is_insert, sizeof(is_insert)); - bp += sizeof(is_insert); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + uinttmp = (u_int32_t)indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)indx_copy; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)is_insert; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_adj_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_adj_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_adj_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_adj_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_adj_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1185,36 +980,38 @@ __bam_adj_print(dbenv, dbtp, lsnp, notused2, notused3) void *notused3; { __bam_adj_args *argp; - u_int32_t i; - u_int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_adj_read(dbenv, dbtp->data, &argp)) != 0) return (ret); - 
printf("[%lu][%lu]bam_adj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_adj%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tlsn: [%lu][%lu]\n", + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", (u_long)argp->lsn.file, (u_long)argp->lsn.offset); - printf("\tindx: %lu\n", (u_long)argp->indx); - printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); - printf("\tis_insert: %lu\n", (u_long)argp->is_insert); - printf("\n"); - __os_free(argp, 0); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); + (void)printf("\tis_insert: %lu\n", (u_long)argp->is_insert); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_adj_read __P((DB_ENV *, void *, __bam_adj_args **)); + */ int __bam_adj_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -1222,45 +1019,62 @@ __bam_adj_read(dbenv, recbuf, argpp) __bam_adj_args **argpp; { __bam_adj_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_adj_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_adj_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - 
memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); bp += sizeof(argp->lsn); - memcpy(&argp->indx, bp, sizeof(argp->indx)); - bp += sizeof(argp->indx); - memcpy(&argp->indx_copy, bp, sizeof(argp->indx_copy)); - bp += sizeof(argp->indx_copy); - memcpy(&argp->is_insert, bp, sizeof(argp->is_insert)); - bp += sizeof(argp->is_insert); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->indx_copy = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->is_insert = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_cadjust_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, int32_t, u_int32_t)); + */ int -__bam_cadjust_log(dbenv, txnid, ret_lsnp, flags, - fileid, pgno, lsn, indx, adjust, opflags) - DB_ENV *dbenv; +__bam_cadjust_log(dbp, txnid, ret_lsnp, flags, pgno, lsn, indx, adjust, opflags) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_pgno_t pgno; DB_LSN * lsn; u_int32_t indx; @@ -1268,62 +1082,204 @@ __bam_cadjust_log(dbenv, txnid, ret_lsnp, flags, u_int32_t opflags; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_cadjust; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_cadjust; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*lsn) - + sizeof(indx) - + sizeof(adjust) - + sizeof(opflags); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += 
sizeof(pgno); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (lsn != NULL) memcpy(bp, lsn, sizeof(*lsn)); else memset(bp, 0, sizeof(*lsn)); bp += sizeof(*lsn); - memcpy(bp, &indx, sizeof(indx)); - bp += sizeof(indx); - memcpy(bp, &adjust, sizeof(adjust)); - bp += sizeof(adjust); - memcpy(bp, &opflags, sizeof(opflags)); - bp += sizeof(opflags); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + uinttmp = (u_int32_t)indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)adjust; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)opflags; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_cadjust_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_cadjust_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_cadjust_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_cadjust_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_cadjust_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1333,36 +1289,39 @@ __bam_cadjust_print(dbenv, dbtp, lsnp, notused2, notused3) void *notused3; { __bam_cadjust_args *argp; - u_int32_t i; - u_int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_cadjust_read(dbenv, dbtp->data, &argp)) != 0) return 
(ret); - printf("[%lu][%lu]bam_cadjust: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_cadjust%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tlsn: [%lu][%lu]\n", + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", (u_long)argp->lsn.file, (u_long)argp->lsn.offset); - printf("\tindx: %lu\n", (u_long)argp->indx); - printf("\tadjust: %ld\n", (long)argp->adjust); - printf("\topflags: %lu\n", (u_long)argp->opflags); - printf("\n"); - __os_free(argp, 0); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tadjust: %ld\n", (long)argp->adjust); + (void)printf("\topflags: %lu\n", (u_long)argp->opflags); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_cadjust_read __P((DB_ENV *, void *, + * PUBLIC: __bam_cadjust_args **)); + */ int __bam_cadjust_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -1370,100 +1329,255 @@ __bam_cadjust_read(dbenv, recbuf, argpp) __bam_cadjust_args **argpp; { __bam_cadjust_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_cadjust_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_cadjust_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += 
sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); bp += sizeof(argp->lsn); - memcpy(&argp->indx, bp, sizeof(argp->indx)); - bp += sizeof(argp->indx); - memcpy(&argp->adjust, bp, sizeof(argp->adjust)); - bp += sizeof(argp->adjust); - memcpy(&argp->opflags, bp, sizeof(argp->opflags)); - bp += sizeof(argp->opflags); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->adjust = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->opflags = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_cdel_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t)); + */ int -__bam_cdel_log(dbenv, txnid, ret_lsnp, flags, - fileid, pgno, lsn, indx) - DB_ENV *dbenv; +__bam_cdel_log(dbp, txnid, ret_lsnp, flags, pgno, lsn, indx) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_pgno_t pgno; DB_LSN * lsn; u_int32_t indx; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_cdel; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_cdel; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*lsn) - + sizeof(indx); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += sizeof(pgno); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == 
DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (lsn != NULL) memcpy(bp, lsn, sizeof(*lsn)); else memset(bp, 0, sizeof(*lsn)); bp += sizeof(*lsn); - memcpy(bp, &indx, sizeof(indx)); - bp += sizeof(indx); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + uinttmp = (u_int32_t)indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_cdel_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_cdel_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_cdel_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_cdel_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_cdel_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1473,34 +1587,36 @@ __bam_cdel_print(dbenv, dbtp, lsnp, notused2, notused3) void *notused3; { __bam_cdel_args *argp; - u_int32_t i; - u_int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_cdel_read(dbenv, dbtp->data, &argp)) != 0) return (ret); - 
printf("[%lu][%lu]bam_cdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_cdel%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tlsn: [%lu][%lu]\n", + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", (u_long)argp->lsn.file, (u_long)argp->lsn.offset); - printf("\tindx: %lu\n", (u_long)argp->indx); - printf("\n"); - __os_free(argp, 0); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_cdel_read __P((DB_ENV *, void *, __bam_cdel_args **)); + */ int __bam_cdel_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -1508,42 +1624,56 @@ __bam_cdel_read(dbenv, recbuf, argpp) __bam_cdel_args **argpp; { __bam_cdel_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_cdel_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_cdel_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->pgno = 
(db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); bp += sizeof(argp->lsn); - memcpy(&argp->indx, bp, sizeof(argp->indx)); - bp += sizeof(argp->indx); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_repl_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, u_int32_t, + * PUBLIC: const DBT *, const DBT *, u_int32_t, u_int32_t)); + */ int -__bam_repl_log(dbenv, txnid, ret_lsnp, flags, - fileid, pgno, lsn, indx, isdeleted, orig, - repl, prefix, suffix) - DB_ENV *dbenv; +__bam_repl_log(dbp, txnid, ret_lsnp, flags, pgno, lsn, indx, isdeleted, orig, + repl, prefix, suffix) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_pgno_t pgno; DB_LSN * lsn; u_int32_t indx; @@ -1554,56 +1684,118 @@ __bam_repl_log(dbenv, txnid, ret_lsnp, flags, u_int32_t suffix; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t zero; - u_int32_t rectype, txn_num; - int ret; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_repl; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_repl; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(pgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*lsn) - + sizeof(indx) - + sizeof(isdeleted) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(u_int32_t) + (orig == NULL ? 0 : orig->size) + sizeof(u_int32_t) + (repl == NULL ? 0 : repl->size) - + sizeof(prefix) - + sizeof(suffix); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, 
lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &pgno, sizeof(pgno)); - bp += sizeof(pgno); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (lsn != NULL) memcpy(bp, lsn, sizeof(*lsn)); else memset(bp, 0, sizeof(*lsn)); bp += sizeof(*lsn); - memcpy(bp, &indx, sizeof(indx)); - bp += sizeof(indx); - memcpy(bp, &isdeleted, sizeof(isdeleted)); - bp += sizeof(isdeleted); + + uinttmp = (u_int32_t)indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)isdeleted; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (orig == NULL) { zero = 0; memcpy(bp, &zero, sizeof(u_int32_t)); @@ -1614,6 +1806,7 @@ __bam_repl_log(dbenv, txnid, ret_lsnp, flags, memcpy(bp, orig->data, orig->size); bp += orig->size; } + if (repl == NULL) { zero = 0; memcpy(bp, &zero, sizeof(u_int32_t)); @@ -1624,18 +1817,100 @@ __bam_repl_log(dbenv, txnid, ret_lsnp, flags, memcpy(bp, repl->data, repl->size); bp += repl->size; } - memcpy(bp, &prefix, sizeof(prefix)); - bp += sizeof(prefix); - memcpy(bp, &suffix, sizeof(suffix)); - bp += sizeof(suffix); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + uinttmp = (u_int32_t)prefix; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)suffix; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if 
(!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_repl_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_repl_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_repl_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_repl_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_repl_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1646,54 +1921,52 @@ __bam_repl_print(dbenv, dbtp, lsnp, notused2, notused3) { __bam_repl_args *argp; u_int32_t i; - u_int ch; + int ch; int ret; - i 
= 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_repl_read(dbenv, dbtp->data, &argp)) != 0) return (ret); - printf("[%lu][%lu]bam_repl: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_repl%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tpgno: %lu\n", (u_long)argp->pgno); - printf("\tlsn: [%lu][%lu]\n", + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", (u_long)argp->lsn.file, (u_long)argp->lsn.offset); - printf("\tindx: %lu\n", (u_long)argp->indx); - printf("\tisdeleted: %lu\n", (u_long)argp->isdeleted); - printf("\torig: "); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tisdeleted: %lu\n", (u_long)argp->isdeleted); + (void)printf("\torig: "); for (i = 0; i < argp->orig.size; i++) { ch = ((u_int8_t *)argp->orig.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); } - printf("\n"); - printf("\trepl: "); + (void)printf("\n"); + (void)printf("\trepl: "); for (i = 0; i < argp->repl.size; i++) { ch = ((u_int8_t *)argp->repl.data)[i]; - if (isprint(ch) || ch == 0xa) - putchar(ch); - else - printf("%#x ", ch); - } - printf("\n"); - printf("\tprefix: %lu\n", (u_long)argp->prefix); - printf("\tsuffix: %lu\n", (u_long)argp->suffix); - printf("\n"); - __os_free(argp, 0); + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tprefix: %lu\n", (u_long)argp->prefix); + (void)printf("\tsuffix: %lu\n", (u_long)argp->suffix); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_repl_read __P((DB_ENV *, void *, __bam_repl_args **)); + */ int __bam_repl_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -1701,112 +1974,271 @@ __bam_repl_read(dbenv, recbuf, argpp) __bam_repl_args **argpp; { __bam_repl_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_repl_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_repl_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->pgno, bp, sizeof(argp->pgno)); - bp += sizeof(argp->pgno); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->lsn, bp, sizeof(argp->lsn)); bp += sizeof(argp->lsn); - memcpy(&argp->indx, bp, sizeof(argp->indx)); - bp += sizeof(argp->indx); - memcpy(&argp->isdeleted, bp, sizeof(argp->isdeleted)); - bp += sizeof(argp->isdeleted); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->isdeleted = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + memset(&argp->orig, 0, sizeof(argp->orig)); memcpy(&argp->orig.size, bp, sizeof(u_int32_t)); bp += sizeof(u_int32_t); argp->orig.data = bp; bp += argp->orig.size; + 
memset(&argp->repl, 0, sizeof(argp->repl)); memcpy(&argp->repl.size, bp, sizeof(u_int32_t)); bp += sizeof(u_int32_t); argp->repl.data = bp; bp += argp->repl.size; - memcpy(&argp->prefix, bp, sizeof(argp->prefix)); - bp += sizeof(argp->prefix); - memcpy(&argp->suffix, bp, sizeof(argp->suffix)); - bp += sizeof(argp->suffix); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->prefix = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->suffix = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_root_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t, DB_LSN *)); + */ int -__bam_root_log(dbenv, txnid, ret_lsnp, flags, - fileid, meta_pgno, root_pgno, meta_lsn) - DB_ENV *dbenv; +__bam_root_log(dbp, txnid, ret_lsnp, flags, meta_pgno, root_pgno, meta_lsn) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_pgno_t meta_pgno; db_pgno_t root_pgno; DB_LSN * meta_lsn; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_root; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_root; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(meta_pgno) - + sizeof(root_pgno) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + sizeof(*meta_lsn); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &meta_pgno, sizeof(meta_pgno)); - bp += sizeof(meta_pgno); - memcpy(bp, &root_pgno, sizeof(root_pgno)); 
- bp += sizeof(root_pgno); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)meta_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)root_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + if (meta_lsn != NULL) memcpy(bp, meta_lsn, sizeof(*meta_lsn)); else memset(bp, 0, sizeof(*meta_lsn)); bp += sizeof(*meta_lsn); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_root_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_root_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_root_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_root_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_root_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1816,34 +2248,36 @@ __bam_root_print(dbenv, dbtp, lsnp, notused2, notused3) void *notused3; { __bam_root_args *argp; - u_int32_t i; - u_int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_root_read(dbenv, dbtp->data, &argp)) != 0) return (ret); - 
printf("[%lu][%lu]bam_root: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_root%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); - printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); - printf("\tmeta_lsn: [%lu][%lu]\n", + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); - printf("\n"); - __os_free(argp, 0); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_root_read __P((DB_ENV *, void *, __bam_root_args **)); + */ int __bam_root_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -1851,42 +2285,56 @@ __bam_root_read(dbenv, recbuf, argpp) __bam_root_args **argpp; { __bam_root_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_root_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_root_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->meta_pgno, bp, sizeof(argp->meta_pgno)); - bp += sizeof(argp->meta_pgno); - memcpy(&argp->root_pgno, bp, sizeof(argp->root_pgno)); - bp += sizeof(argp->root_pgno); + + 
memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->root_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); bp += sizeof(argp->meta_lsn); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_curadj_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_ca_mode, db_pgno_t, db_pgno_t, db_pgno_t, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t)); + */ int -__bam_curadj_log(dbenv, txnid, ret_lsnp, flags, - fileid, mode, from_pgno, to_pgno, left_pgno, first_indx, - from_indx, to_indx) - DB_ENV *dbenv; +__bam_curadj_log(dbp, txnid, ret_lsnp, flags, mode, from_pgno, to_pgno, left_pgno, first_indx, + from_indx, to_indx) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; db_ca_mode mode; db_pgno_t from_pgno; db_pgno_t to_pgno; @@ -1896,65 +2344,212 @@ __bam_curadj_log(dbenv, txnid, ret_lsnp, flags, u_int32_t to_indx; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_curadj; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 
0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_curadj; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(mode) - + sizeof(from_pgno) - + sizeof(to_pgno) - + sizeof(left_pgno) - + sizeof(first_indx) - + sizeof(from_indx) - + sizeof(to_indx); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC +do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += 
sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &mode, sizeof(mode)); - bp += sizeof(mode); - memcpy(bp, &from_pgno, sizeof(from_pgno)); - bp += sizeof(from_pgno); - memcpy(bp, &to_pgno, sizeof(to_pgno)); - bp += sizeof(to_pgno); - memcpy(bp, &left_pgno, sizeof(left_pgno)); - bp += sizeof(left_pgno); - memcpy(bp, &first_indx, sizeof(first_indx)); - bp += sizeof(first_indx); - memcpy(bp, &from_indx, sizeof(from_indx)); - bp += sizeof(from_indx); - memcpy(bp, &to_indx, sizeof(to_indx)); - bp += sizeof(to_indx); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)mode; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)from_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)to_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)left_pgno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)first_indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)from_indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)to_indx; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by 
recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_curadj_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_curadj_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_curadj_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_curadj_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_curadj_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -1964,37 +2559,39 @@ __bam_curadj_print(dbenv, dbtp, lsnp, notused2, notused3) void *notused3; { __bam_curadj_args *argp; - u_int32_t i; - u_int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_curadj_read(dbenv, dbtp->data, &argp)) != 0) 
return (ret); - printf("[%lu][%lu]bam_curadj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_curadj%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tmode: %ld\n", (long)argp->mode); - printf("\tfrom_pgno: %lu\n", (u_long)argp->from_pgno); - printf("\tto_pgno: %lu\n", (u_long)argp->to_pgno); - printf("\tleft_pgno: %lu\n", (u_long)argp->left_pgno); - printf("\tfirst_indx: %lu\n", (u_long)argp->first_indx); - printf("\tfrom_indx: %lu\n", (u_long)argp->from_indx); - printf("\tto_indx: %lu\n", (u_long)argp->to_indx); - printf("\n"); - __os_free(argp, 0); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmode: %ld\n", (long)argp->mode); + (void)printf("\tfrom_pgno: %lu\n", (u_long)argp->from_pgno); + (void)printf("\tto_pgno: %lu\n", (u_long)argp->to_pgno); + (void)printf("\tleft_pgno: %lu\n", (u_long)argp->left_pgno); + (void)printf("\tfirst_indx: %lu\n", (u_long)argp->first_indx); + (void)printf("\tfrom_indx: %lu\n", (u_long)argp->from_indx); + (void)printf("\tto_indx: %lu\n", (u_long)argp->to_indx); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_curadj_read __P((DB_ENV *, void *, __bam_curadj_args **)); + */ int __bam_curadj_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -2002,105 +2599,268 @@ __bam_curadj_read(dbenv, recbuf, argpp) __bam_curadj_args **argpp; { __bam_curadj_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_curadj_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_curadj_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); 
bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->mode, bp, sizeof(argp->mode)); - bp += sizeof(argp->mode); - memcpy(&argp->from_pgno, bp, sizeof(argp->from_pgno)); - bp += sizeof(argp->from_pgno); - memcpy(&argp->to_pgno, bp, sizeof(argp->to_pgno)); - bp += sizeof(argp->to_pgno); - memcpy(&argp->left_pgno, bp, sizeof(argp->left_pgno)); - bp += sizeof(argp->left_pgno); - memcpy(&argp->first_indx, bp, sizeof(argp->first_indx)); - bp += sizeof(argp->first_indx); - memcpy(&argp->from_indx, bp, sizeof(argp->from_indx)); - bp += sizeof(argp->from_indx); - memcpy(&argp->to_indx, bp, sizeof(argp->to_indx)); - bp += sizeof(argp->to_indx); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->mode = (db_ca_mode)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->from_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->to_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->left_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->first_indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->from_indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->to_indx = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_rcuradj_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, ca_recno_arg, db_pgno_t, db_recno_t, u_int32_t)); + */ int -__bam_rcuradj_log(dbenv, txnid, ret_lsnp, flags, - fileid, mode, root, recno, order) 
- DB_ENV *dbenv; +__bam_rcuradj_log(dbp, txnid, ret_lsnp, flags, mode, root, recno, order) + DB *dbp; DB_TXN *txnid; DB_LSN *ret_lsnp; u_int32_t flags; - int32_t fileid; ca_recno_arg mode; db_pgno_t root; db_recno_t recno; u_int32_t order; { DBT logrec; + DB_ENV *dbenv; + DB_TXNLOGREC *lr; DB_LSN *lsnp, null_lsn; - u_int32_t rectype, txn_num; - int ret; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; u_int8_t *bp; - - rectype = DB_bam_rcuradj; - if (txnid != NULL && - TAILQ_FIRST(&txnid->kids) != NULL && - (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) - return (ret); - txn_num = txnid == NULL ? 0 : txnid->txnid; + int is_durable, ret; + + dbenv = dbp->dbenv; + rectype = DB___bam_rcuradj; + npad = 0; + + is_durable = 1; + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (F_ISSET(dbenv, DB_ENV_TXN_NOT_DURABLE) && txnid == NULL) + return (0); + is_durable = 0; + } if (txnid == NULL) { - ZERO_LSN(null_lsn); + txn_num = 0; + null_lsn.file = 0; + null_lsn.offset = 0; lsnp = &null_lsn; - } else + } else { + if (TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid->txnid; lsnp = &txnid->last_lsn; + } + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) - + sizeof(fileid) - + sizeof(mode) - + sizeof(root) - + sizeof(recno) - + sizeof(order); - if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) - return (ret); + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(dbenv)) { + npad = + ((DB_CIPHER *)dbenv->crypto_handle)->adj_size(logrec.size); + logrec.size += npad; + } + + if (!is_durable && txnid != NULL) { + if ((ret = __os_malloc(dbenv, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + goto do_malloc; +#else + logrec.data = &lr->data; +#endif + } else { +#ifdef DIAGNOSTIC 
+do_malloc: +#endif + if ((ret = + __os_malloc(dbenv, logrec.size, &logrec.data)) != 0) { +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) + (void)__os_free(dbenv, lr); +#endif + return (ret); + } + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(bp, &fileid, sizeof(fileid)); - bp += sizeof(fileid); - memcpy(bp, &mode, sizeof(mode)); - bp += sizeof(mode); - memcpy(bp, &root, sizeof(root)); - bp += sizeof(root); - memcpy(bp, &recno, sizeof(recno)); - bp += sizeof(recno); - memcpy(bp, &order, sizeof(order)); - bp += sizeof(order); - DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); - ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); - if (txnid != NULL) - txnid->last_lsn = *ret_lsnp; - __os_free(logrec.data, logrec.size); + + DB_ASSERT(dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + uinttmp = (u_int32_t)dbp->log_filename->id; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)mode; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)root; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)recno; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)order; + memcpy(bp, &uinttmp, sizeof(uinttmp)); + bp += sizeof(uinttmp); + + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + +#ifdef DIAGNOSTIC + if (!is_durable && txnid != NULL) { + /* + * We set the debug bit if we are going + * to log non-durable transactions so + * they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + memcpy(logrec.data, &rectype, sizeof(rectype)); + } +#endif + + if (!is_durable && txnid != NULL) { + ret = 0; + STAILQ_INSERT_HEAD(&txnid->logs, lr, links); +#ifdef DIAGNOSTIC + goto do_put; +#endif + } else{ +#ifdef DIAGNOSTIC +do_put: +#endif + ret = __log_put(dbenv, + ret_lsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); + if (ret == 0 && txnid != NULL) + txnid->last_lsn = *ret_lsnp; + } + + if (!is_durable) + LSN_NOT_LOGGED(*ret_lsnp); +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_rcuradj_print(dbenv, + (DBT *)&logrec, ret_lsnp, NULL, NULL); +#endif +#ifndef DIAGNOSTIC + if (is_durable || txnid == NULL) +#endif + __os_free(dbenv, logrec.data); + return (ret); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_rcuradj_getpgnos __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_rcuradj_getpgnos(dbenv, rec, lsnp, notused1, summary) + DB_ENV *dbenv; + DBT *rec; + DB_LSN *lsnp; + db_recops notused1; + void *summary; +{ + TXN_RECS *t; + int ret; + COMPQUIET(rec, NULL); + COMPQUIET(notused1, DB_TXN_ABORT); + + t = (TXN_RECS *)summary; + + if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0) + return (ret); + + t->array[t->npages].flags = LSN_PAGE_NOLOCK; + t->array[t->npages].lsn = *lsnp; + t->array[t->npages].fid = DB_LOGFILEID_INVALID; + memset(&t->array[t->npages].pgdesc, 0, + sizeof(t->array[t->npages].pgdesc)); + + t->npages++; + + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_rcuradj_print __P((DB_ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ int __bam_rcuradj_print(dbenv, dbtp, lsnp, notused2, notused3) DB_ENV *dbenv; @@ -2110,34 +2870,37 @@ __bam_rcuradj_print(dbenv, dbtp, lsnp, notused2, notused3) void *notused3; { __bam_rcuradj_args *argp; - u_int32_t i; - u_int ch; int ret; - i = 0; - ch = 0; notused2 = DB_TXN_ABORT; notused3 = NULL; if ((ret = __bam_rcuradj_read(dbenv, dbtp->data, &argp)) != 0) return 
(ret); - printf("[%lu][%lu]bam_rcuradj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (void)printf( + "[%lu][%lu]__bam_rcuradj%s: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", (u_long)argp->type, (u_long)argp->txnid->txnid, (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); - printf("\tfileid: %ld\n", (long)argp->fileid); - printf("\tmode: %ld\n", (long)argp->mode); - printf("\troot: %ld\n", (long)argp->root); - printf("\trecno: %ld\n", (long)argp->recno); - printf("\torder: %ld\n", (long)argp->order); - printf("\n"); - __os_free(argp, 0); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmode: %ld\n", (long)argp->mode); + (void)printf("\troot: %ld\n", (long)argp->root); + (void)printf("\trecno: %ld\n", (long)argp->recno); + (void)printf("\torder: %ld\n", (long)argp->order); + (void)printf("\n"); + __os_free(dbenv, argp); + return (0); } +/* + * PUBLIC: int __bam_rcuradj_read __P((DB_ENV *, void *, + * PUBLIC: __bam_rcuradj_args **)); + */ int __bam_rcuradj_read(dbenv, recbuf, argpp) DB_ENV *dbenv; @@ -2145,140 +2908,173 @@ __bam_rcuradj_read(dbenv, recbuf, argpp) __bam_rcuradj_args **argpp; { __bam_rcuradj_args *argp; + u_int32_t uinttmp; u_int8_t *bp; int ret; - ret = __os_malloc(dbenv, sizeof(__bam_rcuradj_args) + - sizeof(DB_TXN), NULL, &argp); - if (ret != 0) + if ((ret = __os_malloc(dbenv, + sizeof(__bam_rcuradj_args) + sizeof(DB_TXN), &argp)) != 0) return (ret); argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; memcpy(&argp->type, bp, sizeof(argp->type)); bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); bp += sizeof(DB_LSN); - memcpy(&argp->fileid, bp, sizeof(argp->fileid)); - bp += sizeof(argp->fileid); - memcpy(&argp->mode, bp, sizeof(argp->mode)); - bp += sizeof(argp->mode); - memcpy(&argp->root, bp, sizeof(argp->root)); - 
bp += sizeof(argp->root); - memcpy(&argp->recno, bp, sizeof(argp->recno)); - bp += sizeof(argp->recno); - memcpy(&argp->order, bp, sizeof(argp->order)); - bp += sizeof(argp->order); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->mode = (ca_recno_arg)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->root = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->recno = (db_recno_t)uinttmp; + bp += sizeof(uinttmp); + + memcpy(&uinttmp, bp, sizeof(uinttmp)); + argp->order = (u_int32_t)uinttmp; + bp += sizeof(uinttmp); + *argpp = argp; return (0); } +/* + * PUBLIC: int __bam_init_print __P((DB_ENV *, int (***)(DB_ENV *, + * PUBLIC: DBT *, DB_LSN *, db_recops, void *), size_t *)); + */ int -__bam_init_print(dbenv) +__bam_init_print(dbenv, dtabp, dtabsizep) DB_ENV *dbenv; + int (***dtabp)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t *dtabsizep; { int ret; - if ((ret = __db_add_recovery(dbenv, - __bam_pg_alloc_print, DB_bam_pg_alloc)) != 0) - return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_pg_alloc1_print, DB_bam_pg_alloc1)) != 0) - return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_pg_free_print, DB_bam_pg_free)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_split_print, DB___bam_split)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_pg_free1_print, DB_bam_pg_free1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_rsplit_print, DB___bam_rsplit)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_split1_print, DB_bam_split1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_adj_print, DB___bam_adj)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_split_print, DB_bam_split)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + 
__bam_cadjust_print, DB___bam_cadjust)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_rsplit1_print, DB_bam_rsplit1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_cdel_print, DB___bam_cdel)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_rsplit_print, DB_bam_rsplit)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_repl_print, DB___bam_repl)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_adj_print, DB_bam_adj)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_root_print, DB___bam_root)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_cadjust_print, DB_bam_cadjust)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_curadj_print, DB___bam_curadj)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_cdel_print, DB_bam_cdel)) != 0) - return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_repl_print, DB_bam_repl)) != 0) - return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_root_print, DB_bam_root)) != 0) - return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_curadj_print, DB_bam_curadj)) != 0) - return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_rcuradj_print, DB_bam_rcuradj)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_rcuradj_print, DB___bam_rcuradj)) != 0) return (ret); return (0); } +#ifdef HAVE_REPLICATION +/* + * PUBLIC: int __bam_init_getpgnos __P((DB_ENV *, int (***)(DB_ENV *, + * PUBLIC: DBT *, DB_LSN *, db_recops, void *), size_t *)); + */ int -__bam_init_recover(dbenv) +__bam_init_getpgnos(dbenv, dtabp, dtabsizep) DB_ENV *dbenv; + int (***dtabp)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t *dtabsizep; { int ret; - if ((ret = __db_add_recovery(dbenv, - __bam_pg_alloc_recover, DB_bam_pg_alloc)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_split_getpgnos, DB___bam_split)) != 0) return (ret); - 
if ((ret = __db_add_recovery(dbenv, - __deprecated_recover, DB_bam_pg_alloc1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_rsplit_getpgnos, DB___bam_rsplit)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_pg_free_recover, DB_bam_pg_free)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_adj_getpgnos, DB___bam_adj)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __deprecated_recover, DB_bam_pg_free1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_cadjust_getpgnos, DB___bam_cadjust)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __deprecated_recover, DB_bam_split1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_cdel_getpgnos, DB___bam_cdel)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_split_recover, DB_bam_split)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_repl_getpgnos, DB___bam_repl)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __deprecated_recover, DB_bam_rsplit1)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_root_getpgnos, DB___bam_root)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_curadj_getpgnos, DB___bam_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_rcuradj_getpgnos, DB___bam_rcuradj)) != 0) + return (ret); + return (0); +} +#endif /* HAVE_REPLICATION */ + +/* + * PUBLIC: int __bam_init_recover __P((DB_ENV *, int (***)(DB_ENV *, + * PUBLIC: DBT *, DB_LSN *, db_recops, void *), size_t *)); + */ +int +__bam_init_recover(dbenv, dtabp, dtabsizep) + DB_ENV *dbenv; + int (***dtabp)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + size_t *dtabsizep; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_split_recover, DB___bam_split)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_rsplit_recover, 
DB_bam_rsplit)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_rsplit_recover, DB___bam_rsplit)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_adj_recover, DB_bam_adj)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_adj_recover, DB___bam_adj)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_cadjust_recover, DB_bam_cadjust)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_cadjust_recover, DB___bam_cadjust)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_cdel_recover, DB_bam_cdel)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_cdel_recover, DB___bam_cdel)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_repl_recover, DB_bam_repl)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_repl_recover, DB___bam_repl)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_root_recover, DB_bam_root)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_root_recover, DB___bam_root)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_curadj_recover, DB_bam_curadj)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_curadj_recover, DB___bam_curadj)) != 0) return (ret); - if ((ret = __db_add_recovery(dbenv, - __bam_rcuradj_recover, DB_bam_rcuradj)) != 0) + if ((ret = __db_add_recovery(dbenv, dtabp, dtabsizep, + __bam_rcuradj_recover, DB___bam_rcuradj)) != 0) return (ret); return (0); } - -- cgit v1.2.3