Diffstat (limited to 'btree')
-rw-r--r-- | btree/Makefile.inc | 7
-rw-r--r-- | btree/bt_close.c | 182
-rw-r--r-- | btree/bt_compact.c | 3018
-rw-r--r-- | btree/bt_compare.c | 213
-rw-r--r-- | btree/bt_compress.c | 3024
-rw-r--r-- | btree/bt_conv.c | 250
-rw-r--r-- | btree/bt_curadj.c | 620
-rw-r--r-- | btree/bt_cursor.c | 3055
-rw-r--r-- | btree/bt_debug.c | 329
-rw-r--r-- | btree/bt_delete.c | 1084
-rw-r--r-- | btree/bt_get.c | 105
-rw-r--r-- | btree/bt_method.c | 734
-rw-r--r-- | btree/bt_open.c | 933
-rw-r--r-- | btree/bt_overflow.c | 228
-rw-r--r-- | btree/bt_page.c | 98
-rw-r--r-- | btree/bt_put.c | 1201
-rw-r--r-- | btree/bt_rec.c | 2035
-rw-r--r-- | btree/bt_reclaim.c | 97
-rw-r--r-- | btree/bt_recno.c | 1385
-rw-r--r-- | btree/bt_rsearch.c | 502
-rw-r--r-- | btree/bt_search.c | 1028
-rw-r--r-- | btree/bt_seq.c | 460
-rw-r--r-- | btree/bt_split.c | 1839
-rw-r--r-- | btree/bt_stat.c | 669
-rw-r--r-- | btree/bt_upgrade.c | 153
-rw-r--r-- | btree/bt_utils.c | 260
-rw-r--r-- | btree/bt_verify.c | 2746
-rw-r--r-- | btree/btree.h | 383
-rw-r--r-- | btree/btree.src | 291
-rw-r--r-- | btree/btree_auto.c | 3547
-rw-r--r-- | btree/btree_autop.c | 766
-rw-r--r-- | btree/extern.h | 70
l--------- | btree/tags | 1
33 files changed, 27059 insertions, 4254 deletions
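The files deleted above implement the historical 4.4BSD db(3) btree access method, and the files added in their place are the Oracle Berkeley DB btree, including new page-compaction support in bt_compact.c (shown first below). As a minimal, hedged sketch of how the added __bam_compact() routine is reached from the public API: DB->compact() dispatches to it for btree databases. Handle setup is omitted, and the fill percentage here is an arbitrary value for illustration, not something taken from this diff.

#include <string.h>

#include <db.h>

int
compact_btree(DB *dbp, DB_TXN *txn)
{
	DB_COMPACT c_data;
	int ret;

	memset(&c_data, 0, sizeof(c_data));
	c_data.compact_fillpercent = 80;	/* target page-fill percentage */

	/*
	 * DB_FREE_SPACE asks compaction to return freed pages to the
	 * filesystem (the HAVE_FTRUNCATE code paths in bt_compact.c).
	 * Passing NULL for start and stop compacts the entire tree.
	 */
	ret = dbp->compact(dbp, txn, NULL, NULL, &c_data, DB_FREE_SPACE, NULL);
	return (ret);
}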
diff --git a/btree/Makefile.inc b/btree/Makefile.inc deleted file mode 100644 index 8ed7649..0000000 --- a/btree/Makefile.inc +++ /dev/null @@ -1,7 +0,0 @@ -# @(#)Makefile.inc 8.2 (Berkeley) 7/14/94 - -.PATH: ${.CURDIR}/db/btree - -SRCS+= bt_close.c bt_conv.c bt_debug.c bt_delete.c bt_get.c bt_open.c \ - bt_overflow.c bt_page.c bt_put.c bt_search.c bt_seq.c bt_split.c \ - bt_utils.c diff --git a/btree/bt_close.c b/btree/bt_close.c deleted file mode 100644 index 27f9ab6..0000000 --- a/btree/bt_close.c +++ /dev/null @@ -1,182 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_close.c 8.7 (Berkeley) 8/17/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/param.h> - -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -#include <db.h> -#include "btree.h" - -static int bt_meta __P((BTREE *)); - -/* - * BT_CLOSE -- Close a btree. - * - * Parameters: - * dbp: pointer to access method - * - * Returns: - * RET_ERROR, RET_SUCCESS - */ -int -__bt_close(dbp) - DB *dbp; -{ - BTREE *t; - int fd; - - t = dbp->internal; - - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } - - /* Sync the tree. */ - if (__bt_sync(dbp, 0) == RET_ERROR) - return (RET_ERROR); - - /* Close the memory pool. */ - if (mpool_close(t->bt_mp) == RET_ERROR) - return (RET_ERROR); - - /* Free random memory. 
*/ - if (t->bt_cursor.key.data != NULL) { - free(t->bt_cursor.key.data); - t->bt_cursor.key.size = 0; - t->bt_cursor.key.data = NULL; - } - if (t->bt_rkey.data) { - free(t->bt_rkey.data); - t->bt_rkey.size = 0; - t->bt_rkey.data = NULL; - } - if (t->bt_rdata.data) { - free(t->bt_rdata.data); - t->bt_rdata.size = 0; - t->bt_rdata.data = NULL; - } - - fd = t->bt_fd; - free(t); - free(dbp); - return (close(fd) ? RET_ERROR : RET_SUCCESS); -} - -/* - * BT_SYNC -- sync the btree to disk. - * - * Parameters: - * dbp: pointer to access method - * - * Returns: - * RET_SUCCESS, RET_ERROR. - */ -int -__bt_sync(dbp, flags) - const DB *dbp; - u_int flags; -{ - BTREE *t; - int status; - - t = dbp->internal; - - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } - - /* Sync doesn't currently take any flags. */ - if (flags != 0) { - errno = EINVAL; - return (RET_ERROR); - } - - if (F_ISSET(t, B_INMEM | B_RDONLY) || !F_ISSET(t, B_MODIFIED)) - return (RET_SUCCESS); - - if (F_ISSET(t, B_METADIRTY) && bt_meta(t) == RET_ERROR) - return (RET_ERROR); - - if ((status = mpool_sync(t->bt_mp)) == RET_SUCCESS) - F_CLR(t, B_MODIFIED); - - return (status); -} - -/* - * BT_META -- write the tree meta data to disk. - * - * Parameters: - * t: tree - * - * Returns: - * RET_ERROR, RET_SUCCESS - */ -static int -bt_meta(t) - BTREE *t; -{ - BTMETA m; - void *p; - - if ((p = mpool_get(t->bt_mp, P_META, 0)) == NULL) - return (RET_ERROR); - - /* Fill in metadata. */ - m.magic = BTREEMAGIC; - m.version = BTREEVERSION; - m.psize = t->bt_psize; - m.free = t->bt_free; - m.nrecs = t->bt_nrecs; - m.flags = F_ISSET(t, SAVEMETA); - - memmove(p, &m, sizeof(BTMETA)); - mpool_put(t->bt_mp, p, MPOOL_DIRTY); - return (RET_SUCCESS); -} diff --git a/btree/bt_compact.c b/btree/bt_compact.c new file mode 100644 index 0000000..6b22e04 --- /dev/null +++ b/btree/bt_compact.c @@ -0,0 +1,3018 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __bam_compact_dups __P((DBC *, + PAGE **, u_int32_t, int, DB_COMPACT *, int *)); +static int __bam_compact_int __P((DBC *, + DBT *, DBT *, u_int32_t, int *, DB_COMPACT *, int *)); +static int __bam_compact_isdone __P((DBC *, DBT *, PAGE *, int *)); +static int __bam_csearch __P((DBC *, DBT *, u_int32_t, int)); +static int __bam_lock_tree __P((DBC *, EPG *, EPG *csp, u_int32_t, u_int32_t)); +static int __bam_lock_subtree __P((DBC *, PAGE *, u_int32_t, u_int32_t)); +static int __bam_merge __P((DBC *, + DBC *, u_int32_t, DBT *, DB_COMPACT *,int *)); +static int __bam_merge_internal __P((DBC *, DBC *, int, DB_COMPACT *, int *)); +static int __bam_merge_pages __P((DBC *, DBC *, DB_COMPACT *)); +static int __bam_merge_records __P((DBC *, DBC*, u_int32_t, DB_COMPACT *)); +static int __bam_truncate_internal_overflow __P((DBC *, PAGE *, DB_COMPACT *)); +static int __bam_truncate_overflow __P((DBC *, + db_pgno_t, PAGE **, DB_COMPACT *)); +static int __bam_truncate_page __P((DBC *, PAGE **, PAGE *, int)); +static int __bam_truncate_root_page __P((DBC *, + PAGE *, u_int32_t, DB_COMPACT *)); + +#ifdef HAVE_FTRUNCATE +static int __bam_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *)); +static int __bam_savekey __P((DBC *, int, DBT *)); +static int __bam_setup_freelist __P((DB *, db_pglist_t *, u_int32_t)); +static int __bam_truncate_internal __P((DB *, + DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *)); +#endif + +#define SAVE_START \ + do { \ + save_data = *c_data; \ + ret = __db_retcopy(env, \ + &save_start, current.data, current.size, \ + &save_start.data, &save_start.ulen); \ + } while (0) + +/* + * Only restore those things that are negated by aborting the + * transaction. We don't restore the number of deadlocks, for example. + */ + +#define RESTORE_START \ + do { \ + c_data->compact_pages_free = \ + save_data.compact_pages_free; \ + c_data->compact_levels = save_data.compact_levels; \ + c_data->compact_truncate = save_data.compact_truncate; \ + ret = __db_retcopy(env, &current, \ + save_start.data, save_start.size, \ + &current.data, &current.ulen); \ + } while (0) + +/* + * __bam_compact -- compact a btree. + * + * PUBLIC: int __bam_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *)); + */ +int +__bam_compact(dbp, ip, txn, start, stop, c_data, flags, end) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *start, *stop; + DB_COMPACT *c_data; + u_int32_t flags; + DBT *end; +{ + DBC *dbc; + DBT current, save_start; + DB_COMPACT save_data; + ENV *env; + u_int32_t factor, retry; + int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local; + +#ifdef HAVE_FTRUNCATE + db_pglist_t *list; + db_pgno_t last_pgno; + u_int32_t nelems, truncated; +#endif + + env = dbp->env; + + memset(&current, 0, sizeof(current)); + memset(&save_start, 0, sizeof(save_start)); + dbc = NULL; + factor = 0; + have_freelist = deadlock = isdone = ret = span = 0; + ret = retry = 0; + +#ifdef HAVE_FTRUNCATE + list = NULL; + last_pgno = 0; + nelems = truncated = 0; +#endif + + /* + * We pass "current" to the internal routine, indicating where that + * routine should begin its work and expecting that it will return to + * us the last key that it processed.
+ */ + if (start != NULL && (ret = __db_retcopy(env, + &current, start->data, start->size, + &current.data, &current.ulen)) != 0) + return (ret); + + if (IS_DB_AUTO_COMMIT(dbp, txn)) + txn_local = 1; + else + txn_local = 0; + if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY)) + goto no_free; + if (LF_ISSET(DB_FREELIST_ONLY)) + LF_SET(DB_FREE_SPACE); + +#ifdef HAVE_FTRUNCATE + /* Sort the freelist and set up the in-memory list representation. */ + if (txn_local && (ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + goto err; + + if ((ret = __db_free_truncate(dbp, ip, + txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) { + LF_CLR(DB_FREE_SPACE); + goto terr; + } + + /* If the freelist is empty and we are not filling, get out. */ + if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) { + ret = 0; + LF_CLR(DB_FREE_SPACE); + goto terr; + } + if ((ret = __bam_setup_freelist(dbp, list, nelems)) != 0) { + /* Someone else owns the free list. */ + if (ret == EBUSY) + ret = 0; + } + if (ret == 0) + have_freelist = 1; + + /* Commit the txn and release the meta page lock. */ +terr: if (txn_local) { + if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + txn = NULL; + } + if (ret != 0) + goto err; + + /* Save the number truncated so far, we will add what we get below. */ + truncated = c_data->compact_pages_truncated; + if (LF_ISSET(DB_FREELIST_ONLY)) + goto done; +#endif + + /* + * We want factor to be the target number of free bytes on each page, + * so we know when to stop adding items to a page. Make sure to + * subtract the page overhead when computing this target. This can + * result in a 1-2% error on the smallest page. + * First figure out how many bytes we should use: + */ +no_free: + factor = dbp->pgsize - SIZEOF_PAGE; + if (c_data->compact_fillpercent != 0) { + factor *= c_data->compact_fillpercent; + factor /= 100; + } + /* Now convert to the number of free bytes to target. */ + factor = (dbp->pgsize - SIZEOF_PAGE) - factor; + + if (c_data->compact_pages == 0) + c_data->compact_pages = DB_MAX_PAGES; + + do { + deadlock = 0; + + SAVE_START; + if (ret != 0) + break; + + if (txn_local) { + if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + break; + + if (c_data->compact_timeout != 0 && + (ret = __txn_set_timeout(txn, + c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0) + goto err; + } + + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + goto err; + + if ((ret = __bam_compact_int(dbc, &current, stop, factor, + &span, c_data, &isdone)) == + DB_LOCK_DEADLOCK && txn_local) { + /* + * We retry on deadlock. Cancel the statistics + * and reset the start point to before this + * iteration. + */ + deadlock = 1; + c_data->compact_deadlock++; + RESTORE_START; + } + /* + * If we could not get a lock while holding an internal + * node latched, commit the current local transaction otherwise + * report a deadlock.
+ */ + if (ret == DB_LOCK_NOTGRANTED) { + if (txn_local || retry++ < 5) + ret = 0; + else + ret = DB_LOCK_DEADLOCK; + } else + retry = 0; + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + +err: if (txn_local && txn != NULL) { + if (ret == 0 && deadlock == 0) + ret = __txn_commit(txn, DB_TXN_NOSYNC); + else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0) + ret = t_ret; + txn = NULL; + } + } while (ret == 0 && !isdone); + + if (ret == 0 && end != NULL) + ret = __db_retcopy(env, end, current.data, current.size, + &end->data, &end->ulen); + if (current.data != NULL) + __os_free(env, current.data); + if (save_start.data != NULL) + __os_free(env, save_start.data); + +#ifdef HAVE_FTRUNCATE + /* + * Finish up truncation work. If there are pages left in the free + * list then search the internal nodes of the tree as we may have + * missed some while walking the leaf nodes. Then calculate how + * many pages we have truncated and release the in-memory free list. + */ +done: if (LF_ISSET(DB_FREE_SPACE)) { + DBMETA *meta; + db_pgno_t pgno; + + pgno = PGNO_BASE_MD; + isdone = 1; + if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) && (t_ret = + __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) { + isdone = meta->free == PGNO_INVALID; + ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority); + } + + if (!isdone) + ret = __bam_truncate_internal(dbp, ip, txn, c_data); + + /* Clean up the free list. */ + if (list != NULL) + __os_free(env, list); + + if ((t_ret = + __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) { + c_data->compact_pages_truncated = + truncated + last_pgno - meta->last_pgno; + if ((t_ret = __memp_fput(dbp->mpf, ip, + meta, dbp->priority)) != 0 && ret == 0) + ret = t_ret; + } else if (ret == 0) + ret = t_ret; + + if (have_freelist && (t_ret = + __bam_free_freelist(dbp, ip, txn)) != 0 && ret == 0) + t_ret = ret; + } +#endif + + return (ret); +} + +/* + * __bam_csearch -- isolate search code for bam_compact. + * This routine hides the differences between searching + * a BTREE and a RECNO from the rest of the code. + */ +#define CS_READ 0 /* We are just reading. */ +#define CS_PARENT 1 /* We want the parent too, write lock. */ +#define CS_NEXT 2 /* Get the next page. */ +#define CS_NEXT_WRITE 3 /* Get the next page and write lock. */ +#define CS_DEL 4 /* Get a stack to delete a page. */ +#define CS_START 5 /* Starting level for stack, write lock. */ +#define CS_NEXT_BOTH 6 /* Get this page and the next, write lock. */ +#define CS_GETRECNO 0x80 /* Extract record number from start. */ + +static int +__bam_csearch(dbc, start, sflag, level) + DBC *dbc; + DBT *start; + u_int32_t sflag; + int level; +{ + BTREE_CURSOR *cp; + int not_used, ret; + + cp = (BTREE_CURSOR *)dbc->internal; + + if (dbc->dbtype == DB_RECNO) { + /* If GETRECNO is not set the cp->recno is what we want. 
*/ + if (FLD_ISSET(sflag, CS_GETRECNO)) { + if (start == NULL || start->size == 0) + cp->recno = 1; + else if ((ret = + __ram_getno(dbc, start, &cp->recno, 0)) != 0) + return (ret); + FLD_CLR(sflag, CS_GETRECNO); + } + switch (sflag) { + case CS_READ: + sflag = SR_READ; + break; + case CS_NEXT: + sflag = SR_PARENT | SR_READ; + break; + case CS_START: + level = LEAFLEVEL; + /* FALLTHROUGH */ + case CS_DEL: + case CS_NEXT_WRITE: + sflag = SR_STACK; + break; + case CS_NEXT_BOTH: + sflag = SR_BOTH | SR_NEXT | SR_WRITE; + break; + case CS_PARENT: + sflag = SR_PARENT | SR_WRITE; + break; + default: + return (__env_panic(dbc->env, EINVAL)); + } + if ((ret = __bam_rsearch(dbc, + &cp->recno, sflag, level, ¬_used)) != 0) + return (ret); + /* Reset the cursor's recno to the beginning of the page. */ + cp->recno -= cp->csp->indx; + } else { + FLD_CLR(sflag, CS_GETRECNO); + switch (sflag) { + case CS_READ: + sflag = SR_READ | SR_DUPFIRST; + break; + case CS_DEL: + sflag = SR_DEL; + break; + case CS_NEXT: + sflag = SR_NEXT; + break; + case CS_NEXT_WRITE: + sflag = SR_NEXT | SR_WRITE; + break; + case CS_NEXT_BOTH: + sflag = SR_BOTH | SR_NEXT | SR_WRITE; + break; + case CS_START: + sflag = SR_START | SR_WRITE; + break; + case CS_PARENT: + sflag = SR_PARENT | SR_WRITE; + break; + default: + return (__env_panic(dbc->env, EINVAL)); + } + if (start == NULL || start->size == 0) + FLD_SET(sflag, SR_MIN); + + if ((ret = __bam_search(dbc, + cp->root, start, sflag, level, NULL, ¬_used)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_compact_int -- internal compaction routine. + * Called either with a cursor on the main database + * or a cursor initialized to the root of an off page duplicate + * tree. + */ +static int +__bam_compact_int(dbc, start, stop, factor, spanp, c_data, donep) + DBC *dbc; + DBT *start, *stop; + u_int32_t factor; + int *spanp; + DB_COMPACT *c_data; + int *donep; +{ + BTREE_CURSOR *cp, *ncp; + DB *dbp; + DBC *ndbc; + DB_LOCK metalock, next_lock, nnext_lock, prev_lock, saved_lock; + DB_MPOOLFILE *dbmp; + ENV *env; + EPG *epg; + PAGE *pg, *ppg, *npg; + db_pgno_t metapgno, npgno, nnext_pgno; + db_pgno_t pgno, prev_pgno, ppgno, saved_pgno; + db_recno_t next_recno; + u_int32_t sflag, pgs_free; + int check_dups, check_trunc, clear_root, isdone; + int merged, nentry, next_p, pgs_done, ret, t_ret, tdone; + +#ifdef DEBUG +#define CTRACE(dbc, location, t, start, f) do { \ + DBT __trace; \ + DB_SET_DBT(__trace, t, strlen(t)); \ + DEBUG_LWRITE( \ + dbc, (dbc)->txn, location, &__trace, start, f) \ + } while (0) +#define PTRACE(dbc, location, p, start, f) do { \ + char __buf[32]; \ + (void)snprintf(__buf, \ + sizeof(__buf), "pgno: %lu", (u_long)p); \ + CTRACE(dbc, location, __buf, start, f); \ + } while (0) +#else +#define CTRACE(dbc, location, t, start, f) +#define PTRACE(dbc, location, p, start, f) +#endif + + ndbc = NULL; + pg = NULL; + npg = NULL; + + isdone = 0; + tdone = 0; + pgs_done = 0; + next_recno = 0; + next_p = 0; + clear_root = 0; + metapgno = PGNO_BASE_MD; + LOCK_INIT(next_lock); + LOCK_INIT(nnext_lock); + LOCK_INIT(saved_lock); + LOCK_INIT(metalock); + LOCK_INIT(prev_lock); + check_trunc = c_data->compact_truncate != PGNO_INVALID; + check_dups = (!F_ISSET(dbc, DBC_OPD) && + F_ISSET(dbc->dbp, DB_AM_DUP)) || check_trunc; + + dbp = dbc->dbp; + env = dbp->env; + dbmp = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + pgs_free = c_data->compact_pages_free; + + /* Search down the tree for the starting point. 
*/ + if ((ret = __bam_csearch(dbc, + start, CS_READ | CS_GETRECNO, LEAFLEVEL)) != 0) { + /* Its not an error to compact an empty db. */ + if (ret == DB_NOTFOUND) + ret = 0; + isdone = 1; + goto err; + } + + /* + * Get the first leaf page. The loop below will change pg so + * we clear the stack reference so we don't put a a page twice. + */ + pg = cp->csp->page; + cp->csp->page = NULL; + next_recno = cp->recno; +next: /* + * This is the start of the main compaction loop. There are 3 + * parts to the process: + * 1) Walk the leaf pages of the tree looking for a page to + * process. We do this with read locks. Save the + * key from the page and release it. + * 2) Set up a cursor stack which will write lock the page + * and enough of its ancestors to get the job done. + * This could go to the root if we might delete a subtree + * or we have record numbers to update. + * 3) Loop fetching pages after the above page and move enough + * data to fill it. + * We exit the loop if we are at the end of the leaf pages, are + * about to lock a new subtree (we span) or on error. + */ + + /* Walk the pages looking for something to fill up. */ + while ((npgno = NEXT_PGNO(pg)) != PGNO_INVALID) { + c_data->compact_pages_examine++; + PTRACE(dbc, "Next", PGNO(pg), start, 0); + + /* If we have fetched the next page, get the new key. */ + if (next_p == 1 && + dbc->dbtype != DB_RECNO && NUM_ENT(pg) != 0) { + if ((ret = __db_ret(dbc, pg, 0, start, + &start->data, &start->ulen)) != 0) + goto err; + } + next_recno += NUM_ENT(pg); + if (P_FREESPACE(dbp, pg) > factor || + (check_trunc && PGNO(pg) > c_data->compact_truncate)) + break; + if (stop != NULL && stop->size > 0) { + if ((ret = __bam_compact_isdone(dbc, + stop, pg, &isdone)) != 0) + goto err; + if (isdone) + goto done; + } + + /* + * The page does not need more data or to be swapped, + * check to see if we want to look at possible duplicate + * trees or overflow records and the move on to the next page. + */ + cp->recno += NUM_ENT(pg); + next_p = 1; + tdone = pgs_done; + PTRACE(dbc, "Dups", PGNO(pg), start, 0); + if (check_dups && (ret = __bam_compact_dups( + dbc, &pg, factor, 0, c_data, &pgs_done)) != 0) + goto err; + npgno = NEXT_PGNO(pg); + if ((ret = __memp_fput(dbmp, + dbc->thread_info, pg, dbc->priority)) != 0) + goto err; + pg = NULL; + /* + * If we don't do anything we don't need to hold + * the lock on the previous page, so couple always. + */ + if ((ret = __db_lget(dbc, + tdone == pgs_done ? LCK_COUPLE_ALWAYS : LCK_COUPLE, + npgno, DB_LOCK_READ, 0, &cp->csp->lock)) != 0) + goto err; + if ((ret = __memp_fget(dbmp, &npgno, + dbc->thread_info, dbc->txn, 0, &pg)) != 0) + goto err; + } + + /* + * When we get here we have 3 cases: + * 1) We've reached the end of the leaf linked list and are done. + * 2) A page whose freespace exceeds our target and therefore needs + * to have data added to it. + * 3) A page that doesn't have too much free space but needs to be + * checked for truncation. + * In both cases 2 and 3, we need that page's first key or record + * number. We may already have it, if not get it here. + */ + if ((nentry = NUM_ENT(pg)) != 0) { + next_p = 0; + /* Get a copy of the first recno on the page. 
*/ + if (dbc->dbtype == DB_RECNO) { + if ((ret = __db_retcopy(dbp->env, start, + &cp->recno, sizeof(cp->recno), + &start->data, &start->ulen)) != 0) + goto err; + } else if (start->size == 0 && (ret = __db_ret(dbc, + pg, 0, start, &start->data, &start->ulen)) != 0) + goto err; + + if (npgno == PGNO_INVALID) { + /* End of the tree, check its duplicates and exit. */ + PTRACE(dbc, "GoDone", PGNO(pg), start, 0); + if (check_dups && (ret = __bam_compact_dups(dbc, + &pg, factor, 0, c_data, &pgs_done)) != 0) + goto err; + c_data->compact_pages_examine++; + isdone = 1; + goto done; + } + } + + /* Release the page so we don't deadlock getting its parent. */ + if ((ret = __memp_fput(dbmp, dbc->thread_info, pg, dbc->priority)) != 0) + goto err; + if ((ret = __LPUT(dbc, cp->csp->lock)) != 0) + goto err; + BT_STK_CLR(cp); + pg = NULL; + saved_pgno = PGNO_INVALID; + prev_pgno = PGNO_INVALID; + nnext_pgno = PGNO_INVALID; + + /* + * We must lock the metadata page first because we cannot block + * while holding interior nodes of the tree pinned. + */ + + if (!LOCK_ISSET(metalock) && pgs_free == c_data->compact_pages_free && + (ret = __db_lget(dbc, + LCK_ALWAYS, metapgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + + /* + * Setup the cursor stack. There are 3 cases: + * 1) the page is empty and will be deleted: nentry == 0. + * 2) the next page has the same parent: *spanp == 0. + * 3) the next page has a different parent: *spanp == 1. + * + * We now need to search the tree again, getting a write lock + * on the page we are going to merge or delete. We do this by + * searching down the tree and locking as much of the subtree + * above the page as needed. In the case of a delete we will + * find the maximal subtree that can be deleted. In the case + * of merge if the current page and the next page are siblings + * with the same parent then we only need to lock the parent. + * Otherwise *span will be set and we need to search to find the + * lowest common ancestor. Dbc will be set to contain the subtree + * containing the page to be merged or deleted. Ndbc will contain + * the minimal subtree containing that page and its next sibling. + * In all cases for DB_RECNO we simplify things and get the whole + * tree if we need more than a single parent. + * The tree can collapse while we don't have it locked, so the + * page we are looking for may be gone. If so we are at + * the right most end of the leaf pages and are done. + */ + +retry: pg = NULL; + if (npg != NULL && (ret = __memp_fput(dbmp, + dbc->thread_info, npg, dbc->priority)) != 0) + goto err; + npg = NULL; + if (ndbc != NULL) { + ncp = (BTREE_CURSOR *)ndbc->internal; + if (clear_root == 1) { + ncp->sp->page = NULL; + LOCK_INIT(ncp->sp->lock); + } + if ((ret = __bam_stkrel(ndbc, 0)) != 0) + goto err; + } + clear_root = 0; + /* Case 1 -- page is empty. */ + if (nentry == 0) { + CTRACE(dbc, "Empty", "", start, 0); + if (next_p == 1) + sflag = CS_NEXT_WRITE; + else + sflag = CS_DEL; + if ((ret = __bam_csearch(dbc, start, sflag, LEAFLEVEL)) != 0) { + isdone = 1; + if (ret == DB_NOTFOUND) + ret = 0; + goto err; + } + + pg = cp->csp->page; + /* Check to see if the page is still empty. */ + if (NUM_ENT(pg) != 0) + npgno = PGNO(pg); + else { + npgno = NEXT_PGNO(pg); + /* If this is now the root, we are very done. 
*/ + if (PGNO(pg) == cp->root) + isdone = 1; + else { + if (npgno != PGNO_INVALID) { + TRY_LOCK(dbc, npgno, saved_pgno, + next_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err; + } + if (PREV_PGNO(pg) != PGNO_INVALID) { + TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno, + prev_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err; + } + if ((ret = + __bam_dpages(dbc, 0, BTD_RELINK)) != 0) + goto err; + c_data->compact_pages_free++; + if ((ret = __TLPUT(dbc, prev_lock)) != 0) + goto err; + LOCK_INIT(prev_lock); + if ((ret = __TLPUT(dbc, next_lock)) != 0) + goto err; + LOCK_INIT(next_lock); + goto next_no_release; + } + } + goto next_page; + } + + /* case 3 -- different parents. */ + if (*spanp) { + CTRACE(dbc, "Span", "", start, 0); + /* + * Search the tree looking for the page containing and + * the next page after the current key. + * The stack will be rooted at the page that spans + * the current and next pages. The two subtrees + * are returned below that. For BTREE the current + * page subtreee will be first while for RECNO the + * next page subtree will be first + */ + if (ndbc == NULL && (ret = __dbc_dup(dbc, &ndbc, 0)) != 0) + goto err; + ncp = (BTREE_CURSOR *)ndbc->internal; + + ncp->recno = cp->recno; + cp->recno = next_recno; + + if ((ret = __bam_csearch(dbc, start, CS_NEXT_BOTH, 0)) != 0) { + if (ret == DB_NOTFOUND) { + isdone = 1; + ret = 0; + } + goto err; + } + + /* + * Find the top of the stack for the second subtree. + */ + for (epg = cp->csp - 1; epg > cp->sp; epg--) + if (LEVEL(epg->page) == LEAFLEVEL) + break; + DB_ASSERT(env, epg != cp->sp); + + /* + * Copy the root. We will have two instances of the + * same page, be careful not to free both. + */ + BT_STK_PUSH(env, ncp, cp->sp->page, cp->sp->indx, + cp->sp->lock, cp->sp->lock_mode, ret); + if (ret != 0) + goto err; + clear_root = 1; + + /* Copy the stack containing the next page. */ + for (epg++; epg <= cp->csp; epg++) { + BT_STK_PUSH(env, ncp, epg->page, epg->indx, + epg->lock, epg->lock_mode, ret); + if (ret != 0) + goto err; + } + /* adjust the stack pointer to remove these items. */ + ncp->csp--; + cp->csp -= ncp->csp - ncp->sp; + + /* + * If this is RECNO then we want to swap the stacks. + */ + if (dbp->type == DB_RECNO) { + ndbc->internal = (DBC_INTERNAL *)cp; + dbc->internal = (DBC_INTERNAL *)ncp; + cp = ncp; + ncp = (BTREE_CURSOR *)ndbc->internal; + cp->sp->indx--; + } else + ncp->sp->indx++; + + DB_ASSERT(env, + NEXT_PGNO(cp->csp->page) == PGNO(ncp->csp->page)); + pg = cp->csp->page; + + /* + * The page may have emptied while we waited for the + * lock or the record we are looking for may have + * moved. + * Reset npgno so we re-get this page when we go back + * to the top. + */ + if (NUM_ENT(pg) == 0 || + (dbc->dbtype == DB_RECNO && + NEXT_PGNO(cp->csp->page) != PGNO(ncp->csp->page))) { + npgno = PGNO(pg); + *spanp = 0; + goto next_page; + } + + if (check_trunc && PGNO(pg) > c_data->compact_truncate) { + if (PREV_PGNO(pg) != PGNO_INVALID) { + TRY_LOCK2(dbc, ndbc, PREV_PGNO(pg), prev_pgno, + prev_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + } + pgs_done++; + /* Get a fresh low numbered page. */ + if ((ret = __bam_truncate_page(dbc, + &pg, ncp->csp->page, 1)) != 0) + goto err1; + if ((ret = __TLPUT(dbc, prev_lock)) != 0) + goto err; + LOCK_INIT(prev_lock); + } + *spanp = 0; + PTRACE(dbc, "SDups", PGNO(ncp->csp->page), start, 0); + if (check_dups && (ret = __bam_compact_dups(ndbc, + &ncp->csp->page, factor, 1, c_data, &pgs_done)) != 0) + goto err1; + + /* Check to see if the tree collapsed. 
*/ + if (PGNO(ncp->csp->page) == ncp->root) + goto done; + + pg = cp->csp->page; + npgno = NEXT_PGNO(pg); + PTRACE(dbc, "SDups", PGNO(pg), start, 0); + if (check_dups && (ret = + __bam_compact_dups(dbc, &cp->csp->page, + factor, 1, c_data, &pgs_done)) != 0) + goto err1; + + /* + * We may have dropped our locks, check again + * to see if we still need to fill this page and + * we are in a spanning situation. + */ + + if (P_FREESPACE(dbp, pg) <= factor || + cp->csp[-1].indx != NUM_ENT(cp->csp[-1].page) - 1) + goto next_page; + + /* + * Try to move things into a single parent. + */ + merged = 0; + for (epg = cp->sp; epg != cp->csp; epg++) { + PTRACE(dbc, "PMerge", PGNO(epg->page), start, 0); + if ((ret = __bam_merge_internal(dbc, + ndbc, LEVEL(epg->page), c_data, &merged)) != 0) + break; + if (merged) + break; + } + + if (ret != 0 && ret != DB_LOCK_NOTGRANTED) + goto err1; + /* + * If we merged the parent, then we no longer span. + * Otherwise if we tried to merge the parent but would + * block on one of the other leaf pages try again. + * If we did not merge any records of the parent, + * exit to commit any local transactions and try again. + */ + if (merged || ret == DB_LOCK_NOTGRANTED) { + if (merged) + pgs_done++; + else + goto done; + if (cp->csp->page == NULL) + goto deleted; + npgno = PGNO(pg); + next_recno = cp->recno; + goto next_page; + } + PTRACE(dbc, "SMerge", PGNO(cp->csp->page), start, 0); + + /* if we remove the next page, then we need its next locked */ + npgno = NEXT_PGNO(ncp->csp->page); + if (npgno != PGNO_INVALID) { + TRY_LOCK2(dbc, ndbc, npgno, + nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + } + if ((ret = __bam_merge(dbc, + ndbc, factor, stop, c_data, &isdone)) != 0) + goto err1; + pgs_done++; + /* + * __bam_merge could have freed our stack if it + * deleted a page possibly collapsing the tree. + */ + if (cp->csp->page == NULL) + goto deleted; + cp->recno += NUM_ENT(pg); + + if ((ret = __TLPUT(dbc, nnext_lock)) != 0) + goto err1; + LOCK_INIT(nnext_lock); + + /* If we did not bump to the next page something did not fit. */ + if (npgno != NEXT_PGNO(pg)) { + npgno = NEXT_PGNO(pg); + goto next_page; + } + } else { + /* Case 2 -- same parents. */ + CTRACE(dbc, "Sib", "", start, 0); + if ((ret = + __bam_csearch(dbc, start, CS_PARENT, LEAFLEVEL)) != 0) { + if (ret == DB_NOTFOUND) { + isdone = 1; + ret = 0; + } + goto err; + } + + pg = cp->csp->page; + DB_ASSERT(env, IS_DIRTY(pg)); + DB_ASSERT(env, + PGNO(pg) == cp->root || IS_DIRTY(cp->csp[-1].page)); + + /* We now have a write lock, recheck the page. */ + if ((nentry = NUM_ENT(pg)) == 0) { + npgno = PGNO(pg); + goto next_page; + } + + /* Check duplicate trees, we have a write lock on the page. */ + PTRACE(dbc, "SibDup", PGNO(pg), start, 0); + if (check_dups && (ret = + __bam_compact_dups(dbc, &cp->csp->page, + factor, 1, c_data, &pgs_done)) != 0) + goto err1; + pg = cp->csp->page; + npgno = NEXT_PGNO(pg); + + /* Check to see if the tree collapsed. */ + if (PGNO(pg) == cp->root) + goto err1; + DB_ASSERT(env, cp->csp - cp->sp == 1); + + /* After re-locking check to see if we still need to fill. 
*/ + if (P_FREESPACE(dbp, pg) <= factor) { + if (check_trunc && + PGNO(pg) > c_data->compact_truncate) { + if (PREV_PGNO(pg) != PGNO_INVALID) { + TRY_LOCK(dbc, PREV_PGNO(pg), prev_pgno, + prev_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + } + if (npgno != PGNO_INVALID) { + TRY_LOCK(dbc, npgno, saved_pgno, + next_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + } + pgs_done++; + /* Get a fresh low numbered page. */ + if ((ret = __bam_truncate_page(dbc, + &pg, NULL, 1)) != 0) + goto err1; + if ((ret = __TLPUT(dbc, prev_lock)) != 0) + goto err1; + LOCK_INIT(prev_lock); + if ((ret = __TLPUT(dbc, next_lock)) != 0) + goto err1; + LOCK_INIT(next_lock); + } + goto next_page; + } + + /* If they have the same parent, just dup the cursor */ + if (ndbc != NULL && (ret = __dbc_close(ndbc)) != 0) + goto err1; + if ((ret = __dbc_dup(dbc, &ndbc, DB_POSITION)) != 0) + goto err1; + ncp = (BTREE_CURSOR *)ndbc->internal; + + /* + * ncp->recno needs to have the recno of the next page. + * Bump it by the number of records on the current page. + */ + ncp->recno += NUM_ENT(pg); + } + + pgno = PGNO(cp->csp->page); + ppgno = PGNO(cp->csp[-1].page); + /* Fetch pages until we fill this one. */ + while (!isdone && npgno != PGNO_INVALID && + P_FREESPACE(dbp, pg) > factor && c_data->compact_pages != 0) { + /* + * merging may have to free the parent page, if it does, + * refetch it but do it decending the tree. + */ + epg = &cp->csp[-1]; + if ((ppg = epg->page) == NULL) { + if ((ret = __memp_fput(dbmp, dbc->thread_info, + cp->csp->page, dbc->priority)) != 0) + goto err1; + pg = NULL; + if ((ret = __memp_fget(dbmp, &ppgno, dbc->thread_info, + dbc->txn, DB_MPOOL_DIRTY, &ppg)) != 0) + goto err1; + if ((ret = __memp_fget(dbmp, &pgno, dbc->thread_info, + dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0) + goto err1; + epg->page = ppg; + cp->csp->page = pg; + } + + /* + * If our current position is the last one on a parent + * page, then we are about to merge across different + * internal nodes. Thus, we need to lock higher up + * in the tree. We will exit the routine and commit + * what we have done so far. Set spanp so we know + * we are in this case when we come back. + */ + if (epg->indx == NUM_ENT(ppg) - 1) { + *spanp = 1; + npgno = PGNO(pg); + next_recno = cp->recno; + epg->page = ppg; + goto next_page; + } + + /* Lock and get the next page. */ + TRY_LOCK(dbc, npgno, + saved_pgno, saved_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + if ((ret = __LPUT(dbc, ncp->lock)) != 0) + goto err1; + ncp->lock = saved_lock; + LOCK_INIT(saved_lock); + saved_pgno = PGNO_INVALID; + + if ((ret = __memp_fget(dbmp, &npgno, + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &npg)) != 0) + goto err1; + + if (check_trunc && + PGNO(pg) > c_data->compact_truncate) { + if (PREV_PGNO(pg) != PGNO_INVALID) { + TRY_LOCK(dbc, PREV_PGNO(pg), + prev_pgno, prev_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + } + pgs_done++; + /* Get a fresh low numbered page. 
*/ + if ((ret = __bam_truncate_page(dbc, &pg, npg, 1)) != 0) + goto err1; + if ((ret = __TLPUT(dbc, prev_lock)) != 0) + goto err1; + LOCK_INIT(prev_lock); + pgno = PGNO(pg); + } + c_data->compact_pages_examine++; + + PTRACE(dbc, "MDups", PGNO(npg), start, 0); + if (check_dups && (ret = __bam_compact_dups(ndbc, + &npg, factor, 1, c_data, &pgs_done)) != 0) + goto err1; + + npgno = NEXT_PGNO(npg); + if (npgno != PGNO_INVALID) { + TRY_LOCK(dbc, npgno, + nnext_pgno, nnext_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err1; + } + + /* copy the common parent to the stack. */ + BT_STK_PUSH(env, ncp, ppg, + epg->indx + 1, epg->lock, epg->lock_mode, ret); + if (ret != 0) + goto err1; + + /* Put the page on the stack. */ + BT_STK_ENTER(env, ncp, npg, 0, ncp->lock, DB_LOCK_WRITE, ret); + + LOCK_INIT(ncp->lock); + npg = NULL; + + /* + * Merge the pages. This will either free the next + * page or just update its parent pointer. + */ + PTRACE(dbc, "Merge", PGNO(cp->csp->page), start, 0); + if ((ret = __bam_merge(dbc, + ndbc, factor, stop, c_data, &isdone)) != 0) + goto err1; + + pgs_done++; + + if ((ret = __TLPUT(dbc, nnext_lock)) != 0) + goto err1; + LOCK_INIT(nnext_lock); + + /* + * __bam_merge could have freed our stack if it + * deleted a page possibly collapsing the tree. + */ + if (cp->csp->page == NULL) + goto deleted; + /* If we did not bump to the next page something did not fit. */ + if (npgno != NEXT_PGNO(pg)) + break; + } + + /* Bottom of the main loop. Move to the next page. */ + npgno = NEXT_PGNO(pg); + cp->recno += NUM_ENT(pg); + next_recno = cp->recno; + +next_page: + if (ndbc != NULL) { + ncp = (BTREE_CURSOR *)ndbc->internal; + if (ncp->sp->page == cp->sp->page) { + ncp->sp->page = NULL; + LOCK_INIT(ncp->sp->lock); + } + if ((ret = __bam_stkrel(ndbc, + pgs_done == 0 ? STK_NOLOCK : 0)) != 0) + goto err; + } + /* + * Unlatch the tree before trying to lock the next page. We must + * unlatch to avoid a latch deadlock but we want to hold the + * lock on the parent node so this leaf cannot be unlinked. + */ + pg = NULL; + if ((ret = __bam_stkrel(dbc, STK_PGONLY)) != 0) + goto err; + if ((ret = __db_lget(dbc, 0, npgno, DB_LOCK_READ, 0, &next_lock)) != 0) + goto err; + if ((ret = __bam_stkrel(dbc, pgs_done == 0 ? STK_NOLOCK : 0)) != 0) + goto err; + if ((ret = __TLPUT(dbc, saved_lock)) != 0) + goto err; + if ((ret = __TLPUT(dbc, prev_lock)) != 0) + goto err; + +next_no_release: + pg = NULL; + + if (npgno == PGNO_INVALID || c_data->compact_pages == 0) + isdone = 1; + if (!isdone) { + /* + * If we are at the end of this parent commit the + * transaction so we don't tie things up. + */ + if (pgs_done != 0 && *spanp) { +deleted: if (((ret = __bam_stkrel(ndbc, 0)) != 0 || + (ret = __dbc_close(ndbc)) != 0)) + goto err; + *donep = 0; + return (0); + } + + /* Reget the next page to look at. */ + cp->recno = next_recno; + if ((ret = __memp_fget(dbmp, &npgno, + dbc->thread_info, dbc->txn, 0, &pg)) != 0) + goto err; + cp->csp->lock = next_lock; + LOCK_INIT(next_lock); + next_p = 1; + /* If we did not do anything we can drop the metalock. */ + if (pgs_done == 0 && (ret = __LPUT(dbc, metalock)) != 0) + goto err; + goto next; + } + +done: + if (0) { + /* + * We come here if pg came from cp->csp->page and could + * have already been fput. + */ +err1: pg = NULL; + } +err: /* + * Don't release locks (STK_PGONLY)if we had an error, we could reveal + * a bad tree to a dirty reader. Wait till the abort to free the locks. 
+ */ + sflag = STK_CLRDBC; + if (dbc->txn != NULL && ret != 0) + sflag |= STK_PGONLY; + if (ndbc != NULL) { + ncp = (BTREE_CURSOR *)ndbc->internal; + if (npg == ncp->csp->page) + npg = NULL; + if (ncp->sp->page == cp->sp->page) { + ncp->sp->page = NULL; + LOCK_INIT(ncp->sp->lock); + } + if ((t_ret = __bam_stkrel(ndbc, sflag)) != 0 && ret == 0) + ret = t_ret; + else if ((t_ret = __dbc_close(ndbc)) != 0 && ret == 0) + ret = t_ret; + } + if (pg == cp->csp->page) + pg = NULL; + if ((t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + if (pg != NULL && (t_ret = + __memp_fput(dbmp, + dbc->thread_info, pg, dbc->priority) != 0) && ret == 0) + ret = t_ret; + if (npg != NULL && (t_ret = + __memp_fput(dbmp, + dbc->thread_info, npg, dbc->priority) != 0) && ret == 0) + ret = t_ret; + + *donep = isdone; + + return (ret); +} + +/* + * __bam_merge -- do actual merging of leaf pages. + */ +static int +__bam_merge(dbc, ndbc, factor, stop, c_data, donep) + DBC *dbc, *ndbc; + u_int32_t factor; + DBT *stop; + DB_COMPACT *c_data; + int *donep; +{ + BTREE_CURSOR *cp, *ncp; + DB *dbp; + PAGE *pg, *npg; + db_indx_t nent; + int ret; + + DB_ASSERT(NULL, dbc != NULL); + DB_ASSERT(NULL, ndbc != NULL); + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ncp = (BTREE_CURSOR *)ndbc->internal; + pg = cp->csp->page; + npg = ncp->csp->page; + + nent = NUM_ENT(npg); + + /* If the page is empty just throw it away. */ + if (nent == 0) + goto free_page; + + /* Find if the stopping point is on this page. */ + if (stop != NULL && stop->size != 0) { + if ((ret = __bam_compact_isdone(dbc, stop, npg, donep)) != 0) + return (ret); + if (*donep) + return (0); + } + + /* + * If there is too much data then just move records one at a time. + * Otherwise copy the data space over and fix up the index table. + * If we are on the left most child we will effect our parent's + * index entry so we call merge_records to figure out key sizes. + */ + if ((dbc->dbtype == DB_BTREE && + ncp->csp[-1].indx == 0 && ncp->csp[-1].entries != 1) || + (int)(P_FREESPACE(dbp, pg) - + ((dbp->pgsize - P_OVERHEAD(dbp)) - + P_FREESPACE(dbp, npg))) < (int)factor) + ret = __bam_merge_records(dbc, ndbc, factor, c_data); + else +free_page: ret = __bam_merge_pages(dbc, ndbc, c_data); + + return (ret); +} + +static int +__bam_merge_records(dbc, ndbc, factor, c_data) + DBC *dbc, *ndbc; + u_int32_t factor; + DB_COMPACT *c_data; +{ + BINTERNAL *bi; + BKEYDATA *bk, *tmp_bk; + BTREE *t; + BTREE_CURSOR *cp, *ncp; + DB *dbp; + DBT a, b, data, hdr; + ENV *env; + EPG *epg; + PAGE *pg, *npg; + db_indx_t adj, indx, nent, *ninp, pind; + int32_t adjust; + u_int32_t freespace, nksize, pfree, size; + int first_dup, is_dup, next_dup, n_ok, ret; + size_t (*func) __P((DB *, const DBT *, const DBT *)); + + dbp = dbc->dbp; + env = dbp->env; + t = dbp->bt_internal; + cp = (BTREE_CURSOR *)dbc->internal; + ncp = (BTREE_CURSOR *)ndbc->internal; + pg = cp->csp->page; + npg = ncp->csp->page; + memset(&hdr, 0, sizeof(hdr)); + pind = NUM_ENT(pg); + n_ok = 0; + adjust = 0; + ret = 0; + nent = NUM_ENT(npg); + + DB_ASSERT(env, nent != 0); + + /* See if we want to swap out this page. */ + if (c_data->compact_truncate != PGNO_INVALID && + PGNO(npg) > c_data->compact_truncate) { + /* Get a fresh low numbered page. */ + if ((ret = __bam_truncate_page(ndbc, &npg, pg, 1)) != 0) + goto err; + } + + ninp = P_INP(dbp, npg); + + /* + * pg is the page that is being filled, it is in the stack in cp. 
+ * npg is the next page, it is in the stack in ncp. + */ + freespace = P_FREESPACE(dbp, pg); + + adj = TYPE(npg) == P_LBTREE ? P_INDX : O_INDX; + /* + * Loop through the records and find the stopping point. + */ + for (indx = 0; indx < nent; indx += adj) { + bk = GET_BKEYDATA(dbp, npg, indx); + + /* Size of the key. */ + size = BITEM_PSIZE(bk); + + /* Size of the data. */ + if (TYPE(pg) == P_LBTREE) + size += BITEM_PSIZE(GET_BKEYDATA(dbp, npg, indx + 1)); + /* + * If we are at a duplicate set, skip ahead to see and + * get the total size for the group. + */ + n_ok = adj; + if (TYPE(pg) == P_LBTREE && + indx < nent - adj && + ninp[indx] == ninp[indx + adj]) { + do { + /* Size of index for key reference. */ + size += sizeof(db_indx_t); + n_ok++; + /* Size of data item. */ + size += BITEM_PSIZE( + GET_BKEYDATA(dbp, npg, indx + n_ok)); + n_ok++; + } while (indx + n_ok < nent && + ninp[indx] == ninp[indx + n_ok]); + } + /* if the next set will not fit on the page we are done. */ + if (freespace < size) + break; + + /* + * Otherwise figure out if we are past the goal and if + * adding this set will put us closer to the goal than + * we are now. + */ + if ((freespace - size) < factor) { + if (freespace - factor > factor - (freespace - size)) + indx += n_ok; + break; + } + freespace -= size; + indx += n_ok - adj; + } + + /* If we have hit the first record then there is nothing we can move. */ + if (indx == 0) + goto done; + if (TYPE(pg) != P_LBTREE && TYPE(pg) != P_LDUP) { + if (indx == nent) + return (__bam_merge_pages(dbc, ndbc, c_data)); + goto no_check; + } + /* + * We need to update npg's parent key. Avoid creating a new key + * that will be too big. Get what space will be available on the + * parents. Then if there will not be room for this key, see if + * prefix compression will make it work, if not backup till we + * find something that will. (Needless to say, this is a very + * unlikely event.) If we are deleting this page then we will + * need to propagate the next key to our grand parents, so we + * see if that will fit. + */ + pfree = dbp->pgsize; + for (epg = &ncp->csp[-1]; epg >= ncp->sp; epg--) + if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) { + bi = GET_BINTERNAL(dbp, epg->page, epg->indx); + /* Add back in the key we will be deleting. */ + freespace += BINTERNAL_PSIZE(bi->len); + if (freespace < pfree) + pfree = freespace; + if (epg->indx != 0) + break; + } + + /* + * If we are at the end, we will delete this page. We need to + * check the next parent key only if we are the leftmost page and + * will therefore have to propagate the key up the tree. + */ + if (indx == nent) { + if (ncp->csp[-1].indx != 0 || ncp->csp[-1].entries == 1 || + BINTERNAL_PSIZE(GET_BINTERNAL(dbp, + ncp->csp[-1].page, 1)->len) <= pfree) + return (__bam_merge_pages(dbc, ndbc, c_data)); + indx -= adj; + } + bk = GET_BKEYDATA(dbp, npg, indx); + if (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) { + if (F_ISSET(dbc, DBC_OPD)) { + if (dbp->dup_compare == __bam_defcmp) + func = __bam_defpfx; + else + func = NULL; + } else + func = t->bt_prefix; + } else + func = NULL; + + /* Skip to the beginning of a duplicate set. */ + while (indx != 0 && ninp[indx] == ninp[indx - adj]) + indx -= adj; + + while (indx != 0 && BINTERNAL_SIZE(bk->len) >= pfree) { + if (B_TYPE(bk->type) != B_KEYDATA) + goto noprefix; + /* + * Figure out if we can truncate this key. 
+ * Code borrowed from bt_split.c + */ + if (func == NULL) + goto noprefix; + tmp_bk = GET_BKEYDATA(dbp, npg, indx - adj); + if (B_TYPE(tmp_bk->type) != B_KEYDATA) + goto noprefix; + memset(&a, 0, sizeof(a)); + a.size = tmp_bk->len; + a.data = tmp_bk->data; + memset(&b, 0, sizeof(b)); + b.size = bk->len; + b.data = bk->data; + nksize = (u_int32_t)func(dbp, &a, &b); + if (BINTERNAL_PSIZE(nksize) < pfree) + break; +noprefix: + /* Skip to the beginning of a duplicate set. */ + do { + indx -= adj; + } while (indx != 0 && ninp[indx] == ninp[indx - adj]); + + bk = GET_BKEYDATA(dbp, npg, indx); + } + + /* + * indx references the first record that will not move to the previous + * page. If it is 0 then we could not find a key that would fit in + * the parent that would permit us to move any records. + */ + if (indx == 0) + goto done; + DB_ASSERT(env, indx <= nent); + + /* Loop through the records and move them from npg to pg. */ +no_check: is_dup = first_dup = next_dup = 0; + pg = cp->csp->page; + npg = ncp->csp->page; + DB_ASSERT(env, IS_DIRTY(pg)); + DB_ASSERT(env, IS_DIRTY(npg)); + ninp = P_INP(dbp, npg); + do { + bk = GET_BKEYDATA(dbp, npg, 0); + /* Figure out if we are in a duplicate group or not. */ + if ((NUM_ENT(npg) % 2) == 0) { + if (NUM_ENT(npg) > 2 && ninp[0] == ninp[2]) { + if (!is_dup) { + first_dup = 1; + is_dup = 1; + } else + first_dup = 0; + + next_dup = 1; + } else if (next_dup) { + is_dup = 1; + first_dup = 0; + next_dup = 0; + } else + is_dup = 0; + } + + if (is_dup && !first_dup && (pind % 2) == 0) { + /* Duplicate key. */ + if ((ret = __bam_adjindx(dbc, + pg, pind, pind - P_INDX, 1)) != 0) + goto err; + if (!next_dup) + is_dup = 0; + } else switch (B_TYPE(bk->type)) { + case B_KEYDATA: + hdr.data = bk; + hdr.size = SSZA(BKEYDATA, data); + data.size = bk->len; + data.data = bk->data; + if ((ret = __db_pitem(dbc, pg, pind, + BKEYDATA_SIZE(bk->len), &hdr, &data)) != 0) + goto err; + break; + case B_OVERFLOW: + case B_DUPLICATE: + data.size = BOVERFLOW_SIZE; + data.data = bk; + if ((ret = __db_pitem(dbc, pg, pind, + BOVERFLOW_SIZE, &data, NULL)) != 0) + goto err; + break; + default: + __db_errx(env, + "Unknown record format, page %lu, indx 0", + (u_long)PGNO(pg)); + ret = EINVAL; + goto err; + } + pind++; + if (next_dup && (NUM_ENT(npg) % 2) == 0) { + if ((ret = __bam_adjindx(ndbc, + npg, 0, O_INDX, 0)) != 0) + goto err; + } else { + if ((ret = __db_ditem(ndbc, + npg, 0, BITEM_SIZE(bk))) != 0) + goto err; + } + adjust++; + } while (--indx != 0); + + DB_ASSERT(env, NUM_ENT(npg) != 0); + + if (adjust != 0 && + (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD))) { + if (TYPE(pg) == P_LBTREE) + adjust /= P_INDX; + if ((ret = __bam_adjust(ndbc, -adjust)) != 0) + goto err; + + if ((ret = __bam_adjust(dbc, adjust)) != 0) + goto err; + } + + /* Update parent with new key. 
*/ + if (ndbc->dbtype == DB_BTREE && + (ret = __bam_pupdate(ndbc, pg)) != 0) + goto err; + +done: if (cp->sp->page == ncp->sp->page) { + cp->sp->page = NULL; + LOCK_INIT(cp->sp->lock); + } + ret = __bam_stkrel(ndbc, STK_CLRDBC); + +err: return (ret); +} + +static int +__bam_merge_pages(dbc, ndbc, c_data) + DBC *dbc, *ndbc; + DB_COMPACT *c_data; +{ + BTREE_CURSOR *cp, *ncp; + DB *dbp; + DBT data, hdr; + DB_MPOOLFILE *dbmp; + PAGE *pg, *npg; + db_indx_t nent, *ninp, *pinp; + db_pgno_t ppgno; + u_int8_t *bp; + u_int32_t len; + int i, level, ret; + + COMPQUIET(ppgno, PGNO_INVALID); + dbp = dbc->dbp; + dbmp = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + ncp = (BTREE_CURSOR *)ndbc->internal; + pg = cp->csp->page; + npg = ncp->csp->page; + memset(&hdr, 0, sizeof(hdr)); + nent = NUM_ENT(npg); + + /* If the page is empty just throw it away. */ + if (nent == 0) + goto free_page; + + pg = cp->csp->page; + npg = ncp->csp->page; + DB_ASSERT(dbp->env, IS_DIRTY(pg)); + DB_ASSERT(dbp->env, IS_DIRTY(npg)); + DB_ASSERT(dbp->env, nent == NUM_ENT(npg)); + + /* Bulk copy the data to the new page. */ + len = dbp->pgsize - HOFFSET(npg); + if (DBC_LOGGING(dbc)) { + memset(&hdr, 0, sizeof(hdr)); + hdr.data = npg; + hdr.size = LOFFSET(dbp, npg); + memset(&data, 0, sizeof(data)); + data.data = (u_int8_t *)npg + HOFFSET(npg); + data.size = len; + if ((ret = __bam_merge_log(dbp, + dbc->txn, &LSN(pg), 0, PGNO(pg), + &LSN(pg), PGNO(npg), &LSN(npg), &hdr, &data, 0)) != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(pg)); + LSN(npg) = LSN(pg); + bp = (u_int8_t *)pg + HOFFSET(pg) - len; + memcpy(bp, (u_int8_t *)npg + HOFFSET(npg), len); + + /* Copy index table offset by what was there already. */ + pinp = P_INP(dbp, pg) + NUM_ENT(pg); + ninp = P_INP(dbp, npg); + for (i = 0; i < NUM_ENT(npg); i++) + *pinp++ = *ninp++ - (dbp->pgsize - HOFFSET(pg)); + HOFFSET(pg) -= len; + NUM_ENT(pg) += i; + + NUM_ENT(npg) = 0; + HOFFSET(npg) += len; + + if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) { + /* + * There are two cases here regarding the stack. + * Either we have two two level stacks but only ndbc + * references the parent page or we have a multilevel + * stack and only ndbc has an entry for the spanning + * page. + */ + if (TYPE(pg) == P_LBTREE) + i /= P_INDX; + if ((ret = __bam_adjust(ndbc, -i)) != 0) + goto err; + + if ((ret = __bam_adjust(dbc, i)) != 0) + goto err; + } + +free_page: + /* + * __bam_dpages may decide to collapse the tree. + * This can happen if we have the root and there + * are exactly 2 pointers left in it. + * If it can collapse the tree we must free the other + * stack since it will nolonger be valid. This + * must be done before hand because we cannot + * hold a page pinned if it might be truncated. + */ + if ((ret = __bam_relink(dbc, + ncp->csp->page, cp->csp->page, PGNO_INVALID)) != 0) + goto err; + /* Drop the duplicate reference to the sub tree root. */ + cp->sp->page = NULL; + LOCK_INIT(cp->sp->lock); + if (PGNO(ncp->sp->page) == ncp->root && + NUM_ENT(ncp->sp->page) == 2) { + if ((ret = __bam_stkrel(dbc, STK_CLRDBC | STK_PGONLY)) != 0) + goto err; + level = LEVEL(ncp->sp->page); + ppgno = PGNO(ncp->csp[-1].page); + } else + level = 0; + if (c_data->compact_truncate > PGNO(npg)) + c_data->compact_truncate--; + if ((ret = __bam_dpages(ndbc, + 0, ndbc->dbtype == DB_RECNO ? 
0 : BTD_UPDATE)) != 0) + goto err; + npg = NULL; + c_data->compact_pages_free++; + c_data->compact_pages--; + if (level != 0) { + if ((ret = __memp_fget(dbmp, &ncp->root, + dbc->thread_info, dbc->txn, 0, &npg)) != 0) + goto err; + if (level == LEVEL(npg)) + level = 0; + if ((ret = __memp_fput(dbmp, + dbc->thread_info, npg, dbc->priority)) != 0) + goto err; + npg = NULL; + if (level != 0) { + c_data->compact_levels++; + c_data->compact_pages_free++; + if (c_data->compact_truncate > ppgno) + c_data->compact_truncate--; + if (c_data->compact_pages != 0) + c_data->compact_pages--; + } + } + +err: return (ret); +} + +/* + * __bam_merge_internal -- + * Merge internal nodes of the tree. + */ +static int +__bam_merge_internal(dbc, ndbc, level, c_data, merged) + DBC *dbc, *ndbc; + int level; + DB_COMPACT *c_data; + int *merged; +{ + BINTERNAL bi, *bip, *fip; + BTREE_CURSOR *cp, *ncp; + DB *dbp; + DBT data, hdr; + DB_MPOOLFILE *dbmp; + EPG *epg, *save_csp, *nsave_csp; + PAGE *pg, *npg; + RINTERNAL *rk; + db_indx_t first, indx, pind; + db_pgno_t ppgno; + int32_t nrecs, trecs; + u_int16_t size; + u_int32_t freespace, pfree; + int ret; + + COMPQUIET(bip, NULL); + COMPQUIET(ppgno, PGNO_INVALID); + DB_ASSERT(NULL, dbc != NULL); + DB_ASSERT(NULL, ndbc != NULL); + + /* + * ndbc will contain the the dominating parent of the subtree. + * dbc will have the tree containing the left child. + * + * The stacks descend to the leaf level. + * If this is a recno tree then both stacks will start at the root. + */ + dbp = dbc->dbp; + dbmp = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + ncp = (BTREE_CURSOR *)ndbc->internal; + *merged = 0; + ret = 0; + + /* + * Set the stacks to the level requested. + * Save the old value to restore when we exit. + */ + save_csp = cp->csp; + cp->csp = &cp->csp[-level + 1]; + pg = cp->csp->page; + pind = NUM_ENT(pg); + + nsave_csp = ncp->csp; + ncp->csp = &ncp->csp[-level + 1]; + npg = ncp->csp->page; + indx = NUM_ENT(npg); + + /* + * The caller may have two stacks that include common ancestors, we + * check here for convenience. + */ + if (npg == pg) + goto done; + + if (TYPE(pg) == P_IBTREE) { + /* + * Check for overflow keys on both pages while we have + * them locked. + */ + if ((ret = + __bam_truncate_internal_overflow(dbc, pg, c_data)) != 0) + goto err; + if ((ret = + __bam_truncate_internal_overflow(dbc, npg, c_data)) != 0) + goto err; + } + + /* + * If we are about to move data off the left most page of an + * internal node we will need to update its parents, make sure there + * will be room for the new key on all the parents in the stack. + * If not, move less data. + */ + fip = NULL; + if (TYPE(pg) == P_IBTREE) { + /* See where we run out of space. */ + freespace = P_FREESPACE(dbp, pg); + /* + * The leftmost key of an internal page is not accurate. + * Go up the tree to find a non-leftmost parent. + */ + epg = ncp->csp; + while (--epg >= ncp->sp && epg->indx == 0) + continue; + fip = bip = GET_BINTERNAL(dbp, epg->page, epg->indx); + epg = ncp->csp; + + for (indx = 0;;) { + size = BINTERNAL_PSIZE(bip->len); + if (size > freespace) + break; + freespace -= size; + if (++indx >= NUM_ENT(npg)) + break; + bip = GET_BINTERNAL(dbp, npg, indx); + } + + /* See if we are deleting the page and we are not left most. 
*/ + if (indx == NUM_ENT(npg) && epg[-1].indx != 0) + goto fits; + + pfree = dbp->pgsize; + for (epg--; epg >= ncp->sp; epg--) + if ((freespace = P_FREESPACE(dbp, epg->page)) < pfree) { + bip = GET_BINTERNAL(dbp, epg->page, epg->indx); + /* Add back in the key we will be deleting. */ + freespace += BINTERNAL_PSIZE(bip->len); + if (freespace < pfree) + pfree = freespace; + if (epg->indx != 0) + break; + } + epg = ncp->csp; + + /* If we are at the end of the page we will delete it. */ + if (indx == NUM_ENT(npg)) { + if (NUM_ENT(epg[-1].page) == 1) + goto fits; + bip = + GET_BINTERNAL(dbp, epg[-1].page, epg[-1].indx + 1); + } else + bip = GET_BINTERNAL(dbp, npg, indx); + + /* Back up until we have a key that fits. */ + while (indx != 0 && BINTERNAL_PSIZE(bip->len) > pfree) { + indx--; + bip = GET_BINTERNAL(dbp, npg, indx); + } + if (indx == 0) + goto done; + } + +fits: memset(&bi, 0, sizeof(bi)); + memset(&hdr, 0, sizeof(hdr)); + memset(&data, 0, sizeof(data)); + trecs = 0; + + /* + * Copy data between internal nodes till one is full + * or the other is empty. + */ + first = 0; + nrecs = 0; + do { + if (dbc->dbtype == DB_BTREE) { + bip = GET_BINTERNAL(dbp, npg, 0); + size = fip == NULL ? + BINTERNAL_SIZE(bip->len) : + BINTERNAL_SIZE(fip->len); + if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t)) + break; + + if (fip == NULL) { + data.size = bip->len; + data.data = bip->data; + } else { + data.size = fip->len; + data.data = fip->data; + } + bi.len = data.size; + B_TSET(bi.type, bip->type); + bi.pgno = bip->pgno; + bi.nrecs = bip->nrecs; + hdr.data = &bi; + hdr.size = SSZA(BINTERNAL, data); + if (F_ISSET(cp, C_RECNUM) || F_ISSET(dbc, DBC_OPD)) + nrecs = (int32_t)bip->nrecs; + } else { + rk = GET_RINTERNAL(dbp, npg, 0); + size = RINTERNAL_SIZE; + if (P_FREESPACE(dbp, pg) < size + sizeof(db_indx_t)) + break; + + hdr.data = rk; + hdr.size = size; + nrecs = (int32_t)rk->nrecs; + } + /* + * Try to lock the subtree leaf records without waiting. + * We must lock the subtree below the record we are merging + * and the one after it since that is were a search will wind + * up if it has already looked at our parent. After the first + * move we have the current subtree already locked. + * If we merged any records then we will revisit this + * node when we merge its leaves. If not we will return + * NOTGRANTED and our caller will do a retry. We only + * need to do this if we are in a transation. If not then + * we cannot abort and things will be hosed up on error + * anyway. + */ + if (dbc->txn != NULL && (ret = __bam_lock_tree(ndbc, + ncp->csp, nsave_csp, first, + NUM_ENT(ncp->csp->page) == 1 ? 1 : 2)) != 0) { + if (ret != DB_LOCK_NOTGRANTED) + goto err; + break; + } + first = 1; + if ((ret = __db_pitem(dbc, pg, pind, size, &hdr, &data)) != 0) + goto err; + pind++; + if (fip != NULL) { + /* reset size to be for the record being deleted. */ + size = BINTERNAL_SIZE(bip->len); + fip = NULL; + } + if ((ret = __db_ditem(ndbc, npg, 0, size)) != 0) + goto err; + *merged = 1; + trecs += nrecs; + } while (--indx != 0); + + if (!*merged) + goto done; + + if (trecs != 0) { + cp->csp--; + ret = __bam_adjust(dbc, trecs); + if (ret != 0) + goto err; + cp->csp++; + ncp->csp--; + if ((ret = __bam_adjust(ndbc, -trecs)) != 0) + goto err; + ncp->csp++; + } + + /* + * Either we emptied the page or we need to update its + * parent to reflect the first page we now point to. + * First get rid of the bottom of the stack, + * bam_dpages will clear the stack. 
+	do {
+		if ((ret = __memp_fput(dbmp, dbc->thread_info,
+		    nsave_csp->page, dbc->priority)) != 0)
+			goto err;
+		nsave_csp->page = NULL;
+		if ((ret = __TLPUT(dbc, nsave_csp->lock)) != 0)
+			goto err;
+		LOCK_INIT(nsave_csp->lock);
+		nsave_csp--;
+	} while (nsave_csp != ncp->csp);
+
+	if (NUM_ENT(npg) == 0) {
+		/*
+		 * __bam_dpages may decide to collapse the tree,
+		 * so we need to free our other stack.  The tree
+		 * will change in height and our stack will no longer
+		 * be valid.
+		 */
+		cp->csp = save_csp;
+		cp->sp->page = NULL;
+		LOCK_INIT(cp->sp->lock);
+		if (PGNO(ncp->sp->page) == ncp->root &&
+		    NUM_ENT(ncp->sp->page) == 2) {
+			if ((ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0)
+				goto err;
+			level = LEVEL(ncp->sp->page);
+			ppgno = PGNO(ncp->csp[-1].page);
+		} else
+			level = 0;
+
+		if (c_data->compact_truncate > PGNO(npg))
+			c_data->compact_truncate--;
+		ret = __bam_dpages(ndbc,
+		    0, ndbc->dbtype == DB_RECNO ?
+		    BTD_RELINK : BTD_UPDATE | BTD_RELINK);
+		c_data->compact_pages_free++;
+		if (ret == 0 && level != 0) {
+			if ((ret = __memp_fget(dbmp, &ncp->root,
+			    dbc->thread_info, dbc->txn, 0, &npg)) != 0)
+				goto err;
+			if (level == LEVEL(npg))
+				level = 0;
+			if ((ret = __memp_fput(dbmp,
+			    dbc->thread_info, npg, dbc->priority)) != 0)
+				goto err;
+			npg = NULL;
+			if (level != 0) {
+				c_data->compact_levels++;
+				c_data->compact_pages_free++;
+				if (c_data->compact_truncate > ppgno)
+					c_data->compact_truncate--;
+				if (c_data->compact_pages != 0)
+					c_data->compact_pages--;
+			}
+		}
+	} else {
+		ret = __bam_pupdate(ndbc, npg);
+
+		if (NUM_ENT(npg) != 0 &&
+		    c_data->compact_truncate != PGNO_INVALID &&
+		    PGNO(npg) > c_data->compact_truncate &&
+		    ncp->csp != ncp->sp) {
+			if ((ret = __bam_truncate_page(ndbc, &npg, pg, 1)) != 0)
+				goto err;
+		}
+		if (c_data->compact_truncate != PGNO_INVALID &&
+		    PGNO(pg) > c_data->compact_truncate && cp->csp != cp->sp) {
+			if ((ret = __bam_truncate_page(dbc, &pg, npg, 1)) != 0)
+				goto err;
+		}
+	}
+	cp->csp = save_csp;
+
+	return (ret);
+
+done:
+err:	cp->csp = save_csp;
+	ncp->csp = nsave_csp;
+
+	return (ret);
+}
+
+/*
+ * __bam_compact_dups -- try to compress off-page dup trees.
+ *	We may or may not have a write lock on this page.
+ */
+static int
+__bam_compact_dups(dbc, ppg, factor, have_lock, c_data, donep)
+	DBC *dbc;
+	PAGE **ppg;
+	u_int32_t factor;
+	int have_lock;
+	DB_COMPACT *c_data;
+	int *donep;
+{
+	BOVERFLOW *bo;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DBC *opd;
+	DBT start;
+	DB_MPOOLFILE *dbmp;
+	ENV *env;
+	PAGE *dpg, *pg;
+	db_indx_t i;
+	db_pgno_t pgno;
+	int isdone, level, ret, span, t_ret;
+
+	span = 0;
+	ret = 0;
+	opd = NULL;
+
+	DB_ASSERT(NULL, dbc != NULL);
+	dbp = dbc->dbp;
+	env = dbp->env;
+	dbmp = dbp->mpf;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	pg = *ppg;
+
+	for (i = 0; i < NUM_ENT(pg); i++) {
+		bo = GET_BOVERFLOW(dbp, pg, i);
+		if (B_TYPE(bo->type) == B_KEYDATA)
+			continue;
+		c_data->compact_pages_examine++;
+		if (bo->pgno > c_data->compact_truncate) {
+			(*donep)++;
+			if (!have_lock) {
+				/*
+				 * The caller should have the page at
+				 * least read locked.  Drop the buffer
+				 * and get the write lock.
+				 */
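+				/*
+				 * (A pinned page buffer should not be held
+				 * while blocking on a lock, so it is
+				 * released first, the write lock acquired,
+				 * and the page re-fetched dirty.)
+				 */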
+				pgno = PGNO(pg);
+				if ((ret = __memp_fput(dbmp, dbc->thread_info,
+				    pg, dbc->priority)) != 0)
+					goto err;
+				*ppg = NULL;
+				if ((ret = __db_lget(dbc, 0, pgno,
+				    DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+					goto err;
+				have_lock = 1;
+				if ((ret = __memp_fget(dbmp, &pgno,
+				    dbc->thread_info,
+				    dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+					goto err;
+				pg = *ppg;
+			}
+			if ((ret =
+			    __bam_truncate_root_page(dbc, pg, i, c_data)) != 0)
+				goto err;
+			/* Just in case it should move.  Could it? */
+			bo = GET_BOVERFLOW(dbp, pg, i);
+		}
+
+		if (B_TYPE(bo->type) == B_OVERFLOW) {
+			if ((ret = __bam_truncate_overflow(dbc,
+			    bo->pgno, have_lock ? NULL : ppg, c_data)) != 0)
+				goto err;
+			(*donep)++;
+			continue;
+		}
+		/*
+		 * Take a peek at the root.  If it's a leaf then
+		 * there is no tree here; avoid all the trouble.
+		 */
+		if ((ret = __memp_fget(dbmp, &bo->pgno,
+		    dbc->thread_info, dbc->txn, 0, &dpg)) != 0)
+			goto err;
+
+		level = dpg->level;
+		if ((ret = __memp_fput(dbmp,
+		    dbc->thread_info, dpg, dbc->priority)) != 0)
+			goto err;
+		if (level == LEAFLEVEL)
+			continue;
+		if ((ret = __dbc_newopd(dbc, bo->pgno, NULL, &opd)) != 0)
+			return (ret);
+		if (!have_lock) {
+			/*
+			 * The caller should have the page at
+			 * least read locked.  Drop the buffer
+			 * and get the write lock.
+			 */
+			pgno = PGNO(pg);
+			if ((ret = __memp_fput(dbmp, dbc->thread_info,
+			    pg, dbc->priority)) != 0)
+				goto err;
+			*ppg = NULL;
+			if ((ret = __db_lget(dbc, 0, pgno,
+			    DB_LOCK_WRITE, 0, &cp->csp->lock)) != 0)
+				goto err;
+			have_lock = 1;
+			if ((ret = __memp_fget(dbmp, &pgno,
+			    dbc->thread_info,
+			    dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+				goto err;
+			pg = *ppg;
+		}
+		(*donep)++;
+		memset(&start, 0, sizeof(start));
+		do {
+			if ((ret = __bam_compact_int(opd, &start,
+			    NULL, factor, &span, c_data, &isdone)) != 0)
+				break;
+		} while (!isdone);
+
+		if (start.data != NULL)
+			__os_free(env, start.data);
+
+		if (ret != 0)
+			goto err;
+
+		ret = __dbc_close(opd);
+		opd = NULL;
+		if (ret != 0)
+			goto err;
+	}
+
+err:	if (opd != NULL && (t_ret = __dbc_close(opd)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __bam_truncate_page -- swap a page with a lower-numbered page.
+ *	The cursor has a stack which includes at least the
+ *	immediate parent of this page.
+ */
+static int
+__bam_truncate_page(dbc, pgp, opg, update_parent)
+	DBC *dbc;
+	PAGE **pgp, *opg;
+	int update_parent;
+{
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DBT data, hdr;
+	DB_LSN lsn;
+	DB_LOCK lock;
+	EPG *epg;
+	PAGE *newpage;
+	db_pgno_t newpgno, oldpgno, *pgnop;
+	int ret;
+
+	DB_ASSERT(NULL, dbc != NULL);
+	dbp = dbc->dbp;
+	LOCK_INIT(lock);
+
+	/*
+	 * We want to free a page that lives in the part of the file that
+	 * can be truncated, so we're going to move it onto a free page
+	 * that is in the part of the file that need not be truncated.
+	 * Since the freelist is ordered now, we can simply call __db_new
+	 * which will grab the first element off the freelist; we know this
+	 * is the lowest numbered free page.
+	 */
+	if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
+	    TYPE(*pgp) == P_LBTREE ? &lock : NULL, &newpage)) != 0)
+		return (ret);
+
+	/*
+	 * If newpage is null then __db_new would have had to allocate
+	 * a new page from the filesystem, so there is no reason
+	 * to continue this action.
+	 */
+	if (newpage == NULL)
+		return (0);
+
+	/*
+	 * It is possible that a higher page is allocated if other threads
+	 * are allocating at the same time; if so, just put it back.
+	 */
+	if (PGNO(newpage) > PGNO(*pgp)) {
+		/* It's unfortunate but you can't just free a new overflow. */
+		if (TYPE(newpage) == P_OVERFLOW)
+			OV_LEN(newpage) = 0;
+		if ((ret = __LPUT(dbc, lock)) != 0)
+			return (ret);
+		return (__db_free(dbc, newpage));
+	}
+
+	/* Log if necessary. */
+	if (DBC_LOGGING(dbc)) {
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.data = *pgp;
+		hdr.size = P_OVERHEAD(dbp);
+		memset(&data, 0, sizeof(data));
+		if (TYPE(*pgp) == P_OVERFLOW) {
+			data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
+			data.size = OV_LEN(*pgp);
+		} else {
+			data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
+			data.size = dbp->pgsize - HOFFSET(*pgp);
+			hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
+		}
+		if ((ret = __bam_merge_log(dbp, dbc->txn,
+		    &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
+		    PGNO(*pgp), &LSN(*pgp), &hdr, &data, 1)) != 0)
+			goto err;
+	} else
+		LSN_NOT_LOGGED(LSN(newpage));
+
+	oldpgno = PGNO(*pgp);
+	newpgno = PGNO(newpage);
+	lsn = LSN(newpage);
+	memcpy(newpage, *pgp, dbp->pgsize);
+	PGNO(newpage) = newpgno;
+	LSN(newpage) = lsn;
+
+	/* Empty the old page. */
+	if ((ret = __memp_dirty(dbp->mpf,
+	    pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+		goto err;
+	if (TYPE(*pgp) == P_OVERFLOW)
+		OV_LEN(*pgp) = 0;
+	else {
+		HOFFSET(*pgp) = dbp->pgsize;
+		NUM_ENT(*pgp) = 0;
+	}
+	LSN(*pgp) = lsn;
+
+	/* Update siblings. */
+	switch (TYPE(newpage)) {
+	case P_OVERFLOW:
+	case P_LBTREE:
+	case P_LRECNO:
+	case P_LDUP:
+		if (NEXT_PGNO(newpage) == PGNO_INVALID &&
+		    PREV_PGNO(newpage) == PGNO_INVALID)
+			break;
+		if ((ret = __bam_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
+			goto err;
+		break;
+	default:
+		break;
+	}
+	cp = (BTREE_CURSOR *)dbc->internal;
+
+	/*
+	 * Now, if we free this page, it will get truncated when we free
+	 * all the pages after it in the file.
+	 */
+	ret = __db_free(dbc, *pgp);
+	/* __db_free always puts the page. */
+	*pgp = newpage;
+
+	if (ret != 0)
+		return (ret);
+
+	if (!update_parent)
+		goto done;
+
+	/* Update the parent. */
+	epg = &cp->csp[-1];
+
+	switch (TYPE(epg->page)) {
+	case P_IBTREE:
+		pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
+		break;
+	case P_IRECNO:
+		pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
+		break;
+	default:
+		pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
+		break;
+	}
+	DB_ASSERT(dbp->env, oldpgno == *pgnop);
+	if (DBC_LOGGING(dbc)) {
+		if ((ret = __bam_pgno_log(dbp, dbc->txn, &LSN(epg->page),
+		    0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
+		    *pgnop, PGNO(newpage))) != 0)
+			return (ret);
+	} else
+		LSN_NOT_LOGGED(LSN(epg->page));
+
+	*pgnop = PGNO(newpage);
+	cp->csp->page = newpage;
+	if ((ret = __TLPUT(dbc, lock)) != 0)
+		return (ret);
+
+done:	return (0);
+
+err:	(void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
+	(void)__TLPUT(dbc, lock);
+	return (ret);
+}
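+/*
+ * Editor's illustration, not part of the original source: at its core,
+ * __bam_truncate_page above is "copy the page image to a lower-numbered
+ * free slot, restore the target's own identity fields, repoint the
+ * parent, then free the old slot".  A minimal stand-alone model of that
+ * move, over a hypothetical in-memory page array and ignoring logging,
+ * locking and sibling links (kept under #if 0 so it is never compiled):
+ */
+#if 0
+#include <string.h>
+
+#define EX_PGSIZE	4096
+
+struct ex_page {
+	unsigned int pgno;	/* the page's own number, kept on swap */
+	char body[EX_PGSIZE - sizeof(unsigned int)];
+};
+
+static void
+ex_swap_page(struct ex_page *pages,
+    unsigned int from, unsigned int to, unsigned int *parent_ref)
+{
+	/* Copy the whole image, then restore the target's identity. */
+	memcpy(&pages[to], &pages[from], sizeof(struct ex_page));
+	pages[to].pgno = to;
+	/* Point the parent's child reference at the new location. */
+	*parent_ref = to;
+	/* The old slot would now be emptied and put on the free list. */
+}
+#endif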
+/*
+ * __bam_truncate_overflow -- find overflow pages to truncate.
+ *	Walk the pages of an overflow chain and swap out
+ *	high-numbered pages.  We are passed the first page
+ *	but only deal with the second and subsequent pages.
+ */
+static int
+__bam_truncate_overflow(dbc, pgno, ppg, c_data)
+	DBC *dbc;
+	db_pgno_t pgno;
+	PAGE **ppg;
+	DB_COMPACT *c_data;
+{
+	DB *dbp;
+	DB_LOCK lock;
+	PAGE *page;
+	db_pgno_t ppgno;
+	int have_lock, ret, t_ret;
+
+	dbp = dbc->dbp;
+	page = NULL;
+	LOCK_INIT(lock);
+	have_lock = ppg == NULL;
+
+	if ((ret = __memp_fget(dbp->mpf, &pgno,
+	    dbc->thread_info, dbc->txn, 0, &page)) != 0)
+		return (ret);
+
+	while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
+		if ((ret = __memp_fput(dbp->mpf,
+		    dbc->thread_info, page, dbc->priority)) != 0)
+			return (ret);
+		if ((ret = __memp_fget(dbp->mpf, &pgno,
+		    dbc->thread_info, dbc->txn, 0, &page)) != 0)
+			return (ret);
+		if (pgno <= c_data->compact_truncate)
+			continue;
+		if (have_lock == 0) {
+			ppgno = PGNO(*ppg);
+			if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
+			    *ppg, dbc->priority)) != 0)
+				goto err;
+			*ppg = NULL;
+			if ((ret = __db_lget(dbc, 0, ppgno,
+			    DB_LOCK_WRITE, 0, &lock)) != 0)
+				goto err;
+			if ((ret = __memp_fget(dbp->mpf, &ppgno,
+			    dbc->thread_info,
+			    dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
+				goto err;
+			have_lock = 1;
+		}
+		if ((ret = __bam_truncate_page(dbc, &page, NULL, 0)) != 0)
+			break;
+	}
+
+err:	if (page != NULL &&
+	    (t_ret = __memp_fput(dbp->mpf,
+	    dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __bam_truncate_root_page -- swap a page which is
+ *	the root of an off-page dup tree or the head of an overflow.
+ *	The page is referenced by the pg/indx passed in.
+ */
+static int
+__bam_truncate_root_page(dbc, pg, indx, c_data)
+	DBC *dbc;
+	PAGE *pg;
+	u_int32_t indx;
+	DB_COMPACT *c_data;
+{
+	BINTERNAL *bi;
+	BOVERFLOW *bo;
+	DB *dbp;
+	DBT orig;
+	PAGE *page;
+	db_pgno_t newpgno, *pgnop;
+	int ret, t_ret;
+
+	COMPQUIET(c_data, NULL);
+	COMPQUIET(bo, NULL);
+	COMPQUIET(newpgno, PGNO_INVALID);
+	dbp = dbc->dbp;
+	page = NULL;
+	if (TYPE(pg) == P_IBTREE) {
+		bi = GET_BINTERNAL(dbp, pg, indx);
+		if (B_TYPE(bi->type) == B_OVERFLOW) {
+			bo = (BOVERFLOW *)(bi->data);
+			pgnop = &bo->pgno;
+		} else
+			pgnop = &bi->pgno;
+	} else {
+		bo = GET_BOVERFLOW(dbp, pg, indx);
+		pgnop = &bo->pgno;
+	}
+
+	DB_ASSERT(dbp->env, IS_DIRTY(pg));
+
+	if ((ret = __memp_fget(dbp->mpf, pgnop,
+	    dbc->thread_info, dbc->txn, 0, &page)) != 0)
+		goto err;
+
+	/*
+	 * If this is a multiply referenced overflow key, then we will just
+	 * copy it and decrement the reference count.  This is part of a
+	 * fix to get rid of multiple references.
+	 */
+	if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
+		if ((ret = __db_ovref(dbc, bo->pgno)) != 0)
+			goto err;
+		memset(&orig, 0, sizeof(orig));
+		if ((ret = __db_goff(dbc, &orig, bo->tlen, bo->pgno,
+		    &orig.data, &orig.size)) == 0)
+			ret = __db_poff(dbc, &orig, &newpgno);
+		if (orig.data != NULL)
+			__os_free(dbp->env, orig.data);
+		if (ret != 0)
+			goto err;
+	} else {
+		if ((ret = __bam_truncate_page(dbc, &page, NULL, 0)) != 0)
+			goto err;
+		newpgno = PGNO(page);
+		/* If we could not allocate from the free list, give up. */
+		if (newpgno == *pgnop)
+			goto err;
+	}
+
+	/* Update the reference. */
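+	/*
+	 * (The __bam_pgno_log record below captures the page, the index,
+	 * and the old and new child page numbers, which is enough for
+	 * recovery to redo or undo the reference swap.)
+	 */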
+	if (DBC_LOGGING(dbc)) {
+		if ((ret = __bam_pgno_log(dbp,
+		    dbc->txn, &LSN(pg), 0, PGNO(pg),
+		    &LSN(pg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
+			goto err;
+	} else
+		LSN_NOT_LOGGED(LSN(pg));
+
+	*pgnop = newpgno;
+
+err:	if (page != NULL && (t_ret =
+	    __memp_fput(dbp->mpf, dbc->thread_info,
+	    page, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __bam_truncate_internal_overflow -- find overflow keys
+ *	on internal pages and, if they have high page
+ *	numbers, swap them with lower pages and truncate them.
+ *	Note that if there are overflow keys in the internal
+ *	nodes they will get copied, adding pages to the database.
+ */
+static int
+__bam_truncate_internal_overflow(dbc, page, c_data)
+	DBC *dbc;
+	PAGE *page;
+	DB_COMPACT *c_data;
+{
+	BINTERNAL *bi;
+	BOVERFLOW *bo;
+	db_indx_t indx;
+	int ret;
+
+	COMPQUIET(bo, NULL);
+	ret = 0;
+	for (indx = 0; indx < NUM_ENT(page); indx++) {
+		bi = GET_BINTERNAL(dbc->dbp, page, indx);
+		if (B_TYPE(bi->type) != B_OVERFLOW)
+			continue;
+		bo = (BOVERFLOW *)(bi->data);
+		if (bo->pgno > c_data->compact_truncate && (ret =
+		    __bam_truncate_root_page(dbc, page, indx, c_data)) != 0)
+			break;
+		if ((ret = __bam_truncate_overflow(
+		    dbc, bo->pgno, NULL, c_data)) != 0)
+			break;
+	}
+	return (ret);
+}
+
+/*
+ * __bam_compact_isdone --
+ *	Check to see if the stop key specified by the caller is on the
+ *	current page, in which case we are done compacting.
+ */
+static int
+__bam_compact_isdone(dbc, stop, pg, isdone)
+	DBC *dbc;
+	DBT *stop;
+	PAGE *pg;
+	int *isdone;
+{
+	db_recno_t recno;
+	BTREE *t;
+	BTREE_CURSOR *cp;
+	int cmp, ret;
+
+	*isdone = 0;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	t = dbc->dbp->bt_internal;
+
+	if (dbc->dbtype == DB_RECNO) {
+		if ((ret = __ram_getno(dbc, stop, &recno, 0)) != 0)
+			return (ret);
+		*isdone = cp->recno > recno;
+	} else {
+		DB_ASSERT(dbc->dbp->env, TYPE(pg) == P_LBTREE);
+		if ((ret = __bam_cmp(dbc, stop, pg, 0,
+		    t->bt_compare, &cmp)) != 0)
+			return (ret);
+
+		*isdone = cmp <= 0;
+	}
+	return (0);
+}
+
+/*
+ * Lock the subtrees from the top of the stack.
+ *	The 0'th child may be in the stack and locked; otherwise, iterate
+ *	through the records by calling __bam_lock_subtree.
+ */
+static int
+__bam_lock_tree(dbc, sp, csp, start, stop)
+	DBC *dbc;
+	EPG *sp, *csp;
+	u_int32_t start, stop;
+{
+	PAGE *cpage;
+	db_pgno_t pgno;
+	int ret;
+
+	if (dbc->dbtype == DB_RECNO)
+		pgno = GET_RINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+	else
+		pgno = GET_BINTERNAL(dbc->dbp, sp->page, 0)->pgno;
+	cpage = (sp + 1)->page;
+	/*
+	 * First recurse down the leftmost subtree if it is in the cursor
+	 * stack.  We already have these pages latched and locked if it's a
+	 * leaf.
+	 */
+	if (start == 0 && sp + 1 != csp && pgno == PGNO(cpage) &&
+	    (ret = __bam_lock_tree(dbc, sp + 1, csp, 0, NUM_ENT(cpage))) != 0)
+		return (ret);
+
+	/*
+	 * Then recurse on the other records on the page if needed.
+	 * If the page is in the stack then it's already locked or
+	 * was processed above.
+	 */
+	if (start == 0 && pgno == PGNO(cpage))
+		start = 1;
+
+	if (start == stop)
+		return (0);
+	return (__bam_lock_subtree(dbc, sp->page, start, stop));
+}
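+/*
+ * Editor's illustration, not part of the original source:
+ * __bam_lock_tree/__bam_lock_subtree amount to a depth-first,
+ * left-to-right walk that try-locks every leaf reachable from an
+ * internal page, giving up as soon as one lock cannot be granted.
+ * A self-contained model over a hypothetical in-memory tree (kept
+ * under #if 0 so it is never compiled):
+ */
+#if 0
+struct ex_node {
+	int is_leaf;		/* nonzero for a leaf node */
+	int nchild;		/* number of children */
+	struct ex_node **child;	/* child pointers */
+};
+
+/* Return 0 on success, nonzero as soon as any leaf cannot be locked. */
+static int
+ex_lock_leaves(struct ex_node *n, int (*trylock)(struct ex_node *))
+{
+	int i, ret;
+
+	if (n->is_leaf)
+		return (trylock(n));
+	for (i = 0; i < n->nchild; i++)
+		if ((ret = ex_lock_leaves(n->child[i], trylock)) != 0)
+			return (ret);
+	return (0);
+}
+#endif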
+/*
+ * Lock the subtree from the current node.
+ */
+static int
+__bam_lock_subtree(dbc, page, indx, stop)
+	DBC *dbc;
+	PAGE *page;
+	u_int32_t indx, stop;
+{
+	DB *dbp;
+	DB_LOCK lock;
+	PAGE *cpage;
+	db_pgno_t pgno;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+
+	for (; indx < stop; indx++) {
+		if (dbc->dbtype == DB_RECNO)
+			pgno = GET_RINTERNAL(dbc->dbp, page, indx)->pgno;
+		else
+			pgno = GET_BINTERNAL(dbc->dbp, page, indx)->pgno;
+		if (LEVEL(page) - 1 == LEAFLEVEL) {
+			if ((ret = __db_lget(dbc, 0, pgno,
+			    DB_LOCK_WRITE, DB_LOCK_NOWAIT, &lock)) != 0) {
+				if (ret == DB_LOCK_DEADLOCK)
+					return (DB_LOCK_NOTGRANTED);
+				return (ret);
+			}
+		} else {
+			if ((ret = __memp_fget(dbp->mpf, &pgno,
+			    dbc->thread_info, dbc->txn, 0, &cpage)) != 0)
+				return (ret);
+			ret = __bam_lock_subtree(dbc, cpage, 0,
+			    NUM_ENT(cpage));
+			if ((t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+			    cpage, dbc->priority)) != 0 && ret == 0)
+				ret = t_ret;
+			if (ret != 0)
+				return (ret);
+		}
+	}
+	return (0);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __bam_savekey -- save the key from an internal page.
+ *	We need to save information so that we can
+ *	fetch the next internal node of the tree.  This means
+ *	we need the btree key on this current page, or the
+ *	next record number.
+ */
+static int
+__bam_savekey(dbc, next, start)
+	DBC *dbc;
+	int next;
+	DBT *start;
+{
+	BINTERNAL *bi;
+	BKEYDATA *bk;
+	BOVERFLOW *bo;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DB_LOCK lock;
+	ENV *env;
+	PAGE *pg;
+	RINTERNAL *ri;
+	db_indx_t indx, top;
+	db_pgno_t pgno, saved_pgno;
+	int ret, t_ret;
+	u_int32_t len;
+	u_int8_t *data;
+	int level;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	pg = cp->csp->page;
+	ret = 0;
+
+	if (dbc->dbtype == DB_RECNO) {
+		if (next)
+			for (indx = 0, top = NUM_ENT(pg); indx != top; indx++) {
+				ri = GET_RINTERNAL(dbp, pg, indx);
+				cp->recno += ri->nrecs;
+			}
+		return (__db_retcopy(env, start, &cp->recno,
+		    sizeof(cp->recno), &start->data, &start->ulen));
+	}
+
+	bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+	data = bi->data;
+	len = bi->len;
+	LOCK_INIT(lock);
+	saved_pgno = PGNO_INVALID;
+	/* If there is a single record on the page it may have an empty key. */
+	while (len == 0) {
+		/*
+		 * We should not have an empty data page, since we just
+		 * compacted things; check anyway and punt.
+		 */
+		if (NUM_ENT(pg) == 0)
+			goto no_key;
+		pgno = bi->pgno;
+		level = LEVEL(pg);
+		if (pg != cp->csp->page &&
+		    (ret = __memp_fput(dbp->mpf,
+		    dbc->thread_info, pg, dbc->priority)) != 0) {
+			pg = NULL;
+			goto err;
+		}
+		if (level - 1 == LEAFLEVEL) {
+			TRY_LOCK(dbc, pgno, saved_pgno,
+			    lock, DB_LOCK_READ, retry);
+			if (ret != 0)
+				goto err;
+		}
+		if ((ret = __memp_fget(dbp->mpf, &pgno,
+		    dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+			goto err;
+
+		/*
+		 * At the data level use the last key to try and avoid the
+		 * possibility that the user has a zero-length key; if they
+		 * do, we punt.
+		 */
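+		/*
+		 * (On a P_LBTREE leaf, keys and data alternate, so the
+		 * last key slot is NUM_ENT(pg) - 2; on an internal page
+		 * every entry is a key, so the last is NUM_ENT(pg) - 1.)
+		 */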
+		if (pg->level == LEAFLEVEL) {
+			bk = GET_BKEYDATA(dbp, pg, NUM_ENT(pg) - 2);
+			data = bk->data;
+			len = bk->len;
+			if (len == 0) {
+no_key:			__db_errx(env,
+				    "Compact cannot handle zero length key");
+				ret = DB_NOTFOUND;
+				goto err;
+			}
+		} else {
+			bi = GET_BINTERNAL(dbp, pg, NUM_ENT(pg) - 1);
+			data = bi->data;
+			len = bi->len;
+		}
+	}
+	if (B_TYPE(bi->type) == B_OVERFLOW) {
+		bo = (BOVERFLOW *)(data);
+		ret = __db_goff(dbc, start, bo->tlen, bo->pgno,
+		    &start->data, &start->ulen);
+	} else
+		ret = __db_retcopy(env,
+		    start, data, len, &start->data, &start->ulen);
+
+err:	if (pg != NULL && pg != cp->csp->page &&
+	    (t_ret = __memp_fput(dbp->mpf, dbc->thread_info,
+	    pg, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+
+retry:	return (DB_LOCK_NOTGRANTED);
+}
+
+/*
+ * __bam_truncate_internal --
+ *	Find high-numbered pages in the internal nodes of a tree and
+ *	swap them.
+ */
+static int
+__bam_truncate_internal(dbp, ip, txn, c_data)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DB_COMPACT *c_data;
+{
+	BTREE_CURSOR *cp;
+	DBC *dbc;
+	DBT start;
+	DB_LOCK meta_lock;
+	PAGE *pg;
+	db_pgno_t pgno;
+	u_int32_t sflag;
+	int level, local_txn, ret, t_ret;
+
+	dbc = NULL;
+	memset(&start, 0, sizeof(start));
+
+	if (IS_DB_AUTO_COMMIT(dbp, txn)) {
+		local_txn = 1;
+		txn = NULL;
+	} else
+		local_txn = 0;
+
+	level = LEAFLEVEL + 1;
+	sflag = CS_READ | CS_GETRECNO;
+	LOCK_INIT(meta_lock);
+
+new_txn:
+	if (local_txn &&
+	    (ret = __txn_begin(dbp->env, ip, NULL, &txn, 0)) != 0)
+		goto err;
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+		goto err;
+	cp = (BTREE_CURSOR *)dbc->internal;
+
+	/*
+	 * If the root is a leaf we have nothing to do.
+	 * Searching an empty RECNO tree will return NOTFOUND below and loop.
+	 */
+	if ((ret = __memp_fget(dbp->mpf, &cp->root, ip, txn, 0, &pg)) != 0)
+		goto err;
+	if (LEVEL(pg) == LEAFLEVEL) {
+		ret = __memp_fput(dbp->mpf, ip, pg, dbp->priority);
+		goto err;
+	}
+	if ((ret = __memp_fput(dbp->mpf, ip, pg, dbp->priority)) != 0)
+		goto err;
+
+	pgno = PGNO_INVALID;
+	do {
+		if ((ret = __bam_csearch(dbc, &start, sflag, level)) != 0) {
+			/* No more at this level, go up one. */
+			if (ret == DB_NOTFOUND) {
+				level++;
+				if (start.data != NULL)
+					__os_free(dbp->env, start.data);
+				memset(&start, 0, sizeof(start));
+				sflag = CS_READ | CS_GETRECNO;
+				continue;
+			}
+			goto err;
+		}
+		c_data->compact_pages_examine++;
+
+		pg = cp->csp->page;
+		pgno = PGNO(pg);
+
+		sflag = CS_NEXT | CS_GETRECNO;
+		/* Grab info about the page and drop the stack. */
+		if (pgno != cp->root && (ret = __bam_savekey(dbc,
+		    pgno <= c_data->compact_truncate, &start)) != 0) {
+			if (ret == DB_LOCK_NOTGRANTED)
+				continue;
+			goto err;
+		}
+
+		if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+			goto err;
+		if (pgno == cp->root)
+			break;
+
+		if (pgno <= c_data->compact_truncate)
+			continue;
+
+		/* Get the meta page lock before latching interior nodes. */
+		if (!LOCK_ISSET(meta_lock) && (ret = __db_lget(dbc,
+		    0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+			goto err;
+
+		/* Reget the page with a write latch, and its parent too. */
+		if ((ret = __bam_csearch(dbc,
+		    &start, CS_PARENT | CS_GETRECNO, level)) != 0) {
+			if (ret == DB_NOTFOUND) {
+				ret = 0;
+			}
+			goto err;
+		}
+		pg = cp->csp->page;
+		pgno = PGNO(pg);
+
+		if (pgno > c_data->compact_truncate) {
+			if ((ret = __bam_truncate_page(dbc, &pg, NULL, 1)) != 0)
+				goto err;
+			if (pgno == PGNO(pg)) {
+				/* We could not allocate.  Give up. 
*/ + pgno = cp->root; + } + } + + if ((ret = __bam_stkrel(dbc, + pgno > c_data->compact_truncate ? 0 : STK_NOLOCK)) != 0) + goto err; + + /* We are locking subtrees, so drop the write locks asap. */ + if (local_txn && pgno > c_data->compact_truncate) + break; + } while (pgno != cp->root); + + if ((ret = __dbc_close(dbc)) != 0) + goto err; + dbc = NULL; + if (local_txn) { + if ((ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0) + goto err; + txn = NULL; + LOCK_INIT(meta_lock); + } + if (pgno != ((BTREE *)dbp->bt_internal)->bt_root) + goto new_txn; + +err: if (txn != NULL && ret != 0) + sflag = STK_PGONLY; + else + sflag = 0; + if (txn == NULL) + if ((t_ret = __LPUT(dbc, meta_lock)) != 0 && ret == 0) + ret = t_ret; + if (dbc != NULL && (t_ret = __bam_stkrel(dbc, sflag)) != 0 && ret == 0) + ret = t_ret; + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + if (local_txn && + txn != NULL && (t_ret = __txn_abort(txn)) != 0 && ret == 0) + ret = t_ret; + if (start.data != NULL) + __os_free(dbp->env, start.data); + return (ret); +} + +static int +__bam_setup_freelist(dbp, list, nelems) + DB *dbp; + db_pglist_t *list; + u_int32_t nelems; +{ + DB_MPOOLFILE *mpf; + db_pgno_t *plist; + int ret; + + mpf = dbp->mpf; + + if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0) + return (ret); + + while (nelems-- != 0) + *plist++ = list++->pgno; + + return (0); +} + +static int +__bam_free_freelist(dbp, ip, txn) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; +{ + DBC *dbc; + DB_LOCK lock; + int auto_commit, ret, t_ret; + + LOCK_INIT(lock); + auto_commit = ret = 0; + + /* + * If we are not in a transaction then we need to get + * a lock on the meta page, otherwise we should already + * have the lock. + */ + + dbc = NULL; + if (IS_DB_AUTO_COMMIT(dbp, txn)) { + /* + * We must not timeout the lock or we will not free the list. + * We ignore errors from txn_begin as there is little that + * the application can do with the error and we want to + * get the lock and free the list if at all possible. + */ + if (__txn_begin(dbp->env, ip, NULL, &txn, 0) == 0) { + (void)__lock_set_timeout(dbp->env, + txn->locker, 0, DB_SET_TXN_TIMEOUT); + (void)__lock_set_timeout(dbp->env, + txn->locker, 0, DB_SET_LOCK_TIMEOUT); + auto_commit = 1; + } + /* Get a cursor so we can call __db_lget. */ + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + return (ret); + + if ((ret = __db_lget(dbc, + 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0) + goto err; + } + + ret = __memp_free_freelist(dbp->mpf); + +err: if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + if (auto_commit && __txn_abort(txn) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} +#endif diff --git a/btree/bt_compare.c b/btree/bt_compare.c new file mode 100644 index 0000000..bc340f2 --- /dev/null +++ b/btree/bt_compare.c @@ -0,0 +1,213 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" + +/* + * __bam_cmp -- + * Compare a key to a given record. + * + * PUBLIC: int __bam_cmp __P((DBC *, const DBT *, PAGE *, u_int32_t, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + */ +int +__bam_cmp(dbc, dbt, h, indx, func, cmpp) + DBC *dbc; + const DBT *dbt; + PAGE *h; + u_int32_t indx; + int (*func)__P((DB *, const DBT *, const DBT *)); + int *cmpp; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + DB *dbp; + DBT pg_dbt; + + dbp = dbc->dbp; + + /* + * Returns: + * < 0 if dbt is < page record + * = 0 if dbt is = page record + * > 0 if dbt is > page record + * + * !!! + * We do not clear the pg_dbt DBT even though it's likely to contain + * random bits. That should be okay, because the app's comparison + * routine had better not be looking at fields other than data, size + * and app_data. We don't clear it because we go through this path a + * lot and it's expensive. + */ + switch (TYPE(h)) { + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + bk = GET_BKEYDATA(dbp, h, indx); + if (B_TYPE(bk->type) == B_OVERFLOW) + bo = (BOVERFLOW *)bk; + else { + pg_dbt.app_data = NULL; + pg_dbt.data = bk->data; + pg_dbt.size = bk->len; + *cmpp = func(dbp, dbt, &pg_dbt); + return (0); + } + break; + case P_IBTREE: + /* + * The following code guarantees that the left-most key on an + * internal page at any place in the tree sorts less than any + * user-specified key. The reason is that if we have reached + * this internal page, we know the user key must sort greater + * than the key we're storing for this page in any internal + * pages at levels above us in the tree. It then follows that + * any user-specified key cannot sort less than the first page + * which we reference, and so there's no reason to call the + * comparison routine. While this may save us a comparison + * routine call or two, the real reason for this is because + * we don't maintain a copy of the smallest key in the tree, + * so that we don't have to update all the levels of the tree + * should the application store a new smallest key. 
And, so, + * we may not have a key to compare, which makes doing the + * comparison difficult and error prone. + */ + if (indx == 0) { + *cmpp = 1; + return (0); + } + + bi = GET_BINTERNAL(dbp, h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW) + bo = (BOVERFLOW *)(bi->data); + else { + pg_dbt.app_data = NULL; + pg_dbt.data = bi->data; + pg_dbt.size = bi->len; + *cmpp = func(dbp, dbt, &pg_dbt); + return (0); + } + break; + default: + return (__db_pgfmt(dbp->env, PGNO(h))); + } + + /* + * Overflow. + */ + return (__db_moff(dbc, dbt, bo->pgno, bo->tlen, + func == __bam_defcmp ? NULL : func, cmpp)); +} + +/* + * __bam_defcmp -- + * Default comparison routine. + * + * PUBLIC: int __bam_defcmp __P((DB *, const DBT *, const DBT *)); + */ +int +__bam_defcmp(dbp, a, b) + DB *dbp; + const DBT *a, *b; +{ + size_t len; + u_int8_t *p1, *p2; + + COMPQUIET(dbp, NULL); + + /* + * Returns: + * < 0 if a is < b + * = 0 if a is = b + * > 0 if a is > b + * + * XXX + * If a size_t doesn't fit into a long, or if the difference between + * any two characters doesn't fit into an int, this routine can lose. + * What we need is a signed integral type that's guaranteed to be at + * least as large as a size_t, and there is no such thing. + */ + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) + if (*p1 != *p2) + return ((long)*p1 - (long)*p2); + return ((long)a->size - (long)b->size); +} + +/* + * __bam_defpfx -- + * Default prefix routine. + * + * PUBLIC: size_t __bam_defpfx __P((DB *, const DBT *, const DBT *)); + */ +size_t +__bam_defpfx(dbp, a, b) + DB *dbp; + const DBT *a, *b; +{ + size_t cnt, len; + u_int8_t *p1, *p2; + + COMPQUIET(dbp, NULL); + + cnt = 1; + len = a->size > b->size ? b->size : a->size; + for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) + if (*p1 != *p2) + return (cnt); + + /* + * They match up to the smaller of the two sizes. + * Collate the longer after the shorter. + */ + if (a->size < b->size) + return (a->size + 1); + if (b->size < a->size) + return (b->size + 1); + return (b->size); +} diff --git a/btree/bt_compress.c b/btree/bt_compress.c new file mode 100644 index 0000000..bdf1e17 --- /dev/null +++ b/btree/bt_compress.c @@ -0,0 +1,3024 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. 
+ */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" + +#ifdef HAVE_COMPRESSION + +static int __bam_compress_marshal_data __P((DB *, const DBT *, DBT *)); +static int __bam_compress_set_dbt __P((DB *, DBT *, const void *, u_int32_t)); +static int __bamc_compress_del_and_get_next __P((DBC *, DBT *, DBT *)); +static int __bamc_compress_get_bothc __P((DBC *, DBT *, u_int32_t)); +static int __bamc_compress_get_multiple_key __P((DBC *, DBT *, u_int32_t)); +static int __bamc_compress_get_multiple __P((DBC *, DBT *, DBT *,u_int32_t)); +static int __bamc_compress_get_next __P((DBC *, u_int32_t)); +static int __bamc_compress_get_next_dup __P((DBC *, DBT *, u_int32_t)); +static int __bamc_compress_get_next_nodup __P((DBC *, u_int32_t)); +static int __bamc_compress_get_prev __P((DBC *, u_int32_t)); +static int __bamc_compress_get_prev_dup __P((DBC *, u_int32_t)); +static int __bamc_compress_get_prev_nodup __P((DBC *, u_int32_t)); +static int __bamc_compress_get_set __P((DBC *, + DBT *, DBT *, u_int32_t, u_int32_t)); +static int __bamc_compress_ibulk_del __P((DBC *, DBT *, u_int32_t)); +static int __bamc_compress_idel __P((DBC *, u_int32_t)); +static int __bamc_compress_iget __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __bamc_compress_iput __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __bamc_compress_relocate __P((DBC *)); +static void __bamc_compress_reset __P((DBC *)); +static int __bamc_compress_seek __P((DBC *, + const DBT *, const DBT *, u_int32_t)); +static int __bamc_compress_store __P((DBC *, + DBT *, DBT*, DBT **, DBT **, DBT *, DBT *)); +static int __bamc_next_decompress __P((DBC *)); +static int __bamc_start_decompress __P((DBC *)); + +/* + * Call __dbc_iget(), resizing DBTs if DB_BUFFER_SMALL is returned. + * We're always using a transient cursor when this macro is used, so + * we have to replace the OP with DB_CURRENT when we retry. + */ +#define CMP_IGET_RETRY(ret, dbc, dbt1, dbt2, flags) do { \ + DB_ASSERT((dbc)->env, F_ISSET((dbt1), DB_DBT_USERMEM)); \ + DB_ASSERT((dbc)->env, F_ISSET((dbt2), DB_DBT_USERMEM)); \ + if (((ret) =__dbc_iget((dbc), \ + (dbt1), (dbt2), (flags))) == DB_BUFFER_SMALL) { \ + if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt1))) != 0) \ + break; \ + if ((CMP_RESIZE_DBT((ret), (dbc)->env, (dbt2))) != 0) \ + break; \ + (ret) = __dbc_iget((dbc), (dbt1), (dbt2), \ + ((flags) & ~DB_OPFLAGS_MASK) | DB_CURRENT); \ + } \ +} while (0) + +#define CMP_INIT_DBT(dbt) do { \ + (dbt)->data = NULL; \ + (dbt)->size = 0; \ + (dbt)->ulen = 0; \ + (dbt)->doff = 0; \ + (dbt)->dlen = 0; \ + (dbt)->flags = DB_DBT_USERMEM; \ + (dbt)->app_data = NULL; \ +} while (0) + +#define CMP_FREE_DBT(env, dbt) do { \ + DB_ASSERT((env), F_ISSET((dbt), DB_DBT_USERMEM)); \ + __os_free((env), (dbt)->data); \ +} while (0) + +#define CMP_RESIZE_DBT(ret, env, dbt) \ + (((dbt)->size > (dbt)->ulen) ? \ + ((((ret) = __os_realloc((env), (dbt)->size, &(dbt)->data)) \ + != 0) ? (ret) : (((dbt)->ulen = (dbt)->size), 0)) : 0) + +static int +__bam_compress_set_dbt(dbp, dbt, data, size) + DB *dbp; + DBT *dbt; + const void *data; + u_int32_t size; +{ + int ret; + + ret = 0; + DB_ASSERT(dbp->env, F_ISSET(dbt, DB_DBT_USERMEM)); + + dbt->size = size; + if (CMP_RESIZE_DBT(ret, dbp->env, dbt) != 0) + return (ret); + + memcpy(dbt->data, data, size); + return (0); +} + +/******************************************************************************/ + +/* + * Very simple key/data stream to give __bamc_compress_merge_insert() + * a source of data to work on. 
*/
+struct __bam_compress_stream;
+typedef struct __bam_compress_stream BTREE_COMPRESS_STREAM;
+struct __bam_compress_stream
+{
+	int (*next)(BTREE_COMPRESS_STREAM *, DBT *, DBT *);
+
+	void *kptr, *dptr;
+	DBT *key, *data;
+};
+
+/*
+ * These function prototypes cannot go at the beginning because they rely
+ * on BTREE_COMPRESS_STREAM defined above.
+ * The prototypes are required to avoid the Microsoft C++ compiler generating
+ * warnings about mismatching parameter lists.
+ */
+static int __bam_cs_next_done __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single
+    __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_single_keyonly_next
+    __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_single_keyonly
+    __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_key_next
+    __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_key __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bam_cs_multiple_next __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple
+    __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static int __bam_cs_multiple_keyonly_next
+    __P((BTREE_COMPRESS_STREAM *, DBT *, DBT *));
+static void __bam_cs_create_multiple_keyonly
+    __P((BTREE_COMPRESS_STREAM *, DBT *));
+static int __bamc_compress_merge_insert
+    __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *, u_int32_t));
+static int __bamc_compress_merge_delete
+    __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+static int __bamc_compress_merge_delete_dups
+    __P((DBC *, BTREE_COMPRESS_STREAM *, u_int32_t *));
+
+/* BTREE_COMPRESS_STREAM->next() for when the data has finished. */
+static int
+__bam_cs_next_done(stream, key, data)
+	BTREE_COMPRESS_STREAM *stream;
+	DBT *key, *data;
+{
+	COMPQUIET(stream, NULL);
+	COMPQUIET(key, NULL);
+	COMPQUIET(data, NULL);
+	return (0);
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key/data pair. */
+static int
+__bam_cs_single_next(stream, key, data)
+	BTREE_COMPRESS_STREAM *stream;
+	DBT *key, *data;
+{
+	key->data = stream->key->data;
+	key->size = stream->key->size;
+	data->data = stream->data->data;
+	data->size = stream->data->size;
+	stream->next = __bam_cs_next_done;
+	return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key/data pair. */
+static void
+__bam_cs_create_single(stream, key, data)
+	BTREE_COMPRESS_STREAM *stream;
+	DBT *key, *data;
+{
+	stream->next = __bam_cs_single_next;
+	stream->key = key;
+	stream->data = data;
+}
+
+/* BTREE_COMPRESS_STREAM->next() for a single key. */
+static int
+__bam_cs_single_keyonly_next(stream, key, data)
+	BTREE_COMPRESS_STREAM *stream;
+	DBT *key, *data;
+{
+	key->data = stream->key->data;
+	key->size = stream->key->size;
+	if (data != NULL) {
+		data->data = NULL;
+		data->size = 0;
+	}
+	stream->next = __bam_cs_next_done;
+	return (1);
+}
+
+/* Create a BTREE_COMPRESS_STREAM for a single key. */
+static void
+__bam_cs_create_single_keyonly(stream, key)
+	BTREE_COMPRESS_STREAM *stream;
+	DBT *key;
+{
+	stream->next = __bam_cs_single_keyonly_next;
+	stream->key = key;
+}
+
+/*
+ * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE_KEY
+ * format. 
+ */ +static int +__bam_cs_multiple_key_next(stream, key, data) + BTREE_COMPRESS_STREAM *stream; + DBT *key, *data; +{ + DB_MULTIPLE_KEY_NEXT(stream->kptr, stream->key, key->data, key->size, + data->data, data->size); + if (key->data == NULL) { + stream->next = __bam_cs_next_done; + return (0); + } + return (1); +} + +/* + * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE_KEY + * format. + */ +static void +__bam_cs_create_multiple_key(stream, multiple) + BTREE_COMPRESS_STREAM *stream; + DBT *multiple; +{ + stream->next = __bam_cs_multiple_key_next; + stream->key = multiple; + DB_MULTIPLE_INIT(stream->kptr, stream->key); +} + +/* BTREE_COMPRESS_STREAM->next() for two buffers in the DB_MULTIPLE format. */ +static int +__bam_cs_multiple_next(stream, key, data) + BTREE_COMPRESS_STREAM *stream; + DBT *key, *data; +{ + DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size); + DB_MULTIPLE_NEXT(stream->dptr, stream->data, data->data, data->size); + if (key->data == NULL || data->data == NULL) { + stream->next = __bam_cs_next_done; + return (0); + } + return (1); +} + +/* Create a BTREE_COMPRESS_STREAM for two buffers in the DB_MULTIPLE format. */ +static void +__bam_cs_create_multiple(stream, key, data) + BTREE_COMPRESS_STREAM *stream; + DBT *key, *data; +{ + stream->next = __bam_cs_multiple_next; + stream->key = key; + stream->data = data; + DB_MULTIPLE_INIT(stream->kptr, stream->key); + DB_MULTIPLE_INIT(stream->dptr, stream->data); +} + +/* + * BTREE_COMPRESS_STREAM->next() for a single buffer in the DB_MULTIPLE + * format. + */ +static int +__bam_cs_multiple_keyonly_next(stream, key, data) + BTREE_COMPRESS_STREAM *stream; + DBT *key, *data; +{ + DB_MULTIPLE_NEXT(stream->kptr, stream->key, key->data, key->size); + if (key->data == NULL) { + stream->next = __bam_cs_next_done; + return (0); + } + if (data != NULL) { + data->data = NULL; + data->size = 0; + } + return (1); +} + +/* + * Create a BTREE_COMPRESS_STREAM for a single buffer in the DB_MULTIPLE + * format. + */ +static void +__bam_cs_create_multiple_keyonly(stream, key) + BTREE_COMPRESS_STREAM *stream; + DBT *key; +{ + stream->next = __bam_cs_multiple_keyonly_next; + stream->key = key; + DB_MULTIPLE_INIT(stream->kptr, stream->key); +} + +/******************************************************************************/ + +/* + * Marshal data in initial data format into destbuf, resizing destbuf if + * necessary. + */ +static int +__bam_compress_marshal_data(dbp, data, destbuf) + DB *dbp; + const DBT *data; + DBT *destbuf; +{ + int ret; + u_int8_t *ptr; + + ret = 0; + DB_ASSERT(dbp->env, F_ISSET(destbuf, DB_DBT_USERMEM)); + + destbuf->size = __db_compress_count_int(data->size); + destbuf->size += data->size; + if (CMP_RESIZE_DBT(ret, dbp->env, destbuf) != 0) + return (ret); + + ptr = (u_int8_t*)destbuf->data; + ptr += __db_compress_int(ptr, data->size); + memcpy(ptr, data->data, data->size); + + return (0); +} + +/* + * Unmarshal initial data from source into data - does not copy, points + * into source. + */ +#define CMP_UNMARSHAL_DATA(src, dest) do { \ + (dest)->data = ((u_int8_t*)(src)->data) + \ + __db_decompress_int32((u_int8_t*)(src)->data, \ + &(dest)->size); \ +} while (0) + +/******************************************************************************/ + +/* + * __bam_compress_dupcmp -- + * Duplicate comparison function for compressed BTrees. 
+ * + * PUBLIC: int __bam_compress_dupcmp __P((DB *, const DBT *, const DBT *)); + */ +int +__bam_compress_dupcmp(db, a, b) + DB *db; + const DBT *a; + const DBT *b; +{ + DBT dcmp_a, dcmp_b; + + /* Decompress the initial data in a */ + CMP_UNMARSHAL_DATA(a, &dcmp_a); + dcmp_a.ulen = 0; + dcmp_a.doff = 0; + dcmp_a.dlen = 0; + dcmp_a.flags = 0; + dcmp_a.app_data = 0; + + /* Decompress the initial data in b */ + CMP_UNMARSHAL_DATA(b, &dcmp_b); + dcmp_b.ulen = 0; + dcmp_b.doff = 0; + dcmp_b.dlen = 0; + dcmp_b.flags = 0; + dcmp_b.app_data = 0; + + /* Call the user's duplicate compare function */ + return ((BTREE *)db->bt_internal)-> + compress_dup_compare(db, &dcmp_a, &dcmp_b); +} + +/* + * __bam_defcompress -- + * Default compression routine. + * + * PUBLIC: int __bam_defcompress __P((DB *, const DBT *, const DBT *, + * PUBLIC: const DBT *, const DBT *, DBT *)); + */ +int +__bam_defcompress(dbp, prevKey, prevData, key, data, dest) + DB *dbp; + const DBT *prevKey, *prevData, *key, *data; + DBT *dest; +{ + u_int8_t *ptr; + const u_int8_t *k, *p; + size_t len, prefix, suffix; + + COMPQUIET(dbp, NULL); + + k = (const u_int8_t*)key->data; + p = (const u_int8_t*)prevKey->data; + len = key->size > prevKey->size ? prevKey->size : key->size; + for (; len-- && *k == *p; ++k, ++p) + continue; + + prefix = (size_t)(k - (u_int8_t*)key->data); + suffix = key->size - prefix; + + if (prefix == prevKey->size && suffix == 0) { + /* It's a duplicate - do prefix compression on the value */ + k = (const u_int8_t*)data->data; + p = (const u_int8_t*)prevData->data; + len = data->size > prevData->size ? prevData->size : data->size; + for (; len-- && *k == *p; ++k, ++p) + continue; + + prefix = (size_t)(k - (u_int8_t*)data->data); + suffix = data->size - prefix; + + /* Check that we have enough space in dest */ + dest->size = (u_int32_t)(1 + __db_compress_count_int(prefix) + + __db_compress_count_int(suffix) + suffix); + if (dest->size > dest->ulen) + return (DB_BUFFER_SMALL); + + /* Magic identifying byte */ + ptr = (u_int8_t*)dest->data; + *ptr = CMP_INT_SPARE_VAL; + ++ptr; + + /* prefix length */ + ptr += __db_compress_int(ptr, prefix); + + /* suffix length */ + ptr += __db_compress_int(ptr, suffix); + + /* suffix */ + memcpy(ptr, k, suffix); + + return (0); + } + + /* Check that we have enough space in dest */ + dest->size = (u_int32_t)(__db_compress_count_int(prefix) + + __db_compress_count_int(suffix) + + __db_compress_count_int(data->size) + suffix + data->size); + if (dest->size > dest->ulen) + return (DB_BUFFER_SMALL); + + /* prefix length */ + ptr = (u_int8_t*)dest->data; + ptr += __db_compress_int(ptr, prefix); + + /* suffix length */ + ptr += __db_compress_int(ptr, suffix); + + /* data length */ + ptr += __db_compress_int(ptr, data->size); + + /* suffix */ + memcpy(ptr, k, suffix); + ptr += suffix; + + /* data */ + memcpy(ptr, data->data, data->size); + + return (0); +} + +/* + * __bam_defdecompress -- + * Default decompression routine. + * + * PUBLIC: int __bam_defdecompress __P((DB *, const DBT *, const DBT *, DBT *, + * PUBLIC: DBT *, DBT *)); + */ +int +__bam_defdecompress(dbp, prevKey, prevData, compressed, destKey, destData) + DB *dbp; + const DBT *prevKey, *prevData; + DBT *compressed, *destKey, *destData; +{ + u_int8_t *s, *d; + u_int32_t prefix, suffix, size; + + COMPQUIET(dbp, NULL); + + /* + * Check for the magic identifying byte, that tells us that this is a + * compressed duplicate value. 
+ */ + s = (u_int8_t*)compressed->data; + if (*s == CMP_INT_SPARE_VAL) { + ++s; + size = 1; + + /* Unmarshal prefix and suffix */ + size += __db_decompress_count_int(s); + if (size > compressed->size) + return (EINVAL); + s += __db_decompress_int32(s, &prefix); + + size += __db_decompress_count_int(s); + if (size > compressed->size) + return (EINVAL); + s += __db_decompress_int32(s, &suffix); + + /* Check destination lengths */ + destKey->size = prevKey->size; + destData->size = prefix + suffix; + if (destKey->size > destKey->ulen || + destData->size > destData->ulen) + return (DB_BUFFER_SMALL); + + /* Write the key */ + memcpy(destKey->data, prevKey->data, destKey->size); + + /* Write the prefix */ + if (prefix > prevData->size) + return (EINVAL); + d = (u_int8_t*)destData->data; + memcpy(d, prevData->data, prefix); + d += prefix; + + /* Write the suffix */ + size += suffix; + if (size > compressed->size) + return (EINVAL); + memcpy(d, s, suffix); + s += suffix; + + /* Return bytes read */ + compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data); + return (0); + } + + /* Unmarshal prefix, suffix and data length */ + size = __db_decompress_count_int(s); + if (size > compressed->size) + return (EINVAL); + s += __db_decompress_int32(s, &prefix); + + size += __db_decompress_count_int(s); + if (size > compressed->size) + return (EINVAL); + s += __db_decompress_int32(s, &suffix); + + size += __db_decompress_count_int(s); + if (size > compressed->size) + return (EINVAL); + s += __db_decompress_int32(s, &destData->size); + + /* Check destination lengths */ + destKey->size = prefix + suffix; + if (destKey->size > destKey->ulen || destData->size > destData->ulen) + return (DB_BUFFER_SMALL); + + /* Write the prefix */ + if (prefix > prevKey->size) + return (EINVAL); + d = (u_int8_t*)destKey->data; + memcpy(d, prevKey->data, prefix); + d += prefix; + + /* Write the suffix */ + size += suffix; + if (size > compressed->size) + return (EINVAL); + memcpy(d, s, suffix); + s += suffix; + + /* Write the data */ + size += destData->size; + if (size > compressed->size) + return (EINVAL); + memcpy(destData->data, s, destData->size); + s += destData->size; + + /* Return bytes read */ + compressed->size = (u_int32_t)(s - (u_int8_t*)compressed->data); + return (0); +} + +/******************************************************************************/ + +/* + * Set dbc up to start decompressing the compressed key/data pair, dbc->key1 + * and dbc->compressed. + */ +static int +__bamc_start_decompress(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + int ret; + u_int32_t datasize; + + cp = (BTREE_CURSOR *)dbc->internal; + + cp->prevKey = NULL; + cp->prevData = NULL; + cp->currentKey = &cp->key1; + cp->currentData = &cp->data1; + cp->compcursor = (u_int8_t*)cp->compressed.data; + cp->compend = cp->compcursor + cp->compressed.size; + cp->prevcursor = NULL; + cp->prev2cursor = NULL; + + /* Unmarshal the first data */ + cp->compcursor += __db_decompress_int32(cp->compcursor, &datasize); + ret = __bam_compress_set_dbt(dbc->dbp, + cp->currentData, cp->compcursor, datasize); + + if (ret == 0) + cp->compcursor += datasize; + return (ret); +} + +/* Decompress the next key/data pair from dbc->compressed. 
*/ +static int +__bamc_next_decompress(dbc) + DBC *dbc; +{ + DBT compressed; + int ret; + BTREE_CURSOR *cp; + DB *db; + + ret = 0; + cp = (BTREE_CURSOR *)dbc->internal; + db = dbc->dbp; + + if (cp->compcursor >= cp->compend) + return (DB_NOTFOUND); + + cp->prevKey = cp->currentKey; + cp->prevData = cp->currentData; + cp->prev2cursor = cp->prevcursor; + cp->prevcursor = cp->compcursor; + + if (cp->currentKey == &cp->key1) { + cp->currentKey = &cp->key2; + cp->currentData = &cp->data2; + } else { + cp->currentKey = &cp->key1; + cp->currentData = &cp->data1; + } + + compressed.flags = DB_DBT_USERMEM; + compressed.data = (void*)cp->compcursor; + compressed.ulen = compressed.size = + (u_int32_t)(cp->compend - cp->compcursor); + compressed.app_data = NULL; + + while ((ret = ((BTREE *)db->bt_internal)->bt_decompress(db, + cp->prevKey, cp->prevData, &compressed, + cp->currentKey, cp->currentData)) == DB_BUFFER_SMALL) { + if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentKey) != 0) + break; + if (CMP_RESIZE_DBT(ret, dbc->env, cp->currentData) != 0) + break; + } + + if (ret == 0) + cp->compcursor += compressed.size; + return (ret); +} + +/* + * Store key and data into destkey and destbuf, using the compression + * callback given. + */ +static int +__bamc_compress_store(dbc, key, data, prevKey, prevData, destkey, destbuf) + DBC *dbc; + DBT *key, *data; + DBT **prevKey, **prevData; + DBT *destkey, *destbuf; +{ + int ret; + DBT dest; + + if (*prevKey == 0) { + if ((ret = __bam_compress_set_dbt(dbc->dbp, + destkey, key->data, key->size)) != 0) + return (ret); + + /* Marshal data - resize if it won't fit */ + ret = __bam_compress_marshal_data(dbc->dbp, data, destbuf); + + } else if (((BTREE_CURSOR *)dbc->internal)->ovflsize > destbuf->size) { + /* + * Don't write more than cp->ovflsize bytes to the destination + * buffer - destbuf must be at least cp->ovflsize in size. + */ + dest.flags = DB_DBT_USERMEM; + dest.data = (u_int8_t*)destbuf->data + destbuf->size; + dest.ulen = + ((BTREE_CURSOR *)dbc->internal)->ovflsize - destbuf->size; + dest.size = 0; + dest.app_data = NULL; + + ret = ((BTREE *)dbc->dbp->bt_internal)->bt_compress( + dbc->dbp, *prevKey, *prevData, key, data, &dest); + + if (ret == 0) + destbuf->size += dest.size; + } else + ret = DB_BUFFER_SMALL; + + if (ret == 0) { + *prevKey = key; + *prevData = data; + } + + return (ret); +} + +/* + * Move dbc->dbc to the correct position to start linear searching for + * seek_key/seek_data - the biggest key smaller than or equal to + * seek_key/seek_data. + */ +static int +__bamc_compress_seek(dbc, seek_key, seek_data, flags) + DBC *dbc; + const DBT *seek_key; + const DBT *seek_data; + u_int32_t flags; +{ + int ret; + u_int32_t method; + DB *dbp; + BTREE_CURSOR *cp; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + if ((ret = __bam_compress_set_dbt( + dbp, &cp->key1, seek_key->data, seek_key->size)) != 0) + return (ret); + + /* + * We allow seek_data to be 0 for __bamc_compress_get_set() with + * DB_SET + */ + if (F_ISSET(dbp, DB_AM_DUPSORT) && seek_data != NULL) { + if ((ret = __bam_compress_marshal_data( + dbp, seek_data, &cp->compressed)) != 0) + return (ret); + + method = DB_GET_BOTH_LTE; + } else + method = DB_SET_LTE; + + CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, method | flags); + + if (ret == 0 && + F_ISSET(dbp, DB_AM_DUPSORT) && seek_data == NULL && + __db_compare_both(dbp, seek_key, 0, &cp->key1, 0) == 0) { + /* + * Some entries for seek_key might be in the previous chunk, + * so we need to start searching there. 
+ */ + CMP_IGET_RETRY(ret, + dbc, &cp->key1, &cp->compressed, DB_PREV | flags); + if (ret == DB_NOTFOUND) { + /* No previous, we must need the first entry */ + CMP_IGET_RETRY(ret, + dbc, &cp->key1, &cp->compressed, DB_FIRST | flags); + } + } + + return (ret); +} + +/* Reset the cursor to an uninitialized state */ +static void +__bamc_compress_reset(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + + cp = (BTREE_CURSOR *)dbc->internal; + + cp->prevKey = 0; + cp->prevData = 0; + cp->currentKey = 0; + cp->currentData = 0; + cp->compcursor = 0; + cp->compend = 0; + cp->prevcursor = 0; + cp->prev2cursor = 0; + + F_CLR(cp, C_COMPRESS_DELETED|C_COMPRESS_MODIFIED); +} + +/* + * Duplicate the cursor and delete the current entry, move the original cursor + * on and then close the cursor we used to delete. We do that to make sure that + * the close method runs __bamc_physdel(), and actually gets rid of the deleted + * entry! + */ +static int +__bamc_compress_del_and_get_next(dbc, nextk, nextc) + DBC *dbc; + DBT *nextk, *nextc; +{ + int ret, ret_n; + DBC *dbc_n; + + if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION | DB_SHALLOW_DUP)) != 0) + return (ret); + F_SET(dbc_n, DBC_TRANSIENT); + + if ((ret = __dbc_idel(dbc_n, 0)) != 0) + goto err; + + /* Read the next position */ + CMP_IGET_RETRY(ret, dbc, nextk, nextc, DB_NEXT); + + err: + if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0) + ret = ret_n; + + /* No need to relocate this cursor */ + F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED); + + return (ret); +} + +/* + * Duplicate the cursor, re-locate the position that this cursor pointed to + * using the duplicate (it may have been deleted), and then swap + * the cursors. We do that to make sure that the close method runs + * __bamc_physdel(), and gets rid of the entry that may have been deleted. + */ +static int +__bamc_compress_relocate(dbc) + DBC *dbc; +{ + int ret, t_ret; + BTREE_CURSOR *cp, *cp_n; + DBC *dbc_n; + + cp = (BTREE_CURSOR *)dbc->internal; + + if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0) + return (ret); + F_SET(dbc_n, DBC_TRANSIENT); + + cp_n = (BTREE_CURSOR *)dbc_n->internal; + + if (F_ISSET(cp, C_COMPRESS_DELETED)) { + /* Find the position after the deleted entry again */ + ret = __bamc_compress_get_set( + dbc_n, &cp->del_key, &cp->del_data, 0, 0); + if (ret == DB_NOTFOUND) { + __bamc_compress_reset(dbc_n); + ret = 0; + } else if (ret != 0) + goto err; + + F_SET(cp_n, C_COMPRESS_DELETED); + + } else if (cp->currentKey != NULL) { + /* Find the current entry again */ + ret = __bamc_compress_get_set( + dbc_n, cp->currentKey, cp->currentData, + F_ISSET(dbc->dbp, DB_AM_DUPSORT) ? DB_GET_BOTH : DB_SET, 0); + + if (ret == DB_NOTFOUND) { + /* The current entry has been deleted */ + if ((ret = __bam_compress_set_dbt(dbc_n->dbp, + &cp_n->del_key, + cp->currentKey->data, cp->currentKey->size)) != 0) + return (ret); + if ((ret = __bam_compress_set_dbt(dbc_n->dbp, + &cp_n->del_data, cp->currentData->data, + cp->currentData->size)) != 0) + return (ret); + F_SET(cp_n, C_COMPRESS_DELETED); + ret = 0; + } else if (ret != 0) + goto err; + } + + err: + /* Cleanup and cursor resolution. This also clears the + C_COMPRESS_MODIFIED flag. 
*/ + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/******************************************************************************/ + +#define CMP_STORE(key, data) do { \ + while ((ret = __bamc_compress_store(dbc, (key), (data), \ + &prevDestKey, &prevDestData, &destkey, &destbuf)) \ + == DB_BUFFER_SMALL) { \ + if ((ret = __dbc_iput(dbc, \ + &destkey, &destbuf, DB_KEYLAST)) != 0) \ + goto end; \ + prevDestKey = NULL; \ + prevDestData = NULL; \ + destbuf.size = 0; \ + } \ +} while (0) + +/* Merge the sorted key/data pairs from stream into the compressed database. */ +static int +__bamc_compress_merge_insert(dbc, stream, countp, flags) + DBC *dbc; + BTREE_COMPRESS_STREAM *stream; + u_int32_t *countp; + u_int32_t flags; +{ + DBT ikey1, ikey2, idata1, idata2, nextk, nextc, nextd, destkey, destbuf; + DBT *ikey, *idata, *prevIkey, *prevIdata, *prevDestKey, *prevDestData; + int ret, bulk_ret, cmp, nextExists, moreCompressed, iSmallEnough; + int moreStream; + u_int32_t chunk_count; + ENV *env; + BTREE_CURSOR *cp; + DB *dbp; + + env = dbc->env; + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + bulk_ret = 0; + + memset(&ikey1, 0, sizeof(DBT)); + memset(&ikey2, 0, sizeof(DBT)); + memset(&idata1, 0, sizeof(DBT)); + memset(&idata2, 0, sizeof(DBT)); + + CMP_INIT_DBT(&nextk); + CMP_INIT_DBT(&nextc); + memset(&nextd, 0, sizeof(DBT)); + + CMP_INIT_DBT(&destkey); + CMP_INIT_DBT(&destbuf); + if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0) + goto end; + destbuf.ulen = cp->ovflsize; + + if (countp != NULL) + *countp = 0; + chunk_count = 0; + + /* Get the first input key and data */ + ret = 0; + prevIkey = NULL; + prevIdata = NULL; + ikey = &ikey1; + idata = &idata1; + if (stream->next(stream, ikey, idata) == 0) + goto end; + + prevDestKey = NULL; + prevDestData = NULL; + + moreStream = 1; + while (moreStream != 0) { + nextExists = 1; + moreCompressed = 1; + + /* Seek the ikey/idata position */ + ret = __bamc_compress_seek(dbc, ikey, idata, 0); + if (ret == 0) { + /* + * Delete the key - we might overwrite it below + * but it's safer to just always delete it, and it + * doesn't seem significantly slower to do so. + */ + ret = __bamc_compress_del_and_get_next(dbc, &nextk, + &nextc); + if (ret == DB_NOTFOUND) { + ret = 0; + nextExists = 0; + } else if (ret == 0) { + CMP_UNMARSHAL_DATA(&nextc, &nextd); + } else + goto end; + ret = __bamc_start_decompress(dbc); + } else if (ret == DB_NOTFOUND) { + moreCompressed = 0; + + /* Read the next position */ + CMP_IGET_RETRY(ret, dbc, &nextk, &nextc, DB_FIRST); + if (ret == DB_NOTFOUND) { + ret = 0; + nextExists = 0; + } else if (ret == 0) { + CMP_UNMARSHAL_DATA(&nextc, &nextd); + } + } + + if (ret != 0) + goto end; + + /* !nextExists || ikey/idata < nextk/nextd */ + iSmallEnough = 1; + + while (moreCompressed != 0 || iSmallEnough != 0) { + if (moreCompressed == 0) + cmp = 1; + else if (iSmallEnough == 0) + cmp = -1; + else + cmp = __db_compare_both(dbp, cp->currentKey, + cp->currentData, ikey, idata); + + if (cmp < 0) { +store_current: CMP_STORE(cp->currentKey, cp->currentData); + if (ret != 0) + goto end; + } else { + switch (flags) { + case DB_KEYLAST: + case DB_KEYFIRST: + case DB_NODUPDATA: + if (cmp == 0 && bulk_ret == 0 && + F_ISSET(dbp, DB_AM_DUPSORT)) { + bulk_ret = __db_duperr(dbp, + flags); + + /* + * Continue until we store + * the current chunk, + * but don't insert any + * more entries. 
+ */ + moreStream = 0; + iSmallEnough = 0; + + goto store_current; + } + break; + default: + break; + } + + CMP_STORE(ikey, idata); + if (ret != 0) + goto end; + ++chunk_count; + + /* + * prevDestKey/prevDestData now point to + * the same DBTs as ikey/idata. We don't + * want to overwrite them, so swap them + * to point to the other DBTs. + */ + if (ikey == &ikey1) { + ikey = &ikey2; + idata = &idata2; + prevIkey = &ikey1; + prevIdata = &idata1; + } else { + ikey = &ikey1; + idata = &idata1; + prevIkey = &ikey2; + prevIdata = &idata2; + } + + do { + /* Get the next input key and data */ + if (stream->next( + stream, ikey, idata) == 0) { + moreStream = 0; + iSmallEnough = 0; + break; + } + +#ifdef DIAGNOSTIC + /* Check that the stream is sorted */ + DB_ASSERT(env, __db_compare_both(dbp, + ikey, idata, prevIkey, + prevIdata) >= 0); +#endif + + /* Check for duplicates in the stream */ + } while (__db_compare_both(dbp, ikey, idata, + prevIkey, prevIdata) == 0); + + /* + * Check that !nextExists || + * ikey/idata < nextk/nextd + */ + if (moreStream != 0 && nextExists != 0 && + __db_compare_both(dbp, ikey, + idata, &nextk, &nextd) >= 0) + iSmallEnough = 0; + } + + if (cmp <= 0) { + ret = __bamc_next_decompress(dbc); + if (ret == DB_NOTFOUND) { + moreCompressed = 0; + ret = 0; + } else if (ret != 0) + goto end; + + } + } + + if (prevDestKey != NULL) { + if ((ret = __dbc_iput( + dbc, &destkey, &destbuf, DB_KEYLAST)) != 0) + goto end; + + if (countp != NULL) + *countp += chunk_count; + chunk_count = 0; + + prevDestKey = NULL; + prevDestData = NULL; + destbuf.size = 0; + } + } + + end: + CMP_FREE_DBT(env, &destkey); + CMP_FREE_DBT(env, &destbuf); + CMP_FREE_DBT(env, &nextk); + CMP_FREE_DBT(env, &nextc); + + return (ret != 0 ? ret : bulk_ret); +} + +/******************************************************************************/ + +/* Remove the sorted key/data pairs in stream from the compressed database. */ +static int +__bamc_compress_merge_delete(dbc, stream, countp) + DBC *dbc; + BTREE_COMPRESS_STREAM *stream; + u_int32_t *countp; +{ + DBT ikey, idata, nextk, nextc, nextd, destkey, destbuf, pdestkey; + DBT pdestdata; +#ifdef DIAGNOSTIC + DBT pikey, pidata; +#endif + DBT *prevDestKey, *prevDestData; + int ret, bulk_ret, cmp, moreCompressed, moreStream, nextExists; + int iSmallEnough; + u_int32_t chunk_count; + ENV *env; + BTREE_CURSOR *cp; + DB *dbp; + + env = dbc->env; + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + bulk_ret = 0; + + memset(&ikey, 0, sizeof(DBT)); + memset(&idata, 0, sizeof(DBT)); + + CMP_INIT_DBT(&nextk); + CMP_INIT_DBT(&nextc); + memset(&nextd, 0, sizeof(DBT)); + + CMP_INIT_DBT(&pdestkey); + CMP_INIT_DBT(&pdestdata); + + CMP_INIT_DBT(&destkey); + CMP_INIT_DBT(&destbuf); + if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0) + goto end; + destbuf.ulen = cp->ovflsize; + + if (countp != NULL) + *countp = 0; + chunk_count = 0; + + /* Get the first input key and data */ + ret = 0; + if (stream->next(stream, &ikey, &idata) == 0) + goto end; + + prevDestKey = NULL; + prevDestData = NULL; + + moreStream = 1; + while (moreStream != 0) { + nextExists = 1; + moreCompressed = 1; + + /* Seek the ikey/idata position */ + if ((ret = __bamc_compress_seek(dbc, &ikey, &idata, 0)) != 0) + goto end; + + /* + * Delete the key - we might overwrite it below but it's safer + * to just always delete it, and it doesn't seem significantly + * slower to do so. 
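+	 * Deleting up front also means the merge loop below only ever
+	 * writes into a gap; it never has to update a chunk in place.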
+ */ + ret = __bamc_compress_del_and_get_next(dbc, &nextk, &nextc); + if (ret == DB_NOTFOUND) { + ret = 0; + nextExists = 0; + } else if (ret == 0) { + CMP_UNMARSHAL_DATA(&nextc, &nextd); + } else + goto end; + + if ((ret = __bamc_start_decompress(dbc)) != 0) + goto end; + + /* !nextExists || ikey/idata < nextk/nextd */ + iSmallEnough = 1; + + while (moreCompressed != 0 || iSmallEnough != 0) { + if (moreCompressed == 0) + cmp = 1; + else if (iSmallEnough == 0) + cmp = -1; + else + cmp = __db_compare_both(dbp, cp->currentKey, + cp->currentData, &ikey, &idata); + + if (cmp < 0) { + if ((ret = __bamc_compress_store(dbc, + cp->currentKey, cp->currentData, + &prevDestKey, &prevDestData, + &destkey, &destbuf)) != 0) + goto end; + + if ((ret = __bam_compress_set_dbt(dbp, + &pdestkey, cp->currentKey->data, + cp->currentKey->size)) != 0) + goto end; + if ((ret = __bam_compress_set_dbt(dbp, + &pdestdata, cp->currentData->data, + cp->currentData->size)) != 0) + goto end; + prevDestKey = &pdestkey; + prevDestData = &pdestdata; + } else { + if (cmp != 0) { + /* + * Continue until we store the current + * chunk, but don't delete any more + * entries. + */ + bulk_ret = DB_NOTFOUND; + moreStream = 0; + iSmallEnough = 0; + } else + ++chunk_count; + +#ifdef DIAGNOSTIC + pikey = ikey; + pidata = idata; +#endif + + /* Get the next input key and data */ + if (stream->next(stream, &ikey, &idata) == 0) { + moreStream = 0; + iSmallEnough = 0; + } + +#ifdef DIAGNOSTIC + /* Check that the stream is sorted */ + DB_ASSERT(env, moreStream == 0 || + __db_compare_both(dbp, &ikey, &idata, + &pikey, &pidata) >= 0); +#endif + + /* + * Check that !nextExists || + * ikey/idata < nextk/nextd + */ + if (moreStream != 0 && nextExists != 0 && + __db_compare_both(dbp, &ikey, + &idata, &nextk, &nextd) >= 0) + iSmallEnough = 0; + } + + if (cmp <= 0) { + ret = __bamc_next_decompress(dbc); + if (ret == DB_NOTFOUND) { + moreCompressed = 0; + ret = 0; + } else if (ret != 0) + goto end; + } + } + + if (prevDestKey != NULL) { + if ((ret = __dbc_iput( + dbc, &destkey, &destbuf, DB_KEYLAST)) != 0) + goto end; + + if (countp) + *countp += chunk_count; + chunk_count = 0; + + prevDestKey = NULL; + prevDestData = NULL; + destbuf.size = 0; + } + } + + end: + CMP_FREE_DBT(env, &destkey); + CMP_FREE_DBT(env, &destbuf); + CMP_FREE_DBT(env, &pdestkey); + CMP_FREE_DBT(env, &pdestdata); + CMP_FREE_DBT(env, &nextk); + CMP_FREE_DBT(env, &nextc); + + return (ret != 0 ? ret : bulk_ret); +} + +/* + * Remove the sorted keys in stream along with all duplicate values from + * the compressed database. 
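+ *
+ * The stream supplies the keys. For example, deleting every duplicate of
+ * a single key (as __bamc_compress_ibulk_del() below does for the
+ * zero-flags case) is just:
+ *
+ *	BTREE_COMPRESS_STREAM stream;
+ *
+ *	__bam_cs_create_single_keyonly(&stream, key);
+ *	ret = __bamc_compress_merge_delete_dups(dbc, &stream, NULL);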
+ */ +static int +__bamc_compress_merge_delete_dups(dbc, stream, countp) + DBC *dbc; + BTREE_COMPRESS_STREAM *stream; + u_int32_t *countp; +{ + DBC *dbc_n; + DBT ikey, nextk, noread, destkey, destbuf, pdestkey, pdestdata; +#ifdef DIAGNOSTIC + DBT pikey; +#endif + DBT *prevDestKey, *prevDestData; + int ret, ret_n, bulk_ret, cmp, moreCompressed, moreStream, nextExists; + int iSmallEnough, ifound; + u_int32_t chunk_count; + ENV *env; + BTREE_CURSOR *cp; + DB *dbp; + + env = dbc->env; + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + bulk_ret = 0; + + memset(&ikey, 0, sizeof(DBT)); + + CMP_INIT_DBT(&nextk); + + memset(&noread, 0, sizeof(DBT)); + noread.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM; + + CMP_INIT_DBT(&pdestkey); + CMP_INIT_DBT(&pdestdata); + + CMP_INIT_DBT(&destkey); + CMP_INIT_DBT(&destbuf); + if ((ret = __os_malloc(env, cp->ovflsize, &destbuf.data)) != 0) + goto end; + destbuf.ulen = cp->ovflsize; + + if (countp != NULL) + *countp = 0; + chunk_count = 0; + + /* Get the first input key and data */ + ret = 0; + if (stream->next(stream, &ikey, NULL) == 0) + goto end; + ifound = 0; + + prevDestKey = NULL; + prevDestData = NULL; + + moreStream = 1; + iSmallEnough = 0; + nextExists = 0; + while (moreStream != 0) { + if (iSmallEnough != 0) { + if (nextExists == 0) { + /* + * We've finished deleting the last key + * in the database + */ + if (ifound == 0) { + bulk_ret = DB_NOTFOUND; + } else + ++chunk_count; + break; + } + + /* Move to the next chunk */ + CMP_IGET_RETRY( + ret, dbc, &cp->key1, &cp->compressed, DB_CURRENT); + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } else if (ret != 0) + goto end; + } else + /* Seek the ikey position */ + if ((ret = + __bamc_compress_seek(dbc, &ikey, NULL, 0)) != 0) + goto end; + + nextExists = 1; + moreCompressed = 1; + + /* + * Delete the key - we might overwrite it below but it's + * safer to just always delete it, and it doesn't seem + * significantly slower to do so. + */ + ret = __bamc_compress_del_and_get_next(dbc, &nextk, &noread); + if (ret == DB_NOTFOUND) { + ret = 0; + nextExists = 0; + } else if (ret != 0) + goto end; + + if ((ret = __bamc_start_decompress(dbc)) != 0) + goto end; + + /* !nextExists || ikey <= nextk */ + iSmallEnough = 1; + + while (moreCompressed != 0) { + if (moreCompressed == 0) + cmp = 1; + else if (iSmallEnough == 0) + cmp = -1; + else + cmp = __db_compare_both( + dbp, cp->currentKey, NULL, &ikey, NULL); + + if (cmp < 0) { + if ((ret = __bamc_compress_store(dbc, + cp->currentKey, cp->currentData, + &prevDestKey, + &prevDestData, &destkey, &destbuf)) != 0) + goto end; + + if ((ret = __bam_compress_set_dbt(dbp, + &pdestkey, cp->currentKey->data, + cp->currentKey->size)) != 0) + goto end; + if ((ret = __bam_compress_set_dbt(dbp, + &pdestdata, cp->currentData->data, + cp->currentData->size)) != 0) + goto end; + prevDestKey = &pdestkey; + prevDestData = &pdestdata; + } else if (cmp > 0) { + if (ifound == 0) { + /* + * Continue until we store the + * current chunk, but don't delete + * any more entries. 
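+					 * As in the insert path, the
+					 * DB_NOTFOUND is carried in
+					 * bulk_ret until the chunk
+					 * built in destbuf has been
+					 * written back.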
+ */
+					bulk_ret = DB_NOTFOUND;
+					moreStream = 0;
+					iSmallEnough = 0;
+				} else
+					++chunk_count;
+
+#ifdef DIAGNOSTIC
+				pikey = ikey;
+#endif
+
+				/* Get the next input key */
+				if (stream->next(stream, &ikey, NULL) == 0) {
+					moreStream = 0;
+					iSmallEnough = 0;
+				}
+				ifound = 0;
+
+#ifdef DIAGNOSTIC
+				/* Check that the stream is sorted */
+				DB_ASSERT(env, moreStream == 0 ||
+				    __db_compare_both(dbp, &ikey, NULL,
+				    &pikey, NULL) >= 0);
+#endif
+
+				/* Check that !nextExists || ikey <= nextk */
+				if (moreStream != 0 && nextExists != 0 &&
+				    __db_compare_both(dbp,
+				    &ikey, NULL, &nextk, NULL) > 0)
+					iSmallEnough = 0;
+			} else /* cmp == 0 */
+				ifound = 1;
+
+			if (cmp <= 0) {
+				ret = __bamc_next_decompress(dbc);
+				if (ret == DB_NOTFOUND) {
+					moreCompressed = 0;
+					ret = 0;
+				} else if (ret != 0)
+					goto end;
+			}
+		}
+
+		if (prevDestKey != NULL) {
+			/*
+			 * Do the DBC->put() with a duplicate cursor, so that
+			 * the main cursor's position isn't changed - we might
+			 * need it to be the same in order to use DB_CURRENT
+			 * above.
+			 */
+			if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+				goto end;
+			F_SET(dbc_n, DBC_TRANSIENT);
+
+			ret = __dbc_iput(dbc_n, &destkey, &destbuf, DB_KEYLAST);
+
+			if ((ret_n = __dbc_close(dbc_n)) != 0 && ret == 0)
+				ret = ret_n;
+
+			if (ret != 0)
+				goto end;
+
+			if (countp)
+				*countp += chunk_count;
+			chunk_count = 0;
+
+			prevDestKey = NULL;
+			prevDestData = NULL;
+			destbuf.size = 0;
+		}
+	}
+
+ end:
+	CMP_FREE_DBT(env, &destkey);
+	CMP_FREE_DBT(env, &destbuf);
+	CMP_FREE_DBT(env, &pdestkey);
+	CMP_FREE_DBT(env, &pdestdata);
+	CMP_FREE_DBT(env, &nextk);
+
+	return (ret != 0 ? ret : bulk_ret);
+}
+
+/******************************************************************************/
+
+/* Implements DB_PREV and DB_LAST for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	int ret;
+	u_int32_t tofind;
+	BTREE_CURSOR *cp;
+
+	ret = 0;
+	cp = (BTREE_CURSOR *)dbc->internal;
+
+	F_CLR(cp, C_COMPRESS_DELETED);
+
+	if (cp->prevKey != NULL) {
+		/* Return the stored previous key */
+		cp->currentKey = cp->prevKey;
+		cp->currentData = cp->prevData;
+		cp->compcursor = cp->prevcursor;
+		cp->prevKey = 0;
+		cp->prevData = 0;
+		cp->prevcursor = cp->prev2cursor;
+		cp->prev2cursor = 0;
+	} else {
+		if (cp->currentKey == NULL) {
+			/* No current key, so fetch the last key */
+			flags |= DB_LAST;
+			tofind = (u_int32_t)-1;
+		} else if (cp->prevcursor == 0) {
+			/*
+			 * The current key is at the beginning of the
+			 * compressed block, so get the last key from the
+			 * previous block
+			 */
+			flags |= DB_PREV;
+			tofind = (u_int32_t)-1;
+		} else {
+			/*
+			 * We have to search for the previous key in the
+			 * current block
+			 */
+			flags |= DB_CURRENT;
+			tofind = (u_int32_t)
+			    (cp->prevcursor - (u_int8_t*)cp->compressed.data);
+		}
+
+		CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+		if (ret != 0)
+			return (ret);
+
+		/* Decompress until we reach tofind */
+		ret = __bamc_start_decompress(dbc);
+		while (ret == 0 && tofind > (u_int32_t)
+		    (cp->compcursor - (u_int8_t*)cp->compressed.data)) {
+			ret = __bamc_next_decompress(dbc);
+		}
+
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+	}
+
+	return (ret);
+}
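+
+/*
+ * An illustrative sketch of the chunk state __bamc_compress_get_prev()
+ * walks (names are the cursor fields used above):
+ *
+ *	compressed.data      prev2cursor   prevcursor   compcursor   compend
+ *	|...earlier entries...|<-previous->|<-current-->|...rest of chunk...|
+ *
+ * Stepping backward inside a chunk re-decompresses from the start of the
+ * chunk until compcursor reaches the saved prevcursor offset (tofind);
+ * stepping back across a chunk boundary re-fetches the previous chunk
+ * with DB_PREV and decompresses to its end (tofind == (u_int32_t)-1).
+ */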
+
+/* Implements DB_PREV_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_dup(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	int ret;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	BTREE *t;
+
+	ret = 0;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	t = (BTREE *)dbp->bt_internal;
+
+	if (cp->currentKey == 0)
+		return (EINVAL);
+
+	/*
+	 * If this is a deleted entry, del_key is already set, otherwise we
+	 * have to set it now.
+	 */
+	if (!F_ISSET(cp, C_COMPRESS_DELETED)) {
+		if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+		    cp->currentKey->data, cp->currentKey->size)) != 0)
+			return (ret);
+	}
+
+	if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+		return (ret);
+
+	if (t->bt_compare(dbp, cp->currentKey, &cp->del_key) != 0)
+		return (DB_NOTFOUND);
+
+	return (0);
+}
+
+/* Implements DB_PREV_NODUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_prev_nodup(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	int ret;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	BTREE *t;
+
+	cp = (BTREE_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	t = (BTREE *)dbp->bt_internal;
+
+	if (cp->currentKey == 0)
+		return (__bamc_compress_get_prev(dbc, flags));
+
+	/*
+	 * If this is a deleted entry, del_key is already set, otherwise we
+	 * have to set it now.
+	 */
+	if (!F_ISSET(cp, C_COMPRESS_DELETED))
+		if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key,
+		    cp->currentKey->data, cp->currentKey->size)) != 0)
+			return (ret);
+
+	/*
+	 * Linear search for the next non-duplicate key - this is
+	 * especially inefficient for DB_PREV_NODUP, since we have to
+	 * decompress from the beginning of the chunk to find previous
+	 * key/data pairs. Instead we could check for key equality as we
+	 * decompress.
+	 */
+	do
+		if ((ret = __bamc_compress_get_prev(dbc, flags)) != 0)
+			return (ret);
+	while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0);
+
+	return (0);
+}
+
+/* Implements DB_NEXT and DB_FIRST for __bamc_compress_get() */
+static int
+__bamc_compress_get_next(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	int ret;
+	BTREE_CURSOR *cp;
+
+	cp = (BTREE_CURSOR *)dbc->internal;
+
+	if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+		if (cp->currentKey == 0)
+			return (DB_NOTFOUND);
+		F_CLR(cp, C_COMPRESS_DELETED);
+		return (0);
+	} else if (cp->currentKey) {
+		ret = __bamc_next_decompress(dbc);
+		if (ret != DB_NOTFOUND)
+			return (ret);
+
+		flags |= DB_NEXT;
+	} else
+		flags |= DB_FIRST;
+
+	CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, flags);
+	if (ret == DB_NOTFOUND) {
+		/*
+		 * Reset the cursor, so that
+		 * __bamc_compress_get_multiple_key will end up pointing
+		 * to the right place
+		 */
+		__bamc_compress_reset(dbc);
+		return (DB_NOTFOUND);
+	} else if (ret != 0)
+		return (ret);
+
+	ret = __bamc_start_decompress(dbc);
+
+	return (ret);
+}
+
+/* Implements DB_NEXT_DUP for __bamc_compress_get() */
+static int
+__bamc_compress_get_next_dup(dbc, key, flags)
+	DBC *dbc;
+	DBT *key;
+	u_int32_t flags;
+{
+	int ret;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	BTREE *t;
+
+	cp = (BTREE_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	t = (BTREE *)dbp->bt_internal;
+
+	if (cp->currentKey == 0)
+		return (EINVAL);
+
+	if (F_ISSET(cp, C_COMPRESS_DELETED)) {
+		/*
+		 * Check that the next entry has the same key as the
+		 * deleted entry.
+		 */
+		if (cp->currentKey == 0)
+			return (DB_NOTFOUND);
+		F_CLR(cp, C_COMPRESS_DELETED);
+		return (t->bt_compare(dbp,
+		    cp->currentKey, &cp->del_key) == 0 ?
0 : DB_NOTFOUND); + } + + /* Check that the next entry has the same key as the previous entry */ + ret = __bamc_next_decompress(dbc); + if (ret == 0 && t->bt_compare(dbp, cp->currentKey, cp->prevKey) != 0) + return (DB_NOTFOUND); + if (ret != DB_NOTFOUND) + return (ret); + + if (key == NULL) { + /* Copy the current key to del_key */ + if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key, + cp->currentKey->data, cp->currentKey->size)) != 0) + return (ret); + key = &cp->del_key; + } + + /* Fetch the next chunk */ + CMP_IGET_RETRY(ret, dbc, &cp->key1, &cp->compressed, DB_NEXT | flags); + if (ret == DB_NOTFOUND) { + /* + * Reset the cursor, so that __bamc_compress_get_multiple + * will end up pointing to the right place + */ + __bamc_compress_reset(dbc); + return (DB_NOTFOUND); + } else if (ret != 0) + return (ret); + + if ((ret = __bamc_start_decompress(dbc)) != 0) + return (ret); + + /* Check the keys are the same */ + if (t->bt_compare(dbp, cp->currentKey, key) != 0) + return (DB_NOTFOUND); + + return (0); +} + +/* Implements DB_NEXT_NODUP for __bamc_compress_get() */ +static int +__bamc_compress_get_next_nodup(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + int ret; + BTREE_CURSOR *cp; + DB *dbp; + BTREE *t; + + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + t = (BTREE *)dbp->bt_internal; + + if (cp->currentKey == 0) + return (__bamc_compress_get_next(dbc, flags)); + + /* + * If this is a deleted entry, del_key is already set, otherwise + * we have to set it now + */ + if (!F_ISSET(cp, C_COMPRESS_DELETED)) + if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key, + cp->currentKey->data, cp->currentKey->size)) != 0) + return (ret); + + /* Linear search for the next non-duplicate key */ + do + if ((ret = __bamc_compress_get_next(dbc, flags)) != 0) + return (ret); + while (t->bt_compare(dbp, cp->currentKey, &cp->del_key) == 0); + + return (ret); +} + +/* + * Implements DB_SET, DB_SET_RANGE, DB_GET_BOTH, and DB_GET_BOTH_RANGE + * for __bamc_compress_get() + */ +static int +__bamc_compress_get_set(dbc, key, data, method, flags) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t method; + u_int32_t flags; +{ + int ret, cmp; + BTREE_CURSOR *cp; + DB *dbp; + + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + + if (method == DB_SET || method == DB_SET_RANGE) + data = NULL; + + F_CLR(cp, C_COMPRESS_DELETED); + + ret = __bamc_compress_seek(dbc, key, data, flags); + if (ret == DB_NOTFOUND) + CMP_IGET_RETRY(ret, dbc, + &cp->key1, &cp->compressed, DB_FIRST | flags); + if (ret != 0) + return (ret); + + /* Decompress and perform a linear search for the key */ + cmp = 0; + ret = __bamc_start_decompress(dbc); + while (ret == 0 && (cmp = __db_compare_both(dbp, + cp->currentKey, cp->currentData, key, data)) < 0) { + ret = __bamc_next_decompress(dbc); + if (ret == DB_NOTFOUND) { + CMP_IGET_RETRY(ret, dbc, + &cp->key1, &cp->compressed, DB_NEXT | flags); + if (ret == 0) + ret = __bamc_start_decompress(dbc); + } + } + + switch (method) { + case DB_SET: + case DB_GET_BOTH_RANGE: + /* + * We need to exactly match the key, and if cmp != 0 we + * might not have - so check again here. 
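+		 * (The linear scan stops at the first entry that does not
+		 * sort before key/data, which is what DB_SET_RANGE wants
+		 * but may overshoot an exact match.)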
+ */ + if (ret == 0 && + __db_compare_both(dbp, cp->currentKey, 0, key, 0) != 0) { + /* We didn't find the key */ + ret = DB_NOTFOUND; + } + break; + case DB_GET_BOTH: + if (ret == 0 && (cmp != 0 || (!F_ISSET(dbp, DB_AM_DUPSORT) && + __bam_defcmp(dbp, cp->currentData, data) != 0))) { + /* We didn't find the key/data pair */ + ret = DB_NOTFOUND; + } + break; + default: + DB_ASSERT(dbp->env, method == 0 || method == DB_SET_RANGE); + } + + return (ret); +} + +/* Implements DB_GET_BOTHC for __bamc_compress_get() */ +static int +__bamc_compress_get_bothc(dbc, data, flags) + DBC *dbc; + DBT *data; + u_int32_t flags; +{ + int ret, cmp; + BTREE_CURSOR *cp; + DB *dbp; + + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + + /* Check that the data we are looking for comes after the current + position */ + if (__db_compare_both(dbp, cp->currentKey, + cp->currentData, cp->currentKey, data) >= 0) + return (DB_NOTFOUND); + + cmp = 0; + /* Perform a linear search for the data in the current chunk */ + while ((ret = __bamc_next_decompress(dbc)) == 0 && + (cmp = __db_compare_both( + dbp, cp->currentKey, cp->currentData, cp->prevKey, data)) < 0) + continue; + + if (ret == 0) + return (cmp == 0 ? 0 : DB_NOTFOUND); + if (ret != DB_NOTFOUND) + return (ret); + + /* Copy the current key to del_key */ + if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key, + cp->currentKey->data, cp->currentKey->size)) != 0) + return (ret); + + /* Search for the data using DB_GET_BOTH */ + return __bamc_compress_get_set( + dbc, &cp->del_key, data, DB_GET_BOTH, flags); +} + +/* Implements DB_MULTIPLE_KEY for __bamc_compress_get() */ +static int +__bamc_compress_get_multiple_key(dbc, data, flags) + DBC *dbc; + DBT *data; + u_int32_t flags; +{ + int ret; + u_int8_t *writekey, *writedata; + void *mptr; + BTREE_CURSOR *cp; + + ret = 0; + cp = (BTREE_CURSOR *)dbc->internal; + + DB_MULTIPLE_WRITE_INIT(mptr, data); + DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey, cp->currentKey->size, + writedata, cp->currentData->size); + if (writekey == NULL) { + data->size = cp->currentKey->size + cp->currentData->size + + 4 * sizeof(u_int32_t); + return DB_BUFFER_SMALL; + } + DB_ASSERT(dbc->dbp->env, writedata != NULL); + + memcpy(writekey, cp->currentKey->data, cp->currentKey->size); + memcpy(writedata, cp->currentData->data, cp->currentData->size); + + while ((ret = __bamc_compress_get_next(dbc, flags)) == 0) { + DB_MULTIPLE_KEY_RESERVE_NEXT(mptr, data, writekey, + cp->currentKey->size, writedata, cp->currentData->size); + if (writekey == NULL) + break; + DB_ASSERT(dbc->dbp->env, writedata != NULL); + + /* + * We could choose to optimize this by just storing one + * copy of a key for each set of duplicate data. 
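+		 * As it is, each pair costs its key and data bytes plus
+		 * four u_int32_t of bookkeeping, which is what the
+		 * DB_BUFFER_SMALL size estimate above accounts for.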
+ */ + memcpy(writekey, cp->currentKey->data, cp->currentKey->size); + memcpy(writedata, cp->currentData->data, cp->currentData->size); + } + + if (ret == DB_NOTFOUND) + ret = 0; + + if (ret == 0) + /* + * Rewind to the previous key/data, since we can't fit + * this one in the buffer + */ + ret = __bamc_compress_get_prev(dbc, flags); + + return (ret); +} + +/* Implements DB_MULTIPLE for __bamc_compress_get() */ +static int +__bamc_compress_get_multiple(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + int ret; + u_int8_t *writedata; + void *mptr; + BTREE_CURSOR *cp; + + ret = 0; + cp = (BTREE_CURSOR *)dbc->internal; + + data->size = 0; + + DB_MULTIPLE_WRITE_INIT(mptr, data); + DB_MULTIPLE_RESERVE_NEXT(mptr, data, writedata, cp->currentData->size); + data->size += cp->currentData->size + 2 * sizeof(u_int32_t); + if (writedata == NULL) + return DB_BUFFER_SMALL; + + memcpy(writedata, cp->currentData->data, cp->currentData->size); + + while ((ret = __bamc_compress_get_next_dup(dbc, key, flags)) == 0) { + DB_MULTIPLE_RESERVE_NEXT( + mptr, data, writedata, cp->currentData->size); + data->size += cp->currentData->size + 2 * sizeof(u_int32_t); + if (writedata == NULL) { + /* DBC_FROM_DB_GET indicates we need to fit all the + * duplicates into the buffer or return DB_BUFFER_SMALL. + * [#17039] + */ + if (F_ISSET(dbc, DBC_FROM_DB_GET)) + return DB_BUFFER_SMALL; + break; + } + + memcpy(writedata, cp->currentData->data, cp->currentData->size); + } + + if (ret == DB_NOTFOUND) + ret = 0; + + if (ret == 0) + /* + * Rewind to the previous key/data, as that's now our current + * entry. + */ + ret = __bamc_compress_get_prev(dbc, flags); + + return (ret); +} + +/* + * __bamc_compress_iget -- + * Get using a compressed cursor. (internal) + */ +static int +__bamc_compress_iget(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + int ret; + u_int32_t multiple, method; + BTREE_CURSOR *cp; + DB *dbp; + + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + ret = 0; + + multiple = flags & (DB_MULTIPLE|DB_MULTIPLE_KEY); + method = flags & DB_OPFLAGS_MASK; + flags = flags & ~(DB_OPFLAGS_MASK|DB_MULTIPLE|DB_MULTIPLE_KEY); + + switch (method) { + case DB_CURRENT: + if (F_ISSET(cp, C_COMPRESS_DELETED)) + ret = DB_KEYEMPTY; + else if (cp->currentKey == NULL) + ret = EINVAL; + break; + case DB_FIRST: + __bamc_compress_reset(dbc); + ret = __bamc_compress_get_next(dbc, flags); + break; + case DB_NEXT: + ret = __bamc_compress_get_next(dbc, flags); + break; + case DB_NEXT_DUP: + ret = __bamc_compress_get_next_dup(dbc, 0, flags); + break; + case DB_NEXT_NODUP: + ret = __bamc_compress_get_next_nodup(dbc, flags); + break; + case DB_LAST: + __bamc_compress_reset(dbc); + ret = __bamc_compress_get_prev(dbc, flags); + break; + case DB_PREV: + ret = __bamc_compress_get_prev(dbc, flags); + break; + case DB_PREV_DUP: + ret = __bamc_compress_get_prev_dup(dbc, flags); + break; + case DB_PREV_NODUP: + ret = __bamc_compress_get_prev_nodup(dbc, flags); + break; + case DB_SET: + if (((BTREE *) + dbc->dbp->bt_internal)->bt_compare == __bam_defcmp) + F_SET(key, DB_DBT_ISSET); + /* FALL THROUGH */ + case DB_SET_RANGE: + ret = __bamc_compress_get_set(dbc, key, 0, method, flags); + break; + case DB_GET_BOTH: + if (!F_ISSET(dbc->dbp, DB_AM_DUPSORT) || ((BTREE *)dbc->dbp-> + bt_internal)->compress_dup_compare == __bam_defcmp) + F_SET(data, DB_DBT_ISSET); + /* FALL THROUGH */ + case DB_GET_BOTH_RANGE: + if (((BTREE *) + dbc->dbp->bt_internal)->bt_compare == __bam_defcmp) + F_SET(key, 
DB_DBT_ISSET); + ret = __bamc_compress_get_set(dbc, key, data, method, flags); + break; + case DB_GET_BOTHC: + ret = __bamc_compress_get_bothc(dbc, data, flags); + break; + default: + ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget", + method); + break; + } + + if (ret != 0) + goto err; + + switch (multiple) { + case 0: + if (!F_ISSET(key, DB_DBT_ISSET)) + ret = __db_retcopy(dbc->env, key, + cp->currentKey->data, cp->currentKey->size, + &dbc->rkey->data, &dbc->rkey->ulen); + if (!F_ISSET(data, DB_DBT_ISSET) && ret == 0) + ret = __db_retcopy(dbc->env, data, + cp->currentData->data, cp->currentData->size, + &dbc->rdata->data, &dbc->rdata->ulen); + break; + case DB_MULTIPLE: + if (!F_ISSET(key, DB_DBT_ISSET)) + ret = __db_retcopy(dbc->env, key, + cp->currentKey->data, cp->currentKey->size, + &dbc->rkey->data, &dbc->rkey->ulen); + if (ret == 0) + ret = + __bamc_compress_get_multiple(dbc, key, data, flags); + break; + case DB_MULTIPLE_KEY: + ret = __bamc_compress_get_multiple_key(dbc, data, flags); + break; + default: + ret = __db_unknown_flag(dbp->env, "__bamc_compress_iget", + multiple); + break; + } + + err: + F_CLR(key, DB_DBT_ISSET); + F_CLR(data, DB_DBT_ISSET); + + return (ret); +} + +/* + * __bamc_compress_get -- + * Get using a compressed cursor. + * + * PUBLIC: int __bamc_compress_get __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__bamc_compress_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc_n; + int ret, t_ret; + u_int32_t tmp_flags; + + switch (flags & DB_OPFLAGS_MASK) { + case DB_CURRENT: + case DB_GET_BOTHC: + case DB_NEXT: + case DB_NEXT_DUP: + case DB_NEXT_NODUP: + case DB_PREV: + case DB_PREV_DUP: + case DB_PREV_NODUP: + if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED) + && (ret = __bamc_compress_relocate(dbc)) != 0) + return (ret); + tmp_flags = DB_POSITION; + break; + default: + F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED); + tmp_flags = 0; + break; + } + + if (F_ISSET(dbc, DBC_TRANSIENT)) + dbc_n = dbc; + else { + if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0) + goto err; + + /* + * We don't care about preserving the cursor's position on + * error. + */ + F_SET(dbc_n, DBC_TRANSIENT); + + COPY_RET_MEM(dbc, dbc_n); + } + + if ((ret = __bamc_compress_iget(dbc_n, key, data, flags)) != 0) + goto err; + +err: + /* Cleanup and cursor resolution. 
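+	   Take care not to clobber DB_BUFFER_SMALL: it has to reach the
+	   caller so the operation can be retried with a larger buffer.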
*/ + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && + (ret == 0 || ret == DB_BUFFER_SMALL)) + ret = t_ret; + return (ret); +} + +/* + * __bamc_compress_iput -- + * Put using a compressed cursor (internal) + */ +static int +__bamc_compress_iput(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + int ret; + u_int32_t multi; + DBT kcpy, pdata, empty; + BTREE_COMPRESS_STREAM stream; + BTREE_CURSOR *cp; + DB *dbp; + ENV *env; + + cp = (BTREE_CURSOR *)dbc->internal; + dbp = dbc->dbp; + env = dbc->env; + + memset(&pdata, 0, sizeof(DBT)); + memset(&empty, 0, sizeof(DBT)); + + multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY); + LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY); + + switch (flags) { + case DB_CURRENT: + if (cp->currentKey == 0 || F_ISSET(cp, C_COMPRESS_DELETED)) { + ret = DB_NOTFOUND; + goto end; + } + + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if ((ret = __db_buildpartial( + dbp, cp->currentData, data, &pdata)) != 0) + goto end; + data = &pdata; + } + + if (F_ISSET(dbp, DB_AM_DUPSORT) && + ((BTREE *)dbp->bt_internal)->compress_dup_compare( + dbp, cp->currentData, data) != 0) { + __db_errx(env, + "Existing data sorts differently from put data"); + ret = EINVAL; + goto end; + } + CMP_INIT_DBT(&kcpy); + if ((ret = __bam_compress_set_dbt(dbp, + &kcpy, cp->currentKey->data, cp->currentKey->size)) != 0) + goto end; + + __bam_cs_create_single(&stream, &kcpy, data); + ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags); + + if (ret == 0) + /* Position the cursor on the entry written */ + ret = __bamc_compress_get_set( + dbc, &kcpy, data, DB_GET_BOTH_RANGE, 0); + + CMP_FREE_DBT(env, &kcpy); + break; + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_NODUPDATA: + case DB_OVERWRITE_DUP: + switch (multi) { + case 0: + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if ((ret = __bamc_compress_get_set(dbc, key, + data, DB_SET, 0)) != 0 && + ret != DB_NOTFOUND) + goto end; + if ((ret = __db_buildpartial(dbp, + ret == DB_NOTFOUND ? 
&empty : + cp->currentData, data, &pdata)) != 0) + goto end; + data = &pdata; + } + + __bam_cs_create_single(&stream, key, data); + ret = __bamc_compress_merge_insert( + dbc, &stream, NULL, flags); + + if (ret == 0) + /* Position the cursor on the entry written */ + ret = __bamc_compress_get_set( + dbc, key, data, DB_GET_BOTH_RANGE, 0); + break; + case DB_MULTIPLE: + __bam_cs_create_multiple(&stream, key, data); + ret = __bamc_compress_merge_insert( + dbc, &stream, &key->doff, flags); + break; + case DB_MULTIPLE_KEY: + __bam_cs_create_multiple_key(&stream, key); + ret = __bamc_compress_merge_insert( + dbc, &stream, &key->doff, flags); + break; + default: + return (__db_unknown_flag( + dbp->env, "__bamc_compress_iput", multi)); + } + break; + case DB_NOOVERWRITE: + /* Check key doesn't already exist */ + ret = __bamc_compress_get_set(dbc, key, 0, DB_SET, 0); + if (ret != DB_NOTFOUND) { + if (ret == 0) + ret = DB_KEYEXIST; + goto end; + } + + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if ((ret = __db_buildpartial( + dbp, &empty, data, &pdata)) != 0) + goto end; + data = &pdata; + } + + __bam_cs_create_single(&stream, key, data); + ret = __bamc_compress_merge_insert(dbc, &stream, NULL, flags); + + if (ret == 0) + /* Position the cursor on the entry written */ + ret = __bamc_compress_get_set( + dbc, key, data, DB_GET_BOTH_RANGE, 0); + break; + default: + return (__db_unknown_flag( + dbp->env, "__bamc_compress_iput", flags)); + } + + end: + if (pdata.data != NULL) + __os_free(env, pdata.data); + return (ret); +} + +/* + * __bamc_compress_put -- + * Put using a compressed cursor. + * + * PUBLIC: int __bamc_compress_put __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__bamc_compress_put(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc_n; + int ret, t_ret; + + if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED)) { + if ((flags & DB_OPFLAGS_MASK) == DB_CURRENT && + (ret = __bamc_compress_relocate(dbc)) != 0) + return (ret); + F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED); + } + + if (F_ISSET(dbc, DBC_TRANSIENT)) + dbc_n = dbc; + else { + if ((ret = __dbc_dup(dbc, &dbc_n, + (flags & DB_OPFLAGS_MASK) == DB_CURRENT ? + DB_POSITION : 0)) != 0) + goto err; + + /* + * We don't care about preserving the cursor's position on + * error. + */ + F_SET(dbc_n, DBC_TRANSIENT); + } + + if ((ret = __bamc_compress_iput(dbc_n, key, data, flags)) != 0) + goto err; + +err: + /* Cleanup and cursor resolution. */ + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && + (ret == 0 || ret == DB_BUFFER_SMALL)) + ret = t_ret; + return (ret); +} + +/* + * __bamc_compress_idel -- + * Del using a compressed cursor. 
(internal) + */ +static int +__bamc_compress_idel(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + int ret; + BTREE_COMPRESS_STREAM stream; + DB *dbp; + BTREE_CURSOR *cp; + + COMPQUIET(flags, 0); + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + if (F_ISSET(cp, C_COMPRESS_DELETED)) + return DB_KEYEMPTY; + if (cp->currentKey == 0) + return DB_NOTFOUND; + + if ((ret = __bam_compress_set_dbt(dbp, &cp->del_key, + cp->currentKey->data, cp->currentKey->size)) != 0) + goto err; + if ((ret = __bam_compress_set_dbt(dbp, &cp->del_data, + cp->currentData->data, cp->currentData->size)) != 0) + goto err; + + __bam_cs_create_single(&stream, &cp->del_key, &cp->del_data); + if ((ret = __bamc_compress_merge_delete(dbc, &stream, NULL)) != 0) + goto err; + + /* Position the cursor on the entry after the key/data deleted */ + ret = __bamc_compress_get_set(dbc, &cp->del_key, &cp->del_data, 0, 0); + if (ret == DB_NOTFOUND) { + __bamc_compress_reset(dbc); + ret = 0; + } else if (ret != 0) + goto err; + + /* Mark current as being deleted */ + F_SET(cp, C_COMPRESS_DELETED); + + err: + return (ret); +} + +/* + * __bamc_compress_del -- + * Del using a compressed cursor. + * + * PUBLIC: int __bamc_compress_del __P((DBC *, u_int32_t)); + */ +int +__bamc_compress_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + int ret, t_ret; + DBC *dbc_n; + + if (F_ISSET((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED) && + (ret = __bamc_compress_relocate(dbc)) != 0) + return (ret); + + if (F_ISSET(dbc, DBC_TRANSIENT)) + dbc_n = dbc; + else { + if ((ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0) + goto err; + + /* + * We don't care about preserving the cursor's position on + * error. + */ + F_SET(dbc_n, DBC_TRANSIENT); + + COPY_RET_MEM(dbc, dbc_n); + } + + if ((ret = __bamc_compress_idel(dbc_n, flags)) != 0) + goto err; + +err: + /* Cleanup and cursor resolution. */ + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && + (ret == 0 || ret == DB_BUFFER_SMALL)) + ret = t_ret; + return (ret); +} + +/* + * __bamc_compress_ibulk_del -- + * Bulk del using a compressed cursor. (internal) + */ +static int +__bamc_compress_ibulk_del(dbc, key, flags) + DBC *dbc; + DBT *key; + u_int32_t flags; +{ + BTREE_COMPRESS_STREAM stream; + + switch (flags) { + case 0: + __bam_cs_create_single_keyonly(&stream, key); + return (__bamc_compress_merge_delete_dups(dbc, &stream, NULL)); + case DB_MULTIPLE: + __bam_cs_create_multiple_keyonly(&stream, key); + return (__bamc_compress_merge_delete_dups( + dbc, &stream, &key->doff)); + case DB_MULTIPLE_KEY: + __bam_cs_create_multiple_key(&stream, key); + return (__bamc_compress_merge_delete(dbc, &stream, &key->doff)); + default: + break; + } + + return (__db_unknown_flag( + dbc->env, "__bamc_compress_ibulk_del", flags)); +} + +/* + * __bamc_compress_bulk_del -- + * Bulk del using a compressed cursor. + * + * PUBLIC: int __bamc_compress_bulk_del __P((DBC *, DBT *, u_int32_t)); + */ +int +__bamc_compress_bulk_del(dbc, key, flags) + DBC *dbc; + DBT *key; + u_int32_t flags; +{ + int ret, t_ret; + DBC *dbc_n; + + F_CLR((BTREE_CURSOR *)dbc->internal, C_COMPRESS_MODIFIED); + + if (F_ISSET(dbc, DBC_TRANSIENT)) + dbc_n = dbc; + else { + if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0) + goto err; + + /* + * We don't care about preserving the cursor's position on + * error. + */ + F_SET(dbc_n, DBC_TRANSIENT); + } + + if ((ret = __bamc_compress_ibulk_del(dbc_n, key, flags)) != 0) + goto err; + +err: + /* Cleanup and cursor resolution. 
*/ + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && + (ret == 0 || ret == DB_BUFFER_SMALL)) + ret = t_ret; + return (ret); +} + +/* + * __bamc_compress_count -- + * Count using a compressed cursor. + * + * PUBLIC: int __bamc_compress_count __P((DBC *, db_recno_t *)); + */ +int +__bamc_compress_count(dbc, countp) + DBC *dbc; + db_recno_t *countp; +{ + int ret, t_ret; + db_recno_t count; + DBT *key; + DBC *dbc_n; + BTREE_CURSOR *cp; + + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * If the current entry is deleted use del_key, otherwise use + * currentKey. + */ + if (F_ISSET(cp, C_COMPRESS_DELETED)) + key = &cp->del_key; + else + key = cp->currentKey; + + /* Duplicate the cursor */ + if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0) + return (ret); + + /* We don't care about preserving the cursor's position on error */ + F_SET(dbc_n, DBC_TRANSIENT); + + /* Find the first duplicate */ + if ((ret = __bamc_compress_get_set(dbc_n, key, 0, DB_SET, 0)) != 0) + goto err; + count = 1; + + /* Count subsequent duplicates */ + while ((ret = __bamc_compress_get_next_dup(dbc_n, key, 0)) == 0) + ++count; + + if (ret == DB_NOTFOUND) + ret = 0; + else if (ret != 0) + goto err; + + *countp = count; + + err: + if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bamc_compress_cmp -- + * Compare which compressed value is pointed to. + * + * PUBLIC: int __bamc_compress_cmp __P((DBC *, DBC *, int *)); + */ +int +__bamc_compress_cmp(dbc, other_dbc, result) + DBC *dbc, *other_dbc; + int *result; +{ + DB *dbp; + BTREE_CURSOR *cp, *ocp; + + /* + * At this point, we already know that the cursors point to the same + * DB. + */ + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + ocp = (BTREE_CURSOR *)other_dbc->internal; + + if (F_ISSET(cp, C_COMPRESS_DELETED)) + if (F_ISSET(ocp, C_COMPRESS_DELETED)) + *result = __db_compare_both( + dbp, &cp->del_key, &cp->del_data, + &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1; + else { + if (ocp->currentKey == 0) + goto err; + + *result = __db_compare_both( + dbp, &cp->del_key, &cp->del_data, + ocp->currentKey, ocp->currentData) == 0 ? 0 : 1; + } + else { + if (cp->currentKey == 0) + goto err; + + if (F_ISSET(ocp, C_COMPRESS_DELETED)) + *result = __db_compare_both( + dbp, cp->currentKey, cp->currentData, + &ocp->del_key, &ocp->del_data) == 0 ? 0 : 1; + else { + if (ocp->currentKey == 0) + goto err; + + *result = __db_compare_both( + dbp, cp->currentKey, cp->currentData, + ocp->currentKey, ocp->currentData) == 0 ? 0 : 1; + } + } + return (0); + + err: + __db_errx(dbc->env, + "Both cursors must be initialized before calling DBC->cmp."); + return (EINVAL); +} + +/* + * __bamc_compress_dup -- + * Duplicate the compression specific part of a btree cursor. 
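+ *	The compressed chunk is deep-copied and the chunk cursors
+ *	(compcursor, compend, prevcursor, prev2cursor) are rebased as
+ *	offsets into the new buffer. DB_SHALLOW_DUP skips the copy for
+ *	transient duplicates that don't outlive the original.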
+ * + * PUBLIC: int __bamc_compress_dup __P((DBC *, DBC *, u_int32_t)); + */ +int +__bamc_compress_dup(orig_dbc, new_dbc, flags) + DBC *orig_dbc, *new_dbc; + u_int32_t flags; +{ + int ret; + DB *dbp; + BTREE_CURSOR *orig, *new; + + dbp = new_dbc->dbp; + + orig = (BTREE_CURSOR *)orig_dbc->internal; + new = (BTREE_CURSOR *)new_dbc->internal; + + if (orig->currentKey != NULL && !LF_ISSET(DB_SHALLOW_DUP)) { + new->currentKey = &new->key1; + new->currentData = &new->data1; + + if ((ret = __bam_compress_set_dbt(dbp, new->currentKey, + orig->currentKey->data, orig->currentKey->size)) != 0) + return (ret); + if ((ret = __bam_compress_set_dbt(dbp, new->currentData, + orig->currentData->data, orig->currentData->size)) != 0) + return (ret); + + if (orig->prevKey) { + new->prevKey = &new->key2; + new->prevData = &new->data2; + + if ((ret = __bam_compress_set_dbt(dbp, new->prevKey, + orig->prevKey->data, orig->prevKey->size)) != 0) + return (ret); + if ((ret = __bam_compress_set_dbt(dbp, new->prevData, + orig->prevData->data, orig->prevData->size)) != 0) + return (ret); + } + + if ((ret = __bam_compress_set_dbt(dbp, &new->compressed, + orig->compressed.data, orig->compressed.size)) != 0) + return (ret); + + new->compcursor = (u_int8_t*)new->compressed.data + + (orig->compcursor - (u_int8_t*)orig->compressed.data); + new->compend = (u_int8_t*)new->compressed.data + + (orig->compend - (u_int8_t*)orig->compressed.data); + new->prevcursor = orig->prevcursor == NULL ? NULL : + (u_int8_t*)new->compressed.data + (orig->prevcursor - + (u_int8_t*)orig->compressed.data); + new->prev2cursor = orig->prev2cursor == NULL ? NULL : + (u_int8_t*)new->compressed.data + (orig->prev2cursor - + (u_int8_t*)orig->compressed.data); + + if (F_ISSET(orig, C_COMPRESS_DELETED)) { + if ((ret = __bam_compress_set_dbt(dbp, &new->del_key, + orig->del_key.data, orig->del_key.size)) != 0) + return (ret); + if ((ret = __bam_compress_set_dbt(dbp, &new->del_data, + orig->del_data.data, orig->del_data.size)) != 0) + return (ret); + } + } + + return (0); +} + +/* + * __bam_compress_salvage -- + * Salvage the compressed data from the key/data pair + * + * PUBLIC: int __bam_compress_salvage __P((DB *, VRFY_DBINFO *, + * PUBLIC: void *, int (*)(void *, const void *), DBT *, DBT *)); + */ +int +__bam_compress_salvage(dbp, vdp, handle, callback, key, data) + DB *dbp; + VRFY_DBINFO *vdp; + void *handle; + int (*callback) __P((void *, const void *)); + DBT *key, *data; +{ + DBT key1, key2, data1, data2, compressed; + DBT *currentKey, *currentData, *prevKey, *prevData; + ENV *env; + int ret, t_ret; + u_int8_t *compcursor, *compend; + u_int32_t datasize, size; + + env = dbp->env; + + memset(&key1, 0, sizeof(DBT)); + memset(&key2, 0, sizeof(DBT)); + memset(&data1, 0, sizeof(DBT)); + memset(&data2, 0, sizeof(DBT)); + memset(&compressed, 0, sizeof(DBT)); + + key1.flags = DB_DBT_USERMEM; + key2.flags = DB_DBT_USERMEM; + data1.flags = DB_DBT_USERMEM; + data2.flags = DB_DBT_USERMEM; + compressed.flags = DB_DBT_USERMEM; + + prevKey = NULL; + prevData = NULL; + currentKey = key; + currentData = &data2; + compcursor = (u_int8_t*)data->data; + compend = compcursor + data->size; + + if (data->size == 0) { + ret = DB_VERIFY_FATAL; + goto unknown_data; + } + + /* Unmarshal the first data */ + size = __db_decompress_count_int(compcursor); + if (size == 0xFF || compcursor + size > compend) { + ret = DB_VERIFY_FATAL; + goto unknown_data; + } + compcursor += __db_decompress_int32(compcursor, &datasize); + + if (compcursor + datasize > compend) { + ret = 
DB_VERIFY_FATAL;
+		goto unknown_data;
+	}
+	if ((ret = __bam_compress_set_dbt(
+	    dbp, currentData, compcursor, datasize)) != 0)
+		goto err;
+	compcursor += datasize;
+
+	/*
+	 * Output the first data item (the first key has already been
+	 * output by our caller).
+	 */
+	if ((ret = __db_vrfy_prdbt(
+	    currentData, 0, " ", handle, callback, 0, vdp)) != 0)
+		goto err;
+
+	while (compcursor < compend) {
+		prevKey = currentKey;
+		prevData = currentData;
+
+		if (currentKey == &key1) {
+			currentKey = &key2;
+			currentData = &data2;
+		} else {
+			currentKey = &key1;
+			currentData = &data1;
+		}
+
+		compressed.data = (void*)compcursor;
+		compressed.ulen = compressed.size =
+		    (u_int32_t)(compend - compcursor);
+
+		/* Decompress the next key/data pair */
+		while ((ret = ((BTREE *)dbp->bt_internal)->bt_decompress(
+		    dbp, prevKey, prevData,
+		    &compressed, currentKey, currentData)) == DB_BUFFER_SMALL) {
+			if (CMP_RESIZE_DBT(ret, env, currentKey) != 0)
+				break;
+			if (CMP_RESIZE_DBT(ret, env, currentData) != 0)
+				break;
+		}
+
+		if (ret == EINVAL) {
+			ret = DB_VERIFY_FATAL;
+			goto err;
+		}
+		if (ret != 0)
+			goto err;
+
+		compcursor += compressed.size;
+
+		if (compcursor > compend) {
+			ret = DB_VERIFY_FATAL;
+			goto err;
+		}
+
+		/* Output the next key/data pair */
+		if ((ret = __db_vrfy_prdbt(
+		    currentKey, 0, " ", handle, callback, 0, vdp)) != 0)
+			goto err;
+		if ((ret = __db_vrfy_prdbt(
+		    currentData, 0, " ", handle, callback, 0, vdp)) != 0)
+			goto err;
+	}
+
+	if (0) {
+		unknown_data:
+		/*
+		 * Make sure we output a data value for the key that's
+		 * already been output
+		 */
+		DB_INIT_DBT(
+		    compressed, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1);
+		if ((t_ret = __db_vrfy_prdbt(
+		    &compressed, 0, " ", handle, callback, 0, vdp)) != 0)
+			ret = t_ret;
+	}
+
+ err:
+	__os_free(env, key1.data);
+	__os_free(env, key2.data);
+	__os_free(env, data1.data);
+	__os_free(env, data2.data);
+	return (ret);
+}
+
+/*
+ * __bam_compress_count --
+ *	Calculate key and entry counts for the compressed BTree
+ *
+ * PUBLIC: int __bam_compress_count __P((DBC *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_compress_count(dbc, nkeysp, ndatap)
+	DBC *dbc;
+	u_int32_t *nkeysp, *ndatap;
+{
+	int ret, t_ret;
+	u_int32_t nkeys, ndata;
+	DB *dbp;
+	BTREE *t;
+	DBC *dbc_n;
+	BTREE_CURSOR *cp_n;
+
+	dbp = dbc->dbp;
+	t = (BTREE *)dbp->bt_internal;
+
+	/* Duplicate the cursor */
+	if ((ret = __dbc_dup(dbc, &dbc_n, 0)) != 0)
+		return (ret);
+
+	/* We don't care about preserving the cursor's position on error */
+	F_SET(dbc_n, DBC_TRANSIENT);
+
+	cp_n = (BTREE_CURSOR *)dbc_n->internal;
+
+	nkeys = 0;
+	ndata = 0;
+
+	CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1, &cp_n->compressed, DB_FIRST);
+	if (ret != 0)
+		goto err;
+
+	if ((ret = __bamc_start_decompress(dbc_n)) != 0)
+		goto err;
+	nkeys += 1;
+
+	for (;;) {
+		ndata += 1;
+
+		ret = __bamc_next_decompress(dbc_n);
+		if (ret == DB_NOTFOUND) {
+			if (cp_n->currentKey == &cp_n->key1) {
+				/*
+				 * Make sure that the previous key isn't
+				 * overwritten when we fetch the next chunk.
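+				 * (key1 is always the target of the
+				 * next CMP_IGET_RETRY, so the old key
+				 * is saved in key2 and prevKey is
+				 * pointed there for the comparison
+				 * below.)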
+ */ + if ((ret = __bam_compress_set_dbt(dbp, + &cp_n->key2, cp_n->key1.data, + cp_n->key1.size)) != 0) + goto err; + } + + CMP_IGET_RETRY(ret, dbc_n, &cp_n->key1, + &cp_n->compressed, DB_NEXT); + if (ret != 0) + goto err; + + ret = __bamc_start_decompress(dbc_n); + + cp_n->prevKey = &cp_n->key2; + } + + if (ret != 0) + goto err; + + if (t->bt_compare(dbp, cp_n->currentKey, cp_n->prevKey) != 0) + nkeys += 1; + } + +err: + if (ret == DB_NOTFOUND) + ret = 0; + + if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0) + ret = t_ret; + + if (ret == 0) { + if (nkeysp != NULL) + *nkeysp = nkeys; + if (ndatap != NULL) + *ndatap = ndata; + } + + return (ret); +} + +#endif diff --git a/btree/bt_conv.c b/btree/bt_conv.c index 1cb208b..aa14173 100644 --- a/btree/bt_conv.c +++ b/btree/bt_conv.c @@ -1,221 +1,95 @@ /*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. + * See the file LICENSE for redistribution information. * - * This code is derived from software contributed to Berkeley by - * Mike Olson. + * Copyright (c) 1996-2009 Oracle. All rights reserved. * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * $Id$ */ -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_conv.c 8.5 (Berkeley) 8/17/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/param.h> - -#include <stdio.h> +#include "db_config.h" -#include <db.h> -#include "btree.h" - -static void mswap __P((PAGE *)); +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" /* - * __BT_BPGIN, __BT_BPGOUT -- - * Convert host-specific number layout to/from the host-independent - * format stored on disk. + * __bam_pgin -- + * Convert host-specific page layout from the host-independent format + * stored on disk. 
* - * Parameters: - * t: tree - * pg: page number - * h: page to convert + * PUBLIC: int __bam_pgin __P((DB *, db_pgno_t, void *, DBT *)); */ -void -__bt_pgin(t, pg, pp) - void *t; - pgno_t pg; +int +__bam_pgin(dbp, pg, pp, cookie) + DB *dbp; + db_pgno_t pg; void *pp; + DBT *cookie; { + DB_PGINFO *pginfo; PAGE *h; - indx_t i, top; - u_char flags; - char *p; - if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) - return; - if (pg == P_META) { - mswap(pp); - return; - } + pginfo = (DB_PGINFO *)cookie->data; + if (!F_ISSET(pginfo, DB_AM_SWAP)) + return (0); h = pp; - M_32_SWAP(h->pgno); - M_32_SWAP(h->prevpg); - M_32_SWAP(h->nextpg); - M_32_SWAP(h->flags); - M_16_SWAP(h->lower); - M_16_SWAP(h->upper); - - top = NEXTINDEX(h); - if ((h->flags & P_TYPE) == P_BINTERNAL) - for (i = 0; i < top; i++) { - M_16_SWAP(h->linp[i]); - p = (char *)GETBINTERNAL(h, i); - P_32_SWAP(p); - p += sizeof(u_int32_t); - P_32_SWAP(p); - p += sizeof(pgno_t); - if (*(u_char *)p & P_BIGKEY) { - p += sizeof(u_char); - P_32_SWAP(p); - p += sizeof(pgno_t); - P_32_SWAP(p); - } - } - else if ((h->flags & P_TYPE) == P_BLEAF) - for (i = 0; i < top; i++) { - M_16_SWAP(h->linp[i]); - p = (char *)GETBLEAF(h, i); - P_32_SWAP(p); - p += sizeof(u_int32_t); - P_32_SWAP(p); - p += sizeof(u_int32_t); - flags = *(u_char *)p; - if (flags & (P_BIGKEY | P_BIGDATA)) { - p += sizeof(u_char); - if (flags & P_BIGKEY) { - P_32_SWAP(p); - p += sizeof(pgno_t); - P_32_SWAP(p); - } - if (flags & P_BIGDATA) { - p += sizeof(u_int32_t); - P_32_SWAP(p); - p += sizeof(pgno_t); - P_32_SWAP(p); - } - } - } + return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) : + __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 1)); } -void -__bt_pgout(t, pg, pp) - void *t; - pgno_t pg; +/* + * __bam_pgout -- + * Convert host-specific page layout to the host-independent format + * stored on disk. + * + * PUBLIC: int __bam_pgout __P((DB *, db_pgno_t, void *, DBT *)); + */ +int +__bam_pgout(dbp, pg, pp, cookie) + DB *dbp; + db_pgno_t pg; void *pp; + DBT *cookie; { + DB_PGINFO *pginfo; PAGE *h; - indx_t i, top; - u_char flags; - char *p; - if (!F_ISSET(((BTREE *)t), B_NEEDSWAP)) - return; - if (pg == P_META) { - mswap(pp); - return; - } + pginfo = (DB_PGINFO *)cookie->data; + if (!F_ISSET(pginfo, DB_AM_SWAP)) + return (0); h = pp; - top = NEXTINDEX(h); - if ((h->flags & P_TYPE) == P_BINTERNAL) - for (i = 0; i < top; i++) { - p = (char *)GETBINTERNAL(h, i); - P_32_SWAP(p); - p += sizeof(u_int32_t); - P_32_SWAP(p); - p += sizeof(pgno_t); - if (*(u_char *)p & P_BIGKEY) { - p += sizeof(u_char); - P_32_SWAP(p); - p += sizeof(pgno_t); - P_32_SWAP(p); - } - M_16_SWAP(h->linp[i]); - } - else if ((h->flags & P_TYPE) == P_BLEAF) - for (i = 0; i < top; i++) { - p = (char *)GETBLEAF(h, i); - P_32_SWAP(p); - p += sizeof(u_int32_t); - P_32_SWAP(p); - p += sizeof(u_int32_t); - flags = *(u_char *)p; - if (flags & (P_BIGKEY | P_BIGDATA)) { - p += sizeof(u_char); - if (flags & P_BIGKEY) { - P_32_SWAP(p); - p += sizeof(pgno_t); - P_32_SWAP(p); - } - if (flags & P_BIGDATA) { - p += sizeof(u_int32_t); - P_32_SWAP(p); - p += sizeof(pgno_t); - P_32_SWAP(p); - } - } - M_16_SWAP(h->linp[i]); - } - - M_32_SWAP(h->pgno); - M_32_SWAP(h->prevpg); - M_32_SWAP(h->nextpg); - M_32_SWAP(h->flags); - M_16_SWAP(h->lower); - M_16_SWAP(h->upper); + return (TYPE(h) == P_BTREEMETA ? __bam_mswap(dbp->env, pp) : + __db_byteswap(dbp, pg, pp, pginfo->db_pagesize, 0)); } /* - * MSWAP -- Actually swap the bytes on the meta page. + * __bam_mswap -- + * Swap the bytes on the btree metadata page. 
* - * Parameters: - * p: page to convert + * PUBLIC: int __bam_mswap __P((ENV *, PAGE *)); */ -static void -mswap(pg) +int +__bam_mswap(env, pg) + ENV *env; PAGE *pg; { - char *p; + u_int8_t *p; + + COMPQUIET(env, NULL); + + __db_metaswap(pg); + p = (u_int8_t *)pg + sizeof(DBMETA); + + p += sizeof(u_int32_t); /* unused */ + SWAP32(p); /* minkey */ + SWAP32(p); /* re_len */ + SWAP32(p); /* re_pad */ + SWAP32(p); /* root */ + p += 92 * sizeof(u_int32_t); /* unused */ + SWAP32(p); /* crypto_magic */ - p = (char *)pg; - P_32_SWAP(p); /* magic */ - p += sizeof(u_int32_t); - P_32_SWAP(p); /* version */ - p += sizeof(u_int32_t); - P_32_SWAP(p); /* psize */ - p += sizeof(u_int32_t); - P_32_SWAP(p); /* free */ - p += sizeof(u_int32_t); - P_32_SWAP(p); /* nrecs */ - p += sizeof(u_int32_t); - P_32_SWAP(p); /* flags */ - p += sizeof(u_int32_t); + return (0); } diff --git a/btree/bt_curadj.c b/btree/bt_curadj.c new file mode 100644 index 0000000..3f6077d --- /dev/null +++ b/btree/bt_curadj.c @@ -0,0 +1,620 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/mp.h" + +static int __bam_opd_cursor __P((DB *, DBC *, db_pgno_t, u_int32_t, u_int32_t)); + +/* + * Cursor adjustments are logged if they are for subtransactions. This is + * because it's possible for a subtransaction to adjust cursors which will + * still be active after the subtransaction aborts, and so which must be + * restored to their previous locations. Cursors that can be both affected + * by our cursor adjustments and active after our transaction aborts can + * only be found in our parent transaction -- cursors in other transactions, + * including other child transactions of our parent, must have conflicting + * locker IDs, and so cannot be affected by adjustments in this transaction. + */ + +/* + * __bam_ca_delete -- + * Update the cursors when items are deleted and when already deleted + * items are overwritten. Return the number of relevant cursors found. + * + * PUBLIC: int __bam_ca_delete __P((DB *, db_pgno_t, u_int32_t, int, int *)); + */ +int +__bam_ca_delete(dbp, pgno, indx, delete, countp) + DB *dbp; + db_pgno_t pgno; + u_int32_t indx; + int delete, *countp; +{ + BTREE_CURSOR *cp; + DB *ldbp; + DBC *dbc; + ENV *env; + int count; /* !!!: Has to contain max number of cursors. */ + + env = dbp->env; + + /* + * Adjust the cursors. We have the page write locked, so the + * only other cursors that can be pointing at a page are + * those in the same thread of control. Unfortunately, we don't + * know that they're using the same DB handle, so traverse + * all matching DB handles in the same ENV, then all cursors + * on each matching DB handle. + * + * Each cursor is single-threaded, so we only need to lock the + * list of DBs and then the list of cursors in each DB. + */ + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (count = 0; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + cp = (BTREE_CURSOR *)dbc->internal; + if (cp->pgno == pgno && cp->indx == indx && + !MVCC_SKIP_CURADJ(dbc, pgno)) { + /* + * [#8032] This assert is checking + * for possible race conditions where we + * hold a cursor position without a lock. 
+				 * Unfortunately, there are paths in the
+				 * Btree code that do not satisfy these
+				 * conditions. None of them are known to
+				 * be a problem, but this assert should
+				 * be re-activated when the Btree stack
+				 * code is re-written.
+				DB_ASSERT(env, !STD_LOCKING(dbc) ||
+				    cp->lock_mode != DB_LOCK_NG);
+				*/
+				if (delete) {
+					F_SET(cp, C_DELETED);
+					/*
+					 * If we're deleting the item, we can't
+					 * keep a streaming offset cached.
+					 */
+					cp->stream_start_pgno = PGNO_INVALID;
+				} else
+					F_CLR(cp, C_DELETED);
+
+#ifdef HAVE_COMPRESSION
+				/*
+				 * We also set the C_COMPRESS_MODIFIED flag,
+				 * which prompts the compression code to look
+				 * for its current entry again if it needs to.
+				 *
+				 * The flag isn't cleared, because the
+				 * compression code still needs to do that even
+				 * for an entry that becomes undeleted.
+				 *
+				 * This flag also needs to be set if an entry is
+				 * updated, but since the compression code
+				 * always deletes before an update, setting it
+				 * here is sufficient.
+				 */
+				F_SET(cp, C_COMPRESS_MODIFIED);
+#endif
+
+				++count;
+			}
+		}
+		MUTEX_UNLOCK(env, dbp->mutex);
+	}
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+
+	if (countp != NULL)
+		*countp = count;
+	return (0);
+}
+
+/*
+ * __ram_ca_delete --
+ *	Return whether any relevant cursors were found.
+ *
+ * PUBLIC: int __ram_ca_delete __P((DB *, db_pgno_t, int *));
+ */
+int
+__ram_ca_delete(dbp, root_pgno, foundp)
+	DB *dbp;
+	db_pgno_t root_pgno;
+	int *foundp;
+{
+	DB *ldbp;
+	DBC *dbc;
+	ENV *env;
+	int found;
+
+	env = dbp->env;
+
+	/*
+	 * Review the cursors. See the comment in __bam_ca_delete().
+	 */
+	MUTEX_LOCK(env, env->mtx_dblist);
+	FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+	for (found = 0;
+	    found == 0 && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+		MUTEX_LOCK(env, dbp->mutex);
+		TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
+			if (dbc->internal->root == root_pgno &&
+			    !MVCC_SKIP_CURADJ(dbc, root_pgno)) {
+				found = 1;
+				break;
+			}
+		MUTEX_UNLOCK(env, dbp->mutex);
+	}
+	MUTEX_UNLOCK(env, env->mtx_dblist);
+
+	*foundp = found;
+	return (0);
+}
+
+/*
+ * __bam_ca_di --
+ *	Adjust the cursors during a delete or insert.
+ *
+ * PUBLIC: int __bam_ca_di __P((DBC *, db_pgno_t, u_int32_t, int));
+ */
+int
+__bam_ca_di(my_dbc, pgno, indx, adjust)
+	DBC *my_dbc;
+	db_pgno_t pgno;
+	u_int32_t indx;
+	int adjust;
+{
+	DB *dbp, *ldbp;
+	DBC *dbc;
+	DBC_INTERNAL *cp;
+	DB_LSN lsn;
+	DB_TXN *my_txn;
+	ENV *env;
+	int found, ret;
+
+	dbp = my_dbc->dbp;
+	env = dbp->env;
+
+	my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL;
+
+	/*
+	 * Adjust the cursors. See the comment in __bam_ca_delete().
+	 */
+	MUTEX_LOCK(env, env->mtx_dblist);
+	FIND_FIRST_DB_MATCH(env, dbp, ldbp);
+	for (found = 0;
+	    ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
+	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
+		MUTEX_LOCK(env, dbp->mutex);
+		TAILQ_FOREACH(dbc, &ldbp->active_queue, links) {
+			if (dbc->dbtype == DB_RECNO)
+				continue;
+			cp = dbc->internal;
+			if (cp->pgno == pgno && cp->indx >= indx &&
+			    (dbc == my_dbc || !MVCC_SKIP_CURADJ(dbc, pgno))) {
+				/* Cursor indices should never be negative.
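+				   (indx == 0 is allowed only when the
+				   adjustment is positive.)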
*/ + DB_ASSERT(env, cp->indx != 0 || adjust > 0); + /* [#8032] + DB_ASSERT(env, !STD_LOCKING(dbc) || + cp->lock_mode != DB_LOCK_NG); + */ + cp->indx += adjust; + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + } + } + MUTEX_UNLOCK(env, dbp->mutex); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, my_dbc->txn, &lsn, 0, + DB_CA_DI, pgno, 0, 0, (u_int32_t)adjust, indx, 0)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_opd_cursor -- create a new opd cursor. + */ +static int +__bam_opd_cursor(dbp, dbc, first, tpgno, ti) + DB *dbp; + DBC *dbc; + db_pgno_t tpgno; + u_int32_t first, ti; +{ + BTREE_CURSOR *cp, *orig_cp; + DBC *dbc_nopd; + int ret; + + orig_cp = (BTREE_CURSOR *)dbc->internal; + dbc_nopd = NULL; + + /* + * Allocate a new cursor and create the stack. If duplicates + * are sorted, we've just created an off-page duplicate Btree. + * If duplicates aren't sorted, we've just created a Recno tree. + * + * Note that in order to get here at all, there shouldn't be + * an old off-page dup cursor--to augment the checking dbc_newopd + * will do, assert this. + */ + DB_ASSERT(dbp->env, orig_cp->opd == NULL); + if ((ret = __dbc_newopd(dbc, tpgno, orig_cp->opd, &dbc_nopd)) != 0) + return (ret); + + cp = (BTREE_CURSOR *)dbc_nopd->internal; + cp->pgno = tpgno; + cp->indx = ti; + + if (dbp->dup_compare == NULL) { + /* + * Converting to off-page Recno trees is tricky. The + * record number for the cursor is the index + 1 (to + * convert to 1-based record numbers). + */ + cp->recno = ti + 1; + } + + /* + * Transfer the deleted flag from the top-level cursor to the + * created one. + */ + if (F_ISSET(orig_cp, C_DELETED)) { + F_SET(cp, C_DELETED); + F_CLR(orig_cp, C_DELETED); + } + + /* Stack the cursors and reset the initial cursor's index. */ + orig_cp->opd = dbc_nopd; + orig_cp->indx = first; + return (0); +} + +/* + * __bam_ca_dup -- + * Adjust the cursors when moving items from a leaf page to a duplicates + * page. + * + * PUBLIC: int __bam_ca_dup __P((DBC *, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +int +__bam_ca_dup(my_dbc, first, fpgno, fi, tpgno, ti) + DBC *my_dbc; + db_pgno_t fpgno, tpgno; + u_int32_t first, fi, ti; +{ + BTREE_CURSOR *orig_cp; + DB *dbp, *ldbp; + DBC *dbc; + DB_LSN lsn; + DB_TXN *my_txn; + ENV *env; + int found, ret, t_ret; + + dbp = my_dbc->dbp; + env = dbp->env; + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + ret = 0; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (found = 0; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { +loop: MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + /* Find cursors pointing to this record. */ + orig_cp = (BTREE_CURSOR *)dbc->internal; + if (orig_cp->pgno != fpgno || orig_cp->indx != fi || + MVCC_SKIP_CURADJ(dbc, fpgno)) + continue; + + /* + * Since we rescan the list see if this is already + * converted. + */ + if (orig_cp->opd != NULL) + continue; + + MUTEX_UNLOCK(env, dbp->mutex); + /* [#8032] + DB_ASSERT(env, !STD_LOCKING(dbc) || + orig_cp->lock_mode != DB_LOCK_NG); + */ + if ((ret = __bam_opd_cursor(dbp, + dbc, first, tpgno, ti)) != 0) + goto err; + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + /* We released the mutex to get a cursor, start over. 
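A minimal standalone sketch of the adjustment rule __bam_ca_di() applies above: any cursor on the affected page whose index is at or past the changed slot slides by the insert/delete width. The toy types are illustrative, not the real DBC layout.

#include <stddef.h>
#include <stdio.h>

struct toy_cursor { unsigned pgno, indx; };

static void
toy_ca_di(struct toy_cursor *c,
    size_t n, unsigned pgno, unsigned indx, int adjust)
{
        size_t i;

        for (i = 0; i < n; i++)
                if (c[i].pgno == pgno && c[i].indx >= indx)
                        c[i].indx += adjust;
}

int
main(void)
{
        struct toy_cursor curs[] = { { 7, 0 }, { 7, 4 }, { 9, 4 } };
        int i;

        /* Insert two slots on page 7 at index 2. */
        toy_ca_di(curs, 3, 7, 2, 2);
        for (i = 0; i < 3; i++)
                printf("pgno %u indx %u\n", curs[i].pgno, curs[i].indx);
        /* Prints 7/0 (before the slot), 7/6 (shifted), 9/4 (other page). */
        return (0);
}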
*/ + goto loop; + } + MUTEX_UNLOCK(env, dbp->mutex); + } +err: MUTEX_UNLOCK(env, env->mtx_dblist); + + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((t_ret = __bam_curadj_log(dbp, my_dbc->txn, + &lsn, 0, DB_CA_DUP, fpgno, tpgno, 0, first, fi, ti)) != 0 && + ret == 0) + ret = t_ret; + } + + return (ret); +} + +/* + * __bam_ca_undodup -- + * Adjust the cursors when returning items to a leaf page + * from a duplicate page. + * Called only during undo processing. + * + * PUBLIC: int __bam_ca_undodup __P((DB *, + * PUBLIC: u_int32_t, db_pgno_t, u_int32_t, u_int32_t)); + */ +int +__bam_ca_undodup(dbp, first, fpgno, fi, ti) + DB *dbp; + db_pgno_t fpgno; + u_int32_t first, fi, ti; +{ + BTREE_CURSOR *orig_cp; + DB *ldbp; + DBC *dbc; + ENV *env; + int ret; + + env = dbp->env; + ret = 0; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { +loop: MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + orig_cp = (BTREE_CURSOR *)dbc->internal; + + /* + * A note on the orig_cp->opd != NULL requirement here: + * it's possible that there's a cursor that refers to + * the same duplicate set, but which has no opd cursor, + * because it refers to a different item and we took + * care of it while processing a previous record. + */ + if (orig_cp->pgno != fpgno || + orig_cp->indx != first || + orig_cp->opd == NULL || ((BTREE_CURSOR *) + orig_cp->opd->internal)->indx != ti || + MVCC_SKIP_CURADJ(dbc, fpgno)) + continue; + MUTEX_UNLOCK(env, dbp->mutex); + if ((ret = __dbc_close(orig_cp->opd)) != 0) + goto err; + orig_cp->opd = NULL; + orig_cp->indx = fi; + /* + * We released the mutex to free a cursor, + * start over. + */ + goto loop; + } + MUTEX_UNLOCK(env, dbp->mutex); + } +err: MUTEX_UNLOCK(env, env->mtx_dblist); + + return (ret); +} + +/* + * __bam_ca_rsplit -- + * Adjust the cursors when doing reverse splits. + * + * PUBLIC: int __bam_ca_rsplit __P((DBC *, db_pgno_t, db_pgno_t)); + */ +int +__bam_ca_rsplit(my_dbc, fpgno, tpgno) + DBC* my_dbc; + db_pgno_t fpgno, tpgno; +{ + DB *dbp, *ldbp; + DBC *dbc; + DB_LSN lsn; + DB_TXN *my_txn; + ENV *env; + int found, ret; + + dbp = my_dbc->dbp; + env = dbp->env; + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + */ + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (found = 0; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + if (dbc->dbtype == DB_RECNO) + continue; + if (dbc->internal->pgno == fpgno && + !MVCC_SKIP_CURADJ(dbc, fpgno)) { + dbc->internal->pgno = tpgno; + /* [#8032] + DB_ASSERT(env, !STD_LOCKING(dbc) || + dbc->internal->lock_mode != DB_LOCK_NG); + */ + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + } + } + MUTEX_UNLOCK(env, dbp->mutex); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, my_dbc->txn, + &lsn, 0, DB_CA_RSPLIT, fpgno, tpgno, 0, 0, 0, 0)) != 0) + return (ret); + } + return (0); +} + +/* + * __bam_ca_split -- + * Adjust the cursors when splitting a page. 
+ * + * PUBLIC: int __bam_ca_split __P((DBC *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t, int)); + */ +int +__bam_ca_split(my_dbc, ppgno, lpgno, rpgno, split_indx, cleft) + DBC *my_dbc; + db_pgno_t ppgno, lpgno, rpgno; + u_int32_t split_indx; + int cleft; +{ + DB *dbp, *ldbp; + DBC *dbc; + DBC_INTERNAL *cp; + DB_LSN lsn; + DB_TXN *my_txn; + ENV *env; + int found, ret; + + dbp = my_dbc->dbp; + env = dbp->env; + my_txn = IS_SUBTRANSACTION(my_dbc->txn) ? my_dbc->txn : NULL; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If splitting the page that a cursor was on, the cursor has to be + * adjusted to point to the same record as before the split. Most + * of the time we don't adjust pointers to the left page, because + * we're going to copy its contents back over the original page. If + * the cursor is on the right page, it is decremented by the number of + * records split to the left page. + */ + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (found = 0; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + if (dbc->dbtype == DB_RECNO) + continue; + cp = dbc->internal; + if (cp->pgno == ppgno && + !MVCC_SKIP_CURADJ(dbc, ppgno)) { + /* [#8032] + DB_ASSERT(env, !STD_LOCKING(dbc) || + cp->lock_mode != DB_LOCK_NG); + */ + if (my_txn != NULL && dbc->txn != my_txn) + found = 1; + if (cp->indx < split_indx) { + if (cleft) + cp->pgno = lpgno; + } else { + cp->pgno = rpgno; + cp->indx -= split_indx; + } + } + } + MUTEX_UNLOCK(env, dbp->mutex); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + if (found != 0 && DBC_LOGGING(my_dbc)) { + if ((ret = __bam_curadj_log(dbp, + my_dbc->txn, &lsn, 0, DB_CA_SPLIT, ppgno, rpgno, + cleft ? lpgno : PGNO_INVALID, 0, split_indx, 0)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_ca_undosplit -- + * Adjust the cursors when undoing a split of a page. + * If we grew a level we will execute this for both the + * left and the right pages. + * Called only during undo processing. + * + * PUBLIC: int __bam_ca_undosplit __P((DB *, + * PUBLIC: db_pgno_t, db_pgno_t, db_pgno_t, u_int32_t)); + */ +int +__bam_ca_undosplit(dbp, frompgno, topgno, lpgno, split_indx) + DB *dbp; + db_pgno_t frompgno, topgno, lpgno; + u_int32_t split_indx; +{ + DB *ldbp; + DBC *dbc; + DBC_INTERNAL *cp; + ENV *env; + + env = dbp->env; + + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * When backing out a split, we move the cursor back + * to the original offset and bump it by the split_indx. + */ + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + if (dbc->dbtype == DB_RECNO) + continue; + cp = dbc->internal; + if (cp->pgno == topgno && + !MVCC_SKIP_CURADJ(dbc, topgno)) { + cp->pgno = frompgno; + cp->indx += split_indx; + } else if (cp->pgno == lpgno && + !MVCC_SKIP_CURADJ(dbc, lpgno)) + cp->pgno = frompgno; + } + MUTEX_UNLOCK(env, dbp->mutex); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + return (0); +} diff --git a/btree/bt_cursor.c b/btree/bt_cursor.c new file mode 100644 index 0000000..b0d6f7d --- /dev/null +++ b/btree/bt_cursor.c @@ -0,0 +1,3055 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. 
All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" + +static int __bam_bulk __P((DBC *, DBT *, u_int32_t)); +static int __bamc_close __P((DBC *, db_pgno_t, int *)); +static int __bamc_del __P((DBC *, u_int32_t)); +static int __bamc_destroy __P((DBC *)); +static int __bamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __bamc_getstack __P((DBC *)); +static int __bamc_next __P((DBC *, int, int)); +static int __bamc_physdel __P((DBC *)); +static int __bamc_prev __P((DBC *)); +static int __bamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __bamc_search __P((DBC *, + db_pgno_t, const DBT *, u_int32_t, int *)); +static int __bamc_writelock __P((DBC *)); +static int __bam_getboth_finddatum __P((DBC *, DBT *, u_int32_t)); +static int __bam_getbothc __P((DBC *, DBT *)); +static int __bam_get_prev __P((DBC *)); +static int __bam_isopd __P((DBC *, db_pgno_t *)); +#ifdef HAVE_COMPRESSION +static int __bam_getlte __P((DBC *, DBT *, DBT *)); +#endif + +/* + * Acquire a new page/lock. If we hold a page/lock, discard the page, and + * lock-couple the lock. + * + * !!! + * We have to handle both where we have a lock to lock-couple and where we + * don't -- we don't duplicate locks when we duplicate cursors if we are + * running in a transaction environment as there's no point if locks are + * never discarded. This means that the cursor may or may not hold a lock. + * In the case where we are descending the tree we always want to unlock + * the held interior page so we use ACQUIRE_COUPLE. + */ +#undef ACQUIRE +#define ACQUIRE(dbc, mode, lpgno, lock, fpgno, pagep, flags, ret) do { \ + DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \ + if ((pagep) != NULL) { \ + ret = __memp_fput(__mpf, \ + (dbc)->thread_info, pagep, dbc->priority); \ + pagep = NULL; \ + } else \ + ret = 0; \ + if ((ret) == 0 && STD_LOCKING(dbc)) \ + ret = __db_lget( \ + dbc, LCK_COUPLE, lpgno, mode, flags, &(lock)); \ + if ((ret) == 0) \ + ret = __memp_fget(__mpf, &(fpgno), \ + (dbc)->thread_info, (dbc)->txn, 0, &(pagep)); \ +} while (0) + +/* Acquire a new page/lock for a cursor. */ +#undef ACQUIRE_CUR +#define ACQUIRE_CUR(dbc, mode, p, flags, ret) do { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + if (p != __cp->pgno) \ + __cp->pgno = PGNO_INVALID; \ + ACQUIRE(dbc, mode, p, __cp->lock, p, __cp->page, flags, ret); \ + if ((ret) == 0) { \ + __cp->pgno = p; \ + __cp->lock_mode = (mode); \ + } \ +} while (0) + +/* + * Acquire a write lock if we don't already have one. + * + * !!! + * See ACQUIRE macro on why we handle cursors that don't have locks. + */ +#undef ACQUIRE_WRITE_LOCK +#define ACQUIRE_WRITE_LOCK(dbc, ret) do { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \ + int __get_page = 0; \ + ret = 0; \ + if (STD_LOCKING(dbc) && __cp->lock_mode != DB_LOCK_WRITE) { \ + if (__cp->page != NULL) { \ + (ret) = __memp_fput(__mpf, (dbc)->thread_info, \ + __cp->page, (dbc)->priority); \ + __cp->page = NULL; \ + __get_page = 1; \ + if ((ret) !=0) \ + break; \ + } \ + if (((ret) = __db_lget((dbc), \ + LOCK_ISSET(__cp->lock) ? 
LCK_COUPLE : 0, \ + __cp->pgno, DB_LOCK_WRITE, 0, &__cp->lock)) != 0) \ + break; \ + __cp->lock_mode = DB_LOCK_WRITE; \ + if (__get_page == 0) \ + break; \ + (ret) = __memp_fget(__mpf, &__cp->pgno, \ + (dbc)->thread_info, \ + (dbc)->txn, DB_MPOOL_DIRTY, &__cp->page); \ + } \ +} while (0) + +/* Discard the current page/lock for a cursor. */ +#undef DISCARD_CUR +#define DISCARD_CUR(dbc, ret) do { \ + BTREE_CURSOR *__cp = (BTREE_CURSOR *)(dbc)->internal; \ + DB_MPOOLFILE *__mpf = (dbc)->dbp->mpf; \ + int __t_ret; \ + if ((__cp->page) != NULL) { \ + __t_ret = __memp_fput(__mpf, \ + (dbc)->thread_info, __cp->page, dbc->priority);\ + __cp->page = NULL; \ + } else \ + __t_ret = 0; \ + if (__t_ret != 0 && (ret) == 0) \ + ret = __t_ret; \ + __t_ret = __TLPUT((dbc), __cp->lock); \ + if (__t_ret != 0 && (ret) == 0) \ + ret = __t_ret; \ + if ((ret) == 0 && !LOCK_ISSET(__cp->lock)) \ + __cp->lock_mode = DB_LOCK_NG; \ + __cp->stream_start_pgno = PGNO_INVALID; \ +} while (0) + +/* If on-page item is a deleted record. */ +#undef IS_DELETED +#define IS_DELETED(dbp, page, indx) \ + B_DISSET(GET_BKEYDATA(dbp, page, \ + (indx) + (TYPE(page) == P_LBTREE ? O_INDX : 0))->type) +#undef IS_CUR_DELETED +#define IS_CUR_DELETED(dbc) \ + IS_DELETED((dbc)->dbp, (dbc)->internal->page, (dbc)->internal->indx) + +/* + * Test to see if two cursors could point to duplicates of the same key. + * In the case of off-page duplicates they are the same, as the cursors + * will be in the same off-page duplicate tree. In the case of on-page + * duplicates, the key index offsets must be the same. For the last test, + * as the original cursor may not have a valid page pointer, we use the + * current cursor's. + */ +#undef IS_DUPLICATE +#define IS_DUPLICATE(dbc, i1, i2) \ + (P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i1] == \ + P_INP((dbc)->dbp,((PAGE *)(dbc)->internal->page))[i2]) +#undef IS_CUR_DUPLICATE +#define IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx) \ + (F_ISSET(dbc, DBC_OPD) || \ + (orig_pgno == (dbc)->internal->pgno && \ + IS_DUPLICATE(dbc, (dbc)->internal->indx, orig_indx))) + +/* + * __bamc_init -- + * Initialize the access private portion of a cursor. + * + * PUBLIC: int __bamc_init __P((DBC *, DBTYPE)); + */ +int +__bamc_init(dbc, dbtype) + DBC *dbc; + DBTYPE dbtype; +{ + ENV *env; + int ret; +#ifdef HAVE_COMPRESSION + BTREE_CURSOR *cp; +#endif + + env = dbc->env; + + /* Allocate/initialize the internal structure. */ + if (dbc->internal == NULL) { + if ((ret = __os_calloc( + env, 1, sizeof(BTREE_CURSOR), &dbc->internal)) != 0) + return (ret); + +#ifdef HAVE_COMPRESSION + cp = (BTREE_CURSOR*)dbc->internal; + cp->compressed.flags = DB_DBT_USERMEM; + cp->key1.flags = DB_DBT_USERMEM; + cp->key2.flags = DB_DBT_USERMEM; + cp->data1.flags = DB_DBT_USERMEM; + cp->data2.flags = DB_DBT_USERMEM; + cp->del_key.flags = DB_DBT_USERMEM; + cp->del_data.flags = DB_DBT_USERMEM; +#endif + } + + /* Initialize methods. 
*/ + dbc->close = dbc->c_close = __dbc_close_pp; + dbc->cmp = __dbc_cmp_pp; + dbc->count = dbc->c_count = __dbc_count_pp; + dbc->del = dbc->c_del = __dbc_del_pp; + dbc->dup = dbc->c_dup = __dbc_dup_pp; + dbc->get = dbc->c_get = __dbc_get_pp; + dbc->pget = dbc->c_pget = __dbc_pget_pp; + dbc->put = dbc->c_put = __dbc_put_pp; + if (dbtype == DB_BTREE) { + dbc->am_bulk = __bam_bulk; + dbc->am_close = __bamc_close; + dbc->am_del = __bamc_del; + dbc->am_destroy = __bamc_destroy; + dbc->am_get = __bamc_get; + dbc->am_put = __bamc_put; + dbc->am_writelock = __bamc_writelock; + } else { + dbc->am_bulk = __bam_bulk; + dbc->am_close = __bamc_close; + dbc->am_del = __ramc_del; + dbc->am_destroy = __bamc_destroy; + dbc->am_get = __ramc_get; + dbc->am_put = __ramc_put; + dbc->am_writelock = __bamc_writelock; + } + + return (0); +} + +/* + * __bamc_refresh + * Set things up properly for cursor re-use. + * + * PUBLIC: int __bamc_refresh __P((DBC *)); + */ +int +__bamc_refresh(dbc) + DBC *dbc; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + + dbp = dbc->dbp; + t = dbp->bt_internal; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * If our caller set the root page number, it's because the root was + * known. This is always the case for off page dup cursors. Else, + * pull it out of our internal information. + */ + if (cp->root == PGNO_INVALID) + cp->root = t->bt_root; + + LOCK_INIT(cp->lock); + cp->lock_mode = DB_LOCK_NG; + + if (cp->sp == NULL) { + cp->sp = cp->stack; + cp->esp = cp->stack + sizeof(cp->stack) / sizeof(cp->stack[0]); + } + BT_STK_CLR(cp); + +#ifdef HAVE_COMPRESSION + /* Initialize compression */ + cp->prevKey = 0; + cp->prevData = 0; + cp->currentKey = 0; + cp->currentData = 0; + cp->compcursor = 0; + cp->compend = 0; + cp->prevcursor = 0; + cp->prev2cursor = 0; +#endif + + /* + * The btree leaf page data structures require that two key/data pairs + * (or four items) fit on a page, but other than that there's no fixed + * requirement. The btree off-page duplicates only require two items, + * to be exact, but requiring four for them as well seems reasonable. + * + * Recno uses the btree bt_ovflsize value -- it's close enough. + */ + cp->ovflsize = B_MINKEY_TO_OVFLSIZE( + dbp, F_ISSET(dbc, DBC_OPD) ? 2 : t->bt_minkey, dbp->pgsize); + + cp->recno = RECNO_OOB; + cp->order = INVALID_ORDER; + cp->flags = 0; + + /* Initialize for record numbers. */ + if (F_ISSET(dbc, DBC_OPD) || + dbc->dbtype == DB_RECNO || F_ISSET(dbp, DB_AM_RECNUM)) { + F_SET(cp, C_RECNUM); + + /* + * All btrees that support record numbers, optionally standard + * recno trees, and all off-page duplicate recno trees have + * mutable record numbers. + */ + if ((F_ISSET(dbc, DBC_OPD) && dbc->dbtype == DB_RECNO) || + F_ISSET(dbp, DB_AM_RECNUM | DB_AM_RENUMBER)) + F_SET(cp, C_RENUMBER); + } + + return (0); +} + +/* + * __bamc_close -- + * Close down the cursor. + */ +static int +__bamc_close(dbc, root_pgno, rmroot) + DBC *dbc; + db_pgno_t root_pgno; + int *rmroot; +{ + BTREE_CURSOR *cp, *cp_opd, *cp_c; + DB *dbp; + DBC *dbc_opd, *dbc_c; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + int cdb_lock, count, ret; + + dbp = dbc->dbp; + env = dbp->env; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + cp_opd = (dbc_opd = cp->opd) == NULL ? + NULL : (BTREE_CURSOR *)dbc_opd->internal; + cdb_lock = ret = 0; + + /* + * There are 3 ways this function is called: + * + * 1. Closing a primary cursor: we get called with a pointer to a + * primary cursor that has a NULL opd field. 
This happens when + * closing a btree/recno database cursor without an associated + * off-page duplicate tree. + * + * 2. Closing a primary and an off-page duplicate cursor stack: we + * get called with a pointer to the primary cursor which has a + * non-NULL opd field. This happens when closing a btree cursor + * into a database with an associated off-page btree/recno duplicate + * tree. (It can't be a primary recno database, recno databases + * don't support duplicates.) + * + * 3. Closing an off-page duplicate cursor stack: we get called with + * a pointer to the off-page duplicate cursor. This happens when + * closing a non-btree database that has an associated off-page + * btree/recno duplicate tree or for a btree database when the + * opd tree is not empty (root_pgno == PGNO_INVALID). + * + * If either the primary or off-page duplicate cursor deleted a btree + * key/data pair, check to see if the item is still referenced by a + * different cursor. If it is, confirm that cursor's delete flag is + * set and leave it to that cursor to do the delete. + * + * NB: The test for == 0 below is correct. Our caller already removed + * our cursor argument from the active queue; we won't find it when we + * search the queue in __bam_ca_delete(). + * NB: It can't be true that both the primary and off-page duplicate + * cursors have deleted a btree key/data pair. Either the primary + * cursor may have deleted an item and there's no off-page duplicate + * cursor, or there's an off-page duplicate cursor and it may have + * deleted an item. + * + * Primary recno databases aren't an issue here. Recno keys are either + * deleted immediately or never deleted, and do not have to be handled + * here. + * + * Off-page duplicate recno databases are an issue here; cases #2 and + * #3 above can both be off-page recno databases. The problem is the + * same as the final problem for off-page duplicate btree databases. + * If we no longer need the off-page duplicate tree, we want to remove + * it. For off-page duplicate btrees, we are done with the tree when + * we delete the last item it contains, i.e., there can be no further + * references to it when it's empty. For off-page duplicate recnos, + * we remove items from the tree as the application calls the remove + * function, so we are done with the tree when we close the last cursor + * that references it. + * + * We optionally take the root page number from our caller. If the + * primary database is a btree, we can get it ourselves because dbc + * is the primary cursor. If the primary database is not a btree, + * the problem is that we may be dealing with a stack of pages. The + * cursor we're using to do the delete points at the bottom of that + * stack and we need the top of the stack. + */ + if (F_ISSET(cp, C_DELETED)) { + dbc_c = dbc; + switch (dbc->dbtype) { + case DB_BTREE: /* Case #1, #3. */ + if ((ret = __bam_ca_delete( + dbp, cp->pgno, cp->indx, 1, &count)) != 0) + goto err; + if (count == 0) + goto lock; + goto done; + case DB_RECNO: + if (!F_ISSET(dbc, DBC_OPD)) /* Case #1. */ + goto done; + /* Case #3. */ + if ((ret = __ram_ca_delete(dbp, cp->root, &count)) != 0) + goto err; + if (count == 0) + goto lock; + goto done; + case DB_HASH: + case DB_QUEUE: + case DB_UNKNOWN: + default: + ret = __db_unknown_type( + env, "DbCursor.close", dbc->dbtype); + goto err; + } + } + + if (dbc_opd == NULL) + goto done; + + if (F_ISSET(cp_opd, C_DELETED)) { /* Case #2. */ + /* + * We will not have been provided a root page number. 
Acquire + * one from the primary database. + */ + if ((h = cp->page) == NULL && (ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + goto err; + root_pgno = GET_BOVERFLOW(dbp, h, cp->indx + O_INDX)->pgno; + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + cp->page = NULL; + + dbc_c = dbc_opd; + switch (dbc_opd->dbtype) { + case DB_BTREE: + if ((ret = __bam_ca_delete( + dbp, cp_opd->pgno, cp_opd->indx, 1, &count)) != 0) + goto err; + if (count == 0) + goto lock; + goto done; + case DB_RECNO: + if ((ret = + __ram_ca_delete(dbp, cp_opd->root, &count)) != 0) + goto err; + if (count == 0) + goto lock; + goto done; + case DB_HASH: + case DB_QUEUE: + case DB_UNKNOWN: + default: + ret = __db_unknown_type( + env, "DbCursor.close", dbc->dbtype); + goto err; + } + } + goto done; + +lock: cp_c = (BTREE_CURSOR *)dbc_c->internal; + + /* + * If this is CDB, upgrade the lock if necessary. While we acquired + * the write lock to logically delete the record, we released it when + * we returned from that call, and so may not be holding a write lock + * at the moment. + */ + if (CDB_LOCKING(env)) { + if (F_ISSET(dbc, DBC_WRITECURSOR)) { + if ((ret = __lock_get(env, + dbc->locker, DB_LOCK_UPGRADE, &dbc->lock_dbt, + DB_LOCK_WRITE, &dbc->mylock)) != 0) + goto err; + cdb_lock = 1; + } + goto delete; + } + + /* + * The variable dbc_c has been initialized to reference the cursor in + * which we're going to do the delete. Initialize the cursor's lock + * structures as necessary. + * + * First, we may not need to acquire any locks. If we're in case #3, + * that is, the primary database isn't a btree database, our caller + * is responsible for acquiring any necessary locks before calling us. + */ + if (F_ISSET(dbc, DBC_OPD)) + goto delete; + + /* + * Otherwise, acquire a write lock on the primary database's page. + * + * Lock the primary database page, regardless of whether we're deleting + * an item on a primary database page or an off-page duplicates page. + * + * If the cursor that did the initial logical deletion (and had a write + * lock) is not the same cursor doing the physical deletion (which may + * have only ever had a read lock on the item), we need to upgrade to a + * write lock. The confusion comes as follows: + * + * C1 created, acquires item read lock + * C2 dup C1, create C2, also has item read lock. + * C1 acquire write lock, delete item + * C1 close + * C2 close, needs a write lock to physically delete item. + * + * If we're in a TXN, we know that C2 will be able to acquire the write + * lock, because no locker other than the one shared by C1 and C2 can + * acquire a write lock -- the original write lock C1 acquired was never + * discarded. + * + * If we're not in a TXN, it's nastier. Other cursors might acquire + * read locks on the item after C1 closed, discarding its write lock, + * and such locks would prevent C2 from acquiring a read lock. That's + * OK, though, we'll simply wait until we can acquire a write lock, or + * we'll deadlock. (Which better not happen, since we're not in a TXN.) + * + * There are similar scenarios with dirty reads, where the cursor may + * have downgraded its write lock to a was-write lock. + */ + if (STD_LOCKING(dbc)) + if ((ret = __db_lget(dbc, + LCK_COUPLE, cp->pgno, DB_LOCK_WRITE, 0, &cp->lock)) != 0) + goto err; + +delete: /* + * If the delete occurred in a Btree, we're going to look at the page + * to see if the item has to be physically deleted. 
Otherwise, we do + * not need the actual page (and it may not even exist, it might have + * been truncated from the file after an allocation aborted). + * + * Delete the on-page physical item referenced by the cursor. + */ + if (dbc_c->dbtype == DB_BTREE) { + if ((ret = __memp_fget(mpf, &cp_c->pgno, dbc->thread_info, + dbc->txn, DB_MPOOL_DIRTY, &cp_c->page)) != 0) + goto err; + if ((ret = __bamc_physdel(dbc_c)) != 0) + goto err; + } + + /* + * If we're not working in an off-page duplicate tree, then we're + * done. + */ + if (!F_ISSET(dbc_c, DBC_OPD) || root_pgno == PGNO_INVALID) + goto done; + + /* + * We may have just deleted the last element in the off-page duplicate + * tree, and closed the last cursor in the tree. For an off-page btree + * there are no other cursors in the tree by definition, if the tree is + * empty. For an off-page recno we know we have closed the last cursor + * in the tree because the __ram_ca_delete call above returned 0 only + * in that case. So, if the off-page duplicate tree is empty at this + * point, we want to remove it. + */ + if (((h = dbc_c->internal->page) == NULL || h->pgno != root_pgno) && + (ret = __memp_fget(mpf, + &root_pgno, dbc->thread_info, dbc->txn, 0, &h)) != 0) + goto err; + if (NUM_ENT(h) == 0) { + if (h != dbc_c->internal->page) + DISCARD_CUR(dbc_c, ret); + else + dbc_c->internal->page = NULL; + if (ret != 0) + goto err; + if ((ret = __db_free(dbc, h)) != 0) + goto err; + } else { + if (h != dbc_c->internal->page && (ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + goto done; + } + + /* + * When removing the tree, we have to do one of two things. If this is + * case #2, that is, the primary tree is a btree, delete the key that's + * associated with the tree from the btree leaf page. We know we are + * the only reference to it and we already have the correct lock. We + * detect this case because the cursor that was passed to us references + * an off-page duplicate cursor. + * + * If this is case #3, that is, the primary tree isn't a btree, pass + * the information back to our caller, it's their job to do cleanup on + * the primary page. + */ + if (dbc_opd != NULL) { + if ((ret = __memp_fget(mpf, &cp->pgno, dbc->thread_info, + dbc->txn, DB_MPOOL_DIRTY, &cp->page)) != 0) + goto err; + if ((ret = __bamc_physdel(dbc)) != 0) + goto err; + } else + *rmroot = 1; +err: +done: /* + * Discard the page references and locks, and confirm that the stack + * has been emptied. + */ + if (dbc_opd != NULL) + DISCARD_CUR(dbc_opd, ret); + DISCARD_CUR(dbc, ret); + + /* Downgrade any CDB lock we acquired. */ + if (cdb_lock) + (void)__lock_downgrade(env, &dbc->mylock, DB_LOCK_IWRITE, 0); + + return (ret); +} + +/* + * __bamc_cmp -- + * Compare two btree cursors for equality. + * + * This function is only called with two cursors that point to the same item. + * It only distinguishes cursors pointing to deleted and undeleted items at + * the same location. + * + * PUBLIC: int __bamc_cmp __P((DBC *, DBC *, int *)); + */ +int +__bamc_cmp(dbc, other_dbc, result) + DBC *dbc, *other_dbc; + int *result; +{ + ENV *env; + BTREE_CURSOR *bcp, *obcp; + + env = dbc->env; + bcp = (BTREE_CURSOR *)dbc->internal; + obcp = (BTREE_CURSOR *)other_dbc->internal; + + DB_ASSERT (env, bcp->pgno == obcp->pgno); + DB_ASSERT (env, bcp->indx == obcp->indx); + + /* Check to see if both cursors have the same deleted flag. */ + *result = + ((F_ISSET(bcp, C_DELETED)) == F_ISSET(obcp, C_DELETED)) ? 
0 : 1; + return (0); +} + +/* + * __bamc_destroy -- + * Close a single cursor -- internal version. + */ +static int +__bamc_destroy(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + ENV *env; + + cp = (BTREE_CURSOR *)dbc->internal; + env = dbc->env; + + /* Discard the structures. */ + if (cp->sp != cp->stack) + __os_free(env, cp->sp); + +#ifdef HAVE_COMPRESSION + /* Free the memory used for compression */ + __os_free(env, cp->compressed.data); + __os_free(env, cp->key1.data); + __os_free(env, cp->key2.data); + __os_free(env, cp->data1.data); + __os_free(env, cp->data2.data); + __os_free(env, cp->del_key.data); + __os_free(env, cp->del_data.data); +#endif + + __os_free(env, cp); + + return (0); +} + +/* + * __bamc_count -- + * Return a count of on and off-page duplicates. + * + * PUBLIC: int __bamc_count __P((DBC *, db_recno_t *)); + */ +int +__bamc_count(dbc, recnop) + DBC *dbc; + db_recno_t *recnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + db_indx_t indx, top; + db_recno_t recno; + int ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Called with the top-level cursor that may reference an off-page + * duplicates tree. We don't have to acquire any new locks, we have + * to have a read lock to even get here. + */ + if (cp->opd == NULL) { + /* + * On-page duplicates, get the page and count. + */ + DB_ASSERT(dbp->env, cp->page == NULL); + if ((ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + return (ret); + + /* + * Move back to the beginning of the set of duplicates and + * then count forward. + */ + for (indx = cp->indx;; indx -= P_INDX) + if (indx == 0 || + !IS_DUPLICATE(dbc, indx, indx - P_INDX)) + break; + for (recno = 0, + top = NUM_ENT(cp->page) - P_INDX;; indx += P_INDX) { + if (!IS_DELETED(dbp, cp->page, indx)) + ++recno; + if (indx == top || + !IS_DUPLICATE(dbc, indx, indx + P_INDX)) + break; + } + } else { + /* + * Off-page duplicates tree, get the root page of the off-page + * duplicate tree. + */ + if ((ret = __memp_fget(mpf, &cp->opd->internal->root, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + return (ret); + + /* + * If the page is an internal page use the page's count as it's + * up-to-date and reflects the status of cursors in the tree. + * If the page is a leaf page for unsorted duplicates, use the + * page's count as cursors don't mark items deleted on the page + * and wait, cursor delete items immediately. + * If the page is a leaf page for sorted duplicates, there may + * be cursors on the page marking deleted items -- count. + */ + if (TYPE(cp->page) == P_LDUP) + for (recno = 0, indx = 0, + top = NUM_ENT(cp->page) - O_INDX;; indx += O_INDX) { + if (!IS_DELETED(dbp, cp->page, indx)) + ++recno; + if (indx == top) + break; + } + else + recno = RE_NREC(cp->page); + } + + *recnop = recno; + + ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority); + cp->page = NULL; + + return (ret); +} + +/* + * __bamc_del -- + * Delete using a cursor. + */ +static int +__bamc_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + int count, ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + COMPQUIET(flags, 0); + + /* If the item was already deleted, return failure. */ + if (F_ISSET(cp, C_DELETED)) + return (DB_KEYEMPTY); + + /* + * This code is always called with a page lock but no page. 
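__bamc_count() above backs the public DBC->count method. A hedged usage sketch against the public cursor API; "dups.db" is a hypothetical file and error handling is abbreviated.

#include <stdio.h>
#include <string.h>
#include <db.h>

int
main(void)
{
        DB *dbp;
        DBC *dbc;
        DBT key, data;
        db_recno_t count;
        int ret;

        if (db_create(&dbp, NULL, 0) != 0)
                return (1);
        (void)dbp->set_flags(dbp, DB_DUPSORT);
        if ((ret = dbp->open(dbp,
            NULL, "dups.db", NULL, DB_BTREE, DB_CREATE, 0)) != 0)
                goto err;
        if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
                goto err;

        memset(&key, 0, sizeof(key));
        memset(&data, 0, sizeof(data));

        /* Count the duplicates of the first key, if any. */
        if ((ret = dbc->get(dbc, &key, &data, DB_FIRST)) == 0 &&
            (ret = dbc->count(dbc, &count, 0)) == 0)
                printf("first key has %lu duplicate data item(s)\n",
                    (unsigned long)count);

        (void)dbc->close(dbc);
err:    (void)dbp->close(dbp, 0);
        return (ret == 0 || ret == DB_NOTFOUND ? 0 : 1);
}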
+ */ + DB_ASSERT(dbp->env, cp->page == NULL); + + /* + * We don't physically delete the record until the cursor moves, so + * we have to have a long-lived write lock on the page instead of + * a long-lived read lock. Note, we have to have a read lock to even + * get here. + * + * If we're maintaining record numbers, we lock the entire tree, else + * we lock the single page. + */ + if (F_ISSET(cp, C_RECNUM)) { + if ((ret = __bamc_getstack(dbc)) != 0) + goto err; + cp->page = cp->csp->page; + } else { + ACQUIRE_CUR(dbc, DB_LOCK_WRITE, cp->pgno, 0, ret); + if (ret != 0) + goto err; + } + + /* Mark the page dirty. */ + if ((ret = __memp_dirty(mpf, + &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err; + + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_cdel_log(dbp, dbc->txn, &LSN(cp->page), 0, + PGNO(cp->page), &LSN(cp->page), cp->indx)) != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(cp->page)); + + /* Set the intent-to-delete flag on the page. */ + if (TYPE(cp->page) == P_LBTREE) + B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx + O_INDX)->type); + else + B_DSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type); + +err: /* + * If we've been successful so far and the tree has record numbers, + * adjust the record counts. Either way, release acquired page(s). + */ + if (F_ISSET(cp, C_RECNUM)) { + cp->csp->page = cp->page; + if (ret == 0) + ret = __bam_adjust(dbc, -1); + (void)__bam_stkrel(dbc, 0); + } else + if (cp->page != NULL && + (t_ret = __memp_fput(mpf, dbc->thread_info, + cp->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + cp->page = NULL; + + /* + * Update the cursors last, after all chance of recoverable failure + * is past. + */ + if (ret == 0) + ret = __bam_ca_delete(dbp, cp->pgno, cp->indx, 1, &count); + + return (ret); +} + +/* + * __bamc_dup -- + * Duplicate a btree cursor, such that the new one holds appropriate + * locks for the position of the original. + * + * PUBLIC: int __bamc_dup __P((DBC *, DBC *, u_int32_t)); + */ +int +__bamc_dup(orig_dbc, new_dbc, flags) + DBC *orig_dbc, *new_dbc; + u_int32_t flags; +{ + BTREE_CURSOR *orig, *new; + + orig = (BTREE_CURSOR *)orig_dbc->internal; + new = (BTREE_CURSOR *)new_dbc->internal; + + new->ovflsize = orig->ovflsize; + new->recno = orig->recno; + new->flags = orig->flags; + +#ifdef HAVE_COMPRESSION + /* Copy the compression state */ + return (__bamc_compress_dup(orig_dbc, new_dbc, flags)); +#else + COMPQUIET(flags, 0); + + return (0); +#endif +} + +/* + * __bamc_get -- + * Get using a cursor (btree). + */ +static int +__bamc_get(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + db_pgno_t orig_pgno; + db_indx_t orig_indx; + int exact, newopd, ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + orig_pgno = cp->pgno; + orig_indx = cp->indx; + + newopd = 0; + switch (flags) { + case DB_CURRENT: + /* It's not possible to return a deleted record. */ + if (F_ISSET(cp, C_DELETED)) { + ret = DB_KEYEMPTY; + goto err; + } + + /* + * Acquire the current page. We have at least a read-lock + * already. The caller may have set DB_RMW asking for a + * write lock, but upgrading to a write lock has no better + * chance of succeeding now instead of later, so don't try. 
+ */ + if ((ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + goto err; + break; + case DB_FIRST: + newopd = 1; + if ((ret = __bamc_search(dbc, + PGNO_INVALID, NULL, flags, &exact)) != 0) + goto err; + break; + case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: + /* + * There are two ways to get here based on DBcursor->get + * with the DB_GET_BOTH/DB_GET_BOTH_RANGE flags set: + * + * 1. Searching a sorted off-page duplicate tree: do a tree + * search. + * + * 2. Searching a btree: do a tree search. If it returns a + * reference to an off-page duplicate tree, return immediately + * and let our caller deal with it. If the search doesn't + * return a reference to an off-page duplicate tree, continue + * with an on-page search. + */ + if (F_ISSET(dbc, DBC_OPD)) { + if ((ret = __bamc_search( + dbc, PGNO_INVALID, data, flags, &exact)) != 0) + goto err; + if (flags == DB_GET_BOTH) { + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + break; + } + + /* + * We didn't require an exact match, so the search may + * have returned an entry past the end of the page, + * or we may be referencing a deleted record. If so, + * move to the next entry. + */ + if ((cp->indx == NUM_ENT(cp->page) || + IS_CUR_DELETED(dbc)) && + (ret = __bamc_next(dbc, 1, 0)) != 0) + goto err; + } else { + if ((ret = __bamc_search( + dbc, PGNO_INVALID, key, flags, &exact)) != 0) + return (ret); + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + + if (pgnop != NULL && __bam_isopd(dbc, pgnop)) { + newopd = 1; + break; + } + if ((ret = + __bam_getboth_finddatum(dbc, data, flags)) != 0) + goto err; + } + break; +#ifdef HAVE_COMPRESSION + case DB_SET_LTE: + if ((ret = __bam_getlte(dbc, key, NULL)) != 0) + goto err; + break; + case DB_GET_BOTH_LTE: + if ((ret = __bam_getlte(dbc, key, data)) != 0) + goto err; + break; +#endif + case DB_GET_BOTHC: + if ((ret = __bam_getbothc(dbc, data)) != 0) + goto err; + break; + case DB_LAST: + newopd = 1; + if ((ret = __bamc_search(dbc, + PGNO_INVALID, NULL, flags, &exact)) != 0) + goto err; + break; + case DB_NEXT: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bamc_search(dbc, + PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0) + goto err; + } else + if ((ret = __bamc_next(dbc, 1, 0)) != 0) + goto err; + break; + case DB_NEXT_DUP: + if ((ret = __bamc_next(dbc, 1, 0)) != 0) + goto err; + if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { + ret = DB_NOTFOUND; + goto err; + } + break; + case DB_NEXT_NODUP: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bamc_search(dbc, + PGNO_INVALID, NULL, DB_FIRST, &exact)) != 0) + goto err; + } else + do { + if ((ret = __bamc_next(dbc, 1, 0)) != 0) + goto err; + } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)); + break; + case DB_PREV: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bamc_search(dbc, + PGNO_INVALID, NULL, DB_LAST, &exact)) != 0) + goto err; + } else + if ((ret = __bamc_prev(dbc)) != 0) + goto err; + break; + case DB_PREV_DUP: + if ((ret = __bamc_prev(dbc)) != 0) + goto err; + if (!IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)) { + ret = DB_NOTFOUND; + goto err; + } + break; + case DB_PREV_NODUP: + newopd = 1; + if (cp->pgno == PGNO_INVALID) { + if ((ret = __bamc_search(dbc, + PGNO_INVALID, NULL, DB_LAST, &exact)) != 0) + goto err; + } else + do { + if ((ret = __bamc_prev(dbc)) != 0) + goto err; + } while (IS_CUR_DUPLICATE(dbc, orig_pgno, orig_indx)); + break; + case DB_SET: + case DB_SET_RECNO: + newopd = 1; + if ((ret = __bamc_search(dbc, + PGNO_INVALID, key, flags, 
&exact)) != 0) + goto err; + break; + case DB_SET_RANGE: + newopd = 1; + if ((ret = __bamc_search(dbc, + PGNO_INVALID, key, flags, &exact)) != 0) + goto err; + + /* + * As we didn't require an exact match, the search function + * may have returned an entry past the end of the page. Or, + * we may be referencing a deleted record. If so, move to + * the next entry. + */ + if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) + if ((ret = __bamc_next(dbc, 0, 0)) != 0) + goto err; + break; + default: + ret = __db_unknown_flag(dbp->env, "__bamc_get", flags); + goto err; + } + + /* + * We may have moved to an off-page duplicate tree. Return that + * information to our caller. + */ + if (newopd && pgnop != NULL) + (void)__bam_isopd(dbc, pgnop); + +err: /* + * Regardless of whether we were successful or not, if the cursor + * moved, clear the delete flag, DBcursor->get never references a + * deleted key, if it moved at all. + */ + if (F_ISSET(cp, C_DELETED) && + (cp->pgno != orig_pgno || cp->indx != orig_indx)) + F_CLR(cp, C_DELETED); + + return (ret); +} + +static int +__bam_get_prev(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DBT key, data; + db_pgno_t pgno; + int ret; + + if ((ret = __bamc_prev(dbc)) != 0) + return (ret); + + if (__bam_isopd(dbc, &pgno)) { + cp = (BTREE_CURSOR *)dbc->internal; + if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0) + return (ret); + if ((ret = cp->opd->am_get(cp->opd, + &key, &data, DB_LAST, NULL)) != 0) + return (ret); + } + + return (0); +} + +/* + * __bam_bulk -- Return bulk data from a btree. + */ +static int +__bam_bulk(dbc, data, flags) + DBC *dbc; + DBT *data; + u_int32_t flags; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE_CURSOR *cp; + PAGE *pg; + db_indx_t *inp, indx, pg_keyoff; + int32_t *endp, key_off, *offp, *saveoffp; + u_int8_t *dbuf, *dp, *np; + u_int32_t key_size, pagesize, size, space; + int adj, is_key, need_pg, next_key, no_dup, rec_key, ret; + + ret = 0; + key_off = 0; + size = 0; + pagesize = dbc->dbp->pgsize; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * dp tracks the beginning of the page in the buffer. + * np is the next place to copy things into the buffer. + * dbuf always stays at the beginning of the buffer. + */ + dbuf = data->data; + np = dp = dbuf; + + /* Keep track of space that is left. There is a termination entry */ + space = data->ulen; + space -= sizeof(*offp); + + /* Build the offset/size table from the end up. */ + endp = (int32_t *)((u_int8_t *)dbuf + data->ulen); + endp--; + offp = endp; + + key_size = 0; + + /* + * Distinguish between BTREE and RECNO. + * There are no keys in RECNO. If MULTIPLE_KEY is specified + * then we return the record numbers. + * is_key indicates that multiple btree keys are returned. + * rec_key is set if we are returning record numbers. + * next_key is set if we are going after the next key rather than dup. + */ + if (dbc->dbtype == DB_BTREE) { + is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1: 0; + rec_key = 0; + next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP; + adj = 2; + } else { + is_key = 0; + rec_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0; + next_key = LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP; + adj = 1; + } + no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP; + +next_pg: + indx = cp->indx; + pg = cp->page; + + inp = P_INP(dbc->dbp, pg); + /* The current page is not yet in the buffer. */ + need_pg = 1; + + /* + * Keep track of the offset of the current key on the page. + * If we are returning keys, set it to 0 first so we force + * the copy of the key to the buffer. 
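The DB_NEXT_DUP/DB_NEXT_NODUP cases above implement the usual two-level traversal of a btree with duplicates. A hedged sketch of that pattern from the application side; "dups.db" is hypothetical and error handling is abbreviated.

#include <stdio.h>
#include <string.h>
#include <db.h>

int
main(void)
{
        DB *dbp;
        DBC *dbc;
        DBT key, data;
        int ret;

        if (db_create(&dbp, NULL, 0) != 0)
                return (1);
        (void)dbp->set_flags(dbp, DB_DUPSORT);
        if ((ret = dbp->open(dbp,
            NULL, "dups.db", NULL, DB_BTREE, DB_CREATE, 0)) != 0)
                goto err;
        if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
                goto err;

        memset(&key, 0, sizeof(key));
        memset(&data, 0, sizeof(data));

        /* One DB_NEXT_NODUP step per key, DB_NEXT_DUP within the key. */
        while ((ret = dbc->get(dbc, &key, &data, DB_NEXT_NODUP)) == 0) {
                printf("key %.*s\n", (int)key.size, (char *)key.data);
                do
                        printf("\tdata %.*s\n",
                            (int)data.size, (char *)data.data);
                while ((ret =
                    dbc->get(dbc, &key, &data, DB_NEXT_DUP)) == 0);
                if (ret != DB_NOTFOUND)
                        break;
        }

        (void)dbc->close(dbc);
err:    (void)dbp->close(dbp, 0);
        return (0);
}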
+ */ + pg_keyoff = 0; + if (is_key == 0) + pg_keyoff = inp[indx]; + + do { + if (IS_DELETED(dbc->dbp, pg, indx)) { + if (dbc->dbtype != DB_RECNO) + continue; + + cp->recno++; + /* + * If we are not returning recnos then we + * need to fill in every slot so the user + * can calculate the record numbers. + */ + if (rec_key != 0) + continue; + + space -= 2 * sizeof(*offp); + /* Check if space has underflowed. */ + if (space > data->ulen) + goto back_up; + + /* Just mark the empty recno slots. */ + *offp-- = 0; + *offp-- = 0; + continue; + } + + /* + * Check to see if we have a new key. + * If so, then see if we need to put the + * key on the page. If it's already there + * then we just point to it. + */ + if (is_key && pg_keyoff != inp[indx]) { + bk = GET_BKEYDATA(dbc->dbp, pg, indx); + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + size = key_size = bo->tlen; + if (key_size > space) + goto get_key_space; + if ((ret = __bam_bulk_overflow(dbc, + bo->tlen, bo->pgno, np)) != 0) + return (ret); + space -= key_size; + key_off = (int32_t)(np - dbuf); + np += key_size; + } else { + if (need_pg) { + dp = np; + size = pagesize - HOFFSET(pg); + if (space < size) { +get_key_space: + /* Nothing added, then error. */ + if (offp == endp) { + data->size = (u_int32_t) + DB_ALIGN(size + + pagesize, 1024); + return + (DB_BUFFER_SMALL); + } + /* + * We need to back up to the + * last record put into the + * buffer so that it is + * CURRENT. + */ + if (indx != 0) + indx -= P_INDX; + else { + if ((ret = + __bam_get_prev( + dbc)) != 0) + return (ret); + indx = cp->indx; + pg = cp->page; + } + break; + } + /* + * Move the data part of the page + * to the buffer. + */ + memcpy(dp, + (u_int8_t *)pg + HOFFSET(pg), size); + need_pg = 0; + space -= size; + np += size; + } + key_size = bk->len; + key_off = (int32_t)((inp[indx] - HOFFSET(pg)) + + (dp - dbuf) + SSZA(BKEYDATA, data)); + pg_keyoff = inp[indx]; + } + } + + /* + * Reserve space for the pointers and sizes. + * Either key/data pair or just for a data item. + */ + space -= (is_key ? 4 : 2) * sizeof(*offp); + if (rec_key) + space -= sizeof(*offp); + + /* Check to see if space has underflowed. */ + if (space > data->ulen) + goto back_up; + + /* + * Determine if the next record is in the + * buffer already or if it needs to be copied in. + * If we have an off page dup, then copy as many + * as will fit into the buffer. + */ + bk = GET_BKEYDATA(dbc->dbp, pg, indx + adj - 1); + if (B_TYPE(bk->type) == B_DUPLICATE) { + bo = (BOVERFLOW *)bk; + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } + /* + * We pass the offset of the current key. + * On return we check to see if offp has + * moved to see if any data fit. + */ + saveoffp = offp; + if ((ret = __bam_bulk_duplicates(dbc, bo->pgno, + dbuf, is_key ? offp + P_INDX : NULL, + &offp, &np, &space, no_dup)) != 0) { + if (ret == DB_BUFFER_SMALL) { + size = space; + space = 0; + /* If nothing was added, then error. 
*/ + if (offp == saveoffp) { + offp += 2; + goto back_up; + } + goto get_space; + } + return (ret); + } + } else if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + size = bo->tlen; + if (size > space) + goto back_up; + if ((ret = + __bam_bulk_overflow(dbc, + bo->tlen, bo->pgno, np)) != 0) + return (ret); + space -= size; + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } else if (rec_key) + *offp-- = (int32_t)cp->recno; + *offp-- = (int32_t)(np - dbuf); + np += size; + *offp-- = (int32_t)size; + } else { + if (need_pg) { + dp = np; + size = pagesize - HOFFSET(pg); + if (space < size) { +back_up: + /* + * Back up the index so that the + * last record in the buffer is CURRENT. + */ + if (indx >= adj) + indx -= adj; + else { + if ((ret = + __bam_get_prev(dbc)) != 0 && + ret != DB_NOTFOUND) + return (ret); + indx = cp->indx; + pg = cp->page; + } + if (dbc->dbtype == DB_RECNO) + cp->recno--; +get_space: + /* + * See if we put anything in the + * buffer, or, if we are doing a DBP->get, + * whether we got all of the data. + */ + if (offp >= + (is_key ? &endp[-1] : endp) || + F_ISSET(dbc, DBC_FROM_DB_GET)) { + data->size = (u_int32_t) + DB_ALIGN(size + + data->ulen - space, 1024); + return (DB_BUFFER_SMALL); + } + break; + } + memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size); + need_pg = 0; + space -= size; + np += size; + } + /* + * Add the offsets and sizes to the end of the buffer. + * First add the key info then the data info. + */ + if (is_key) { + *offp-- = (int32_t)key_off; + *offp-- = (int32_t)key_size; + } else if (rec_key) + *offp-- = (int32_t)cp->recno; + *offp-- = (int32_t)((inp[indx + adj - 1] - HOFFSET(pg)) + + (dp - dbuf) + SSZA(BKEYDATA, data)); + *offp-- = bk->len; + } + if (dbc->dbtype == DB_RECNO) + cp->recno++; + else if (no_dup) { + while (indx + adj < NUM_ENT(pg) && + pg_keyoff == inp[indx + adj]) + indx += adj; + } + /* + * Stop when we either run off the page or we move to the next key and + * we are not returning multiple keys. + */ + } while ((indx += adj) < NUM_ENT(pg) && + (next_key || pg_keyoff == inp[indx])); + + /* If we are off the page then try the next page. */ + if (ret == 0 && next_key && indx >= NUM_ENT(pg)) { + cp->indx = indx; + ret = __bamc_next(dbc, 0, 1); + if (ret == 0) + goto next_pg; + if (ret != DB_NOTFOUND) + return (ret); + } + + /* + * If we did a DBP->get we must error if we did not return + * all the data for the current key because there is + * no way to know if we did not get it all, nor any + * interface to fetch the balance. + */ + + if (ret == 0 && indx < pg->entries && + F_ISSET(dbc, DBC_TRANSIENT) && pg_keyoff == inp[indx]) { + data->size = (data->ulen - space) + size; + return (DB_BUFFER_SMALL); + } + /* + * Must leave the index pointing at the last record fetched. + * If we are not fetching keys, we may have stepped to the + * next key. + */ + if (ret == DB_BUFFER_SMALL || next_key || pg_keyoff == inp[indx]) + cp->indx = indx; + else + cp->indx = indx - P_INDX; + + if (rec_key == 1) + *offp = RECNO_OOB; + else + *offp = -1; + return (0); +} + +/* + * __bam_bulk_overflow -- + * Dump overflow record into the buffer. + * The space requirements have already been checked. 
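__bam_bulk() above fills the user's buffer from the front with page images and from the back with an offset/size table, which is exactly what the public DB_MULTIPLE macros decode. A hedged sketch of the consuming side; "sample.db" is hypothetical, and the buffer size is arbitrary (it must be user memory, at least a page in size, and a multiple of 1024 bytes).

#include <stdio.h>
#include <string.h>
#include <db.h>

#define BUFLEN (64 * 1024)

int
main(void)
{
        DB *dbp;
        DBC *dbc;
        DBT key, data;
        void *p, *rkey, *rdata;
        u_int32_t rklen, rdlen;
        int ret;
        static u_int32_t buf[BUFLEN / sizeof(u_int32_t)]; /* aligned */

        if (db_create(&dbp, NULL, 0) != 0)
                return (1);
        if ((ret = dbp->open(dbp,
            NULL, "sample.db", NULL, DB_BTREE, DB_CREATE, 0)) != 0)
                goto err;
        if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
                goto err;

        memset(&key, 0, sizeof(key));
        memset(&data, 0, sizeof(data));
        data.data = buf;
        data.ulen = sizeof(buf);
        data.flags = DB_DBT_USERMEM;

        /* Each get fills the buffer with as many pairs as fit. */
        while ((ret = dbc->get(dbc,
            &key, &data, DB_MULTIPLE_KEY | DB_NEXT)) == 0)
                for (DB_MULTIPLE_INIT(p, &data);;) {
                        DB_MULTIPLE_KEY_NEXT(p, &data,
                            rkey, rklen, rdata, rdlen);
                        if (p == NULL)
                                break;
                        printf("%.*s -> %.*s\n", (int)rklen, (char *)rkey,
                            (int)rdlen, (char *)rdata);
                }

        (void)dbc->close(dbc);
err:    (void)dbp->close(dbp, 0);
        return (0);
}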
+ * PUBLIC: int __bam_bulk_overflow + * PUBLIC: __P((DBC *, u_int32_t, db_pgno_t, u_int8_t *)); + */ +int +__bam_bulk_overflow(dbc, len, pgno, dp) + DBC *dbc; + u_int32_t len; + db_pgno_t pgno; + u_int8_t *dp; +{ + DBT dbt; + + memset(&dbt, 0, sizeof(dbt)); + F_SET(&dbt, DB_DBT_USERMEM); + dbt.ulen = len; + dbt.data = (void *)dp; + return (__db_goff(dbc, &dbt, len, pgno, NULL, NULL)); +} + +/* + * __bam_bulk_duplicates -- + * Put as many off page duplicates as will fit into the buffer. + * This routine will adjust the cursor to reflect the position in + * the overflow tree. + * PUBLIC: int __bam_bulk_duplicates __P((DBC *, + * PUBLIC: db_pgno_t, u_int8_t *, int32_t *, + * PUBLIC: int32_t **, u_int8_t **, u_int32_t *, int)); + */ +int +__bam_bulk_duplicates(dbc, pgno, dbuf, keyoff, offpp, dpp, spacep, no_dup) + DBC *dbc; + db_pgno_t pgno; + u_int8_t *dbuf; + int32_t *keyoff, **offpp; + u_int8_t **dpp; + u_int32_t *spacep; + int no_dup; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE_CURSOR *cp; + DB *dbp; + DBC *opd; + DBT key, data; + PAGE *pg; + db_indx_t indx, *inp; + int32_t *offp; + u_int32_t pagesize, size, space; + u_int8_t *dp, *np; + int first, need_pg, ret, t_ret; + + ret = 0; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + opd = cp->opd; + + if (opd == NULL) { + if ((ret = __dbc_newopd(dbc, pgno, NULL, &opd)) != 0) + return (ret); + cp->opd = opd; + if ((ret = opd->am_get(opd, + &key, &data, DB_FIRST, NULL)) != 0) + goto close_opd; + } + + pagesize = opd->dbp->pgsize; + cp = (BTREE_CURSOR *)opd->internal; + space = *spacep; + /* Get current offset slot. */ + offp = *offpp; + + /* + * np is the next place to put data. + * dp is the beginning of the current page in the buffer. + */ + np = dp = *dpp; + first = 1; + indx = cp->indx; + + do { + /* Fetch the current record. No initial move. */ + if ((ret = __bamc_next(opd, 0, 0)) != 0) + break; + pg = cp->page; + indx = cp->indx; + inp = P_INP(dbp, pg); + /* We need to copy the page to the buffer. */ + need_pg = 1; + + do { + if (IS_DELETED(dbp, pg, indx)) + goto contin; + bk = GET_BKEYDATA(dbp, pg, indx); + space -= 2 * sizeof(*offp); + /* Allocate space for key if needed. */ + if (first == 0 && keyoff != NULL) + space -= 2 * sizeof(*offp); + + /* Did space underflow? */ + if (space > *spacep) { + ret = DB_BUFFER_SMALL; + if (first == 1) { + /* Get the absolute value. */ + space = -(int32_t)space; + space = *spacep + space; + if (need_pg) + space += pagesize - HOFFSET(pg); + } + break; + } + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + size = bo->tlen; + if (size > space) { + ret = DB_BUFFER_SMALL; + space = *spacep + size; + break; + } + if (first == 0 && keyoff != NULL) { + *offp-- = keyoff[0]; + *offp-- = keyoff[-1]; + } + if ((ret = __bam_bulk_overflow(dbc, + bo->tlen, bo->pgno, np)) != 0) + return (ret); + space -= size; + *offp-- = (int32_t)(np - dbuf); + np += size; + } else { + if (need_pg) { + dp = np; + size = pagesize - HOFFSET(pg); + if (space < size) { + ret = DB_BUFFER_SMALL; + /* Return space required. 
*/ + space = *spacep + size; + break; + } + memcpy(dp, + (u_int8_t *)pg + HOFFSET(pg), size); + need_pg = 0; + space -= size; + np += size; + } + if (first == 0 && keyoff != NULL) { + *offp-- = keyoff[0]; + *offp-- = keyoff[-1]; + } + size = bk->len; + *offp-- = (int32_t)((inp[indx] - HOFFSET(pg)) + + (dp - dbuf) + SSZA(BKEYDATA, data)); + } + *offp-- = (int32_t)size; + first = 0; + if (no_dup) + break; +contin: + indx++; + if (opd->dbtype == DB_RECNO) + cp->recno++; + } while (indx < NUM_ENT(pg)); + if (no_dup) + break; + cp->indx = indx; + + } while (ret == 0); + + /* Return the updated information. */ + *spacep = space; + *offpp = offp; + *dpp = np; + + /* + * If we ran out of space back up the pointer. + * If we did not return any dups or reached the end, close the opd. + */ + if (ret == DB_BUFFER_SMALL) { + if (opd->dbtype == DB_RECNO) { + if (--cp->recno == 0) + goto close_opd; + } else if (indx != 0) + cp->indx--; + else { + t_ret = __bamc_prev(opd); + if (t_ret == DB_NOTFOUND) + goto close_opd; + if (t_ret != 0) + ret = t_ret; + } + } else if (keyoff == NULL && ret == DB_NOTFOUND) { + cp->indx--; + if (opd->dbtype == DB_RECNO) + --cp->recno; + } else if (indx == 0 || ret == DB_NOTFOUND) { +close_opd: + if (ret == DB_NOTFOUND) + ret = 0; + if ((t_ret = __dbc_close(opd)) != 0 && ret == 0) + ret = t_ret; + ((BTREE_CURSOR *)dbc->internal)->opd = NULL; + } + if (ret == DB_NOTFOUND) + ret = 0; + + return (ret); +} + +/* + * __bam_getbothc -- + * Search for a matching data item on a join. + */ +static int +__bam_getbothc(dbc, data) + DBC *dbc; + DBT *data; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + int cmp, exact, ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Acquire the current page. We have at least a read-lock + * already. The caller may have set DB_RMW asking for a + * write lock, but upgrading to a write lock has no better + * chance of succeeding now instead of later, so don't try. + */ + if ((ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + return (ret); + + /* + * An off-page duplicate cursor. Search the remaining duplicates + * for one which matches (do a normal btree search, then verify + * that the retrieved record is greater than the original one). + */ + if (F_ISSET(dbc, DBC_OPD)) { + /* + * Check to make sure the desired item comes strictly after + * the current position; if it doesn't, return DB_NOTFOUND. + */ + if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx, + dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare, + &cmp)) != 0) + return (ret); + + if (cmp <= 0) + return (DB_NOTFOUND); + + /* Discard the current page, we're going to do a full search. */ + if ((ret = __memp_fput(mpf, + dbc->thread_info, cp->page, dbc->priority)) != 0) + return (ret); + cp->page = NULL; + + return (__bamc_search(dbc, + PGNO_INVALID, data, DB_GET_BOTH, &exact)); + } + + /* + * We're doing a DBC->get(DB_GET_BOTHC) and we're already searching + * a set of on-page duplicates (either sorted or unsorted). Continue + * a linear search from after the current position. + * + * (Note that we could have just finished a "set" of one duplicate, + * i.e. not a duplicate at all, but the following check will always + * return DB_NOTFOUND in this case, which is the desired behavior.) 
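__bam_getbothc() above continues a DB_GET_BOTH-style match within a duplicate set. For reference, a hedged sketch of the plain DB_GET_BOTH lookup it ultimately serves; the file name and the key/data values are hypothetical.

#include <stdio.h>
#include <string.h>
#include <db.h>

int
main(void)
{
        DB *dbp;
        DBC *dbc;
        DBT key, data;
        int ret;

        if (db_create(&dbp, NULL, 0) != 0)
                return (1);
        (void)dbp->set_flags(dbp, DB_DUPSORT);
        if ((ret = dbp->open(dbp,
            NULL, "dups.db", NULL, DB_BTREE, DB_CREATE, 0)) != 0)
                goto err;
        if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0)
                goto err;

        memset(&key, 0, sizeof(key));
        memset(&data, 0, sizeof(data));
        key.data = "fruit";
        key.size = 5;
        data.data = "pear";
        data.size = 4;

        /* Position on this exact key/data pair among the duplicates. */
        ret = dbc->get(dbc, &key, &data, DB_GET_BOTH);
        printf("pair is %s\n", ret == 0 ? "present" : "absent");

        (void)dbc->close(dbc);
err:    (void)dbp->close(dbp, 0);
        return (0);
}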
+ */ + if (cp->indx + P_INDX >= NUM_ENT(cp->page) || + !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) + return (DB_NOTFOUND); + cp->indx += P_INDX; + + return (__bam_getboth_finddatum(dbc, data, DB_GET_BOTH)); +} + +#ifdef HAVE_COMPRESSION +/* + * __bam_getlte -- + * Search for the largest entry <= key/data - used by compression. + * + * data == NULL indicates the DB_SET_LTE flag + * data != NULL indicates the DB_GET_BOTH_LTE flag + * + * Only works for a primary cursor - not an OPD cursor. Handles the + * OPD manipulation as well - no need to return to the caller to + * perform more OPD movements. + */ +static int +__bam_getlte(dbc, key, data) + DBC *dbc; + DBT *key, *data; +{ + BTREE_CURSOR *cp, *ocp; + DB *dbp; + db_pgno_t pgno; + int exact, ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* Begin by searching for the key */ + ret = __bamc_search(dbc, PGNO_INVALID, key, DB_SET_RANGE, &exact); + if (ret == DB_NOTFOUND) + goto find_last; + if (ret != 0) + goto end; + + if (cp->indx == NUM_ENT(cp->page) || IS_CUR_DELETED(dbc)) { + /* + * Move to the next entry if we're past the end of the + * page or on a deleted entry. + */ + ret = __bamc_next(dbc, 0, 0); + if (ret == DB_NOTFOUND) + goto find_last; + if (ret != 0) + goto end; + + /* Check if we're still on the correct key */ + if ((ret = __bam_cmp(dbc, key, cp->page, cp->indx, + ((BTREE*)dbp->bt_internal)->bt_compare, &exact)) != 0) + goto end; + exact = (exact == 0); + } + + if (exact == 0) { + ret = __bam_get_prev(dbc); + goto end; + } + + if (__bam_isopd(dbc, &pgno)) { + /* + * We want to do unusual things with off-page duplicates, so + * deal with them here rather than returning to handle them. + */ + if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0) + goto end; + + /* Search for the correct duplicate */ + ret = __bamc_search(cp->opd, PGNO_INVALID, data, + data == NULL ? DB_FIRST : DB_SET_RANGE, &exact); + if (ret == DB_NOTFOUND) + goto find_last_dup; + if (ret != 0) + goto end; + + ocp = (BTREE_CURSOR *)cp->opd->internal; + if (ocp->indx == NUM_ENT(ocp->page) || + IS_CUR_DELETED(cp->opd)) { + /* + * Move to the next entry if we're past the end of the + * page or on a deleted entry. + */ + ret = __bamc_next(cp->opd, 0, 0); + if (ret == DB_NOTFOUND) + goto find_last_dup; + if (ret != 0) + goto end; + + if (data != NULL) { + /* Check if we're still on the correct data */ + if ((ret = __bam_cmp( + dbc, data, ocp->page, ocp->indx, + dbp->dup_compare, &exact)) != 0) + goto end; + exact = (exact == 0); + } else + exact = 1; + } + + if (exact == 0) { + /* Move to the previous entry */ + ret = __bamc_prev(cp->opd); + if (ret == DB_NOTFOUND) { + if ((ret = __dbc_close(cp->opd)) != 0) + goto end; + cp->opd = NULL; + ret = __bam_get_prev(dbc); + } + } + } else if(data != NULL) { + /* + * If we got an exact match with on-page duplicates, we need to + * search in them. 
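+		 *
+		 * (To illustrate the <=-match rule this function implements,
+		 * assume integer keys {10, 20, 30} in the tree:
+		 *
+		 *	DB_SET_LTE(25) -> 20	DB_SET_LTE(30) -> 30
+		 *	DB_SET_LTE(40) -> 30	DB_SET_LTE(5)  -> DB_NOTFOUND
+		 *
+		 * the same rule is applied to the data item within the
+		 * duplicate set below.)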
+ */ + ret = __bam_getboth_finddatum(dbc, data, DB_GET_BOTH_RANGE); + if (ret == DB_NOTFOUND) + exact = 0; + else if (ret != 0) + goto end; + else { + /* Check if we're still on the correct data */ + if ((ret = __bam_cmp(dbc, data, cp->page, + cp->indx + O_INDX, dbp->dup_compare, &exact)) != 0) + goto end; + exact = (exact == 0); + } + + if (exact == 0) { + ret = __bam_get_prev(dbc); + } + } + + end: + return (ret); + + find_last: + if ((ret = __bamc_search( + dbc, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0) + return (ret); + + if (__bam_isopd(dbc, &pgno)) { + if ((ret = __dbc_newopd(dbc, pgno, cp->opd, &cp->opd)) != 0) + return (ret); + find_last_dup: + if ((ret = __bamc_search( + cp->opd, PGNO_INVALID, NULL, DB_LAST, &exact)) != 0) + return (ret); + } + + return (ret); +} +#endif + +/* + * __bam_getboth_finddatum -- + * Find a matching on-page data item. + */ +static int +__bam_getboth_finddatum(dbc, data, flags) + DBC *dbc; + DBT *data; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DB *dbp; + db_indx_t base, lim, top; + int cmp, ret; + + COMPQUIET(cmp, 0); + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Called (sometimes indirectly) from DBC->get to search on-page data + * item(s) for a matching value. If the original flag was DB_GET_BOTH + * or DB_GET_BOTH_RANGE, the cursor is set to the first undeleted data + * item for the key. If the original flag was DB_GET_BOTHC, the cursor + * argument is set to the first data item we can potentially return. + * In both cases, there may or may not be additional duplicate data + * items to search. + * + * If the duplicates are not sorted, do a linear search. + */ + if (dbp->dup_compare == NULL) { + for (;; cp->indx += P_INDX) { + if (!IS_CUR_DELETED(dbc) && + (ret = __bam_cmp(dbc, data, cp->page, + cp->indx + O_INDX, __bam_defcmp, &cmp)) != 0) + return (ret); + if (cmp == 0) + return (0); + + if (cp->indx + P_INDX >= NUM_ENT(cp->page) || + !IS_DUPLICATE(dbc, cp->indx, cp->indx + P_INDX)) + break; + } + return (DB_NOTFOUND); + } + + /* + * If the duplicates are sorted, do a binary search. The reason for + * this is that large pages and small key/data pairs result in large + * numbers of on-page duplicates before they get pushed off-page. + * + * Find the top and bottom of the duplicate set. Binary search + * requires at least two items, don't loop if there's only one. + */ + for (base = top = cp->indx; top < NUM_ENT(cp->page); top += P_INDX) + if (!IS_DUPLICATE(dbc, cp->indx, top)) + break; + if (base == (top - P_INDX)) { + if ((ret = __bam_cmp(dbc, data, cp->page, + cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + return (ret); + if (cmp == 0 || (cmp < 0 && flags == DB_GET_BOTH_RANGE)) + return 0; + cp->indx = top; + return DB_NOTFOUND; + } + + for (lim = (top - base) / (db_indx_t)P_INDX; lim != 0; lim >>= 1) { + cp->indx = base + ((lim >> 1) * P_INDX); + if ((ret = __bam_cmp(dbc, data, cp->page, + cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + return (ret); + if (cmp == 0) { + /* + * XXX + * No duplicate duplicates in sorted duplicate sets, + * so there can be only one. + */ + if (!IS_CUR_DELETED(dbc)) + return (0); + break; + } + if (cmp > 0) { + base = cp->indx + P_INDX; + --lim; + } + } + + /* No match found; if we're looking for an exact match, we're done. */ + if (flags == DB_GET_BOTH) + return (DB_NOTFOUND); + + /* + * Base is the smallest index greater than the data item, may be zero + * or a last + O_INDX index, and may be deleted. Find an undeleted + * item. 
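+	 *
+	 * Pictorially, stepping by P_INDX (a sketch of the scan below):
+	 *
+	 *	base -> [deleted] [deleted] [live] ... top
+	 *	          skip      skip    return (0)
+	 *
+	 * and if no live slot is found before top, DB_NOTFOUND.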
+ */ + cp->indx = base; + while (cp->indx < top && IS_CUR_DELETED(dbc)) + cp->indx += P_INDX; + return (cp->indx < top ? 0 : DB_NOTFOUND); +} + +/* + * __bamc_put -- + * Put using a cursor. + */ +static int +__bamc_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DBT dbt; + DB_MPOOLFILE *mpf; + db_pgno_t root_pgno; + int cmp, exact, own, ret, stack; + u_int32_t iiop; + void *arg; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; + +split: ret = stack = 0; + switch (flags) { + case DB_CURRENT: + if (F_ISSET(cp, C_DELETED)) + return (DB_NOTFOUND); + /* FALLTHROUGH */ + case DB_AFTER: + case DB_BEFORE: + iiop = flags; + own = 1; + + /* Acquire the current page with a write lock. */ + ACQUIRE_WRITE_LOCK(dbc, ret); + if (ret != 0) + goto err; + if (cp->page == NULL && (ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + goto err; + break; + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_NODUPDATA: + case DB_NOOVERWRITE: + case DB_OVERWRITE_DUP: + own = 0; + /* + * Searching off-page, sorted duplicate tree: do a tree search + * for the correct item; __bamc_search returns the smallest + * slot greater than the key, use it. + * + * See comment below regarding where we can start the search. + */ + if (F_ISSET(dbc, DBC_OPD)) { + if ((ret = __bamc_search(dbc, + F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, + data, flags, &exact)) != 0) + goto err; + stack = 1; + + /* Disallow "sorted" duplicate duplicates. */ + if (exact != 0) { + if (flags == DB_OVERWRITE_DUP || + IS_DELETED(dbp, cp->page, cp->indx)) { + iiop = DB_CURRENT; + break; + } + ret = __db_duperr(dbp, flags); + goto err; + } + iiop = DB_BEFORE; + break; + } + + /* + * Searching a btree. + * + * If we've done a split, we can start the search from the + * parent of the split page, which __bam_split returned + * for us in root_pgno, unless we're in a Btree with record + * numbering. In that case, we'll need the true root page + * in order to adjust the record count. + */ + if ((ret = __bamc_search(dbc, + F_ISSET(cp, C_RECNUM) ? cp->root : root_pgno, key, + flags == DB_KEYFIRST || dbp->dup_compare != NULL ? + DB_KEYFIRST : DB_KEYLAST, &exact)) != 0) + goto err; + stack = 1; + + /* + * If we don't have an exact match, __bamc_search returned + * the smallest slot greater than the key, use it. + */ + if (!exact) { + iiop = DB_KEYFIRST; + break; + + /* + * Check for NOOVERWRITE. It is possible that there + * is a key with an empty duplicate page attached. + */ + } else if (flags == DB_NOOVERWRITE && !IS_CUR_DELETED(dbc)) { + if (pgnop != NULL && __bam_isopd(dbc, pgnop)) + ret = __bam_opd_exists(dbc, *pgnop); + else + ret = DB_KEYEXIST; + if (ret != 0) + goto err; + } + + /* + * If duplicates aren't supported, replace the current item. + */ + if (!F_ISSET(dbp, DB_AM_DUP)) { + iiop = DB_CURRENT; + break; + } + + /* + * If we find a matching entry, it may be an off-page duplicate + * tree. Return the page number to our caller, we need a new + * cursor. + */ + if (pgnop != NULL && __bam_isopd(dbc, pgnop)) + goto done; + + /* If the duplicates aren't sorted, move to the right slot. 
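+		 * (Illustration: for unsorted duplicates {d1, d2, d3} under
+		 * one key, DB_KEYFIRST inserts before d1 and DB_KEYLAST
+		 * walks to d3 and inserts after it -- insertion order, not
+		 * sort order, decides the slot.)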
*/ + if (dbp->dup_compare == NULL) { + if (flags == DB_KEYFIRST) + iiop = DB_BEFORE; + else + for (;; cp->indx += P_INDX) + if (cp->indx + P_INDX >= + NUM_ENT(cp->page) || + !IS_DUPLICATE(dbc, cp->indx, + cp->indx + P_INDX)) { + iiop = DB_AFTER; + break; + } + break; + } + + /* + * We know that we're looking at the first of a set of sorted + * on-page duplicates. Walk the list to find the right slot. + */ + for (;; cp->indx += P_INDX) { + if ((ret = __bam_cmp(dbc, data, cp->page, + cp->indx + O_INDX, dbp->dup_compare, &cmp)) != 0) + goto err; + if (cmp < 0) { + iiop = DB_BEFORE; + break; + } + + /* Disallow "sorted" duplicate duplicates. */ + if (cmp == 0) { + if (flags == DB_OVERWRITE_DUP || + IS_DELETED(dbp, cp->page, cp->indx)) { + iiop = DB_CURRENT; + break; + } + ret = __db_duperr(dbp, flags); + goto err; + } + + if (cp->indx + P_INDX >= NUM_ENT(cp->page) || + P_INP(dbp, ((PAGE *)cp->page))[cp->indx] != + P_INP(dbp, ((PAGE *)cp->page))[cp->indx + P_INDX]) { + iiop = DB_AFTER; + break; + } + } + break; + default: + ret = __db_unknown_flag(dbp->env, "__bamc_put", flags); + goto err; + } + + switch (ret = __bam_iitem(dbc, key, data, iiop, 0)) { + case 0: + break; + case DB_NEEDSPLIT: + /* + * To split, we need a key for the page. Either use the key + * argument or get a copy of the key from the page. + */ + if (flags == DB_AFTER || + flags == DB_BEFORE || flags == DB_CURRENT) { + memset(&dbt, 0, sizeof(DBT)); + if ((ret = __db_ret(dbc, cp->page, 0, &dbt, + &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) + goto err; + arg = &dbt; + } else + arg = F_ISSET(dbc, DBC_OPD) ? data : key; + + /* + * Discard any locks and pinned pages (the locks are discarded + * even if we're running with transactions, as they lock pages + * that we're sorry we ever acquired). If stack is set and the + * cursor entries are valid, they point to the same entries as + * the stack, don't free them twice. + */ + if (stack) + ret = __bam_stkrel(dbc, STK_CLRDBC | STK_NOLOCK); + else + DISCARD_CUR(dbc, ret); + if (ret != 0) + goto err; + + /* + * SR [#6059] + * If we do not own a lock on the page any more, then clear the + * cursor so we don't point at it. Even if we call __bam_stkrel + * above we still may have entered the routine with the cursor + * positioned to a particular record. This is in the case + * where C_RECNUM is set. + */ + if (own == 0) { + cp->pgno = PGNO_INVALID; + cp->indx = 0; + } + + /* Split the tree. */ + if ((ret = __bam_split(dbc, arg, &root_pgno)) != 0) + return (ret); + + goto split; + default: + goto err; + } + +err: +done: /* + * If we inserted a key into the first or last slot of the tree, + * remember where it was so we can do it more quickly next time. + * If the tree has record numbers, we need a complete stack so + * that we can adjust the record counts, so skipping the tree search + * isn't possible. For subdatabases we need to be careful that the + * page does not move from one db to another, so we track its LSN. + * + * If there are duplicates and we are inserting into the last slot, + * the cursor will point _to_ the last item, not after it, which + * is why we subtract P_INDX below. 
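+	 *
+	 * The hint recorded here is consumed at the top of __bamc_search;
+	 * in outline (a sketch of that fast path, not new logic):
+	 *
+	 *	if ((pgno = t->bt_lpgno) != PGNO_INVALID)
+	 *		try the cached leaf page first, without a descent;
+	 *	else
+	 *		fall back to the full __bam_search of the tree;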
+ */ + + t = dbp->bt_internal; + if (ret == 0 && TYPE(cp->page) == P_LBTREE && + (flags == DB_KEYFIRST || flags == DB_KEYLAST) && + !F_ISSET(cp, C_RECNUM) && + (!F_ISSET(dbp, DB_AM_SUBDB) || + (LOGGING_ON(dbp->env) && !F_ISSET(dbp, DB_AM_NOT_DURABLE))) && + ((NEXT_PGNO(cp->page) == PGNO_INVALID && + cp->indx >= NUM_ENT(cp->page) - P_INDX) || + (PREV_PGNO(cp->page) == PGNO_INVALID && cp->indx == 0))) { + t->bt_lpgno = cp->pgno; + if (F_ISSET(dbp, DB_AM_SUBDB)) + t->bt_llsn = LSN(cp->page); + } else + t->bt_lpgno = PGNO_INVALID; + /* + * Discard any pages pinned in the tree and their locks, except for + * the leaf page. Note, the leaf page participated in any stack we + * acquired, and so we have to adjust the stack as necessary. If + * there was only a single page on the stack, we don't have to free + * further stack pages. + */ + if (stack && BT_STK_POP(cp) != NULL) + (void)__bam_stkrel(dbc, 0); + + /* + * Regardless of whether we were successful or not, clear the delete + * flag. If we're successful, we either moved the cursor or the item + * is no longer deleted. If we're not successful, then we're just a + * copy, no need to have the flag set. + * + * We may have instantiated off-page duplicate cursors during the put, + * so clear the deleted bit from the off-page duplicate cursor as well. + */ + F_CLR(cp, C_DELETED); + if (cp->opd != NULL) { + cp = (BTREE_CURSOR *)cp->opd->internal; + F_CLR(cp, C_DELETED); + } + + return (ret); +} + +/* + * __bamc_rget -- + * Return the record number for a cursor. + * + * PUBLIC: int __bamc_rget __P((DBC *, DBT *)); + */ +int +__bamc_rget(dbc, data) + DBC *dbc; + DBT *data; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT dbt; + DB_MPOOLFILE *mpf; + db_recno_t recno; + int exact, ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Get the page with the current item on it. + * Get a copy of the key. + * Release the page, making sure we don't release it twice. + */ + if ((ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &cp->page)) != 0) + return (ret); + memset(&dbt, 0, sizeof(DBT)); + if ((ret = __db_ret(dbc, cp->page, cp->indx, &dbt, + &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) + goto err; + ret = __memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority); + cp->page = NULL; + if (ret != 0) + return (ret); + + if ((ret = __bam_search(dbc, PGNO_INVALID, &dbt, + F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND, + 1, &recno, &exact)) != 0) + goto err; + + ret = __db_retcopy(dbc->env, data, + &recno, sizeof(recno), &dbc->rdata->data, &dbc->rdata->ulen); + + /* Release the stack. */ +err: if ((t_ret = __bam_stkrel(dbc, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bamc_writelock -- + * Upgrade the cursor to a write lock. + */ +static int +__bamc_writelock(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + int ret; + + cp = (BTREE_CURSOR *)dbc->internal; + + if (cp->lock_mode == DB_LOCK_WRITE) + return (0); + + /* + * When writing to an off-page duplicate tree, we need to have the + * appropriate page in the primary tree locked. The general DBC + * code calls us first with the primary cursor so we can acquire the + * appropriate lock. + */ + ACQUIRE_WRITE_LOCK(dbc, ret); + return (ret); +} + +/* + * __bamc_next -- + * Move to the next record. 
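+ *
+ *	As called earlier in this file (e.g. by __bam_bulk_duplicates
+ *	and __bam_getlte), __bamc_next(dbc, 0, 0) means: don't advance
+ *	before the first test (initial_move == 0) and don't return
+ *	deleted items (deleted_okay == 0); initial_move == 1 steps the
+ *	cursor forward one entry first.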
+ */ +static int +__bamc_next(dbc, initial_move, deleted_okay) + DBC *dbc; + int initial_move, deleted_okay; +{ + BTREE_CURSOR *cp; + db_indx_t adjust; + db_lockmode_t lock_mode; + db_pgno_t pgno; + int ret; + + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + if (F_ISSET(dbc, DBC_OPD)) { + adjust = O_INDX; + lock_mode = DB_LOCK_NG; + } else { + adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX; + lock_mode = + F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ; + } + if (cp->page == NULL) { + ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret); + if (ret != 0) + return (ret); + } + + if (initial_move) + cp->indx += adjust; + + for (;;) { + /* + * If at the end of the page, move to a subsequent page. + * + * !!! + * Check for >= NUM_ENT. If the original search landed us on + * NUM_ENT, we may have incremented indx before the test. + */ + if (cp->indx >= NUM_ENT(cp->page)) { + if ((pgno = NEXT_PGNO(cp->page)) == PGNO_INVALID) + return (DB_NOTFOUND); + + ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret); + if (ret != 0) + return (ret); + cp->indx = 0; + continue; + } + if (!deleted_okay && IS_CUR_DELETED(dbc)) { + cp->indx += adjust; + continue; + } + break; + } + return (0); +} + +/* + * __bamc_prev -- + * Move to the previous record. + */ +static int +__bamc_prev(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + db_indx_t adjust; + db_lockmode_t lock_mode; + db_pgno_t pgno; + int ret; + + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; + + /* + * We're either moving through a page of duplicates or a btree leaf + * page. + * + * !!! + * This code handles empty pages and pages with only deleted entries. + */ + if (F_ISSET(dbc, DBC_OPD)) { + adjust = O_INDX; + lock_mode = DB_LOCK_NG; + } else { + adjust = dbc->dbtype == DB_BTREE ? P_INDX : O_INDX; + lock_mode = + F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE : DB_LOCK_READ; + } + if (cp->page == NULL) { + ACQUIRE_CUR(dbc, lock_mode, cp->pgno, 0, ret); + if (ret != 0) + return (ret); + } + + for (;;) { + /* If at the beginning of the page, move to a previous one. */ + if (cp->indx == 0) { + if ((pgno = + PREV_PGNO(cp->page)) == PGNO_INVALID) + return (DB_NOTFOUND); + + ACQUIRE_CUR(dbc, lock_mode, pgno, 0, ret); + if (ret != 0) + return (ret); + + if ((cp->indx = NUM_ENT(cp->page)) == 0) + continue; + } + + /* Ignore deleted records. */ + cp->indx -= adjust; + if (IS_CUR_DELETED(dbc)) + continue; + + break; + } + return (0); +} + +/* + * __bamc_search -- + * Move to a specified record. + */ +static int +__bamc_search(dbc, root_pgno, key, flags, exactp) + DBC *dbc; + db_pgno_t root_pgno; + const DBT *key; + u_int32_t flags; + int *exactp; +{ + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + PAGE *h; + db_indx_t base, indx, *inp, lim; + db_pgno_t bt_lpgno; + db_recno_t recno; + u_int32_t sflags; + int bulk, cmp, ret, t_ret; + + COMPQUIET(cmp, 0); + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + t = dbp->bt_internal; + ret = 0; + bulk = (F_ISSET(dbc, DBC_BULK) && cp->pgno != PGNO_INVALID); + + /* + * Find an entry in the database. Discard any lock we currently hold, + * we're going to search the tree. + */ + DISCARD_CUR(dbc, ret); + if (ret != 0) + return (ret); + + switch (flags) { + case DB_FIRST: + sflags = (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_MIN; + goto search; + case DB_LAST: + sflags = (F_ISSET(dbc, DBC_RMW) ? 
SR_WRITE : SR_READ) | SR_MAX; + goto search; + case DB_SET_RECNO: + if ((ret = __ram_getno(dbc, key, &recno, 0)) != 0) + return (ret); + sflags = + (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT; + if ((ret = __bam_rsearch(dbc, &recno, sflags, 1, exactp)) != 0) + return (ret); + goto done; + case DB_SET: + case DB_GET_BOTH: + sflags = + (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND) | SR_EXACT; + if (bulk) + break; + goto search; + case DB_GET_BOTH_RANGE: + sflags = (F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND); + goto search; + case DB_SET_RANGE: + sflags = + (F_ISSET(dbc, DBC_RMW) ? SR_WRITE : SR_READ) | SR_DUPFIRST; + goto search; + case DB_KEYFIRST: + case DB_NOOVERWRITE: + sflags = SR_KEYFIRST; + break; + case DB_KEYLAST: + case DB_NODUPDATA: + case DB_OVERWRITE_DUP: + sflags = SR_KEYLAST; + break; + default: + return (__db_unknown_flag(dbp->env, "__bamc_search", flags)); + } + + /* + * If the application has a history of inserting into the first or last + * pages of the database, we check those pages first to avoid doing a + * full search. Similarly, if the cursor is configured as a bulk + * cursor, check whether this operation belongs on the same page as the + * last one. + */ + if (bulk) + bt_lpgno = cp->pgno; + else { + if (F_ISSET(dbc, DBC_OPD)) + goto search; + + /* + * !!! + * We do not mutex protect the t->bt_lpgno field, which means + * that it can only be used in an advisory manner. If we find + * page we can use, great. If we don't, we don't care, we do + * it the slow way instead. Regardless, copy it into a local + * variable, otherwise we might acquire a lock for a page and + * then read a different page because it changed underfoot. + */ + bt_lpgno = t->bt_lpgno; + } + + /* + * If the tree has no history of insertion, do it the slow way. + */ + if (bt_lpgno == PGNO_INVALID) + goto search; + + /* + * Lock and retrieve the page on which we last inserted. + * + * The page may not exist: if a transaction created the page + * and then aborted, the page might have been truncated from + * the end of the file. We don't want to wait on the lock. + * The page may not even be relevant to this search. + */ + h = NULL; + ACQUIRE_CUR(dbc, DB_LOCK_WRITE, bt_lpgno, DB_LOCK_NOWAIT, ret); + if (ret != 0) { + if (ret == DB_LOCK_DEADLOCK || + ret == DB_LOCK_NOTGRANTED || + ret == DB_PAGE_NOTFOUND) + ret = 0; + goto fast_miss; + } + + h = cp->page; + inp = P_INP(dbp, h); + + /* + * It's okay if the page type isn't right or it's empty, it + * just means that the world changed. + */ + if (TYPE(h) != P_LBTREE || NUM_ENT(h) == 0) + goto fast_miss; + + /* Verify that this page cannot have moved to another db. */ + if (F_ISSET(dbp, DB_AM_SUBDB) && + LOG_COMPARE(&t->bt_llsn, &LSN(h)) != 0) + goto fast_miss; + + /* + * What we do here is test to see if we're at the beginning or + * end of the tree and if the new item sorts before/after the + * first/last page entry. We only try to catch inserts into + * the middle of the tree for bulk cursors. 
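+	 *
+	 * In outline, with h being the hinted last-insert leaf:
+	 *
+	 *	last  page && key >= last  entry  -> fast_hit
+	 *	first page && key <= first entry  -> fast_hit
+	 *	bulk cursor                       -> binary-search h
+	 *	anything else                     -> fast_miss (full search)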
+ */ + if (h->next_pgno == PGNO_INVALID) { + indx = NUM_ENT(h) - P_INDX; + if ((ret = __bam_cmp(dbc, key, h, indx, + t->bt_compare, &cmp)) != 0) + goto fast_miss; + if (cmp > 0) + indx += P_INDX; + if (cmp >= 0) + goto fast_hit; + } + if (h->prev_pgno == PGNO_INVALID) { + indx = 0; + if ((ret = __bam_cmp(dbc, key, h, indx, + t->bt_compare, &cmp)) != 0) + goto fast_miss; + if (cmp <= 0) + goto fast_hit; + } + if (bulk) { + DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), P_INDX) { + DB_BINARY_SEARCH_INCR(indx, base, lim, P_INDX); + if ((ret = __bam_cmp(dbc, key, h, indx, + t->bt_compare, &cmp)) != 0) + goto fast_miss; + + if (cmp == 0) + goto fast_hit; + if (cmp > 0) + DB_BINARY_SEARCH_SHIFT_BASE(indx, base, + lim, P_INDX); + } + /* + * No match found: base is the smallest index greater than + * the key and may be zero or NUM_ENT(h). + */ + indx = base; + if (indx > 0 && indx < NUM_ENT(h)) { + if (FLD_ISSET(sflags, SR_EXACT)) + return (DB_NOTFOUND); + goto fast_hit; + } + } + goto fast_miss; + +fast_hit: + if (cmp == 0) { + /* + * Found a duplicate. Deal with DB_KEYFIRST / DB_KEYLAST. + */ + if (FLD_ISSET(sflags, SR_DUPFIRST)) + while (indx > 0 && inp[indx - P_INDX] == inp[indx]) + indx -= P_INDX; + else if (FLD_ISSET(sflags, SR_DUPLAST)) + while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + inp[indx] == inp[indx + P_INDX]) + indx += P_INDX; + } + + /* Set the exact match flag, we may have found a duplicate. */ + *exactp = (cmp == 0); + + /* + * Insert the entry in the stack. (Our caller is likely to + * call __bam_stkrel() after our return.) + */ + BT_STK_CLR(cp); + BT_STK_ENTER(dbp->env, + cp, h, indx, cp->lock, cp->lock_mode, ret); + if (ret != 0) + return (ret); + goto done; + +fast_miss: + /* + * This was not the right page, so we do not need to retain + * the lock even in the presence of transactions. + * + * This is also an error path, so ret may have been set. + */ + DISCARD_CUR(dbc, ret); + cp->pgno = PGNO_INVALID; + if ((t_ret = __LPUT(dbc, cp->lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + +search: + if ((ret = __bam_search(dbc, root_pgno, + key, sflags, 1, NULL, exactp)) != 0) + return (ret); + +done: /* Initialize the cursor from the stack. */ + cp->page = cp->csp->page; + cp->pgno = cp->csp->page->pgno; + cp->indx = cp->csp->indx; + cp->lock = cp->csp->lock; + cp->lock_mode = cp->csp->lock_mode; + + /* If on an empty page or a deleted record, move to the next one. */ + if (flags == DB_FIRST && + (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc))) + if ((ret = __bamc_next(dbc, 0, 0)) != 0) + return (ret); + if (flags == DB_LAST && + (NUM_ENT(cp->page) == 0 || IS_CUR_DELETED(dbc))) + if ((ret = __bamc_prev(dbc)) != 0) + return (ret); + + return (0); +} + +/* + * __bamc_physdel -- + * Physically remove an item from the page. + */ +static int +__bamc_physdel(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT key; + DB_LOCK next_lock, prev_lock; + db_pgno_t pgno; + int delete_page, empty_page, exact, ret; + + dbp = dbc->dbp; + memset(&key, 0, sizeof(DBT)); + cp = (BTREE_CURSOR *)dbc->internal; + delete_page = empty_page = ret = 0; + LOCK_INIT(next_lock); + LOCK_INIT(prev_lock); + + /* If the page is going to be emptied, consider deleting it. */ + delete_page = empty_page = + NUM_ENT(cp->page) == (TYPE(cp->page) == P_LBTREE ? 2 : 1); + + /* + * Check if the application turned off reverse splits. Applications + * can't turn off reverse splits in off-page duplicate trees, that + * space will never be reused unless the exact same key is specified. 
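+	 *
+	 * (One way an application turns reverse splits off is, e.g.:
+	 *
+	 *	if ((ret = dbp->set_flags(dbp, DB_REVSPLITOFF)) != 0)
+	 *		goto err;
+	 *
+	 * which sets the DB_AM_REVSPLITOFF flag tested below.)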
+ */ + if (delete_page && + !F_ISSET(dbc, DBC_OPD) && F_ISSET(dbp, DB_AM_REVSPLITOFF)) + delete_page = 0; + + /* + * We never delete the last leaf page. (Not really true -- we delete + * the last leaf page of off-page duplicate trees, but that's handled + * by our caller, not down here.) + */ + if (delete_page && cp->pgno == cp->root) + delete_page = 0; + + /* + * To delete a leaf page other than an empty root page, we need a + * copy of a key from the page. Use the 0th page index since it's + * the last key the page held. + * + * !!! + * Note that because __bamc_physdel is always called from a cursor + * close, it should be safe to use the cursor's own "my_rkey" memory + * to temporarily hold this key. We shouldn't own any returned-data + * memory of interest--if we do, we're in trouble anyway. + */ + if (delete_page) { + if ((ret = __db_ret(dbc, cp->page, 0, &key, + &dbc->my_rkey.data, &dbc->my_rkey.ulen)) != 0) + return (ret); + } + + /* + * Delete the items. If page isn't empty, we adjust the cursors. + * + * !!! + * The following operations to delete a page may deadlock. The easy + * scenario is if we're deleting an item because we're closing cursors + * because we've already deadlocked and want to call txn->abort. If + * we fail due to deadlock, we'll leave a locked, possibly empty page + * in the tree, which won't be empty long because we'll undo the delete + * when we undo the transaction's modifications. + * + * !!! + * Delete the key item first, otherwise the on-page duplicate checks + * in __bam_ditem() won't work! + */ + if ((ret = __memp_dirty(dbp->mpf, + &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + return (ret); + if (TYPE(cp->page) == P_LBTREE) { + if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0) + return (ret); + if (!empty_page) + if ((ret = __bam_ca_di(dbc, + PGNO(cp->page), cp->indx, -1)) != 0) + return (ret); + } + if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0) + return (ret); + + /* Clear the deleted flag, the item is gone. */ + F_CLR(cp, C_DELETED); + + if (!empty_page) + if ((ret = __bam_ca_di(dbc, PGNO(cp->page), cp->indx, -1)) != 0) + return (ret); + + /* + * Need to downgrade write locks here or non-txn locks will get stuck. + */ + if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED)) { + if ((ret = __TLPUT(dbc, cp->lock)) != 0) + return (ret); + cp->lock_mode = DB_LOCK_WWRITE; + if (cp->page != NULL && + (ret = __memp_shared(dbp->mpf, cp->page)) != 0) + return (ret); + } + /* If we're not going to try and delete the page, we're done. */ + if (!delete_page) + return (0); + + /* + * Lock the previous and next pages before latching the parent + * sub tree. + */ + if (STD_LOCKING(dbc)) { + if ((pgno = PREV_PGNO(cp->page)) != PGNO_INVALID && + (ret = __db_lget(dbc, + 0, pgno, DB_LOCK_WRITE, 0, &prev_lock)) != 0) + return (ret); + if ((pgno = NEXT_PGNO(cp->page)) != PGNO_INVALID && + (ret = __db_lget(dbc, + 0, pgno, DB_LOCK_WRITE, 0, &next_lock)) != 0) { + (void)__TLPUT(dbc, next_lock); + return (ret); + } + } + DISCARD_CUR(dbc, ret); + if (ret != 0) + goto err; + ret = __bam_search(dbc, PGNO_INVALID, &key, SR_DEL, 0, NULL, &exact); + + /* + * If everything worked, delete the stack, otherwise, release the + * stack and page locks without further damage. + */ + if (ret == 0) + ret = __bam_dpages(dbc, 1, BTD_RELINK); + else + (void)__bam_stkrel(dbc, 0); + +err: (void)__TLPUT(dbc, prev_lock); + (void)__TLPUT(dbc, next_lock); + return (ret); +} + +/* + * __bamc_getstack -- + * Acquire a full stack for a cursor. 
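+ *
+ *	Roughly: re-fetch the cursor's leaf page, copy out a key from
+ *	it, then re-descend from the root via __bam_search with
+ *	SR_KEYFIRST, so that cp->sp through cp->csp end up holding a
+ *	write-locked root-to-leaf path for the structural change that
+ *	follows.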
+ */ +static int +__bamc_getstack(dbc) + DBC *dbc; +{ + BTREE_CURSOR *cp; + DB *dbp; + DBT dbt; + DB_MPOOLFILE *mpf; + PAGE *h; + int exact, ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Get the page with the current item on it. The caller of this + * routine has to already hold a read lock on the page, so there + * is no additional lock to acquire. + */ + if ((ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + return (ret); + + /* Get a copy of a key from the page. */ + memset(&dbt, 0, sizeof(DBT)); + ret = __db_ret(dbc, h, 0, &dbt, + &dbc->my_rkey.data, &dbc->my_rkey.ulen); + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + + /* Get a write-locked stack for the page. */ + exact = 0; + ret = __bam_search(dbc, PGNO_INVALID, + &dbt, SR_KEYFIRST, 1, NULL, &exact); + + return (ret); +} + +/* + * __bam_isopd -- + * Return if the cursor references an off-page duplicate tree via its + * page number. + */ +static int +__bam_isopd(dbc, pgnop) + DBC *dbc; + db_pgno_t *pgnop; +{ + BOVERFLOW *bo; + + if (TYPE(dbc->internal->page) != P_LBTREE) + return (0); + + bo = GET_BOVERFLOW(dbc->dbp, + dbc->internal->page, dbc->internal->indx + O_INDX); + if (B_TYPE(bo->type) == B_DUPLICATE) { + *pgnop = bo->pgno; + return (1); + } + return (0); +} + +/* + * __bam_opd_exists -- + * Return if the current position has any data. + * PUBLIC: int __bam_opd_exists __P((DBC *, db_pgno_t)); + */ +int +__bam_opd_exists(dbc, pgno) + DBC *dbc; + db_pgno_t pgno; +{ + PAGE *h; + int ret; + + if ((ret = __memp_fget(dbc->dbp->mpf, &pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + return (ret); + + /* + * We always collapse OPD trees so we only need to check + * the number of entries on the root. If there is a non-empty + * tree then there will be duplicates. + */ + if (NUM_ENT(h) == 0) + ret = 0; + else + ret = DB_KEYEXIST; + + (void)__memp_fput(dbc->dbp->mpf, dbc->thread_info, h, dbc->priority); + + return (ret); +} diff --git a/btree/bt_debug.c b/btree/bt_debug.c deleted file mode 100644 index 3aefbe7..0000000 --- a/btree/bt_debug.c +++ /dev/null @@ -1,329 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_debug.c 8.5 (Berkeley) 8/17/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/param.h> - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <db.h> -#include "btree.h" - -#ifdef DEBUG -/* - * BT_DUMP -- Dump the tree - * - * Parameters: - * dbp: pointer to the DB - */ -void -__bt_dump(dbp) - DB *dbp; -{ - BTREE *t; - PAGE *h; - pgno_t i; - char *sep; - - t = dbp->internal; - (void)fprintf(stderr, "%s: pgsz %d", - F_ISSET(t, B_INMEM) ? "memory" : "disk", t->bt_psize); - if (F_ISSET(t, R_RECNO)) - (void)fprintf(stderr, " keys %lu", t->bt_nrecs); -#undef X -#define X(flag, name) \ - if (F_ISSET(t, flag)) { \ - (void)fprintf(stderr, "%s%s", sep, name); \ - sep = ", "; \ - } - if (t->flags != 0) { - sep = " flags ("; - X(R_FIXLEN, "FIXLEN"); - X(B_INMEM, "INMEM"); - X(B_NODUPS, "NODUPS"); - X(B_RDONLY, "RDONLY"); - X(R_RECNO, "RECNO"); - X(B_METADIRTY,"METADIRTY"); - (void)fprintf(stderr, ")\n"); - } -#undef X - - for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) { - __bt_dpage(h); - (void)mpool_put(t->bt_mp, h, 0); - } -} - -/* - * BT_DMPAGE -- Dump the meta page - * - * Parameters: - * h: pointer to the PAGE - */ -void -__bt_dmpage(h) - PAGE *h; -{ - BTMETA *m; - char *sep; - - m = (BTMETA *)h; - (void)fprintf(stderr, "magic %lx\n", m->magic); - (void)fprintf(stderr, "version %lu\n", m->version); - (void)fprintf(stderr, "psize %lu\n", m->psize); - (void)fprintf(stderr, "free %lu\n", m->free); - (void)fprintf(stderr, "nrecs %lu\n", m->nrecs); - (void)fprintf(stderr, "flags %lu", m->flags); -#undef X -#define X(flag, name) \ - if (m->flags & flag) { \ - (void)fprintf(stderr, "%s%s", sep, name); \ - sep = ", "; \ - } - if (m->flags) { - sep = " ("; - X(B_NODUPS, "NODUPS"); - X(R_RECNO, "RECNO"); - (void)fprintf(stderr, ")"); - } -} - -/* - * BT_DNPAGE -- Dump the page - * - * Parameters: - * n: page number to dump. 
- */ -void -__bt_dnpage(dbp, pgno) - DB *dbp; - pgno_t pgno; -{ - BTREE *t; - PAGE *h; - - t = dbp->internal; - if ((h = mpool_get(t->bt_mp, pgno, 0)) != NULL) { - __bt_dpage(h); - (void)mpool_put(t->bt_mp, h, 0); - } -} - -/* - * BT_DPAGE -- Dump the page - * - * Parameters: - * h: pointer to the PAGE - */ -void -__bt_dpage(h) - PAGE *h; -{ - BINTERNAL *bi; - BLEAF *bl; - RINTERNAL *ri; - RLEAF *rl; - indx_t cur, top; - char *sep; - - (void)fprintf(stderr, " page %d: (", h->pgno); -#undef X -#define X(flag, name) \ - if (h->flags & flag) { \ - (void)fprintf(stderr, "%s%s", sep, name); \ - sep = ", "; \ - } - sep = ""; - X(P_BINTERNAL, "BINTERNAL") /* types */ - X(P_BLEAF, "BLEAF") - X(P_RINTERNAL, "RINTERNAL") /* types */ - X(P_RLEAF, "RLEAF") - X(P_OVERFLOW, "OVERFLOW") - X(P_PRESERVE, "PRESERVE"); - (void)fprintf(stderr, ")\n"); -#undef X - - (void)fprintf(stderr, "\tprev %2d next %2d", h->prevpg, h->nextpg); - if (h->flags & P_OVERFLOW) - return; - - top = NEXTINDEX(h); - (void)fprintf(stderr, " lower %3d upper %3d nextind %d\n", - h->lower, h->upper, top); - for (cur = 0; cur < top; cur++) { - (void)fprintf(stderr, "\t[%03d] %4d ", cur, h->linp[cur]); - switch (h->flags & P_TYPE) { - case P_BINTERNAL: - bi = GETBINTERNAL(h, cur); - (void)fprintf(stderr, - "size %03d pgno %03d", bi->ksize, bi->pgno); - if (bi->flags & P_BIGKEY) - (void)fprintf(stderr, " (indirect)"); - else if (bi->ksize) - (void)fprintf(stderr, - " {%.*s}", (int)bi->ksize, bi->bytes); - break; - case P_RINTERNAL: - ri = GETRINTERNAL(h, cur); - (void)fprintf(stderr, "entries %03d pgno %03d", - ri->nrecs, ri->pgno); - break; - case P_BLEAF: - bl = GETBLEAF(h, cur); - if (bl->flags & P_BIGKEY) - (void)fprintf(stderr, - "big key page %lu size %u/", - *(pgno_t *)bl->bytes, - *(u_int32_t *)(bl->bytes + sizeof(pgno_t))); - else if (bl->ksize) - (void)fprintf(stderr, "%s/", bl->bytes); - if (bl->flags & P_BIGDATA) - (void)fprintf(stderr, - "big data page %lu size %u", - *(pgno_t *)(bl->bytes + bl->ksize), - *(u_int32_t *)(bl->bytes + bl->ksize + - sizeof(pgno_t))); - else if (bl->dsize) - (void)fprintf(stderr, "%.*s", - (int)bl->dsize, bl->bytes + bl->ksize); - break; - case P_RLEAF: - rl = GETRLEAF(h, cur); - if (rl->flags & P_BIGDATA) - (void)fprintf(stderr, - "big data page %lu size %u", - *(pgno_t *)rl->bytes, - *(u_int32_t *)(rl->bytes + sizeof(pgno_t))); - else if (rl->dsize) - (void)fprintf(stderr, - "%.*s", (int)rl->dsize, rl->bytes); - break; - } - (void)fprintf(stderr, "\n"); - } -} -#endif - -#ifdef STATISTICS -/* - * BT_STAT -- Gather/print the tree statistics - * - * Parameters: - * dbp: pointer to the DB - */ -void -__bt_stat(dbp) - DB *dbp; -{ - extern u_long bt_cache_hit, bt_cache_miss, bt_pfxsaved, bt_rootsplit; - extern u_long bt_sortsplit, bt_split; - BTREE *t; - PAGE *h; - pgno_t i, pcont, pinternal, pleaf; - u_long ifree, lfree, nkeys; - int levels; - - t = dbp->internal; - pcont = pinternal = pleaf = 0; - nkeys = ifree = lfree = 0; - for (i = P_ROOT; (h = mpool_get(t->bt_mp, i, 0)) != NULL; ++i) { - switch (h->flags & P_TYPE) { - case P_BINTERNAL: - case P_RINTERNAL: - ++pinternal; - ifree += h->upper - h->lower; - break; - case P_BLEAF: - case P_RLEAF: - ++pleaf; - lfree += h->upper - h->lower; - nkeys += NEXTINDEX(h); - break; - case P_OVERFLOW: - ++pcont; - break; - } - (void)mpool_put(t->bt_mp, h, 0); - } - - /* Count the levels of the tree. 
*/ - for (i = P_ROOT, levels = 0 ;; ++levels) { - h = mpool_get(t->bt_mp, i, 0); - if (h->flags & (P_BLEAF|P_RLEAF)) { - if (levels == 0) - levels = 1; - (void)mpool_put(t->bt_mp, h, 0); - break; - } - i = F_ISSET(t, R_RECNO) ? - GETRINTERNAL(h, 0)->pgno : - GETBINTERNAL(h, 0)->pgno; - (void)mpool_put(t->bt_mp, h, 0); - } - - (void)fprintf(stderr, "%d level%s with %ld keys", - levels, levels == 1 ? "" : "s", nkeys); - if (F_ISSET(t, R_RECNO)) - (void)fprintf(stderr, " (%ld header count)", t->bt_nrecs); - (void)fprintf(stderr, - "\n%lu pages (leaf %ld, internal %ld, overflow %ld)\n", - pinternal + pleaf + pcont, pleaf, pinternal, pcont); - (void)fprintf(stderr, "%ld cache hits, %ld cache misses\n", - bt_cache_hit, bt_cache_miss); - (void)fprintf(stderr, "%ld splits (%ld root splits, %ld sort splits)\n", - bt_split, bt_rootsplit, bt_sortsplit); - pleaf *= t->bt_psize - BTDATAOFF; - if (pleaf) - (void)fprintf(stderr, - "%.0f%% leaf fill (%ld bytes used, %ld bytes free)\n", - ((double)(pleaf - lfree) / pleaf) * 100, - pleaf - lfree, lfree); - pinternal *= t->bt_psize - BTDATAOFF; - if (pinternal) - (void)fprintf(stderr, - "%.0f%% internal fill (%ld bytes used, %ld bytes free\n", - ((double)(pinternal - ifree) / pinternal) * 100, - pinternal - ifree, ifree); - if (bt_pfxsaved) - (void)fprintf(stderr, "prefix checking removed %lu bytes.\n", - bt_pfxsaved); -} -#endif diff --git a/btree/bt_delete.c b/btree/bt_delete.c index ece1ab6..f76aa05 100644 --- a/btree/bt_delete.c +++ b/btree/bt_delete.c @@ -1,5 +1,14 @@ /*- - * Copyright (c) 1990, 1993, 1994 + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -13,11 +22,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -32,626 +37,611 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
+ * + * $Id$ */ -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_delete.c 8.13 (Berkeley) 7/28/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/types.h> - -#include <errno.h> -#include <stdio.h> -#include <string.h> +#include "db_config.h" -#include <db.h> -#include "btree.h" - -static int __bt_bdelete __P((BTREE *, const DBT *)); -static int __bt_curdel __P((BTREE *, const DBT *, PAGE *, u_int)); -static int __bt_pdelete __P((BTREE *, PAGE *)); -static int __bt_relink __P((BTREE *, PAGE *)); -static int __bt_stkacq __P((BTREE *, PAGE **, CURSOR *)); +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" /* - * __bt_delete - * Delete the item(s) referenced by a key. + * __bam_ditem -- + * Delete one or more entries from a page. * - * Return RET_SPECIAL if the key is not found. + * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t)); */ int -__bt_delete(dbp, key, flags) - const DB *dbp; - const DBT *key; - u_int flags; -{ - BTREE *t; - CURSOR *c; +__bam_ditem(dbc, h, indx) + DBC *dbc; PAGE *h; - int status; - - t = dbp->internal; - - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } - - /* Check for change to a read-only tree. */ - if (F_ISSET(t, B_RDONLY)) { - errno = EPERM; - return (RET_ERROR); - } - - switch (flags) { - case 0: - status = __bt_bdelete(t, key); + u_int32_t indx; +{ + BINTERNAL *bi; + BKEYDATA *bk; + DB *dbp; + u_int32_t nbytes; + int ret; + db_indx_t *inp; + + dbp = dbc->dbp; + inp = P_INP(dbp, h); + + /* The page should already have been dirtied by our caller. */ + DB_ASSERT(dbp->env, IS_DIRTY(h)); + + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(dbp, h, indx); + switch (B_TYPE(bi->type)) { + case B_DUPLICATE: + case B_KEYDATA: + nbytes = BINTERNAL_SIZE(bi->len); + break; + case B_OVERFLOW: + nbytes = BINTERNAL_SIZE(bi->len); + if ((ret = + __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp->env, PGNO(h))); + } break; - case R_CURSOR: + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + case P_LBTREE: /* - * If flags is R_CURSOR, delete the cursor. Must already - * have started a scan and not have already deleted it. + * If it's a duplicate key, discard the index and don't touch + * the actual page item. + * + * !!! + * This works because no data item can have an index matching + * any other index so even if the data item is in a key "slot", + * it won't match any other index. */ - c = &t->bt_cursor; - if (F_ISSET(c, CURS_INIT)) { - if (F_ISSET(c, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) - return (RET_SPECIAL); - if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) - return (RET_ERROR); - + if ((indx % 2) == 0) { /* - * If the page is about to be emptied, we'll need to - * delete it, which means we have to acquire a stack. + * Check for a duplicate after us on the page. NOTE: + * we have to delete the key item before deleting the + * data item, otherwise the "indx + P_INDX" calculation + * won't work! */ - if (NEXTINDEX(h) == 1) - if (__bt_stkacq(t, &h, &t->bt_cursor)) - return (RET_ERROR); - - status = __bt_dleaf(t, NULL, h, c->pg.index); - - if (NEXTINDEX(h) == 0 && status == RET_SUCCESS) { - if (__bt_pdelete(t, h)) - return (RET_ERROR); - } else - mpool_put(t->bt_mp, - h, status == RET_SUCCESS ? 
MPOOL_DIRTY : 0); - break; + if (indx + P_INDX < (u_int32_t)NUM_ENT(h) && + inp[indx] == inp[indx + P_INDX]) + return (__bam_adjindx(dbc, + h, indx, indx + O_INDX, 0)); + /* + * Check for a duplicate before us on the page. It + * doesn't matter if we delete the key item before or + * after the data item for the purposes of this one. + */ + if (indx > 0 && inp[indx] == inp[indx - P_INDX]) + return (__bam_adjindx(dbc, + h, indx, indx - P_INDX, 0)); } /* FALLTHROUGH */ + case P_LDUP: + case P_LRECNO: + bk = GET_BKEYDATA(dbp, h, indx); + switch (B_TYPE(bk->type)) { + case B_DUPLICATE: + nbytes = BOVERFLOW_SIZE; + break; + case B_OVERFLOW: + nbytes = BOVERFLOW_SIZE; + if ((ret = __db_doff( + dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0) + return (ret); + break; + case B_KEYDATA: + nbytes = BKEYDATA_SIZE(bk->len); + break; + default: + return (__db_pgfmt(dbp->env, PGNO(h))); + } + break; default: - errno = EINVAL; - return (RET_ERROR); + return (__db_pgfmt(dbp->env, PGNO(h))); } - if (status == RET_SUCCESS) - F_SET(t, B_MODIFIED); - return (status); + + /* Delete the item and mark the page dirty. */ + if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0) + return (ret); + + return (0); } /* - * __bt_stkacq -- - * Acquire a stack so we can delete a cursor entry. + * __bam_adjindx -- + * Adjust an index on the page. * - * Parameters: - * t: tree - * hp: pointer to current, pinned PAGE pointer - * c: pointer to the cursor + * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int)); + */ +int +__bam_adjindx(dbc, h, indx, indx_copy, is_insert) + DBC *dbc; + PAGE *h; + u_int32_t indx, indx_copy; + int is_insert; +{ + DB *dbp; + db_indx_t copy, *inp; + int ret; + + dbp = dbc->dbp; + inp = P_INP(dbp, h); + + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0, + PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(h)); + + /* Shuffle the indices and mark the page dirty. */ + if (is_insert) { + copy = inp[indx_copy]; + if (indx != NUM_ENT(h)) + memmove(&inp[indx + O_INDX], &inp[indx], + sizeof(db_indx_t) * (NUM_ENT(h) - indx)); + inp[indx] = copy; + ++NUM_ENT(h); + } else { + --NUM_ENT(h); + if (indx != NUM_ENT(h)) + memmove(&inp[indx], &inp[indx + O_INDX], + sizeof(db_indx_t) * (NUM_ENT(h) - indx)); + } + + return (0); +} + +/* + * __bam_dpages -- + * Delete a set of locked pages. * - * Returns: - * 0 on success, 1 on failure + * PUBLIC: int __bam_dpages __P((DBC *, int, int)); */ -static int -__bt_stkacq(t, hp, c) - BTREE *t; - PAGE **hp; - CURSOR *c; +int +__bam_dpages(dbc, use_top, flags) + DBC *dbc; + int use_top; + int flags; { BINTERNAL *bi; - EPG *e; - EPGNO *parent; - PAGE *h; - indx_t index; - pgno_t pgno; - recno_t nextpg, prevpg; - int exact, level; - - /* - * Find the first occurrence of the key in the tree. Toss the - * currently locked page so we don't hit an already-locked page. - */ - h = *hp; - mpool_put(t->bt_mp, h, 0); - if ((e = __bt_search(t, &c->key, &exact)) == NULL) - return (1); - h = e->page; - - /* See if we got it in one shot. 
 */ - if (h->pgno == c->pg.pgno) - goto ret; + BTREE_CURSOR *cp; + DB *dbp; + DBT a, b; + DB_LOCK c_lock, p_lock; + DB_MPOOLFILE *mpf; + EPG *epg, *save_sp, *stack_epg; + PAGE *child, *parent; + db_indx_t nitems; + db_pgno_t pgno, root_pgno; + db_recno_t rcnt; + int done, ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + nitems = 0; + pgno = PGNO_INVALID; /* - * Move right, looking for the page. At each move we have to move - * up the stack until we don't have to move to the next page. If - * we have to change pages at an internal level, we have to fix the - * stack back up. + * We have the entire stack of deletable pages locked. + * + * Btree calls us with a stack in which the first page is to have a + * single item deleted, and the rest of the pages are to be removed. + * + * Recno always has a stack to the root and __bam_merge operations + * may have unneeded items in the stack. We find the lowest page + * in the stack that has more than one record in it and start there. */ - while (h->pgno != c->pg.pgno) { - if ((nextpg = h->nextpg) == P_INVALID) - break; - mpool_put(t->bt_mp, h, 0); - - /* Move up the stack. */ - for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { - /* Get the parent page. */ - if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) - return (1); - - /* Move to the next index. */ - if (parent->index != NEXTINDEX(h) - 1) { - index = parent->index + 1; - BT_PUSH(t, h->pgno, index); + ret = 0; + if (use_top) + stack_epg = cp->sp; + else + for (stack_epg = cp->csp; stack_epg > cp->sp; --stack_epg) + if (NUM_ENT(stack_epg->page) > 1) break; - } - mpool_put(t->bt_mp, h, 0); - } + epg = stack_epg; + /* + * !!! + * There is an interesting deadlock situation here. We have to relink + * the leaf page chain around the leaf page being deleted. Consider + * a cursor walking through the leaf pages that has the previous page + * read-locked and is waiting on a lock for the page we're deleting. + * It will deadlock here. Before we unlink the subtree, we relink the + * leaf page chain. + */ + if (LF_ISSET(BTD_RELINK) && LEVEL(cp->csp->page) == 1 && + (ret = __bam_relink(dbc, cp->csp->page, NULL, PGNO_INVALID)) != 0) + goto discard; - /* Restore the stack. */ - while (level--) { - /* Push the next level down onto the stack. */ - bi = GETBINTERNAL(h, index); - pgno = bi->pgno; - BT_PUSH(t, pgno, 0); + /* + * Delete the last item that references the underlying pages that are + * to be deleted, and adjust cursors that reference that page. Then, + * save that page's page number and item count and release it. If + * the application isn't retaining locks because it's running without + * transactions, this lets the rest of the tree get back to business + * immediately. + */ + if ((ret = __memp_dirty(mpf, + &epg->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto discard; + if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) + goto discard; + if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0) + goto discard; + + if (LF_ISSET(BTD_UPDATE) && epg->indx == 0) { + save_sp = cp->csp; + cp->csp = epg; + ret = __bam_pupdate(dbc, epg->page); + cp->csp = save_sp; + if (ret != 0) + goto discard; + } - /* Lose the currently pinned page. 
*/ - mpool_put(t->bt_mp, h, 0); + pgno = PGNO(epg->page); + nitems = NUM_ENT(epg->page); + + ret = __memp_fput(mpf, dbc->thread_info, epg->page, dbc->priority); + epg->page = NULL; + if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err_inc; + + /* Then, discard any pages that we don't care about. */ +discard: for (epg = cp->sp; epg < stack_epg; ++epg) { + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + epg->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + epg->page = NULL; + if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; + } + if (ret != 0) + goto err; + + /* Free the rest of the pages in the stack. */ + while (++epg <= cp->csp) { + if ((ret = __memp_dirty(mpf, &epg->page, + dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err; + /* + * Delete page entries so they will be restored as part of + * recovery. We don't need to do cursor adjustment here as + * the pages are being emptied by definition and so cannot + * be referenced by a cursor. + */ + if (NUM_ENT(epg->page) != 0) { + DB_ASSERT(dbp->env, LEVEL(epg->page) != 1); - /* Get the next level down. */ - if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) - return (1); - index = 0; + if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0) + goto err; + /* + * Sheer paranoia: if we find any pages that aren't + * emptied by the delete, someone else added an item + * while we were walking the tree, and we discontinue + * the delete. Shouldn't be possible, but we check + * regardless. + */ + if (NUM_ENT(epg->page) != 0) + goto err; } - mpool_put(t->bt_mp, h, 0); - if ((h = mpool_get(t->bt_mp, nextpg, 0)) == NULL) - return (1); - } - if (h->pgno == c->pg.pgno) - goto ret; + ret = __db_free(dbc, epg->page); + if (cp->page == epg->page) + cp->page = NULL; + epg->page = NULL; + if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err_inc; + } - /* Reacquire the original stack. */ - mpool_put(t->bt_mp, h, 0); - if ((e = __bt_search(t, &c->key, &exact)) == NULL) - return (1); - h = e->page; + if (0) { +err_inc: ++epg; +err: for (; epg <= cp->csp; ++epg) { + if (epg->page != NULL) { + (void)__memp_fput(mpf, dbc->thread_info, + epg->page, dbc->priority); + epg->page = NULL; + } + (void)__TLPUT(dbc, epg->lock); + } + BT_STK_CLR(cp); + return (ret); + } + BT_STK_CLR(cp); /* - * Move left, looking for the page. At each move we have to move - * up the stack until we don't have to change pages to move to the - * next page. If we have to change pages at an internal level, we - * have to fix the stack back up. + * If we just deleted the next-to-last item from the root page, the + * tree can collapse one or more levels. While there remains only a + * single item on the root page, write lock the last page referenced + * by the root page and copy it over the root page. */ - while (h->pgno != c->pg.pgno) { - if ((prevpg = h->prevpg) == P_INVALID) + root_pgno = cp->root; + if (pgno != root_pgno || nitems != 1) + return (0); + + for (done = 0; !done;) { + /* Initialize. */ + parent = child = NULL; + LOCK_INIT(p_lock); + LOCK_INIT(c_lock); + + /* Lock the root. */ + pgno = root_pgno; + if ((ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0) + goto stop; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, + DB_MPOOL_DIRTY, &parent)) != 0) + goto stop; + + if (NUM_ENT(parent) != 1) + goto stop; + + switch (TYPE(parent)) { + case P_IBTREE: + /* + * If this is overflow, then try to delete it. 
+ * The child may or may not still point at it. + */ + bi = GET_BINTERNAL(dbp, parent, 0); + if (B_TYPE(bi->type) == B_OVERFLOW) + if ((ret = __db_doff(dbc, + ((BOVERFLOW *)bi->data)->pgno)) != 0) + goto stop; + pgno = bi->pgno; break; - mpool_put(t->bt_mp, h, 0); - - /* Move up the stack. */ - for (level = 0; (parent = BT_POP(t)) != NULL; ++level) { - /* Get the parent page. */ - if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) - return (1); - - /* Move to the next index. */ - if (parent->index != 0) { - index = parent->index - 1; - BT_PUSH(t, h->pgno, index); - break; - } - mpool_put(t->bt_mp, h, 0); + case P_IRECNO: + pgno = GET_RINTERNAL(dbp, parent, 0)->pgno; + break; + default: + goto stop; } - /* Restore the stack. */ - while (level--) { - /* Push the next level down onto the stack. */ - bi = GETBINTERNAL(h, index); - pgno = bi->pgno; + /* Lock the child page. */ + if ((ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0) + goto stop; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, + DB_MPOOL_DIRTY, &child)) != 0) + goto stop; + + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + memset(&a, 0, sizeof(a)); + a.data = child; + a.size = dbp->pgsize; + memset(&b, 0, sizeof(b)); + b.data = P_ENTRY(dbp, parent, 0); + b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE : + BINTERNAL_SIZE(((BINTERNAL *)b.data)->len); + if ((ret = __bam_rsplit_log(dbp, dbc->txn, + &child->lsn, 0, PGNO(child), &a, PGNO(parent), + RE_NREC(parent), &b, &parent->lsn)) != 0) + goto stop; + } else + LSN_NOT_LOGGED(child->lsn); - /* Lose the currently pinned page. */ - mpool_put(t->bt_mp, h, 0); + /* + * Make the switch. + * + * One fixup -- internal pages below the top level do not store + * a record count, so we have to preserve it if we're not + * converting to a leaf page. Note also that we are about to + * overwrite the parent page, including its LSN. This is OK + * because the log message we wrote describing this update + * stores its LSN on the child page. When the child is copied + * onto the parent, the correct LSN is copied into place. + */ + COMPQUIET(rcnt, 0); + if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL) + rcnt = RE_NREC(parent); + memcpy(parent, child, dbp->pgsize); + PGNO(parent) = root_pgno; + if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL) + RE_NREC_SET(parent, rcnt); + + /* Adjust the cursors. */ + if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0) + goto stop; - /* Get the next level down. */ - if ((h = mpool_get(t->bt_mp, pgno, 0)) == NULL) - return (1); + /* + * Free the page copied onto the root page and discard its + * lock. (The call to __db_free() discards our reference + * to the page.) 
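+		 *
+		 * (Collapse, pictorially: while the root holds exactly
+		 * one entry,
+		 *
+		 *	root [1 entry] -> child [...]    ==>    root [...]
+		 *
+		 * the child's image is copied over the root, the child
+		 * page is freed, and the loop retries one level lower.)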
+ */ + if ((ret = __db_free(dbc, child)) != 0) { + child = NULL; + goto stop; + } + child = NULL; - index = NEXTINDEX(h) - 1; - BT_PUSH(t, pgno, index); + if (0) { +stop: done = 1; } - mpool_put(t->bt_mp, h, 0); - if ((h = mpool_get(t->bt_mp, prevpg, 0)) == NULL) - return (1); + if ((t_ret = __TLPUT(dbc, p_lock)) != 0 && ret == 0) + ret = t_ret; + if (parent != NULL && + (t_ret = __memp_fput(mpf, dbc->thread_info, + parent, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, c_lock)) != 0 && ret == 0) + ret = t_ret; + if (child != NULL && + (t_ret = __memp_fput(mpf, dbc->thread_info, + child, dbc->priority)) != 0 && ret == 0) + ret = t_ret; } - -ret: mpool_put(t->bt_mp, h, 0); - return ((*hp = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL); + return (ret); } /* - * __bt_bdelete -- - * Delete all key/data pairs matching the specified key. + * __bam_relink -- + * Relink around a deleted page. * - * Parameters: - * t: tree - * key: key to delete - * - * Returns: - * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. + * PUBLIC: int __bam_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t)); + * Otherp can be either the previous or the next page to use if + * the caller already holds that page. */ -static int -__bt_bdelete(t, key) - BTREE *t; - const DBT *key; +int +__bam_relink(dbc, pagep, otherp, new_pgno) + DBC *dbc; + PAGE *pagep, *otherp; + db_pgno_t new_pgno; { - EPG *e; - PAGE *h; - int deleted, exact, redo; - - deleted = 0; - - /* Find any matching record; __bt_search pins the page. */ -loop: if ((e = __bt_search(t, key, &exact)) == NULL) - return (deleted ? RET_SUCCESS : RET_ERROR); - if (!exact) { - mpool_put(t->bt_mp, e->page, 0); - return (deleted ? RET_SUCCESS : RET_SPECIAL); - } + DB *dbp; + DB_LOCK npl, ppl; + DB_LSN *nlsnp, *plsnp, ret_lsn; + DB_MPOOLFILE *mpf; + PAGE *np, *pp; + int ret, t_ret; + + dbp = dbc->dbp; + np = pp = NULL; + LOCK_INIT(npl); + LOCK_INIT(ppl); + nlsnp = plsnp = NULL; + mpf = dbp->mpf; + ret = 0; /* - * Delete forward, then delete backward, from the found key. If - * there are duplicates and we reach either side of the page, do - * the key search again, so that we get them all. + * Retrieve the one/two pages. The caller must have them locked + * because the parent is latched. For a remove, we may need + * two pages (the before and after). For an add, we only need one + * because, the split took care of the prev. */ - redo = 0; - h = e->page; - do { - if (__bt_dleaf(t, key, h, e->index)) { - mpool_put(t->bt_mp, h, 0); - return (RET_ERROR); - } - if (F_ISSET(t, B_NODUPS)) { - if (NEXTINDEX(h) == 0) { - if (__bt_pdelete(t, h)) - return (RET_ERROR); - } else - mpool_put(t->bt_mp, h, MPOOL_DIRTY); - return (RET_SUCCESS); - } - deleted = 1; - } while (e->index < NEXTINDEX(h) && __bt_cmp(t, key, e) == 0); - - /* Check for right-hand edge of the page. */ - if (e->index == NEXTINDEX(h)) - redo = 1; - - /* Delete from the key to the beginning of the page. */ - while (e->index-- > 0) { - if (__bt_cmp(t, key, e) != 0) - break; - if (__bt_dleaf(t, key, h, e->index) == RET_ERROR) { - mpool_put(t->bt_mp, h, 0); - return (RET_ERROR); + if (pagep->next_pgno != PGNO_INVALID) { + if (((np = otherp) == NULL || + PGNO(otherp) != pagep->next_pgno) && + (ret = __memp_fget(mpf, &pagep->next_pgno, + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) { + ret = __db_pgerr(dbp, pagep->next_pgno, ret); + goto err; } - if (e->index == 0) - redo = 1; + nlsnp = &np->lsn; } - - /* Check for an empty page. 
*/ - if (NEXTINDEX(h) == 0) { - if (__bt_pdelete(t, h)) - return (RET_ERROR); - goto loop; + if (pagep->prev_pgno != PGNO_INVALID) { + if (((pp = otherp) == NULL || + PGNO(otherp) != pagep->prev_pgno) && + (ret = __memp_fget(mpf, &pagep->prev_pgno, + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) { + ret = __db_pgerr(dbp, pagep->prev_pgno, ret); + goto err; + } + plsnp = &pp->lsn; } - /* Put the page. */ - mpool_put(t->bt_mp, h, MPOOL_DIRTY); - - if (redo) - goto loop; - return (RET_SUCCESS); -} - -/* - * __bt_pdelete -- - * Delete a single page from the tree. - * - * Parameters: - * t: tree - * h: leaf page - * - * Returns: - * RET_SUCCESS, RET_ERROR. - * - * Side-effects: - * mpool_put's the page - */ -static int -__bt_pdelete(t, h) - BTREE *t; - PAGE *h; -{ - BINTERNAL *bi; - PAGE *pg; - EPGNO *parent; - indx_t cnt, index, *ip, offset; - u_int32_t nksize; - char *from; + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_relink_log(dbp, dbc->txn, &ret_lsn, 0, + pagep->pgno, new_pgno, pagep->prev_pgno, plsnp, + pagep->next_pgno, nlsnp)) != 0) + goto err; + } else + LSN_NOT_LOGGED(ret_lsn); + if (np != NULL) + np->lsn = ret_lsn; + if (pp != NULL) + pp->lsn = ret_lsn; /* - * Walk the parent page stack -- a LIFO stack of the pages that were - * traversed when we searched for the page where the delete occurred. - * Each stack entry is a page number and a page index offset. The - * offset is for the page traversed on the search. We've just deleted - * a page, so we have to delete the key from the parent page. - * - * If the delete from the parent page makes it empty, this process may - * continue all the way up the tree. We stop if we reach the root page - * (which is never deleted, it's just not worth the effort) or if the - * delete does not empty the page. + * Modify and release the two pages. */ - while ((parent = BT_POP(t)) != NULL) { - /* Get the parent page. */ - if ((pg = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) - return (RET_ERROR); - - index = parent->index; - bi = GETBINTERNAL(pg, index); - - /* Free any overflow pages. */ - if (bi->flags & P_BIGKEY && - __ovfl_delete(t, bi->bytes) == RET_ERROR) { - mpool_put(t->bt_mp, pg, 0); - return (RET_ERROR); - } - - /* - * Free the parent if it has only the one key and it's not the - * root page. If it's the rootpage, turn it back into an empty - * leaf page. - */ - if (NEXTINDEX(pg) == 1) - if (pg->pgno == P_ROOT) { - pg->lower = BTDATAOFF; - pg->upper = t->bt_psize; - pg->flags = P_BLEAF; - } else { - if (__bt_relink(t, pg) || __bt_free(t, pg)) - return (RET_ERROR); - continue; - } - else { - /* Pack remaining key items at the end of the page. */ - nksize = NBINTERNAL(bi->ksize); - from = (char *)pg + pg->upper; - memmove(from + nksize, from, (char *)bi - from); - pg->upper += nksize; - - /* Adjust indices' offsets, shift the indices down. */ - offset = pg->linp[index]; - for (cnt = index, ip = &pg->linp[0]; cnt--; ++ip) - if (ip[0] < offset) - ip[0] += nksize; - for (cnt = NEXTINDEX(pg) - index; --cnt; ++ip) - ip[0] = ip[1] < offset ? ip[1] + nksize : ip[1]; - pg->lower -= sizeof(indx_t); - } - - mpool_put(t->bt_mp, pg, MPOOL_DIRTY); - break; + if (np != NULL) { + if (new_pgno == PGNO_INVALID) + np->prev_pgno = pagep->prev_pgno; + else + np->prev_pgno = new_pgno; + if (np != otherp) + ret = __memp_fput(mpf, + dbc->thread_info, np, dbc->priority); + if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; } - /* Free the leaf page, as long as it wasn't the root. 
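Old __bt_relink and new __bam_relink differ in locking and logging, but the pointer surgery is the same: rewrite the neighbors' prev/next page numbers so the leaf chain no longer passes through the deleted page. A minimal sketch of the unlink case, using an invented in-memory page table in place of the buffer pool:

	#include <stdio.h>

	#define PG_INVALID 0	/* stand-in for PGNO_INVALID */
	#define NPAGES 8

	/* Invented page table: pages[pgno] holds that page's chain links. */
	static struct toy_link {
		unsigned prev_pgno, next_pgno;
	} pages[NPAGES];

	/* Splice page 'pgno' out of the doubly-linked chain. */
	static void
	relink_out(unsigned pgno)
	{
		unsigned prev, next;

		prev = pages[pgno].prev_pgno;
		next = pages[pgno].next_pgno;
		if (next != PG_INVALID)
			pages[next].prev_pgno = prev;
		if (prev != PG_INVALID)
			pages[prev].next_pgno = next;
	}

	int
	main(void)
	{
		/* Chain: 2 <-> 3 <-> 4. */
		pages[2].next_pgno = 3; pages[3].prev_pgno = 2;
		pages[3].next_pgno = 4; pages[4].prev_pgno = 3;

		relink_out(3);
		printf("2.next = %u, 4.prev = %u\n",
		    pages[2].next_pgno, pages[4].prev_pgno);
		return (0);
	}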
*/ - if (h->pgno == P_ROOT) { - mpool_put(t->bt_mp, h, MPOOL_DIRTY); - return (RET_SUCCESS); + if (pp != NULL) { + if (new_pgno == PGNO_INVALID) + pp->next_pgno = pagep->next_pgno; + else + pp->next_pgno = new_pgno; + if (pp != otherp) + ret = __memp_fput(mpf, + dbc->thread_info, pp, dbc->priority); + if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; } - return (__bt_relink(t, h) || __bt_free(t, h)); + return (0); + +err: if (np != NULL && np != otherp) + (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority); + if (pp != NULL && pp != otherp) + (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority); + return (ret); } /* - * __bt_dleaf -- - * Delete a single record from a leaf page. + * __bam_pupdate -- + * Update parent key pointers up the tree. * - * Parameters: - * t: tree - * key: referenced key - * h: page - * index: index on page to delete - * - * Returns: - * RET_SUCCESS, RET_ERROR. + * PUBLIC: int __bam_pupdate __P((DBC *, PAGE *)); */ int -__bt_dleaf(t, key, h, index) - BTREE *t; - const DBT *key; - PAGE *h; - u_int index; +__bam_pupdate(dbc, lpg) + DBC *dbc; + PAGE *lpg; { - BLEAF *bl; - indx_t cnt, *ip, offset; - u_int32_t nbytes; - void *to; - char *from; - - /* If this record is referenced by the cursor, delete the cursor. */ - if (F_ISSET(&t->bt_cursor, CURS_INIT) && - !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && - t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index == index && - __bt_curdel(t, key, h, index)) - return (RET_ERROR); - - /* If the entry uses overflow pages, make them available for reuse. */ - to = bl = GETBLEAF(h, index); - if (bl->flags & P_BIGKEY && __ovfl_delete(t, bl->bytes) == RET_ERROR) - return (RET_ERROR); - if (bl->flags & P_BIGDATA && - __ovfl_delete(t, bl->bytes + bl->ksize) == RET_ERROR) - return (RET_ERROR); - - /* Pack the remaining key/data items at the end of the page. */ - nbytes = NBLEAF(bl); - from = (char *)h + h->upper; - memmove(from + nbytes, from, (char *)to - from); - h->upper += nbytes; - - /* Adjust the indices' offsets, shift the indices down. */ - offset = h->linp[index]; - for (cnt = index, ip = &h->linp[0]; cnt--; ++ip) - if (ip[0] < offset) - ip[0] += nbytes; - for (cnt = NEXTINDEX(h) - index; --cnt; ++ip) - ip[0] = ip[1] < offset ? ip[1] + nbytes : ip[1]; - h->lower -= sizeof(indx_t); - - /* If the cursor is on this page, adjust it as necessary. */ - if (F_ISSET(&t->bt_cursor, CURS_INIT) && - !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && - t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index > index) - --t->bt_cursor.pg.index; - - return (RET_SUCCESS); -} + BTREE_CURSOR *cp; + ENV *env; + EPG *epg; + int ret; -/* - * __bt_curdel -- - * Delete the cursor. - * - * Parameters: - * t: tree - * key: referenced key (or NULL) - * h: page - * index: index on page to delete - * - * Returns: - * RET_SUCCESS, RET_ERROR. - */ -static int -__bt_curdel(t, key, h, index) - BTREE *t; - const DBT *key; - PAGE *h; - u_int index; -{ - CURSOR *c; - EPG e; - PAGE *pg; - int curcopy, status; + env = dbc->env; + cp = (BTREE_CURSOR *)dbc->internal; + ret = 0; /* - * If there are duplicates, move forward or backward to one. - * Otherwise, copy the key into the cursor area. + * Update the parents up the tree. __bam_pinsert only looks at the + * left child if it is a leaf page, so we don't need to change it. We + * just do a delete and insert; a replace is possible but reusing + * pinsert is better.
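One plausible reason a plain overwrite is awkward (beyond reuse of __bam_pinsert, the reason the comment gives) is that entries live in a packed page, so a replacement key of a different size cannot simply be written over the old one in place. A toy packed buffer (invented format, not the real slotted-page layout) making the same point:

	#include <stdio.h>
	#include <string.h>

	/* Invented: records stored back to back as NUL-terminated strings. */
	static char buf[64];
	static size_t used;

	static void
	insert_rec(const char *s)
	{
		size_t len = strlen(s) + 1;

		memcpy(buf + used, s, len);
		used += len;
	}

	static void
	delete_first(void)
	{
		size_t len = strlen(buf) + 1;

		memmove(buf, buf + len, used - len);
		used -= len;
	}

	int
	main(void)
	{
		insert_rec("old-separator");
		delete_first();		/* delete, then ...		  */
		insert_rec("new");	/* ... insert: sizes may differ */
		printf("first record: %s\n", buf);
		return (0);
	}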
*/ - c = &t->bt_cursor; - F_CLR(c, CURS_AFTER | CURS_BEFORE | CURS_ACQUIRE); - - curcopy = 0; - if (!F_ISSET(t, B_NODUPS)) { - /* - * We're going to have to do comparisons. If we weren't - * provided a copy of the key, i.e. the user is deleting - * the current cursor position, get one. - */ - if (key == NULL) { - e.page = h; - e.index = index; - if ((status = __bt_ret(t, &e, - &c->key, &c->key, NULL, NULL, 1)) != RET_SUCCESS) - return (status); - curcopy = 1; - key = &c->key; - } - /* Check previous key, if not at the beginning of the page. */ - if (index > 0) { - e.page = h; - e.index = index - 1; - if (__bt_cmp(t, key, &e) == 0) { - F_SET(c, CURS_BEFORE); - goto dup2; + for (epg = &cp->csp[-1]; epg >= cp->sp; epg--) { + if ((ret = __memp_dirty(dbc->dbp->mpf, &epg->page, + dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + return (ret); + epg->indx--; + if ((ret = __bam_pinsert(dbc, epg, 0, + lpg, epg[1].page, BPI_NORECNUM | BPI_REPLACE)) != 0) { + if (ret == DB_NEEDSPLIT) { + /* This should not happen. */ + __db_errx(env, + "Not enough room in parent: %s: page %lu", + dbc->dbp->fname, (u_long)PGNO(epg->page)); + ret = __env_panic(env, EINVAL); } + epg->indx++; + return (ret); } - /* Check next key, if not at the end of the page. */ - if (index < NEXTINDEX(h) - 1) { - e.page = h; - e.index = index + 1; - if (__bt_cmp(t, key, &e) == 0) { - F_SET(c, CURS_AFTER); - goto dup2; - } - } - /* Check previous key if at the beginning of the page. */ - if (index == 0 && h->prevpg != P_INVALID) { - if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) - return (RET_ERROR); - e.page = pg; - e.index = NEXTINDEX(pg) - 1; - if (__bt_cmp(t, key, &e) == 0) { - F_SET(c, CURS_BEFORE); - goto dup1; - } - mpool_put(t->bt_mp, pg, 0); - } - /* Check next key if at the end of the page. */ - if (index == NEXTINDEX(h) - 1 && h->nextpg != P_INVALID) { - if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) - return (RET_ERROR); - e.page = pg; - e.index = 0; - if (__bt_cmp(t, key, &e) == 0) { - F_SET(c, CURS_AFTER); -dup1: mpool_put(t->bt_mp, pg, 0); -dup2: c->pg.pgno = e.page->pgno; - c->pg.index = e.index; - return (RET_SUCCESS); - } - mpool_put(t->bt_mp, pg, 0); - } - } - e.page = h; - e.index = index; - if (curcopy || (status = - __bt_ret(t, &e, &c->key, &c->key, NULL, NULL, 1)) == RET_SUCCESS) { - F_SET(c, CURS_ACQUIRE); - return (RET_SUCCESS); - } - return (status); -} - -/* - * __bt_relink -- - * Link around a deleted page. - * - * Parameters: - * t: tree - * h: page to be deleted - */ -static int -__bt_relink(t, h) - BTREE *t; - PAGE *h; -{ - PAGE *pg; - - if (h->nextpg != P_INVALID) { - if ((pg = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) - return (RET_ERROR); - pg->prevpg = h->prevpg; - mpool_put(t->bt_mp, pg, MPOOL_DIRTY); + epg->indx++; } - if (h->prevpg != P_INVALID) { - if ((pg = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) - return (RET_ERROR); - pg->nextpg = h->nextpg; - mpool_put(t->bt_mp, pg, MPOOL_DIRTY); - } - return (0); + return (ret); } diff --git a/btree/bt_get.c b/btree/bt_get.c deleted file mode 100644 index 74824c7..0000000 --- a/btree/bt_get.c +++ /dev/null @@ -1,105 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_get.c 8.6 (Berkeley) 7/20/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/types.h> - -#include <errno.h> -#include <stddef.h> -#include <stdio.h> - -#include <db.h> -#include "btree.h" - -/* - * __BT_GET -- Get a record from the btree. - * - * Parameters: - * dbp: pointer to access method - * key: key to find - * data: data to return - * flag: currently unused - * - * Returns: - * RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key not found. - */ -int -__bt_get(dbp, key, data, flags) - const DB *dbp; - const DBT *key; - DBT *data; - u_int flags; -{ - BTREE *t; - EPG *e; - int exact, status; - - t = dbp->internal; - - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } - - /* Get currently doesn't take any flags. */ - if (flags) { - errno = EINVAL; - return (RET_ERROR); - } - - if ((e = __bt_search(t, key, &exact)) == NULL) - return (RET_ERROR); - if (!exact) { - mpool_put(t->bt_mp, e->page, 0); - return (RET_SPECIAL); - } - - status = __bt_ret(t, e, NULL, NULL, data, &t->bt_rdata, 0); - - /* - * If the user is doing concurrent access, we copied the - * key/data, toss the page. - */ - if (F_ISSET(t, B_DB_LOCK)) - mpool_put(t->bt_mp, e->page, 0); - else - t->bt_pinned = e->page; - return (status); -} diff --git a/btree/bt_method.c b/btree/bt_method.c new file mode 100644 index 0000000..d27fe3d --- /dev/null +++ b/btree/bt_method.c @@ -0,0 +1,734 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/qam.h" + +static int __bam_set_bt_minkey __P((DB *, u_int32_t)); +static int __bam_get_bt_compare + __P((DB *, int (**)(DB *, const DBT *, const DBT *))); +static int __bam_get_bt_prefix + __P((DB *, size_t(**)(DB *, const DBT *, const DBT *))); +static int __bam_set_bt_prefix + __P((DB *, size_t(*)(DB *, const DBT *, const DBT *))); +static int __bam_get_bt_compress __P((DB *, + int (**)(DB *, const DBT *, const DBT *, const DBT *, const DBT *, DBT *), + int (**)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *))); +static int __ram_get_re_delim __P((DB *, int *)); +static int __ram_set_re_delim __P((DB *, int)); +static int __ram_set_re_len __P((DB *, u_int32_t)); +static int __ram_set_re_pad __P((DB *, int)); +static int __ram_get_re_source __P((DB *, const char **)); +static int __ram_set_re_source __P((DB *, const char *)); + +/* + * __bam_db_create -- + * Btree specific initialization of the DB structure. + * + * PUBLIC: int __bam_db_create __P((DB *)); + */ +int +__bam_db_create(dbp) + DB *dbp; +{ + BTREE *t; + int ret; + + /* Allocate and initialize the private btree structure. */ + if ((ret = __os_calloc(dbp->env, 1, sizeof(BTREE), &t)) != 0) + return (ret); + dbp->bt_internal = t; + + t->bt_minkey = DEFMINKEYPAGE; /* Btree */ + t->bt_compare = __bam_defcmp; + t->bt_prefix = __bam_defpfx; +#ifdef HAVE_COMPRESSION + t->bt_compress = NULL; + t->bt_decompress = NULL; + t->compress_dup_compare = NULL; + + /* + * DB_AM_COMPRESS may have been set in __bam_metachk before the + * bt_internal structure existed. + */ + if (F_ISSET(dbp, DB_AM_COMPRESS) && + (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0) + return (ret); +#endif + + dbp->get_bt_compare = __bam_get_bt_compare; + dbp->set_bt_compare = __bam_set_bt_compare; + dbp->get_bt_minkey = __bam_get_bt_minkey; + dbp->set_bt_minkey = __bam_set_bt_minkey; + dbp->get_bt_prefix = __bam_get_bt_prefix; + dbp->set_bt_prefix = __bam_set_bt_prefix; + dbp->get_bt_compress = __bam_get_bt_compress; + dbp->set_bt_compress = __bam_set_bt_compress; + + t->re_pad = ' '; /* Recno */ + t->re_delim = '\n'; + t->re_eof = 1; + + dbp->get_re_delim = __ram_get_re_delim; + dbp->set_re_delim = __ram_set_re_delim; + dbp->get_re_len = __ram_get_re_len; + dbp->set_re_len = __ram_set_re_len; + dbp->get_re_pad = __ram_get_re_pad; + dbp->set_re_pad = __ram_set_re_pad; + dbp->get_re_source = __ram_get_re_source; + dbp->set_re_source = __ram_set_re_source; + + return (0); +} + +/* + * __bam_db_close -- + * Btree specific discard of the DB structure. + * + * PUBLIC: int __bam_db_close __P((DB *)); + */ +int +__bam_db_close(dbp) + DB *dbp; +{ + BTREE *t; + + if ((t = dbp->bt_internal) == NULL) + return (0); + /* Recno */ + /* Close any backing source file descriptor. */ + if (t->re_fp != NULL) + (void)fclose(t->re_fp); + + /* Free any backing source file name. */ + if (t->re_source != NULL) + __os_free(dbp->env, t->re_source); + + __os_free(dbp->env, t); + dbp->bt_internal = NULL; + + return (0); +} + +/* + * __bam_map_flags -- + * Map Btree specific flags from public to the internal values. 
+ * + * PUBLIC: void __bam_map_flags __P((DB *, u_int32_t *, u_int32_t *)); + */ +void +__bam_map_flags(dbp, inflagsp, outflagsp) + DB *dbp; + u_int32_t *inflagsp, *outflagsp; +{ + COMPQUIET(dbp, NULL); + + if (FLD_ISSET(*inflagsp, DB_DUP)) { + FLD_SET(*outflagsp, DB_AM_DUP); + FLD_CLR(*inflagsp, DB_DUP); + } + if (FLD_ISSET(*inflagsp, DB_DUPSORT)) { + FLD_SET(*outflagsp, DB_AM_DUP | DB_AM_DUPSORT); + FLD_CLR(*inflagsp, DB_DUPSORT); + } + if (FLD_ISSET(*inflagsp, DB_RECNUM)) { + FLD_SET(*outflagsp, DB_AM_RECNUM); + FLD_CLR(*inflagsp, DB_RECNUM); + } + if (FLD_ISSET(*inflagsp, DB_REVSPLITOFF)) { + FLD_SET(*outflagsp, DB_AM_REVSPLITOFF); + FLD_CLR(*inflagsp, DB_REVSPLITOFF); + } +} + +/* + * __bam_set_flags -- + * Set Btree specific flags. + * + * PUBLIC: int __bam_set_flags __P((DB *, u_int32_t *flagsp)); + */ +int +__bam_set_flags(dbp, flagsp) + DB *dbp; + u_int32_t *flagsp; +{ + BTREE *t; + u_int32_t flags; + + t = dbp->bt_internal; + + flags = *flagsp; + if (LF_ISSET(DB_DUP | DB_DUPSORT | DB_RECNUM | DB_REVSPLITOFF)) + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags"); + + /* + * The DB_DUP and DB_DUPSORT flags are shared by the Hash + * and Btree access methods. + */ + if (LF_ISSET(DB_DUP | DB_DUPSORT)) + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); + + if (LF_ISSET(DB_RECNUM | DB_REVSPLITOFF)) + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + /* DB_DUP/DB_DUPSORT is incompatible with DB_RECNUM. */ + if (LF_ISSET(DB_DUP | DB_DUPSORT) && F_ISSET(dbp, DB_AM_RECNUM)) + goto incompat; + + /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */ + if (LF_ISSET(DB_RECNUM) && F_ISSET(dbp, DB_AM_DUP)) + goto incompat; + + /* DB_RECNUM is incompatible with DB_DUP/DB_DUPSORT. */ + if (LF_ISSET(DB_RECNUM) && LF_ISSET(DB_DUP | DB_DUPSORT)) + goto incompat; + +#ifdef HAVE_COMPRESSION + /* DB_RECNUM is incompatible with compression */ + if (LF_ISSET(DB_RECNUM) && DB_IS_COMPRESSED(dbp)) { + __db_errx(dbp->env, + "DB_RECNUM cannot be used with compression"); + return (EINVAL); + } + + /* DB_DUP without DB_DUPSORT is incompatible with compression */ + if (LF_ISSET(DB_DUP) && !LF_ISSET(DB_DUPSORT) && + !F_ISSET(dbp, DB_AM_DUPSORT) && DB_IS_COMPRESSED(dbp)) { + __db_errx(dbp->env, + "DB_DUP cannot be used with compression without DB_DUPSORT"); + return (EINVAL); + } +#endif + + if (LF_ISSET(DB_DUPSORT) && dbp->dup_compare == NULL) { +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) { + dbp->dup_compare = __bam_compress_dupcmp; + t->compress_dup_compare = __bam_defcmp; + } else +#endif + dbp->dup_compare = __bam_defcmp; + } + + __bam_map_flags(dbp, flagsp, &dbp->flags); + return (0); + +incompat: + return (__db_ferr(dbp->env, "DB->set_flags", 1)); +} + +/* + * __bam_get_bt_compare -- + * Get the comparison function. + */ +static int +__bam_get_bt_compare(dbp, funcp) + DB *dbp; + int (**funcp) __P((DB *, const DBT *, const DBT *)); +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + + if (funcp != NULL) + *funcp = t->bt_compare; + + return (0); +} + +/* + * __bam_set_bt_compare -- + * Set the comparison function. 
+ * + * PUBLIC: int __bam_set_bt_compare + * PUBLIC: __P((DB *, int (*)(DB *, const DBT *, const DBT *))); + */ +int +__bam_set_bt_compare(dbp, func) + DB *dbp; + int (*func) __P((DB *, const DBT *, const DBT *)); +{ + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compare"); + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + + /* + * Can't default the prefix routine if the user supplies a comparison + * routine; shortening the keys can break their comparison algorithm. + */ + t->bt_compare = func; + if (t->bt_prefix == __bam_defpfx) + t->bt_prefix = NULL; + + return (0); +} + +/* + * __bam_get_bt_compress -- + * Get the compression functions. + */ +static int +__bam_get_bt_compress(dbp, compressp, decompressp) + DB *dbp; + int (**compressp) __P((DB *, const DBT *, const DBT *, const DBT *, + const DBT *, DBT *)); + int (**decompressp) __P((DB *, const DBT *, const DBT *, DBT *, DBT *, + DBT *)); +{ +#ifdef HAVE_COMPRESSION + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + + if (compressp != NULL) + *compressp = t->bt_compress; + if (decompressp != NULL) + *decompressp = t->bt_decompress; + + return (0); +#else + COMPQUIET(compressp, NULL); + COMPQUIET(decompressp, NULL); + + __db_errx(dbp->env, "compression support has not been compiled in"); + return (EINVAL); +#endif +} + +/* + * __bam_set_bt_compress -- + * Set the compression functions. + * + * PUBLIC: int __bam_set_bt_compress __P((DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, + * PUBLIC: const DBT *, const DBT *, DBT *), + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *, DBT *, DBT *))); + */ +int +__bam_set_bt_compress(dbp, compress, decompress) + DB *dbp; + int (*compress) __P((DB *, const DBT *, const DBT *, const DBT *, + const DBT *, DBT *)); + int (*decompress) __P((DB *, const DBT *, const DBT *, DBT *, DBT *, + DBT *)); +{ +#ifdef HAVE_COMPRESSION + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_compress"); + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + + /* compression is incompatible with DB_RECNUM */ + if (F_ISSET(dbp, DB_AM_RECNUM)) { + __db_errx(dbp->env, + "compression cannot be used with DB_RECNUM"); + return (EINVAL); + } + + /* compression is incompatible with DB_DUP without DB_DUPSORT */ + if (F_ISSET(dbp, DB_AM_DUP) && !F_ISSET(dbp, DB_AM_DUPSORT)) { + __db_errx(dbp->env, + "compression cannot be used with DB_DUP without DB_DUPSORT"); + return (EINVAL); + } + + if (compress != 0 && decompress != 0) { + t->bt_compress = compress; + t->bt_decompress = decompress; + } else if (compress == 0 && decompress == 0) { + t->bt_compress = __bam_defcompress; + t->bt_decompress = __bam_defdecompress; + } else { + __db_errx(dbp->env, + "to enable compression you need to supply both function arguments"); + return (EINVAL); + } + F_SET(dbp, DB_AM_COMPRESS); + + /* Copy dup_compare to compress_dup_compare, and use the compression + duplicate compare */ + if (F_ISSET(dbp, DB_AM_DUPSORT)) { + t->compress_dup_compare = dbp->dup_compare; + dbp->dup_compare = __bam_compress_dupcmp; + } + + return (0); +#else + COMPQUIET(compress, NULL); + COMPQUIET(decompress, NULL); + + __db_errx(dbp->env, "compression support has not been compiled in"); + return (EINVAL); +#endif +} + +/* + * __bam_get_bt_minkey -- + * Get the minimum keys per page.
+ * + * PUBLIC: int __bam_get_bt_minkey __P((DB *, u_int32_t *)); + */ +int +__bam_get_bt_minkey(dbp, bt_minkeyp) + DB *dbp; + u_int32_t *bt_minkeyp; +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + *bt_minkeyp = t->bt_minkey; + return (0); +} + +/* + * __bam_set_bt_minkey -- + * Set the minimum keys per page. + */ +static int +__bam_set_bt_minkey(dbp, bt_minkey) + DB *dbp; + u_int32_t bt_minkey; +{ + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_minkey"); + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + + if (bt_minkey < 2) { + __db_errx(dbp->env, "minimum bt_minkey value is 2"); + return (EINVAL); + } + + t->bt_minkey = bt_minkey; + return (0); +} + +/* + * __bam_get_bt_prefix -- + * Get the prefix function. + */ +static int +__bam_get_bt_prefix(dbp, funcp) + DB *dbp; + size_t (**funcp) __P((DB *, const DBT *, const DBT *)); +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + if (funcp != NULL) + *funcp = t->bt_prefix; + return (0); +} + +/* + * __bam_set_bt_prefix -- + * Set the prefix function. + */ +static int +__bam_set_bt_prefix(dbp, func) + DB *dbp; + size_t (*func) __P((DB *, const DBT *, const DBT *)); +{ + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_bt_prefix"); + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); + + t = dbp->bt_internal; + + t->bt_prefix = func; + return (0); +} + +/* + * __bam_copy_config + * Copy the configuration of one DB handle to another. + * PUBLIC: void __bam_copy_config __P((DB *, DB*, u_int32_t)); + */ +void +__bam_copy_config(src, dst, nparts) + DB *src, *dst; + u_int32_t nparts; +{ + BTREE *s, *d; + + COMPQUIET(nparts, 0); + + s = src->bt_internal; + d = dst->bt_internal; + d->bt_compare = s->bt_compare; + d->bt_minkey = s->bt_minkey; + d->bt_prefix = s->bt_prefix; +#ifdef HAVE_COMPRESSION + d->bt_compress = s->bt_compress; + d->bt_decompress = s->bt_decompress; + d->compress_dup_compare = s->compress_dup_compare; +#endif +} + +/* + * __ram_map_flags -- + * Map Recno specific flags from public to the internal values. + * + * PUBLIC: void __ram_map_flags __P((DB *, u_int32_t *, u_int32_t *)); + */ +void +__ram_map_flags(dbp, inflagsp, outflagsp) + DB *dbp; + u_int32_t *inflagsp, *outflagsp; +{ + COMPQUIET(dbp, NULL); + + if (FLD_ISSET(*inflagsp, DB_RENUMBER)) { + FLD_SET(*outflagsp, DB_AM_RENUMBER); + FLD_CLR(*inflagsp, DB_RENUMBER); + } + if (FLD_ISSET(*inflagsp, DB_SNAPSHOT)) { + FLD_SET(*outflagsp, DB_AM_SNAPSHOT); + FLD_CLR(*inflagsp, DB_SNAPSHOT); + } +} + +/* + * __ram_set_flags -- + * Set Recno specific flags. + * + * PUBLIC: int __ram_set_flags __P((DB *, u_int32_t *flagsp)); + */ +int +__ram_set_flags(dbp, flagsp) + DB *dbp; + u_int32_t *flagsp; +{ + u_int32_t flags; + + flags = *flagsp; + if (LF_ISSET(DB_RENUMBER | DB_SNAPSHOT)) { + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_flags"); + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + } + + __ram_map_flags(dbp, flagsp, &dbp->flags); + return (0); +} + +/* + * __ram_get_re_delim -- + * Get the variable-length input record delimiter. + */ +static int +__ram_get_re_delim(dbp, re_delimp) + DB *dbp; + int *re_delimp; +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + t = dbp->bt_internal; + *re_delimp = t->re_delim; + return (0); +} + +/* + * __ram_set_re_delim -- + * Set the variable-length input record delimiter.
+ */ +static int +__ram_set_re_delim(dbp, re_delim) + DB *dbp; + int re_delim; +{ + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_delim"); + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + + t = dbp->bt_internal; + + t->re_delim = re_delim; + F_SET(dbp, DB_AM_DELIMITER); + + return (0); +} + +/* + * __ram_get_re_len -- + * Get the variable-length input record length. + * + * PUBLIC: int __ram_get_re_len __P((DB *, u_int32_t *)); + */ +int +__ram_get_re_len(dbp, re_lenp) + DB *dbp; + u_int32_t *re_lenp; +{ + BTREE *t; + QUEUE *q; + + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + /* + * This has to work for all access methods, before or after opening the + * database. When the record length is set with __ram_set_re_len, the + * value in both the BTREE and QUEUE structs will be correct. + * Otherwise, this only makes sense after the database is opened, in + * which case we know the type. + */ + if (dbp->type == DB_QUEUE) { + q = dbp->q_internal; + *re_lenp = q->re_len; + } else { + t = dbp->bt_internal; + *re_lenp = t->re_len; + } + + return (0); +} + +/* + * __ram_set_re_len -- + * Set the variable-length input record length. + */ +static int +__ram_set_re_len(dbp, re_len) + DB *dbp; + u_int32_t re_len; +{ + BTREE *t; + QUEUE *q; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_len"); + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + t = dbp->bt_internal; + t->re_len = re_len; + + q = dbp->q_internal; + q->re_len = re_len; + + F_SET(dbp, DB_AM_FIXEDLEN); + + return (0); +} + +/* + * __ram_get_re_pad -- + * Get the fixed-length record pad character. + * + * PUBLIC: int __ram_get_re_pad __P((DB *, int *)); + */ +int +__ram_get_re_pad(dbp, re_padp) + DB *dbp; + int *re_padp; +{ + BTREE *t; + QUEUE *q; + + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + /* + * This has to work for all access methods, before or after opening the + * database. When the pad character is set with __ram_set_re_pad, the + * value in both the BTREE and QUEUE structs will be correct. + * Otherwise, this only makes sense after the database is opened, in + * which case we know the type. + */ + if (dbp->type == DB_QUEUE) { + q = dbp->q_internal; + *re_padp = q->re_pad; + } else { + t = dbp->bt_internal; + *re_padp = t->re_pad; + } + + return (0); +} + +/* + * __ram_set_re_pad -- + * Set the fixed-length record pad character. + */ +static int +__ram_set_re_pad(dbp, re_pad) + DB *dbp; + int re_pad; +{ + BTREE *t; + QUEUE *q; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_pad"); + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + t = dbp->bt_internal; + t->re_pad = re_pad; + + q = dbp->q_internal; + q->re_pad = re_pad; + + F_SET(dbp, DB_AM_PAD); + + return (0); +} + +/* + * __ram_get_re_source -- + * Get the backing source file name. + */ +static int +__ram_get_re_source(dbp, re_sourcep) + DB *dbp; + const char **re_sourcep; +{ + BTREE *t; + + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + + t = dbp->bt_internal; + *re_sourcep = t->re_source; + return (0); +} + +/* + * __ram_set_re_source -- + * Set the backing source file name.
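Taken together, the setters above are reached through the method table that __bam_db_create installed on the handle. A hedged usage sketch with the standard DB API (file names invented, error handling abbreviated):

	#include <db.h>

	/* Configure a Recno database backed by a flat text file. */
	int
	open_recno(const char *source, const char *dbfile)
	{
		DB *dbp;
		int ret;

		if ((ret = db_create(&dbp, NULL, 0)) != 0)
			return (ret);

		/* These calls land in the __ram_set_* functions above. */
		if ((ret = dbp->set_re_delim(dbp, '\n')) != 0 ||
		    (ret = dbp->set_re_source(dbp, source)) != 0 ||
		    (ret = dbp->open(dbp,
		    NULL, dbfile, NULL, DB_RECNO, DB_CREATE, 0664)) != 0) {
			(void)dbp->close(dbp, 0);
			return (ret);
		}
		return (dbp->close(dbp, 0));
	}

The '\n' delimiter matches the default __bam_db_create already set; it is shown only to exercise the setter.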
+ */ +static int +__ram_set_re_source(dbp, re_source) + DB *dbp; + const char *re_source; +{ + BTREE *t; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_re_source"); + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); + + t = dbp->bt_internal; + + return (__os_strdup(dbp->env, re_source, &t->re_source)); +} diff --git a/btree/bt_open.c b/btree/bt_open.c index f052249..1fdfea5 100644 --- a/btree/bt_open.c +++ b/btree/bt_open.c @@ -1,5 +1,14 @@ /*- - * Copyright (c) 1990, 1993, 1994 + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -13,11 +22,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -32,413 +37,633 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * $Id$ */ -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_open.c 8.10 (Berkeley) 8/17/94"; -#endif /* LIBC_SCCS and not lint */ +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/fop.h" + +static void __bam_init_meta __P((DB *, BTMETA *, db_pgno_t, DB_LSN *)); /* - * Implementation of btree access method for 4.4BSD. + * __bam_open -- + * Open a btree. * - * The design here was originally based on that of the btree access method - * used in the Postgres database system at UC Berkeley. This implementation - * is wholly independent of the Postgres code. + * PUBLIC: int __bam_open __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t)); */ +int +__bam_open(dbp, ip, txn, name, base_pgno, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name; + db_pgno_t base_pgno; + u_int32_t flags; +{ + BTREE *t; -#include <sys/param.h> -#include <sys/stat.h> - -#include <errno.h> -#include <fcntl.h> -#include <limits.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> + COMPQUIET(name, NULL); + t = dbp->bt_internal; -#include <db.h> -#include "btree.h" + /* + * We don't permit the user to specify a prefix routine if they didn't + * also specify a comparison routine, they can't know enough about our + * comparison routine to get it right. 
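For reference, a comparison callback has the shape used throughout this file's prototypes. The function below is an invented example (not part of the library) that orders keys as host-endian u_int32_t values:

	#include <sys/types.h>

	#include <string.h>

	#include <db.h>

	/* Example bt_compare callback: order keys as u_int32_t values. */
	static int
	compare_u32(DB *dbp, const DBT *a, const DBT *b)
	{
		u_int32_t av, bv;

		(void)dbp;		/* unused in this example */
		memcpy(&av, a->data, sizeof(av));
		memcpy(&bv, b->data, sizeof(bv));
		return (av < bv ? -1 : (av > bv ? 1 : 0));
	}

Installed with dbp->set_bt_compare(dbp, compare_u32) before DB->open; as __bam_set_bt_compare above shows, supplying it also clears the default prefix routine rather than risking a mismatched pair.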
+ */ + if (t->bt_compare == __bam_defcmp && t->bt_prefix != __bam_defpfx) { + __db_errx(dbp->env, +"prefix comparison may not be specified for default comparison routine"); + return (EINVAL); + } -#ifdef DEBUG -#undef MINPSIZE -#define MINPSIZE 128 -#endif + /* + * Verify that the bt_minkey value specified won't cause the + * calculation of ovflsize to underflow [#2406] for this pagesize. + */ + if (B_MINKEY_TO_OVFLSIZE(dbp, t->bt_minkey, dbp->pgsize) > + B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) { + __db_errx(dbp->env, + "bt_minkey value of %lu too high for page size of %lu", + (u_long)t->bt_minkey, (u_long)dbp->pgsize); + return (EINVAL); + } -static int byteorder __P((void)); -static int nroot __P((BTREE *)); -static int tmp __P((void)); + /* Start up the tree. */ + return (__bam_read_root(dbp, ip, txn, base_pgno, flags)); +} /* - * __BT_OPEN -- Open a btree. - * - * Creates and fills a DB struct, and calls the routine that actually - * opens the btree. - * - * Parameters: - * fname: filename (NULL for in-memory trees) - * flags: open flag bits - * mode: open permission bits - * b: BTREEINFO pointer - * - * Returns: - * NULL on failure, pointer to DB on success. + * __bam_metachk -- * + * PUBLIC: int __bam_metachk __P((DB *, const char *, BTMETA *)); */ -DB * -__bt_open(fname, flags, mode, openinfo, dflags) - const char *fname; - int flags, mode, dflags; - const BTREEINFO *openinfo; -{ - struct stat sb; - BTMETA m; - BTREE *t; - BTREEINFO b; +int +__bam_metachk(dbp, name, btm) DB *dbp; - pgno_t ncache; - ssize_t nr; - int machine_lorder; + const char *name; + BTMETA *btm; +{ + ENV *env; + u_int32_t vers; + int ret; - t = NULL; + env = dbp->env; /* - * Intention is to make sure all of the user's selections are okay - * here and then use them without checking. Can't be complete, since - * we don't know the right page size, lorder or flags until the backing - * file is opened. Also, the file's page size can cause the cachesize - * to change. + * At this point, all we know is that the magic number is for a Btree. + * Check the version, the database may be out of date. */ - machine_lorder = byteorder(); - if (openinfo) { - b = *openinfo; - - /* Flags: R_DUP. */ - if (b.flags & ~(R_DUP)) - goto einval; - - /* - * Page size must be indx_t aligned and >= MINPSIZE. Default - * page size is set farther on, based on the underlying file - * transfer size. - */ - if (b.psize && - (b.psize < MINPSIZE || b.psize > MAX_PAGE_OFFSET + 1 || - b.psize & sizeof(indx_t) - 1)) - goto einval; - - /* Minimum number of keys per page; absolute minimum is 2. */ - if (b.minkeypage) { - if (b.minkeypage < 2) - goto einval; - } else - b.minkeypage = DEFMINKEYPAGE; - - /* If no comparison, use default comparison and prefix. */ - if (b.compare == NULL) { - b.compare = __bt_defcmp; - if (b.prefix == NULL) - b.prefix = __bt_defpfx; - } - - if (b.lorder == 0) - b.lorder = machine_lorder; - } else { - b.compare = __bt_defcmp; - b.cachesize = 0; - b.flags = 0; - b.lorder = machine_lorder; - b.minkeypage = DEFMINKEYPAGE; - b.prefix = __bt_defpfx; - b.psize = 0; + vers = btm->dbmeta.version; + if (F_ISSET(dbp, DB_AM_SWAP)) + M_32_SWAP(vers); + switch (vers) { + case 6: + case 7: + __db_errx(env, + "%s: btree version %lu requires a version upgrade", + name, (u_long)vers); + return (DB_OLD_VERSION); + case 8: + case 9: + break; + default: + __db_errx(env, + "%s: unsupported btree version: %lu", name, (u_long)vers); + return (EINVAL); } - /* Check for the ubiquitous PDP-11. 
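On the 4.4BSD side, the defaults being filled in above came from the BTREEINFO structure handed to dbopen(). A hedged sketch of that historical calling convention (file name invented):

	#include <sys/types.h>

	#include <fcntl.h>
	#include <string.h>

	#include <db.h>

	int
	open_old_btree(void)
	{
		BTREEINFO info;
		DB *db;

		memset(&info, 0, sizeof(info));	/* zero selects defaults */
		info.minkeypage = 2;	/* absolute minimum, as above */
		info.psize = 0;		/* 0: derive from st_blksize */
		info.lorder = 0;	/* 0: host byte order */

		if ((db = dbopen("tree.db",
		    O_CREAT | O_RDWR, 0644, DB_BTREE, &info)) == NULL)
			return (-1);
		return (db->close(db));
	}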
*/ - if (b.lorder != BIG_ENDIAN && b.lorder != LITTLE_ENDIAN) - goto einval; - - /* Allocate and initialize DB and BTREE structures. */ - if ((t = (BTREE *)malloc(sizeof(BTREE))) == NULL) - goto err; - memset(t, 0, sizeof(BTREE)); - t->bt_fd = -1; /* Don't close unopened fd on error. */ - t->bt_lorder = b.lorder; - t->bt_order = NOT; - t->bt_cmp = b.compare; - t->bt_pfx = b.prefix; - t->bt_rfd = -1; - - if ((t->bt_dbp = dbp = (DB *)malloc(sizeof(DB))) == NULL) - goto err; - memset(t->bt_dbp, 0, sizeof(DB)); - if (t->bt_lorder != machine_lorder) - F_SET(t, B_NEEDSWAP); - - dbp->type = DB_BTREE; - dbp->internal = t; - dbp->close = __bt_close; - dbp->del = __bt_delete; - dbp->fd = __bt_fd; - dbp->get = __bt_get; - dbp->put = __bt_put; - dbp->seq = __bt_seq; - dbp->sync = __bt_sync; + /* Swap the page if we need to. */ + if (F_ISSET(dbp, DB_AM_SWAP) && + (ret = __bam_mswap(env, (PAGE *)btm)) != 0) + return (ret); /* - * If no file name was supplied, this is an in-memory btree and we - * open a backing temporary file. Otherwise, it's a disk-based tree. + * Check application info against metadata info, and set info, flags, + * and type based on metadata info. */ - if (fname) { - switch (flags & O_ACCMODE) { - case O_RDONLY: - F_SET(t, B_RDONLY); - break; - case O_RDWR: - break; - case O_WRONLY: - default: - goto einval; - } - - if ((t->bt_fd = open(fname, flags, mode)) < 0) - goto err; - + if ((ret = + __db_fchk(env, "DB->open", btm->dbmeta.flags, BTM_MASK)) != 0) + return (ret); + + if (F_ISSET(&btm->dbmeta, BTM_RECNO)) { + if (dbp->type == DB_BTREE) + goto wrong_type; + dbp->type = DB_RECNO; + DB_ILLEGAL_METHOD(dbp, DB_OK_RECNO); } else { - if ((flags & O_ACCMODE) != O_RDWR) - goto einval; - if ((t->bt_fd = tmp()) == -1) - goto err; - F_SET(t, B_INMEM); + if (dbp->type == DB_RECNO) + goto wrong_type; + dbp->type = DB_BTREE; + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE); } - if (fcntl(t->bt_fd, F_SETFD, 1) == -1) - goto err; + if (F_ISSET(&btm->dbmeta, BTM_DUP)) + F_SET(dbp, DB_AM_DUP); + else + if (F_ISSET(dbp, DB_AM_DUP)) { + __db_errx(env, + "%s: DB_DUP specified to open method but not set in database", + name); + return (EINVAL); + } - if (fstat(t->bt_fd, &sb)) - goto err; - if (sb.st_size) { - if ((nr = read(t->bt_fd, &m, sizeof(BTMETA))) < 0) - goto err; - if (nr != sizeof(BTMETA)) - goto eftype; - - /* - * Read in the meta-data. This can change the notion of what - * the lorder, page size and flags are, and, when the page size - * changes, the cachesize value can change too. If the user - * specified the wrong byte order for an existing database, we - * don't bother to return an error, we just clear the NEEDSWAP - * bit. 
- */ - if (m.magic == BTREEMAGIC) - F_CLR(t, B_NEEDSWAP); - else { - F_SET(t, B_NEEDSWAP); - M_32_SWAP(m.magic); - M_32_SWAP(m.version); - M_32_SWAP(m.psize); - M_32_SWAP(m.free); - M_32_SWAP(m.nrecs); - M_32_SWAP(m.flags); + if (F_ISSET(&btm->dbmeta, BTM_RECNUM)) { + if (dbp->type != DB_BTREE) + goto wrong_type; + F_SET(dbp, DB_AM_RECNUM); + + if ((ret = __db_fcchk(env, + "DB->open", dbp->flags, DB_AM_DUP, DB_AM_RECNUM)) != 0) + return (ret); + } else + if (F_ISSET(dbp, DB_AM_RECNUM)) { + __db_errx(env, + "%s: DB_RECNUM specified to open method but not set in database", + name); + return (EINVAL); } - if (m.magic != BTREEMAGIC || m.version != BTREEVERSION) - goto eftype; - if (m.psize < MINPSIZE || m.psize > MAX_PAGE_OFFSET + 1 || - m.psize & sizeof(indx_t) - 1) - goto eftype; - if (m.flags & ~SAVEMETA) - goto eftype; - b.psize = m.psize; - F_SET(t, m.flags); - t->bt_free = m.free; - t->bt_nrecs = m.nrecs; - } else { - /* - * Set the page size to the best value for I/O to this file. - * Don't overflow the page offset type. - */ - if (b.psize == 0) { - b.psize = sb.st_blksize; - if (b.psize < MINPSIZE) - b.psize = MINPSIZE; - if (b.psize > MAX_PAGE_OFFSET + 1) - b.psize = MAX_PAGE_OFFSET + 1; + + if (F_ISSET(&btm->dbmeta, BTM_FIXEDLEN)) { + if (dbp->type != DB_RECNO) + goto wrong_type; + F_SET(dbp, DB_AM_FIXEDLEN); + } else + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { + __db_errx(env, + "%s: DB_FIXEDLEN specified to open method but not set in database", + name); + return (EINVAL); } - /* Set flag if duplicates permitted. */ - if (!(b.flags & R_DUP)) - F_SET(t, B_NODUPS); + if (F_ISSET(&btm->dbmeta, BTM_RENUMBER)) { + if (dbp->type != DB_RECNO) + goto wrong_type; + F_SET(dbp, DB_AM_RENUMBER); + } else + if (F_ISSET(dbp, DB_AM_RENUMBER)) { + __db_errx(env, + "%s: DB_RENUMBER specified to open method but not set in database", + name); + return (EINVAL); + } - t->bt_free = P_INVALID; - t->bt_nrecs = 0; - F_SET(t, B_METADIRTY); - } + if (F_ISSET(&btm->dbmeta, BTM_SUBDB)) + F_SET(dbp, DB_AM_SUBDB); + else + if (F_ISSET(dbp, DB_AM_SUBDB)) { + __db_errx(env, + "%s: multiple databases specified but not supported by file", + name); + return (EINVAL); + } - t->bt_psize = b.psize; + if (F_ISSET(&btm->dbmeta, BTM_DUPSORT)) { + if (dbp->dup_compare == NULL) + dbp->dup_compare = __bam_defcmp; + F_SET(dbp, DB_AM_DUPSORT); + } else + if (dbp->dup_compare != NULL) { + __db_errx(env, + "%s: duplicate sort specified but not supported in database", + name); + return (EINVAL); + } - /* Set the cache size; must be a multiple of the page size. */ - if (b.cachesize && b.cachesize & b.psize - 1) - b.cachesize += (~b.cachesize & b.psize - 1) + 1; - if (b.cachesize < b.psize * MINCACHE) - b.cachesize = b.psize * MINCACHE; +#ifdef HAVE_COMPRESSION + if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) { + F_SET(dbp, DB_AM_COMPRESS); + if ((BTREE *)dbp->bt_internal != NULL && + !DB_IS_COMPRESSED(dbp) && + (ret = __bam_set_bt_compress(dbp, NULL, NULL)) != 0) + return (ret); + } else { + if ((BTREE *)dbp->bt_internal != NULL && + DB_IS_COMPRESSED(dbp)) { + __db_errx(env, + "%s: compression specified to open method but not set in database", + name); + return (EINVAL); + } + } +#else + if (F_ISSET(&btm->dbmeta, BTM_COMPRESS)) { + __db_errx(env, + "%s: compression support has not been compiled in", + name); + return (EINVAL); + } +#endif - /* Calculate number of pages to cache. */ - ncache = (b.cachesize + t->bt_psize - 1) / t->bt_psize; + /* Set the page size.
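Both the old M_32_SWAP block above and the new __bam_mswap path exist to read metadata written in the opposite byte order. A standalone equivalent of the 32-bit swap (plain C, not the library macro), applied here to the old BTREEMAGIC value:

	#include <sys/types.h>

	#include <stdio.h>

	/* Reverse the byte order of a 32-bit value, as M_32_SWAP does. */
	static u_int32_t
	swap32(u_int32_t v)
	{
		return ((v >> 24) | ((v >> 8) & 0xff00) |
		    ((v << 8) & 0xff0000) | (v << 24));
	}

	int
	main(void)
	{
		u_int32_t magic = 0x053162;	/* BTREEMAGIC */

		printf("%#010lx -> %#010lx\n",
		    (u_long)magic, (u_long)swap32(magic));
		return (0);
	}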
*/ + dbp->pgsize = btm->dbmeta.pagesize; - /* - * The btree data structure requires that at least two keys can fit on - * a page, but other than that there's no fixed requirement. The user - * specified a minimum number per page, and we translated that into the - * number of bytes a key/data pair can use before being placed on an - * overflow page. This calculation includes the page header, the size - * of the index referencing the leaf item and the size of the leaf item - * structure. Also, don't let the user specify a minkeypage such that - * a key/data pair won't fit even if both key and data are on overflow - * pages. - */ - t->bt_ovflsize = (t->bt_psize - BTDATAOFF) / b.minkeypage - - (sizeof(indx_t) + NBLEAFDBT(0, 0)); - if (t->bt_ovflsize < NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t)) - t->bt_ovflsize = - NBLEAFDBT(NOVFLSIZE, NOVFLSIZE) + sizeof(indx_t); - - /* Initialize the buffer pool. */ - if ((t->bt_mp = - mpool_open(NULL, t->bt_fd, t->bt_psize, ncache)) == NULL) - goto err; - if (!F_ISSET(t, B_INMEM)) - mpool_filter(t->bt_mp, __bt_pgin, __bt_pgout, t); + /* Copy the file's ID. */ + memcpy(dbp->fileid, btm->dbmeta.uid, DB_FILE_ID_LEN); - /* Create a root page if new tree. */ - if (nroot(t) == RET_ERROR) - goto err; + return (0); - /* Global flags. */ - if (dflags & DB_LOCK) - F_SET(t, B_DB_LOCK); - if (dflags & DB_SHMEM) - F_SET(t, B_DB_SHMEM); - if (dflags & DB_TXN) - F_SET(t, B_DB_TXN); +wrong_type: + if (dbp->type == DB_BTREE) + __db_errx(env, + "open method type is Btree, database type is Recno"); + else + __db_errx(env, + "open method type is Recno, database type is Btree"); + return (EINVAL); +} - return (dbp); +/* + * __bam_read_root -- + * Read the root page and check a tree. + * + * PUBLIC: int __bam_read_root __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, db_pgno_t, u_int32_t)); + */ +int +__bam_read_root(dbp, ip, txn, base_pgno, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + db_pgno_t base_pgno; + u_int32_t flags; +{ + BTMETA *meta; + BTREE *t; + DBC *dbc; + DB_LOCK metalock; + DB_MPOOLFILE *mpf; + int ret, t_ret; + + COMPQUIET(flags, 0); + + meta = NULL; + t = dbp->bt_internal; + LOCK_INIT(metalock); + mpf = dbp->mpf; + ret = 0; + + /* Get a cursor. */ + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + return (ret); + + /* Get the metadata page. */ + if ((ret = + __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0) + goto err; -einval: errno = EINVAL; - goto err; + /* + * If the magic number is set, the tree has been created. Correct + * any fields that may not be right. Note, all of the local flags + * were set by DB->open. + * + * Otherwise, we'd better be in recovery or abort, in which case the + * metadata page will be created/initialized elsewhere. + */ + if (meta->dbmeta.magic == DB_BTREEMAGIC) { + t->bt_minkey = meta->minkey; + t->re_pad = (int)meta->re_pad; + t->re_len = meta->re_len; + + t->bt_meta = base_pgno; + t->bt_root = meta->root; +#ifndef HAVE_FTRUNCATE + if (PGNO(meta) == PGNO_BASE_MD && + !F_ISSET(dbp, DB_AM_RECOVER) && !IS_VERSION(dbp, meta)) + __memp_set_last_pgno(mpf, meta->dbmeta.last_pgno); +#endif + } else { + DB_ASSERT(dbp->env, + IS_RECOVERING(dbp->env) || F_ISSET(dbp, DB_AM_RECOVER)); + } -eftype: errno = EFTYPE; - goto err; + /* + * !!! 
+ * If creating a subdatabase, we've already done an insert when + * we put the subdatabase's entry into the master database, so + * our last-page-inserted value is wrongly initialized for the + * master database, not the subdatabase we're creating. I'm not + * sure where the *right* place to clear this value is, it's not + * intuitively obvious that it belongs here. + */ + t->bt_lpgno = PGNO_INVALID; + +err: /* Put the metadata page back. */ + if (meta != NULL && (t_ret = __memp_fput(mpf, + ip, meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} -err: if (t) { - if (t->bt_dbp) - free(t->bt_dbp); - if (t->bt_fd != -1) - (void)close(t->bt_fd); - free(t); +/* + * __bam_init_meta -- + * + * Initialize a btree meta-data page. The following fields may need + * to be updated later: last_pgno, root. + */ +static void +__bam_init_meta(dbp, meta, pgno, lsnp) + DB *dbp; + BTMETA *meta; + db_pgno_t pgno; + DB_LSN *lsnp; +{ + BTREE *t; +#ifdef HAVE_PARTITION + DB_PARTITION *part; +#endif + ENV *env; + + env = dbp->env; + t = dbp->bt_internal; + + memset(meta, 0, sizeof(BTMETA)); + meta->dbmeta.lsn = *lsnp; + meta->dbmeta.pgno = pgno; + meta->dbmeta.magic = DB_BTREEMAGIC; + meta->dbmeta.version = DB_BTREEVERSION; + meta->dbmeta.pagesize = dbp->pgsize; + if (F_ISSET(dbp, DB_AM_CHKSUM)) + FLD_SET(meta->dbmeta.metaflags, DBMETA_CHKSUM); + if (F_ISSET(dbp, DB_AM_ENCRYPT)) { + meta->dbmeta.encrypt_alg = env->crypto_handle->alg; + DB_ASSERT(env, meta->dbmeta.encrypt_alg != 0); + meta->crypto_magic = meta->dbmeta.magic; } - return (NULL); + meta->dbmeta.type = P_BTREEMETA; + meta->dbmeta.free = PGNO_INVALID; + meta->dbmeta.last_pgno = pgno; + if (F_ISSET(dbp, DB_AM_DUP)) + F_SET(&meta->dbmeta, BTM_DUP); + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) + F_SET(&meta->dbmeta, BTM_FIXEDLEN); + if (F_ISSET(dbp, DB_AM_RECNUM)) + F_SET(&meta->dbmeta, BTM_RECNUM); + if (F_ISSET(dbp, DB_AM_RENUMBER)) + F_SET(&meta->dbmeta, BTM_RENUMBER); + if (F_ISSET(dbp, DB_AM_SUBDB)) + F_SET(&meta->dbmeta, BTM_SUBDB); + if (dbp->dup_compare != NULL) + F_SET(&meta->dbmeta, BTM_DUPSORT); +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) + F_SET(&meta->dbmeta, BTM_COMPRESS); +#endif + if (dbp->type == DB_RECNO) + F_SET(&meta->dbmeta, BTM_RECNO); + memcpy(meta->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN); + + meta->minkey = t->bt_minkey; + meta->re_len = t->re_len; + meta->re_pad = (u_int32_t)t->re_pad; + +#ifdef HAVE_PARTITION + if ((part = dbp->p_internal) != NULL) { + meta->dbmeta.nparts = part->nparts; + if (F_ISSET(part, PART_CALLBACK)) + FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_CALLBACK); + if (F_ISSET(part, PART_RANGE)) + FLD_SET(meta->dbmeta.metaflags, DBMETA_PART_RANGE); + } +#endif } /* - * NROOT -- Create the root of a new tree. + * __bam_new_file -- + * Create the necessary pages to begin a new database file. * - * Parameters: - * t: tree + * This code appears more complex than it is because of the two cases (named + * and unnamed). The way to read the code is that for each page being created, + * there are three parts: 1) a "get page" chunk (which either uses malloc'd + * memory or calls __memp_fget), 2) the initialization, and 3) the "put page" + * chunk which either does a fop write or an __memp_fput. 
* - * Returns: - * RET_ERROR, RET_SUCCESS + * PUBLIC: int __bam_new_file __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *)); */ -static int -nroot(t) - BTREE *t; +int +__bam_new_file(dbp, ip, txn, fhp, name) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DB_FH *fhp; + const char *name; { - PAGE *meta, *root; - pgno_t npg; + BTMETA *meta; + DBT pdbt; + DB_LSN lsn; + DB_MPOOLFILE *mpf; + DB_PGINFO pginfo; + ENV *env; + PAGE *root; + db_pgno_t pgno; + int ret, t_ret; + void *buf; + + env = dbp->env; + mpf = dbp->mpf; + root = NULL; + meta = NULL; + buf = NULL; + + if (F_ISSET(dbp, DB_AM_INMEM)) { + /* Build the meta-data page. */ + pgno = PGNO_BASE_MD; + if ((ret = __memp_fget(mpf, &pgno, ip, txn, + DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &meta)) != 0) + return (ret); + LSN_NOT_LOGGED(lsn); + __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn); + meta->root = 1; + meta->dbmeta.last_pgno = 1; + if ((ret = + __db_log_page(dbp, txn, &lsn, pgno, (PAGE *)meta)) != 0) + goto err; + ret = __memp_fput(mpf, ip, meta, dbp->priority); + meta = NULL; + if (ret != 0) + goto err; - if ((meta = mpool_get(t->bt_mp, 0, 0)) != NULL) { - mpool_put(t->bt_mp, meta, 0); - return (RET_SUCCESS); - } - if (errno != EINVAL) /* It's OK to not exist. */ - return (RET_ERROR); - errno = 0; - - if ((meta = mpool_new(t->bt_mp, &npg)) == NULL) - return (RET_ERROR); - - if ((root = mpool_new(t->bt_mp, &npg)) == NULL) - return (RET_ERROR); - - if (npg != P_ROOT) - return (RET_ERROR); - root->pgno = npg; - root->prevpg = root->nextpg = P_INVALID; - root->lower = BTDATAOFF; - root->upper = t->bt_psize; - root->flags = P_BLEAF; - memset(meta, 0, t->bt_psize); - mpool_put(t->bt_mp, meta, MPOOL_DIRTY); - mpool_put(t->bt_mp, root, MPOOL_DIRTY); - return (RET_SUCCESS); -} + /* Build the root page. */ + pgno = 1; + if ((ret = __memp_fget(mpf, &pgno, + ip, txn, DB_MPOOL_CREATE, &root)) != 0) + goto err; + P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, + LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE); + LSN_NOT_LOGGED(root->lsn); + if ((ret = + __db_log_page(dbp, txn, &root->lsn, pgno, root)) != 0) + goto err; + ret = __memp_fput(mpf, ip, root, dbp->priority); + root = NULL; + if (ret != 0) + goto err; + } else { + memset(&pdbt, 0, sizeof(pdbt)); + + /* Build the meta-data page. */ + pginfo.db_pagesize = dbp->pgsize; + pginfo.flags = + F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP)); + pginfo.type = dbp->type; + pdbt.data = &pginfo; + pdbt.size = sizeof(pginfo); + if ((ret = __os_calloc(env, 1, dbp->pgsize, &buf)) != 0) + return (ret); + meta = (BTMETA *)buf; + LSN_NOT_LOGGED(lsn); + __bam_init_meta(dbp, meta, PGNO_BASE_MD, &lsn); + meta->root = 1; + meta->dbmeta.last_pgno = 1; + if ((ret = __db_pgout( + dbp->dbenv, PGNO_BASE_MD, meta, &pdbt)) != 0) + goto err; + if ((ret = __fop_write(env, txn, name, dbp->dirname, + DB_APP_DATA, fhp, + dbp->pgsize, 0, 0, buf, dbp->pgsize, 1, F_ISSET( + dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0) + goto err; + meta = NULL; -static int -tmp() -{ - sigset_t set, oset; - int fd; - char *envtmp; - char path[MAXPATHLEN]; - - envtmp = getenv("TMPDIR"); - (void)snprintf(path, - sizeof(path), "%s/bt.XXXXXX", envtmp ? envtmp : "/tmp"); - - (void)sigfillset(&set); - (void)sigprocmask(SIG_BLOCK, &set, &oset); - if ((fd = mkstemp(path)) != -1) - (void)unlink(path); - (void)sigprocmask(SIG_SETMASK, &oset, NULL); - return(fd); -} + /* Build the root page. 
*/ +#ifdef DIAGNOSTIC + memset(buf, CLEAR_BYTE, dbp->pgsize); +#endif + root = (PAGE *)buf; + P_INIT(root, dbp->pgsize, 1, PGNO_INVALID, PGNO_INVALID, + LEAFLEVEL, dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE); + LSN_NOT_LOGGED(root->lsn); + if ((ret = + __db_pgout(dbp->dbenv, root->pgno, root, &pdbt)) != 0) + goto err; + if ((ret = + __fop_write(env, txn, name, dbp->dirname, DB_APP_DATA, + fhp, dbp->pgsize, 1, 0, buf, dbp->pgsize, 1, F_ISSET( + dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0)) != 0) + goto err; + root = NULL; + } -static int -byteorder() -{ - u_int32_t x; - u_char *p; - - x = 0x01020304; - p = (u_char *)&x; - switch (*p) { - case 1: - return (BIG_ENDIAN); - case 4: - return (LITTLE_ENDIAN); - default: - return (0); +err: if (buf != NULL) + __os_free(env, buf); + else { + if (meta != NULL && + (t_ret = __memp_fput(mpf, ip, + meta, dbp->priority)) != 0 && ret == 0) + ret = t_ret; + if (root != NULL && + (t_ret = __memp_fput(mpf, ip, + root, dbp->priority)) != 0 && ret == 0) + ret = t_ret; } + return (ret); } +/* + * __bam_new_subdb -- + * Create a metadata page and a root page for a new btree. + * + * PUBLIC: int __bam_new_subdb __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *)); + */ int -__bt_fd(dbp) - const DB *dbp; +__bam_new_subdb(mdbp, dbp, ip, txn) + DB *mdbp, *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; { - BTREE *t; + BTMETA *meta; + DBC *dbc; + DB_LOCK metalock; + DB_LSN lsn; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *root; + int ret, t_ret; + + env = mdbp->env; + mpf = mdbp->mpf; + dbc = NULL; + meta = NULL; + root = NULL; + + if ((ret = __db_cursor(mdbp, ip, txn, + &dbc, CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0) + return (ret); + + /* Get, and optionally create the metadata page. */ + if ((ret = __db_lget(dbc, + 0, dbp->meta_pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &dbp->meta_pgno, + ip, txn, DB_MPOOL_CREATE, &meta)) != 0) + goto err; - t = dbp->internal; + /* Build meta-data page. */ + lsn = meta->dbmeta.lsn; + __bam_init_meta(dbp, meta, dbp->meta_pgno, &lsn); + if ((ret = __db_log_page(mdbp, + txn, &meta->dbmeta.lsn, dbp->meta_pgno, (PAGE *)meta)) != 0) + goto err; - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } + /* Create and initialize a root page. */ + if ((ret = __db_new(dbc, + dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE, NULL, &root)) != 0) + goto err; + root->level = LEAFLEVEL; - /* In-memory database can't have a file descriptor. */ - if (F_ISSET(t, B_INMEM)) { - errno = ENOENT; - return (-1); - } - return (t->bt_fd); + if (DBENV_LOGGING(env) && +#if !defined(DEBUG_WOP) + txn != NULL && +#endif + + (ret = __bam_root_log(mdbp, txn, &meta->dbmeta.lsn, 0, + meta->dbmeta.pgno, root->pgno, &meta->dbmeta.lsn)) != 0) + goto err; + + meta->root = root->pgno; + if ((ret = + __db_log_page(mdbp, txn, &root->lsn, root->pgno, root)) != 0) + goto err; + + /* Release the metadata and root pages. 
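The error path above repeats an idiom used throughout these files: every release step still runs, t_ret captures each status, and ret keeps only the first failure. The shape of it in isolation, with an invented stub standing in for __memp_fput/__TLPUT:

	#include <stdio.h>

	/* Invented stub: pretend to release a resource, maybe failing. */
	static int
	release(int fail)
	{
		return (fail ? -1 : 0);
	}

	int
	main(void)
	{
		int ret, t_ret;

		ret = release(1);		/* first failure: kept */
		if ((t_ret = release(0)) != 0 && ret == 0)
			ret = t_ret;		/* still ran, no error */
		if ((t_ret = release(1)) != 0 && ret == 0)
			ret = t_ret;		/* later failure: dropped */
		printf("ret = %d\n", ret);	/* -1, the first error */
		return (ret == 0 ? 0 : 1);
	}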
*/ + if ((ret = __memp_fput(mpf, ip, meta, dbc->priority)) != 0) + goto err; + meta = NULL; + if ((ret = __memp_fput(mpf, ip, root, dbc->priority)) != 0) + goto err; + root = NULL; +err: + if (meta != NULL) + if ((t_ret = __memp_fput(mpf, ip, + meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if (root != NULL) + if ((t_ret = __memp_fput(mpf, ip, + root, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if (dbc != NULL) + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); } diff --git a/btree/bt_overflow.c b/btree/bt_overflow.c deleted file mode 100644 index b28b8e0..0000000 --- a/btree/bt_overflow.c +++ /dev/null @@ -1,228 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_overflow.c 8.5 (Berkeley) 7/16/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/param.h> - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <db.h> -#include "btree.h" - -/* - * Big key/data code. - * - * Big key and data entries are stored on linked lists of pages. The initial - * reference is byte string stored with the key or data and is the page number - * and size. The actual record is stored in a chain of pages linked by the - * nextpg field of the PAGE header. - * - * The first page of the chain has a special property. If the record is used - * by an internal page, it cannot be deleted and the P_PRESERVE bit will be set - * in the header. - * - * XXX - * A single DBT is written to each chain, so a lot of space on the last page - * is wasted. 
This is a fairly major bug for some data sets. - */ - -/* - * __OVFL_GET -- Get an overflow key/data item. - * - * Parameters: - * t: tree - * p: pointer to { pgno_t, u_int32_t } - * buf: storage address - * bufsz: storage size - * - * Returns: - * RET_ERROR, RET_SUCCESS - */ -int -__ovfl_get(t, p, ssz, buf, bufsz) - BTREE *t; - void *p; - size_t *ssz; - void **buf; - size_t *bufsz; -{ - PAGE *h; - pgno_t pg; - size_t nb, plen; - u_int32_t sz; - - memmove(&pg, p, sizeof(pgno_t)); - memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t)); - *ssz = sz; - -#ifdef DEBUG - if (pg == P_INVALID || sz == 0) - abort(); -#endif - /* Make the buffer bigger as necessary. */ - if (*bufsz < sz) { - *buf = (char *)(*buf == NULL ? malloc(sz) : realloc(*buf, sz)); - if (*buf == NULL) - return (RET_ERROR); - *bufsz = sz; - } - - /* - * Step through the linked list of pages, copying the data on each one - * into the buffer. Never copy more than the data's length. - */ - plen = t->bt_psize - BTDATAOFF; - for (p = *buf;; p = (char *)p + nb, pg = h->nextpg) { - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - - nb = MIN(sz, plen); - memmove(p, (char *)h + BTDATAOFF, nb); - mpool_put(t->bt_mp, h, 0); - - if ((sz -= nb) == 0) - break; - } - return (RET_SUCCESS); -} - -/* - * __OVFL_PUT -- Store an overflow key/data item. - * - * Parameters: - * t: tree - * data: DBT to store - * pgno: storage page number - * - * Returns: - * RET_ERROR, RET_SUCCESS - */ -int -__ovfl_put(t, dbt, pg) - BTREE *t; - const DBT *dbt; - pgno_t *pg; -{ - PAGE *h, *last; - void *p; - pgno_t npg; - size_t nb, plen; - u_int32_t sz; - - /* - * Allocate pages and copy the key/data record into them. Store the - * number of the first page in the chain. - */ - plen = t->bt_psize - BTDATAOFF; - for (last = NULL, p = dbt->data, sz = dbt->size;; - p = (char *)p + plen, last = h) { - if ((h = __bt_new(t, &npg)) == NULL) - return (RET_ERROR); - - h->pgno = npg; - h->nextpg = h->prevpg = P_INVALID; - h->flags = P_OVERFLOW; - h->lower = h->upper = 0; - - nb = MIN(sz, plen); - memmove((char *)h + BTDATAOFF, p, nb); - - if (last) { - last->nextpg = h->pgno; - mpool_put(t->bt_mp, last, MPOOL_DIRTY); - } else - *pg = h->pgno; - - if ((sz -= nb) == 0) { - mpool_put(t->bt_mp, h, MPOOL_DIRTY); - break; - } - } - return (RET_SUCCESS); -} - -/* - * __OVFL_DELETE -- Delete an overflow chain. - * - * Parameters: - * t: tree - * p: pointer to { pgno_t, u_int32_t } - * - * Returns: - * RET_ERROR, RET_SUCCESS - */ -int -__ovfl_delete(t, p) - BTREE *t; - void *p; -{ - PAGE *h; - pgno_t pg; - size_t plen; - u_int32_t sz; - - memmove(&pg, p, sizeof(pgno_t)); - memmove(&sz, (char *)p + sizeof(pgno_t), sizeof(u_int32_t)); - -#ifdef DEBUG - if (pg == P_INVALID || sz == 0) - abort(); -#endif - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - - /* Don't delete chains used by internal pages. */ - if (h->flags & P_PRESERVE) { - mpool_put(t->bt_mp, h, 0); - return (RET_SUCCESS); - } - - /* Step through the chain, calling the free routine for each page. */ - for (plen = t->bt_psize - BTDATAOFF;; sz -= plen) { - pg = h->nextpg; - __bt_free(t, h); - if (sz <= plen) - break; - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - } - return (RET_SUCCESS); -} diff --git a/btree/bt_page.c b/btree/bt_page.c deleted file mode 100644 index 0d9d138..0000000 --- a/btree/bt_page.c +++ /dev/null @@ -1,98 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. 
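For contrast with the current overflow format, the legacy code being deleted above located a big key or data item through an on-page {page number, total size} pair and then walked the nextpg chain. A small sketch of that decoding step (the typedef is a stand-in for the old pgno_t):

#include <string.h>

typedef unsigned int old_pgno_t;    /* stand-in for the 1.85 pgno_t */

/*
 * The reference bytes may be unaligned within the page, which is why
 * the original code used memmove() rather than pointer casts.
 */
static void
decode_big_item(const void *p, old_pgno_t *pgp, unsigned int *szp)
{
    memmove(pgp, p, sizeof(*pgp));
    memmove(szp, (const char *)p + sizeof(*pgp), sizeof(*szp));
}

Each page in the chain then contributes at most psize - BTDATAOFF bytes, which is also where the wasted-space complaint in the deleted comment comes from.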
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_page.c 8.3 (Berkeley) 7/14/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/types.h> - -#include <stdio.h> - -#include <db.h> -#include "btree.h" - -/* - * __bt_free -- - * Put a page on the freelist. - * - * Parameters: - * t: tree - * h: page to free - * - * Returns: - * RET_ERROR, RET_SUCCESS - * - * Side-effect: - * mpool_put's the page. - */ -int -__bt_free(t, h) - BTREE *t; - PAGE *h; -{ - /* Insert the page at the head of the free list. */ - h->prevpg = P_INVALID; - h->nextpg = t->bt_free; - t->bt_free = h->pgno; - - /* Make sure the page gets written back. */ - return (mpool_put(t->bt_mp, h, MPOOL_DIRTY)); -} - -/* - * __bt_new -- - * Get a new page, preferably from the freelist. - * - * Parameters: - * t: tree - * npg: storage for page number. - * - * Returns: - * Pointer to a page, NULL on error. - */ -PAGE * -__bt_new(t, npg) - BTREE *t; - pgno_t *npg; -{ - PAGE *h; - - if (t->bt_free != P_INVALID && - (h = mpool_get(t->bt_mp, t->bt_free, 0)) != NULL) { - *npg = t->bt_free; - t->bt_free = h->nextpg; - return (h); - } - return (mpool_new(t->bt_mp, npg)); -} diff --git a/btree/bt_put.c b/btree/bt_put.c index 952be09..683b09c 100644 --- a/btree/bt_put.c +++ b/btree/bt_put.c @@ -1,5 +1,14 @@ /*- - * Copyright (c) 1990, 1993, 1994 + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -13,11 +22,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright
  * notice, this list of conditions and the following disclaimer in the
  * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
+ * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
@@ -32,289 +37,1033 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
+ *
+ * $Id$
  */
 
-#if defined(LIBC_SCCS) && !defined(lint)
-static char sccsid[] = "@(#)bt_put.c	8.8 (Berkeley) 7/26/94";
-#endif /* LIBC_SCCS and not lint */
-
-#include <sys/types.h>
+#include "db_config.h"
 
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
 
-#include <db.h>
-#include "btree.h"
-
-static EPG *bt_fast __P((BTREE *, const DBT *, const DBT *, int *));
+static int __bam_build
+    __P((DBC *, u_int32_t, DBT *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_dup_check __P((DBC *, u_int32_t,
+    PAGE *, u_int32_t, u_int32_t, db_indx_t *));
+static int __bam_dup_convert __P((DBC *, PAGE *, u_int32_t, u_int32_t));
+static int __bam_ovput
+    __P((DBC *, u_int32_t, db_pgno_t, PAGE *, u_int32_t, DBT *));
+static u_int32_t
+    __bam_partsize __P((DB *, u_int32_t, DBT *, PAGE *, u_int32_t));
 
 /*
- * __BT_PUT -- Add a btree item to the tree.
- *
- * Parameters:
- *	dbp:	pointer to access method
- *	key:	key
- *	data:	data
- *	flag:	R_NOOVERWRITE
+ * __bam_iitem --
+ *	Insert an item into the tree.
  *
- * Returns:
- *	RET_ERROR, RET_SUCCESS and RET_SPECIAL if the key is already in the
- *	tree and R_NOOVERWRITE specified.
+ * PUBLIC: int __bam_iitem __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t));
  */
 int
-__bt_put(dbp, key, data, flags)
-	const DB *dbp;
-	DBT *key;
-	const DBT *data;
-	u_int flags;
+__bam_iitem(dbc, key, data, op, flags)
+	DBC *dbc;
+	DBT *key, *data;
+	u_int32_t op, flags;
 {
+	BKEYDATA *bk, bk_tmp;
 	BTREE *t;
-	DBT tkey, tdata;
-	EPG *e;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DBT bk_hdr, tdbt;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
 	PAGE *h;
-	indx_t index, nxtindex;
-	pgno_t pg;
-	u_int32_t nbytes;
-	int dflags, exact, status;
-	char *dest, db[NOVFLSIZE], kb[NOVFLSIZE];
+	db_indx_t cnt, indx;
+	u_int32_t data_size, have_bytes, need_bytes, needed, pages, pagespace;
+	char tmp_ch;
+	int cmp, bigkey, bigdata, del, dupadjust;
+	int padrec, replace, ret, t_ret, was_deleted;
+
+	COMPQUIET(cnt, 0);
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	mpf = dbp->mpf;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	t = dbp->bt_internal;
+	h = cp->page;
+	indx = cp->indx;
+	del = dupadjust = replace = was_deleted = 0;
+
+	/*
+	 * Fixed-length records with partial puts: it's an error to specify
+	 * anything other than a simple overwrite.
+	 */
+	if (F_ISSET(dbp, DB_AM_FIXEDLEN) &&
+	    F_ISSET(data, DB_DBT_PARTIAL) && data->size != data->dlen)
+		return (__db_rec_repl(env, data->size, data->dlen));
+
+	/*
+	 * Figure out how much space the data will take, including if it's a
+	 * partial record.
+	 *
+	 * Fixed-length records: it's an error to specify a record that's
+	 * longer than the fixed-length, and we never require less than
+	 * the fixed-length record size.
+	 */
+	data_size = F_ISSET(data, DB_DBT_PARTIAL) ?
+	    __bam_partsize(dbp, op, data, h, indx) : data->size;
+	padrec = 0;
+	if (F_ISSET(dbp, DB_AM_FIXEDLEN)) {
+		if (data_size > t->re_len)
+			return (__db_rec_toobig(env, data_size, t->re_len));
+
+		/* Records that are deleted anyway needn't be padded out. */
+		if (!LF_ISSET(BI_DELETED) && data_size < t->re_len) {
+			padrec = 1;
+			data_size = t->re_len;
+		}
+	}
 
-	t = dbp->internal;
+	/*
+	 * Handle partial puts or short fixed-length records: check whether we
+	 * can just append the data or else build the real record.  We can't
+	 * append if there are secondaries: we need the whole data item for the
+	 * application's secondary callback.
+	 */
+	if (op == DB_CURRENT && dbp->dup_compare == NULL &&
+	    F_ISSET(data, DB_DBT_PARTIAL) && !DB_IS_PRIMARY(dbp)) {
+		bk = GET_BKEYDATA(
+		    dbp, h, indx + (TYPE(h) == P_LBTREE ? O_INDX : 0));
+		/*
+		 * If the item is an overflow type, and the input DBT is
+		 * partial, and begins at the length of the current item,
+		 * then it is an append.  Avoid deleting and re-creating
+		 * the entire offpage item.
+		 */
+		if (B_TYPE(bk->type) == B_OVERFLOW &&
+		    data->doff == ((BOVERFLOW *)bk)->tlen) {
+			/*
+			 * If the cursor has not already cached the last page
+			 * in the offpage chain, we need to walk the chain
+			 * to be sure that the page has been read.
+			 */
+			if (cp->stream_start_pgno != ((BOVERFLOW *)bk)->pgno ||
+			    cp->stream_off > data->doff || data->doff >
+			    cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) {
+				memset(&tdbt, 0, sizeof(DBT));
+				tdbt.doff = data->doff - 1;
+				/*
+				 * Set the length to 1, to force __db_goff
+				 * to do the traversal.
+				 */
+				tdbt.dlen = tdbt.ulen = 1;
+				tdbt.data = &tmp_ch;
+				tdbt.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
+
+				/*
+				 * Read to the last page.  It will be cached
+				 * in the cursor.
+				 */
+				if ((ret = __db_goff(
+				    dbc, &tdbt, ((BOVERFLOW *)bk)->tlen,
+				    ((BOVERFLOW *)bk)->pgno, NULL, NULL)) != 0)
+					return (ret);
+			}
 
-	/* Toss any page pinned across calls. */
-	if (t->bt_pinned != NULL) {
-		mpool_put(t->bt_mp, t->bt_pinned, 0);
-		t->bt_pinned = NULL;
+			/*
+			 * Since this is an append, dlen is irrelevant (there
+			 * are no bytes to overwrite).  We need the caller's
+			 * DBT size to end up with the total size of the item.
+			 * From now on, use dlen as the length of the user's
+			 * data that we are going to append.
+			 * Don't futz with the caller's DBT any more than we
+			 * have to in order to send back the size.
+			 */
+			tdbt = *data;
+			tdbt.dlen = data->size;
+			tdbt.size = data_size;
+			data = &tdbt;
+			F_SET(data, DB_DBT_STREAMING);
+		}
+	}
+	if (!F_ISSET(data, DB_DBT_STREAMING) &&
+	    (padrec || F_ISSET(data, DB_DBT_PARTIAL))) {
+		tdbt = *data;
+		if ((ret =
+		    __bam_build(dbc, op, &tdbt, h, indx, data_size)) != 0)
+			return (ret);
+		data = &tdbt;
 	}
 
-	/* Check for change to a read-only tree. */
-	if (F_ISSET(t, B_RDONLY)) {
-		errno = EPERM;
-		return (RET_ERROR);
+	/*
+	 * If the user has specified a duplicate comparison function, return
+	 * an error if DB_CURRENT was specified and the replacement data
+	 * doesn't compare equal to the current data.  This stops apps from
+	 * screwing up the duplicate sort order.  We have to do this after
+	 * we build the real record so that we're comparing the real items.
+	 */
+	if (op == DB_CURRENT && dbp->dup_compare != NULL) {
+		if ((ret = __bam_cmp(dbc, data, h,
+		    indx + (TYPE(h) == P_LBTREE ?
O_INDX : 0), + dbp->dup_compare, &cmp)) != 0) + return (ret); + if (cmp != 0) { + __db_errx(env, + "Existing data sorts differently from put data"); + return (EINVAL); + } } - switch (flags) { - case 0: - case R_NOOVERWRITE: + /* + * If the key or data item won't fit on a page, we'll have to store + * them on overflow pages. + */ + needed = 0; + bigdata = data_size > cp->ovflsize; + switch (op) { + case DB_KEYFIRST: + /* We're adding a new key and data pair. */ + bigkey = key->size > cp->ovflsize; + if (bigkey) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(key->size); + if (bigdata) + needed += BOVERFLOW_PSIZE; + else + needed += BKEYDATA_PSIZE(data_size); break; - case R_CURSOR: + case DB_AFTER: + case DB_BEFORE: + case DB_CURRENT: /* - * If flags is R_CURSOR, put the cursor. Must already - * have started a scan and not have already deleted it. + * We're either overwriting the data item of a key/data pair + * or we're creating a new on-page duplicate and only adding + * a data item. + * + * !!! + * We're not currently correcting for space reclaimed from + * already deleted items, but I don't think it's worth the + * complexity. */ - if (F_ISSET(&t->bt_cursor, CURS_INIT) && - !F_ISSET(&t->bt_cursor, - CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE)) - break; - /* FALLTHROUGH */ + bigkey = 0; + if (op == DB_CURRENT) { + bk = GET_BKEYDATA(dbp, h, + indx + (TYPE(h) == P_LBTREE ? O_INDX : 0)); + if (B_TYPE(bk->type) == B_KEYDATA) + have_bytes = BKEYDATA_PSIZE(bk->len); + else + have_bytes = BOVERFLOW_PSIZE; + need_bytes = 0; + } else { + have_bytes = 0; + need_bytes = sizeof(db_indx_t); + } + if (bigdata) + need_bytes += BOVERFLOW_PSIZE; + else + need_bytes += BKEYDATA_PSIZE(data_size); + + if (have_bytes < need_bytes) + needed += need_bytes - have_bytes; + break; default: - errno = EINVAL; - return (RET_ERROR); + return (__db_unknown_flag(env, "DB->put", op)); + } + + /* Split the page if there's not enough room. */ + if (P_FREESPACE(dbp, h) < needed) + return (DB_NEEDSPLIT); + + /* + * Check to see if we will convert to off page duplicates -- if + * so, we'll need a page. + */ + if (F_ISSET(dbp, DB_AM_DUP) && + TYPE(h) == P_LBTREE && op != DB_KEYFIRST && + P_FREESPACE(dbp, h) - needed <= dbp->pgsize / 2 && + __bam_dup_check(dbc, op, h, indx, needed, &cnt)) { + pages = 1; + dupadjust = 1; + } else + pages = 0; + + /* + * If we are not using transactions and there is a page limit + * set on the file, then figure out if things will fit before + * taking action. + */ + if (dbc->txn == NULL && mpf->mfp->maxpgno != 0) { + pagespace = P_MAXSPACE(dbp, dbp->pgsize); + if (bigdata) + pages += ((data_size - 1) / pagespace) + 1; + if (bigkey) + pages += ((key->size - 1) / pagespace) + 1; + + if (pages > (mpf->mfp->maxpgno - mpf->mfp->last_pgno)) + return (__db_space_err(dbp)); } + ret = __memp_dirty(mpf, &h, + dbc->thread_info, dbc->txn, dbc->priority, 0); + if (cp->csp->page == cp->page) + cp->csp->page = h; + cp->page = h; + if (ret != 0) + return (ret); + /* - * If the key/data pair won't fit on a page, store it on overflow - * pages. Only put the key on the overflow page if the pair are - * still too big after moving the data to an overflow page. + * The code breaks it up into five cases: * - * XXX - * If the insert fails later on, the overflow pages aren't recovered. + * 1. Insert a new key/data pair. + * 2. Append a new data item (a new duplicate). + * 3. Insert a new data item (a new duplicate). + * 4. Delete and re-add the data item (overflow item). + * 5. Overwrite the data item. 
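The five cases the comment enumerates map onto the cursor put operations; a sketch of exercising them through the public interface (assumes a cursor on a btree opened with DB_DUP so DB_AFTER is legal; key and data values are hypothetical):

#include <string.h>
#include <db.h>

int
put_ops(DBC *dbc)
{
    DBT key, data;
    int ret;

    memset(&key, 0, sizeof(key));
    memset(&data, 0, sizeof(data));
    key.data = "k";
    key.size = 1;
    data.data = "v1";
    data.size = 2;

    /* Case 1: insert a new key/data pair. */
    if ((ret = dbc->put(dbc, &key, &data, DB_KEYFIRST)) != 0)
        return (ret);

    /* Cases 2 and 3: append/insert a new duplicate data item. */
    data.data = "v2";
    if ((ret = dbc->put(dbc, &key, &data, DB_AFTER)) != 0)
        return (ret);

    /* Cases 4 and 5: overwrite (or delete and re-add) in place. */
    data.data = "v3";
    return (dbc->put(dbc, &key, &data, DB_CURRENT));
}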
*/ - dflags = 0; - if (key->size + data->size > t->bt_ovflsize) { - if (key->size > t->bt_ovflsize) { -storekey: if (__ovfl_put(t, key, &pg) == RET_ERROR) - return (RET_ERROR); - tkey.data = kb; - tkey.size = NOVFLSIZE; - memmove(kb, &pg, sizeof(pgno_t)); - memmove(kb + sizeof(pgno_t), - &key->size, sizeof(u_int32_t)); - dflags |= P_BIGKEY; - key = &tkey; + switch (op) { + case DB_KEYFIRST: /* 1. Insert a new key/data pair. */ + if (bigkey) { + if ((ret = __bam_ovput(dbc, + B_OVERFLOW, PGNO_INVALID, h, indx, key)) != 0) + return (ret); + } else + if ((ret = __db_pitem(dbc, h, indx, + BKEYDATA_SIZE(key->size), NULL, key)) != 0) + return (ret); + + if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0) + return (ret); + ++indx; + break; + case DB_AFTER: /* 2. Append a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* Copy the key for the duplicate and adjust cursors. */ + if ((ret = + __bam_adjindx(dbc, h, indx + P_INDX, indx, 1)) != 0) + return (ret); + if ((ret = + __bam_ca_di(dbc, PGNO(h), indx + P_INDX, 1)) != 0) + return (ret); + + indx += 3; + + cp->indx += 2; + } else { + ++indx; + cp->indx += 1; } - if (key->size + data->size > t->bt_ovflsize) { - if (__ovfl_put(t, data, &pg) == RET_ERROR) - return (RET_ERROR); - tdata.data = db; - tdata.size = NOVFLSIZE; - memmove(db, &pg, sizeof(pgno_t)); - memmove(db + sizeof(pgno_t), - &data->size, sizeof(u_int32_t)); - dflags |= P_BIGDATA; - data = &tdata; + break; + case DB_BEFORE: /* 3. Insert a new data item. */ + if (TYPE(h) == P_LBTREE) { + /* Copy the key for the duplicate and adjust cursors. */ + if ((ret = __bam_adjindx(dbc, h, indx, indx, 1)) != 0) + return (ret); + if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0) + return (ret); + + ++indx; } - if (key->size + data->size > t->bt_ovflsize) - goto storekey; + break; + case DB_CURRENT: + /* + * Clear the cursor's deleted flag. The problem is that if + * we deadlock or fail while deleting the overflow item or + * replacing the non-overflow item, a subsequent cursor close + * will try and remove the item because the cursor's delete + * flag is set. + */ + if ((ret = __bam_ca_delete(dbp, PGNO(h), indx, 0, NULL)) != 0) + return (ret); + + if (TYPE(h) == P_LBTREE) + ++indx; + bk = GET_BKEYDATA(dbp, h, indx); + + /* + * In a Btree deleted records aren't counted (deleted records + * are counted in a Recno because all accesses are based on + * record number). If it's a Btree and it's a DB_CURRENT + * operation overwriting a previously deleted record, increment + * the record count. + */ + if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) + was_deleted = B_DISSET(bk->type); + + /* + * 4. Delete and re-add the data item. + * + * If we're changing the type of the on-page structure, or we + * are referencing offpage items, we have to delete and then + * re-add the item. We do not do any cursor adjustments here + * because we're going to immediately re-add the item into the + * same slot. + */ + if (bigdata || B_TYPE(bk->type) != B_KEYDATA) { + /* + * If streaming, don't delete the overflow item, + * just delete the item pointing to the overflow item. + * It will be added back in later, with the new size. + * We can't simply adjust the size of the item on the + * page, because there is no easy way to log a + * modification. + */ + if (F_ISSET(data, DB_DBT_STREAMING)) { + if ((ret = __db_ditem( + dbc, h, indx, BOVERFLOW_SIZE)) != 0) + return (ret); + } else if ((ret = __bam_ditem(dbc, h, indx)) != 0) + return (ret); + del = 1; + break; + } + + /* 5. Overwrite the data item. 
*/ + replace = 1; + break; + default: + return (__db_unknown_flag(env, "DB->put", op)); + } + + /* Add the data. */ + if (bigdata) { + /* + * We do not have to handle deleted (BI_DELETED) records + * in this case; the actual records should never be created. + */ + DB_ASSERT(env, !LF_ISSET(BI_DELETED)); + ret = __bam_ovput(dbc, + B_OVERFLOW, PGNO_INVALID, h, indx, data); + } else { + if (LF_ISSET(BI_DELETED)) { + B_TSET_DELETED(bk_tmp.type, B_KEYDATA); + bk_tmp.len = data->size; + bk_hdr.data = &bk_tmp; + bk_hdr.size = SSZA(BKEYDATA, data); + ret = __db_pitem(dbc, h, indx, + BKEYDATA_SIZE(data->size), &bk_hdr, data); + } else if (replace) + ret = __bam_ritem(dbc, h, indx, data, 0); + else + ret = __db_pitem(dbc, h, indx, + BKEYDATA_SIZE(data->size), NULL, data); + } + if (ret != 0) { + if (del == 1 && (t_ret = + __bam_ca_di(dbc, PGNO(h), indx + 1, -1)) != 0) { + __db_err(env, t_ret, + "cursor adjustment after delete failed"); + return (__env_panic(env, t_ret)); + } + return (ret); } - /* Replace the cursor. */ - if (flags == R_CURSOR) { - if ((h = mpool_get(t->bt_mp, t->bt_cursor.pg.pgno, 0)) == NULL) - return (RET_ERROR); - index = t->bt_cursor.pg.index; - goto delete; + /* + * Re-position the cursors if necessary and reset the current cursor + * to point to the new item. + */ + if (op != DB_CURRENT) { + if ((ret = __bam_ca_di(dbc, PGNO(h), indx, 1)) != 0) + return (ret); + cp->indx = TYPE(h) == P_LBTREE ? indx - O_INDX : indx; } /* - * Find the key to delete, or, the location at which to insert. - * Bt_fast and __bt_search both pin the returned page. + * If we've changed the record count, update the tree. There's no + * need to adjust the count if the operation not performed on the + * current record or when the current record was previously deleted. */ - if (t->bt_order == NOT || (e = bt_fast(t, key, data, &exact)) == NULL) - if ((e = __bt_search(t, key, &exact)) == NULL) - return (RET_ERROR); - h = e->page; - index = e->index; + if (F_ISSET(cp, C_RECNUM) && (op != DB_CURRENT || was_deleted)) + if ((ret = __bam_adjust(dbc, 1)) != 0) + return (ret); /* - * Add the key/data pair to the tree. If an identical key is already - * in the tree, and R_NOOVERWRITE is set, an error is returned. If - * R_NOOVERWRITE is not set, the key is either added (if duplicates are - * permitted) or an error is returned. + * If a Btree leaf page is at least 50% full and we may have added or + * modified a duplicate data item, see if the set of duplicates takes + * up at least 25% of the space on the page. If it does, move it onto + * its own page. */ - switch (flags) { - case R_NOOVERWRITE: - if (!exact) - break; - mpool_put(t->bt_mp, h, 0); - return (RET_SPECIAL); - default: - if (!exact || !F_ISSET(t, B_NODUPS)) - break; + if (dupadjust && + (ret = __bam_dup_convert(dbc, h, indx - O_INDX, cnt)) != 0) + return (ret); + + /* If we've modified a recno file, set the flag. */ + if (dbc->dbtype == DB_RECNO) + t->re_modified = 1; + + return (ret); +} + +/* + * __bam_partsize -- + * Figure out how much space a partial data item is in total. + */ +static u_int32_t +__bam_partsize(dbp, op, data, h, indx) + DB *dbp; + u_int32_t op, indx; + DBT *data; + PAGE *h; +{ + BKEYDATA *bk; + u_int32_t nbytes; + + /* + * If the record doesn't already exist, it's simply the data we're + * provided. + */ + if (op != DB_CURRENT) + return (data->doff + data->size); + + /* + * Otherwise, it's the data provided plus any already existing data + * that we're not replacing. + */ + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? 
O_INDX : 0)); + nbytes = + B_TYPE(bk->type) == B_OVERFLOW ? ((BOVERFLOW *)bk)->tlen : bk->len; + + return (__db_partsize(nbytes, data)); +} + +/* + * __bam_build -- + * Build the real record for a partial put, or short fixed-length record. + */ +static int +__bam_build(dbc, op, dbt, h, indx, nbytes) + DBC *dbc; + u_int32_t op, indx, nbytes; + DBT *dbt; + PAGE *h; +{ + BKEYDATA *bk, tbk; + BOVERFLOW *bo; + BTREE *t; + DB *dbp; + DBT copy, *rdata; + u_int32_t len, tlen; + u_int8_t *p; + int ret; + + COMPQUIET(bo, NULL); + + dbp = dbc->dbp; + t = dbp->bt_internal; + + /* We use the record data return memory, it's only a short-term use. */ + rdata = &dbc->my_rdata; + if (rdata->ulen < nbytes) { + if ((ret = __os_realloc(dbp->env, + nbytes, &rdata->data)) != 0) { + rdata->ulen = 0; + rdata->data = NULL; + return (ret); + } + rdata->ulen = nbytes; + } + + /* + * We use nul or pad bytes for any part of the record that isn't + * specified; get it over with. + */ + memset(rdata->data, + F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_pad : 0, nbytes); + + /* + * In the next clauses, we need to do three things: a) set p to point + * to the place at which to copy the user's data, b) set tlen to the + * total length of the record, not including the bytes contributed by + * the user, and c) copy any valid data from an existing record. If + * it's not a partial put (this code is called for both partial puts + * and fixed-length record padding) or it's a new key, we can cut to + * the chase. + */ + if (!F_ISSET(dbt, DB_DBT_PARTIAL) || op != DB_CURRENT) { + p = (u_int8_t *)rdata->data + dbt->doff; + tlen = dbt->doff; + goto user_copy; + } + + /* Find the current record. */ + if (indx < NUM_ENT(h)) { + bk = GET_BKEYDATA(dbp, h, indx + (TYPE(h) == P_LBTREE ? + O_INDX : 0)); + bo = (BOVERFLOW *)bk; + } else { + bk = &tbk; + B_TSET(bk->type, B_KEYDATA); + bk->len = 0; + } + if (B_TYPE(bk->type) == B_OVERFLOW) { /* - * !!! - * Note, the delete may empty the page, so we need to put a - * new entry into the page immediately. + * In the case of an overflow record, we shift things around + * in the current record rather than allocate a separate copy. */ -delete: if (__bt_dleaf(t, key, h, index) == RET_ERROR) { - mpool_put(t->bt_mp, h, 0); - return (RET_ERROR); + memset(©, 0, sizeof(copy)); + if ((ret = __db_goff(dbc, ©, bo->tlen, bo->pgno, + &rdata->data, &rdata->ulen)) != 0) + return (ret); + + /* Skip any leading data from the original record. */ + tlen = dbt->doff; + p = (u_int8_t *)rdata->data + dbt->doff; + + /* + * Copy in any trailing data from the original record. + * + * If the original record was larger than the original offset + * plus the bytes being deleted, there is trailing data in the + * original record we need to preserve. If we aren't deleting + * the same number of bytes as we're inserting, copy it up or + * down, into place. + * + * Use memmove(), the regions may overlap. + */ + if (bo->tlen > dbt->doff + dbt->dlen) { + len = bo->tlen - (dbt->doff + dbt->dlen); + if (dbt->dlen != dbt->size) + memmove(p + dbt->size, p + dbt->dlen, len); + tlen += len; + } + } else { + /* Copy in any leading data from the original record. */ + memcpy(rdata->data, + bk->data, dbt->doff > bk->len ? bk->len : dbt->doff); + tlen = dbt->doff; + p = (u_int8_t *)rdata->data + dbt->doff; + + /* Copy in any trailing data from the original record. 
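__bam_partsize() above defers the arithmetic to __db_partsize(); reduced to a standalone sketch, the two cases look like this:

/*
 * Sketch of partial-put sizing: nbytes is the existing record length;
 * doff, dlen and size come from the caller's partial DBT.
 */
static unsigned int
partial_size(unsigned int nbytes,
    unsigned int doff, unsigned int dlen, unsigned int size)
{
    /*
     * If the replaced range runs past the end of the record, the new
     * length is just where the new bytes start plus their count;
     * otherwise it is the old length minus the replaced bytes plus
     * the new bytes.
     */
    if (nbytes < doff + dlen)
        return (doff + size);
    return (nbytes + size - dlen);
}

For example, overwriting 20 bytes at offset 90 of a 100-byte record with 5 new bytes yields 90 + 5 = 95, since the replaced range extends past the old end of the record.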
*/ + len = dbt->doff + dbt->dlen; + if (bk->len > len) { + memcpy(p + dbt->size, bk->data + len, bk->len - len); + tlen += bk->len - len; } - break; } +user_copy: /* - * If not enough room, or the user has put a ceiling on the number of - * keys permitted in the page, split the page. The split code will - * insert the key and data and unpin the current page. If inserting - * into the offset array, shift the pointers up. + * Copy in the application provided data -- p and tlen must have been + * initialized above. */ - nbytes = NBLEAFDBT(key->size, data->size); - if (h->upper - h->lower < nbytes + sizeof(indx_t)) { - if ((status = __bt_split(t, h, key, - data, dflags, nbytes, index)) != RET_SUCCESS) - return (status); - goto success; + memcpy(p, dbt->data, dbt->size); + tlen += dbt->size; + + /* Set the DBT to reference our new record. */ + rdata->size = F_ISSET(dbp, DB_AM_FIXEDLEN) ? t->re_len : tlen; + rdata->dlen = 0; + rdata->doff = 0; + rdata->flags = 0; + *dbt = *rdata; + return (0); +} + +/* + * __bam_ritem -- + * Replace an item on a page. + * + * PUBLIC: int __bam_ritem __P((DBC *, PAGE *, u_int32_t, DBT *, u_int32_t)); + */ +int +__bam_ritem(dbc, h, indx, data, typeflag) + DBC *dbc; + PAGE *h; + u_int32_t indx; + DBT *data; + u_int32_t typeflag; +{ + BKEYDATA *bk; + BINTERNAL *bi; + DB *dbp; + DBT orig, repl; + db_indx_t cnt, lo, ln, min, off, prefix, suffix; + int32_t nbytes; + u_int32_t len; + int ret; + db_indx_t *inp; + u_int8_t *dp, *p, *t, type; + + dbp = dbc->dbp; + bi = NULL; + bk = NULL; + + /* + * Replace a single item onto a page. The logic figuring out where + * to insert and whether it fits is handled in the caller. All we do + * here is manage the page shuffling. + */ + if (TYPE(h) == P_IBTREE) { + /* Point at the part of the internal struct past the type. */ + bi = GET_BINTERNAL(dbp, h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW) + len = BOVERFLOW_SIZE; + else + len = bi->len; + len += SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused); + dp = &bi->unused; + type = typeflag == 0 ? bi->type : + (bi->type == B_KEYDATA ? B_OVERFLOW : B_KEYDATA); + } else { + bk = GET_BKEYDATA(dbp, h, indx); + len = bk->len; + dp = bk->data; + type = bk->type; + typeflag = B_DISSET(type); } - if (index < (nxtindex = NEXTINDEX(h))) - memmove(h->linp + index + 1, h->linp + index, - (nxtindex - index) * sizeof(indx_t)); - h->lower += sizeof(indx_t); - - h->linp[index] = h->upper -= nbytes; - dest = (char *)h + h->upper; - WR_BLEAF(dest, key, data, dflags); - - /* If the cursor is on this page, adjust it as necessary. */ - if (F_ISSET(&t->bt_cursor, CURS_INIT) && - !F_ISSET(&t->bt_cursor, CURS_ACQUIRE) && - t->bt_cursor.pg.pgno == h->pgno && t->bt_cursor.pg.index >= index) - ++t->bt_cursor.pg.index; - - if (t->bt_order == NOT) - if (h->nextpg == P_INVALID) { - if (index == NEXTINDEX(h) - 1) { - t->bt_order = FORWARD; - t->bt_last.index = index; - t->bt_last.pgno = h->pgno; - } - } else if (h->prevpg == P_INVALID) { - if (index == 0) { - t->bt_order = BACK; - t->bt_last.index = 0; - t->bt_last.pgno = h->pgno; - } + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + /* + * We might as well check to see if the two data items share + * a common prefix and suffix -- it can save us a lot of log + * message if they're large. + */ + min = data->size < len ? 
data->size : len; + for (prefix = 0, + p = dp, t = data->data; + prefix < min && *p == *t; ++prefix, ++p, ++t) + ; + + min -= prefix; + for (suffix = 0, + p = (u_int8_t *)dp + len - 1, + t = (u_int8_t *)data->data + data->size - 1; + suffix < min && *p == *t; ++suffix, --p, --t) + ; + + /* We only log the parts of the keys that have changed. */ + orig.data = (u_int8_t *)dp + prefix; + orig.size = len - (prefix + suffix); + repl.data = (u_int8_t *)data->data + prefix; + repl.size = data->size - (prefix + suffix); + if ((ret = __bam_repl_log(dbp, dbc->txn, &LSN(h), 0, PGNO(h), + &LSN(h), (u_int32_t)indx, typeflag, + &orig, &repl, (u_int32_t)prefix, (u_int32_t)suffix)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(h)); + + /* + * Set references to the first in-use byte on the page and the + * first byte of the item being replaced. + */ + inp = P_INP(dbp, h); + p = (u_int8_t *)h + HOFFSET(h); + if (TYPE(h) == P_IBTREE) { + t = (u_int8_t *)bi; + lo = (db_indx_t)BINTERNAL_SIZE(bi->len); + ln = (db_indx_t)BINTERNAL_SIZE(data->size - + (SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused))); + } else { + t = (u_int8_t *)bk; + lo = (db_indx_t)BKEYDATA_SIZE(bk->len); + ln = (db_indx_t)BKEYDATA_SIZE(data->size); + } + + /* + * If the entry is growing in size, shift the beginning of the data + * part of the page down. If the entry is shrinking in size, shift + * the beginning of the data part of the page up. Use memmove(3), + * the regions overlap. + */ + if (lo != ln) { + nbytes = lo - ln; /* Signed difference. */ + if (p == t) /* First index is fast. */ + inp[indx] += nbytes; + else { /* Else, shift the page. */ + memmove(p + nbytes, p, (size_t)(t - p)); + + /* Adjust the indices' offsets. */ + off = inp[indx]; + for (cnt = 0; cnt < NUM_ENT(h); ++cnt) + if (inp[cnt] <= off) + inp[cnt] += nbytes; } - mpool_put(t->bt_mp, h, MPOOL_DIRTY); + /* Clean up the page and adjust the item's reference. */ + HOFFSET(h) += nbytes; + t += nbytes; + } -success: - if (flags == R_SETCURSOR) - __bt_setcur(t, e->page->pgno, e->index); + /* Copy the new item onto the page. */ + bk = (BKEYDATA *)t; + bk->len = data->size; + B_TSET(bk->type, type); + memcpy(bk->data, data->data, bk->len); - F_SET(t, B_MODIFIED); - return (RET_SUCCESS); -} + /* Remove the length of the internal header elements. */ + if (TYPE(h) == P_IBTREE) + bk->len -= SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused); -#ifdef STATISTICS -u_long bt_cache_hit, bt_cache_miss; -#endif + return (0); +} /* - * BT_FAST -- Do a quick check for sorted data. + * __bam_irep -- + * Replace an item on an internal page. * - * Parameters: - * t: tree - * key: key to insert - * - * Returns: - * EPG for new record or NULL if not found. 
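The replace-logging path above trims the shared prefix and suffix so that only the bytes which actually changed enter the log record. In isolation, the affix computation is roughly:

#include <stddef.h>

/*
 * Sketch: returns the shared-prefix length of a and b and stores the
 * shared-suffix length (over the remaining bytes) through *suffixp.
 */
static size_t
shared_affixes(const unsigned char *a, size_t alen,
    const unsigned char *b, size_t blen, size_t *suffixp)
{
    size_t min, prefix, suffix;

    min = alen < blen ? alen : blen;
    for (prefix = 0; prefix < min && a[prefix] == b[prefix]; ++prefix)
        ;
    min -= prefix;
    for (suffix = 0; suffix < min &&
        a[alen - 1 - suffix] == b[blen - 1 - suffix]; ++suffix)
        ;
    *suffixp = suffix;
    return (prefix);
}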
+ * PUBLIC: int __bam_irep __P((DBC *, PAGE *, u_int32_t, DBT *, DBT *)); */ -static EPG * -bt_fast(t, key, data, exactp) - BTREE *t; - const DBT *key, *data; - int *exactp; +int +__bam_irep(dbc, h, indx, hdr, data) + DBC *dbc; + PAGE *h; + u_int32_t indx; + DBT *hdr; + DBT *data; { + BINTERNAL *bi, *bn; + DB *dbp; + DBT dbt; + int ret; + + dbp = dbc->dbp; + + bi = GET_BINTERNAL(dbp, h, indx); + bn = (BINTERNAL *) hdr->data; + + if (B_TYPE(bi->type) == B_OVERFLOW && + (ret = __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0) + return (ret); + + memset(&dbt, 0, sizeof(dbt)); + dbt.size = hdr->size + data->size - SSZ(BINTERNAL, unused); + if ((ret = __os_malloc(dbp->env, dbt.size, &dbt.data)) != 0) + return (ret); + memcpy(dbt.data, + (u_int8_t *)hdr->data + SSZ(BINTERNAL, unused), + hdr->size - SSZ(BINTERNAL, unused)); + memcpy((u_int8_t *)dbt.data + + hdr->size - SSZ(BINTERNAL, unused), data->data, data->size); + + ret = __bam_ritem(dbc, h, indx, &dbt, bi->type != bn->type); + + __os_free(dbp->env, dbt.data); + return (ret); +} + +/* + * __bam_dup_check -- + * Check to see if the duplicate set at indx should have its own page. + */ +static int +__bam_dup_check(dbc, op, h, indx, sz, cntp) + DBC *dbc; + u_int32_t op; PAGE *h; - u_int32_t nbytes; - int cmp; + u_int32_t indx, sz; + db_indx_t *cntp; +{ + BKEYDATA *bk; + DB *dbp; + db_indx_t cnt, first, *inp; + + dbp = dbc->dbp; + inp = P_INP(dbp, h); + + /* + * Count the duplicate records and calculate how much room they're + * using on the page. + */ + while (indx > 0 && inp[indx] == inp[indx - P_INDX]) + indx -= P_INDX; - if ((h = mpool_get(t->bt_mp, t->bt_last.pgno, 0)) == NULL) { - t->bt_order = NOT; - return (NULL); + /* Count the key once. */ + bk = GET_BKEYDATA(dbp, h, indx); + sz += B_TYPE(bk->type) == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; + + /* Sum up all the data items. */ + first = indx; + + /* + * Account for the record being inserted. If we are replacing it, + * don't count it twice. + * + * We execute the loop with first == indx to get the size of the + * first record. + */ + cnt = op == DB_CURRENT ? 0 : 1; + for (first = indx; + indx < NUM_ENT(h) && inp[first] == inp[indx]; + ++cnt, indx += P_INDX) { + bk = GET_BKEYDATA(dbp, h, indx + O_INDX); + sz += B_TYPE(bk->type) == B_KEYDATA ? + BKEYDATA_PSIZE(bk->len) : BOVERFLOW_PSIZE; } - t->bt_cur.page = h; - t->bt_cur.index = t->bt_last.index; /* - * If won't fit in this page or have too many keys in this page, - * have to search to get split stack. + * We have to do these checks when the user is replacing the cursor's + * data item -- if the application replaces a duplicate item with a + * larger data item, it can increase the amount of space used by the + * duplicates, requiring this check. But that means we may have done + * this check when it wasn't a duplicate item after all. + */ + if (cnt == 1) + return (0); + + /* + * If this set of duplicates is using more than 25% of the page, move + * them off. The choice of 25% is a WAG, but the value must be small + * enough that we can always split a page without putting duplicates + * on two different pages. */ - nbytes = NBLEAFDBT(key->size, data->size); - if (h->upper - h->lower < nbytes + sizeof(indx_t)) - goto miss; - - if (t->bt_order == FORWARD) { - if (t->bt_cur.page->nextpg != P_INVALID) - goto miss; - if (t->bt_cur.index != NEXTINDEX(h) - 1) - goto miss; - if ((cmp = __bt_cmp(t, key, &t->bt_cur)) < 0) - goto miss; - t->bt_last.index = cmp ? 
++t->bt_cur.index : t->bt_cur.index;
+	if (sz < dbp->pgsize / 4)
+		return (0);
+
+	*cntp = cnt;
+	return (1);
+}
+
+/*
+ * __bam_dup_convert --
+ *	Move a set of duplicates off-page and into their own tree.
+ */
+static int
+__bam_dup_convert(dbc, h, indx, cnt)
+	DBC *dbc;
+	PAGE *h;
+	u_int32_t indx, cnt;
+{
+	BKEYDATA *bk;
+	DB *dbp;
+	DBT hdr;
+	DB_LOCK lock;
+	DB_MPOOLFILE *mpf;
+	PAGE *dp;
+	db_indx_t cpindx, dindx, first, *inp;
+	int ret, t_ret;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	inp = P_INP(dbp, h);
+
+	/* Move to the beginning of the dup set. */
+	while (indx > 0 && inp[indx] == inp[indx - P_INDX])
+		indx -= P_INDX;
+
+	/* Get a new page. */
+	if ((ret = __db_new(dbc,
+	    dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &lock, &dp)) != 0)
+		return (ret);
+	P_INIT(dp, dbp->pgsize, dp->pgno,
+	    PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp));
+
+	/*
+	 * Move this set of duplicates off the page.  First points to the first
+	 * key of the first duplicate key/data pair, cnt is the number of pairs
+	 * we're dealing with.
+	 */
+	memset(&hdr, 0, sizeof(hdr));
+	first = indx;
+	dindx = indx;
+	cpindx = 0;
+	do {
+		/* Move cursors referencing the old entry to the new entry. */
+		if ((ret = __bam_ca_dup(dbc, first,
+		    PGNO(h), indx, PGNO(dp), cpindx)) != 0)
+			goto err;
+
+		/*
+		 * Copy the entry to the new page.  If the off-duplicate page
+		 * is a Btree page (i.e. dup_compare will be non-NULL; we use
+		 * Btree pages for sorted dups, and Recno pages for unsorted
+		 * dups), move all entries normally, even deleted ones.  If
+		 * it's a Recno page, deleted entries are discarded (if the
+		 * deleted entry is overflow, then free up those pages).
+		 */
+		bk = GET_BKEYDATA(dbp, h, dindx + 1);
+		hdr.data = bk;
+		hdr.size = B_TYPE(bk->type) == B_KEYDATA ?
+		    BKEYDATA_SIZE(bk->len) : BOVERFLOW_SIZE;
+		if (dbp->dup_compare == NULL && B_DISSET(bk->type)) {
+			/*
+			 * Unsorted dups, i.e. recno page, and we have
+			 * a deleted entry, don't move it, but if it was
+			 * an overflow entry, we need to free those pages.
+			 */
+			if (B_TYPE(bk->type) == B_OVERFLOW &&
+			    (ret = __db_doff(dbc,
+			    (GET_BOVERFLOW(dbp, h, dindx + 1))->pgno)) != 0)
+				goto err;
+		} else {
+			if ((ret = __db_pitem(
+			    dbc, dp, cpindx, hdr.size, &hdr, NULL)) != 0)
+				goto err;
+			++cpindx;
+		}
+		/* Delete all but the last reference to the key. */
+		if (cnt != 1) {
+			if ((ret = __bam_adjindx(dbc,
+			    h, dindx, first + 1, 0)) != 0)
+				goto err;
+		} else
+			dindx++;
+
+		/* Delete the data item. */
+		if ((ret = __db_ditem(dbc, h, dindx, hdr.size)) != 0)
+			goto err;
+		indx += P_INDX;
+	} while (--cnt);
+
+	/* Put in a new data item that points to the duplicates page. */
+	if ((ret = __bam_ovput(dbc,
+	    B_DUPLICATE, dp->pgno, h, first + 1, NULL)) != 0)
+		goto err;
+
+	/* Adjust cursors for all the above movements. */
+	ret = __bam_ca_di(dbc,
+	    PGNO(h), first + P_INDX, (int)(first + P_INDX - indx));
+
+err:	if ((t_ret = __memp_fput(mpf,
+	    dbc->thread_info, dp, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	(void)__TLPUT(dbc, lock);
+	return (ret);
+}
+
+/*
+ * __bam_ovput --
+ *	Build an item for an off-page duplicates page or overflow page and
+ *	insert it on the page.
+ */
+static int
+__bam_ovput(dbc, type, pgno, h, indx, item)
+	DBC *dbc;
+	u_int32_t type, indx;
+	db_pgno_t pgno;
+	PAGE *h;
+	DBT *item;
+{
+	BOVERFLOW bo;
+	DBT hdr;
+	int ret;
+
+	UMRW_SET(bo.unused1);
+	B_TSET(bo.type, type);
+	UMRW_SET(bo.unused2);
+
+	/*
+	 * If we're creating an overflow item, do so and acquire the page
+	 * number for it.
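__bam_dup_check() above walks the paired index array to size a key's duplicate set before deciding whether to move it off-page. Stripped of the byte accounting, the walk can be sketched with simplified types (on a real leaf page the entries are db_indx_t and the pair stride P_INDX is 2):

static unsigned int
count_dups(const unsigned short *inp, unsigned int nent, unsigned int indx)
{
    unsigned int cnt, first;

    /* Duplicates of one key share the same key offset in the index. */
    while (indx > 0 && inp[indx] == inp[indx - 2])
        indx -= 2;
    for (first = indx, cnt = 0;
        indx < nent && inp[first] == inp[indx]; ++cnt, indx += 2)
        ;
    return (cnt);
}

The 25% threshold the real code then applies to the accumulated byte count is, as its own comment admits, a guess chosen so that a page split can always separate distinct keys.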
If we're creating an off-page duplicates tree, + * we are giving the page number as an argument. + */ + if (type == B_OVERFLOW) { + if ((ret = __db_poff(dbc, item, &bo.pgno)) != 0) + return (ret); + bo.tlen = item->size; } else { - if (t->bt_cur.page->prevpg != P_INVALID) - goto miss; - if (t->bt_cur.index != 0) - goto miss; - if ((cmp = __bt_cmp(t, key, &t->bt_cur)) > 0) - goto miss; - t->bt_last.index = 0; + bo.pgno = pgno; + bo.tlen = 0; } - *exactp = cmp == 0; -#ifdef STATISTICS - ++bt_cache_hit; -#endif - return (&t->bt_cur); - -miss: -#ifdef STATISTICS - ++bt_cache_miss; -#endif - t->bt_order = NOT; - mpool_put(t->bt_mp, h, 0); - return (NULL); + + /* Store the new record on the page. */ + memset(&hdr, 0, sizeof(hdr)); + hdr.data = &bo; + hdr.size = BOVERFLOW_SIZE; + return (__db_pitem(dbc, h, indx, BOVERFLOW_SIZE, &hdr, NULL)); } diff --git a/btree/bt_rec.c b/btree/bt_rec.c new file mode 100644 index 0000000..9650d92 --- /dev/null +++ b/btree/bt_rec.c @@ -0,0 +1,2035 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" + +#define IS_BTREE_PAGE(pagep) \ + (TYPE(pagep) == P_IBTREE || \ + TYPE(pagep) == P_LBTREE || TYPE(pagep) == P_LDUP) + +/* + * __bam_split_recover -- + * Recovery function for split. + * + * PUBLIC: int __bam_split_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_split_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_split_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_LSN *plsnp; + DB_MPOOLFILE *mpf; + PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp; + db_pgno_t pgno, parent_pgno; + u_int32_t ptype, size; + int cmp, l_update, p_update, r_update, ret, rootsplit, t_ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__bam_split_print); + + _lp = lp = np = pp = _rp = rp = NULL; + sp = NULL; + + REC_INTRO(__bam_split_read, ip, 0); + + if ((ret = __db_cursor_int(file_dbp, ip, NULL, + (argp->opflags & SPL_RECNO) ? DB_RECNO : DB_BTREE, + PGNO_INVALID, 0, NULL, &dbc)) != 0) + goto out; + if (argp->opflags & SPL_NRECS) + F_SET((BTREE_CURSOR *)dbc->internal, C_RECNUM); + F_SET(dbc, DBC_RECOVER); + + /* + * There are two kinds of splits that we have to recover from. The + * first is a root-page split, where the root page is split from a + * leaf page into an internal page and two new leaf pages are created. + * The second is where a page is split into two pages, and a new key + * is inserted into the parent page. + * + * DBTs are not aligned in log records, so we need to copy the page + * so that we can access fields within it throughout this routine. + * Although we could hardcode the unaligned copies in this routine, + * we will be calling into regular btree functions with this page, + * so it's got to be aligned. Copying it into allocated memory is + * the only way to guarantee this. + */ + if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0) + goto out; + memcpy(sp, argp->pg.data, argp->pg.size); + + pgno = PGNO(sp); + parent_pgno = argp->ppgno; + rootsplit = parent_pgno == pgno; + + /* Get the pages going down the tree. 
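The redo decisions below all hinge on comparing a page's on-disk LSN with the LSNs the log record captured: a page is re-modified only when it is still in its logged before-state. A stand-in for the LOG_COMPARE() ordering over the {file, offset} pair a DB_LSN carries:

struct lsn_stand_in {
    unsigned int file;
    unsigned int offset;
};

/* Returns <0, 0 or >0, ordering first by log file, then by offset. */
static int
lsn_cmp(const struct lsn_stand_in *a, const struct lsn_stand_in *b)
{
    if (a->file != b->file)
        return (a->file < b->file ? -1 : 1);
    if (a->offset != b->offset)
        return (a->offset < b->offset ? -1 : 1);
    return (0);
}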
*/
+	REC_FGET(mpf, ip, parent_pgno, &pp, left);
+left:	REC_FGET(mpf, ip, argp->left, &lp, right);
+right:	REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo:	if (DB_REDO(op)) {
+		l_update = r_update = p_update = 0;
+		/*
+		 * Decide if we need to resplit the page.
+		 *
+		 * If this is a root split, then the root has to exist unless
+		 * we have truncated it due to a future deallocation.
+		 */
+		if (pp != NULL) {
+			if (rootsplit)
+				plsnp = &LSN(argp->pg.data);
+			else
+				plsnp = &argp->plsn;
+			cmp = LOG_COMPARE(&LSN(pp), plsnp);
+			CHECK_LSN(env, op, cmp, &LSN(pp), plsnp);
+			if (cmp == 0)
+				p_update = 1;
+		}
+
+		if (lp != NULL) {
+			cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+			CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+			if (cmp == 0)
+				l_update = 1;
+		}
+
+		if (rp != NULL) {
+			cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+			CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+			if (cmp == 0)
+				r_update = 1;
+		}
+
+		if (!p_update && !l_update && !r_update)
+			goto check_next;
+
+		/* Allocate and initialize new left/right child pages. */
+		if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+		    (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+			goto out;
+		if (rootsplit) {
+			P_INIT(_lp, file_dbp->pgsize, argp->left,
+			    PGNO_INVALID,
+			    ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+			    LEVEL(sp), TYPE(sp));
+			P_INIT(_rp, file_dbp->pgsize, argp->right,
+			    ISINTERNAL(sp) ? PGNO_INVALID : argp->left,
+			    PGNO_INVALID, LEVEL(sp), TYPE(sp));
+		} else {
+			P_INIT(_lp, file_dbp->pgsize, PGNO(sp),
+			    ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp),
+			    ISINTERNAL(sp) ? PGNO_INVALID : argp->right,
+			    LEVEL(sp), TYPE(sp));
+			P_INIT(_rp, file_dbp->pgsize, argp->right,
+			    ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno,
+			    ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp),
+			    LEVEL(sp), TYPE(sp));
+		}
+
+		/* Split the page. */
+		if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 ||
+		    (ret = __bam_copy(file_dbp, sp, _rp, argp->indx,
+		    NUM_ENT(sp))) != 0)
+			goto out;
+
+		if (l_update) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+			memcpy(lp, _lp, file_dbp->pgsize);
+			lp->lsn = *lsnp;
+		}
+
+		if (r_update) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+			memcpy(rp, _rp, file_dbp->pgsize);
+			rp->lsn = *lsnp;
+		}
+
+		/*
+		 * Drop the latches on the lower level pages before
+		 * getting an exclusive latch on the higher level page.
+		 */
+		if (lp != NULL && (ret = __memp_fput(mpf,
+		    ip, lp, file_dbp->priority)) != 0)
+			goto out;
+		lp = NULL;
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)) != 0)
+			goto out;
+		rp = NULL;
+		/*
+		 * If the parent page is wrong, update it.
+		 * Initialize the page.  If it is a root page, update
+		 * the record counts if needed and put the first record in.
+		 * Then insert the record for the right hand child page.
+		 */
+		if (p_update) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+			if (argp->opflags & SPL_RECNO)
+				ptype = P_IRECNO;
+			else
+				ptype = P_IBTREE;
+
+			if (rootsplit) {
+				P_INIT(pp, file_dbp->pgsize, pgno, PGNO_INVALID,
+				    PGNO_INVALID, _lp->level + 1, ptype);
+				if (argp->opflags & SPL_NRECS) {
+					RE_NREC_SET(pp,
+					    __bam_total(file_dbp, _lp) +
+					    __bam_total(file_dbp, _rp));
+				}
+				if ((ret = __db_pitem_nolog(dbc, pp,
+				    argp->pindx, argp->pentry.size,
+				    &argp->pentry, NULL)) != 0)
+					goto out;
+
+			}
+			if ((ret = __db_pitem_nolog(dbc, pp, argp->pindx + 1,
+			    argp->rentry.size, &argp->rentry, NULL)) != 0)
+				goto out;
+			pp->lsn = *lsnp;
+		}
+
+check_next:	/*
+		 * Finally, redo the next-page link if necessary.  This is of
+		 * interest only if it wasn't a root split -- inserting a new
+		 * page in the tree requires that any following page have its
+		 * previous-page pointer updated to our new page.  The next
+		 * page must exist because we're redoing the operation.
+		 */
+		if (!rootsplit && argp->npgno != PGNO_INVALID) {
+			REC_FGET(mpf, ip, argp->npgno, &np, done);
+			cmp = LOG_COMPARE(&LSN(np), &argp->nlsn);
+			CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn);
+			if (cmp == 0) {
+				REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+				PREV_PGNO(np) = argp->right;
+				np->lsn = *lsnp;
+			}
+		}
+	} else {
+		/*
+		 * If it's a root split and the left child ever existed, update
+		 * its LSN.  Otherwise it's the split page.  If the right
+		 * child ever existed, root split or not, update its LSN.
+		 * The undo of the page allocation(s) will restore them to the
+		 * free list.
+		 */
+		if (rootsplit && lp != NULL &&
+		    LOG_COMPARE(lsnp, &LSN(lp)) == 0) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+			lp->lsn = argp->llsn;
+		}
+		if (rp != NULL &&
+		    LOG_COMPARE(lsnp, &LSN(rp)) == 0) {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &rp);
+			rp->lsn = argp->rlsn;
+		}
+		/*
+		 * Drop the lower level pages before getting an exclusive
+		 * latch on the parent.
+		 */
+		if (rp != NULL && (ret = __memp_fput(mpf,
+		    ip, rp, file_dbp->priority)))
+			goto out;
+		rp = NULL;
+
+		/*
+		 * Check the state of the split page.  If it's a root split
+		 * then that's the root page, otherwise it's the left page.
+		 */
+		if (rootsplit) {
+			DB_ASSERT(env, pgno == argp->ppgno);
+			if (lp != NULL && (ret = __memp_fput(mpf, ip,
+			    lp, file_dbp->priority)) != 0)
+				goto out;
+			lp = pp;
+			pp = NULL;
+		}
+		if (lp != NULL) {
+			cmp = LOG_COMPARE(lsnp, &LSN(lp));
+			CHECK_ABORT(env, op, cmp, &LSN(lp), lsnp);
+			if (cmp == 0) {
+				REC_DIRTY(mpf, ip, file_dbp->priority, &lp);
+				memcpy(lp, argp->pg.data, argp->pg.size);
+				if ((ret = __memp_fput(mpf,
+				    ip, lp, file_dbp->priority)))
+					goto out;
+				lp = NULL;
+			}
+		}
+
+		/*
+		 * Next we can update the parent, removing the new index.
+		 */
+		if (pp != NULL) {
+			DB_ASSERT(env, !rootsplit);
+			cmp = LOG_COMPARE(lsnp, &LSN(pp));
+			CHECK_ABORT(env, op, cmp, &LSN(pp), lsnp);
+			if (cmp == 0) {
+				REC_DIRTY(mpf, ip, file_dbp->priority, &pp);
+				if (argp->opflags & SPL_RECNO)
+					size = RINTERNAL_SIZE;
+				else
+					size = BINTERNAL_SIZE(
+					    GET_BINTERNAL(file_dbp,
+					    pp, argp->pindx + 1)->len);
+
+				if ((ret = __db_ditem(dbc, pp,
+				    argp->pindx + 1, size)) != 0)
+					goto out;
+				pp->lsn = argp->plsn;
+			}
+		}
+
+		/*
+		 * Finally, undo the next-page link if necessary.  This is of
+		 * interest only if it wasn't a root split -- inserting a new
+		 * page in the tree requires that any following page have its
+		 * previous-page pointer updated to our new page.  Since it's
+		 * possible that the next-page never existed, we ignore it as
+		 * if there's nothing to undo.
+		 */
+		if (!rootsplit && argp->npgno != PGNO_INVALID) {
+			if ((ret = __memp_fget(mpf, &argp->npgno,
+			    ip, NULL, DB_MPOOL_EDIT, &np)) != 0) {
+				np = NULL;
+				goto done;
+			}
+			if (LOG_COMPARE(lsnp, &LSN(np)) == 0) {
+				REC_DIRTY(mpf, ip, file_dbp->priority, &np);
+				PREV_PGNO(np) = argp->left;
+				np->lsn = argp->nlsn;
+			}
+		}
+	}
+
+done:	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	/* Free any pages that are left. */
+	if (lp != NULL && (t_ret = __memp_fput(mpf,
+	    ip, lp, file_dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (np != NULL && (t_ret = __memp_fput(mpf,
+	    ip, np, file_dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (rp != NULL && (t_ret = __memp_fput(mpf,
+	    ip, rp, file_dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if (pp != NULL && (t_ret = __memp_fput(mpf,
+	    ip, pp, file_dbp->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Free any allocated space. */
+	if (_lp != NULL)
+		__os_free(env, _lp);
+	if (_rp != NULL)
+		__os_free(env, _rp);
+	if (sp != NULL)
+		__os_free(env, sp);
+
+	REC_CLOSE;
+}
+/*
+ * __bam_split_42_recover --
+ *	Recovery function for split.
+ *
+ * PUBLIC: int __bam_split_42_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_split_42_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_split_42_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *_lp, *lp, *np, *pp, *_rp, *rp, *sp;
+	db_pgno_t pgno, root_pgno;
+	u_int32_t ptype;
+	int cmp, l_update, p_update, r_update, rc, ret, rootsplit, t_ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__bam_split_print);
+
+	_lp = lp = np = pp = _rp = rp = NULL;
+	sp = NULL;
+
+	REC_INTRO(__bam_split_42_read, ip, 0);
+
+	/*
+	 * There are two kinds of splits that we have to recover from.  The
+	 * first is a root-page split, where the root page is split from a
+	 * leaf page into an internal page and two new leaf pages are created.
+	 * The second is where a page is split into two pages, and a new key
+	 * is inserted into the parent page.
+	 *
+	 * DBTs are not aligned in log records, so we need to copy the page
+	 * so that we can access fields within it throughout this routine.
+	 * Although we could hardcode the unaligned copies in this routine,
+	 * we will be calling into regular btree functions with this page,
+	 * so it's got to be aligned.  Copying it into allocated memory is
+	 * the only way to guarantee this.
+	 */
+	if ((ret = __os_malloc(env, argp->pg.size, &sp)) != 0)
+		goto out;
+	memcpy(sp, argp->pg.data, argp->pg.size);
+
+	pgno = PGNO(sp);
+	root_pgno = argp->root_pgno;
+	rootsplit = root_pgno != PGNO_INVALID;
+	REC_FGET(mpf, ip, argp->left, &lp, right);
+right:	REC_FGET(mpf, ip, argp->right, &rp, redo);
+
+redo:	if (DB_REDO(op)) {
+		l_update = r_update = p_update = 0;
+		/*
+		 * Decide if we need to resplit the page.
+		 *
+		 * If this is a root split, then the root has to exist unless
+		 * we have truncated it due to a future deallocation.
+		 */
+		if (rootsplit) {
+			REC_FGET(mpf, ip, root_pgno, &pp, do_left);
+			cmp = LOG_COMPARE(&LSN(pp), &LSN(argp->pg.data));
+			CHECK_LSN(env, op,
+			    cmp, &LSN(pp), &LSN(argp->pg.data));
+			p_update = cmp == 0;
+		}
+
+do_left:	if (lp != NULL) {
+			cmp = LOG_COMPARE(&LSN(lp), &argp->llsn);
+			CHECK_LSN(env, op, cmp, &LSN(lp), &argp->llsn);
+			if (cmp == 0)
+				l_update = 1;
+		}
+
+		if (rp != NULL) {
+			cmp = LOG_COMPARE(&LSN(rp), &argp->rlsn);
+			CHECK_LSN(env, op, cmp, &LSN(rp), &argp->rlsn);
+			if (cmp == 0)
+				r_update = 1;
+		}
+
+		if (!p_update && !l_update && !r_update)
+			goto check_next;
+
+		/* Allocate and initialize new left/right child pages. */
+		if ((ret = __os_malloc(env, file_dbp->pgsize, &_lp)) != 0 ||
+		    (ret = __os_malloc(env, file_dbp->pgsize, &_rp)) != 0)
+			goto out;
+		if (rootsplit) {
+			P_INIT(_lp, file_dbp->pgsize, argp->left,
+			    PGNO_INVALID,
+			    ISINTERNAL(sp) ?
PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : argp->left, + PGNO_INVALID, LEVEL(sp), TYPE(sp)); + } else { + P_INIT(_lp, file_dbp->pgsize, PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : PREV_PGNO(sp), + ISINTERNAL(sp) ? PGNO_INVALID : argp->right, + LEVEL(sp), TYPE(sp)); + P_INIT(_rp, file_dbp->pgsize, argp->right, + ISINTERNAL(sp) ? PGNO_INVALID : sp->pgno, + ISINTERNAL(sp) ? PGNO_INVALID : NEXT_PGNO(sp), + LEVEL(sp), TYPE(sp)); + } + + /* Split the page. */ + if ((ret = __bam_copy(file_dbp, sp, _lp, 0, argp->indx)) != 0 || + (ret = __bam_copy(file_dbp, sp, _rp, argp->indx, + NUM_ENT(sp))) != 0) + goto out; + + if (l_update) { + REC_DIRTY(mpf, ip, file_dbp->priority, &lp); + memcpy(lp, _lp, file_dbp->pgsize); + lp->lsn = *lsnp; + if ((ret = __memp_fput(mpf, + ip, lp, file_dbp->priority)) != 0) + goto out; + lp = NULL; + } + + if (r_update) { + REC_DIRTY(mpf, ip, file_dbp->priority, &rp); + memcpy(rp, _rp, file_dbp->pgsize); + rp->lsn = *lsnp; + if ((ret = __memp_fput(mpf, + ip, rp, file_dbp->priority)) != 0) + goto out; + rp = NULL; + } + + /* + * If the parent page is wrong, update it. This is of interest + * only if it was a root split, since root splits create parent + * pages. All other splits modify a parent page, but those are + * separately logged and recovered. + */ + if (rootsplit && p_update) { + if (IS_BTREE_PAGE(sp)) { + ptype = P_IBTREE; + rc = argp->opflags & SPL_NRECS ? 1 : 0; + } else { + ptype = P_IRECNO; + rc = 1; + } + + REC_DIRTY(mpf, ip, file_dbp->priority, &pp); + P_INIT(pp, file_dbp->pgsize, root_pgno, + PGNO_INVALID, PGNO_INVALID, _lp->level + 1, ptype); + RE_NREC_SET(pp, rc ? __bam_total(file_dbp, _lp) + + __bam_total(file_dbp, _rp) : 0); + + pp->lsn = *lsnp; + if ((ret = __memp_fput(mpf, + ip, pp, file_dbp->priority)) != 0) + goto out; + pp = NULL; + } + +check_next: /* + * Finally, redo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. The next + * page must exist because we're redoing the operation. + */ + if (!rootsplit && argp->npgno != PGNO_INVALID) { + if ((ret = __memp_fget(mpf, &argp->npgno, + ip, NULL, 0, &np)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr( + file_dbp, argp->npgno, ret); + goto out; + } else + goto done; + } + cmp = LOG_COMPARE(&LSN(np), &argp->nlsn); + CHECK_LSN(env, op, cmp, &LSN(np), &argp->nlsn); + if (cmp == 0) { + REC_DIRTY(mpf, ip, file_dbp->priority, &np); + PREV_PGNO(np) = argp->right; + np->lsn = *lsnp; + if ((ret = __memp_fput(mpf, ip, + np, file_dbp->priority)) != 0) + goto out; + np = NULL; + } + } + } else { + /* + * If the split page is wrong, replace its contents with the + * logged page contents. If the page doesn't exist, it means + * that the create of the page never happened, nor did any of + * the adds onto the page that caused the split, and there's + * really no undo-ing to be done. + */ + if ((ret = __memp_fget(mpf, &pgno, ip, NULL, + DB_MPOOL_EDIT, &pp)) != 0) { + pp = NULL; + goto lrundo; + } + if (LOG_COMPARE(lsnp, &LSN(pp)) == 0) { + REC_DIRTY(mpf, ip, file_dbp->priority, &pp); + memcpy(pp, argp->pg.data, argp->pg.size); + if ((ret = __memp_fput(mpf, + ip, pp, file_dbp->priority)) != 0) + goto out; + pp = NULL; + } + + /* + * If it's a root split and the left child ever existed, update + * its LSN. 
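+		 * A page is rolled back only when its LSN equals this log
+		 * record's LSN, i.e. when this record made the last change
+		 * applied to the page.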
(If it's not a root split, we've updated the left + * page already -- it's the same as the split page.) If the + * right child ever existed, root split or not, update its LSN. + * The undo of the page allocation(s) will restore them to the + * free list. + */ +lrundo: if ((rootsplit && lp != NULL) || rp != NULL) { + if (rootsplit && lp != NULL && + LOG_COMPARE(lsnp, &LSN(lp)) == 0) { + REC_DIRTY(mpf, ip, file_dbp->priority, &lp); + lp->lsn = argp->llsn; + if ((ret = __memp_fput(mpf, ip, + lp, file_dbp->priority)) != 0) + goto out; + lp = NULL; + } + if (rp != NULL && + LOG_COMPARE(lsnp, &LSN(rp)) == 0) { + REC_DIRTY(mpf, ip, file_dbp->priority, &rp); + rp->lsn = argp->rlsn; + if ((ret = __memp_fput(mpf, ip, + rp, file_dbp->priority)) != 0) + goto out; + rp = NULL; + } + } + + /* + * Finally, undo the next-page link if necessary. This is of + * interest only if it wasn't a root split -- inserting a new + * page in the tree requires that any following page have its + * previous-page pointer updated to our new page. Since it's + * possible that the next-page never existed, we ignore it as + * if there's nothing to undo. + */ + if (!rootsplit && argp->npgno != PGNO_INVALID) { + if ((ret = __memp_fget(mpf, &argp->npgno, + ip, NULL, DB_MPOOL_EDIT, &np)) != 0) { + np = NULL; + goto done; + } + if (LOG_COMPARE(lsnp, &LSN(np)) == 0) { + REC_DIRTY(mpf, ip, file_dbp->priority, &np); + PREV_PGNO(np) = argp->left; + np->lsn = argp->nlsn; + if (__memp_fput(mpf, + ip, np, file_dbp->priority)) + goto out; + np = NULL; + } + } + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: /* Free any pages that weren't dirtied. */ + if (pp != NULL && (t_ret = __memp_fput(mpf, + ip, pp, file_dbp->priority)) != 0 && ret == 0) + ret = t_ret; + if (lp != NULL && (t_ret = __memp_fput(mpf, + ip, lp, file_dbp->priority)) != 0 && ret == 0) + ret = t_ret; + if (np != NULL && (t_ret = __memp_fput(mpf, + ip, np, file_dbp->priority)) != 0 && ret == 0) + ret = t_ret; + if (rp != NULL && (t_ret = __memp_fput(mpf, + ip, rp, file_dbp->priority)) != 0 && ret == 0) + ret = t_ret; + + /* Free any allocated space. */ + if (_lp != NULL) + __os_free(env, _lp); + if (_rp != NULL) + __os_free(env, _rp); + if (sp != NULL) + __os_free(env, sp); + + REC_CLOSE; +} + +/* + * __bam_rsplit_recover -- + * Recovery function for a reverse split. + * + * PUBLIC: int __bam_rsplit_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_rsplit_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_rsplit_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_LSN copy_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno, root_pgno; + db_recno_t rcnt; + int cmp_n, cmp_p, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__bam_rsplit_print); + REC_INTRO(__bam_rsplit_read, ip, 1); + + /* Fix the root page. */ + pgno = root_pgno = argp->root_pgno; + if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, pgno, ret); + goto out; + } else + goto do_page; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->rootlsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->rootlsn); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* + * Copy the new data to the root page. If it is not now a + * leaf page we need to restore the record number. 
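+		 * (The count is part of the page image, so the memcpy below
+		 * would overwrite it; save it first and put it back after.)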
We could + * try to determine if C_RECNUM was set in the btree, but + * that's not really necessary since the field is not used + * otherwise. + */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + rcnt = RE_NREC(pagep); + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + if (LEVEL(pagep) > LEAFLEVEL) + RE_NREC_SET(pagep, rcnt); + pagep->pgno = root_pgno; + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + P_INIT(pagep, file_dbp->pgsize, root_pgno, + argp->nrec, PGNO_INVALID, pagep->level + 1, + IS_BTREE_PAGE(pagep) ? P_IBTREE : P_IRECNO); + if ((ret = __db_pitem(dbc, pagep, 0, + argp->rootent.size, &argp->rootent, NULL)) != 0) + goto out; + pagep->lsn = argp->rootlsn; + } + if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) + goto out; + +do_page: + /* + * Fix the page copied over the root page. It's possible that the + * page never made it to disk, or was truncated so if the page + * doesn't exist, it's okay and there's nothing further to do. + */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + (void)__ua_memcpy(©_lsn, &LSN(argp->pgdbt.data), sizeof(DB_LSN)); + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), ©_lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), ©_lsn); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + memcpy(pagep, argp->pgdbt.data, argp->pgdbt.size); + } + if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, dbc->priority); + REC_CLOSE; +} + +/* + * __bam_adj_recover -- + * Recovery function for adj. + * + * PUBLIC: int __bam_adj_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_adj_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_adj_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__bam_adj_print); + REC_INTRO(__bam_adj_read, ip, 1); + + /* Get the page; if it never existed and we're undoing, we're done. */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if ((ret = __bam_adjindx(dbc, + pagep, argp->indx, argp->indx_copy, argp->is_insert)) != 0) + goto out; + + LSN(pagep) = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. 
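+		   Undoing an index adjustment is just the inverse
+		   adjustment, so __bam_adjindx runs again with is_insert
+		   inverted.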
*/ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if ((ret = __bam_adjindx(dbc, + pagep, argp->indx, argp->indx_copy, !argp->is_insert)) != 0) + goto out; + + LSN(pagep) = argp->lsn; + } + if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, dbc->priority); + REC_CLOSE; +} + +/* + * __bam_cadjust_recover -- + * Recovery function for the adjust of a count change in an internal + * page. + * + * PUBLIC: int __bam_cadjust_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_cadjust_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_cadjust_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__bam_cadjust_print); + REC_INTRO(__bam_cadjust_read, ip, 0); + + /* Get the page; if it never existed and we're undoing, we're done. */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + if (IS_BTREE_PAGE(pagep)) { + GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs += + argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } else { + GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs += + argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, argp->adjust); + } + + LSN(pagep) = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + if (IS_BTREE_PAGE(pagep)) { + GET_BINTERNAL(file_dbp, pagep, argp->indx)->nrecs -= + argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, -(argp->adjust)); + } else { + GET_RINTERNAL(file_dbp, pagep, argp->indx)->nrecs -= + argp->adjust; + if (argp->opflags & CAD_UPDATEROOT) + RE_NREC_ADJ(pagep, -(argp->adjust)); + } + LSN(pagep) = argp->lsn; + } + if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + REC_CLOSE; +} + +/* + * __bam_cdel_recover -- + * Recovery function for the intent-to-delete of a cursor record. + * + * PUBLIC: int __bam_cdel_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_cdel_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_cdel_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + u_int32_t indx; + int cmp_n, cmp_p, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__bam_cdel_print); + REC_INTRO(__bam_cdel_read, ip, 0); + + /* Get the page; if it never existed and we're undoing, we're done. 
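+	   Only DB_PAGE_NOTFOUND is treated as benign here; any other
+	   fetch error aborts recovery.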
*/ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0); + B_DSET(GET_BKEYDATA(file_dbp, pagep, indx)->type); + + LSN(pagep) = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + indx = argp->indx + (TYPE(pagep) == P_LBTREE ? O_INDX : 0); + B_DCLR(GET_BKEYDATA(file_dbp, pagep, indx)->type); + + if ((ret = __bam_ca_delete( + file_dbp, argp->pgno, argp->indx, 0, NULL)) != 0) + goto out; + + LSN(pagep) = argp->lsn; + } + if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + REC_CLOSE; +} + +/* + * __bam_repl_recover -- + * Recovery function for page item replacement. + * + * PUBLIC: int __bam_repl_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_repl_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_repl_args *argp; + DB_THREAD_INFO *ip; + BKEYDATA *bk; + BINTERNAL *bi; + DB *file_dbp; + DBC *dbc; + DBT dbt; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, ret; + u_int32_t len; + u_int8_t *dp, *p; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__bam_repl_print); + REC_INTRO(__bam_repl_read, ip, 1); + + /* Get the page; if it never existed and we're undoing, we're done. */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* + * Need to redo update described. + * + * Re-build the replacement item. + */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (TYPE(pagep) == P_IBTREE) { + /* Point at the internal struct past the type. */ + bi = GET_BINTERNAL(file_dbp, pagep, argp->indx); + dp = &bi->unused; + len = bi->len + + SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused); + } else { + bk = GET_BKEYDATA(file_dbp, pagep, argp->indx); + dp = bk->data; + len = bk->len; + } + memset(&dbt, 0, sizeof(dbt)); + dbt.size = argp->prefix + argp->suffix + argp->repl.size; + if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0) + goto out; + p = dbt.data; + memcpy(p, dp, argp->prefix); + p += argp->prefix; + memcpy(p, argp->repl.data, argp->repl.size); + p += argp->repl.size; + memcpy(p, dp + (len - argp->suffix), argp->suffix); + + /* isdeleted has become the type flag for non-leaf replace */ + ret = __bam_ritem(dbc, + pagep, argp->indx, &dbt, argp->isdeleted); + __os_free(env, dbt.data); + if (ret != 0) + goto out; + + LSN(pagep) = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* + * Need to undo update described. 
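+		 * Only the replaced byte range was logged: argp->prefix and
+		 * argp->suffix bytes of the item were untouched, so the
+		 * original is stitched together from the on-page copy and
+		 * argp->orig.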
+ * + * Re-build the original item. + */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (TYPE(pagep) == P_IBTREE) { + /* Point at the internal struct past the type. */ + bi = GET_BINTERNAL(file_dbp, pagep, argp->indx); + dp = &bi->unused; + len = bi->len + + SSZA(BINTERNAL, data) - SSZ(BINTERNAL, unused); + } else { + bk = GET_BKEYDATA(file_dbp, pagep, argp->indx); + dp = bk->data; + len = bk->len; + } + memset(&dbt, 0, sizeof(dbt)); + dbt.size = argp->prefix + argp->suffix + argp->orig.size; + if ((ret = __os_malloc(env, dbt.size, &dbt.data)) != 0) + goto out; + p = dbt.data; + memcpy(p, dp, argp->prefix); + p += argp->prefix; + memcpy(p, argp->orig.data, argp->orig.size); + p += argp->orig.size; + memcpy(p, dp + (len - argp->suffix), argp->suffix); + + ret = __bam_ritem(dbc, + pagep, argp->indx, &dbt, argp->isdeleted); + __os_free(env, dbt.data); + if (ret != 0) + goto out; + + /* Reset the deleted flag, if necessary. */ + if (argp->isdeleted && LEVEL(pagep) == LEAFLEVEL) + B_DSET(GET_BKEYDATA(file_dbp, pagep, argp->indx)->type); + + LSN(pagep) = argp->lsn; + } + if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, dbc->priority); + REC_CLOSE; +} + +/* + * __bam_root_recover -- + * Recovery function for setting the root page on the meta-data page. + * + * PUBLIC: int __bam_root_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_root_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_root_args *argp; + DB_THREAD_INFO *ip; + BTMETA *meta; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + int cmp_n, cmp_p, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + meta = NULL; + REC_PRINT(__bam_root_print); + REC_INTRO(__bam_root_read, ip, 0); + + if ((ret = __memp_fget(mpf, &argp->meta_pgno, ip, NULL, + 0, &meta)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->meta_pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(meta)); + cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn); + CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn); + CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &meta); + meta->root = argp->root_pgno; + meta->dbmeta.lsn = *lsnp; + ((BTREE *)file_dbp->bt_internal)->bt_root = meta->root; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Nothing to undo except lsn. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &meta); + meta->dbmeta.lsn = argp->meta_lsn; + } + if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0) + goto out; + meta = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (meta != NULL) + (void)__memp_fput(mpf, ip, meta, file_dbp->priority); + REC_CLOSE; +} + +/* + * __bam_curadj_recover -- + * Transaction abort function to undo cursor adjustments. + * This should only be triggered by subtransaction aborts. 
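+ *	A parent transaction's cursors survive a child's abort, so the
+ *	cursor adjustments the child made have to be walked back here.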
+ *
+ * PUBLIC: int __bam_curadj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_curadj_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_curadj_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	int ret;
+
+	COMPQUIET(mpf, NULL);
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__bam_curadj_print);
+	REC_INTRO(__bam_curadj_read, ip, 1);
+
+	ret = 0;
+	if (op != DB_TXN_ABORT)
+		goto done;
+
+	switch (argp->mode) {
+	case DB_CA_DI:
+		if ((ret = __bam_ca_di(dbc, argp->from_pgno,
+		    argp->from_indx, -(int)argp->first_indx)) != 0)
+			goto out;
+		break;
+	case DB_CA_DUP:
+		if ((ret = __bam_ca_undodup(file_dbp, argp->first_indx,
+		    argp->from_pgno, argp->from_indx, argp->to_indx)) != 0)
+			goto out;
+		break;
+
+	case DB_CA_RSPLIT:
+		if ((ret =
+		    __bam_ca_rsplit(dbc, argp->to_pgno, argp->from_pgno)) != 0)
+			goto out;
+		break;
+
+	case DB_CA_SPLIT:
+		if ((ret = __bam_ca_undosplit(file_dbp, argp->from_pgno,
+		    argp->to_pgno, argp->left_pgno, argp->from_indx)) != 0)
+			goto out;
+		break;
+	}
+
+done:	*lsnp = argp->prev_lsn;
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_rcuradj_recover --
+ *	Transaction abort function to undo cursor adjustments in rrecno.
+ *	This should only be triggered by subtransaction aborts.
+ *
+ * PUBLIC: int __bam_rcuradj_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_rcuradj_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_rcuradj_args *argp;
+	DB_THREAD_INFO *ip;
+	BTREE_CURSOR *cp;
+	DB *file_dbp;
+	DBC *dbc, *rdbc;
+	DB_MPOOLFILE *mpf;
+	int ret, t_ret;
+
+	COMPQUIET(mpf, NULL);
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	rdbc = NULL;
+	REC_PRINT(__bam_rcuradj_print);
+	REC_INTRO(__bam_rcuradj_read, ip, 1);
+
+	ret = t_ret = 0;
+
+	if (op != DB_TXN_ABORT)
+		goto done;
+
+	/*
+	 * We don't know whether we're in an offpage dup set, and
+	 * thus don't know whether the dbc that REC_INTRO has handed
+	 * us is of a reasonable type.  It's certainly unset, so if
+	 * this is an offpage dup set, we don't have an OPD cursor.
+	 * The simplest solution is just to allocate a whole new cursor
+	 * for our use; we're only really using it to pass some
+	 * state into __ram_ca, and this way we don't need to make
+	 * this function know anything about how offpage dups work.
+	 */
+	if ((ret = __db_cursor_int(file_dbp, NULL,
+	    NULL, DB_RECNO, argp->root, 0, NULL, &rdbc)) != 0)
+		goto out;
+
+	cp = (BTREE_CURSOR *)rdbc->internal;
+	F_SET(cp, C_RENUMBER);
+	cp->recno = argp->recno;
+
+	switch (argp->mode) {
+	case CA_DELETE:
+		/*
+		 * The way to undo a delete is with an insert.  Since
+		 * we're undoing it, the delete flag must be set.
+		 */
+		F_SET(cp, C_DELETED);
+		F_SET(cp, C_RENUMBER);	/* Just in case. */
+		cp->order = argp->order;
+		if ((ret = __ram_ca(rdbc, CA_ICURRENT, NULL)) != 0)
+			goto out;
+		break;
+	case CA_IAFTER:
+	case CA_IBEFORE:
+	case CA_ICURRENT:
+		/*
+		 * The way to undo an insert is with a delete.  The delete
+		 * flag is unset to start with.
+		 */
+		F_CLR(cp, C_DELETED);
+		cp->order = INVALID_ORDER;
+		if ((ret = __ram_ca(rdbc, CA_DELETE, NULL)) != 0)
+			goto out;
+		break;
+	}
+
+done:	*lsnp = argp->prev_lsn;
+out:	if (rdbc != NULL && (t_ret = __dbc_close(rdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	REC_CLOSE;
+}
+
+/*
+ * __bam_relink_recover --
+ *	Recovery function for relink.
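+ *	A relink repairs the prev/next pointers of the pages on either
+ *	side of a page removed from, or replaced in, the leaf chain.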
+ * + * PUBLIC: int __bam_relink_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_relink_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_relink_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__bam_relink_print); + REC_INTRO(__bam_relink_read, ip, 0); + + /* + * There are up to three pages we need to check -- the page, and the + * previous and next pages, if they existed. For a page add operation, + * the current page is the result of a split and is being recovered + * elsewhere, so all we need do is recover the next page. + */ + if (argp->next == PGNO_INVALID) + goto prev; + if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->next, ret); + goto out; + } else + goto prev; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the remove or replace. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + if (argp->new_pgno == PGNO_INVALID) + pagep->prev_pgno = argp->prev; + else + pagep->prev_pgno = argp->new_pgno; + + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Undo the remove or replace. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->prev_pgno = argp->pgno; + + pagep->lsn = argp->lsn_next; + } + + if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +prev: if (argp->prev == PGNO_INVALID) + goto done; + if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->prev, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + if (argp->new_pgno == PGNO_INVALID) + pagep->next_pgno = argp->next; + else + pagep->next_pgno = argp->new_pgno; + + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Undo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->pgno; + pagep->lsn = argp->lsn_prev; + } + + if ((ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + REC_CLOSE; +} + +/* + * __bam_merge_44_recover -- + * Recovery function for merge. 
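+ *	This variant replays the older __bam_merge_44 log record layout;
+ *	the current format is handled by __bam_merge_recover below.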
+ * + * PUBLIC: int __bam_merge_44_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_merge_44_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_merge_44_args *argp; + DB_THREAD_INFO *ip; + BKEYDATA *bk; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_indx_t indx, *ninp, *pinp; + u_int32_t size; + u_int8_t *bp; + int cmp_n, cmp_p, i, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__bam_merge_44_print); + REC_INTRO(__bam_merge_44_read, ip, 1); + + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto next; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn); + + if (cmp_p == 0 && DB_REDO(op)) { + /* + * If the header is provided the page is empty, copy the + * needed data. + */ + DB_ASSERT(env, argp->hdr.size == 0 || NUM_ENT(pagep) == 0); + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (argp->hdr.size != 0) { + P_INIT(pagep, file_dbp->pgsize, pagep->pgno, + PREV_PGNO(argp->hdr.data), + NEXT_PGNO(argp->hdr.data), + LEVEL(argp->hdr.data), TYPE(argp->hdr.data)); + } + if (TYPE(pagep) == P_OVERFLOW) { + OV_REF(pagep) = OV_REF(argp->hdr.data); + OV_LEN(pagep) = OV_LEN(argp->hdr.data); + bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp); + memcpy(bp, argp->data.data, argp->data.size); + } else { + /* Copy the data segment. */ + bp = (u_int8_t *)pagep + + (db_indx_t)(HOFFSET(pagep) - argp->data.size); + memcpy(bp, argp->data.data, argp->data.size); + + /* Copy index table offset past the current entries. */ + pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep); + ninp = argp->ind.data; + for (i = 0; + i < (int)(argp->ind.size / sizeof(*ninp)); i++) + *pinp++ = *ninp++ + - (file_dbp->pgsize - HOFFSET(pagep)); + HOFFSET(pagep) -= argp->data.size; + NUM_ENT(pagep) += i; + } + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && !DB_REDO(op)) { + /* + * Since logging is logical at the page level + * we cannot just truncate the data space. Delete + * the proper number of items from the logical end + * of the page. 
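+		 * Entries whose index slot matches the previous slot share a
+		 * single on-page item and are dropped by decrementing
+		 * NUM_ENT; everything else is removed with __db_ditem.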
+ */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + for (i = 0; i < (int)(argp->ind.size / sizeof(*ninp)); i++) { + indx = NUM_ENT(pagep) - 1; + if (P_INP(file_dbp, pagep)[indx] == + P_INP(file_dbp, pagep)[indx - P_INDX]) { + NUM_ENT(pagep)--; + continue; + } + switch (TYPE(pagep)) { + case P_LBTREE: + case P_LRECNO: + case P_LDUP: + bk = GET_BKEYDATA(file_dbp, pagep, indx); + size = BITEM_SIZE(bk); + break; + + case P_IBTREE: + size = BINTERNAL_SIZE( + GET_BINTERNAL(file_dbp, pagep, indx)->len); + break; + case P_IRECNO: + size = RINTERNAL_SIZE; + break; + + default: + ret = __db_pgfmt(env, PGNO(pagep)); + goto out; + } + if ((ret = + __db_ditem(dbc, pagep, indx, size)) != 0) + goto out; + } + if (argp->ind.size == 0) + HOFFSET(pagep) = file_dbp->pgsize; + pagep->lsn = argp->lsn; + } + + if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) + goto out; + +next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn); + CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn); + + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to truncate the page. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + HOFFSET(pagep) = file_dbp->pgsize; + NUM_ENT(pagep) = 0; + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && !DB_REDO(op)) { + /* Need to put the data back on the page. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (TYPE(pagep) == P_OVERFLOW) { + OV_REF(pagep) = OV_REF(argp->hdr.data); + OV_LEN(pagep) = OV_LEN(argp->hdr.data); + bp = (u_int8_t *) pagep + P_OVERHEAD(file_dbp); + memcpy(bp, argp->data.data, argp->data.size); + } else { + bp = (u_int8_t *)pagep + + (db_indx_t)(HOFFSET(pagep) - argp->data.size); + memcpy(bp, argp->data.data, argp->data.size); + + /* Copy index table. */ + pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep); + ninp = argp->ind.data; + for (i = 0; + i < (int)(argp->ind.size / sizeof(*ninp)); i++) + *pinp++ = *ninp++; + HOFFSET(pagep) -= argp->data.size; + NUM_ENT(pagep) = i; + } + pagep->lsn = argp->nlsn; + } + + if ((ret = __memp_fput(mpf, + ip, pagep, dbc->priority)) != 0) + goto out; +done: + *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __bam_merge_recover -- + * Recovery function for merge. + * + * PUBLIC: int __bam_merge_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__bam_merge_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __bam_merge_args *argp; + DB_THREAD_INFO *ip; + BKEYDATA *bk; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_indx_t indx, *ninp, *pinp; + u_int32_t size; + u_int8_t *bp; + int cmp_n, cmp_p, i, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__bam_merge_print); + REC_INTRO(__bam_merge_read, ip, 1); + + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto next; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn); + CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp); + + if (cmp_p == 0 && DB_REDO(op)) { + /* + * When pg_copy is set, we are copying onto a new page. 
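+		 * The logged header DBT carries the new page's prev/next
+		 * links, level and type, so P_INIT can rebuild it from
+		 * scratch.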
+ */ + DB_ASSERT(env, !argp->pg_copy || NUM_ENT(pagep) == 0); + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (argp->pg_copy) { + P_INIT(pagep, file_dbp->pgsize, pagep->pgno, + PREV_PGNO(argp->hdr.data), + NEXT_PGNO(argp->hdr.data), + LEVEL(argp->hdr.data), TYPE(argp->hdr.data)); + } + if (TYPE(pagep) == P_OVERFLOW) { + OV_REF(pagep) = OV_REF(argp->hdr.data); + OV_LEN(pagep) = OV_LEN(argp->hdr.data); + bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp); + memcpy(bp, argp->data.data, argp->data.size); + } else { + /* Copy the data segment. */ + bp = (u_int8_t *)pagep + + (db_indx_t)(HOFFSET(pagep) - argp->data.size); + memcpy(bp, argp->data.data, argp->data.size); + + /* Copy index table offset past the current entries. */ + pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep); + ninp = P_INP(file_dbp, argp->hdr.data); + for (i = 0; i < NUM_ENT(argp->hdr.data); i++) + *pinp++ = *ninp++ + - (file_dbp->pgsize - HOFFSET(pagep)); + HOFFSET(pagep) -= argp->data.size; + NUM_ENT(pagep) += i; + } + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && !DB_REDO(op)) { + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (TYPE(pagep) == P_OVERFLOW) { + HOFFSET(pagep) = file_dbp->pgsize; + goto setlsn; + } + + /* + * Since logging is logical at the page level we cannot just + * truncate the data space. Delete the proper number of items + * from the logical end of the page. + */ + for (i = 0; i < NUM_ENT(argp->hdr.data); i++) { + indx = NUM_ENT(pagep) - 1; + if (P_INP(file_dbp, pagep)[indx] == + P_INP(file_dbp, pagep)[indx - P_INDX]) { + NUM_ENT(pagep)--; + continue; + } + switch (TYPE(pagep)) { + case P_LBTREE: + case P_LRECNO: + case P_LDUP: + bk = GET_BKEYDATA(file_dbp, pagep, indx); + size = BITEM_SIZE(bk); + break; + + case P_IBTREE: + size = BINTERNAL_SIZE( + GET_BINTERNAL(file_dbp, pagep, indx)->len); + break; + case P_IRECNO: + size = RINTERNAL_SIZE; + break; + + default: + ret = __db_pgfmt(env, PGNO(pagep)); + goto out; + } + if ((ret = __db_ditem(dbc, pagep, indx, size)) != 0) + goto out; + } +setlsn: pagep->lsn = argp->lsn; + } + + if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0) + goto out; + +next: if ((ret = __memp_fget(mpf, &argp->npgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto done; + } + + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nlsn); + CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->nlsn); + + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to truncate the page. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + HOFFSET(pagep) = file_dbp->pgsize; + NUM_ENT(pagep) = 0; + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && !DB_REDO(op)) { + /* Need to put the data back on the page. */ + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (TYPE(pagep) == P_OVERFLOW) { + OV_REF(pagep) = OV_REF(argp->hdr.data); + OV_LEN(pagep) = OV_LEN(argp->hdr.data); + bp = (u_int8_t *)pagep + P_OVERHEAD(file_dbp); + memcpy(bp, argp->data.data, argp->data.size); + } else { + bp = (u_int8_t *)pagep + + (db_indx_t)(HOFFSET(pagep) - argp->data.size); + memcpy(bp, argp->data.data, argp->data.size); + + /* Copy index table. 
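+			   The logged offsets are already relative to this
+			   page, so they are copied back unadjusted.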
*/
+			pinp = P_INP(file_dbp, pagep) + NUM_ENT(pagep);
+			ninp = P_INP(file_dbp, argp->hdr.data);
+			for (i = 0; i < NUM_ENT(argp->hdr.data); i++)
+				*pinp++ = *ninp++;
+			HOFFSET(pagep) -= argp->data.size;
+			NUM_ENT(pagep) += i;
+		}
+		pagep->lsn = argp->nlsn;
+	}
+
+	if ((ret = __memp_fput(mpf,
+	    ip, pagep, dbc->priority)) != 0)
+		goto out;
+done:
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_pgno_recover --
+ *	Recovery function for page number replacement.
+ *
+ * PUBLIC: int __bam_pgno_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_pgno_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	BINTERNAL *bi;
+	__bam_pgno_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep, *npagep;
+	db_pgno_t *pgnop;
+	int cmp_n, cmp_p, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	REC_PRINT(__bam_pgno_print);
+	REC_INTRO(__bam_pgno_read, ip, 0);
+
+	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
+
+	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
+	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn);
+	CHECK_LSN(file_dbp->env, op, cmp_p, &LSN(pagep), &argp->lsn);
+	CHECK_ABORT(file_dbp->env, op, cmp_n, &LSN(pagep), lsnp);
+
+	if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && !DB_REDO(op))) {
+		switch (TYPE(pagep)) {
+		case P_IBTREE:
+			/*
+			 * An internal record can have both an overflow
+			 * and a child pointer.  Fetch the page to see
+			 * which it is.
+			 */
+			bi = GET_BINTERNAL(file_dbp, pagep, argp->indx);
+			if (B_TYPE(bi->type) == B_OVERFLOW) {
+				REC_FGET(mpf, ip, argp->npgno, &npagep, out);
+
+				if (TYPE(npagep) == P_OVERFLOW)
+					pgnop =
+					    &((BOVERFLOW *)(bi->data))->pgno;
+				else
+					pgnop = &bi->pgno;
+				if ((ret = __memp_fput(mpf, ip,
+				    npagep, file_dbp->priority)) != 0)
+					goto out;
+				break;
+			}
+			pgnop = &bi->pgno;
+			break;
+		case P_IRECNO:
+			pgnop =
+			    &GET_RINTERNAL(file_dbp, pagep, argp->indx)->pgno;
+			break;
+		default:
+			pgnop =
+			    &GET_BOVERFLOW(file_dbp, pagep, argp->indx)->pgno;
+			break;
+		}
+
+		if (DB_REDO(op)) {
+			/* Need to redo update described. */
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			*pgnop = argp->npgno;
+			pagep->lsn = *lsnp;
+		} else {
+			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
+			*pgnop = argp->opgno;
+			pagep->lsn = argp->lsn;
+		}
+	}
+
+	if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
+		goto out;
+
+done:
+	*lsnp = argp->prev_lsn;
+	ret = 0;
+
+out:	REC_CLOSE;
+}
+
+/*
+ * __bam_relink_43_recover --
+ *	Recovery function for relink.
+ *
+ * PUBLIC: int __bam_relink_43_recover
+ * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
+ */
+int
+__bam_relink_43_recover(env, dbtp, lsnp, op, info)
+	ENV *env;
+	DBT *dbtp;
+	DB_LSN *lsnp;
+	db_recops op;
+	void *info;
+{
+	__bam_relink_43_args *argp;
+	DB_THREAD_INFO *ip;
+	DB *file_dbp;
+	DBC *dbc;
+	DB_MPOOLFILE *mpf;
+	PAGE *pagep;
+	int cmp_n, cmp_p, modified, ret;
+
+	ip = ((DB_TXNHEAD *)info)->thread_info;
+	pagep = NULL;
+	REC_PRINT(__bam_relink_43_print);
+	REC_INTRO(__bam_relink_43_read, ip, 0);
+
+	/*
+	 * There are up to three pages we need to check -- the page, and the
+	 * previous and next pages, if they existed.  For a page add operation,
+	 * the current page is the result of a split and is being recovered
+	 * elsewhere, so all we need do is recover the next page.
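+	 * This older record format also covers the page itself, so there
+	 * can be one more page to roll forward or back here than with
+	 * the current relink record.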
+ */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } else + goto next2; + } + + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->lsn = *lsnp; + } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) { + /* Undo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->next; + pagep->prev_pgno = argp->prev; + pagep->lsn = argp->lsn; + } + if ((ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->next, ret); + goto out; + } else + goto prev; + } + + modified = 0; + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the remove or undo the add. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->prev_pgno = argp->prev; + modified = 1; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Undo the remove or redo the add. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->prev_pgno = argp->pgno; + modified = 1; + } + if (modified) { + if (DB_UNDO(op)) + pagep->lsn = argp->lsn_next; + else + pagep->lsn = *lsnp; + } + if ((ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) { + if (ret != DB_PAGE_NOTFOUND) { + ret = __db_pgerr(file_dbp, argp->prev, ret); + goto out; + } else + goto done; + } + + modified = 0; + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->next; + modified = 1; + } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) { + /* Undo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->pgno; + modified = 1; + } + if (modified) { + if (DB_UNDO(op)) + pagep->lsn = argp->lsn_prev; + else + pagep->lsn = *lsnp; + } + if ((ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + REC_CLOSE; +} diff --git a/btree/bt_reclaim.c b/btree/bt_reclaim.c new file mode 100644 index 0000000..835bf9f --- /dev/null +++ b/btree/bt_reclaim.c @@ -0,0 +1,97 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" + +/* + * __bam_reclaim -- + * Free a database. + * + * PUBLIC: int __bam_reclaim __P((DB *, DB_THREAD_INFO *, DB_TXN *)); + */ +int +__bam_reclaim(dbp, ip, txn) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; +{ + DBC *dbc; + DB_LOCK meta_lock; + int ret, t_ret; + + /* Acquire a cursor. 
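+	   Page frees below go through this cursor, so they run in the
+	   caller's transaction and thread context.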
*/
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+		return (ret);
+
+	/* Write lock the metapage for deallocations. */
+	if ((ret = __db_lget(dbc,
+	    0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &meta_lock)) != 0)
+		goto err;
+
+	/* Avoid locking every page; we have the handle locked exclusive. */
+	F_SET(dbc, DBC_DONTLOCK);
+
+	/* Walk the tree, freeing pages. */
+	ret = __bam_traverse(dbc,
+	    DB_LOCK_WRITE, dbc->internal->root, __db_reclaim_callback, NULL);
+
+	if ((t_ret = __TLPUT(dbc, meta_lock)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Discard the cursor. */
+err:	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+
+/*
+ * __bam_truncate --
+ *	Truncate a database.
+ *
+ * PUBLIC: int __bam_truncate __P((DBC *, u_int32_t *));
+ */
+int
+__bam_truncate(dbc, countp)
+	DBC *dbc;
+	u_int32_t *countp;
+{
+	u_int32_t count;
+	int ret;
+
+#ifdef HAVE_COMPRESSION
+	u_int32_t comp_count;
+
+	comp_count = 0;
+	if (DB_IS_COMPRESSED(dbc->dbp) &&
+	    (ret = __bam_compress_count(dbc, NULL, &comp_count)) != 0)
+		return (ret);
+#endif
+
+	count = 0;
+
+	/* Walk the tree, freeing pages. */
+	ret = __bam_traverse(dbc,
+	    DB_LOCK_WRITE, dbc->internal->root, __db_truncate_callback, &count);
+
+#ifdef HAVE_COMPRESSION
+	if (DB_IS_COMPRESSED(dbc->dbp)) {
+		if (countp != NULL)
+			*countp = comp_count;
+	} else
+#endif
+	if (countp != NULL)
+		*countp = count;
+
+	return (ret);
+}
diff --git a/btree/bt_recno.c b/btree/bt_recno.c
new file mode 100644
index 0000000..524de46
--- /dev/null
+++ b/btree/bt_recno.c
@@ -0,0 +1,1385 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1997-2009 Oracle.  All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+
+static int __ram_add __P((DBC *, db_recno_t *, DBT *, u_int32_t, u_int32_t));
+static int __ram_source __P((DB *));
+static int __ram_sread __P((DBC *, db_recno_t));
+static int __ram_update __P((DBC *, db_recno_t, int));
+
+/*
+ * In recno, there are two meanings to the on-page "deleted" flag.  If we're
+ * re-numbering records, it means the record was implicitly created.  We skip
+ * over implicitly created records if doing a cursor "next" or "prev", and
+ * return DB_KEYEMPTY if they're explicitly requested.  If not re-numbering
+ * records, it means that the record was implicitly created, or was deleted.
+ * We skip over implicitly created or deleted records if doing a cursor "next"
+ * or "prev", and return DB_KEYEMPTY if they're explicitly requested.
+ *
+ * If we're re-numbering records, then we have to detect in the cursor that
+ * a record was deleted, and adjust the cursor as necessary on the next get.
+ * If we're not re-numbering records, then we can detect that a record has
+ * been deleted by looking at the actual on-page record, so we completely
+ * ignore the cursor's delete flag.  This is different from the B+tree code,
+ * which also tracks whether the cursor references a deleted record in the
+ * cursor itself, and doesn't always check the on-page value.
+ */
+#define	CD_SET(cp) {							\
+	if (F_ISSET(cp, C_RENUMBER))					\
+		F_SET(cp, C_DELETED);					\
+}
+#define	CD_CLR(cp) {							\
+	if (F_ISSET(cp, C_RENUMBER)) {					\
+		F_CLR(cp, C_DELETED);					\
+		cp->order = INVALID_ORDER;				\
+	}								\
+}
+#define	CD_ISSET(cp)							\
+	(F_ISSET(cp, C_RENUMBER) && F_ISSET(cp, C_DELETED) ? 1 : 0)
+
+/*
+ * Macros for comparing the ordering of two cursors.
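+ * In a renumbering recno tree several cursors can share a record
+ * number after deletes, so the ordering uses the recno, the deleted
+ * flag and a per-delete "order" value as a tie-breaker.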
+ * cp1 comes before cp2 iff one of the following holds: + * cp1's recno is less than cp2's recno + * recnos are equal, both deleted, and cp1's order is less than cp2's + * recnos are equal, cp1 deleted, and cp2 not deleted + */ +#define C_LESSTHAN(cp1, cp2) \ + (((cp1)->recno < (cp2)->recno) || \ + (((cp1)->recno == (cp2)->recno) && \ + ((CD_ISSET((cp1)) && CD_ISSET((cp2)) && (cp1)->order < (cp2)->order) || \ + (CD_ISSET((cp1)) && !CD_ISSET((cp2)))))) + +/* + * cp1 is equal to cp2 iff their recnos and delete flags are identical, + * and if the delete flag is set their orders are also identical. + */ +#define C_EQUAL(cp1, cp2) \ + ((cp1)->recno == (cp2)->recno && CD_ISSET((cp1)) == CD_ISSET((cp2)) && \ + (!CD_ISSET((cp1)) || (cp1)->order == (cp2)->order)) + +/* + * Do we need to log the current cursor adjustment? + */ +#define CURADJ_LOG(dbc) \ + (DBC_LOGGING((dbc)) && (dbc)->txn != NULL && (dbc)->txn->parent != NULL) + +/* + * After a search, copy the found page into the cursor, discarding any + * currently held lock. + */ +#define STACK_TO_CURSOR(cp, ret) { \ + int __t_ret; \ + (cp)->page = (cp)->csp->page; \ + (cp)->pgno = (cp)->csp->page->pgno; \ + (cp)->indx = (cp)->csp->indx; \ + if ((__t_ret = __TLPUT(dbc, (cp)->lock)) != 0 && (ret) == 0) \ + ret = __t_ret; \ + (cp)->lock = (cp)->csp->lock; \ + (cp)->lock_mode = (cp)->csp->lock_mode; \ +} + +/* + * __ram_open -- + * Recno open function. + * + * PUBLIC: int __ram_open __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, db_pgno_t, u_int32_t)); + */ +int +__ram_open(dbp, ip, txn, name, base_pgno, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name; + db_pgno_t base_pgno; + u_int32_t flags; +{ + BTREE *t; + DBC *dbc; + int ret, t_ret; + + COMPQUIET(name, NULL); + t = dbp->bt_internal; + + /* Start up the tree. */ + if ((ret = __bam_read_root(dbp, ip, txn, base_pgno, flags)) != 0) + return (ret); + + /* + * If the user specified a source tree, open it and map it in. + * + * !!! + * We don't complain if the user specified transactions or threads. + * It's possible to make it work, but you'd better know what you're + * doing! + */ + if (t->re_source != NULL && (ret = __ram_source(dbp)) != 0) + return (ret); + + /* If we're snapshotting an underlying source file, do it now. */ + if (F_ISSET(dbp, DB_AM_SNAPSHOT)) { + /* Allocate a cursor. */ + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + return (ret); + + /* Do the snapshot. */ + if ((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0 && ret == DB_NOTFOUND) + ret = 0; + + /* Discard the cursor. */ + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + } + + return (ret); +} + +/* + * __ram_append -- + * Recno append function. + * + * PUBLIC: int __ram_append __P((DBC *, DBT *, DBT *)); + */ +int +__ram_append(dbc, key, data) + DBC *dbc; + DBT *key, *data; +{ + BTREE_CURSOR *cp; + int ret; + + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * Make sure we've read in all of the backing source file. If + * we found the record or it simply didn't exist, add the + * user's record. + */ + ret = __ram_update(dbc, DB_MAX_RECORDS, 0); + if (ret == 0 || ret == DB_NOTFOUND) + ret = __ram_add(dbc, &cp->recno, data, DB_APPEND, 0); + + /* Return the record number. */ + if (ret == 0 && key != NULL) + ret = __db_retcopy(dbc->env, key, &cp->recno, + sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen); + + return (ret); +} + +/* + * __ramc_del -- + * Recno DBC->del function. 
+ *
+ * PUBLIC: int __ramc_del __P((DBC *, u_int32_t));
+ */
+int
+__ramc_del(dbc, flags)
+	DBC *dbc;
+	u_int32_t flags;
+{
+	BKEYDATA bk;
+	BTREE *t;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DBT hdr, data;
+	DB_LOCK next_lock, prev_lock;
+	DB_LSN lsn;
+	db_pgno_t npgno, ppgno, save_npgno, save_ppgno;
+	int exact, nc, ret, stack, t_ret;
+
+	dbp = dbc->dbp;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	t = dbp->bt_internal;
+	stack = 0;
+	save_npgno = save_ppgno = PGNO_INVALID;
+	LOCK_INIT(next_lock);
+	LOCK_INIT(prev_lock);
+	COMPQUIET(flags, 0);
+
+	/*
+	 * The semantics of cursors during delete are as follows: in
+	 * non-renumbering recnos, records are replaced with a marker
+	 * containing a delete flag.  If the record referenced by this cursor
+	 * has already been deleted, we will detect that as part of the delete
+	 * operation, and fail.
+	 *
+	 * In renumbering recnos, cursors which represent deleted items
+	 * are flagged with the C_DELETED flag, and it is an error to
+	 * call c_del a second time without an intervening cursor motion.
+	 */
+	if (CD_ISSET(cp))
+		return (DB_KEYEMPTY);
+
+	/* Search the tree for the key; delete only deletes exact matches. */
+retry:	if ((ret = __bam_rsearch(dbc, &cp->recno, SR_DELETE, 1, &exact)) != 0)
+		goto err;
+	if (!exact) {
+		ret = DB_NOTFOUND;
+		goto err;
+	}
+	stack = 1;
+
+	/* Copy the page into the cursor. */
+	STACK_TO_CURSOR(cp, ret);
+	if (ret != 0)
+		goto err;
+
+	/*
+	 * If re-numbering records, the on-page deleted flag can only mean
+	 * that this record was implicitly created.  Applications aren't
+	 * permitted to delete records they never created, return an error.
+	 *
+	 * If not re-numbering records, the on-page deleted flag means that
+	 * this record was implicitly created, or, was deleted at some time.
+	 * The former is an error because applications aren't permitted to
+	 * delete records they never created, the latter is an error because
+	 * if the record was "deleted", we could never have found it.
+	 */
+	if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) {
+		ret = DB_KEYEMPTY;
+		goto err;
+	}
+
+	if (F_ISSET(cp, C_RENUMBER)) {
+		/* If we are going to drop the page, lock its neighbors. */
+		if (STD_LOCKING(dbc) &&
+		    NUM_ENT(cp->page) == 1 && PGNO(cp->page) != cp->root) {
+			if ((npgno = NEXT_PGNO(cp->page)) != PGNO_INVALID)
+				TRY_LOCK(dbc, npgno, save_npgno,
+				    next_lock, DB_LOCK_WRITE, retry);
+			if (ret != 0)
+				goto err;
+			if ((ppgno = PREV_PGNO(cp->page)) != PGNO_INVALID)
+				TRY_LOCK(dbc, ppgno, save_ppgno,
+				    prev_lock, DB_LOCK_WRITE, retry);
+			if (ret != 0)
+				goto err;
+		}
+		/* Delete the item, adjust the counts, adjust the cursors. */
+		if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0)
+			goto err;
+		if ((ret = __bam_adjust(dbc, -1)) != 0)
+			goto err;
+		if ((ret = __ram_ca(dbc, CA_DELETE, &nc)) != 0)
+			goto err;
+		if (nc > 0 &&
+		    CURADJ_LOG(dbc) && (ret = __bam_rcuradj_log(dbp, dbc->txn,
+		    &lsn, 0, CA_DELETE, cp->root, cp->recno, cp->order)) != 0)
+			goto err;
+
+		/*
+		 * If the page is empty, delete it.
+		 *
+		 * We never delete a root page.  First, root pages of primary
+		 * databases never go away, recno or otherwise.  However, if
+		 * it's the root page of an off-page duplicates database, then
+		 * it can be deleted.  We don't delete it here because we have
+		 * no way of telling the primary database page holder (e.g.,
+		 * the hash access method) that its page element should be
+		 * cleaned up because the underlying tree is gone.  So, we
+		 * keep the page around until the last cursor referencing
+		 * the empty tree is closed, and then clean it up.
+ */ + if (NUM_ENT(cp->page) == 0 && PGNO(cp->page) != cp->root) { + /* + * We want to delete a single item out of the last page + * that we're not deleting. + */ + ret = __bam_dpages(dbc, 0, BTD_RELINK); + + /* + * Regardless of the return from __bam_dpages, it will + * discard our stack and pinned page. + */ + stack = 0; + cp->page = NULL; + LOCK_INIT(cp->lock); + cp->lock_mode = DB_LOCK_NG; + } + } else { + /* Use a delete/put pair to replace the record with a marker. */ + if ((ret = __bam_ditem(dbc, cp->page, cp->indx)) != 0) + goto err; + + B_TSET_DELETED(bk.type, B_KEYDATA); + bk.len = 0; + DB_INIT_DBT(hdr, &bk, SSZA(BKEYDATA, data)); + DB_INIT_DBT(data, "", 0); + if ((ret = __db_pitem(dbc, + cp->page, cp->indx, BKEYDATA_SIZE(0), &hdr, &data)) != 0) + goto err; + } + + t->re_modified = 1; + +err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, next_lock)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, prev_lock)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __ramc_get -- + * Recno DBC->get function. + * + * PUBLIC: int __ramc_get + * PUBLIC: __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); + */ +int +__ramc_get(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + int cmp, exact, ret; + + COMPQUIET(pgnop, NULL); + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + + LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY); +retry: switch (flags) { + case DB_CURRENT: + /* + * If we're using mutable records and the deleted flag is + * set, the cursor is pointing at a nonexistent record; + * return an error. + */ + if (CD_ISSET(cp)) + return (DB_KEYEMPTY); + break; + case DB_NEXT_DUP: + /* + * If we're not in an off-page dup set, we know there's no + * next duplicate since recnos don't have them. If we + * are in an off-page dup set, the next item assuredly is + * a dup, so we set flags to DB_NEXT and keep going. + */ + if (!F_ISSET(dbc, DBC_OPD)) + return (DB_NOTFOUND); + /* FALLTHROUGH */ + case DB_NEXT_NODUP: + /* + * Recno databases don't have duplicates, set flags to DB_NEXT + * and keep going. + */ + /* FALLTHROUGH */ + case DB_NEXT: + flags = DB_NEXT; + /* + * If record numbers are mutable: if we just deleted a record, + * we have to avoid incrementing the record number so that we + * return the right record by virtue of renumbering the tree. + */ + if (CD_ISSET(cp)) { + /* + * Clear the flag, we've moved off the deleted record. + */ + CD_CLR(cp); + break; + } + + if (cp->recno != RECNO_OOB) { + ++cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + flags = DB_NEXT; + cp->recno = 1; + break; + case DB_PREV_DUP: + /* + * If we're not in an off-page dup set, we know there's no + * previous duplicate since recnos don't have them. If we + * are in an off-page dup set, the previous item assuredly + * is a dup, so we set flags to DB_PREV and keep going. + */ + if (!F_ISSET(dbc, DBC_OPD)) + return (DB_NOTFOUND); + /* FALLTHROUGH */ + case DB_PREV_NODUP: + /* + * Recno databases don't have duplicates, set flags to DB_PREV + * and keep going. 
+ */ + /* FALLTHROUGH */ + case DB_PREV: + flags = DB_PREV; + if (cp->recno != RECNO_OOB) { + if (cp->recno == 1) { + ret = DB_NOTFOUND; + goto err; + } + --cp->recno; + break; + } + /* FALLTHROUGH */ + case DB_LAST: + flags = DB_PREV; + if (((ret = __ram_update(dbc, + DB_MAX_RECORDS, 0)) != 0) && ret != DB_NOTFOUND) + goto err; + if ((ret = __bam_nrecs(dbc, &cp->recno)) != 0) + goto err; + if (cp->recno == 0) { + ret = DB_NOTFOUND; + goto err; + } + break; + case DB_GET_BOTHC: + /* + * If we're doing a join and these are offpage dups, + * we want to keep searching forward from after the + * current cursor position. Increment the recno by 1, + * then proceed as for a DB_SET. + * + * Otherwise, we know there are no additional matching + * data, as recnos don't have dups. return DB_NOTFOUND. + */ + if (F_ISSET(dbc, DBC_OPD)) { + cp->recno++; + break; + } + ret = DB_NOTFOUND; + goto err; + /* NOTREACHED */ + case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: + /* + * If we're searching a set of off-page dups, we start + * a new linear search from the first record. Otherwise, + * we compare the single data item associated with the + * requested record for a match. + */ + if (F_ISSET(dbc, DBC_OPD)) { + cp->recno = 1; + break; + } + /* FALLTHROUGH */ + case DB_SET: + case DB_SET_RANGE: + if ((ret = __ram_getno(dbc, key, &cp->recno, 0)) != 0) + goto err; + break; + default: + ret = __db_unknown_flag(dbp->env, "__ramc_get", flags); + goto err; + } + + /* + * For DB_PREV, DB_LAST, DB_SET and DB_SET_RANGE, we have already + * called __ram_update() to make sure sufficient records have been + * read from the backing source file. Do it now for DB_CURRENT (if + * the current record was deleted we may need more records from the + * backing file for a DB_CURRENT operation), DB_FIRST and DB_NEXT. + * (We don't have to test for flags == DB_FIRST, because the switch + * statement above re-set flags to DB_NEXT in that case.) + */ + if ((flags == DB_NEXT || flags == DB_CURRENT) && ((ret = + __ram_update(dbc, cp->recno, 0)) != 0) && ret != DB_NOTFOUND) + goto err; + + for (;; ++cp->recno) { + /* Search the tree for the record. */ + if ((ret = __bam_rsearch(dbc, &cp->recno, + F_ISSET(dbc, DBC_RMW) ? SR_FIND_WR : SR_FIND, + 1, &exact)) != 0) + goto err; + if (!exact) { + ret = DB_NOTFOUND; + goto err; + } + + /* Copy the page into the cursor. */ + STACK_TO_CURSOR(cp, ret); + if (ret != 0) + goto err; + + /* + * If re-numbering records, the on-page deleted flag means this + * record was implicitly created. If not re-numbering records, + * the on-page deleted flag means this record was implicitly + * created, or, it was deleted at some time. Regardless, we + * skip such records if doing cursor next/prev operations or + * walking through off-page duplicates, and fail if they were + * requested explicitly by the application. + */ + if (B_DISSET(GET_BKEYDATA(dbp, cp->page, cp->indx)->type)) + switch (flags) { + case DB_NEXT: + case DB_PREV: + (void)__bam_stkrel(dbc, STK_CLRDBC); + goto retry; + case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: + /* + * If we're an OPD tree, we don't care about + * matching a record number on a DB_GET_BOTH + * -- everything belongs to the same tree. A + * normal recno should give up and return + * DB_NOTFOUND if the matching recno is deleted. 
+ */ + if (F_ISSET(dbc, DBC_OPD)) { + (void)__bam_stkrel(dbc, STK_CLRDBC); + continue; + } + ret = DB_NOTFOUND; + goto err; + default: + ret = DB_KEYEMPTY; + goto err; + } + + if (flags == DB_GET_BOTH || + flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) { + if ((ret = __bam_cmp(dbc, data, cp->page, cp->indx, + __bam_defcmp, &cmp)) != 0) + return (ret); + if (cmp == 0) + break; + if (!F_ISSET(dbc, DBC_OPD)) { + ret = DB_NOTFOUND; + goto err; + } + (void)__bam_stkrel(dbc, STK_CLRDBC); + } else + break; + } + + /* Return the key if the user didn't give us one. */ + if (!F_ISSET(dbc, DBC_OPD) && !F_ISSET(key, DB_DBT_ISSET)) { + ret = __db_retcopy(dbp->env, + key, &cp->recno, sizeof(cp->recno), + &dbc->rkey->data, &dbc->rkey->ulen); + F_SET(key, DB_DBT_ISSET); + } + + /* The cursor was reset, no further delete adjustment is necessary. */ +err: CD_CLR(cp); + + return (ret); +} + +/* + * __ramc_put -- + * Recno DBC->put function. + * + * PUBLIC: int __ramc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); + */ +int +__ramc_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_LSN lsn; + ENV *env; + u_int32_t iiflags; + int exact, nc, ret, t_ret; + void *arg; + + COMPQUIET(pgnop, NULL); + + dbp = dbc->dbp; + env = dbp->env; + cp = (BTREE_CURSOR *)dbc->internal; + + /* + * DB_KEYFIRST and DB_KEYLAST mean different things if they're + * used in an off-page duplicate tree. If we're an off-page + * duplicate tree, they really mean "put at the beginning of the + * tree" and "put at the end of the tree" respectively, so translate + * them to something else. + */ + if (F_ISSET(dbc, DBC_OPD)) + switch (flags) { + case DB_KEYFIRST: + cp->recno = 1; + flags = DB_BEFORE; + break; + case DB_KEYLAST: + if ((ret = __ram_add(dbc, + &cp->recno, data, DB_APPEND, 0)) != 0) + return (ret); + if (CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, + CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0) + return (ret); + return (0); + default: + break; + } + + /* + * Handle normal DB_KEYFIRST/DB_KEYLAST; for a recno, which has + * no duplicates, these are identical and mean "put the given + * datum at the given recno". + */ + if (flags == DB_KEYFIRST || flags == DB_KEYLAST || + flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP) { + ret = __ram_getno(dbc, key, &cp->recno, 1); + if (ret == 0 || ret == DB_NOTFOUND) + ret = __ram_add(dbc, &cp->recno, data, flags, 0); + return (ret); + } + + /* + * If we're putting with a cursor that's marked C_DELETED, we need to + * take special care; the cursor doesn't "really" reference the item + * corresponding to its current recno, but instead is "between" that + * record and the current one. Translate the actual insert into + * DB_BEFORE, and let the __ram_ca work out the gory details of what + * should wind up pointing where. + */ + if (CD_ISSET(cp)) + iiflags = DB_BEFORE; + else + iiflags = flags; + +split: if ((ret = __bam_rsearch(dbc, &cp->recno, SR_INSERT, 1, &exact)) != 0) + goto err; + /* + * An inexact match is okay; it just means we're one record past the + * end, which is reasonable if we're marked deleted. + */ + DB_ASSERT(env, exact || CD_ISSET(cp)); + + /* Copy the page into the cursor. 
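+	 * STACK_TO_CURSOR() is, roughly, a macro of this shape (a
+	 * paraphrase for orientation, not the literal definition in
+	 * btree.h):
+	 *
+	 *	cp->page = cp->csp->page;
+	 *	cp->indx = cp->csp->indx;
+	 *	cp->lock = cp->csp->lock;
+	 *	cp->lock_mode = cp->csp->lock_mode;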
*/ + STACK_TO_CURSOR(cp, ret); + if (ret != 0) + goto err; + + ret = __bam_iitem(dbc, key, data, iiflags, 0); + t_ret = __bam_stkrel(dbc, STK_CLRDBC); + + if (t_ret != 0 && (ret == 0 || ret == DB_NEEDSPLIT)) + ret = t_ret; + else if (ret == DB_NEEDSPLIT) { + arg = &cp->recno; + if ((ret = __bam_split(dbc, arg, NULL)) != 0) + goto err; + goto split; + } + if (ret != 0) + goto err; + + switch (flags) { /* Adjust the cursors. */ + case DB_AFTER: + if ((ret = __ram_ca(dbc, CA_IAFTER, &nc)) != 0) + goto err; + + /* + * We only need to adjust this cursor forward if we truly added + * the item after the current recno, rather than remapping it + * to DB_BEFORE. + */ + if (iiflags == DB_AFTER) + ++cp->recno; + + /* Only log if __ram_ca found any relevant cursors. */ + if (nc > 0 && CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IAFTER, + cp->root, cp->recno, cp->order)) != 0) + goto err; + break; + case DB_BEFORE: + if ((ret = __ram_ca(dbc, CA_IBEFORE, &nc)) != 0) + goto err; + --cp->recno; + + /* Only log if __ram_ca found any relevant cursors. */ + if (nc > 0 && CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, CA_IBEFORE, + cp->root, cp->recno, cp->order)) != 0) + goto err; + break; + case DB_CURRENT: + /* + * We only need to do an adjustment if we actually + * added an item, which we only would have done if the + * cursor was marked deleted. + */ + if (!CD_ISSET(cp)) + break; + + /* Only log if __ram_ca found any relevant cursors. */ + if ((ret = __ram_ca(dbc, CA_ICURRENT, &nc)) != 0) + goto err; + if (nc > 0 && CURADJ_LOG(dbc) && + (ret = __bam_rcuradj_log(dbp, dbc->txn, &lsn, 0, + CA_ICURRENT, cp->root, cp->recno, cp->order)) != 0) + goto err; + break; + default: + break; + } + + /* Return the key if we've created a new record. */ + if (!F_ISSET(dbc, DBC_OPD) && + (flags == DB_AFTER || flags == DB_BEFORE) && key != NULL) + ret = __db_retcopy(env, key, &cp->recno, + sizeof(cp->recno), &dbc->rkey->data, &dbc->rkey->ulen); + + /* The cursor was reset, no further delete adjustment is necessary. */ +err: CD_CLR(cp); + + return (ret); +} + +/* + * __ram_ca -- + * Adjust cursors. Returns the number of relevant cursors. + * + * PUBLIC: int __ram_ca __P((DBC *, ca_recno_arg, int *)); + */ +int +__ram_ca(dbc_arg, op, foundp) + DBC *dbc_arg; + ca_recno_arg op; + int *foundp; +{ + BTREE_CURSOR *cp, *cp_arg; + DB *dbp, *ldbp; + DBC *dbc; + ENV *env; + db_recno_t recno; + u_int32_t order; + int adjusted, found; + + dbp = dbc_arg->dbp; + env = dbp->env; + cp_arg = (BTREE_CURSOR *)dbc_arg->internal; + recno = cp_arg->recno; + + /* + * It only makes sense to adjust cursors if we're a renumbering + * recno; we should only be called if this is one. + */ + DB_ASSERT(env, F_ISSET(cp_arg, C_RENUMBER)); + + MUTEX_LOCK(env, env->mtx_dblist); + /* + * Adjust the cursors. See the comment in __bam_ca_delete(). + * + * If we're doing a delete, we need to find the highest + * order of any cursor currently pointing at this item, + * so we can assign a higher order to the newly deleted + * cursor. Unfortunately, this requires a second pass through + * the cursor list. 
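+	 *
+	 * A worked example: if three cursors already stand deleted on
+	 * this record number with orders 1, 2 and 3, the scan below
+	 * yields order 4 for the newly deleted cursor, keeping the
+	 * relative ordering of "between" cursors well defined.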
+ */ + if (op == CA_DELETE) { + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (order = 1; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + cp = (BTREE_CURSOR *)dbc->internal; + if (cp_arg->root == cp->root && + recno == cp->recno && CD_ISSET(cp) && + order <= cp->order && + !MVCC_SKIP_CURADJ(dbc, cp->root)) + order = cp->order + 1; + } + MUTEX_UNLOCK(env, dbp->mutex); + } + } else + order = INVALID_ORDER; + + /* Now go through and do the actual adjustments. */ + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (found = 0; + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) { + cp = (BTREE_CURSOR *)dbc->internal; + if (cp_arg->root != cp->root || + MVCC_SKIP_CURADJ(dbc, cp->root)) + continue; + ++found; + adjusted = 0; + switch (op) { + case CA_DELETE: + if (recno < cp->recno) { + --cp->recno; + /* + * If the adjustment made them equal, + * we have to merge the orders. + */ + if (recno == cp->recno && CD_ISSET(cp)) + cp->order += order; + } else if (recno == cp->recno && + !CD_ISSET(cp)) { + CD_SET(cp); + cp->order = order; + /* + * If we're deleting the item, we can't + * keep a streaming offset cached. + */ + cp->stream_start_pgno = PGNO_INVALID; + } + break; + case CA_IBEFORE: + /* + * IBEFORE is just like IAFTER, except that we + * adjust cursors on the current record too. + */ + if (C_EQUAL(cp_arg, cp)) { + ++cp->recno; + adjusted = 1; + } + goto iafter; + case CA_ICURRENT: + + /* + * If the original cursor wasn't deleted, we + * just did a replacement and so there's no + * need to adjust anything--we shouldn't have + * gotten this far. Otherwise, we behave + * much like an IAFTER, except that all + * cursors pointing to the current item get + * marked undeleted and point to the new + * item. + */ + DB_ASSERT(env, CD_ISSET(cp_arg)); + if (C_EQUAL(cp_arg, cp)) { + CD_CLR(cp); + break; + } + /* FALLTHROUGH */ + case CA_IAFTER: +iafter: if (!adjusted && C_LESSTHAN(cp_arg, cp)) { + ++cp->recno; + adjusted = 1; + } + if (recno == cp->recno && adjusted) + /* + * If we've moved this cursor's recno, + * split its order number--i.e., + * decrement it by enough so that + * the lowest cursor moved has order 1. + * cp_arg->order is the split point, + * so decrement by one less than that. + */ + cp->order -= (cp_arg->order - 1); + break; + } + } + MUTEX_UNLOCK(dbp->env, dbp->mutex); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + if (foundp != NULL) + *foundp = found; + return (0); +} + +/* + * __ram_getno -- + * Check the user's record number, and make sure we've seen it. + * + * PUBLIC: int __ram_getno __P((DBC *, const DBT *, db_recno_t *, int)); + */ +int +__ram_getno(dbc, key, rep, can_create) + DBC *dbc; + const DBT *key; + db_recno_t *rep; + int can_create; +{ + DB *dbp; + db_recno_t recno; + + dbp = dbc->dbp; + + /* If passed an empty DBT from Java, key->data may be NULL */ + if (key->size != sizeof(db_recno_t)) { + __db_errx(dbp->env, "illegal record number size"); + return (EINVAL); + } + + /* Check the user's record number. */ + if ((recno = *(db_recno_t *)key->data) == 0) { + __db_errx(dbp->env, "illegal record number of 0"); + return (EINVAL); + } + if (rep != NULL) + *rep = recno; + + /* + * Btree can neither create records nor read them in. Recno can + * do both, see if we can find the record. + */ + return (dbc->dbtype == DB_RECNO ? 
+ __ram_update(dbc, recno, can_create) : 0); +} + +/* + * __ram_update -- + * Ensure the tree has records up to and including the specified one. + */ +static int +__ram_update(dbc, recno, can_create) + DBC *dbc; + db_recno_t recno; + int can_create; +{ + BTREE *t; + DB *dbp; + DBT *rdata; + db_recno_t nrecs; + int ret; + + dbp = dbc->dbp; + t = dbp->bt_internal; + + /* + * If we can't create records and we've read the entire backing input + * file, we're done. + */ + if (!can_create && t->re_eof) + return (0); + + /* + * If we haven't seen this record yet, try to get it from the original + * file. + */ + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) + return (ret); + if (!t->re_eof && recno > nrecs) { + if ((ret = __ram_sread(dbc, recno)) != 0 && ret != DB_NOTFOUND) + return (ret); + if ((ret = __bam_nrecs(dbc, &nrecs)) != 0) + return (ret); + } + + /* + * If we can create records, create empty ones up to the requested + * record. + */ + if (!can_create || recno <= nrecs + 1) + return (0); + + rdata = &dbc->my_rdata; + rdata->flags = 0; + rdata->size = 0; + + while (recno > ++nrecs) + if ((ret = __ram_add(dbc, + &nrecs, rdata, 0, BI_DELETED)) != 0) + return (ret); + return (0); +} + +/* + * __ram_source -- + * Load information about the backing file. + */ +static int +__ram_source(dbp) + DB *dbp; +{ + BTREE *t; + ENV *env; + char *source; + int ret; + + env = dbp->env; + t = dbp->bt_internal; + + /* Find the real name, and swap out the one we had before. */ + if ((ret = __db_appname(env, + DB_APP_DATA, t->re_source, NULL, &source)) != 0) + return (ret); + __os_free(env, t->re_source); + t->re_source = source; + + /* + * !!! + * It's possible that the backing source file is read-only. We don't + * much care other than we'll complain if there are any modifications + * when it comes time to write the database back to the source. + */ + if ((t->re_fp = fopen(t->re_source, "rb")) == NULL) { + ret = __os_get_errno(); + __db_err(env, ret, "%s", t->re_source); + return (ret); + } + + t->re_eof = 0; + return (0); +} + +/* + * __ram_writeback -- + * Rewrite the backing file. + * + * PUBLIC: int __ram_writeback __P((DB *)); + */ +int +__ram_writeback(dbp) + DB *dbp; +{ + BTREE *t; + DBC *dbc; + DBT key, data; + DB_THREAD_INFO *ip; + ENV *env; + FILE *fp; + db_recno_t keyno; + int ret, t_ret; + u_int8_t delim, *pad; + + t = dbp->bt_internal; + env = dbp->env; + fp = NULL; + pad = NULL; + + /* If the file wasn't modified, we're done. */ + if (!t->re_modified) + return (0); + + /* If there's no backing source file, we're done. */ + if (t->re_source == NULL) { + t->re_modified = 0; + return (0); + } + + /* + * We step through the records, writing each one out. Use the record + * number and the dbp->get() function, instead of a cursor, so we find + * and write out "deleted" or non-existent records. The DB handle may + * be threaded, so allocate memory as we go. + */ + memset(&key, 0, sizeof(key)); + key.size = sizeof(db_recno_t); + key.data = &keyno; + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_REALLOC); + + /* Allocate a cursor. */ + ENV_GET_THREAD_INFO(env, ip); + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + return (ret); + + /* + * Read any remaining records into the tree. + * + * !!! + * This is why we can't support transactions when applications specify + * backing (re_source) files. 
At this point we have to read in the + * rest of the records from the file so that we can write all of the + * records back out again, which could modify a page for which we'd + * have to log changes and which we don't have locked. This could be + * partially fixed by taking a snapshot of the entire file during the + * DB->open as DB->open is transaction protected. But, if a checkpoint + * occurs then, the part of the log holding the copy of the file could + * be discarded, and that would make it impossible to recover in the + * face of disaster. This could all probably be fixed, but it would + * require transaction protecting the backing source file. + * + * XXX + * This could be made to work now that we have transactions protecting + * file operations. Margo has specifically asked for the privilege of + * doing this work. + */ + if ((ret = + __ram_update(dbc, DB_MAX_RECORDS, 0)) != 0 && ret != DB_NOTFOUND) + goto err; + + /* + * Close any existing file handle and re-open the file, truncating it. + */ + if (t->re_fp != NULL) { + if (fclose(t->re_fp) != 0) { + ret = __os_get_errno(); + __db_err(env, ret, "%s", t->re_source); + goto err; + } + t->re_fp = NULL; + } + if ((fp = fopen(t->re_source, "wb")) == NULL) { + ret = __os_get_errno(); + __db_err(env, ret, "%s", t->re_source); + goto err; + } + + /* + * We'll need the delimiter if we're doing variable-length records, + * and the pad character if we're doing fixed-length records. + */ + delim = t->re_delim; + for (keyno = 1;; ++keyno) { + switch (ret = __db_get(dbp, ip, NULL, &key, &data, 0)) { + case 0: + if (data.size != 0 && + fwrite(data.data, 1, data.size, fp) != data.size) + goto write_err; + break; + case DB_KEYEMPTY: + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) { + if (pad == NULL) { + if ((ret = __os_malloc( + env, t->re_len, &pad)) != 0) + goto err; + memset(pad, t->re_pad, t->re_len); + } + if (fwrite(pad, 1, t->re_len, fp) != t->re_len) + goto write_err; + } + break; + case DB_NOTFOUND: + ret = 0; + goto done; + default: + goto err; + } + if (!F_ISSET(dbp, DB_AM_FIXEDLEN) && + fwrite(&delim, 1, 1, fp) != 1) { +write_err: ret = __os_get_errno(); + __db_err(env, ret, + "%s: write failed to backing file", t->re_source); + goto err; + } + } + +err: +done: /* Close the file descriptor. */ + if (fp != NULL && fclose(fp) != 0) { + t_ret = __os_get_errno(); + __db_err(env, t_ret, "%s", t->re_source); + if (ret == 0) + ret = t_ret; + } + + /* Discard the cursor. */ + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Discard memory allocated to hold the data items. */ + if (data.data != NULL) + __os_ufree(env, data.data); + if (pad != NULL) + __os_free(env, pad); + + if (ret == 0) + t->re_modified = 0; + + return (ret); +} + +/* + * __ram_sread -- + * Read records from a source file. + */ +static int +__ram_sread(dbc, top) + DBC *dbc; + db_recno_t top; +{ + BTREE *t; + DB *dbp; + DBT data, *rdata; + db_recno_t recno; + size_t len; + int ch, ret, was_modified; + + t = dbc->dbp->bt_internal; + dbp = dbc->dbp; + was_modified = t->re_modified; + + if ((ret = __bam_nrecs(dbc, &recno)) != 0) + return (ret); + + /* + * Use the record key return memory, it's only a short-term use. + * The record data return memory is used by __bam_iitem, which + * we'll indirectly call, so use the key so as not to collide. + */ + len = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 
t->re_len : 256; + rdata = &dbc->my_rkey; + if (rdata->ulen < len) { + if ((ret = __os_realloc( + dbp->env, len, &rdata->data)) != 0) { + rdata->ulen = 0; + rdata->data = NULL; + return (ret); + } + rdata->ulen = (u_int32_t)len; + } + + memset(&data, 0, sizeof(data)); + while (recno < top) { + data.data = rdata->data; + data.size = 0; + if (F_ISSET(dbp, DB_AM_FIXEDLEN)) + for (len = t->re_len; len > 0; --len) { + if ((ch = fgetc(t->re_fp)) == EOF) { + if (data.size == 0) + goto eof; + break; + } + ((u_int8_t *)data.data)[data.size++] = ch; + } + else + for (;;) { + if ((ch = fgetc(t->re_fp)) == EOF) { + if (data.size == 0) + goto eof; + break; + } + if (ch == t->re_delim) + break; + + ((u_int8_t *)data.data)[data.size++] = ch; + if (data.size == rdata->ulen) { + if ((ret = __os_realloc(dbp->env, + rdata->ulen *= 2, + &rdata->data)) != 0) { + rdata->ulen = 0; + rdata->data = NULL; + return (ret); + } else + data.data = rdata->data; + } + } + + /* + * Another process may have read this record from the input + * file and stored it into the database already, in which + * case we don't need to repeat that operation. We detect + * this by checking if the last record we've read is greater + * or equal to the number of records in the database. + */ + if (t->re_last >= recno) { + ++recno; + if ((ret = __ram_add(dbc, &recno, &data, 0, 0)) != 0) + goto err; + } + ++t->re_last; + } + + if (0) { +eof: t->re_eof = 1; + ret = DB_NOTFOUND; + } +err: if (!was_modified) + t->re_modified = 0; + + return (ret); +} + +/* + * __ram_add -- + * Add records into the tree. + */ +static int +__ram_add(dbc, recnop, data, flags, bi_flags) + DBC *dbc; + db_recno_t *recnop; + DBT *data; + u_int32_t flags, bi_flags; +{ + BTREE_CURSOR *cp; + int exact, ret, stack, t_ret; + + cp = (BTREE_CURSOR *)dbc->internal; + +retry: /* Find the slot for insertion. */ + if ((ret = __bam_rsearch(dbc, recnop, + SR_INSERT | (flags == DB_APPEND ? SR_APPEND : 0), 1, &exact)) != 0) + return (ret); + stack = 1; + + /* Copy the page into the cursor. */ + STACK_TO_CURSOR(cp, ret); + if (ret != 0) + goto err; + + if (exact && flags == DB_NOOVERWRITE && !CD_ISSET(cp) && + !B_DISSET(GET_BKEYDATA(dbc->dbp, cp->page, cp->indx)->type)) { + ret = DB_KEYEXIST; + goto err; + } + + /* + * The application may modify the data based on the selected record + * number. + */ + if (flags == DB_APPEND && dbc->dbp->db_append_recno != NULL && + (ret = dbc->dbp->db_append_recno(dbc->dbp, data, *recnop)) != 0) + goto err; + + /* + * Select the arguments for __bam_iitem() and do the insert. If the + * key is an exact match, or we're replacing the data item with a + * new data item, replace the current item. If the key isn't an exact + * match, we're inserting a new key/data pair, before the search + * location. + */ + switch (ret = __bam_iitem(dbc, + NULL, data, exact ? DB_CURRENT : DB_BEFORE, bi_flags)) { + case 0: + /* + * Don't adjust anything. + * + * If we inserted a record, no cursors need adjusting because + * the only new record it's possible to insert is at the very + * end of the tree. The necessary adjustments to the internal + * page counts were made by __bam_iitem(). + * + * If we overwrote a record, no cursors need adjusting because + * future DBcursor->get calls will simply return the underlying + * record (there's no adjustment made for the DB_CURRENT flag + * when a cursor get operation immediately follows a cursor + * delete operation, and the normal adjustment for the DB_NEXT + * flag is still correct). 
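+		 *
+		 * For reference, a hedged sketch of the application-level
+		 * path that creates such an end-of-tree record (public
+		 * DB->put() interface, error handling omitted):
+		 *
+		 *	memset(&key, 0, sizeof(key));
+		 *	ret = dbp->put(dbp, NULL, &key, &data, DB_APPEND);
+		 *
+		 * On success, key holds the newly allocated record number.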
+ */ + break; + case DB_NEEDSPLIT: + /* Discard the stack of pages and split the page. */ + (void)__bam_stkrel(dbc, STK_CLRDBC); + stack = 0; + + if ((ret = __bam_split(dbc, recnop, NULL)) != 0) + goto err; + + goto retry; + /* NOTREACHED */ + default: + goto err; + } + +err: if (stack && (t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} diff --git a/btree/bt_rsearch.c b/btree/bt_rsearch.c new file mode 100644 index 0000000..1d5581a --- /dev/null +++ b/btree/bt_rsearch.c @@ -0,0 +1,502 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" + +/* + * __bam_rsearch -- + * Search a btree for a record number. + * + * PUBLIC: int __bam_rsearch __P((DBC *, db_recno_t *, u_int32_t, int, int *)); + */ +int +__bam_rsearch(dbc, recnop, flags, stop, exactp) + DBC *dbc; + db_recno_t *recnop; + u_int32_t flags; + int stop, *exactp; +{ + BINTERNAL *bi; + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + RINTERNAL *ri; + db_indx_t adjust, deloffset, indx, top; + db_lockmode_t lock_mode; + db_pgno_t pg; + db_recno_t recno, t_recno, total; + u_int32_t get_mode; + int ret, stack, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + h = NULL; + + BT_STK_CLR(cp); + + /* + * There are several ways we search a btree tree. The flags argument + * specifies if we're acquiring read or write locks and if we are + * locking pairs of pages. In addition, if we're adding or deleting + * an item, we have to lock the entire tree, regardless. See btree.h + * for more details. 
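+	 *
+	 * The descent itself is record-count arithmetic; a simplified
+	 * sketch of the walk across one internal page (mirroring the
+	 * P_IRECNO case below):
+	 *
+	 *	for (indx = 0, top = NUM_ENT(h);;) {
+	 *		ri = GET_RINTERNAL(dbp, h, indx);
+	 *		if (++indx == top || total + ri->nrecs >= recno)
+	 *			break;
+	 *		total += ri->nrecs;
+	 *	}
+	 *
+	 * which leaves indx (after adjustment) naming the child subtree
+	 * that holds the requested record.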
+ * + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ + + if ((ret = __bam_get_root(dbc, cp->root, stop, flags, &stack)) != 0) + return (ret); + lock_mode = cp->csp->lock_mode; + get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0; + lock = cp->csp->lock; + h = cp->csp->page; + + BT_STK_CLR(cp); + /* + * If appending to the tree, set the record number now -- we have the + * root page locked. + * + * Delete only deletes exact matches, read only returns exact matches. + * Note, this is different from __bam_search(), which returns non-exact + * matches for read. + * + * The record may not exist. We can only return the correct location + * for the record immediately after the last record in the tree, so do + * a fast check now. + */ + total = RE_NREC(h); + if (LF_ISSET(SR_APPEND)) { + *exactp = 0; + *recnop = recno = total + 1; + } else { + recno = *recnop; + if (recno <= total) + *exactp = 1; + else { + *exactp = 0; + if (!LF_ISSET(SR_PAST_EOF) || recno > total + 1) { + /* + * Keep the page locked for serializability. + * + * XXX + * This leaves the root page locked, which will + * eliminate any concurrency. A possible fix + * would be to lock the last leaf page instead. + */ + ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + if ((t_ret = + __TLPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + return (ret == 0 ? DB_NOTFOUND : ret); + } + } + } + + /* + * !!! + * Record numbers in the tree are 0-based, but the recno is + * 1-based. All of the calculations below have to take this + * into account. + */ + for (total = 0;;) { + switch (TYPE(h)) { + case P_LBTREE: + if (LF_ISSET(SR_MAX)) { + indx = NUM_ENT(h) - 2; + goto enter; + } + /* FALLTHROUGH */ + case P_LDUP: + if (LF_ISSET(SR_MAX)) { + indx = NUM_ENT(h) - 1; + goto enter; + } + recno -= total; + /* + * There may be logically deleted records on the page. + * If there are enough, the record may not exist. + */ + if (TYPE(h) == P_LBTREE) { + adjust = P_INDX; + deloffset = O_INDX; + } else { + adjust = O_INDX; + deloffset = 0; + } + for (t_recno = 0, indx = 0;; indx += adjust) { + if (indx >= NUM_ENT(h)) { + *exactp = 0; + if (!LF_ISSET(SR_PAST_EOF) || + recno > t_recno + 1) { + ret = __memp_fput(mpf, + dbc->thread_info, + h, dbc->priority); + h = NULL; + if ((t_ret = __TLPUT(dbc, + lock)) != 0 && ret == 0) + ret = t_ret; + if (ret == 0) + ret = DB_NOTFOUND; + goto err; + } + } + if (!B_DISSET(GET_BKEYDATA(dbp, h, + indx + deloffset)->type) && + ++t_recno == recno) + break; + } + + BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + if (LF_ISSET(SR_BOTH)) + goto get_prev; + return (0); + case P_IBTREE: + if (LF_ISSET(SR_MAX)) { + indx = NUM_ENT(h); + bi = GET_BINTERNAL(dbp, h, indx - 1); + } else for (indx = 0, top = NUM_ENT(h);;) { + bi = GET_BINTERNAL(dbp, h, indx); + if (++indx == top || total + bi->nrecs >= recno) + break; + total += bi->nrecs; + } + pg = bi->pgno; + break; + case P_LRECNO: + if (LF_ISSET(SR_MAX)) + recno = NUM_ENT(h); + else + recno -= total; + + /* Correct from 1-based to 0-based for a page offset. 
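+			 * For example, with total == 0 a search for
+			 * record 1 yields recno - 1 == 0, the leftmost
+			 * slot on the page.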
 */
+			--recno;
+enter:			BT_STK_ENTER(env, cp, h, recno, lock, lock_mode, ret);
+			if (ret != 0)
+				goto err;
+			if (LF_ISSET(SR_BOTH)) {
+get_prev:			DB_ASSERT(env, LF_ISSET(SR_NEXT));
+				/*
+				 * We have a NEXT tree, now add the sub tree
+				 * that gets to the previous page.
+				 */
+				cp->csp++;
+				indx = cp->sp->indx - 1;
+				h = cp->sp->page;
+				if (TYPE(h) == P_IRECNO) {
+					ri = GET_RINTERNAL(dbp, h, indx);
+					pg = ri->pgno;
+				} else {
+					DB_ASSERT(env, TYPE(h) == P_IBTREE);
+					bi = GET_BINTERNAL(dbp, h, indx);
+					pg = bi->pgno;
+				}
+				LF_CLR(SR_NEXT | SR_BOTH);
+				LF_SET(SR_MAX);
+				stack = 1;
+				h = NULL;
+				goto lock_next;
+			}
+			return (0);
+		case P_IRECNO:
+			if (LF_ISSET(SR_MAX)) {
+				indx = NUM_ENT(h);
+				ri = GET_RINTERNAL(dbp, h, indx - 1);
+			} else for (indx = 0, top = NUM_ENT(h);;) {
+				ri = GET_RINTERNAL(dbp, h, indx);
+				if (++indx == top || total + ri->nrecs >= recno)
+					break;
+				total += ri->nrecs;
+			}
+			pg = ri->pgno;
+			break;
+		default:
+			return (__db_pgfmt(env, h->pgno));
+		}
+		--indx;
+
+		/* Return if this is the lowest page wanted. */
+		if (stop == LEVEL(h)) {
+			BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret);
+			if (ret != 0)
+				goto err;
+			return (0);
+		}
+		if (stack) {
+			BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+			if (ret != 0)
+				goto err;
+			h = NULL;
+
+			lock_mode = DB_LOCK_WRITE;
+			get_mode = DB_MPOOL_DIRTY;
+			if ((ret =
+			    __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+				goto err;
+		} else if (LF_ISSET(SR_NEXT)) {
+			/*
+			 * For RECNO if we are doing a NEXT search the
+			 * search recno is the one we are looking for,
+			 * but we want to keep the stack from the spanning
+			 * node on down.  We only know we have the spanning
+			 * node when its child's index is 0, so save
+			 * each node and discard the tree when we find out
+			 * it's not needed.
+			 */
+			if (indx != 0 && cp->sp->page != NULL) {
+				BT_STK_POP(cp);
+				if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+					goto err;
+			}
+
+			BT_STK_PUSH(env, cp, h, indx, lock, lock_mode, ret);
+			h = NULL;
+			if (ret != 0)
+				goto err;
+lock_next:		if ((ret =
+			    __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
+				goto err;
+		} else {
+			/*
+			 * Decide if we want to return a pointer to the next
+			 * page in the stack.  If we do, write lock it and
+			 * never unlock it.
+			 */
+			if ((LF_ISSET(SR_PARENT) &&
+			    (u_int8_t)(stop + 1) >= (u_int8_t)(LEVEL(h) - 1)) ||
+			    (LEVEL(h) - 1) == LEAFLEVEL)
+				stack = 1;
+
+			if ((ret = __memp_fput(mpf,
+			    dbc->thread_info, h, dbc->priority)) != 0)
+				goto err;
+			h = NULL;
+
+			lock_mode = stack &&
+			    LF_ISSET(SR_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
+			if (lock_mode == DB_LOCK_WRITE)
+				get_mode = DB_MPOOL_DIRTY;
+			if ((ret = __db_lget(dbc,
+			    LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
+				/*
+				 * If we fail, discard the lock we held.  This
+				 * is OK because this only happens when we are
+				 * descending the tree holding read-locks.
+				 */
+				(void)__LPUT(dbc, lock);
+				goto err;
+			}
+		}
+
+		if ((ret = __memp_fget(mpf, &pg,
+		    dbc->thread_info, dbc->txn, get_mode, &h)) != 0)
+			goto err;
+	}
+	/* NOTREACHED */
+
+err:	if (h != NULL && (t_ret = __memp_fput(mpf,
+	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+
+	BT_STK_POP(cp);
+	(void)__bam_stkrel(dbc, 0);
+
+	return (ret);
+}
+
+/*
+ * __bam_adjust --
+ *	Adjust the tree after adding or deleting a record.
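+ *
+ *	A usage sketch (hedged; the actual call sites live elsewhere
+ *	in the btree code): after adding a record through this
+ *	cursor's stack,
+ *
+ *		if ((ret = __bam_adjust(dbc, 1)) != 0)
+ *			goto err;
+ *
+ *	and symmetrically __bam_adjust(dbc, -1) after a delete.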
+ * + * PUBLIC: int __bam_adjust __P((DBC *, int32_t)); + */ +int +__bam_adjust(dbc, adjust) + DBC *dbc; + int32_t adjust; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + EPG *epg; + PAGE *h; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; + + /* Update the record counts for the tree. */ + for (epg = cp->sp; epg <= cp->csp; ++epg) { + h = epg->page; + if (TYPE(h) == P_IBTREE || TYPE(h) == P_IRECNO) { + ret = __memp_dirty(mpf, &h, + dbc->thread_info, dbc->txn, dbc->priority, 0); + epg->page = h; + if (ret != 0) + return (ret); + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_cadjust_log(dbp, dbc->txn, + &LSN(h), 0, PGNO(h), &LSN(h), + (u_int32_t)epg->indx, adjust, + PGNO(h) == root_pgno ? + CAD_UPDATEROOT : 0)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(h)); + + if (TYPE(h) == P_IBTREE) + GET_BINTERNAL(dbp, h, epg->indx)->nrecs += + adjust; + else + GET_RINTERNAL(dbp, h, epg->indx)->nrecs += + adjust; + + if (PGNO(h) == root_pgno) + RE_NREC_ADJ(h, adjust); + } + } + return (0); +} + +/* + * __bam_nrecs -- + * Return the number of records in the tree. + * + * PUBLIC: int __bam_nrecs __P((DBC *, db_recno_t *)); + */ +int +__bam_nrecs(dbc, rep) + DBC *dbc; + db_recno_t *rep; +{ + DB *dbp; + DB_LOCK lock; + DB_MPOOLFILE *mpf; + PAGE *h; + db_pgno_t pgno; + int ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + + pgno = dbc->internal->root; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) + return (ret); + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + return (ret); + + *rep = RE_NREC(h); + + ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority); + if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bam_total -- + * Return the number of records below a page. + * + * PUBLIC: db_recno_t __bam_total __P((DB *, PAGE *)); + */ +db_recno_t +__bam_total(dbp, h) + DB *dbp; + PAGE *h; +{ + db_recno_t nrecs; + db_indx_t indx, top; + + nrecs = 0; + top = NUM_ENT(h); + + switch (TYPE(h)) { + case P_LBTREE: + /* Check for logically deleted records. */ + for (indx = 0; indx < top; indx += P_INDX) + if (!B_DISSET( + GET_BKEYDATA(dbp, h, indx + O_INDX)->type)) + ++nrecs; + break; + case P_LDUP: + /* Check for logically deleted records. */ + for (indx = 0; indx < top; indx += O_INDX) + if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) + ++nrecs; + break; + case P_IBTREE: + for (indx = 0; indx < top; indx += O_INDX) + nrecs += GET_BINTERNAL(dbp, h, indx)->nrecs; + break; + case P_LRECNO: + nrecs = NUM_ENT(h); + break; + case P_IRECNO: + for (indx = 0; indx < top; indx += O_INDX) + nrecs += GET_RINTERNAL(dbp, h, indx)->nrecs; + break; + } + + return (nrecs); +} diff --git a/btree/bt_search.c b/btree/bt_search.c index 485afcb..6176b86 100644 --- a/btree/bt_search.c +++ b/btree/bt_search.c @@ -1,5 +1,14 @@ /*- - * Copyright (c) 1990, 1993, 1994 + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by @@ -13,11 +22,7 @@ * 2. 
Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -32,182 +37,929 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" + +/* + * __bam_get_root -- + * Fetch the root of a tree and see if we want to keep + * it in the stack. + * + * PUBLIC: int __bam_get_root __P((DBC *, db_pgno_t, int, u_int32_t, int *)); */ +int +__bam_get_root(dbc, pg, slevel, flags, stack) + DBC *dbc; + db_pgno_t pg; + int slevel; + u_int32_t flags; + int *stack; +{ + BTREE_CURSOR *cp; + DB *dbp; + DB_LOCK lock; + DB_MPOOLFILE *mpf; + PAGE *h; + db_lockmode_t lock_mode; + u_int32_t get_mode; + int ret, t_ret; -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_search.c 8.8 (Berkeley) 7/31/94"; -#endif /* LIBC_SCCS and not lint */ + LOCK_INIT(lock); + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; + /* + * If write-locking pages, we need to know whether or not to acquire a + * write lock on a page before getting it. This depends on how deep it + * is in tree, which we don't know until we acquire the root page. So, + * if we need to lock the root page we may have to upgrade it later, + * because we won't get the correct lock initially. + * + * Retrieve the root page. + */ +try_again: + *stack = LF_ISSET(SR_STACK) && + (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)); + lock_mode = DB_LOCK_READ; + if (*stack || + LF_ISSET(SR_DEL) || (LF_ISSET(SR_NEXT) && LF_ISSET(SR_WRITE))) + lock_mode = DB_LOCK_WRITE; + if ((lock_mode == DB_LOCK_WRITE || F_ISSET(dbc, DBC_DOWNREV) || + dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM))) { +lock_it: if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + } -#include <sys/types.h> + /* + * Get the root. If the root happens to be a leaf page then + * we are supposed to get a read lock on it before latching + * it. So if we have not locked it do a try get first. + * If we can't get the root shared, then get a lock on it and + * then wait for the latch. + */ + if (lock_mode == DB_LOCK_WRITE) + get_mode = DB_MPOOL_DIRTY; + else if (LOCK_ISSET(lock) || !STD_LOCKING(dbc)) + get_mode = 0; + else + get_mode = DB_MPOOL_TRY; -#include <stdio.h> + if ((ret = __memp_fget(mpf, &pg, + dbc->thread_info, dbc->txn, get_mode, &h)) != 0) { + if (ret == DB_LOCK_NOTGRANTED) + goto lock_it; + /* Did not read it, so we can release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + + /* + * Decide if we need to dirty and/or lock this page. + * We must not hold the latch while we get the lock. 
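+	 * (Blocking on the lock while still holding the buffer latch
+	 * could deadlock against a thread that holds the lock and is
+	 * waiting for the latch; hence the DB_LOCK_NOWAIT attempt and
+	 * the drop-latch-and-retry path below.)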
+ */ + if (!*stack && + ((LF_ISSET(SR_PARENT) && (u_int8_t)(slevel + 1) >= LEVEL(h)) || + LEVEL(h) == LEAFLEVEL || + (LF_ISSET(SR_START) && slevel == LEVEL(h)))) { + *stack = 1; + /* If we already have the write lock, we are done. */ + if (dbc->dbtype == DB_RECNO || F_ISSET(cp, C_RECNUM)) { + if (lock_mode == DB_LOCK_WRITE) + goto done; + if ((ret = __LPUT(dbc, lock)) != 0) + return (ret); + } + + /* + * Now that we know what level the root is at, do we need a + * write lock? If not and we got the lock before latching + * we are done. + */ + if (LEVEL(h) != LEAFLEVEL || LF_ISSET(SR_WRITE)) { + lock_mode = DB_LOCK_WRITE; + /* Drop the read lock if we got it above. */ + if ((ret = __LPUT(dbc, lock)) != 0) + return (ret); + } else if (LOCK_ISSET(lock)) + goto done; + if (!STD_LOCKING(dbc)) { + if (lock_mode != DB_LOCK_WRITE) + goto done; + if ((ret = __memp_dirty(mpf, &h, dbc->thread_info, + dbc->txn, dbc->priority, 0)) != 0) { + if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + return (ret); + } + } else { + /* Try to lock the page without waiting first. */ + if ((ret = __db_lget(dbc, + 0, pg, lock_mode, DB_LOCK_NOWAIT, &lock)) == 0) { + if (lock_mode == DB_LOCK_WRITE && (ret = + __memp_dirty(mpf, &h, dbc->thread_info, + dbc->txn, dbc->priority, 0)) != 0) { + if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, + dbc->priority); + return (ret); + } + goto done; + } + + t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + + if (ret == DB_LOCK_DEADLOCK || + ret == DB_LOCK_NOTGRANTED) + ret = 0; + if (ret == 0) + ret = t_ret; + + if (ret != 0) + return (ret); + + if ((ret = __db_lget(dbc, + 0, pg, lock_mode, 0, &lock)) != 0) + return (ret); + if ((ret = __memp_fget(mpf, + &pg, dbc->thread_info, dbc->txn, + lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0, + &h)) != 0) { + /* Did not read it, release the lock */ + (void)__LPUT(dbc, lock); + return (ret); + } + } + /* + * While getting dirty or locked we need to drop the mutex + * so someone else could get in and split the root. + */ + if (!((LF_ISSET(SR_PARENT) && + (u_int8_t)(slevel + 1) >= LEVEL(h)) || + LEVEL(h) == LEAFLEVEL || + (LF_ISSET(SR_START) && slevel == LEVEL(h)))) { + /* Someone else split the root, start over. */ + ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + goto try_again; + } + } -#include <db.h> -#include "btree.h" +done: BT_STK_ENTER(dbp->env, cp, h, 0, lock, lock_mode, ret); -static int __bt_snext __P((BTREE *, PAGE *, const DBT *, int *)); -static int __bt_sprev __P((BTREE *, PAGE *, const DBT *, int *)); + return (ret); +} /* - * __bt_search -- + * __bam_search -- * Search a btree for a key. * - * Parameters: - * t: tree to search - * key: key to find - * exactp: pointer to exact match flag - * - * Returns: - * The EPG for matching record, if any, or the EPG for the location - * of the key, if it were inserted into the tree, is entered into - * the bt_cur field of the tree. A pointer to the field is returned. 
+ * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
+ * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
 */
-EPG *
-__bt_search(t, key, exactp)
-	BTREE *t;
+int
+__bam_search(dbc, root_pgno, key, flags, slevel, recnop, exactp)
+	DBC *dbc;
+	db_pgno_t root_pgno;
 	const DBT *key;
-	int *exactp;
+	u_int32_t flags;
+	int slevel, *exactp;
+	db_recno_t *recnop;
 {
-	PAGE *h;
-	indx_t base, index, lim;
-	pgno_t pg;
-	int cmp;
-
-	BT_CLR(t);
-	for (pg = P_ROOT;;) {
-		if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL)
-			return (NULL);
-
-		/* Do a binary search on the current page. */
-		t->bt_cur.page = h;
-		for (base = 0, lim = NEXTINDEX(h); lim; lim >>= 1) {
-			t->bt_cur.index = index = base + (lim >> 1);
-			if ((cmp = __bt_cmp(t, key, &t->bt_cur)) == 0) {
-				if (h->flags & P_BLEAF) {
-					*exactp = 1;
-					return (&t->bt_cur);
+	BTREE *t;
+	BTREE_CURSOR *cp;
+	DB *dbp;
+	DB_LOCK lock, saved_lock;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h, *parent_h;
+	db_indx_t base, i, indx, *inp, lim;
+	db_lockmode_t lock_mode;
+	db_pgno_t pg, saved_pg;
+	db_recno_t recno;
+	int adjust, cmp, deloffset, ret, set_stack, stack, t_ret;
+	int getlock, was_next;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+	u_int32_t get_mode, wait;
+	u_int8_t level, saved_level;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	mpf = dbp->mpf;
+	cp = (BTREE_CURSOR *)dbc->internal;
+	h = NULL;
+	parent_h = NULL;
+	t = dbp->bt_internal;
+	recno = 0;
+	t_ret = 0;
+
+	BT_STK_CLR(cp);
+	LOCK_INIT(saved_lock);
+	LOCK_INIT(lock);
+	was_next = LF_ISSET(SR_NEXT);
+	wait = DB_LOCK_NOWAIT;
+
+	/*
+	 * There are several ways we search a btree.  The flags argument
+	 * specifies if we're acquiring read or write latches, if we position
+	 * to the first or last item in a set of duplicates, if we return
+	 * deleted items, and if we are latching pairs of pages.  In addition,
+	 * if we're modifying record numbers, we have to latch the entire tree
+	 * regardless.  See btree.h for more details.
+	 */
+
+	if (root_pgno == PGNO_INVALID)
+		root_pgno = cp->root;
+	saved_pg = root_pgno;
+	saved_level = MAXBTREELEVEL;
+retry:	if ((ret = __bam_get_root(dbc, root_pgno, slevel, flags, &stack)) != 0)
+		goto err;
+	lock_mode = cp->csp->lock_mode;
+	get_mode = lock_mode == DB_LOCK_WRITE ? DB_MPOOL_DIRTY : 0;
+	h = cp->csp->page;
+	pg = PGNO(h);
+	lock = cp->csp->lock;
+	set_stack = stack;
+	/*
+	 * Determine if we need to lock interior nodes.
+	 * If we have record numbers we always lock.  Otherwise we only
+	 * need to do this if we are write locking and we are returning
+	 * a stack of nodes.  SR_NEXT will eventually get a stack and
+	 * release the locks above that level.
+	 */
+	if (F_ISSET(dbc, DBC_DOWNREV)) {
+		getlock = 1;
+		wait = 0;
+	} else
+		getlock = F_ISSET(cp, C_RECNUM) ||
+		    (lock_mode == DB_LOCK_WRITE &&
+		    (stack || LF_ISSET(SR_NEXT | SR_DEL)));
+
+	/*
+	 * If we are asked for a level that is above the root,
+	 * just return the root.  This can happen if the tree
+	 * collapses while we are trying to lock the root.
+	 */
+	if (!LF_ISSET(SR_START) && LEVEL(h) < slevel)
+		goto done;
+
+	BT_STK_CLR(cp);
+
+	/* Choose a comparison function. */
+	func = F_ISSET(dbc, DBC_OPD) ?
+	    (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
+	    t->bt_compare;
+
+	for (;;) {
+		if (TYPE(h) == P_LBTREE)
+			adjust = P_INDX;
+		else {
+			/*
+			 * It is possible to catch an internal page as a change
+			 * is being backed out.  Its leaf pages will be locked
+			 * but we must be sure we get to one.  If the page
+			 * is not populated enough, lock it.
+ */ + if (TYPE(h) != P_LDUP && NUM_ENT(h) == 0) { + getlock = 1; + level = LEVEL(h) + 1; + if ((ret = __memp_fput(mpf, dbc->thread_info, + h, dbc->priority)) != 0) + goto err; + goto lock_next; + } + adjust = O_INDX; + } + inp = P_INP(dbp, h); + if (LF_ISSET(SR_MIN | SR_MAX)) { + if (LF_ISSET(SR_MIN) || NUM_ENT(h) == 0) + indx = 0; + else if (TYPE(h) == P_LBTREE) + indx = NUM_ENT(h) - 2; + else + indx = NUM_ENT(h) - 1; + + if (LEVEL(h) == LEAFLEVEL || + (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) { + if (LF_ISSET(SR_NEXT)) + goto get_next; + goto found; + } + goto next; + } + /* + * Do a binary search on the current page. If we're searching + * a Btree leaf page, we have to walk the indices in groups of + * two. If we're searching an internal page or a off-page dup + * page, they're an index per page item. If we find an exact + * match on a leaf page, we're done. + */ + DB_BINARY_SEARCH_FOR(base, lim, NUM_ENT(h), adjust) { + DB_BINARY_SEARCH_INCR(indx, base, lim, adjust); + if ((ret = __bam_cmp(dbc, key, h, indx, + func, &cmp)) != 0) + goto err; + if (cmp == 0) { + if (LEVEL(h) == LEAFLEVEL || + (!LF_ISSET(SR_START) && + LEVEL(h) == slevel)) { + if (LF_ISSET(SR_NEXT)) + goto get_next; + goto found; } goto next; } - if (cmp > 0) { - base = index + 1; - --lim; + if (cmp > 0) + DB_BINARY_SEARCH_SHIFT_BASE(indx, base, + lim, adjust); + } + + /* + * No match found. Base is the smallest index greater than + * key and may be zero or a last + O_INDX index. + * + * If it's a leaf page or the stopping point, + * return base as the "found" value. + * Delete only deletes exact matches. + */ + if (LEVEL(h) == LEAFLEVEL || + (!LF_ISSET(SR_START) && LEVEL(h) == slevel)) { + *exactp = 0; + + if (LF_ISSET(SR_EXACT)) { + ret = DB_NOTFOUND; + goto err; } + + if (LF_ISSET(SR_STK_ONLY)) { + BT_STK_NUM(env, cp, h, base, ret); + if ((t_ret = + __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + h = NULL; + if (ret != 0) + goto err; + goto done; + } + if (LF_ISSET(SR_NEXT)) { +get_next: /* + * The caller could have asked for a NEXT + * at the root if the tree recently collapsed. + */ + if (PGNO(h) == root_pgno) { + ret = DB_NOTFOUND; + goto err; + } + + indx = cp->sp->indx + 1; + if (indx == NUM_ENT(cp->sp->page)) { + ret = DB_NOTFOUND; + cp->csp++; + goto err; + } + /* + * If we want both the key page and the next + * page, push the key page on the stack + * otherwise save the root of the subtree + * and drop the rest of the subtree. + * Search down again starting at the + * next child of the root of this subtree. + */ + LF_SET(SR_MIN); + LF_CLR(SR_NEXT); + set_stack = stack = 1; + if (LF_ISSET(SR_BOTH)) { + cp->csp++; + BT_STK_PUSH(env, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + LOCK_INIT(lock); + h = cp->sp->page; + pg = GET_BINTERNAL(dbp, h, indx)->pgno; + level = LEVEL(h); + h = NULL; + goto lock_next; + } else { + if ((ret = __LPUT(dbc, lock)) != 0) + goto err; + if ((ret = __memp_fput(mpf, + dbc->thread_info, + h, dbc->priority)) != 0) + goto err; + h = cp->sp->page; + cp->sp->page = NULL; + lock = cp->sp->lock; + LOCK_INIT(cp->sp->lock); + if ((ret = __bam_stkrel(dbc, + STK_NOLOCK)) != 0) + goto err; + goto next; + } + } + + /* + * !!! + * Possibly returning a deleted record -- DB_SET_RANGE, + * DB_KEYFIRST and DB_KEYLAST don't require an exact + * match, and we don't want to walk multiple pages here + * to find an undeleted record. 
This is handled by the + * calling routine. + */ + if (LF_ISSET(SR_DEL) && cp->csp == cp->sp) + cp->csp++; + BT_STK_ENTER(env, cp, h, base, lock, lock_mode, ret); + if (ret != 0) + goto err; + goto done; } /* - * If it's a leaf page, we're almost done. If no duplicates - * are allowed, or we have an exact match, we're done. Else, - * it's possible that there were matching keys on this page, - * which later deleted, and we're on a page with no matches - * while there are matches on other pages. If at the start or - * end of a page, check the adjacent page. + * If it's not a leaf page, record the internal page (which is + * a parent page for the key). Decrement the base by 1 if it's + * non-zero so that if a split later occurs, the inserted page + * will be to the right of the saved page. + */ + indx = base > 0 ? base - O_INDX : base; + + /* + * If we're trying to calculate the record number, sum up + * all the record numbers on this page up to the indx point. */ - if (h->flags & P_BLEAF) { - if (!F_ISSET(t, B_NODUPS)) { - if (base == 0 && - h->prevpg != P_INVALID && - __bt_sprev(t, h, key, exactp)) - return (&t->bt_cur); - if (base == NEXTINDEX(h) && - h->nextpg != P_INVALID && - __bt_snext(t, h, key, exactp)) - return (&t->bt_cur); +next: if (recnop != NULL) + for (i = 0; i < indx; ++i) + recno += GET_BINTERNAL(dbp, h, i)->nrecs; + + pg = GET_BINTERNAL(dbp, h, indx)->pgno; + level = LEVEL(h); + + /* See if we are at the level to start stacking. */ + if (LF_ISSET(SR_START) && slevel == level) + set_stack = stack = 1; + + if (LF_ISSET(SR_STK_ONLY)) { + if (slevel == LEVEL(h)) { + BT_STK_NUM(env, cp, h, indx, ret); + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + h = NULL; + if (ret != 0) + goto err; + goto done; } - *exactp = 0; - t->bt_cur.index = base; - return (&t->bt_cur); + BT_STK_NUMPUSH(env, cp, h, indx, ret); + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + h = NULL; + } else if (stack) { + /* Return if this is the lowest page wanted. */ + if (LF_ISSET(SR_PARENT) && slevel == level) { + BT_STK_ENTER(env, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + goto done; + } + if (LF_ISSET(SR_DEL) && NUM_ENT(h) > 1) { + /* + * There was a page with a singleton pointer + * to a non-empty subtree. + */ + cp->csp--; + if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0) + goto err; + set_stack = stack = 0; + goto do_del; + } + BT_STK_PUSH(env, + cp, h, indx, lock, lock_mode, ret); + if (ret != 0) + goto err; + + LOCK_INIT(lock); + get_mode = DB_MPOOL_DIRTY; + lock_mode = DB_LOCK_WRITE; + goto lock_next; + } else { + /* + * Decide if we want to return a reference to the next + * page in the return stack. If so, latch it and don't + * unlatch it. We will want to stack things on the + * next iteration. The stack variable cannot be + * set until we leave this clause. If we are locking + * then we must lock this level before getting the page. + */ + if ((LF_ISSET(SR_PARENT) && + (u_int8_t)(slevel + 1) >= (level - 1)) || + (level - 1) == LEAFLEVEL) + set_stack = 1; + + /* + * Check for a normal search. If so, we need to + * latch couple the parent/chid buffers. + */ + if (!LF_ISSET(SR_DEL | SR_NEXT)) { + parent_h = h; + goto lock_next; + } + + /* + * Returning a subtree. See if we have hit the start + * point if so save the parent and set stack. + * Otherwise free the parent and temporarily + * save this one. + * For SR_DEL we need to find a page with 1 entry. 
+			 * For SR_NEXT we want to find the minimal subtree
+			 * that contains the key and the next page.
+			 * We save pages as long as we are at the right
+			 * edge of the subtree.  When we leave the right
+			 * edge, then drop the subtree.
+			 */
+
+			if ((LF_ISSET(SR_DEL) && NUM_ENT(h) == 1)) {
+				/*
+				 * We are pushing things onto the stack;
+				 * set the stack variable now to indicate this
+				 * has happened.
+				 */
+				stack = set_stack = 1;
+				LF_SET(SR_WRITE);
+				/* Push the parent. */
+				cp->csp++;
+				/* Push this node. */
+				BT_STK_PUSH(env, cp, h,
+				    indx, lock, DB_LOCK_NG, ret);
+				if (ret != 0)
+					goto err;
+				LOCK_INIT(lock);
+			} else {
+				/*
+				 * See if we want to save the tree so far.
+				 * If we are looking for the next key,
+				 * then we must save this node if we are
+				 * at the end of the page.  If not, then
+				 * discard anything we have saved so far.
+				 * For delete, only keep one node until
+				 * we find a singleton.
+				 */
+do_del:				if (cp->csp->page != NULL) {
+					if (LF_ISSET(SR_NEXT) &&
+					    indx == NUM_ENT(h) - 1)
+						cp->csp++;
+					else if ((ret =
+					    __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+						goto err;
+				}
+				/* Save this node. */
+				BT_STK_ENTER(env, cp,
+				    h, indx, lock, lock_mode, ret);
+				if (ret != 0)
+					goto err;
+				LOCK_INIT(lock);
+			}
+
+lock_next:		h = NULL;
+
+			if (set_stack && LF_ISSET(SR_WRITE)) {
+				lock_mode = DB_LOCK_WRITE;
+				get_mode = DB_MPOOL_DIRTY;
+				getlock = 1;
+			}
+			/*
+			 * If we are retrying and we are back at the same
+			 * page then we already have it locked.  If we are
+			 * at a different page we want to lock couple and
+			 * release that lock.
+			 */
+			if (level - 1 == saved_level) {
+				if ((ret = __LPUT(dbc, lock)) != 0)
+					goto err;
+				lock = saved_lock;
+				LOCK_INIT(saved_lock);
+				saved_level = MAXBTREELEVEL;
+				if (pg == saved_pg)
+					goto skip_lock;
+			}
+			if ((getlock || level - 1 == LEAFLEVEL) &&
+			    (ret = __db_lget(dbc, LCK_COUPLE_ALWAYS,
+			    pg, lock_mode, wait, &lock)) != 0) {
+				/*
+				 * If we are doing DEL or NEXT then we
+				 * have an extra level saved in the stack,
+				 * push it so it will get freed.
+				 */
+				if (LF_ISSET(SR_DEL | SR_NEXT) && !stack)
+					cp->csp++;
+				/*
+				 * If we fail, discard the lock we held.
+				 * This is OK because we will either search
+				 * again or exit without actually looking
+				 * at the data.
+				 */
+				if ((t_ret = __LPUT(dbc, lock)) != 0 &&
+				    ret == 0)
+					ret = t_ret;
+				/*
+				 * If we blocked at a different level, release
+				 * the previous saved lock.
+				 */
+				if ((t_ret = __LPUT(dbc, saved_lock)) != 0 &&
+				    ret == 0)
+					ret = t_ret;
+				if (wait == 0 || (ret != DB_LOCK_NOTGRANTED &&
+				    ret != DB_LOCK_DEADLOCK))
+					goto err;
+
+				/* Release the parent if we are holding it. */
+				if (parent_h != NULL &&
+				    (ret = __memp_fput(mpf, dbc->thread_info,
+				    parent_h, dbc->priority)) != 0)
+					goto err;
+				parent_h = NULL;
+
+				BT_STK_POP(cp);
+				if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0)
+					goto err;
+				if ((ret = __db_lget(dbc,
+				    0, pg, lock_mode, 0, &saved_lock)) != 0)
+					goto err;
+				/*
+				 * A very strange case: if this page was
+				 * freed while we waited, then we cannot hold
+				 * the lock on it while we reget the root
+				 * latch, because allocation is one place
+				 * we lock while holding a latch.
+				 * No one can have a free page locked, so
+				 * check for that case.  We do this by
+				 * checking the level, since it will be 0
+				 * if free, and we might as well see if this
+				 * page moved and drop the lock in that case.
+ */ + if ((ret = __memp_fget(mpf, &pg, + dbc->thread_info, + dbc->txn, get_mode, &h)) != 0 && + ret != DB_PAGE_NOTFOUND) + goto err; + + if (ret != 0 || LEVEL(h) != level - 1) { + ret = __LPUT(dbc, saved_lock); + if (ret != 0) + goto err; + pg = root_pgno; + saved_level = MAXBTREELEVEL; + } + if (h != NULL && (ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + h = NULL; + + if (was_next) { + LF_CLR(SR_MIN); + LF_SET(SR_NEXT); + } + /* + * We have the lock but we dropped the + * latch so we need to search again. If + * we get back to the same page then all + * is good, otherwise we need to try to + * lock the new page. + */ + saved_pg = pg; + saved_level = level - 1; + goto retry; + } +skip_lock: stack = set_stack; } + /* Get the child page. */ + if ((ret = __memp_fget(mpf, &pg, + dbc->thread_info, dbc->txn, get_mode, &h)) != 0) + goto err; + /* Release the parent. */ + if (parent_h != NULL && (ret = __memp_fput(mpf, + dbc->thread_info, parent_h, dbc->priority)) != 0) + goto err; + parent_h = NULL; + } + /* NOTREACHED */ + +found: *exactp = 1; + + /* + * If we got here, we know that we have a Btree leaf or off-page + * duplicates page. If it's a Btree leaf page, we have to handle + * on-page duplicates. + * + * If there are duplicates, go to the first/last one. This is + * safe because we know that we're not going to leave the page, + * all duplicate sets that are not on overflow pages exist on a + * single leaf page. + */ + if (TYPE(h) == P_LBTREE && NUM_ENT(h) > P_INDX) { + if (LF_ISSET(SR_DUPLAST)) + while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && + inp[indx] == inp[indx + P_INDX]) + indx += P_INDX; + else if (LF_ISSET(SR_DUPFIRST)) + while (indx > 0 && + inp[indx] == inp[indx - P_INDX]) + indx -= P_INDX; + } + + /* + * Now check if we are allowed to return deleted items; if not, then + * find the next (or previous) non-deleted duplicate entry. (We do + * not move from the original found key on the basis of the SR_DELNO + * flag.) + */ + DB_ASSERT(env, recnop == NULL || LF_ISSET(SR_DELNO)); + if (LF_ISSET(SR_DELNO)) { + deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0; + if (LF_ISSET(SR_DUPLAST)) + while (B_DISSET(GET_BKEYDATA(dbp, + h, indx + deloffset)->type) && indx > 0 && + inp[indx] == inp[indx - adjust]) + indx -= adjust; + else + while (B_DISSET(GET_BKEYDATA(dbp, + h, indx + deloffset)->type) && + indx < (db_indx_t)(NUM_ENT(h) - adjust) && + inp[indx] == inp[indx + adjust]) + indx += adjust; /* - * No match found. Base is the smallest index greater than - * key and may be zero or a last + 1 index. If it's non-zero, - * decrement by one, and record the internal page which should - * be a parent page for the key. If a split later occurs, the - * inserted page will be to the right of the saved page. + * If we weren't able to find a non-deleted duplicate, return + * DB_NOTFOUND. + */ + if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) { + ret = DB_NOTFOUND; + goto err; + } + + /* + * Increment the record counter to point to the found element. + * Ignore any deleted key/data pairs. There doesn't need to + * be any correction for duplicates, as Btree doesn't support + * duplicates and record numbers in the same tree. */ - index = base ? 
base - 1 : base; + if (recnop != NULL) { + DB_ASSERT(env, TYPE(h) == P_LBTREE); -next: BT_PUSH(t, h->pgno, index); - pg = GETBINTERNAL(h, index)->pgno; - mpool_put(t->bt_mp, h, 0); + for (i = 0; i < indx; i += P_INDX) + if (!B_DISSET( + GET_BKEYDATA(dbp, h, i + O_INDX)->type)) + ++recno; + + /* Correct the number for a 0-base. */ + *recnop = recno + 1; + } } + + if (LF_ISSET(SR_STK_ONLY)) { + BT_STK_NUM(env, cp, h, indx, ret); + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + } else { + if (LF_ISSET(SR_DEL) && cp->csp == cp->sp) + cp->csp++; + BT_STK_ENTER(env, cp, h, indx, lock, lock_mode, ret); + } + if (ret != 0) + goto err; + + cp->csp->lock = lock; + DB_ASSERT(env, parent_h == NULL); + +done: if ((ret = __LPUT(dbc, saved_lock)) != 0) + return (ret); + + return (0); + +err: if (ret == 0) + ret = t_ret; + if (h != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if (parent_h != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, parent_h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + /* Keep any not-found page locked for serializability. */ + if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + + (void)__LPUT(dbc, saved_lock); + + BT_STK_POP(cp); + (void)__bam_stkrel(dbc, 0); + + return (ret); } /* - * __bt_snext -- - * Check for an exact match after the key. - * - * Parameters: - * t: tree - * h: current page - * key: key - * exactp: pointer to exact match flag + * __bam_stkrel -- + * Release all pages currently held in the stack. * - * Returns: - * If an exact match found. + * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t)); */ -static int -__bt_snext(t, h, key, exactp) - BTREE *t; - PAGE *h; - const DBT *key; - int *exactp; +int +__bam_stkrel(dbc, flags) + DBC *dbc; + u_int32_t flags; { - EPG e; + BTREE_CURSOR *cp; + DB *dbp; + DB_MPOOLFILE *mpf; + EPG *epg; + int ret, t_ret; + + DB_ASSERT(NULL, dbc != NULL); + dbp = dbc->dbp; + mpf = dbp->mpf; + cp = (BTREE_CURSOR *)dbc->internal; /* - * Get the next page. The key is either an exact - * match, or not as good as the one we already have. + * Release inner pages first. + * + * The caller must be sure that setting STK_NOLOCK will not affect + * either serializability or recoverability. */ - if ((e.page = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) - return (0); - e.index = 0; - if (__bt_cmp(t, key, &e) == 0) { - mpool_put(t->bt_mp, h, 0); - t->bt_cur = e; - *exactp = 1; - return (1); + for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) { + if (epg->page != NULL) { + if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) { + cp->page = NULL; + LOCK_INIT(cp->lock); + } + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + epg->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + epg->page = NULL; + } + /* + * We set this if we need to release our pins, + * but are not logically ready to have the pages + * visible. + */ + if (LF_ISSET(STK_PGONLY)) + continue; + if (LF_ISSET(STK_NOLOCK)) { + if ((t_ret = __LPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; + } else + if ((t_ret = __TLPUT(dbc, epg->lock)) != 0 && ret == 0) + ret = t_ret; } - mpool_put(t->bt_mp, e.page, 0); - return (0); + + /* Clear the stack; all pages have been released. */ + if (!LF_ISSET(STK_PGONLY)) + BT_STK_CLR(cp); + + return (ret); } /* - * __bt_sprev -- - * Check for an exact match before the key.
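The loop in __bam_stkrel above is the general stack-unwinding pattern: attempt to release every pinned page and every lock even after a failure, and report the first error seen. Condensed, with hypothetical entry and callback types:

    #include <stddef.h>

    struct stack_entry { void *page; void *lock; };

    /* Release pages and locks for every entry in [sp, csp]; keep going
     * past failures and return the first nonzero status. */
    static int
    stack_release(struct stack_entry *sp, struct stack_entry *csp,
        int (*put_page)(void *), int (*put_lock)(void *))
    {
            struct stack_entry *e;
            int ret, t_ret;

            for (ret = 0, e = sp; e <= csp; ++e) {
                    if (e->page != NULL) {
                            if ((t_ret = put_page(e->page)) != 0 && ret == 0)
                                    ret = t_ret;
                            e->page = NULL;
                    }
                    if ((t_ret = put_lock(e->lock)) != 0 && ret == 0)
                            ret = t_ret;
            }
            return (ret);
    }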
- * - * Parameters: - * t: tree - * h: current page - * key: key - * exactp: pointer to exact match flag + * __bam_stkgrow -- + * Grow the stack. * - * Returns: - * If an exact match found. + * PUBLIC: int __bam_stkgrow __P((ENV *, BTREE_CURSOR *)); */ -static int -__bt_sprev(t, h, key, exactp) - BTREE *t; - PAGE *h; - const DBT *key; - int *exactp; +int +__bam_stkgrow(env, cp) + ENV *env; + BTREE_CURSOR *cp; { - EPG e; + EPG *p; + size_t entries; + int ret; - /* - * Get the previous page. The key is either an exact - * match, or not as good as the one we already have. - */ - if ((e.page = mpool_get(t->bt_mp, h->prevpg, 0)) == NULL) - return (0); - e.index = NEXTINDEX(e.page) - 1; - if (__bt_cmp(t, key, &e) == 0) { - mpool_put(t->bt_mp, h, 0); - t->bt_cur = e; - *exactp = 1; - return (1); - } - mpool_put(t->bt_mp, e.page, 0); + entries = cp->esp - cp->sp; + + if ((ret = __os_calloc(env, entries * 2, sizeof(EPG), &p)) != 0) + return (ret); + memcpy(p, cp->sp, entries * sizeof(EPG)); + if (cp->sp != cp->stack) + __os_free(env, cp->sp); + cp->sp = p; + cp->csp = p + entries; + cp->esp = p + entries * 2; return (0); } diff --git a/btree/bt_seq.c b/btree/bt_seq.c deleted file mode 100644 index 303b481..0000000 --- a/btree/bt_seq.c +++ /dev/null @@ -1,460 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
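__bam_stkgrow, shown above, grows the cursor stack geometrically, so a search that descends N levels reallocates only O(log N) times. The same doubling strategy in isolation (a sketch with a hypothetical signature; the real routine additionally avoids freeing the cursor's initial, inline array):

    #include <stdlib.h>
    #include <string.h>

    /* Double a zero-initialized array of *nelemp elements of eltsize bytes;
     * on success, *basep and *nelemp describe the larger array. */
    static int
    grow(void **basep, size_t *nelemp, size_t eltsize)
    {
            void *p;

            if ((p = calloc(*nelemp * 2, eltsize)) == NULL)
                    return (-1);
            memcpy(p, *basep, *nelemp * eltsize);
            free(*basep);
            *basep = p;
            *nelemp *= 2;
            return (0);
    }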
- */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_seq.c 8.7 (Berkeley) 7/20/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/types.h> - -#include <errno.h> -#include <stddef.h> -#include <stdio.h> -#include <stdlib.h> - -#include <db.h> -#include "btree.h" - -static int __bt_first __P((BTREE *, const DBT *, EPG *, int *)); -static int __bt_seqadv __P((BTREE *, EPG *, int)); -static int __bt_seqset __P((BTREE *, EPG *, DBT *, int)); - -/* - * Sequential scan support. - * - * The tree can be scanned sequentially, starting from either end of the - * tree or from any specific key. A scan request before any scanning is - * done is initialized as starting from the least node. - */ - -/* - * __bt_seq -- - * Btree sequential scan interface. - * - * Parameters: - * dbp: pointer to access method - * key: key for positioning and return value - * data: data return value - * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV. - * - * Returns: - * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. - */ -int -__bt_seq(dbp, key, data, flags) - const DB *dbp; - DBT *key, *data; - u_int flags; -{ - BTREE *t; - EPG e; - int status; - - t = dbp->internal; - - /* Toss any page pinned across calls. */ - if (t->bt_pinned != NULL) { - mpool_put(t->bt_mp, t->bt_pinned, 0); - t->bt_pinned = NULL; - } - - /* - * If scan unitialized as yet, or starting at a specific record, set - * the scan to a specific key. Both __bt_seqset and __bt_seqadv pin - * the page the cursor references if they're successful. - */ - switch (flags) { - case R_NEXT: - case R_PREV: - if (F_ISSET(&t->bt_cursor, CURS_INIT)) { - status = __bt_seqadv(t, &e, flags); - break; - } - /* FALLTHROUGH */ - case R_FIRST: - case R_LAST: - case R_CURSOR: - status = __bt_seqset(t, &e, key, flags); - break; - default: - errno = EINVAL; - return (RET_ERROR); - } - - if (status == RET_SUCCESS) { - __bt_setcur(t, e.page->pgno, e.index); - - status = - __bt_ret(t, &e, key, &t->bt_rkey, data, &t->bt_rdata, 0); - - /* - * If the user is doing concurrent access, we copied the - * key/data, toss the page. - */ - if (F_ISSET(t, B_DB_LOCK)) - mpool_put(t->bt_mp, e.page, 0); - else - t->bt_pinned = e.page; - } - return (status); -} - -/* - * __bt_seqset -- - * Set the sequential scan to a specific key. - * - * Parameters: - * t: tree - * ep: storage for returned key - * key: key for initial scan position - * flags: R_CURSOR, R_FIRST, R_LAST, R_NEXT, R_PREV - * - * Side effects: - * Pins the page the cursor references. - * - * Returns: - * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. - */ -static int -__bt_seqset(t, ep, key, flags) - BTREE *t; - EPG *ep; - DBT *key; - int flags; -{ - PAGE *h; - pgno_t pg; - int exact; - - /* - * Find the first, last or specific key in the tree and point the - * cursor at it. The cursor may not be moved until a new key has - * been found. - */ - switch (flags) { - case R_CURSOR: /* Keyed scan. */ - /* - * Find the first instance of the key or the smallest key - * which is greater than or equal to the specified key. - */ - if (key->data == NULL || key->size == 0) { - errno = EINVAL; - return (RET_ERROR); - } - return (__bt_first(t, key, ep, &exact)); - case R_FIRST: /* First record. */ - case R_NEXT: - /* Walk down the left-hand side of the tree. */ - for (pg = P_ROOT;;) { - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - - /* Check for an empty tree. 
*/ - if (NEXTINDEX(h) == 0) { - mpool_put(t->bt_mp, h, 0); - return (RET_SPECIAL); - } - - if (h->flags & (P_BLEAF | P_RLEAF)) - break; - pg = GETBINTERNAL(h, 0)->pgno; - mpool_put(t->bt_mp, h, 0); - } - ep->page = h; - ep->index = 0; - break; - case R_LAST: /* Last record. */ - case R_PREV: - /* Walk down the right-hand side of the tree. */ - for (pg = P_ROOT;;) { - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - - /* Check for an empty tree. */ - if (NEXTINDEX(h) == 0) { - mpool_put(t->bt_mp, h, 0); - return (RET_SPECIAL); - } - - if (h->flags & (P_BLEAF | P_RLEAF)) - break; - pg = GETBINTERNAL(h, NEXTINDEX(h) - 1)->pgno; - mpool_put(t->bt_mp, h, 0); - } - - ep->page = h; - ep->index = NEXTINDEX(h) - 1; - break; - } - return (RET_SUCCESS); -} - -/* - * __bt_seqadvance -- - * Advance the sequential scan. - * - * Parameters: - * t: tree - * flags: R_NEXT, R_PREV - * - * Side effects: - * Pins the page the new key/data record is on. - * - * Returns: - * RET_ERROR, RET_SUCCESS or RET_SPECIAL if there's no next key. - */ -static int -__bt_seqadv(t, ep, flags) - BTREE *t; - EPG *ep; - int flags; -{ - CURSOR *c; - PAGE *h; - indx_t index; - pgno_t pg; - int exact; - - /* - * There are a couple of states that we can be in. The cursor has - * been initialized by the time we get here, but that's all we know. - */ - c = &t->bt_cursor; - - /* - * The cursor was deleted where there weren't any duplicate records, - * so the key was saved. Find out where that key would go in the - * current tree. It doesn't matter if the returned key is an exact - * match or not -- if it's an exact match, the record was added after - * the delete so we can just return it. If not, as long as there's - * a record there, return it. - */ - if (F_ISSET(c, CURS_ACQUIRE)) - return (__bt_first(t, &c->key, ep, &exact)); - - /* Get the page referenced by the cursor. */ - if ((h = mpool_get(t->bt_mp, c->pg.pgno, 0)) == NULL) - return (RET_ERROR); - - /* - * Find the next/previous record in the tree and point the cursor at - * it. The cursor may not be moved until a new key has been found. - */ - switch (flags) { - case R_NEXT: /* Next record. */ - /* - * The cursor was deleted in duplicate records, and moved - * forward to a record that has yet to be returned. Clear - * that flag, and return the record. - */ - if (F_ISSET(c, CURS_AFTER)) - goto usecurrent; - index = c->pg.index; - if (++index == NEXTINDEX(h)) { - pg = h->nextpg; - mpool_put(t->bt_mp, h, 0); - if (pg == P_INVALID) - return (RET_SPECIAL); - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - index = 0; - } - break; - case R_PREV: /* Previous record. */ - /* - * The cursor was deleted in duplicate records, and moved - * backward to a record that has yet to be returned. Clear - * that flag, and return the record. - */ - if (F_ISSET(c, CURS_BEFORE)) { -usecurrent: F_CLR(c, CURS_AFTER | CURS_BEFORE); - ep->page = h; - ep->index = c->pg.index; - return (RET_SUCCESS); - } - index = c->pg.index; - if (index == 0) { - pg = h->prevpg; - mpool_put(t->bt_mp, h, 0); - if (pg == P_INVALID) - return (RET_SPECIAL); - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - index = NEXTINDEX(h) - 1; - } else - --index; - break; - } - - ep->page = h; - ep->index = index; - return (RET_SUCCESS); -} - -/* - * __bt_first -- - * Find the first entry. 
- * - * Parameters: - * t: the tree - * key: the key - * erval: return EPG - * exactp: pointer to exact match flag - * - * Returns: - * The first entry in the tree greater than or equal to key, - * or RET_SPECIAL if no such key exists. - */ -static int -__bt_first(t, key, erval, exactp) - BTREE *t; - const DBT *key; - EPG *erval; - int *exactp; -{ - PAGE *h; - EPG *ep, save; - pgno_t pg; - - /* - * Find any matching record; __bt_search pins the page. - * - * If it's an exact match and duplicates are possible, walk backwards - * in the tree until we find the first one. Otherwise, make sure it's - * a valid key (__bt_search may return an index just past the end of a - * page) and return it. - */ - if ((ep = __bt_search(t, key, exactp)) == NULL) - return (NULL); - if (*exactp) { - if (F_ISSET(t, B_NODUPS)) { - *erval = *ep; - return (RET_SUCCESS); - } - - /* - * Walk backwards, as long as the entry matches and there are - * keys left in the tree. Save a copy of each match in case - * we go too far. - */ - save = *ep; - h = ep->page; - do { - if (save.page->pgno != ep->page->pgno) { - mpool_put(t->bt_mp, save.page, 0); - save = *ep; - } else - save.index = ep->index; - - /* - * Don't unpin the page the last (or original) match - * was on, but make sure it's unpinned if an error - * occurs. - */ - if (ep->index == 0) { - if (h->prevpg == P_INVALID) - break; - if (h->pgno != save.page->pgno) - mpool_put(t->bt_mp, h, 0); - if ((h = mpool_get(t->bt_mp, - h->prevpg, 0)) == NULL) { - if (h->pgno == save.page->pgno) - mpool_put(t->bt_mp, - save.page, 0); - return (RET_ERROR); - } - ep->page = h; - ep->index = NEXTINDEX(h); - } - --ep->index; - } while (__bt_cmp(t, key, ep) == 0); - - /* - * Reach here with the last page that was looked at pinned, - * which may or may not be the same as the last (or original) - * match page. If it's not useful, release it. - */ - if (h->pgno != save.page->pgno) - mpool_put(t->bt_mp, h, 0); - - *erval = save; - return (RET_SUCCESS); - } - - /* If at the end of a page, find the next entry. */ - if (ep->index == NEXTINDEX(ep->page)) { - h = ep->page; - pg = h->nextpg; - mpool_put(t->bt_mp, h, 0); - if (pg == P_INVALID) - return (RET_SPECIAL); - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - ep->index = 0; - ep->page = h; - } - *erval = *ep; - return (RET_SUCCESS); -} - -/* - * __bt_setcur -- - * Set the cursor to an entry in the tree. - * - * Parameters: - * t: the tree - * pgno: page number - * index: page index - */ -void -__bt_setcur(t, pgno, index) - BTREE *t; - pgno_t pgno; - u_int index; -{ - /* Lose any already deleted key. */ - if (t->bt_cursor.key.data != NULL) { - free(t->bt_cursor.key.data); - t->bt_cursor.key.size = 0; - t->bt_cursor.key.data = NULL; - } - F_CLR(&t->bt_cursor, CURS_ACQUIRE | CURS_AFTER | CURS_BEFORE); - - /* Update the cursor. */ - t->bt_cursor.pg.pgno = pgno; - t->bt_cursor.pg.index = index; - F_SET(&t->bt_cursor, CURS_INIT); -} diff --git a/btree/bt_split.c b/btree/bt_split.c index 1646d82..fcf9aab 100644 --- a/btree/bt_split.c +++ b/btree/bt_split.c @@ -1,9 +1,15 @@ /*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. + * See the file LICENSE for redistribution information. * - * This code is derived from software contributed to Berkeley by - * Mike Olson. + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. 
+ */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -13,11 +19,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -32,796 +34,1277 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * $Id$ */ -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_split.c 8.9 (Berkeley) 7/26/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/types.h> +#include "db_config.h" -#include <limits.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/btree.h" -#include <db.h> -#include "btree.h" - -static int bt_broot __P((BTREE *, PAGE *, PAGE *, PAGE *)); -static PAGE *bt_page - __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t)); -static int bt_preserve __P((BTREE *, pgno_t)); -static PAGE *bt_psplit - __P((BTREE *, PAGE *, PAGE *, PAGE *, indx_t *, size_t)); -static PAGE *bt_root - __P((BTREE *, PAGE *, PAGE **, PAGE **, indx_t *, size_t)); -static int bt_rroot __P((BTREE *, PAGE *, PAGE *, PAGE *)); -static recno_t rec_total __P((PAGE *)); - -#ifdef STATISTICS -u_long bt_rootsplit, bt_split, bt_sortsplit, bt_pfxsaved; -#endif +static int __bam_page __P((DBC *, EPG *, EPG *)); +static int __bam_psplit __P((DBC *, EPG *, PAGE *, PAGE *, db_indx_t *)); +static int __bam_root __P((DBC *, EPG *)); /* - * __BT_SPLIT -- Split the tree. + * __bam_split -- + * Split a page. * - * Parameters: - * t: tree - * sp: page to split - * key: key to insert - * data: data to insert - * flags: BIGKEY/BIGDATA flags - * ilen: insert length - * skip: index to leave open - * - * Returns: - * RET_ERROR, RET_SUCCESS + * PUBLIC: int __bam_split __P((DBC *, void *, db_pgno_t *)); */ int -__bt_split(t, sp, key, data, flags, ilen, argskip) - BTREE *t; - PAGE *sp; - const DBT *key, *data; - int flags; - size_t ilen; - u_int32_t argskip; +__bam_split(dbc, arg, root_pgnop) + DBC *dbc; + void *arg; + db_pgno_t *root_pgnop; { - BINTERNAL *bi; - BLEAF *bl, *tbl; - DBT a, b; - EPGNO *parent; - PAGE *h, *l, *r, *lchild, *rchild; - indx_t nxtindex; - u_int16_t skip; - u_int32_t n, nbytes, nksize; - int parentsplit; - char *dest; + BTREE_CURSOR *cp; + DB_LOCK metalock, next_lock; + enum { UP, DOWN } dir; + db_pgno_t pgno, next_pgno, root_pgno; + int exact, level, ret; - /* - * Split the page into two pages, l and r. The split routines return - * a pointer to the page into which the key should be inserted and with - * skip set to the offset which should be used. Additionally, l and r - * are pinned. 
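Both the old bt_page/bt_root routines and their __bam_page/__bam_root replacements share one contract: distribute an over-full page's entries between a left and a right page, and tell the caller where the pending insert's slot ended up. As a toy, assuming fixed-size items (the real code splits by bytes, not item count):

    /* Split nitems entries at the midpoint. Returns 0 if the pending
     * insert at insert_indx lands on the left page, 1 if on the right,
     * and stores its index within that page through adjusted_indxp. */
    static int
    split_point(int nitems, int insert_indx, int *adjusted_indxp)
    {
            int half = nitems / 2;

            if (insert_indx <= half) {
                    *adjusted_indxp = insert_indx;
                    return (0);
            }
            *adjusted_indxp = insert_indx - half;
            return (1);
    }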
- */ - skip = argskip; - h = sp->pgno == P_ROOT ? - bt_root(t, sp, &l, &r, &skip, ilen) : - bt_page(t, sp, &l, &r, &skip, ilen); - if (h == NULL) - return (RET_ERROR); + cp = (BTREE_CURSOR *)dbc->internal; + root_pgno = cp->root; + LOCK_INIT(next_lock); + next_pgno = PGNO_INVALID; /* - * Insert the new key/data pair into the leaf page. (Key inserts - * always cause a leaf page to split first.) + * First get a lock on the metadata page; we will have to allocate + * pages and cannot get a lock while we have the search tree pinned. */ - h->linp[skip] = h->upper -= ilen; - dest = (char *)h + h->upper; - if (F_ISSET(t, R_RECNO)) - WR_RLEAF(dest, data, flags) - else - WR_BLEAF(dest, key, data, flags) - /* If the root page was split, make it look right. */ - if (sp->pgno == P_ROOT && - (F_ISSET(t, R_RECNO) ? - bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) - goto err2; + pgno = PGNO_BASE_MD; + if ((ret = __db_lget(dbc, + 0, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; /* - * Now we walk the parent page stack -- a LIFO stack of the pages that - * were traversed when we searched for the page that split. Each stack - * entry is a page number and a page index offset. The offset is for - * the page traversed on the search. We've just split a page, so we - * have to insert a new key into the parent page. + * The locking protocol we use to avoid deadlock is to acquire locks + * while walking down the tree, but we do it as lazily as possible, + * locking the root only as a last resort. We expect all stack pages to have + * been discarded before we're called; we discard all short-term locks. * - * If the insert into the parent page causes it to split, may have to - * continue splitting all the way up the tree. We stop if the root - * splits or the page inserted into didn't have to split to hold the - * new key. Some algorithms replace the key for the old page as well - * as the new page. We don't, as there's no reason to believe that the - * first key on the old page is any better than the key we have, and, - * in the case of a key being placed at index 0 causing the split, the - * key is unavailable. + * When __bam_split is first called, we know that a leaf page was too + * full for an insert. We don't know what leaf page it was, but we + * have the key/recno that caused the problem. We call XX_search to + * reacquire the leaf page, but this time get both the leaf page and + * its parent, locked. We then split the leaf page and see if the new + * internal key will fit into the parent page. If it will, we're done. + * + * If it won't, we discard our current locks and repeat the process, + * only this time acquiring the parent page and its parent, locked. + * This process repeats until we succeed in the split, splitting the + * root page as the final resort. The entire process then repeats, + * as necessary, until we split a leaf page. * - * There are a maximum of 5 pages pinned at any time. We keep the left - * and right pages pinned while working on the parent. The 5 are the - * two children, left parent and right parent (when the parent splits) - * and the root page or the overflow key page when calling bt_preserve. - * This code must make sure that all pins are released other than the - * root page or overflow page which is unlocked elsewhere. + * XXX + * A traditional method of speeding this up is to maintain a stack of + the pages traversed in the original search.
You can detect if the + * stack is correct by storing the page's LSN when it was searched and + * comparing that LSN with the current one when it's locked during the + * split. This would be an easy change for this code, but I have no + * numbers that indicate it's worthwhile. */ - while ((parent = BT_POP(t)) != NULL) { - lchild = l; - rchild = r; - - /* Get the parent page. */ - if ((h = mpool_get(t->bt_mp, parent->pgno, 0)) == NULL) - goto err2; - - /* - * The new key goes ONE AFTER the index, because the split - * was to the right. + for (dir = UP, level = LEAFLEVEL;; dir == UP ? ++level : --level) { + /* + * Acquire a page and its parent, locked. */ - skip = parent->index + 1; +retry: if ((ret = (dbc->dbtype == DB_BTREE ? + __bam_search(dbc, PGNO_INVALID, + arg, SR_WRPAIR, level, NULL, &exact) : + __bam_rsearch(dbc, + (db_recno_t *)arg, SR_WRPAIR, level, &exact))) != 0) + break; + + if (cp->csp[0].page->pgno == root_pgno) { + /* we can overshoot the top of the tree. */ + level = cp->csp[0].page->level; + if (root_pgnop != NULL) + *root_pgnop = root_pgno; + } else if (root_pgnop != NULL) + *root_pgnop = cp->csp[-1].page->pgno; /* - * Calculate the space needed on the parent page. - * - * Prefix trees: space hack when inserting into BINTERNAL - * pages. Retain only what's needed to distinguish between - * the new entry and the LAST entry on the page to its left. - * If the keys compare equal, retain the entire key. Note, - * we don't touch overflow keys, and the entire key must be - * retained for the next-to-left most key on the leftmost - * page of each level, or the search will fail. Applicable - * ONLY to internal pages that have leaf pages as children. - * Further reduction of the key between pairs of internal - * pages loses too much information. + * Split the page if it still needs it (it's possible another + * thread of control has already split the page). If we are + * guaranteed that two items will fit on the page, the split + * is no longer necessary. */ - switch (rchild->flags & P_TYPE) { - case P_BINTERNAL: - bi = GETBINTERNAL(rchild, 0); - nbytes = NBINTERNAL(bi->ksize); - break; - case P_BLEAF: - bl = GETBLEAF(rchild, 0); - nbytes = NBINTERNAL(bl->ksize); - if (t->bt_pfx && !(bl->flags & P_BIGKEY) && - (h->prevpg != P_INVALID || skip > 1)) { - tbl = GETBLEAF(lchild, NEXTINDEX(lchild) - 1); - a.size = tbl->ksize; - a.data = tbl->bytes; - b.size = bl->ksize; - b.data = bl->bytes; - nksize = t->bt_pfx(&a, &b); - n = NBINTERNAL(nksize); - if (n < nbytes) { -#ifdef STATISTICS - bt_pfxsaved += nbytes - n; -#endif - nbytes = n; - } else - nksize = 0; - } else - nksize = 0; - break; - case P_RINTERNAL: - case P_RLEAF: - nbytes = NRINTERNAL; - break; - default: - abort(); + if (2 * B_MAXSIZEONPAGE(cp->ovflsize) + <= (db_indx_t)P_FREESPACE(dbc->dbp, cp->csp[0].page)) { + if ((ret = __bam_stkrel(dbc, STK_NOLOCK)) != 0) + goto err; + goto no_split; } - /* Split the parent page if necessary or shift the indices. */ - if (h->upper - h->lower < nbytes + sizeof(indx_t)) { - sp = h; - h = h->pgno == P_ROOT ? - bt_root(t, h, &l, &r, &skip, nbytes) : - bt_page(t, h, &l, &r, &skip, nbytes); - if (h == NULL) - goto err1; - parentsplit = 1; - } else { - if (skip < (nxtindex = NEXTINDEX(h))) - memmove(h->linp + skip + 1, h->linp + skip, - (nxtindex - skip) * sizeof(indx_t)); - h->lower += sizeof(indx_t); - parentsplit = 0; + /* + * We need to try to lock the next page so we can update + * its PREV. 
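The speed-up the XXX comment above suggests amounts to remembering (page number, LSN) pairs from the first descent and trusting a remembered level only while its LSN is unchanged. Roughly, with hypothetical types (a real implementation would read the LSN out of the page header while the page is latched):

    #include <stdint.h>

    struct saved_page {
            uint32_t pgno;  /* Page visited by the original search. */
            uint64_t lsn;   /* Its LSN at the time of that search. */
    };

    /* Return the deepest stack level whose page is provably unchanged,
     * or -1 if even the root entry is stale and a full search is needed. */
    static int
    deepest_valid(const struct saved_page *stk,
        const uint64_t *current_lsns, int n)
    {
            int i;

            for (i = 0; i < n; ++i)
                    if (stk[i].lsn != current_lsns[i])
                            return (i - 1);
            return (n - 1);
    }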
+ */ + if (dbc->dbtype == DB_BTREE && ISLEAF(cp->csp->page) && + (pgno = NEXT_PGNO(cp->csp->page)) != PGNO_INVALID) { + TRY_LOCK(dbc, pgno, + next_pgno, next_lock, DB_LOCK_WRITE, retry); + if (ret != 0) + goto err; } - - /* Insert the key into the parent page. */ - switch (rchild->flags & P_TYPE) { - case P_BINTERNAL: - h->linp[skip] = h->upper -= nbytes; - dest = (char *)h + h->linp[skip]; - memmove(dest, bi, nbytes); - ((BINTERNAL *)dest)->pgno = rchild->pgno; - break; - case P_BLEAF: - h->linp[skip] = h->upper -= nbytes; - dest = (char *)h + h->linp[skip]; - WR_BINTERNAL(dest, nksize ? nksize : bl->ksize, - rchild->pgno, bl->flags & P_BIGKEY); - memmove(dest, bl->bytes, nksize ? nksize : bl->ksize); - if (bl->flags & P_BIGKEY && - bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) - goto err1; - break; - case P_RINTERNAL: - /* - * Update the left page count. If split - * added at index 0, fix the correct page. - */ - if (skip > 0) - dest = (char *)h + h->linp[skip - 1]; - else - dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; - ((RINTERNAL *)dest)->nrecs = rec_total(lchild); - ((RINTERNAL *)dest)->pgno = lchild->pgno; - - /* Update the right page count. */ - h->linp[skip] = h->upper -= nbytes; - dest = (char *)h + h->linp[skip]; - ((RINTERNAL *)dest)->nrecs = rec_total(rchild); - ((RINTERNAL *)dest)->pgno = rchild->pgno; + ret = cp->csp[0].page->pgno == root_pgno ? + __bam_root(dbc, &cp->csp[0]) : + __bam_page(dbc, &cp->csp[-1], &cp->csp[0]); + BT_STK_CLR(cp); + + switch (ret) { + case 0: +no_split: /* Once we've split the leaf page, we're done. */ + if (level == LEAFLEVEL) + goto done; + + /* Switch directions. */ + if (dir == UP) + dir = DOWN; break; - case P_RLEAF: + case DB_NEEDSPLIT: /* - * Update the left page count. If split - * added at index 0, fix the correct page. + * It's possible to fail to split repeatedly, as other + * threads may be modifying the tree, or the page usage + * is sufficiently bad that we don't get enough space + * the first time. */ - if (skip > 0) - dest = (char *)h + h->linp[skip - 1]; - else - dest = (char *)l + l->linp[NEXTINDEX(l) - 1]; - ((RINTERNAL *)dest)->nrecs = NEXTINDEX(lchild); - ((RINTERNAL *)dest)->pgno = lchild->pgno; - - /* Update the right page count. */ - h->linp[skip] = h->upper -= nbytes; - dest = (char *)h + h->linp[skip]; - ((RINTERNAL *)dest)->nrecs = NEXTINDEX(rchild); - ((RINTERNAL *)dest)->pgno = rchild->pgno; + if (dir == DOWN) + dir = UP; break; default: - abort(); + goto err; } + } - /* Unpin the held pages. */ - if (!parentsplit) { - mpool_put(t->bt_mp, h, MPOOL_DIRTY); - break; - } +err: if (root_pgnop != NULL) + *root_pgnop = cp->root; +done: (void)__LPUT(dbc, metalock); + (void)__TLPUT(dbc, next_lock); + return (ret); +} - /* If the root page was split, make it look right. */ - if (sp->pgno == P_ROOT && - (F_ISSET(t, R_RECNO) ? - bt_rroot(t, sp, l, r) : bt_broot(t, sp, l, r)) == RET_ERROR) - goto err1; +/* + * __bam_root -- + * Split the root page of a btree. + */ +static int +__bam_root(dbc, cp) + DBC *dbc; + EPG *cp; +{ + DB *dbp; + DBT log_dbt, rootent[2]; + DB_LOCK llock, rlock; + DB_LSN log_lsn; + DB_MPOOLFILE *mpf; + PAGE *lp, *rp; + db_indx_t split; + u_int32_t opflags; + int ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + lp = rp = NULL; + LOCK_INIT(llock); + LOCK_INIT(rlock); + COMPQUIET(log_dbt.data, NULL); + + /* Yeah, right. 
*/ + if (cp->page->level >= MAXBTREELEVEL) { + __db_errx(dbp->env, + "Too many btree levels: %d", cp->page->level); + return (ENOSPC); + } - mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); - mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); + if ((ret = __memp_dirty(mpf, + &cp->page, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err; + + /* Create new left and right pages for the split. */ + if ((ret = __db_new(dbc, TYPE(cp->page), &llock, &lp)) != 0 || + (ret = __db_new(dbc, TYPE(cp->page), &rlock, &rp)) != 0) + goto err; + P_INIT(lp, dbp->pgsize, lp->pgno, + PGNO_INVALID, ISINTERNAL(cp->page) ? PGNO_INVALID : rp->pgno, + cp->page->level, TYPE(cp->page)); + P_INIT(rp, dbp->pgsize, rp->pgno, + ISINTERNAL(cp->page) ? PGNO_INVALID : lp->pgno, PGNO_INVALID, + cp->page->level, TYPE(cp->page)); + + /* Split the page. */ + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) + goto err; + + if (DBC_LOGGING(dbc)) { + memset(&log_dbt, 0, sizeof(log_dbt)); + if ((ret = + __os_malloc(dbp->env, dbp->pgsize, &log_dbt.data)) != 0) + goto err; + log_dbt.size = dbp->pgsize; + memcpy(log_dbt.data, cp->page, dbp->pgsize); } - /* Unpin the held pages. */ - mpool_put(t->bt_mp, l, MPOOL_DIRTY); - mpool_put(t->bt_mp, r, MPOOL_DIRTY); + /* Clean up the new root page. */ + if ((ret = (dbc->dbtype == DB_RECNO ? + __ram_root(dbc, cp->page, lp, rp) : + __bam_broot(dbc, cp->page, split, lp, rp))) != 0) { + if (DBC_LOGGING(dbc)) + __os_free(dbp->env, log_dbt.data); + goto err; + } - /* Clear any pages left on the stack. */ - return (RET_SUCCESS); + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + memset(rootent, 0, sizeof(rootent)); + rootent[0].data = GET_BINTERNAL(dbp, cp->page, 0); + rootent[1].data = GET_BINTERNAL(dbp, cp->page, 1); + if (dbc->dbtype == DB_RECNO) + rootent[0].size = rootent[1].size = RINTERNAL_SIZE; + else { + rootent[0].size = BINTERNAL_SIZE( + ((BINTERNAL *)rootent[0].data)->len); + rootent[1].size = BINTERNAL_SIZE( + ((BINTERNAL *)rootent[1].data)->len); + } + ZERO_LSN(log_lsn); + opflags = F_ISSET( + (BTREE_CURSOR *)dbc->internal, C_RECNUM) ? SPL_NRECS : 0; + if (dbc->dbtype == DB_RECNO) + opflags |= SPL_RECNO; + ret = __bam_split_log(dbp, + dbc->txn, &LSN(cp->page), 0, PGNO(lp), &LSN(lp), PGNO(rp), + &LSN(rp), (u_int32_t)NUM_ENT(lp), PGNO_INVALID, &log_lsn, + dbc->internal->root, &LSN(cp->page), 0, + &log_dbt, &rootent[0], &rootent[1], opflags); + + __os_free(dbp->env, log_dbt.data); + + if (ret != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(cp->page)); + LSN(lp) = LSN(cp->page); + LSN(rp) = LSN(cp->page); + + /* Adjust any cursors. */ + ret = __bam_ca_split(dbc, cp->page->pgno, lp->pgno, rp->pgno, split, 1); + + /* Success or error: release pages and locks. */ +err: if (cp->page != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, cp->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + cp->page = NULL; /* - * If something fails in the above loop we were already walking back - * up the tree and the tree is now inconsistent. Nothing much we can - * do about it but release any memory we're holding. + * We are done. Put or downgrade all our locks and release + * the pages. 
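One property worth noticing in __bam_root: the root keeps its page number and is rewritten in place over two freshly allocated children, so references to the root from above never need fixing. A toy, two-child version of that idea (hypothetical in-memory node type, at least one key assumed present):

    #include <stdlib.h>

    struct tnode {
            int n, keys[8];
            struct tnode *left, *right; /* NULL for a leaf. */
    };

    /* Split a full node in place: its identity (here, its address) stays
     * the same, so anything pointing at the root stays valid. */
    static int
    split_root(struct tnode *root)
    {
            struct tnode *l, *r;
            int half = root->n / 2, i;

            if ((l = calloc(1, sizeof(*l))) == NULL)
                    return (-1);
            if ((r = calloc(1, sizeof(*r))) == NULL) {
                    free(l);
                    return (-1);
            }
            for (i = 0; i < half; ++i)
                    l->keys[l->n++] = root->keys[i];
            for (i = half; i < root->n; ++i)
                    r->keys[r->n++] = root->keys[i];
            root->n = 0;                            /* Becomes internal... */
            root->left = l;
            root->right = r;
            root->keys[root->n++] = r->keys[0];     /* ...with one divider. */
            return (0);
    }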
*/ -err1: mpool_put(t->bt_mp, lchild, MPOOL_DIRTY); - mpool_put(t->bt_mp, rchild, MPOOL_DIRTY); - -err2: mpool_put(t->bt_mp, l, 0); - mpool_put(t->bt_mp, r, 0); - __dbpanic(t->bt_dbp); - return (RET_ERROR); + if ((t_ret = __TLPUT(dbc, llock)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, rlock)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, cp->lock)) != 0 && ret == 0) + ret = t_ret; + if (lp != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, lp, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if (rp != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, rp, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + return (ret); } /* - * BT_PAGE -- Split a non-root page of a btree. - * - * Parameters: - * t: tree - * h: root page - * lp: pointer to left page pointer - * rp: pointer to right page pointer - * skip: pointer to index to leave open - * ilen: insert length - * - * Returns: - * Pointer to page in which to insert or NULL on error. + * __bam_page -- + * Split the non-root page of a btree. */ -static PAGE * -bt_page(t, h, lp, rp, skip, ilen) - BTREE *t; - PAGE *h, **lp, **rp; - indx_t *skip; - size_t ilen; +static int +__bam_page(dbc, pp, cp) + DBC *dbc; + EPG *pp, *cp; { - PAGE *l, *r, *tp; - pgno_t npg; - -#ifdef STATISTICS - ++bt_split; -#endif - /* Put the new right page for the split into place. */ - if ((r = __bt_new(t, &npg)) == NULL) - return (NULL); - r->pgno = npg; - r->lower = BTDATAOFF; - r->upper = t->bt_psize; - r->nextpg = h->nextpg; - r->prevpg = h->pgno; - r->flags = h->flags & P_TYPE; + BTREE_CURSOR *bc; + DB *dbp; + DBT log_dbt, rentry; + DB_LOCK rplock; + DB_LSN log_lsn; + DB_LSN save_lsn; + DB_MPOOLFILE *mpf; + PAGE *lp, *rp, *alloc_rp, *tp; + db_indx_t split; + u_int32_t opflags; + int ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + alloc_rp = lp = rp = tp = NULL; + LOCK_INIT(rplock); + ret = -1; /* - * If we're splitting the last page on a level because we're appending - * a key to it (skip is NEXTINDEX()), it's likely that the data is - * sorted. Adding an empty page on the side of the level is less work - * and can push the fill factor much higher than normal. If we're - * wrong it's no big deal, we'll just do the split the right way next - * time. It may look like it's equally easy to do a similar hack for - * reverse sorted data, that is, split the tree left, but it's not. - * Don't even try. + * Create new left page for the split, and fill in everything + * except its LSN and next-page page number. + * + * Create a new right page for the split, and fill in everything + * except its LSN and page number. + * + * We malloc space for both the left and right pages, so we don't get + * a new page from the underlying buffer pool until we know the split + * is going to succeed. The reason is that we can't release locks + * acquired during the get-a-new-page process because metadata page + * locks can't be discarded on failure since we may have modified the + * free list. So, if you assume that we're holding a write lock on the + * leaf page which ran out of space and started this split (e.g., we + * have already written records to the page, or we retrieved a record + * from it with the DB_RMW flag set), failing in a split with both a + * leaf page locked and the metadata page locked can potentially lock + * up the tree badly, because we've violated the rule of always locking + * down the tree, and never up. 
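The malloc-then-swap strategy that comment describes reduces to a few lines: build the new page image in private memory, and only copy it over the pinned buffer once nothing can fail. A minimal sketch, assuming a hypothetical page type and build callback:

    #include <stdlib.h>
    #include <string.h>

    struct pg { char bytes[4096]; };

    /* Build the new version of a pinned page in malloc'd scratch space;
     * commit by copying only if the build succeeds, so a failure leaves
     * the pinned page untouched and no buffer-pool resources consumed. */
    static int
    staged_update(struct pg *pinned, int (*build)(struct pg *))
    {
            struct pg *scratch;
            int ret;

            if ((scratch = malloc(sizeof(*scratch))) == NULL)
                    return (-1);
            memcpy(scratch, pinned, sizeof(*scratch));
            if ((ret = build(scratch)) == 0)
                    memcpy(pinned, scratch, sizeof(*pinned));
            free(scratch);
            return (ret);
    }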
*/ - if (h->nextpg == P_INVALID && *skip == NEXTINDEX(h)) { -#ifdef STATISTICS - ++bt_sortsplit; -#endif - h->nextpg = r->pgno; - r->lower = BTDATAOFF + sizeof(indx_t); - *skip = 0; - *lp = h; - *rp = r; - return (r); - } + if ((ret = __os_malloc(dbp->env, dbp->pgsize * 2, &lp)) != 0) + goto err; + P_INIT(lp, dbp->pgsize, PGNO(cp->page), + ISINTERNAL(cp->page) ? PGNO_INVALID : PREV_PGNO(cp->page), + ISINTERNAL(cp->page) ? PGNO_INVALID : 0, + cp->page->level, TYPE(cp->page)); + + rp = (PAGE *)((u_int8_t *)lp + dbp->pgsize); + P_INIT(rp, dbp->pgsize, 0, + ISINTERNAL(cp->page) ? PGNO_INVALID : PGNO(cp->page), + ISINTERNAL(cp->page) ? PGNO_INVALID : NEXT_PGNO(cp->page), + cp->page->level, TYPE(cp->page)); - /* Put the new left page for the split into place. */ - if ((l = (PAGE *)malloc(t->bt_psize)) == NULL) { - mpool_put(t->bt_mp, r, 0); - return (NULL); - } -#ifdef PURIFY - memset(l, 0xff, t->bt_psize); -#endif - l->pgno = h->pgno; - l->nextpg = r->pgno; - l->prevpg = h->prevpg; - l->lower = BTDATAOFF; - l->upper = t->bt_psize; - l->flags = h->flags & P_TYPE; - - /* Fix up the previous pointer of the page after the split page. */ - if (h->nextpg != P_INVALID) { - if ((tp = mpool_get(t->bt_mp, h->nextpg, 0)) == NULL) { - free(l); - /* XXX mpool_free(t->bt_mp, r->pgno); */ - return (NULL); + /* + * Split right. + * + * Only the indices are sorted on the page, i.e., the key/data pairs + * aren't, so it's simpler to copy the data from the split page onto + * two new pages instead of copying half the data to a new right page + * and compacting the left page in place. Since the left page can't + * change, we swap the original and the allocated left page after the + * split. + */ + if ((ret = __bam_psplit(dbc, cp, lp, rp, &split)) != 0) + goto err; + + /* + * Test to see if we are going to be able to insert the new pages into + * the parent page. The interesting failure here is that the parent + * page can't hold the new keys, and has to be split in turn, in which + * case we want to release all the locks we can. + */ + if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_SPACEONLY)) != 0) + goto err; + + /* + * We've got everything locked down we need, and we know the split + * is going to succeed. Go and get the additional page we'll need. + */ + if ((ret = __db_new(dbc, TYPE(cp->page), &rplock, &alloc_rp)) != 0) + goto err; + + /* + * Prepare to fix up the previous pointer of any leaf page following + * the split page. Our caller has already write locked the page so + * we can get it without deadlocking on the parent latch. + */ + if (ISLEAF(cp->page) && NEXT_PGNO(cp->page) != PGNO_INVALID && + (ret = __memp_fget(mpf, &NEXT_PGNO(cp->page), + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &tp)) != 0) + goto err; + + /* + * Fix up the page numbers we didn't have before. We have to do this + * before calling __bam_pinsert because it may copy a page number onto + * the parent page and it takes the page number from its page argument. + */ + PGNO(rp) = NEXT_PGNO(lp) = PGNO(alloc_rp); + + DB_ASSERT(dbp->env, IS_DIRTY(cp->page)); + DB_ASSERT(dbp->env, IS_DIRTY(pp->page)); + + /* Actually update the parent page. */ + if ((ret = __bam_pinsert(dbc, pp, split, lp, rp, BPI_NOLOGGING)) != 0) + goto err; + + bc = (BTREE_CURSOR *)dbc->internal; + /* Log the change. 
*/ + if (DBC_LOGGING(dbc)) { + memset(&log_dbt, 0, sizeof(log_dbt)); + log_dbt.data = cp->page; + log_dbt.size = dbp->pgsize; + memset(&rentry, 0, sizeof(rentry)); + rentry.data = GET_BINTERNAL(dbp, pp->page, pp->indx + 1); + opflags = F_ISSET(bc, C_RECNUM) ? SPL_NRECS : 0; + if (dbc->dbtype == DB_RECNO) { + opflags |= SPL_RECNO; + rentry.size = RINTERNAL_SIZE; + } else + rentry.size = + BINTERNAL_SIZE(((BINTERNAL *)rentry.data)->len); + if (tp == NULL) + ZERO_LSN(log_lsn); + if ((ret = __bam_split_log(dbp, dbc->txn, &LSN(cp->page), 0, + PGNO(cp->page), &LSN(cp->page), PGNO(alloc_rp), + &LSN(alloc_rp), (u_int32_t)NUM_ENT(lp), + tp == NULL ? 0 : PGNO(tp), tp == NULL ? &log_lsn : &LSN(tp), + PGNO(pp->page), &LSN(pp->page), pp->indx, + &log_dbt, NULL, &rentry, opflags)) != 0) { + /* + * Undo the update to the parent page, which has not + * been logged yet. This must succeed. + */ + t_ret = __db_ditem_nolog(dbc, pp->page, + pp->indx + 1, rentry.size); + DB_ASSERT(dbp->env, t_ret == 0); + + goto err; + } + + } else + LSN_NOT_LOGGED(LSN(cp->page)); + + /* Update the LSNs for all involved pages. */ + LSN(alloc_rp) = LSN(cp->page); + LSN(lp) = LSN(cp->page); + LSN(rp) = LSN(cp->page); + LSN(pp->page) = LSN(cp->page); + if (tp != NULL) { + /* The log record has been written; now it is safe to update the next page. */ + PREV_PGNO(tp) = PGNO(rp); + LSN(tp) = LSN(cp->page); + } /* + * Copy the left and right pages into place. There are two paths + * through here: either we are logging, in which case we set the LSNs + * in the logging path, or we are not logging, in which case we do not + * have valid LSNs on lp or rp. The correct LSNs to use are the + * ones on the page we got from __db_new or the one that was + * originally on cp->page. In both cases, we save the LSN from the + * real database page (not a malloc'd one) and reapply it after we + * do the copy. */ - tp = bt_psplit(t, h, l, r, skip, ilen); + save_lsn = alloc_rp->lsn; + memcpy(alloc_rp, rp, LOFFSET(dbp, rp)); + memcpy((u_int8_t *)alloc_rp + HOFFSET(rp), + (u_int8_t *)rp + HOFFSET(rp), dbp->pgsize - HOFFSET(rp)); + alloc_rp->lsn = save_lsn; - /* Move the new left page onto the old left page. */ - memmove(h, l, t->bt_psize); - if (tp == l) - tp = h; - free(l); + save_lsn = cp->page->lsn; + memcpy(cp->page, lp, LOFFSET(dbp, lp)); + memcpy((u_int8_t *)cp->page + HOFFSET(lp), + (u_int8_t *)lp + HOFFSET(lp), dbp->pgsize - HOFFSET(lp)); + cp->page->lsn = save_lsn; - *lp = h; - *rp = r; - return (tp); -} + /* Adjust any cursors. */ + if ((ret = __bam_ca_split(dbc, + PGNO(cp->page), PGNO(cp->page), PGNO(rp), split, 0)) != 0) + goto err; -/* - * BT_ROOT -- Split the root page of a btree. - * - * Parameters: - * t: tree - * h: root page - * lp: pointer to left page pointer - * rp: pointer to right page pointer - * skip: pointer to index to leave open - * ilen: insert length - * - * Returns: - * Pointer to page in which to insert or NULL on error.
- */ -static PAGE * -bt_root(t, h, lp, rp, skip, ilen) - BTREE *t; - PAGE *h, **lp, **rp; - indx_t *skip; - size_t ilen; -{ - PAGE *l, *r, *tp; - pgno_t lnpg, rnpg; - -#ifdef STATISTICS - ++bt_split; - ++bt_rootsplit; -#endif - /* Put the new left and right pages for the split into place. */ - if ((l = __bt_new(t, &lnpg)) == NULL || - (r = __bt_new(t, &rnpg)) == NULL) - return (NULL); - l->pgno = lnpg; - r->pgno = rnpg; - l->nextpg = r->pgno; - r->prevpg = l->pgno; - l->prevpg = r->nextpg = P_INVALID; - l->lower = r->lower = BTDATAOFF; - l->upper = r->upper = t->bt_psize; - l->flags = r->flags = h->flags & P_TYPE; - - /* Split the root page. */ - tp = bt_psplit(t, h, l, r, skip, ilen); - - *lp = l; - *rp = r; - return (tp); -} + __os_free(dbp->env, lp); -/* - * BT_RROOT -- Fix up the recno root page after it has been split. - * - * Parameters: - * t: tree - * h: root page - * l: left page - * r: right page - * - * Returns: - * RET_ERROR, RET_SUCCESS - */ -static int -bt_rroot(t, h, l, r) - BTREE *t; - PAGE *h, *l, *r; -{ - char *dest; + /* + * Success -- write the real pages back to the store. + */ + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, alloc_rp, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, rplock)) != 0 && ret == 0) + ret = t_ret; + if (tp != NULL) { + if ((t_ret = __memp_fput(mpf, + dbc->thread_info, tp, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + } + if ((t_ret = __bam_stkrel(dbc, STK_CLRDBC)) != 0 && ret == 0) + ret = t_ret; + return (ret); + +err: if (lp != NULL) + __os_free(dbp->env, lp); + if (alloc_rp != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, alloc_rp, dbc->priority); + if (tp != NULL) + (void)__memp_fput(mpf, dbc->thread_info, tp, dbc->priority); + + if (pp->page != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, pp->page, dbc->priority); + + if (ret == DB_NEEDSPLIT) + (void)__LPUT(dbc, pp->lock); + else + (void)__TLPUT(dbc, pp->lock); - /* Insert the left and right keys, set the header information. */ - h->linp[0] = h->upper = t->bt_psize - NRINTERNAL; - dest = (char *)h + h->upper; - WR_RINTERNAL(dest, - l->flags & P_RLEAF ? NEXTINDEX(l) : rec_total(l), l->pgno); + (void)__memp_fput(mpf, dbc->thread_info, cp->page, dbc->priority); - h->linp[1] = h->upper -= NRINTERNAL; - dest = (char *)h + h->upper; - WR_RINTERNAL(dest, - r->flags & P_RLEAF ? NEXTINDEX(r) : rec_total(r), r->pgno); + /* + * We don't drop the left and right page locks. If we are doing dirty + * reads then we need to hold the locks until we abort the transaction. + * If we are not transactional, we are hosed anyway as the tree + * is trashed. It may be better not to leak the locks. + */ - h->lower = BTDATAOFF + 2 * sizeof(indx_t); + if (dbc->txn == NULL) + (void)__LPUT(dbc, rplock); - /* Unpin the root page, set to recno internal page. */ - h->flags &= ~P_TYPE; - h->flags |= P_RINTERNAL; - mpool_put(t->bt_mp, h, MPOOL_DIRTY); + if (dbc->txn == NULL || ret == DB_NEEDSPLIT) + (void)__LPUT(dbc, cp->lock); - return (RET_SUCCESS); + return (ret); } /* - * BT_BROOT -- Fix up the btree root page after it has been split. - * - * Parameters: - * t: tree - * h: root page - * l: left page - * r: right page - * - * Returns: - * RET_ERROR, RET_SUCCESS + * __bam_broot -- + * Fix up the btree root page after it has been split.
+ * PUBLIC: int __bam_broot __P((DBC *, PAGE *, u_int32_t, PAGE *, PAGE *)); */ -static int -bt_broot(t, h, l, r) - BTREE *t; - PAGE *h, *l, *r; +int +__bam_broot(dbc, rootp, split, lp, rp) + DBC *dbc; + u_int32_t split; + PAGE *rootp, *lp, *rp; { - BINTERNAL *bi; - BLEAF *bl; - u_int32_t nbytes; - char *dest; - + BINTERNAL bi, bi0, *child_bi; + BKEYDATA *child_bk; + BOVERFLOW bo, *child_bo; + BTREE_CURSOR *cp; + DB *dbp; + DBT hdr, hdr0, data; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + child_bo = NULL; + data.data = NULL; + memset(&bi, 0, sizeof(bi)); + + switch (TYPE(rootp)) { + case P_IBTREE: + /* Copy the first key of the child page onto the root page. */ + child_bi = GET_BINTERNAL(dbp, rootp, split); + switch (B_TYPE(child_bi->type)) { + case B_KEYDATA: + bi.len = child_bi->len; + B_TSET(bi.type, B_KEYDATA); + bi.pgno = rp->pgno; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + if ((ret = __os_malloc(dbp->env, + child_bi->len, &data.data)) != 0) + return (ret); + memcpy(data.data, child_bi->data, child_bi->len); + data.size = child_bi->len; + break; + case B_OVERFLOW: + /* Reuse the overflow key. */ + child_bo = (BOVERFLOW *)child_bi->data; + memset(&bo, 0, sizeof(bo)); + bo.type = B_OVERFLOW; + bo.tlen = child_bo->tlen; + bo.pgno = child_bo->pgno; + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, B_OVERFLOW); + bi.pgno = rp->pgno; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); + break; + case B_DUPLICATE: + default: + goto pgfmt; + } + break; + case P_LDUP: + case P_LBTREE: + /* Copy the first key of the child page onto the root page. */ + child_bk = GET_BKEYDATA(dbp, rootp, split); + switch (B_TYPE(child_bk->type)) { + case B_KEYDATA: + bi.len = child_bk->len; + B_TSET(bi.type, B_KEYDATA); + bi.pgno = rp->pgno; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + if ((ret = __os_malloc(dbp->env, + child_bk->len, &data.data)) != 0) + return (ret); + memcpy(data.data, child_bk->data, child_bk->len); + data.size = child_bk->len; + break; + case B_OVERFLOW: + /* Copy the overflow key. */ + child_bo = (BOVERFLOW *)child_bk; + memset(&bo, 0, sizeof(bo)); + bo.type = B_OVERFLOW; + bo.tlen = child_bo->tlen; + memset(&hdr, 0, sizeof(hdr)); + if ((ret = __db_goff(dbc, &hdr, child_bo->tlen, + child_bo->pgno, &hdr.data, &hdr.size)) == 0) + ret = __db_poff(dbc, &hdr, &bo.pgno); + + if (hdr.data != NULL) + __os_free(dbp->env, hdr.data); + if (ret != 0) + return (ret); + + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, B_OVERFLOW); + bi.pgno = rp->pgno; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); + break; + case B_DUPLICATE: + default: + goto pgfmt; + } + break; + default: +pgfmt: return (__db_pgfmt(dbp->env, rp->pgno)); + } /* * If the root page was a leaf page, change it into an internal page. * We copy the key we split on (but not the key's data, in the case of * a leaf page) to the new root page. - * - * The btree comparison code guarantees that the left-most key on any - * level of the tree is never used, so it doesn't need to be filled in. 
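The reason the leftmost key can be left unfilled: internal-page search never compares against entry 0, because any key that compares less than entry 1's key must belong to the leftmost child anyway. With integer keys (a hypothetical helper, not the library's comparison code):

    /* Pick the child to descend into; keys[0] is a placeholder and is
     * never examined, exactly as on a btree internal page. */
    static int
    pick_child(const int *keys, int nkeys, int target)
    {
            int i;

            for (i = 1; i < nkeys; ++i)
                    if (target < keys[i])
                            break;
            return (i - 1);         /* Child index. */
    }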
*/ - nbytes = NBINTERNAL(0); - h->linp[0] = h->upper = t->bt_psize - nbytes; - dest = (char *)h + h->upper; - WR_BINTERNAL(dest, 0, l->pgno, 0); - - switch (h->flags & P_TYPE) { - case P_BLEAF: - bl = GETBLEAF(r, 0); - nbytes = NBINTERNAL(bl->ksize); - h->linp[1] = h->upper -= nbytes; - dest = (char *)h + h->upper; - WR_BINTERNAL(dest, bl->ksize, r->pgno, 0); - memmove(dest, bl->bytes, bl->ksize); + root_pgno = cp->root; + P_INIT(rootp, dbp->pgsize, + root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IBTREE); - /* - * If the key is on an overflow page, mark the overflow chain - * so it isn't deleted when the leaf copy of the key is deleted. - */ - if (bl->flags & P_BIGKEY && - bt_preserve(t, *(pgno_t *)bl->bytes) == RET_ERROR) - return (RET_ERROR); - break; - case P_BINTERNAL: - bi = GETBINTERNAL(r, 0); - nbytes = NBINTERNAL(bi->ksize); - h->linp[1] = h->upper -= nbytes; - dest = (char *)h + h->upper; - memmove(dest, bi, nbytes); - ((BINTERNAL *)dest)->pgno = r->pgno; - break; - default: - abort(); + /* + * The btree comparison code guarantees that the left-most key on any + * internal btree page is never used, so it doesn't need to be filled + * in. Set the record count if necessary. + */ + memset(&bi0, 0, sizeof(bi0)); + B_TSET(bi0.type, B_KEYDATA); + bi0.pgno = lp->pgno; + if (F_ISSET(cp, C_RECNUM)) { + bi0.nrecs = __bam_total(dbp, lp); + RE_NREC_SET(rootp, bi0.nrecs); + bi.nrecs = __bam_total(dbp, rp); + RE_NREC_ADJ(rootp, bi.nrecs); } + DB_SET_DBT(hdr0, &bi0, SSZA(BINTERNAL, data)); + if ((ret = __db_pitem_nolog(dbc, rootp, + 0, BINTERNAL_SIZE(0), &hdr0, NULL)) != 0) + goto err; + ret = __db_pitem_nolog(dbc, rootp, 1, + BINTERNAL_SIZE(data.size), &hdr, &data); + +err: if (data.data != NULL && child_bo == NULL) + __os_free(dbp->env, data.data); + return (ret); +} + +/* + * __ram_root -- + * Fix up the recno root page after it has been split. + * PUBLIC: int __ram_root __P((DBC *, PAGE *, PAGE *, PAGE *)); + */ +int +__ram_root(dbc, rootp, lp, rp) + DBC *dbc; + PAGE *rootp, *lp, *rp; +{ + DB *dbp; + DBT hdr; + RINTERNAL ri; + db_pgno_t root_pgno; + int ret; + + dbp = dbc->dbp; + root_pgno = dbc->internal->root; - /* There are two keys on the page. */ - h->lower = BTDATAOFF + 2 * sizeof(indx_t); + /* Initialize the page. */ + P_INIT(rootp, dbp->pgsize, + root_pgno, PGNO_INVALID, PGNO_INVALID, lp->level + 1, P_IRECNO); - /* Unpin the root page, set to btree internal page. */ - h->flags &= ~P_TYPE; - h->flags |= P_BINTERNAL; - mpool_put(t->bt_mp, h, MPOOL_DIRTY); + /* Initialize the header. */ + DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE); - return (RET_SUCCESS); + /* Insert the left and right keys, set the header information. */ + ri.pgno = lp->pgno; + ri.nrecs = __bam_total(dbp, lp); + if ((ret = __db_pitem_nolog(dbc, + rootp, 0, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_SET(rootp, ri.nrecs); + ri.pgno = rp->pgno; + ri.nrecs = __bam_total(dbp, rp); + if ((ret = __db_pitem_nolog(dbc, + rootp, 1, RINTERNAL_SIZE, &hdr, NULL)) != 0) + return (ret); + RE_NREC_ADJ(rootp, ri.nrecs); + return (0); } /* - * BT_PSPLIT -- Do the real work of splitting the page. - * - * Parameters: - * t: tree - * h: page to be split - * l: page to put lower half of data - * r: page to put upper half of data - * pskip: pointer to index to leave open - * ilen: insert length + * __bam_pinsert -- + * Insert a new key into a parent page, completing the split. * - * Returns: - * Pointer to page in which to insert. 
+ * PUBLIC: int __bam_pinsert + * PUBLIC: __P((DBC *, EPG *, u_int32_t, PAGE *, PAGE *, int)); */ -static PAGE * -bt_psplit(t, h, l, r, pskip, ilen) - BTREE *t; - PAGE *h, *l, *r; - indx_t *pskip; - size_t ilen; +int +__bam_pinsert(dbc, parent, split, lchild, rchild, flags) + DBC *dbc; + EPG *parent; + u_int32_t split; + PAGE *lchild, *rchild; + int flags; { - BINTERNAL *bi; - BLEAF *bl; - CURSOR *c; - RLEAF *rl; - PAGE *rval; - void *src; - indx_t full, half, nxt, off, skip, top, used; - u_int32_t nbytes; - int bigkeycnt, isbigkey; + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk, *tmp_bk; + BOVERFLOW bo, *child_bo; + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DBT a, b, hdr, data; + EPG *child; + PAGE *ppage; + RINTERNAL ri; + db_indx_t off; + db_recno_t nrecs; + size_t (*func) __P((DB *, const DBT *, const DBT *)); + int (*pitem) __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); + u_int32_t n, nbytes, nksize, oldsize, size; + int ret; + + dbp = dbc->dbp; + cp = (BTREE_CURSOR *)dbc->internal; + t = dbp->bt_internal; + ppage = parent->page; + child = parent + 1; + + /* If handling record numbers, count records split to the right page. */ + nrecs = F_ISSET(cp, C_RECNUM) && + !LF_ISSET(BPI_SPACEONLY) ? __bam_total(dbp, rchild) : 0; /* - * Split the data to the left and right pages. Leave the skip index - * open. Additionally, make some effort not to split on an overflow - * key. This makes internal page processing faster and can save - * space as overflow keys used by internal pages are never deleted. + * Now we insert the new page's first key into the parent page, which + * completes the split. The parent points to a PAGE and a page index + * offset, where the new key goes ONE AFTER the index, because we split + * to the right. + * + * XXX + * Some btree algorithms replace the key for the old page as well as + * the new page. We don't, as there's no reason to believe that the + * first key on the old page is any better than the key we have, and, + * in the case of a key being placed at index 0 causing the split, the + * key is unavailable. */ - bigkeycnt = 0; - skip = *pskip; - full = t->bt_psize - BTDATAOFF; - half = full / 2; - used = 0; - for (nxt = off = 0, top = NEXTINDEX(h); nxt < top; ++off) { - if (skip == off) { - nbytes = ilen; - isbigkey = 0; /* XXX: not really known. */ - } else - switch (h->flags & P_TYPE) { - case P_BINTERNAL: - src = bi = GETBINTERNAL(h, nxt); - nbytes = NBINTERNAL(bi->ksize); - isbigkey = bi->flags & P_BIGKEY; - break; - case P_BLEAF: - src = bl = GETBLEAF(h, nxt); - nbytes = NBLEAF(bl); - isbigkey = bl->flags & P_BIGKEY; - break; - case P_RINTERNAL: - src = GETRINTERNAL(h, nxt); - nbytes = NRINTERNAL; - isbigkey = 0; - break; - case P_RLEAF: - src = rl = GETRLEAF(h, nxt); - nbytes = NRLEAF(rl); - isbigkey = 0; - break; - default: - abort(); - } + off = parent->indx + O_INDX; + if (LF_ISSET(BPI_REPLACE)) + oldsize = TYPE(ppage) == P_IRECNO ? RINTERNAL_PSIZE : + BINTERNAL_PSIZE(GET_BINTERNAL(dbp, ppage, off)->len); + else + oldsize = 0; - /* - * If the key/data pairs are substantial fractions of the max - * possible size for the page, it's possible to get situations - * where we decide to try and copy too much onto the left page. - * Make sure that doesn't happen. - */ - if (skip <= off && used + nbytes >= full) { - --off; + /* + * Calculate the space needed on the parent page. + * + * Prefix trees: space hack used when inserting into BINTERNAL pages. 
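In miniature, that space hack keeps only the shortest leading portion of the new parent key that still separates it from the last key on the page to its left, and keeps equal keys whole. A sketch in the spirit of the default prefix callback (not the library routine itself):

    #include <stddef.h>

    /* Return how many leading bytes of b must be kept so that b still
     * compares greater than a; if a is a prefix of b, or the keys are
     * equal, keep all of b (no shortening is safe). */
    static size_t
    prefix_len(const unsigned char *a, size_t alen,
        const unsigned char *b, size_t blen)
    {
            size_t keep, n;

            n = alen < blen ? alen : blen;
            for (keep = 1; keep <= n; ++keep)
                    if (a[keep - 1] != b[keep - 1])
                            return (keep);
            return (blen);
    }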
+ * Retain only what's needed to distinguish between the new entry and + * the LAST entry on the page to its left. If the keys compare equal, + * retain the entire key. We ignore overflow keys, and the entire key + * must be retained for the next-to-leftmost key on the leftmost page + * of each level, or the search will fail. Applicable ONLY to internal + * pages that have leaf pages as children. Further reduction of the + * key between pairs of internal pages loses too much information. + */ + switch (TYPE(child->page)) { + case P_IBTREE: + child_bi = GET_BINTERNAL(dbp, child->page, split); + nbytes = BINTERNAL_PSIZE(child_bi->len); + + if (P_FREESPACE(dbp, ppage) + oldsize < nbytes) + return (DB_NEEDSPLIT); + if (LF_ISSET(BPI_SPACEONLY)) + return (0); + + switch (B_TYPE(child_bi->type)) { + case B_KEYDATA: + /* Add a new record for the right page. */ + memset(&bi, 0, sizeof(bi)); + bi.len = child_bi->len; + B_TSET(bi.type, B_KEYDATA); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + DB_SET_DBT(data, child_bi->data, child_bi->len); + size = BINTERNAL_SIZE(child_bi->len); break; + case B_OVERFLOW: + /* Reuse the overflow key. */ + child_bo = (BOVERFLOW *)child_bi->data; + memset(&bo, 0, sizeof(bo)); + bo.type = B_OVERFLOW; + bo.tlen = child_bo->tlen; + bo.pgno = child_bo->pgno; + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, B_OVERFLOW); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); + size = BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case B_DUPLICATE: + default: + goto pgfmt; } + break; + case P_LDUP: + case P_LBTREE: + child_bk = GET_BKEYDATA(dbp, child->page, split); + switch (B_TYPE(child_bk->type)) { + case B_KEYDATA: + nbytes = BINTERNAL_PSIZE(child_bk->len); + nksize = child_bk->len; - /* Copy the key/data pair, if not the skipped index. */ - if (skip != off) { - ++nxt; + /* + * Prefix compression: + * We set t->bt_prefix to NULL if we have a comparison + * callback but no prefix compression callback. But, + * if we're splitting in an off-page duplicates tree, + * we still have to do some checking. If using the + * default off-page duplicates comparison routine we + * can use the default prefix compression callback. If + * not using the default off-page duplicates comparison + * routine, we can't do any kind of prefix compression + * as there's no way for an application to specify a + * prefix compression callback that corresponds to its + * comparison callback. + * + * No prefix compression if we don't have a compression + * function, or the key we'd compress isn't a normal + * key (for example, it references an overflow page). + * + * Generate a parent page key for the right child page + * from a comparison of the last key on the left child + * page and the first key on the right child page. + */ + if (F_ISSET(dbc, DBC_OPD)) { + if (dbp->dup_compare == __bam_defcmp) + func = __bam_defpfx; + else + func = NULL; + } else + func = t->bt_prefix; + if (func == NULL) + goto noprefix; + tmp_bk = GET_BKEYDATA(dbp, lchild, NUM_ENT(lchild) - + (TYPE(lchild) == P_LDUP ? 
O_INDX : P_INDX)); + if (B_TYPE(tmp_bk->type) != B_KEYDATA) + goto noprefix; + DB_INIT_DBT(a, tmp_bk->data, tmp_bk->len); + DB_INIT_DBT(b, child_bk->data, child_bk->len); + nksize = (u_int32_t)func(dbp, &a, &b); + if ((n = BINTERNAL_PSIZE(nksize)) < nbytes) + nbytes = n; + else + nksize = child_bk->len; + +noprefix: if (P_FREESPACE(dbp, ppage) + oldsize < nbytes) + return (DB_NEEDSPLIT); + if (LF_ISSET(BPI_SPACEONLY)) + return (0); + + memset(&bi, 0, sizeof(bi)); + bi.len = nksize; + B_TSET(bi.type, B_KEYDATA); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + DB_SET_DBT(data, child_bk->data, nksize); + size = BINTERNAL_SIZE(nksize); + break; + case B_OVERFLOW: + nbytes = BINTERNAL_PSIZE(BOVERFLOW_SIZE); + + if (P_FREESPACE(dbp, ppage) + oldsize < nbytes) + return (DB_NEEDSPLIT); + if (LF_ISSET(BPI_SPACEONLY)) + return (0); + + /* Copy the overflow key. */ + child_bo = (BOVERFLOW *)child_bk; + memset(&bo, 0, sizeof(bo)); + bo.type = B_OVERFLOW; + bo.tlen = child_bo->tlen; + memset(&hdr, 0, sizeof(hdr)); + if ((ret = __db_goff(dbc, &hdr, child_bo->tlen, + child_bo->pgno, &hdr.data, &hdr.size)) == 0) + ret = __db_poff(dbc, &hdr, &bo.pgno); + + if (hdr.data != NULL) + __os_free(dbp->env, hdr.data); + if (ret != 0) + return (ret); + + memset(&bi, 0, sizeof(bi)); + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, B_OVERFLOW); + bi.pgno = rchild->pgno; + bi.nrecs = nrecs; + DB_SET_DBT(hdr, &bi, SSZA(BINTERNAL, data)); + DB_SET_DBT(data, &bo, BOVERFLOW_SIZE); + size = BINTERNAL_SIZE(BOVERFLOW_SIZE); - l->linp[off] = l->upper -= nbytes; - memmove((char *)l + l->upper, src, nbytes); + break; + case B_DUPLICATE: + default: + goto pgfmt; } - - used += nbytes; - if (used >= half) { - if (!isbigkey || bigkeycnt == 3) - break; - else - ++bigkeycnt; + break; + case P_IRECNO: + case P_LRECNO: + nbytes = RINTERNAL_PSIZE; + + if (P_FREESPACE(dbp, ppage) + oldsize < nbytes) + return (DB_NEEDSPLIT); + if (LF_ISSET(BPI_SPACEONLY)) + return (0); + + /* Add a new record for the right page. */ + DB_SET_DBT(hdr, &ri, RINTERNAL_SIZE); + ri.pgno = rchild->pgno; + ri.nrecs = nrecs; + size = RINTERNAL_SIZE; + data.size = 0; + /* + * For now, we are locking internal recno nodes so + * use two steps. + */ + if (LF_ISSET(BPI_REPLACE)) { + if ((ret = __bam_ditem(dbc, ppage, off)) != 0) + return (ret); + LF_CLR(BPI_REPLACE); } + break; + default: +pgfmt: return (__db_pgfmt(dbp->env, PGNO(child->page))); + } + + if (LF_ISSET(BPI_REPLACE)) { + DB_ASSERT(dbp->env, !LF_ISSET(BPI_NOLOGGING)); + if ((ret = __bam_irep(dbc, ppage, + off, &hdr, data.size != 0 ? &data : NULL)) != 0) + return (ret); + } else { + if (LF_ISSET(BPI_NOLOGGING)) + pitem = __db_pitem_nolog; + else + pitem = __db_pitem; + + if ((ret = pitem(dbc, ppage, + off, size, &hdr, data.size != 0 ? &data : NULL)) != 0) + return (ret); } /* - * Off is the last offset that's valid for the left page. - * Nxt is the first offset to be placed on the right page. + * If a Recno or Btree with record numbers AM page, or an off-page + * duplicates tree, adjust the parent page's left page record count. */ - l->lower += (off + 1) * sizeof(indx_t); + if (F_ISSET(cp, C_RECNUM) && !LF_ISSET(BPI_NORECNUM)) { + /* Log the change. */ + if (DBC_LOGGING(dbc)) { + if ((ret = __bam_cadjust_log(dbp, dbc->txn, + &LSN(ppage), 0, PGNO(ppage), &LSN(ppage), + parent->indx, -(int32_t)nrecs, 0)) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(ppage)); + + /* Update the left page count. 
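The prefix comparison used in this hunk only has to report how many leading bytes of the right page's first key distinguish it from the last key on the left page; the parent can then store that short prefix instead of the whole key. Below is a minimal, self-contained sketch of such a routine in the spirit of a bt_prefix callback; toy_prefix and its flat char-pointer interface are illustrative assumptions, not the DBT-based signature used above.

#include <stdio.h>
#include <string.h>

/*
 * Return how many bytes of b are needed to distinguish it from a,
 * assuming a sorts before b (a sketch, not the library's API).
 */
static size_t
toy_prefix(const char *a, size_t alen, const char *b, size_t blen)
{
	size_t cnt, len;

	len = alen < blen ? alen : blen;
	for (cnt = 1; len--; ++cnt)
		if (a[cnt - 1] != b[cnt - 1])
			return (cnt);
	/* a is a prefix of b: need one byte past a's length. */
	return (alen + 1);
}

int
main(void)
{
	/* Last key on the left page vs. first key on the right page. */
	const char *a = "applesauce", *b = "apricot";

	/* "ap" matches, then 'p' != 'r': 3 bytes ("apr") suffice. */
	printf("%zu\n", toy_prefix(a, strlen(a), b, strlen(b)));
	return (0);
}

Storing "apr" instead of the full key is exactly the saving the BINTERNAL_PSIZE(nksize) < nbytes test above is checking for.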
*/ + if (dbc->dbtype == DB_RECNO) + GET_RINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs; + else + GET_BINTERNAL(dbp, ppage, parent->indx)->nrecs -= nrecs; + } + + return (0); +} + +/* + * __bam_psplit -- + * Do the real work of splitting the page. + */ +static int +__bam_psplit(dbc, cp, lp, rp, splitret) + DBC *dbc; + EPG *cp; + PAGE *lp, *rp; + db_indx_t *splitret; +{ + DB *dbp; + PAGE *pp; + db_indx_t half, *inp, nbytes, off, splitp, top; + int adjust, cnt, iflag, isbigkey, ret; + + dbp = dbc->dbp; + pp = cp->page; + inp = P_INP(dbp, pp); + adjust = TYPE(pp) == P_LBTREE ? P_INDX : O_INDX; /* - * If splitting the page that the cursor was on, the cursor has to be - * adjusted to point to the same record as before the split. If the - * cursor is at or past the skipped slot, the cursor is incremented by - * one. If the cursor is on the right page, it is decremented by the - * number of records split to the left page. + * If we're splitting the first (last) page on a level because we're + * inserting (appending) a key to it, it's likely that the data is + * sorted. Moving a single item to the new page is less work and can + * push the fill factor higher than normal. This is trivial when we + * are splitting a new page before the beginning of the tree, all of + * the interesting tests are against values of 0. + * + * Catching appends to the tree is harder. In a simple append, we're + * inserting an item that sorts past the end of the tree; the cursor + * will point past the last element on the page. But, in trees with + * duplicates, the cursor may point to the last entry on the page -- + * in this case, the entry will also be the last element of a duplicate + * set (the last because the search call specified the SR_DUPLAST flag). + * The only way to differentiate between an insert immediately before + * the last item in a tree or an append after a duplicate set which is + * also the last item in the tree is to call the comparison function. + * When splitting internal pages during an append, the search code + * guarantees the cursor always points to the largest page item less + * than the new internal entry. To summarize, we want to catch three + * possible index values: + * + * NUM_ENT(page) Btree/Recno leaf insert past end-of-tree + * NUM_ENT(page) - O_INDX Btree or Recno internal insert past EOT + * NUM_ENT(page) - P_INDX Btree leaf insert past EOT after a set + * of duplicates + * + * two of which, (NUM_ENT(page) - O_INDX or P_INDX) might be an insert + * near the end of the tree, and not after the end of the tree at all. + * Do a simple test which might be wrong because calling the comparison + * functions is expensive. Regardless, it's not a big deal if we're + * wrong, we'll do the split the right way next time. */ - c = &t->bt_cursor; - if (F_ISSET(c, CURS_INIT) && c->pg.pgno == h->pgno) { - if (c->pg.index >= skip) - ++c->pg.index; - if (c->pg.index < nxt) /* Left page. */ - c->pg.pgno = l->pgno; - else { /* Right page. */ - c->pg.pgno = r->pgno; - c->pg.index -= nxt; - } - } + off = 0; + if (NEXT_PGNO(pp) == PGNO_INVALID && cp->indx >= NUM_ENT(pp) - adjust) + off = NUM_ENT(pp) - adjust; + else if (PREV_PGNO(pp) == PGNO_INVALID && cp->indx == 0) + off = adjust; + if (off != 0) + goto sort; /* - * If the skipped index was on the left page, just return that page. - * Otherwise, adjust the skip index to reflect the new position on - * the right page. + * Split the data to the left and right pages. Try not to split on + * an overflow key. 
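The comment above argues that, for sorted loads, splitting at the insertion point instead of the middle keeps pages nearly full. A throwaway simulation makes the effect concrete; CAP, the item-count page model, and both split policies are simplifying assumptions for illustration, not the library's actual space accounting.

#include <stdio.h>

#define CAP 100	/* toy page capacity, in items */

/* Count pages used by n sorted inserts under each split policy. */
int
main(void)
{
	int n = 1000, i, pages, fill;

	/* Middle split: each split leaves two half-full pages, and sorted
	 * inserts keep landing on the rightmost page, so the steady state
	 * is pages that are only about half full. */
	pages = 0; fill = 0;
	for (i = 0; i < n; ++i) {
		if (pages == 0 || fill == CAP) {	/* split: keep half */
			++pages;
			fill = fill == CAP ? CAP / 2 : 0;
		}
		++fill;
	}
	printf("middle split: %d pages (~%d%% full)\n",
	    pages, 100 * n / (pages * CAP));

	/* End split: move a single item to the new page; the old page
	 * stays full. */
	pages = 0; fill = 0;
	for (i = 0; i < n; ++i) {
		if (pages == 0 || fill == CAP) {
			++pages;
			fill = fill == CAP ? 1 : 0;	/* one item moves */
		}
		++fill;
	}
	printf("end split: %d pages (~%d%% full)\n",
	    pages, 100 * n / (pages * CAP));
	return (0);
}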
(Overflow keys on internal pages will slow down + * searches.) Refuse to split in the middle of a set of duplicates. + * + * First, find the optimum place to split. + * + * It's possible to try and split past the last record on the page if + * there's a very large record at the end of the page. Make sure this + * doesn't happen by bounding the check at the next-to-last entry on + * the page. + * + * Note, we try and split half the data present on the page. This is + * because another process may have already split the page and left + * it half empty. We don't try and skip the split -- we don't know + * how much space we're going to need on the page, and we may need up + * to half the page for a big item, so there's no easy test to decide + * if we need to split or not. Besides, if two threads are inserting + * data into the same place in the database, we're probably going to + * need more space soon anyway. */ - if (skip <= off) { - skip = 0; - rval = l; - } else { - rval = r; - *pskip -= nxt; - } + top = NUM_ENT(pp) - adjust; + half = (dbp->pgsize - HOFFSET(pp)) / 2; + for (nbytes = 0, off = 0; off < top && nbytes < half; ++off) + switch (TYPE(pp)) { + case P_IBTREE: + if (B_TYPE( + GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA) + nbytes += BINTERNAL_SIZE( + GET_BINTERNAL(dbp, pp, off)->len); + else + nbytes += BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA) + nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, + pp, off)->len); + else + nbytes += BOVERFLOW_SIZE; - for (off = 0; nxt < top; ++off) { - if (skip == nxt) { ++off; - skip = 0; - } - switch (h->flags & P_TYPE) { - case P_BINTERNAL: - src = bi = GETBINTERNAL(h, nxt); - nbytes = NBINTERNAL(bi->ksize); - break; - case P_BLEAF: - src = bl = GETBLEAF(h, nxt); - nbytes = NBLEAF(bl); - break; - case P_RINTERNAL: - src = GETRINTERNAL(h, nxt); - nbytes = NRINTERNAL; + /* FALLTHROUGH */ + case P_LDUP: + case P_LRECNO: + if (B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA) + nbytes += BKEYDATA_SIZE(GET_BKEYDATA(dbp, + pp, off)->len); + else + nbytes += BOVERFLOW_SIZE; break; - case P_RLEAF: - src = rl = GETRLEAF(h, nxt); - nbytes = NRLEAF(rl); + case P_IRECNO: + nbytes += RINTERNAL_SIZE; break; default: - abort(); + return (__db_pgfmt(dbp->env, pp->pgno)); } - ++nxt; - r->linp[off] = r->upper -= nbytes; - memmove((char *)r + r->upper, src, nbytes); - } - r->lower += off * sizeof(indx_t); +sort: splitp = off; - /* If the key is being appended to the page, adjust the index. */ - if (skip == top) - r->lower += sizeof(indx_t); + /* + * Splitp is either at or just past the optimum split point. If the + * tree type is such that we're going to promote a key to an internal + * page, and our current choice is an overflow key, look for something + * close by that's smaller. + */ + switch (TYPE(pp)) { + case P_IBTREE: + iflag = 1; + isbigkey = + B_TYPE(GET_BINTERNAL(dbp, pp, off)->type) != B_KEYDATA; + break; + case P_LBTREE: + case P_LDUP: + iflag = 0; + isbigkey = B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) != + B_KEYDATA; + break; + default: + iflag = isbigkey = 0; + } + if (isbigkey) + for (cnt = 1; cnt <= 3; ++cnt) { + off = splitp + cnt * adjust; + if (off < (db_indx_t)NUM_ENT(pp) && + ((iflag && B_TYPE( + GET_BINTERNAL(dbp, pp,off)->type) == B_KEYDATA) || + B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA)) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (iflag ? 
B_TYPE( + GET_BINTERNAL(dbp, pp, off)->type) == B_KEYDATA : + B_TYPE(GET_BKEYDATA(dbp, pp, off)->type) == + B_KEYDATA) { + splitp = off; + break; + } + } - return (rval); -} + /* + * We can't split in the middle a set of duplicates. We know that + * no duplicate set can take up more than about 25% of the page, + * because that's the point where we push it off onto a duplicate + * page set. So, this loop can't be unbounded. + */ + if (TYPE(pp) == P_LBTREE && + inp[splitp] == inp[splitp - adjust]) + for (cnt = 1;; ++cnt) { + off = splitp + cnt * adjust; + if (off < NUM_ENT(pp) && + inp[splitp] != inp[off]) { + splitp = off; + break; + } + if (splitp <= (db_indx_t)(cnt * adjust)) + continue; + off = splitp - cnt * adjust; + if (inp[splitp] != inp[off]) { + splitp = off + adjust; + break; + } + } -/* - * BT_PRESERVE -- Mark a chain of pages as used by an internal node. - * - * Chains of indirect blocks pointed to by leaf nodes get reclaimed when the - * record that references them gets deleted. Chains pointed to by internal - * pages never get deleted. This routine marks a chain as pointed to by an - * internal page. - * - * Parameters: - * t: tree - * pg: page number of first page in the chain. - * - * Returns: - * RET_SUCCESS, RET_ERROR. - */ -static int -bt_preserve(t, pg) - BTREE *t; - pgno_t pg; -{ - PAGE *h; + /* We're going to split at splitp. */ + if ((ret = __bam_copy(dbp, pp, lp, 0, splitp)) != 0) + return (ret); + if ((ret = __bam_copy(dbp, pp, rp, splitp, NUM_ENT(pp))) != 0) + return (ret); - if ((h = mpool_get(t->bt_mp, pg, 0)) == NULL) - return (RET_ERROR); - h->flags |= P_PRESERVE; - mpool_put(t->bt_mp, h, MPOOL_DIRTY); - return (RET_SUCCESS); + *splitret = splitp; + return (0); } /* - * REC_TOTAL -- Return the number of recno entries below a page. - * - * Parameters: - * h: page - * - * Returns: - * The number of recno entries below a page. + * __bam_copy -- + * Copy a set of records from one page to another. * - * XXX - * These values could be set by the bt_psplit routine. The problem is that the - * entry has to be popped off of the stack etc. or the values have to be passed - * all the way back to bt_split/bt_rroot and it's not very clean. + * PUBLIC: int __bam_copy __P((DB *, PAGE *, PAGE *, u_int32_t, u_int32_t)); */ -static recno_t -rec_total(h) - PAGE *h; +int +__bam_copy(dbp, pp, cp, nxt, stop) + DB *dbp; + PAGE *pp, *cp; + u_int32_t nxt, stop; { - recno_t recs; - indx_t nxt, top; + BINTERNAL internal; + db_indx_t *cinp, nbytes, off, *pinp; - for (recs = 0, nxt = 0, top = NEXTINDEX(h); nxt < top; ++nxt) - recs += GETRINTERNAL(h, nxt)->nrecs; - return (recs); + cinp = P_INP(dbp, cp); + pinp = P_INP(dbp, pp); + /* + * Nxt is the offset of the next record to be placed on the target page. + */ + for (off = 0; nxt < stop; ++nxt, ++NUM_ENT(cp), ++off) { + switch (TYPE(pp)) { + case P_IBTREE: + if (off == 0 && nxt != 0) + nbytes = BINTERNAL_SIZE(0); + else if (B_TYPE( + GET_BINTERNAL(dbp, pp, nxt)->type) == B_KEYDATA) + nbytes = BINTERNAL_SIZE( + GET_BINTERNAL(dbp, pp, nxt)->len); + else + nbytes = BINTERNAL_SIZE(BOVERFLOW_SIZE); + break; + case P_LBTREE: + /* + * If we're on a key and it's a duplicate, just copy + * the offset. 
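On a btree leaf a key stored once can be referenced by several index slots, so the copy loop in this hunk duplicates only the slot, not the bytes, when it sees a repeated key offset. A toy model of that sharing follows; the four-slot array is a hypothetical stand-in for the page's inp[] array, with keys at even slots and data at odd slots.

#include <stdio.h>

int
main(void)
{
	/* inp[0], inp[2] are key slots; inp[1], inp[3] are data slots.
	 * The same key offset (900) appears twice: an on-page duplicate. */
	unsigned short inp[] = { 900, 850, 900, 800 };
	unsigned short out[4];
	int i;

	for (i = 0; i < 4; ++i) {
		if (i >= 2 && i % 2 == 0 && inp[i] == inp[i - 2]) {
			out[i] = out[i - 2];	/* share, don't re-copy */
			printf("slot %d: shared with slot %d\n", i, i - 2);
		} else {
			out[i] = inp[i];	/* stand-in for a byte copy */
			printf("slot %d: copied item at offset %u\n",
			    i, inp[i]);
		}
	}
	return (0);
}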
+ */ + if (off != 0 && (nxt % P_INDX) == 0 && + pinp[nxt] == pinp[nxt - P_INDX]) { + cinp[off] = cinp[off - P_INDX]; + continue; + } + /* FALLTHROUGH */ + case P_LDUP: + case P_LRECNO: + if (B_TYPE(GET_BKEYDATA(dbp, pp, nxt)->type) == + B_KEYDATA) + nbytes = BKEYDATA_SIZE(GET_BKEYDATA(dbp, + pp, nxt)->len); + else + nbytes = BOVERFLOW_SIZE; + break; + case P_IRECNO: + nbytes = RINTERNAL_SIZE; + break; + default: + return (__db_pgfmt(dbp->env, pp->pgno)); + } + cinp[off] = HOFFSET(cp) -= nbytes; + if (off == 0 && nxt != 0 && TYPE(pp) == P_IBTREE) { + internal.len = 0; + UMRW_SET(internal.unused); + internal.type = B_KEYDATA; + internal.pgno = GET_BINTERNAL(dbp, pp, nxt)->pgno; + internal.nrecs = GET_BINTERNAL(dbp, pp, nxt)->nrecs; + memcpy(P_ENTRY(dbp, cp, off), &internal, nbytes); + } + else + memcpy(P_ENTRY(dbp, cp, off), + P_ENTRY(dbp, pp, nxt), nbytes); + } + return (0); } diff --git a/btree/bt_stat.c b/btree/bt_stat.c new file mode 100644 index 0000000..912a166 --- /dev/null +++ b/btree/bt_stat.c @@ -0,0 +1,669 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" + +#ifdef HAVE_STATISTICS +/* + * __bam_stat -- + * Gather/print the btree statistics + * + * PUBLIC: int __bam_stat __P((DBC *, void *, u_int32_t)); + */ +int +__bam_stat(dbc, spp, flags) + DBC *dbc; + void *spp; + u_int32_t flags; +{ + BTMETA *meta; + BTREE *t; + BTREE_CURSOR *cp; + DB *dbp; + DB_BTREE_STAT *sp; + DB_LOCK lock, metalock; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + db_pgno_t pgno; + int ret, t_ret, write_meta; + + dbp = dbc->dbp; + env = dbp->env; + + meta = NULL; + t = dbp->bt_internal; + sp = NULL; + LOCK_INIT(metalock); + LOCK_INIT(lock); + mpf = dbp->mpf; + h = NULL; + ret = write_meta = 0; + + cp = (BTREE_CURSOR *)dbc->internal; + + /* Allocate and clear the structure. */ + if ((ret = __os_umalloc(env, sizeof(*sp), &sp)) != 0) + goto err; + memset(sp, 0, sizeof(*sp)); + + /* Get the metadata page for the entire database. */ + pgno = PGNO_BASE_MD; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, 0, &meta)) != 0) + goto err; + + if (flags == DB_FAST_STAT) + goto meta_only; + + /* Walk the metadata free list, counting pages. */ + for (sp->bt_free = 0, pgno = meta->dbmeta.free; pgno != PGNO_INVALID;) { + ++sp->bt_free; + + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + goto err; + + pgno = h->next_pgno; + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + h = NULL; + } + + /* Get the root page. */ + pgno = cp->root; + if ((ret = __db_lget(dbc, 0, pgno, DB_LOCK_READ, 0, &lock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + goto err; + + /* Get the levels from the root page. */ + sp->bt_levels = h->level; + + /* Discard the root page. */ + ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority); + h = NULL; + if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; + + /* Walk the tree. 
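The free-list count earlier in this function is a plain linked-page walk: fetch a page, read its next-page link, release it, repeat, so only one page is held at a time. A minimal sketch of the same shape, with an in-memory next_pgno[] array standing in for the buffer pool (no locking and no error paths, both of which the real code needs):

#include <stdio.h>

#define NPAGES 6
#define PGNO_INVALID 0

/* Stand-in for on-disk pages: just the next-page link. */
static unsigned next_pgno[NPAGES] = { 0, 0, 4, 0, 5, PGNO_INVALID };

/*
 * Count the pages on a chain starting at head, touching one page at
 * a time, the same shape as the free-list walk above.
 */
static unsigned
count_chain(unsigned head)
{
	unsigned n, pgno;

	for (n = 0, pgno = head; pgno != PGNO_INVALID; ++n)
		pgno = next_pgno[pgno];	/* fetch page, read link, release */
	return (n);
}

int
main(void)
{
	/* Free list: 2 -> 4 -> 5. */
	printf("free pages: %u\n", count_chain(2));	/* 3 */
	return (0);
}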
*/ + if ((ret = __bam_traverse(dbc, + DB_LOCK_READ, cp->root, __bam_stat_callback, sp)) != 0) + goto err; + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp) && (ret = __bam_compress_count(dbc, + &sp->bt_nkeys, &sp->bt_ndata)) != 0) + goto err; +#endif + + /* + * Get the subdatabase metadata page if it's not the same as the + * one we already have. + */ + write_meta = !F_ISSET(dbp, DB_AM_RDONLY) && + (!MULTIVERSION(dbp) || dbc->txn != NULL); +meta_only: + if (t->bt_meta != PGNO_BASE_MD || write_meta) { + ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority); + meta = NULL; + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; + + if ((ret = __db_lget(dbc, + 0, t->bt_meta, write_meta ? DB_LOCK_WRITE : DB_LOCK_READ, + 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &t->bt_meta, + dbc->thread_info, dbc->txn, + write_meta ? DB_MPOOL_DIRTY : 0, &meta)) != 0) + goto err; + } + if (flags == DB_FAST_STAT) { + if (dbp->type == DB_RECNO || + (dbp->type == DB_BTREE && F_ISSET(dbp, DB_AM_RECNUM))) { + if ((ret = __db_lget(dbc, 0, + cp->root, DB_LOCK_READ, 0, &lock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &cp->root, + dbc->thread_info, dbc->txn, 0, &h)) != 0) + goto err; + + sp->bt_nkeys = RE_NREC(h); + } else + sp->bt_nkeys = meta->dbmeta.key_count; + + sp->bt_ndata = dbp->type == DB_RECNO ? + sp->bt_nkeys : meta->dbmeta.record_count; + } + + /* Get metadata page statistics. */ + sp->bt_metaflags = meta->dbmeta.flags; + sp->bt_minkey = meta->minkey; + sp->bt_re_len = meta->re_len; + sp->bt_re_pad = meta->re_pad; + /* + * Don't take the page number from the meta-data page -- that value is + * only maintained in the primary database, we may have been called on + * a subdatabase. (Yes, I read the primary database meta-data page + * earlier in this function, but I'm asking the underlying cache so the + * code for the Hash and Btree methods is the same.) + */ + if ((ret = __memp_get_last_pgno(dbp->mpf, &pgno)) != 0) + goto err; + sp->bt_pagecnt = pgno + 1; + sp->bt_pagesize = meta->dbmeta.pagesize; + sp->bt_magic = meta->dbmeta.magic; + sp->bt_version = meta->dbmeta.version; + + if (write_meta != 0) { + meta->dbmeta.key_count = sp->bt_nkeys; + meta->dbmeta.record_count = sp->bt_ndata; + } + + *(DB_BTREE_STAT **)spp = sp; + +err: /* Discard the second page. */ + if ((t_ret = __LPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + if (h != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + /* Discard the metadata page. */ + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if (meta != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + if (ret != 0 && sp != NULL) { + __os_ufree(env, sp); + *(DB_BTREE_STAT **)spp = NULL; + } + + return (ret); +} + +/* + * __bam_stat_print -- + * Display btree/recno statistics. 
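The err: block above shows the ret/t_ret convention used throughout these files: every cleanup step runs unconditionally, and a cleanup failure is reported only if nothing failed earlier. A self-contained sketch, with close_a and close_b as hypothetical cleanup steps:

#include <stdio.h>

static int close_a(void) { return (0); }	/* succeeds */
static int close_b(void) { return (-7); }	/* fails */

/*
 * The ret/t_ret idiom: run every cleanup step, but report the first
 * failure rather than the last.
 */
static int
cleanup(int ret)
{
	int t_ret;

	if ((t_ret = close_a()) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = close_b()) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

int
main(void)
{
	printf("%d\n", cleanup(0));	/* -7: cleanup error surfaces  */
	printf("%d\n", cleanup(-1));	/* -1: earlier error preserved */
	return (0);
}

The pattern matters because skipping cleanup on error would leak pages and locks, while letting a later t_ret overwrite ret would hide the original failure.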
+ * + * PUBLIC: int __bam_stat_print __P((DBC *, u_int32_t)); + */ +int +__bam_stat_print(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + static const FN fn[] = { + { BTM_DUP, "duplicates" }, + { BTM_RECNO, "recno" }, + { BTM_RECNUM, "record-numbers" }, + { BTM_FIXEDLEN, "fixed-length" }, + { BTM_RENUMBER, "renumber" }, + { BTM_SUBDB, "multiple-databases" }, + { BTM_DUPSORT, "sorted duplicates" }, + { BTM_COMPRESS, "compressed" }, + { 0, NULL } + }; + DB *dbp; + DB_BTREE_STAT *sp; + ENV *env; + int lorder, ret; + const char *s; + + dbp = dbc->dbp; + env = dbp->env; +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) { + if ((ret = __partition_stat(dbc, &sp, flags)) != 0) + return (ret); + } else +#endif + if ((ret = __bam_stat(dbc, &sp, LF_ISSET(DB_FAST_STAT))) != 0) + return (ret); + + if (LF_ISSET(DB_STAT_ALL)) { + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "Default Btree/Recno database information:"); + } + + __db_msg(env, "%lx\tBtree magic number", (u_long)sp->bt_magic); + __db_msg(env, "%lu\tBtree version number", (u_long)sp->bt_version); + + (void)__db_get_lorder(dbp, &lorder); + switch (lorder) { + case 1234: + s = "Little-endian"; + break; + case 4321: + s = "Big-endian"; + break; + default: + s = "Unrecognized byte order"; + break; + } + __db_msg(env, "%s\tByte order", s); + __db_prflags(env, NULL, sp->bt_metaflags, fn, NULL, "\tFlags"); + if (dbp->type == DB_BTREE) + __db_dl(env, "Minimum keys per-page", (u_long)sp->bt_minkey); + if (dbp->type == DB_RECNO) { + __db_dl(env, + "Fixed-length record size", (u_long)sp->bt_re_len); + __db_msg(env, + "%#x\tFixed-length record pad", (u_int)sp->bt_re_pad); + } + __db_dl(env, + "Underlying database page size", (u_long)sp->bt_pagesize); + if (dbp->type == DB_BTREE) + __db_dl(env, "Overflow key/data size", + ((BTREE_CURSOR *)dbc->internal)->ovflsize); + __db_dl(env, "Number of levels in the tree", (u_long)sp->bt_levels); + __db_dl(env, dbp->type == DB_BTREE ? + "Number of unique keys in the tree" : + "Number of records in the tree", (u_long)sp->bt_nkeys); + __db_dl(env, + "Number of data items in the tree", (u_long)sp->bt_ndata); + + __db_dl(env, + "Number of tree internal pages", (u_long)sp->bt_int_pg); + __db_dl_pct(env, + "Number of bytes free in tree internal pages", + (u_long)sp->bt_int_pgfree, + DB_PCT_PG(sp->bt_int_pgfree, sp->bt_int_pg, sp->bt_pagesize), "ff"); + + __db_dl(env, + "Number of tree leaf pages", (u_long)sp->bt_leaf_pg); + __db_dl_pct(env, "Number of bytes free in tree leaf pages", + (u_long)sp->bt_leaf_pgfree, DB_PCT_PG( + sp->bt_leaf_pgfree, sp->bt_leaf_pg, sp->bt_pagesize), "ff"); + + __db_dl(env, + "Number of tree duplicate pages", (u_long)sp->bt_dup_pg); + __db_dl_pct(env, + "Number of bytes free in tree duplicate pages", + (u_long)sp->bt_dup_pgfree, + DB_PCT_PG(sp->bt_dup_pgfree, sp->bt_dup_pg, sp->bt_pagesize), "ff"); + + __db_dl(env, + "Number of tree overflow pages", (u_long)sp->bt_over_pg); + __db_dl_pct(env, "Number of bytes free in tree overflow pages", + (u_long)sp->bt_over_pgfree, DB_PCT_PG( + sp->bt_over_pgfree, sp->bt_over_pg, sp->bt_pagesize), "ff"); + __db_dl(env, "Number of empty pages", (u_long)sp->bt_empty_pg); + + __db_dl(env, "Number of pages on the free list", (u_long)sp->bt_free); + + __os_ufree(env, sp); + + return (0); +} + +/* + * __bam_stat_callback -- + * Statistics callback. 
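The byte-order switch above uses the conventional 1234/4321 constants: numbering bytes from 1 at the least significant, the digits list the bytes in increasing address order, so 1234 means least-significant byte first (little-endian). A quick host-order probe in the same convention; it's a sketch, and the stat code above properly gets this value from __db_get_lorder rather than probing.

#include <stdio.h>

/* Report the host byte order in the 1234/4321 convention. */
int
main(void)
{
	unsigned int probe = 0x01020304;
	unsigned char *p = (unsigned char *)&probe;

	/* Little-endian machines store the 0x04 byte first. */
	printf("%s\n", p[0] == 0x04 ?
	    "1234\tLittle-endian" : "4321\tBig-endian");
	return (0);
}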
+ * + * PUBLIC: int __bam_stat_callback __P((DBC *, PAGE *, void *, int *)); + */ +int +__bam_stat_callback(dbc, h, cookie, putp) + DBC *dbc; + PAGE *h; + void *cookie; + int *putp; +{ + DB *dbp; + DB_BTREE_STAT *sp; + db_indx_t indx, *inp, top; + u_int8_t type; + + dbp = dbc->dbp; + sp = cookie; + *putp = 0; + top = NUM_ENT(h); + inp = P_INP(dbp, h); + + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + ++sp->bt_int_pg; + sp->bt_int_pgfree += P_FREESPACE(dbp, h); + break; + case P_LBTREE: + if (top == 0) + ++sp->bt_empty_pg; + + /* Correct for on-page duplicates and deleted items. */ + for (indx = 0; indx < top; indx += P_INDX) { + type = GET_BKEYDATA(dbp, h, indx + O_INDX)->type; + /* Ignore deleted items. */ + if (B_DISSET(type)) + continue; + + /* Ignore duplicate keys. */ + if (indx + P_INDX >= top || + inp[indx] != inp[indx + P_INDX]) + ++sp->bt_nkeys; + + /* Ignore off-page duplicates. */ + if (B_TYPE(type) != B_DUPLICATE) + ++sp->bt_ndata; + } + + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += P_FREESPACE(dbp, h); + break; + case P_LRECNO: + if (top == 0) + ++sp->bt_empty_pg; + + /* + * If walking a recno tree, then each of these items is a key. + * Otherwise, we're walking an off-page duplicate set. + */ + if (dbp->type == DB_RECNO) { + /* + * Correct for deleted items in non-renumbering Recno + * databases. + */ + if (F_ISSET(dbp, DB_AM_RENUMBER)) { + sp->bt_nkeys += top; + sp->bt_ndata += top; + } else + for (indx = 0; indx < top; indx += O_INDX) { + type = GET_BKEYDATA(dbp, h, indx)->type; + if (!B_DISSET(type)) { + ++sp->bt_ndata; + ++sp->bt_nkeys; + } + } + + ++sp->bt_leaf_pg; + sp->bt_leaf_pgfree += P_FREESPACE(dbp, h); + } else { + sp->bt_ndata += top; + + ++sp->bt_dup_pg; + sp->bt_dup_pgfree += P_FREESPACE(dbp, h); + } + break; + case P_LDUP: + if (top == 0) + ++sp->bt_empty_pg; + + /* Correct for deleted items. */ + for (indx = 0; indx < top; indx += O_INDX) + if (!B_DISSET(GET_BKEYDATA(dbp, h, indx)->type)) + ++sp->bt_ndata; + + ++sp->bt_dup_pg; + sp->bt_dup_pgfree += P_FREESPACE(dbp, h); + break; + case P_OVERFLOW: + ++sp->bt_over_pg; + sp->bt_over_pgfree += P_OVFLSPACE(dbp, dbp->pgsize, h); + break; + default: + return (__db_pgfmt(dbp->env, h->pgno)); + } + return (0); +} + +/* + * __bam_print_cursor -- + * Display the current internal cursor. + * + * PUBLIC: void __bam_print_cursor __P((DBC *)); + */ +void +__bam_print_cursor(dbc) + DBC *dbc; +{ + static const FN fn[] = { + { C_DELETED, "C_DELETED" }, + { C_RECNUM, "C_RECNUM" }, + { C_RENUMBER, "C_RENUMBER" }, + { 0, NULL } + }; + ENV *env; + BTREE_CURSOR *cp; + + env = dbc->env; + cp = (BTREE_CURSOR *)dbc->internal; + + STAT_ULONG("Overflow size", cp->ovflsize); + if (dbc->dbtype == DB_RECNO) + STAT_ULONG("Recno", cp->recno); + STAT_ULONG("Order", cp->order); + __db_prflags(env, NULL, cp->flags, fn, NULL, "\tInternal Flags"); +} + +#else /* !HAVE_STATISTICS */ + +int +__bam_stat(dbc, spp, flags) + DBC *dbc; + void *spp; + u_int32_t flags; +{ + COMPQUIET(spp, NULL); + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbc->env)); +} + +int +__bam_stat_print(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbc->env)); +} +#endif + +#ifndef HAVE_BREW +/* + * __bam_key_range -- + * Return proportion of keys relative to given key. The numbers are + * slightly skewed due to on page duplicates. 
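Before the implementation, a worked version of the arithmetic may help: at each level of the search stack, the slots before indx contribute a "less" share of the current factor, the slots after it a "greater" share, and the slot at indx is refined by the next level; the shares plus the final factor sum to 1. The sketch hard-codes a two-level stack and only the general middle-index case (the routine below also handles indx of 0 or entries, and inexact matches):

#include <stdio.h>

struct level { unsigned indx, entries; };

int
main(void)
{
	struct level stk[] = { { 2, 5 }, { 3, 10 } }; /* root, then leaf */
	double factor = 1.0, less = 0.0, greater = 0.0, equal;
	int i, exact = 1;

	for (i = 0; i < 2; ++i) {
		unsigned indx = stk[i].indx, entries = stk[i].entries;

		less += factor * indx / entries;
		greater += factor * ((entries - indx) - 1) / entries;
		factor *= 1.0 / entries;
	}
	equal = exact ? factor : 0.0;	/* one n'th for the key itself */

	/* Prints less=0.460 equal=0.020 greater=0.520; total is 1. */
	printf("less=%.3f equal=%.3f greater=%.3f\n", less, equal, greater);
	return (0);
}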
+ * + * PUBLIC: int __bam_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t)); + */ +int +__bam_key_range(dbc, dbt, kp, flags) + DBC *dbc; + DBT *dbt; + DB_KEY_RANGE *kp; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + EPG *sp; + double factor; + int exact, ret; + + COMPQUIET(flags, 0); + + if ((ret = __bam_search(dbc, PGNO_INVALID, + dbt, SR_STK_ONLY, 1, NULL, &exact)) != 0) + return (ret); + + cp = (BTREE_CURSOR *)dbc->internal; + kp->less = kp->greater = 0.0; + + factor = 1.0; + + /* Correct the leaf page. */ + cp->csp->entries /= 2; + cp->csp->indx /= 2; + for (sp = cp->sp; sp <= cp->csp; ++sp) { + /* + * At each level we know that pages greater than indx contain + * keys greater than what we are looking for and those less + * than indx are less than. The one pointed to by indx may + * have some less, some greater or even equal. If indx is + * equal to the number of entries, then the key is out of range + * and everything is less. + */ + if (sp->indx == 0) + kp->greater += factor * (sp->entries - 1)/sp->entries; + else if (sp->indx == sp->entries) + kp->less += factor; + else { + kp->less += factor * sp->indx / sp->entries; + kp->greater += factor * + ((sp->entries - sp->indx) - 1) / sp->entries; + } + factor *= 1.0/sp->entries; + } + + /* + * If there was an exact match then assign 1 n'th to the key itself. + * Otherwise that factor belongs to those greater than the key, unless + * the key was out of range. + */ + if (exact) + kp->equal = factor; + else { + if (kp->less != 1) + kp->greater += factor; + kp->equal = 0; + } + + BT_STK_CLR(cp); + + return (0); +} +#endif + +/* + * __bam_traverse -- + * Walk a Btree database. + * + * PUBLIC: int __bam_traverse __P((DBC *, db_lockmode_t, + * PUBLIC: db_pgno_t, int (*)(DBC *, PAGE *, void *, int *), void *)); + */ +int +__bam_traverse(dbc, mode, root_pgno, callback, cookie) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t root_pgno; + int (*callback)__P((DBC *, PAGE *, void *, int *)); + void *cookie; +{ + BINTERNAL *bi; + BKEYDATA *bk; + DB *dbp; + DB_LOCK lock; + DB_MPOOLFILE *mpf; + PAGE *h; + RINTERNAL *ri; + db_indx_t indx, *inp; + int already_put, ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + already_put = 0; + + if ((ret = __db_lget(dbc, 0, root_pgno, mode, 0, &lock)) != 0) + return (ret); + if ((ret = __memp_fget(mpf, &root_pgno, + dbc->thread_info, dbc->txn, 0, &h)) != 0) { + (void)__TLPUT(dbc, lock); + return (ret); + } + + switch (TYPE(h)) { + case P_IBTREE: + for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { + bi = GET_BINTERNAL(dbp, h, indx); + if (B_TYPE(bi->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbc, + ((BOVERFLOW *)bi->data)->pgno, + callback, cookie)) != 0) + goto err; + if ((ret = __bam_traverse( + dbc, mode, bi->pgno, callback, cookie)) != 0) + goto err; + } + break; + case P_IRECNO: + for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { + ri = GET_RINTERNAL(dbp, h, indx); + if ((ret = __bam_traverse( + dbc, mode, ri->pgno, callback, cookie)) != 0) + goto err; + } + break; + case P_LBTREE: + inp = P_INP(dbp, h); + for (indx = 0; indx < NUM_ENT(h); indx += P_INDX) { + bk = GET_BKEYDATA(dbp, h, indx); + if (B_TYPE(bk->type) == B_OVERFLOW && + (indx + P_INDX >= NUM_ENT(h) || + inp[indx] != inp[indx + P_INDX])) { + if ((ret = __db_traverse_big(dbc, + GET_BOVERFLOW(dbp, h, indx)->pgno, + callback, cookie)) != 0) + goto err; + } + bk = GET_BKEYDATA(dbp, h, indx + O_INDX); + if (B_TYPE(bk->type) == B_DUPLICATE && + (ret = __bam_traverse(dbc, mode, + GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno, + callback, cookie)) 
!= 0) + goto err; + if (B_TYPE(bk->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbc, + GET_BOVERFLOW(dbp, h, indx + O_INDX)->pgno, + callback, cookie)) != 0) + goto err; + } + break; + case P_LDUP: + case P_LRECNO: + for (indx = 0; indx < NUM_ENT(h); indx += O_INDX) { + bk = GET_BKEYDATA(dbp, h, indx); + if (B_TYPE(bk->type) == B_OVERFLOW && + (ret = __db_traverse_big(dbc, + GET_BOVERFLOW(dbp, h, indx)->pgno, + callback, cookie)) != 0) + goto err; + } + break; + default: + return (__db_pgfmt(dbp->env, h->pgno)); + } + + ret = callback(dbc, h, cookie, &already_put); + +err: if (!already_put && (t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} diff --git a/btree/bt_upgrade.c b/btree/bt_upgrade.c new file mode 100644 index 0000000..edf6718 --- /dev/null +++ b/btree/bt_upgrade.c @@ -0,0 +1,153 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_upgrade.h" +#include "dbinc/btree.h" + +/* + * __bam_30_btreemeta -- + * Upgrade the metadata pages from version 6 to version 7. + * + * PUBLIC: int __bam_30_btreemeta __P((DB *, char *, u_int8_t *)); + */ +int +__bam_30_btreemeta(dbp, real_name, buf) + DB *dbp; + char *real_name; + u_int8_t *buf; +{ + BTMETA2X *oldmeta; + BTMETA30 *newmeta; + ENV *env; + int ret; + + env = dbp->env; + + newmeta = (BTMETA30 *)buf; + oldmeta = (BTMETA2X *)buf; + + /* + * Move things from the end up, so we do not overwrite things. + * We are going to create a new uid, so we can move the stuff + * at the end of the structure first, overwriting the uid. + */ + + newmeta->re_pad = oldmeta->re_pad; + newmeta->re_len = oldmeta->re_len; + newmeta->minkey = oldmeta->minkey; + newmeta->maxkey = oldmeta->maxkey; + newmeta->dbmeta.free = oldmeta->free; + newmeta->dbmeta.flags = oldmeta->flags; + newmeta->dbmeta.type = P_BTREEMETA; + + newmeta->dbmeta.version = 7; + /* Replace the unique ID. */ + if ((ret = __os_fileid(env, real_name, 1, buf + 36)) != 0) + return (ret); + + newmeta->root = 1; + + return (0); +} + +/* + * __bam_31_btreemeta -- + * Upgrade the database from version 7 to version 8. + * + * PUBLIC: int __bam_31_btreemeta + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_31_btreemeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BTMETA30 *oldmeta; + BTMETA31 *newmeta; + + COMPQUIET(dbp, NULL); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + + newmeta = (BTMETA31 *)h; + oldmeta = (BTMETA30 *)h; + + /* + * Copy the effected fields down the page. + * The fields may overlap each other so we + * start at the bottom and use memmove. + */ + newmeta->root = oldmeta->root; + newmeta->re_pad = oldmeta->re_pad; + newmeta->re_len = oldmeta->re_len; + newmeta->minkey = oldmeta->minkey; + newmeta->maxkey = oldmeta->maxkey; + memmove(newmeta->dbmeta.uid, + oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid)); + newmeta->dbmeta.flags = oldmeta->dbmeta.flags; + newmeta->dbmeta.record_count = 0; + newmeta->dbmeta.key_count = 0; + ZERO_LSN(newmeta->dbmeta.unused3); + + /* Set the version number. */ + newmeta->dbmeta.version = 8; + + /* Upgrade the flags. 
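Both upgrade routines in this hunk rewrite a metadata page in place, which is why the comments insist on moving fields from the end of the structure upward, or on using memmove: a field must not be overwritten before it has been read. The fragment below shows the overlapping-copy idea on a plain buffer; the layout is invented purely for illustration.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[16] = "AABBCCDD";

	/*
	 * Shift the 6-byte payload "BBCCDD" two bytes left, in place.
	 * The regions overlap, so memmove (or a back-to-front copy) is
	 * required; memcpy would be undefined behavior here.
	 */
	memmove(buf, buf + 2, 6);
	buf[6] = '\0';
	printf("%s\n", buf);	/* BBCCDD */
	return (0);
}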
*/ + if (LF_ISSET(DB_DUPSORT)) + F_SET(&newmeta->dbmeta, BTM_DUPSORT); + + *dirtyp = 1; + return (0); +} + +/* + * __bam_31_lbtree -- + * Upgrade the database btree leaf pages. + * + * PUBLIC: int __bam_31_lbtree + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__bam_31_lbtree(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + BKEYDATA *bk; + db_pgno_t pgno; + db_indx_t indx; + int ret; + + ret = 0; + for (indx = O_INDX; indx < NUM_ENT(h); indx += P_INDX) { + bk = GET_BKEYDATA(dbp, h, indx); + if (B_TYPE(bk->type) == B_DUPLICATE) { + pgno = GET_BOVERFLOW(dbp, h, indx)->pgno; + if ((ret = __db_31_offdup(dbp, real_name, fhp, + LF_ISSET(DB_DUPSORT) ? 1 : 0, &pgno)) != 0) + break; + if (pgno != GET_BOVERFLOW(dbp, h, indx)->pgno) { + *dirtyp = 1; + GET_BOVERFLOW(dbp, h, indx)->pgno = pgno; + } + } + } + + return (ret); +} diff --git a/btree/bt_utils.c b/btree/bt_utils.c deleted file mode 100644 index 9c1438e..0000000 --- a/btree/bt_utils.c +++ /dev/null @@ -1,260 +0,0 @@ -/*- - * Copyright (c) 1990, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#if defined(LIBC_SCCS) && !defined(lint) -static char sccsid[] = "@(#)bt_utils.c 8.8 (Berkeley) 7/20/94"; -#endif /* LIBC_SCCS and not lint */ - -#include <sys/param.h> - -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#include <db.h> -#include "btree.h" - -/* - * __bt_ret -- - * Build return key/data pair. 
- * - * Parameters: - * t: tree - * e: key/data pair to be returned - * key: user's key structure (NULL if not to be filled in) - * rkey: memory area to hold key - * data: user's data structure (NULL if not to be filled in) - * rdata: memory area to hold data - * copy: always copy the key/data item - * - * Returns: - * RET_SUCCESS, RET_ERROR. - */ -int -__bt_ret(t, e, key, rkey, data, rdata, copy) - BTREE *t; - EPG *e; - DBT *key, *rkey, *data, *rdata; - int copy; -{ - BLEAF *bl; - void *p; - - bl = GETBLEAF(e->page, e->index); - - /* - * We must copy big keys/data to make them contigous. Otherwise, - * leave the page pinned and don't copy unless the user specified - * concurrent access. - */ - if (key == NULL) - goto dataonly; - - if (bl->flags & P_BIGKEY) { - if (__ovfl_get(t, bl->bytes, - &key->size, &rkey->data, &rkey->size)) - return (RET_ERROR); - key->data = rkey->data; - } else if (copy || F_ISSET(t, B_DB_LOCK)) { - if (bl->ksize > rkey->size) { - p = (void *)(rkey->data == NULL ? - malloc(bl->ksize) : realloc(rkey->data, bl->ksize)); - if (p == NULL) - return (RET_ERROR); - rkey->data = p; - rkey->size = bl->ksize; - } - memmove(rkey->data, bl->bytes, bl->ksize); - key->size = bl->ksize; - key->data = rkey->data; - } else { - key->size = bl->ksize; - key->data = bl->bytes; - } - -dataonly: - if (data == NULL) - return (RET_SUCCESS); - - if (bl->flags & P_BIGDATA) { - if (__ovfl_get(t, bl->bytes + bl->ksize, - &data->size, &rdata->data, &rdata->size)) - return (RET_ERROR); - data->data = rdata->data; - } else if (copy || F_ISSET(t, B_DB_LOCK)) { - /* Use +1 in case the first record retrieved is 0 length. */ - if (bl->dsize + 1 > rdata->size) { - p = (void *)(rdata->data == NULL ? - malloc(bl->dsize + 1) : - realloc(rdata->data, bl->dsize + 1)); - if (p == NULL) - return (RET_ERROR); - rdata->data = p; - rdata->size = bl->dsize + 1; - } - memmove(rdata->data, bl->bytes + bl->ksize, bl->dsize); - data->size = bl->dsize; - data->data = rdata->data; - } else { - data->size = bl->dsize; - data->data = bl->bytes + bl->ksize; - } - - return (RET_SUCCESS); -} - -/* - * __BT_CMP -- Compare a key to a given record. - * - * Parameters: - * t: tree - * k1: DBT pointer of first arg to comparison - * e: pointer to EPG for comparison - * - * Returns: - * < 0 if k1 is < record - * = 0 if k1 is = record - * > 0 if k1 is > record - */ -int -__bt_cmp(t, k1, e) - BTREE *t; - const DBT *k1; - EPG *e; -{ - BINTERNAL *bi; - BLEAF *bl; - DBT k2; - PAGE *h; - void *bigkey; - - /* - * The left-most key on internal pages, at any level of the tree, is - * guaranteed by the following code to be less than any user key. - * This saves us from having to update the leftmost key on an internal - * page when the user inserts a new key in the tree smaller than - * anything we've yet seen. - */ - h = e->page; - if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & P_BLEAF)) - return (1); - - bigkey = NULL; - if (h->flags & P_BLEAF) { - bl = GETBLEAF(h, e->index); - if (bl->flags & P_BIGKEY) - bigkey = bl->bytes; - else { - k2.data = bl->bytes; - k2.size = bl->ksize; - } - } else { - bi = GETBINTERNAL(h, e->index); - if (bi->flags & P_BIGKEY) - bigkey = bi->bytes; - else { - k2.data = bi->bytes; - k2.size = bi->ksize; - } - } - - if (bigkey) { - if (__ovfl_get(t, bigkey, - &k2.size, &t->bt_rdata.data, &t->bt_rdata.size)) - return (RET_ERROR); - k2.data = t->bt_rdata.data; - } - return ((*t->bt_cmp)(k1, &k2)); -} - -/* - * __BT_DEFCMP -- Default comparison routine. 
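The deleted default comparison here works byte by byte and returns the arithmetic difference as an int; its own XXX note concedes that pushing size differences through an int can misbehave for very large keys. A variant clamped to -1/0/1 sidesteps that, sketched below; safe_cmp is a hypothetical illustration, not the routine the library installs.

#include <stdio.h>
#include <string.h>

/* Byte-wise comparison returning strictly -1, 0, or 1. */
static int
safe_cmp(const void *ap, size_t alen, const void *bp, size_t blen)
{
	size_t len = alen < blen ? alen : blen;
	int r = memcmp(ap, bp, len);

	if (r != 0)
		return (r < 0 ? -1 : 1);
	if (alen == blen)
		return (0);
	/* Shared prefix: the shorter key sorts first. */
	return (alen < blen ? -1 : 1);
}

int
main(void)
{
	printf("%d\n", safe_cmp("abc", 3, "abd", 3));	/* -1 */
	printf("%d\n", safe_cmp("abc", 3, "ab", 2));	/*  1 */
	return (0);
}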
- * - * Parameters: - * a: DBT #1 - * b: DBT #2 - * - * Returns: - * < 0 if a is < b - * = 0 if a is = b - * > 0 if a is > b - */ -int -__bt_defcmp(a, b) - const DBT *a, *b; -{ - register size_t len; - register u_char *p1, *p2; - - /* - * XXX - * If a size_t doesn't fit in an int, this routine can lose. - * What we need is a integral type which is guaranteed to be - * larger than a size_t, and there is no such thing. - */ - len = MIN(a->size, b->size); - for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2) - if (*p1 != *p2) - return ((int)*p1 - (int)*p2); - return ((int)a->size - (int)b->size); -} - -/* - * __BT_DEFPFX -- Default prefix routine. - * - * Parameters: - * a: DBT #1 - * b: DBT #2 - * - * Returns: - * Number of bytes needed to distinguish b from a. - */ -size_t -__bt_defpfx(a, b) - const DBT *a, *b; -{ - register u_char *p1, *p2; - register size_t cnt, len; - - cnt = 1; - len = MIN(a->size, b->size); - for (p1 = a->data, p2 = b->data; len--; ++p1, ++p2, ++cnt) - if (*p1 != *p2) - return (cnt); - - /* a->size must be <= b->size, or they wouldn't be in this order. */ - return (a->size < b->size ? a->size + 1 : a->size); -} diff --git a/btree/bt_verify.c b/btree/bt_verify.c new file mode 100644 index 0000000..1c561d2 --- /dev/null +++ b/btree/bt_verify.c @@ -0,0 +1,2746 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_verify.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" + +static int __bam_safe_getdata __P((DB *, DB_THREAD_INFO *, + PAGE *, u_int32_t, int, DBT *, int *)); +static int __bam_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + db_indx_t *, u_int32_t)); +static int __bam_vrfy_treeorder __P((DB *, DB_THREAD_INFO *, PAGE *, + BINTERNAL *, BINTERNAL *, int (*)(DB *, const DBT *, const DBT *), + u_int32_t)); +static int __ram_vrfy_inp __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + db_indx_t *, u_int32_t)); + +/* + * __bam_vrfy_meta -- + * Verify the btree-specific part of a metadata page. + * + * PUBLIC: int __bam_vrfy_meta __P((DB *, VRFY_DBINFO *, BTMETA *, + * PUBLIC: db_pgno_t, u_int32_t)); + */ +int +__bam_vrfy_meta(dbp, vdp, meta, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + BTMETA *meta; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + int isbad, t_ret, ret; + db_indx_t ovflsize; + + env = dbp->env; + isbad = 0; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + /* + * If VRFY_INCOMPLETE is not set, then we didn't come through + * __db_vrfy_pagezero and didn't incompletely + * check this page--we haven't checked it at all. + * Thus we need to call __db_vrfy_meta and check the common fields. + * + * If VRFY_INCOMPLETE is set, we've already done all the same work + * in __db_vrfy_pagezero, so skip the check. + */ + if (!F_ISSET(pip, VRFY_INCOMPLETE) && + (ret = __db_vrfy_meta(dbp, vdp, &meta->dbmeta, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* bt_minkey: must be >= 2; must produce sensible ovflsize */ + + /* avoid division by zero */ + ovflsize = meta->minkey > 0 ? 
+ B_MINKEY_TO_OVFLSIZE(dbp, meta->minkey, dbp->pgsize) : 0; + + if (meta->minkey < 2 || + ovflsize > B_MINKEY_TO_OVFLSIZE(dbp, DEFMINKEYPAGE, dbp->pgsize)) { + pip->bt_minkey = 0; + isbad = 1; + EPRINT((env, + "Page %lu: nonsensical bt_minkey value %lu on metadata page", + (u_long)pgno, (u_long)meta->minkey)); + } else + pip->bt_minkey = meta->minkey; + + /* re_len: no constraints on this (may be zero or huge--we make rope) */ + pip->re_pad = meta->re_pad; + pip->re_len = meta->re_len; + + /* + * The root must not be current page or 0 and it must be within + * database. If this metadata page is the master meta data page + * of the file, then the root page had better be page 1. + */ + pip->root = 0; + if (meta->root == PGNO_INVALID || + meta->root == pgno || !IS_VALID_PGNO(meta->root) || + (pgno == PGNO_BASE_MD && meta->root != 1)) { + isbad = 1; + EPRINT((env, + "Page %lu: nonsensical root page %lu on metadata page", + (u_long)pgno, (u_long)meta->root)); + } else + pip->root = meta->root; + + /* Flags. */ + if (F_ISSET(&meta->dbmeta, BTM_RENUMBER)) + F_SET(pip, VRFY_IS_RRECNO); + + if (F_ISSET(&meta->dbmeta, BTM_SUBDB)) { + /* + * If this is a master db meta page, it had better not have + * duplicates. + */ + if (F_ISSET(&meta->dbmeta, BTM_DUP) && pgno == PGNO_BASE_MD) { + isbad = 1; + EPRINT((env, +"Page %lu: Btree metadata page has both duplicates and multiple databases", + (u_long)pgno)); + } + F_SET(pip, VRFY_HAS_SUBDBS); + } + + if (F_ISSET(&meta->dbmeta, BTM_DUP)) + F_SET(pip, VRFY_HAS_DUPS); + if (F_ISSET(&meta->dbmeta, BTM_DUPSORT)) + F_SET(pip, VRFY_HAS_DUPSORT); + if (F_ISSET(&meta->dbmeta, BTM_RECNUM)) + F_SET(pip, VRFY_HAS_RECNUMS); + if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((env, + "Page %lu: Btree metadata page illegally has both recnums and dups", + (u_long)pgno)); + isbad = 1; + } + + if (F_ISSET(&meta->dbmeta, BTM_RECNO)) { + F_SET(pip, VRFY_IS_RECNO); + dbp->type = DB_RECNO; + } else if (F_ISSET(pip, VRFY_IS_RRECNO)) { + isbad = 1; + EPRINT((env, + "Page %lu: metadata page has renumber flag set but is not recno", + (u_long)pgno)); + } + +#ifdef HAVE_COMPRESSION + if (F_ISSET(&meta->dbmeta, BTM_COMPRESS)) { + F_SET(pip, VRFY_HAS_COMPRESS); + if (!DB_IS_COMPRESSED(dbp)) { + ((BTREE *)dbp->bt_internal)->bt_compress = + __bam_defcompress; + ((BTREE *)dbp->bt_internal)->bt_decompress = + __bam_defdecompress; + } + /* + * Copy dup_compare to compress_dup_compare, and use the + * compression duplicate compare. 
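The bt_minkey check above exists because requiring at least minkey key/data pairs per page caps the size of any on-page item; anything larger is pushed to overflow pages. A simplified model of that cap follows; toy_ovflsize ignores the page and per-item header overhead that the real B_MINKEY_TO_OVFLSIZE macro accounts for, so the numbers are only indicative.

#include <stdio.h>

/*
 * Rough model: to fit minkey key/data pairs on a page, no single
 * item may exceed about pgsize / (minkey * 2).
 */
static unsigned
toy_ovflsize(unsigned minkey, unsigned pgsize)
{
	return (pgsize / (minkey * 2));
}

int
main(void)
{
	printf("%u\n", toy_ovflsize(2, 4096));	/* 1024 */
	printf("%u\n", toy_ovflsize(8, 4096));	/* 256: more pairs, smaller cap */
	return (0);
}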
+ */ + if (F_ISSET(pip, VRFY_HAS_DUPSORT)) { + if (dbp->dup_compare == NULL) + dbp->dup_compare = __bam_defcmp; + if (((BTREE *)dbp->bt_internal)->compress_dup_compare + == NULL) { + ((BTREE *)dbp->bt_internal)-> + compress_dup_compare = dbp->dup_compare; + dbp->dup_compare = __bam_compress_dupcmp; + } + } + } + + if (F_ISSET(pip, VRFY_HAS_RECNUMS) && F_ISSET(pip, VRFY_HAS_COMPRESS)) { + EPRINT((env, + "Page %lu: Btree metadata page illegally has both recnums and compression", + (u_long)pgno)); + isbad = 1; + } + if (F_ISSET(pip, VRFY_HAS_DUPS) && !F_ISSET(pip, VRFY_HAS_DUPSORT) && + F_ISSET(pip, VRFY_HAS_COMPRESS)) { + EPRINT((env, + "Page %lu: Btree metadata page illegally has both unsorted duplicates%s", + (u_long)pgno, + " and compression")); + isbad = 1; + } +#endif + + if (F_ISSET(pip, VRFY_IS_RECNO) && F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((env, + "Page %lu: recno metadata page specifies duplicates", + (u_long)pgno)); + isbad = 1; + } + + if (F_ISSET(&meta->dbmeta, BTM_FIXEDLEN)) + F_SET(pip, VRFY_IS_FIXEDLEN); + else if (pip->re_len > 0) { + /* + * It's wrong to have an re_len if it's not a fixed-length + * database + */ + isbad = 1; + EPRINT((env, + "Page %lu: re_len of %lu in non-fixed-length database", + (u_long)pgno, (u_long)pip->re_len)); + } + + /* + * We do not check that the rest of the page is 0, because it may + * not be and may still be correct. + */ + +err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (LF_ISSET(DB_SALVAGE) && + (t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ram_vrfy_leaf -- + * Verify a recno leaf page. + * + * PUBLIC: int __ram_vrfy_leaf __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__ram_vrfy_leaf(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + BKEYDATA *bk; + ENV *env; + VRFY_PAGEINFO *pip; + db_indx_t i; + int ret, t_ret, isbad; + u_int32_t re_len_guess, len; + + env = dbp->env; + isbad = 0; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if (TYPE(h) != P_LRECNO) { + ret = __db_unknown_path(env, "__ram_vrfy_leaf"); + goto err; + } + + /* + * Verify (and, if relevant, save off) page fields common to + * all PAGEs. + */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * Verify inp[]. Return immediately if it returns DB_VERIFY_BAD; + * further checks are dangerous. + */ + if ((ret = __bam_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) + goto err; + + if (F_ISSET(pip, VRFY_HAS_DUPS)) { + EPRINT((env, + "Page %lu: Recno database has dups", (u_long)pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Walk through inp and see if the lengths of all the records are the + * same--if so, this may be a fixed-length database, and we want to + * save off this value. We know inp to be safe if we've gotten this + * far. + */ + re_len_guess = 0; + for (i = 0; i < NUM_ENT(h); i++) { + bk = GET_BKEYDATA(dbp, h, i); + /* KEYEMPTY. Go on. 
*/ + if (B_DISSET(bk->type)) + continue; + if (bk->type == B_OVERFLOW) + len = ((BOVERFLOW *)bk)->tlen; + else if (bk->type == B_KEYDATA) + len = bk->len; + else { + isbad = 1; + EPRINT((env, + "Page %lu: nonsensical type for item %lu", + (u_long)pgno, (u_long)i)); + continue; + } + if (re_len_guess == 0) + re_len_guess = len; + + /* + * Is this item's len the same as the last one's? If not, + * reset to 0 and break--we don't have a single re_len. + * Otherwise, go on to the next item. + */ + if (re_len_guess != len) { + re_len_guess = 0; + break; + } + } + pip->re_len = re_len_guess; + + /* Save off record count. */ + pip->rec_cnt = NUM_ENT(h); + +err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy -- + * Verify a btree leaf or internal page. + * + * PUBLIC: int __bam_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__bam_vrfy(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + int ret, t_ret, isbad; + + env = dbp->env; + isbad = 0; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LDUP: + break; + default: + ret = __db_unknown_path(env, "__bam_vrfy"); + goto err; + } + + /* + * Verify (and, if relevant, save off) page fields common to + * all PAGEs. + */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * The record count is, on internal pages, stored in an overloaded + * next_pgno field. Save it off; we'll verify it when we check + * overall database structure. We could overload the field + * in VRFY_PAGEINFO, too, but this seems gross, and space + * is not at such a premium. + */ + pip->rec_cnt = RE_NREC(h); + + /* + * Verify inp[]. + */ + if (TYPE(h) == P_IRECNO) { + if ((ret = __ram_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) + goto err; + } else if ((ret = __bam_vrfy_inp(dbp, + vdp, h, pgno, &pip->entries, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + EPRINT((env, + "Page %lu: item order check unsafe: skipping", + (u_long)pgno)); + } else if (!LF_ISSET(DB_NOORDERCHK) && (ret = + __bam_vrfy_itemorder(dbp, + vdp, vdp->thread_info, h, pgno, 0, 0, 0, flags)) != 0) { + /* + * We know that the elements of inp are reasonable. + * + * Check that elements fall in the proper order. + */ + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + +err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ram_vrfy_inp -- + * Verify that all entries in a P_IRECNO inp[] array are reasonable, + * and count them. Note that P_LRECNO uses __bam_vrfy_inp; + * P_IRECNOs are a special, and simpler, case, since they have + * RINTERNALs rather than BKEYDATA/BINTERNALs. 
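The fixed-length detection above reduces to: adopt the first record's length as the guess, and abandon the guess the moment any record disagrees. Standalone form, with plain length arrays standing in for page items (deleted and overflow records, which the real loop special-cases, are ignored here):

#include <stdio.h>

/* Return the common record length, or 0 if the records vary. */
static unsigned
re_len_guess(const unsigned *len, unsigned n)
{
	unsigned guess = 0, i;

	for (i = 0; i < n; ++i) {
		if (guess == 0)
			guess = len[i];
		if (len[i] != guess)
			return (0);
	}
	return (guess);
}

int
main(void)
{
	unsigned fixed[] = { 8, 8, 8 }, varying[] = { 8, 8, 12 };

	/* Prints "8 0": the first set looks fixed-length, the second not. */
	printf("%u %u\n", re_len_guess(fixed, 3), re_len_guess(varying, 3));
	return (0);
}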
+ */ +static int +__ram_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + db_indx_t *nentriesp; + u_int32_t flags; +{ + ENV *env; + RINTERNAL *ri; + VRFY_CHILDINFO child; + VRFY_PAGEINFO *pip; + int ret, t_ret, isbad; + u_int32_t himark, i, offset, nentries; + db_indx_t *inp; + u_int8_t *pagelayout, *p; + + env = dbp->env; + isbad = 0; + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + nentries = 0; + pagelayout = NULL; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if (TYPE(h) != P_IRECNO) { + ret = __db_unknown_path(env, "__ram_vrfy_inp"); + goto err; + } + + himark = dbp->pgsize; + if ((ret = __os_malloc(env, dbp->pgsize, &pagelayout)) != 0) + goto err; + memset(pagelayout, 0, dbp->pgsize); + inp = P_INP(dbp, h); + for (i = 0; i < NUM_ENT(h); i++) { + if ((u_int8_t *)inp + i >= (u_int8_t *)h + himark) { + EPRINT((env, + "Page %lu: entries listing %lu overlaps data", + (u_long)pgno, (u_long)i)); + ret = DB_VERIFY_BAD; + goto err; + } + offset = inp[i]; + /* + * Check that the item offset is reasonable: it points + * somewhere after the inp array and before the end of the + * page. + */ + if (offset <= (u_int32_t)((u_int8_t *)inp + i - + (u_int8_t *)h) || + offset > (u_int32_t)(dbp->pgsize - RINTERNAL_SIZE)) { + isbad = 1; + EPRINT((env, + "Page %lu: bad offset %lu at index %lu", + (u_long)pgno, (u_long)offset, (u_long)i)); + continue; + } + + /* Update the high-water mark (what HOFFSET should be) */ + if (offset < himark) + himark = offset; + + nentries++; + + /* Make sure this RINTERNAL is not multiply referenced. */ + ri = GET_RINTERNAL(dbp, h, i); + if (pagelayout[offset] == 0) { + pagelayout[offset] = 1; + child.pgno = ri->pgno; + child.type = V_RECNO; + child.nrecs = ri->nrecs; + if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) + goto err; + } else { + EPRINT((env, + "Page %lu: RINTERNAL structure at offset %lu referenced twice", + (u_long)pgno, (u_long)offset)); + isbad = 1; + } + } + + for (p = pagelayout + himark; + p < pagelayout + dbp->pgsize; + p += RINTERNAL_SIZE) + if (*p != 1) { + EPRINT((env, + "Page %lu: gap between items at offset %lu", + (u_long)pgno, (u_long)(p - pagelayout))); + isbad = 1; + } + + if ((db_indx_t)himark != HOFFSET(h)) { + EPRINT((env, + "Page %lu: bad HOFFSET %lu, appears to be %lu", + (u_long)pgno, (u_long)(HOFFSET(h)), (u_long)himark)); + isbad = 1; + } + + *nentriesp = nentries; + +err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (pagelayout != NULL) + __os_free(env, pagelayout); + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +typedef enum { VRFY_ITEM_NOTSET=0, VRFY_ITEM_BEGIN, VRFY_ITEM_END } VRFY_ITEM; + +/* + * __bam_vrfy_inp -- + * Verify that all entries in inp[] array are reasonable; + * count them. + */ +static int +__bam_vrfy_inp(dbp, vdp, h, pgno, nentriesp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + db_indx_t *nentriesp; + u_int32_t flags; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + ENV *env; + VRFY_CHILDINFO child; + VRFY_ITEM *pagelayout; + VRFY_PAGEINFO *pip; + u_int32_t himark, offset; /* + * These would be db_indx_ts + * but for alignment. 
+ */ + u_int32_t i, endoff, nentries; + int isbad, initem, isdupitem, ret, t_ret; + + env = dbp->env; + isbad = isdupitem = 0; + nentries = 0; + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (TYPE(h)) { + case P_IBTREE: + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + break; + default: + /* + * In the salvager, we might call this from a page which + * we merely suspect is a btree page. Otherwise, it + * shouldn't get called--if it is, that's a verifier bug. + */ + if (LF_ISSET(DB_SALVAGE)) + break; + ret = __db_unknown_path(env, "__bam_vrfy_inp"); + goto err; + } + + /* + * Loop through inp[], the array of items, until we either + * run out of entries or collide with the data. Keep track + * of h_offset in himark. + * + * For each element in inp[i], make sure it references a region + * that starts after the end of the inp array (as defined by + * NUM_ENT(h)), ends before the beginning of the page, doesn't + * overlap any other regions, and doesn't have a gap between + * it and the region immediately after it. + */ + himark = dbp->pgsize; + if ((ret = __os_calloc( + env, dbp->pgsize, sizeof(pagelayout[0]), &pagelayout)) != 0) + goto err; + for (i = 0; i < NUM_ENT(h); i++) { + switch (ret = __db_vrfy_inpitem(dbp, + h, pgno, i, 1, flags, &himark, &offset)) { + case 0: + break; + case DB_VERIFY_BAD: + isbad = 1; + continue; + case DB_VERIFY_FATAL: + isbad = 1; + goto err; + default: + DB_ASSERT(env, ret != 0); + break; + } + + /* + * We now have a plausible beginning for the item, and we know + * its length is safe. + * + * Mark the beginning and end in pagelayout so we can make sure + * items have no overlaps or gaps. + */ + bk = GET_BKEYDATA(dbp, h, i); + if (pagelayout[offset] == VRFY_ITEM_NOTSET) + pagelayout[offset] = VRFY_ITEM_BEGIN; + else if (pagelayout[offset] == VRFY_ITEM_BEGIN) { + /* + * Having two inp entries that point at the same patch + * of page is legal if and only if the page is + * a btree leaf and they're onpage duplicate keys-- + * that is, if (i % P_INDX) == 0. + */ + if ((i % P_INDX == 0) && (TYPE(h) == P_LBTREE)) { + /* Flag for later. */ + F_SET(pip, VRFY_HAS_DUPS); + + /* Bump up nentries so we don't undercount. */ + nentries++; + + /* + * We'll check to make sure the end is + * equal, too. + */ + isdupitem = 1; + } else { + isbad = 1; + EPRINT((env, "Page %lu: duplicated item %lu", + (u_long)pgno, (u_long)i)); + } + } + + /* + * Mark the end. Its location varies with the page type + * and the item type. + * + * If the end already has a sign other than 0, do nothing-- + * it's an overlap that we'll catch later. + */ + switch (B_TYPE(bk->type)) { + case B_KEYDATA: + if (TYPE(h) == P_IBTREE) + /* It's a BINTERNAL. */ + endoff = offset + BINTERNAL_SIZE(bk->len) - 1; + else + endoff = offset + BKEYDATA_SIZE(bk->len) - 1; + break; + case B_DUPLICATE: + /* + * Flag that we have dups; we'll check whether + * that's okay during the structure check. + */ + F_SET(pip, VRFY_HAS_DUPS); + /* FALLTHROUGH */ + case B_OVERFLOW: + /* + * Overflow entries on internal pages are stored + * as the _data_ of a BINTERNAL; overflow entries + * on leaf pages are stored as the entire entry. + */ + endoff = offset + + ((TYPE(h) == P_IBTREE) ? + BINTERNAL_SIZE(BOVERFLOW_SIZE) : + BOVERFLOW_SIZE) - 1; + break; + default: + /* + * We'll complain later; for now, just mark + * a minimum. 
+ */ + endoff = offset + BKEYDATA_SIZE(0) - 1; + break; + } + + /* + * If this is an onpage duplicate key we've seen before, + * the end had better coincide too. + */ + if (isdupitem && pagelayout[endoff] != VRFY_ITEM_END) { + EPRINT((env, "Page %lu: duplicated item %lu", + (u_long)pgno, (u_long)i)); + isbad = 1; + } else if (pagelayout[endoff] == VRFY_ITEM_NOTSET) + pagelayout[endoff] = VRFY_ITEM_END; + isdupitem = 0; + + /* + * There should be no deleted items in a quiescent tree, + * except in recno. + */ + if (B_DISSET(bk->type) && TYPE(h) != P_LRECNO) { + isbad = 1; + EPRINT((env, "Page %lu: item %lu marked deleted", + (u_long)pgno, (u_long)i)); + } + + /* + * Check the type and such of bk--make sure it's reasonable + * for the pagetype. + */ + switch (B_TYPE(bk->type)) { + case B_KEYDATA: + /* + * This is a normal, non-overflow BKEYDATA or BINTERNAL. + * The only thing to check is the len, and that's + * already been done. + */ + break; + case B_DUPLICATE: + if (TYPE(h) == P_IBTREE) { + isbad = 1; + EPRINT((env, + "Page %lu: duplicate page referenced by internal btree page at item %lu", + (u_long)pgno, (u_long)i)); + break; + } else if (TYPE(h) == P_LRECNO) { + isbad = 1; + EPRINT((env, + "Page %lu: duplicate page referenced by recno page at item %lu", + (u_long)pgno, (u_long)i)); + break; + } + /* FALLTHROUGH */ + case B_OVERFLOW: + bo = (TYPE(h) == P_IBTREE) ? + (BOVERFLOW *)(((BINTERNAL *)bk)->data) : + (BOVERFLOW *)bk; + + if (B_TYPE(bk->type) == B_OVERFLOW) + /* Make sure tlen is reasonable. */ + if (bo->tlen > dbp->pgsize * vdp->last_pgno) { + isbad = 1; + EPRINT((env, + "Page %lu: impossible tlen %lu, item %lu", + (u_long)pgno, + (u_long)bo->tlen, (u_long)i)); + /* Don't save as a child. */ + break; + } + + if (!IS_VALID_PGNO(bo->pgno) || bo->pgno == pgno || + bo->pgno == PGNO_INVALID) { + isbad = 1; + EPRINT((env, + "Page %lu: offpage item %lu has bad pgno %lu", + (u_long)pgno, (u_long)i, (u_long)bo->pgno)); + /* Don't save as a child. */ + break; + } + + child.pgno = bo->pgno; + child.type = (B_TYPE(bk->type) == B_OVERFLOW ? + V_OVERFLOW : V_DUPLICATE); + child.tlen = bo->tlen; + if ((ret = __db_vrfy_childput(vdp, pgno, &child)) != 0) + goto err; + break; + default: + isbad = 1; + EPRINT((env, "Page %lu: item %lu of invalid type %lu", + (u_long)pgno, (u_long)i, (u_long)B_TYPE(bk->type))); + break; + } + } + + /* + * Now, loop through and make sure the items are contiguous and + * non-overlapping. + */ + initem = 0; + for (i = himark; i < dbp->pgsize; i++) + if (initem == 0) + switch (pagelayout[i]) { + case VRFY_ITEM_NOTSET: + /* May be just for alignment. */ + if (i != DB_ALIGN(i, sizeof(u_int32_t))) + continue; + + isbad = 1; + EPRINT((env, + "Page %lu: gap between items at offset %lu", + (u_long)pgno, (u_long)i)); + /* Find the end of the gap */ + for (; pagelayout[i + 1] == VRFY_ITEM_NOTSET && + (size_t)(i + 1) < dbp->pgsize; i++) + ; + break; + case VRFY_ITEM_BEGIN: + /* We've found an item. Check its alignment. */ + if (i != DB_ALIGN(i, sizeof(u_int32_t))) { + isbad = 1; + EPRINT((env, + "Page %lu: offset %lu unaligned", + (u_long)pgno, (u_long)i)); + } + initem = 1; + nentries++; + break; + case VRFY_ITEM_END: + /* + * We've hit the end of an item even though + * we don't think we're in one; must + * be an overlap. + */ + isbad = 1; + EPRINT((env, + "Page %lu: overlapping items at offset %lu", + (u_long)pgno, (u_long)i)); + break; + } + else + switch (pagelayout[i]) { + case VRFY_ITEM_NOTSET: + /* In the middle of an item somewhere. Okay. 
*/ + break; + case VRFY_ITEM_END: + /* End of an item; switch to out-of-item mode.*/ + initem = 0; + break; + case VRFY_ITEM_BEGIN: + /* + * Hit a second item beginning without an + * end. Overlap. + */ + isbad = 1; + EPRINT((env, + "Page %lu: overlapping items at offset %lu", + (u_long)pgno, (u_long)i)); + break; + } + + __os_free(env, pagelayout); + + /* Verify HOFFSET. */ + if ((db_indx_t)himark != HOFFSET(h)) { + EPRINT((env, "Page %lu: bad HOFFSET %lu, appears to be %lu", + (u_long)pgno, (u_long)HOFFSET(h), (u_long)himark)); + isbad = 1; + } + +err: if (nentriesp != NULL) + *nentriesp = nentries; + + if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + + return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_itemorder -- + * Make sure the items on a page sort correctly. + * + * Assumes that NUM_ENT(h) and inp[0]..inp[NUM_ENT(h) - 1] are + * reasonable; be sure that __bam_vrfy_inp has been called first. + * + * If ovflok is set, it also assumes that overflow page chains + * hanging off the current page have been sanity-checked, and so we + * can use __bam_cmp to verify their ordering. If it is not set, + * and we run into an overflow page, carp and return DB_VERIFY_BAD; + * we shouldn't be called if any exist. + * + * PUBLIC: int __bam_vrfy_itemorder __P((DB *, VRFY_DBINFO *, DB_THREAD_INFO *, + * PUBLIC: PAGE *, db_pgno_t, u_int32_t, int, int, u_int32_t)); + */ +int +__bam_vrfy_itemorder(dbp, vdp, ip, h, pgno, nentries, ovflok, hasdups, flags) + DB *dbp; + VRFY_DBINFO *vdp; + DB_THREAD_INFO *ip; + PAGE *h; + db_pgno_t pgno; + u_int32_t nentries; + int ovflok, hasdups; + u_int32_t flags; +{ + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + BTREE *bt; + DBC *dbc; + DBT dbta, dbtb, dup_1, dup_2, *p1, *p2, *tmp; + ENV *env; + VRFY_PAGEINFO *pip; + db_indx_t i, *inp; + int adj, cmp, freedup_1, freedup_2, isbad, ret, t_ret; + int (*dupfunc) __P((DB *, const DBT *, const DBT *)); + int (*func) __P((DB *, const DBT *, const DBT *)); + void *buf1, *buf2, *tmpbuf; + + /* + * We need to work in the ORDERCHKONLY environment where we might + * not have a pip, but we also may need to work in contexts where + * NUM_ENT isn't safe. + */ + if (vdp != NULL) { + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + nentries = pip->entries; + } else + pip = NULL; + + env = dbp->env; + ret = isbad = 0; + bo = NULL; /* Shut up compiler. */ + + memset(&dbta, 0, sizeof(DBT)); + F_SET(&dbta, DB_DBT_REALLOC); + + memset(&dbtb, 0, sizeof(DBT)); + F_SET(&dbtb, DB_DBT_REALLOC); + + buf1 = buf2 = NULL; + + DB_ASSERT(env, !LF_ISSET(DB_NOORDERCHK)); + + dupfunc = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; + if (TYPE(h) == P_LDUP) + func = dupfunc; + else { + func = __bam_defcmp; + if (dbp->bt_internal != NULL) { + bt = (BTREE *)dbp->bt_internal; + if (bt->bt_compare != NULL) + func = bt->bt_compare; + } + } + + /* + * We alternate our use of dbta and dbtb so that we can walk + * through the page key-by-key without copying a dbt twice. + * p1 is always the dbt for index i - 1, and p2 for index i. + * Reset the data pointers in case we are retrying. + */ +retry: p1 = &dbta; + p1->data = NULL; + p2 = &dbtb; + p2->data = NULL; + + /* + * Loop through the entries. nentries ought to contain the + * actual count, and so is a safe way to terminate the loop; whether + * we inc. by one or two depends on whether we're a leaf page-- + * on a leaf page, we care only about keys. 
On internal pages + * and LDUP pages, we want to check the order of all entries. + * + * Note that on IBTREE pages or the index page of a partitioned + * database, we start with item 1, since item 0 doesn't get looked + * at by __bam_cmp. + */ + inp = P_INP(dbp, h); + adj = (TYPE(h) == P_LBTREE) ? P_INDX : O_INDX; + for (i = (TYPE(h) == P_IBTREE || dbp->p_internal != NULL) ? adj : 0; + i < nentries; i += adj) { + /* + * Put key i-1, now in p2, into p1, by swapping DBTs and bufs. + */ + tmp = p1; + p1 = p2; + p2 = tmp; + tmpbuf = buf1; + buf1 = buf2; + buf2 = tmpbuf; + + /* + * Get key i into p2. + */ + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(dbp, h, i); + if (B_TYPE(bi->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)(bi->data); + goto overflow; + } else { + p2->data = bi->data; + p2->size = bi->len; + } + + /* + * The leftmost key on an internal page must be + * len 0, since it's just a placeholder and + * automatically sorts less than all keys. + * + * XXX + * This criterion does not currently hold! + * See todo list item #1686. Meanwhile, it's harmless + * to just not check for it. + */ +#if 0 + if (i == 0 && bi->len != 0) { + isbad = 1; + EPRINT((env, + "Page %lu: lowest key on internal page of nonzero length", + (u_long)pgno)); + } +#endif + break; + case P_LBTREE: + case P_LDUP: + bk = GET_BKEYDATA(dbp, h, i); + if (B_TYPE(bk->type) == B_OVERFLOW) { + bo = (BOVERFLOW *)bk; + goto overflow; + } else { + p2->data = bk->data; + p2->size = bk->len; + } + break; + default: + /* + * This means our caller screwed up and sent us + * an inappropriate page. + */ + ret = __db_unknown_path(env, "__bam_vrfy_itemorder"); + goto err; + } + + if (0) { + /* + * If ovflok != 1, we can't safely go chasing + * overflow pages with the normal routines now; + * they might be unsafe or nonexistent. Mark this + * page as incomplete and return. + * + * Note that we don't need to worry about freeing + * buffers, since they can't have been allocated + * if overflow items are unsafe. + */ +overflow: if (!ovflok) { + F_SET(pip, VRFY_INCOMPLETE); + goto err; + } + + /* + * Overflow items are safe to chase. Do so. + * Fetch the overflow item into p2->data, + * NULLing it or reallocing it as appropriate. + * + * (We set p2->data to buf2 before the call + * so we're sure to realloc if we can and if p2 + * was just pointing at a non-overflow item.) + */ + p2->data = buf2; + if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE, + PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) + goto err; + if ((ret = __db_goff(dbc, + p2, bo->tlen, bo->pgno, NULL, NULL)) != 0) { + isbad = 1; + EPRINT((env, + "Page %lu: error %lu in fetching overflow item %lu", + (u_long)pgno, (u_long)ret, (u_long)i)); + } + /* In case it got realloc'ed and thus changed. */ + buf2 = p2->data; + } + + /* Compare with the last key. */ + if (p1->data != NULL && p2->data != NULL) { + cmp = inp[i] == inp[i - adj] ? 0 : func(dbp, p1, p2); + + /* comparison succeeded */ + if (cmp > 0) { + /* + * If we are looking at an internal page, we + * don't know whether it is part of the main + * database or in an off-page-duplicate tree. + * If the main comparator fails, retry with + * the duplicate comparator. + */ + if (TYPE(h) == P_IBTREE && func != dupfunc) { + func = dupfunc; + goto retry; + } + + isbad = 1; + EPRINT((env, + "Page %lu: out-of-order key at entry %lu", + (u_long)pgno, (u_long)i)); + /* proceed */ + } else if (cmp == 0) { + if (inp[i] != inp[i - adj]) { + /* See above. 
*/ + if (TYPE(h) == P_IBTREE && + func != dupfunc) { + func = dupfunc; + goto retry; + } + isbad = 1; + EPRINT((env, + "Page %lu: non-dup dup key at entry %lu", + (u_long)pgno, (u_long)i)); + } + /* + * If they compared equally, this + * had better be a (sub)database with dups. + * Mark it so we can check during the + * structure check. + */ + if (pip != NULL) + F_SET(pip, VRFY_HAS_DUPS); + else if (hasdups == 0) { + /* See above. */ + if (TYPE(h) == P_IBTREE && + func != dupfunc) { + func = dupfunc; + goto retry; + } + isbad = 1; + EPRINT((env, + "Page %lu: database with no duplicates has duplicated keys", + (u_long)pgno)); + } + + /* + * If we're a btree leaf, check to see + * if the data items of these on-page dups are + * in sorted order. If not, flag this, so + * that we can make sure during the + * structure checks that the DUPSORT flag + * is unset. + * + * At this point i points to a duplicate key. + * Compare the datum before it (same key) + * to the datum after it, i.e. i-1 to i+1. + */ + if (TYPE(h) == P_LBTREE) { + /* + * Unsafe; continue and we'll pick + * up the bogus nentries later. + */ + if (i + 1 >= (db_indx_t)nentries) + continue; + + /* + * We don't bother with clever memory + * management with on-page dups, + * as it's only really a big win + * in the overflow case, and overflow + * dups are probably (?) rare. + */ + if (((ret = __bam_safe_getdata(dbp, + ip, h, i - 1, ovflok, + &dup_1, &freedup_1)) != 0) || + ((ret = __bam_safe_getdata(dbp, + ip, h, i + 1, ovflok, + &dup_2, &freedup_2)) != 0)) + goto err; + + /* + * If either of the data are NULL, + * it's because they're overflows and + * it's not safe to chase them now. + * Mark an incomplete and return. + */ + if (dup_1.data == NULL || + dup_2.data == NULL) { + DB_ASSERT(env, !ovflok); + F_SET(pip, VRFY_INCOMPLETE); + goto err; + } + + /* + * If the dups are out of order, + * flag this. It's not an error + * until we do the structure check + * and see whether DUPSORT is set. + */ + if (dupfunc(dbp, &dup_1, &dup_2) > 0) + F_SET(pip, VRFY_DUPS_UNSORTED); + + if (freedup_1) + __os_ufree(env, dup_1.data); + if (freedup_2) + __os_ufree(env, dup_2.data); + } + } + } + } + +err: if (pip != NULL && ((t_ret = + __db_vrfy_putpageinfo(env, vdp, pip)) != 0) && ret == 0) + ret = t_ret; + + if (buf1 != NULL) + __os_ufree(env, buf1); + if (buf2 != NULL) + __os_ufree(env, buf2); + + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __bam_vrfy_structure -- + * Verify the tree structure of a btree database (including the master + * database containing subdbs). 
+ * + * PUBLIC: int __bam_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, + * PUBLIC: void *, void *, u_int32_t)); + */ +int +__bam_vrfy_structure(dbp, vdp, meta_pgno, lp, rp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t meta_pgno; + void *lp, *rp; + u_int32_t flags; +{ + DB *pgset; + ENV *env; + VRFY_PAGEINFO *mip, *rip; + db_pgno_t root, p; + int t_ret, ret; + u_int32_t nrecs, level, relen, stflags; + + env = dbp->env; + mip = rip = 0; + pgset = vdp->pgset; + + if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &mip)) != 0) + return (ret); + + if ((ret = __db_vrfy_pgset_get(pgset, + vdp->thread_info, meta_pgno, (int *)&p)) != 0) + goto err; + if (p != 0) { + EPRINT((env, + "Page %lu: btree metadata page observed twice", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + if ((ret = + __db_vrfy_pgset_inc(pgset, vdp->thread_info, meta_pgno)) != 0) + goto err; + + root = mip->root; + + if (root == 0) { + EPRINT((env, + "Page %lu: btree metadata page has no root", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + if ((ret = __db_vrfy_getpageinfo(vdp, root, &rip)) != 0) + goto err; + + switch (rip->type) { + case P_IBTREE: + case P_LBTREE: + stflags = flags | DB_ST_TOPLEVEL; + if (F_ISSET(mip, VRFY_HAS_DUPS)) + stflags |= DB_ST_DUPOK; + if (F_ISSET(mip, VRFY_HAS_DUPSORT)) + stflags |= DB_ST_DUPSORT; + if (F_ISSET(mip, VRFY_HAS_RECNUMS)) + stflags |= DB_ST_RECNUM; + ret = __bam_vrfy_subtree(dbp, + vdp, root, lp, rp, stflags, NULL, NULL, NULL); + break; + case P_IRECNO: + case P_LRECNO: + stflags = + flags | DB_ST_RECNUM | DB_ST_IS_RECNO | DB_ST_TOPLEVEL; + if (mip->re_len > 0) + stflags |= DB_ST_RELEN; + if ((ret = __bam_vrfy_subtree(dbp, vdp, + root, NULL, NULL, stflags, &level, &nrecs, &relen)) != 0) + goto err; + /* + * Even if mip->re_len > 0, re_len may come back zero if the + * tree is empty. It should be okay to just skip the check in + * this case, as if there are any non-deleted keys at all, + * that should never happen. + */ + if (mip->re_len > 0 && relen > 0 && mip->re_len != relen) { + EPRINT((env, + "Page %lu: recno database has bad re_len %lu", + (u_long)meta_pgno, (u_long)relen)); + ret = DB_VERIFY_BAD; + goto err; + } + ret = 0; + break; + case P_LDUP: + EPRINT((env, + "Page %lu: duplicate tree referenced from metadata page", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + break; + default: + EPRINT((env, + "Page %lu: btree root of incorrect type %lu on metadata page", + (u_long)meta_pgno, (u_long)rip->type)); + ret = DB_VERIFY_BAD; + break; + } + +err: if (mip != NULL && ((t_ret = + __db_vrfy_putpageinfo(env, vdp, mip)) != 0) && ret == 0) + ret = t_ret; + if (rip != NULL && ((t_ret = + __db_vrfy_putpageinfo(env, vdp, rip)) != 0) && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __bam_vrfy_subtree-- + * Verify a subtree (or entire) btree with specified root. + * + * Note that this is public because it must be called to verify + * offpage dup trees, including from hash. 
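+ *
+ * Along the way, this routine threads leaf-chain accounting through
+ * the vdp: each leaf page must name its predecessor as prev_pgno and
+ * be named by that predecessor's next_pgno.  Reduced to an in-memory
+ * list, the check looks like the sketch below, where struct leaf and
+ * its fields are hypothetical and 0 plays the role of PGNO_INVALID:
+ *
+ *	struct leaf {
+ *		unsigned pgno, prev_pgno, next_pgno;
+ *		struct leaf *link;
+ *	};
+ *
+ *	static int
+ *	check_chain(struct leaf *head)
+ *	{
+ *		unsigned expect_prev = 0, expect_next;
+ *		struct leaf *p;
+ *
+ *		expect_next = head == NULL ? 0 : head->pgno;
+ *		for (p = head; p != NULL; p = p->link) {
+ *			if (p->pgno != expect_next ||
+ *			    p->prev_pgno != expect_prev)
+ *				return (-1);
+ *			expect_prev = p->pgno;
+ *			expect_next = p->next_pgno;
+ *		}
+ *		return (expect_next == 0 ? 0 : -1);
+ *	}
+ *
+ * The final test mirrors the "unterminated leaf chain" complaint made
+ * below: the last leaf's next_pgno must be the invalid sentinel.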
+ *
+ * PUBLIC: int __bam_vrfy_subtree __P((DB *, VRFY_DBINFO *, db_pgno_t, void *,
+ * PUBLIC:	void *, u_int32_t, u_int32_t *, u_int32_t *, u_int32_t *));
+ */
+int
+__bam_vrfy_subtree(dbp, vdp, pgno, l, r, flags, levelp, nrecsp, relenp)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	db_pgno_t pgno;
+	void *l, *r;
+	u_int32_t flags, *levelp, *nrecsp, *relenp;
+{
+	BINTERNAL *li, *ri;
+	DB *pgset;
+	DBC *cc;
+	DB_MPOOLFILE *mpf;
+	ENV *env;
+	PAGE *h;
+	VRFY_CHILDINFO *child;
+	VRFY_PAGEINFO *pip;
+	db_indx_t i;
+	db_pgno_t next_pgno, prev_pgno;
+	db_recno_t child_nrecs, nrecs;
+	u_int32_t child_level, child_relen, j, level, relen, stflags;
+	u_int8_t leaf_type;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+	int isbad, p, ret, t_ret, toplevel;
+
+	if (levelp != NULL)	/* Don't leave uninitialized on error. */
+		*levelp = 0;
+	if (nrecsp != NULL)
+		*nrecsp = 0;
+
+	env = dbp->env;
+	mpf = dbp->mpf;
+	h = NULL;
+	next_pgno = prev_pgno = PGNO_INVALID;
+	nrecs = 0;
+	relen = 0;
+	leaf_type = P_INVALID;
+	isbad = ret = 0;
+
+	/* Provide feedback on our progress to the application. */
+	if (!LF_ISSET(DB_SALVAGE))
+		__db_vrfy_struct_feedback(dbp, vdp);
+
+	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
+		return (ret);
+
+	cc = NULL;
+	level = pip->bt_level;
+
+	toplevel = LF_ISSET(DB_ST_TOPLEVEL) ? 1 : 0;
+	LF_CLR(DB_ST_TOPLEVEL);
+
+	/*
+	 * If this is the root, initialize the vdp's prev- and next-pgno
+	 * accounting.
+	 *
+	 * For each leaf page we hit, we'll want to make sure that
+	 * vdp->prev_pgno is the same as pip->prev_pgno and vdp->next_pgno is
+	 * our page number.  Then, we'll set vdp->next_pgno to pip->next_pgno
+	 * and vdp->prev_pgno to our page number, and the next leaf page in
+	 * line should be able to do the same verification.
+	 */
+	if (toplevel) {
+		/*
+		 * Cache the values stored in the vdp so that if we're an
+		 * auxiliary tree such as an off-page duplicate set, our
+		 * caller's leaf page chain doesn't get lost.
+		 */
+		prev_pgno = vdp->prev_pgno;
+		next_pgno = vdp->next_pgno;
+		leaf_type = vdp->leaf_type;
+		vdp->next_pgno = vdp->prev_pgno = PGNO_INVALID;
+		vdp->leaf_type = P_INVALID;
+	}
+
+	/*
+	 * We are recursively descending a btree, starting from the root
+	 * and working our way out to the leaves.
+	 *
+	 * There are five cases we need to deal with:
+	 *	1. pgno is a recno leaf page.  Any children are overflows.
+	 *	2. pgno is a duplicate leaf page.  Any children
+	 *	   are overflow pages;  traverse them, and then return
+	 *	   level and nrecs.
+	 *	3. pgno is an ordinary leaf page.  Check whether dups are
+	 *	   allowed, and if so, traverse any off-page dups or
+	 *	   overflows.  Then return nrecs and level.
+	 *	4. pgno is a recno internal page.  Recursively check any
+	 *	   child pages, making sure their levels are one lower
+	 *	   and their nrecs sum to ours.
+	 *	5. pgno is a btree internal page.  Same as #4, plus we
+	 *	   must verify that for each pair of BINTERNAL entries
+	 *	   N and N+1, the leftmost item on N's child sorts
+	 *	   greater than N, and the rightmost item on N's child
+	 *	   sorts less than N+1.
+	 *
+	 * Furthermore, in any sorted page type (P_LDUP, P_LBTREE, P_IBTREE),
+	 * we need to verify the internal sort order is correct if,
+	 * due to overflow items, we were not able to do so earlier.
+	 */
+	switch (pip->type) {
+	case P_LRECNO:
+	case P_LDUP:
+	case P_LBTREE:
+		/*
+		 * Cases 1, 2 and 3.
+		 *
+		 * We're some sort of leaf page; verify
+		 * that our linked list of leaves is consistent.
+		 */
+		if (vdp->leaf_type == P_INVALID) {
+			/*
+			 * First leaf page.
Set the type that all its + * successors should be, and verify that our prev_pgno + * is PGNO_INVALID. + */ + vdp->leaf_type = pip->type; + if (pip->prev_pgno != PGNO_INVALID) + goto bad_prev; + } else { + /* + * Successor leaf page. Check our type, the previous + * page's next_pgno, and our prev_pgno. + */ + if (pip->type != vdp->leaf_type) { + isbad = 1; + EPRINT((env, + "Page %lu: unexpected page type %lu found in leaf chain (expected %lu)", + (u_long)pip->pgno, (u_long)pip->type, + (u_long)vdp->leaf_type)); + } + + /* + * Don't do the prev/next_pgno checks if we've lost + * leaf pages due to another corruption. + */ + if (!F_ISSET(vdp, VRFY_LEAFCHAIN_BROKEN)) { + if (pip->pgno != vdp->next_pgno) { + isbad = 1; + EPRINT((env, + "Page %lu: incorrect next_pgno %lu found in leaf chain (should be %lu)", + (u_long)vdp->prev_pgno, + (u_long)vdp->next_pgno, + (u_long)pip->pgno)); + } + if (pip->prev_pgno != vdp->prev_pgno) { +bad_prev: isbad = 1; + EPRINT((env, + "Page %lu: incorrect prev_pgno %lu found in leaf chain (should be %lu)", + (u_long)pip->pgno, + (u_long)pip->prev_pgno, + (u_long)vdp->prev_pgno)); + } + } + } + vdp->prev_pgno = pip->pgno; + vdp->next_pgno = pip->next_pgno; + F_CLR(vdp, VRFY_LEAFCHAIN_BROKEN); + + /* + * Overflow pages are common to all three leaf types; + * traverse the child list, looking for overflows. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) + if (child->type == V_OVERFLOW && + (ret = __db_vrfy_ovfl_structure(dbp, vdp, + child->pgno, child->tlen, + flags | DB_ST_OVFL_LEAF)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* Case 1 */ + if (pip->type == P_LRECNO) { + if (!LF_ISSET(DB_ST_IS_RECNO) && + !(LF_ISSET(DB_ST_DUPOK) && + !LF_ISSET(DB_ST_DUPSORT))) { + isbad = 1; + EPRINT((env, + "Page %lu: recno leaf page non-recno tree", + (u_long)pgno)); + goto done; + } + goto leaf; + } else if (LF_ISSET(DB_ST_IS_RECNO)) { + /* + * It's a non-recno leaf. Had better not be a recno + * subtree. + */ + isbad = 1; + EPRINT((env, + "Page %lu: non-recno leaf page in recno tree", + (u_long)pgno)); + goto done; + } + + /* Case 2--no more work. */ + if (pip->type == P_LDUP) + goto leaf; + + /* Case 3 */ + + /* Check if we have any dups. */ + if (F_ISSET(pip, VRFY_HAS_DUPS)) { + /* If dups aren't allowed in this btree, trouble. */ + if (!LF_ISSET(DB_ST_DUPOK)) { + isbad = 1; + EPRINT((env, + "Page %lu: duplicates in non-dup btree", + (u_long)pgno)); + } else { + /* + * We correctly have dups. If any are off-page, + * traverse those btrees recursively. + */ + if ((ret = + __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); + ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) { + stflags = + flags | DB_ST_RECNUM | DB_ST_DUPSET; + /* Skip any overflow entries. */ + if (child->type == V_DUPLICATE) { + if ((ret = __db_vrfy_duptype( + dbp, vdp, child->pgno, + stflags)) != 0) { + isbad = 1; + /* Next child. */ + continue; + } + if ((ret = __bam_vrfy_subtree( + dbp, vdp, child->pgno, + NULL, NULL, + stflags | DB_ST_TOPLEVEL, + NULL, NULL, NULL)) != 0) { + if (ret == + DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + } + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* + * If VRFY_DUPS_UNSORTED is set, + * DB_ST_DUPSORT had better not be. 
+ */ + if (F_ISSET(pip, VRFY_DUPS_UNSORTED) && + LF_ISSET(DB_ST_DUPSORT)) { + isbad = 1; + EPRINT((env, + "Page %lu: unsorted duplicate set in sorted-dup database", + (u_long)pgno)); + } + } + } + goto leaf; + case P_IBTREE: + case P_IRECNO: + /* We handle these below. */ + break; + default: + /* + * If a P_IBTREE or P_IRECNO contains a reference to an + * invalid page, we'll wind up here; handle it gracefully. + * Note that the code at the "done" label assumes that the + * current page is a btree/recno one of some sort; this + * is not the case here, so we goto err. + * + * If the page is entirely zeroed, its pip->type will be a lie + * (we assumed it was a hash page, as they're allowed to be + * zeroed); handle this case specially. + */ + if (F_ISSET(pip, VRFY_IS_ALLZEROES)) + ZEROPG_ERR_PRINT(env, pgno, "btree or recno page"); + else + EPRINT((env, + "Page %lu: btree or recno page is of inappropriate type %lu", + (u_long)pgno, (u_long)pip->type)); + + /* + * We probably lost a leaf page (or more if this was an + * internal page) from our prev/next_pgno chain. Flag + * that this is expected; we don't want or need to + * spew error messages about erroneous prev/next_pgnos, + * since that's probably not the real problem. + */ + F_SET(vdp, VRFY_LEAFCHAIN_BROKEN); + + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Cases 4 & 5: This is a btree or recno internal page. For each child, + * recurse, keeping a running count of nrecs and making sure the level + * is always reasonable. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pgno, &child); ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) + if (child->type == V_RECNO) { + if (pip->type != P_IRECNO) { + ret = __db_unknown_path( + env, "__bam_vrfy_subtree"); + goto err; + } + if ((ret = __bam_vrfy_subtree(dbp, vdp, child->pgno, + NULL, NULL, flags, &child_level, &child_nrecs, + &child_relen)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + + if (LF_ISSET(DB_ST_RELEN)) { + if (relen == 0) + relen = child_relen; + /* + * child_relen may be zero if the child subtree + * is empty. + */ + else if (child_relen > 0 && + relen != child_relen) { + isbad = 1; + EPRINT((env, + "Page %lu: recno page returned bad re_len %lu", + (u_long)child->pgno, + (u_long)child_relen)); + } + if (relenp) + *relenp = relen; + } + if (LF_ISSET(DB_ST_RECNUM)) { + if (child->nrecs != child_nrecs) { + isbad = 1; + EPRINT((env, + "Page %lu: record count incorrect: actual %lu, in record %lu", + (u_long)child->pgno, + (u_long)child_nrecs, + (u_long)child->nrecs)); + } + nrecs += child_nrecs; + } + if (isbad == 0 && level != child_level + 1) { + isbad = 1; + EPRINT((env, + "Page %lu: recno level incorrect: got %lu, expected %lu", + (u_long)child->pgno, (u_long)child_level, + (u_long)(level - 1))); + } + } else if (child->type == V_OVERFLOW) { + /* + * It is possible for one internal page to reference + * a single overflow page twice, if all the items + * in the subtree referenced by slot 0 are deleted, + * then a similar number of items are put back + * before the key that formerly had been in slot 1. + * + * (Btree doesn't look at the key in slot 0, so the + * fact that the key formerly at slot 1 is the "wrong" + * parent of the stuff in the slot 0 subtree isn't + * really incorrect.) + * + * __db_vrfy_ovfl_structure is designed to be + * efficiently called multiple times for multiple + * references; call it here as many times as is + * appropriate. 
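+			 *
+			 * More broadly, the arithmetic these internal-page
+			 * cases maintain is easy to state in isolation:
+			 * every child sits exactly one level below its
+			 * parent, and a parent's record count is the sum
+			 * of its children's.  A toy version over an
+			 * in-memory tree, where struct node is hypothetical
+			 * and level 1 is the leaf level:
+			 *
+			 *	struct node {
+			 *		unsigned level, nrecs, nchildren;
+			 *		struct node **child;
+			 *	};
+			 *
+			 *	static int
+			 *	check_counts(const struct node *n)
+			 *	{
+			 *		unsigned i, sum = 0;
+			 *
+			 *		for (i = 0; i < n->nchildren; i++) {
+			 *			if (n->child[i]->level + 1
+			 *			    != n->level)
+			 *				return (-1);
+			 *			if (check_counts(n->child[i]))
+			 *				return (-1);
+			 *			sum += n->child[i]->nrecs;
+			 *		}
+			 *		if (n->nchildren == 0)
+			 *			return (0);
+			 *		return (sum == n->nrecs ? 0 : -1);
+			 *	}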
+ */ + + /* Otherwise, __db_vrfy_childput would be broken. */ + DB_ASSERT(env, child->refcnt >= 1); + + /* + * An overflow referenced more than twice here + * shouldn't happen. + */ + if (child->refcnt > 2) { + isbad = 1; + EPRINT((env, + "Page %lu: overflow page %lu referenced more than twice from internal page", + (u_long)pgno, (u_long)child->pgno)); + } else + for (j = 0; j < child->refcnt; j++) + if ((ret = __db_vrfy_ovfl_structure(dbp, + vdp, child->pgno, child->tlen, + flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + } + + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* We're done with case 4. */ + if (pip->type == P_IRECNO) + goto done; + + /* + * Case 5. Btree internal pages. + * As described above, we need to iterate through all the + * items on the page and make sure that our children sort appropriately + * with respect to them. + * + * For each entry, li will be the "left-hand" key for the entry + * itself, which must sort lower than all entries on its child; + * ri will be the key to its right, which must sort greater. + */ + if (h == NULL && + (ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0) + goto err; + for (i = 0; i < pip->entries; i += O_INDX) { + li = GET_BINTERNAL(dbp, h, i); + ri = (i + O_INDX < pip->entries) ? + GET_BINTERNAL(dbp, h, i + O_INDX) : r; + + /* + * The leftmost key is forcibly sorted less than all entries, + * so don't bother passing it. + */ + if ((ret = __bam_vrfy_subtree(dbp, vdp, li->pgno, + i == 0 ? NULL : li, ri, flags, &child_level, + &child_nrecs, NULL)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto done; + } + + if (LF_ISSET(DB_ST_RECNUM)) { + /* + * Keep a running tally on the actual record count so + * we can return it to our parent (if we have one) or + * compare it to the NRECS field if we're a root page. + */ + nrecs += child_nrecs; + + /* + * Make sure the actual record count of the child + * is equal to the value in the BINTERNAL structure. + */ + if (li->nrecs != child_nrecs) { + isbad = 1; + EPRINT((env, + "Page %lu: item %lu has incorrect record count of %lu, should be %lu", + (u_long)pgno, (u_long)i, (u_long)li->nrecs, + (u_long)child_nrecs)); + } + } + + if (level != child_level + 1) { + isbad = 1; + EPRINT((env, + "Page %lu: Btree level incorrect: got %lu, expected %lu", + (u_long)li->pgno, + (u_long)child_level, (u_long)(level - 1))); + } + } + + if (0) { +leaf: level = LEAFLEVEL; + if (LF_ISSET(DB_ST_RECNUM)) + nrecs = pip->rec_cnt; + + /* XXX + * We should verify that the record count on a leaf page + * is the sum of the number of keys and the number of + * records in its off-page dups. This requires looking + * at the page again, however, and it may all be changing + * soon, so for now we don't bother. + */ + + if (LF_ISSET(DB_ST_RELEN) && relenp) + *relenp = pip->re_len; + } +done: if (F_ISSET(pip, VRFY_INCOMPLETE) && isbad == 0 && ret == 0) { + /* + * During the page-by-page pass, item order verification was + * not finished due to the presence of overflow items. If + * isbad == 0, though, it's now safe to do so, as we've + * traversed any child overflow pages. Do it. 
+ */ + if (h == NULL && (ret = __memp_fget(mpf, &pgno, + vdp->thread_info, NULL, 0, &h)) != 0) + goto err; + if ((ret = __bam_vrfy_itemorder(dbp, + vdp, vdp->thread_info, h, pgno, 0, 1, 0, flags)) != 0) + goto err; + F_CLR(pip, VRFY_INCOMPLETE); + } + + /* + * It's possible to get to this point with a page that has no + * items, but without having detected any sort of failure yet. + * Having zero items is legal if it's a leaf--it may be the + * root page in an empty tree, or the tree may have been + * modified with the DB_REVSPLITOFF flag set (there's no way + * to tell from what's on disk). For an internal page, + * though, having no items is a problem (all internal pages + * must have children). + */ + if (isbad == 0 && ret == 0) { + if (h == NULL && (ret = __memp_fget(mpf, &pgno, + vdp->thread_info, NULL, 0, &h)) != 0) + goto err; + + if (NUM_ENT(h) == 0 && ISINTERNAL(h)) { + isbad = 1; + EPRINT((env, + "Page %lu: internal page is empty and should not be", + (u_long)pgno)); + goto err; + } + } + + /* + * Our parent has sent us BINTERNAL pointers to parent records + * so that we can verify our place with respect to them. If it's + * appropriate--we have a default sort function--verify this. + */ + if (isbad == 0 && ret == 0 && !LF_ISSET(DB_NOORDERCHK) && + pip->type != P_IRECNO && pip->type != P_LRECNO) { + if (h == NULL && (ret = __memp_fget(mpf, &pgno, + vdp->thread_info, NULL, 0, &h)) != 0) + goto err; + + /* + * __bam_vrfy_treeorder needs to know what comparison function + * to use. If DB_ST_DUPSET is set, we're in a duplicate tree + * and we use the duplicate comparison function; otherwise, + * use the btree one. If unset, use the default, of course. + */ + func = LF_ISSET(DB_ST_DUPSET) ? dbp->dup_compare : + ((BTREE *)dbp->bt_internal)->bt_compare; + if (func == NULL) + func = __bam_defcmp; + + if ((ret = __bam_vrfy_treeorder(dbp, + vdp->thread_info, h, l, r, func, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + } + + /* + * This is guaranteed to succeed for leaf pages, but no harm done. + * + * Internal pages below the top level do not store their own + * record numbers, so we skip them. + */ + if (LF_ISSET(DB_ST_RECNUM) && nrecs != pip->rec_cnt && toplevel) { + isbad = 1; + EPRINT((env, + "Page %lu: bad record count: has %lu records, claims %lu", + (u_long)pgno, (u_long)nrecs, (u_long)pip->rec_cnt)); + } + + if (levelp) + *levelp = level; + if (nrecsp) + *nrecsp = nrecs; + + pgset = vdp->pgset; + if ((ret = __db_vrfy_pgset_get(pgset, + vdp->thread_info, pgno, &p)) != 0) + goto err; + if (p != 0) { + isbad = 1; + EPRINT((env, "Page %lu: linked twice", (u_long)pgno)); + } else if ((ret = + __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0) + goto err; + + if (toplevel) + /* + * The last page's next_pgno in the leaf chain should have been + * PGNO_INVALID. + */ + if (vdp->next_pgno != PGNO_INVALID) { + isbad = 1; + EPRINT((env, "Page %lu: unterminated leaf chain", + (u_long)vdp->prev_pgno)); + } + +err: if (toplevel) { + /* Restore our caller's settings. */ + vdp->next_pgno = next_pgno; + vdp->prev_pgno = prev_pgno; + vdp->leaf_type = leaf_type; + } + + if (h != NULL && (t_ret = __memp_fput(mpf, + vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret);
+}
+
+/*
+ * __bam_vrfy_treeorder --
+ *	Verify that the lowest key on a page sorts greater than the
+ * BINTERNAL which points to it (lp), and the highest key
+ * sorts less than the BINTERNAL above that (rp).
+ *
+ * If lp is NULL, this means that it was the leftmost key on the
+ * parent, which (regardless of sort function) sorts less than
+ * all keys.  No need to check it.
+ *
+ * If rp is NULL, lp was the highest key on the parent, so there's
+ * no higher key we must sort less than.
+ */
+static int
+__bam_vrfy_treeorder(dbp, ip, h, lp, rp, func, flags)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	PAGE *h;
+	BINTERNAL *lp, *rp;
+	int (*func) __P((DB *, const DBT *, const DBT *));
+	u_int32_t flags;
+{
+	BOVERFLOW *bo;
+	DBC *dbc;
+	DBT dbt;
+	ENV *env;
+	db_indx_t last;
+	int ret, cmp;
+
+	env = dbp->env;
+	memset(&dbt, 0, sizeof(DBT));
+	F_SET(&dbt, DB_DBT_MALLOC);
+	ret = 0;
+
+	/*
+	 * Empty pages are sorted correctly by definition.  We check
+	 * to see whether they ought to be empty elsewhere;  leaf
+	 * pages legally may be.
+	 */
+	if (NUM_ENT(h) == 0)
+		return (0);
+
+	switch (TYPE(h)) {
+	case P_IBTREE:
+	case P_LDUP:
+		last = NUM_ENT(h) - O_INDX;
+		break;
+	case P_LBTREE:
+		last = NUM_ENT(h) - P_INDX;
+		break;
+	default:
+		return (__db_unknown_path(env, "__bam_vrfy_treeorder"));
+	}
+
+	/* Populate a dummy cursor. */
+	if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE,
+	    PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0)
+		return (ret);
+	/*
+	 * The key on page h, the child page, is more likely to be
+	 * an overflow page, so we pass its offset, rather than lp/rp's,
+	 * into __bam_cmp.  This will take advantage of __db_moff.
+	 */
+
+	/*
+	 * Skip first-item check if we're an internal page--the first
+	 * entry on an internal page is treated specially by __bam_cmp,
+	 * so what's on the page shouldn't matter.  (Plus, since we're passing
+	 * our page and item 0 to __bam_cmp, we'll sort before our
+	 * parent and falsely report a failure.)
+	 */
+	if (lp != NULL && TYPE(h) != P_IBTREE) {
+		if (lp->type == B_KEYDATA) {
+			dbt.data = lp->data;
+			dbt.size = lp->len;
+		} else if (lp->type == B_OVERFLOW) {
+			bo = (BOVERFLOW *)lp->data;
+			if ((ret = __db_goff(dbc, &dbt,
+			    bo->tlen, bo->pgno, NULL, NULL)) != 0)
+				return (ret);
+		} else
+			return (
+			    __db_unknown_path(env, "__bam_vrfy_treeorder"));
+
+		/* On error, fall through, free if needed, and return.
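+		 *
+		 * The comparison being set up is the usual B-tree separator
+		 * invariant: every key on the child page must sort at or
+		 * above the parent entry that points to it (lp) and no
+		 * higher than the next parent entry (rp).  A standalone
+		 * sketch with fixed-length byte-string keys and memcmp
+		 * ordering, all names hypothetical:
+		 *
+		 *	#include <string.h>
+		 *
+		 *	static int
+		 *	check_separators(const unsigned char *lo,
+		 *	    const unsigned char *hi,
+		 *	    const unsigned char keys[][8], size_t n)
+		 *	{
+		 *		if (n == 0)
+		 *			return (0);
+		 *		if (lo != NULL && memcmp(lo, keys[0], 8) > 0)
+		 *			return (-1);
+		 *		if (hi != NULL &&
+		 *		    memcmp(keys[n - 1], hi, 8) > 0)
+		 *			return (-1);
+		 *		return (0);
+		 *	}
+		 *
+		 * A NULL bound means "unbounded on that side", exactly as
+		 * lp and rp may be NULL here.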
*/ + if ((ret = __bam_cmp(dbc, &dbt, h, last, func, &cmp)) == 0) { + if (cmp < 0) { + EPRINT((env, + "Page %lu: last item on page sorted greater than parent entry", + (u_long)PGNO(h))); + ret = DB_VERIFY_BAD; + } + } else + EPRINT((env, + "Page %lu: last item on page had comparison error", + (u_long)PGNO(h))); + + if (dbt.data != rp->data) + __os_ufree(env, dbt.data); + } + + return (ret); +} + +/* + * __bam_salvage -- + * Safely dump out anything that looks like a key on an alleged + * btree leaf page, also mark overflow pages as seen. For internal btree + * pages, just mark any overflow pages as seen. + * + * PUBLIC: int __bam_salvage __P((DB *, VRFY_DBINFO *, + * PUBLIC: db_pgno_t, u_int32_t, PAGE *, void *, + * PUBLIC: int (*)(void *, const void *), DBT *, u_int32_t)); + */ +int +__bam_salvage(dbp, vdp, pgno, pgtype, h, handle, callback, key, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + u_int32_t pgtype; + PAGE *h; + void *handle; + int (*callback) __P((void *, const void *)); + DBT *key; + u_int32_t flags; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + DBT dbt, repldbt, unknown_key, unknown_data; + ENV *env; + VRFY_ITEM *pgmap; + db_indx_t i, last, beg, end, *inp; + db_pgno_t ovflpg; + u_int32_t himark, ovfl_bufsz; + void *ovflbuf; + int adj, ret, t_ret, t2_ret; +#ifdef HAVE_COMPRESSION + DBT kcpy, *last_key; + int unknown_dup_key; +#endif + + env = dbp->env; + ovflbuf = pgmap = NULL; + inp = P_INP(dbp, h); + + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_REALLOC; + memset(&repldbt, 0, sizeof(DBT)); + +#ifdef HAVE_COMPRESSION + memset(&kcpy, 0, sizeof(DBT)); + unknown_dup_key = LF_ISSET(DB_SA_UNKNOWNKEY); + last_key = unknown_dup_key ? NULL : key; +#endif + LF_CLR(DB_SA_UNKNOWNKEY); + + DB_INIT_DBT(unknown_key, "UNKNOWN_KEY", sizeof("UNKNOWN_KEY") - 1); + DB_INIT_DBT(unknown_data, "UNKNOWN_DATA", sizeof("UNKNOWN_DATA") - 1); + + /* + * Allocate a buffer for overflow items. Start at one page; + * __db_safe_goff will realloc as needed. + */ + if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0) + goto err; + ovfl_bufsz = dbp->pgsize; + + if (LF_ISSET(DB_AGGRESSIVE) && (ret = + __os_calloc(env, dbp->pgsize, sizeof(pgmap[0]), &pgmap)) != 0) + goto err; + + /* + * Loop through the inp array, spitting out key/data pairs. + * + * If we're salvaging normally, loop from 0 through NUM_ENT(h). If + * we're being aggressive, loop until we hit the end of the page -- + * NUM_ENT() may be bogus. + */ + himark = dbp->pgsize; + for (i = 0, last = UINT16_MAX;; i += O_INDX) { + /* + * If we're not aggressive, or if we're on an internal page, + * break when we hit NUM_ENT(h). + */ + if ((!LF_ISSET(DB_AGGRESSIVE) || + pgtype == P_IBTREE) && i >= NUM_ENT(h)) + break; + + /* Verify the current item. */ + t_ret = + __db_vrfy_inpitem(dbp, h, pgno, i, 1, flags, &himark, NULL); + + if (t_ret != 0) { + /* + * If this is a btree leaf and we've printed out a key + * but not its associated data item, fix this imbalance + * by printing an "UNKNOWN_DATA". + */ + if (pgtype == P_LBTREE && i % P_INDX == 1 && + last == i - 1 && (t2_ret = __db_vrfy_prdbt( + &unknown_data, + 0, " ", handle, callback, 0, vdp)) != 0) { + if (ret == 0) + ret = t2_ret; + goto err; + } + + /* + * Don't return DB_VERIFY_FATAL; it's private and means + * only that we can't go on with this page, not with + * the whole database. It's not even an error if we've + * run into it after NUM_ENT(h). 
+ */ + if (t_ret == DB_VERIFY_FATAL) { + if (i < NUM_ENT(h) && ret == 0) + ret = DB_VERIFY_BAD; + break; + } + continue; + } + + /* + * If this returned 0, it's safe to print or (carefully) + * try to fetch. + * + * We only print deleted items if DB_AGGRESSIVE is set. + */ + bk = GET_BKEYDATA(dbp, h, i); + if (!LF_ISSET(DB_AGGRESSIVE) && B_DISSET(bk->type)) + continue; + + /* + * If this is a btree leaf and we're about to print out a data + * item for which we didn't print out a key, fix this imbalance + * by printing an "UNKNOWN_KEY". + */ + if (pgtype == P_LBTREE && i % P_INDX == 1 && last != i - 1) { +#ifdef HAVE_COMPRESSION + last_key = NULL; +#endif + if ((t_ret = __db_vrfy_prdbt(&unknown_key, + 0, " ", handle, callback, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + goto err; + } + } + last = i; + + /* + * We're going to go try to print the next item. If key is + * non-NULL, we're a dup page, so we've got to print the key + * first, unless DB_SA_SKIPFIRSTKEY is set and we're on the + * first entry. + */ + if (key != NULL && (i != 0 || !LF_ISSET(DB_SA_SKIPFIRSTKEY))) { +#ifdef HAVE_COMPRESSION + last_key = unknown_dup_key ? NULL : key; +#endif + if ((t_ret = __db_vrfy_prdbt(key, + 0, " ", handle, callback, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + goto err; + } + } + + beg = end = inp[i]; + switch (B_TYPE(bk->type)) { + case B_DUPLICATE: + if (pgtype == P_IBTREE) + break; + + end = beg + BOVERFLOW_SIZE - 1; + /* + * If we're not on a normal btree leaf page, there + * shouldn't be off-page dup sets. Something's + * confused; just drop it, and the code to pick up + * unlinked offpage dup sets will print it out + * with key "UNKNOWN" later. + */ + if (pgtype != P_LBTREE) + break; + + bo = (BOVERFLOW *)bk; + + /* + * If the page number is unreasonable, or if this is + * supposed to be a key item, output "UNKNOWN_KEY" -- + * the best we can do is run into the data items in + * the unlinked offpage dup pass. + */ + if (!IS_VALID_PGNO(bo->pgno) || (i % P_INDX == 0)) { + /* Not much to do on failure. */ +#ifdef HAVE_COMPRESSION + if (key == NULL && i % P_INDX == 0) + last_key = NULL; +#endif + if ((t_ret = __db_vrfy_prdbt( + i % P_INDX == 0 ? &unknown_key : &unknown_data, + 0, " ", handle, callback, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + goto err; + } + break; + } + + /* Don't stop on error. */ + if ((t_ret = __db_salvage_duptree(dbp, + vdp, bo->pgno, &dbt, handle, callback, + flags | DB_SA_SKIPFIRSTKEY +#ifdef HAVE_COMPRESSION + | (last_key == NULL ? 
DB_SA_UNKNOWNKEY : 0) +#endif + )) != 0 && ret == 0) + ret = t_ret; + + break; + case B_KEYDATA: + if (pgtype == P_IBTREE) + break; + + end = (db_indx_t)DB_ALIGN( + beg + bk->len, sizeof(u_int32_t)) - 1; + + dbt.data = bk->data; + dbt.size = bk->len; + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp) && last_key != NULL && + (key != NULL || (i % P_INDX == 1))) { + /* Decompress the key/data pair - the key + is in last_key, and the data is in dbt */ + if ((t_ret = __bam_compress_salvage(dbp, vdp, + handle, callback, last_key, &dbt)) != 0) { + if (t_ret == DB_VERIFY_FATAL) { + if (ret == 0) + ret = DB_VERIFY_BAD; + if (!LF_ISSET(DB_AGGRESSIVE)) + goto err; + } else if (ret == 0) { + ret = t_ret; + goto err; + } + } + } else { + if (key == NULL && i % P_INDX == 0) { + if ((ret = __os_realloc( + env, dbt.size, &kcpy.data)) != 0) + goto err; + memcpy(kcpy.data, dbt.data, dbt.size); + kcpy.size = dbt.size; + last_key = &kcpy; + } +#endif + + if ((t_ret = __db_vrfy_prdbt(&dbt, + 0, " ", handle, callback, 0, vdp)) != 0) { + if (ret == 0) + ret = t_ret; + goto err; + } +#ifdef HAVE_COMPRESSION + } +#endif + break; + case B_OVERFLOW: + if (pgtype != P_IBTREE) + end = beg + BOVERFLOW_SIZE - 1; + bo = (BOVERFLOW *)bk; + + /* + * Check for replicated overflow keys, so that we only + * call __db_safe_goff once per overflow page. If we + * get the same offset as the previous key just re-use + * the previous dbt. + * + * P_IBTREE pages will never have replicated overflow + * keys. + */ + adj = pgtype == P_IBTREE ? O_INDX : P_INDX; + if (pgtype == P_IBTREE) { + /* + * If we're looking at a P_IBTREE, we just want + * to mark the overflow page as seen. + * + * Note that this call to __db_safe_goff differs + * from the non-P_IBTREE call. + * + * Only call __db_safe_goff if the overflow page + * hasn't been seen. + */ + ovflpg = ((BOVERFLOW *) + ((BINTERNAL *)bk)->data)->pgno; + if (__db_salvage_isdone(vdp, ovflpg) == 0 && + (t_ret =__db_safe_goff(dbp, vdp, ovflpg, + &dbt, &ovflbuf, + &ovfl_bufsz, flags)) != 0 && ret == 0) + ret = t_ret; + break; + } else if (i > adj - 1 && + i % adj == 0 && inp[i] == inp[i - adj]) + dbt = repldbt; + else { + /* Don't stop on error. */ + if ((t_ret = __db_safe_goff(dbp, vdp, + bo->pgno, &dbt, &ovflbuf, + &ovfl_bufsz, flags)) != 0 && ret == 0) + ret = t_ret; + + /* + * If this is a key, save it in case the next + * key is a replicated overflow, so we don't + * call __db_safe_goff again. Copy out dbt.data + * in case that pointer gets realloc'd when + * getting a data item. 
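+				 *
+				 * In miniature, the caching pattern is:
+				 * remember the offset and a private copy of
+				 * the last key fetched, and reuse the copy
+				 * whenever the next index refers to the same
+				 * offset.  fetch_overflow_key() and the
+				 * prev_* variables below are hypothetical
+				 * stand-ins for __db_safe_goff and repldbt:
+				 *
+				 *	if (inp[i] == prev_off)
+				 *		key = prev_key;
+				 *	else {
+				 *		key = fetch_overflow_key(i);
+				 *		prev_off = inp[i];
+				 *		prev_key = key;
+				 *	}
+				 *
+				 * The copy must be private for the realloc
+				 * reason just described.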
+ */ + if (i % P_INDX == 0) { + if (t_ret == 0) { + if ((t_ret = __os_realloc(env, + dbt.size, + &repldbt.data)) != 0) { + if (ret == 0) + ret = t_ret; + goto err; + } + memcpy(repldbt.data, + dbt.data, dbt.size); + repldbt.size = dbt.size; + } else { + if (__os_realloc(env, + unknown_key.size, + &repldbt.data) != 0) + goto err; + memcpy(repldbt.data, + unknown_key.data, + unknown_key.size); + repldbt.size = unknown_key.size; + } + } + + } + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp) && last_key && t_ret == 0 && + (key != NULL || (i % P_INDX == 1))) { + /* Decompress the key/data pair - the key + is in last_key, and the data is in dbt */ + if ((t_ret = __bam_compress_salvage(dbp, vdp, + handle, callback, last_key, &dbt)) != 0) { + if (t_ret == DB_VERIFY_FATAL) { + if (ret == 0) + ret = DB_VERIFY_BAD; + if (!LF_ISSET(DB_AGGRESSIVE)) + goto err; + } else if (ret == 0) { + ret = t_ret; + goto err; + } + } + } else { + if (key == NULL && i % P_INDX == 0) { + if (t_ret == 0) { + if ((ret = __os_realloc(env, + dbt.size, &kcpy.data)) != 0) + goto err; + memcpy(kcpy.data, dbt.data, + dbt.size); + kcpy.size = dbt.size; + last_key = &kcpy; + } else + last_key = NULL; + } +#endif + + if ((t_ret = __db_vrfy_prdbt( + t_ret == 0 ? &dbt : &unknown_key, + 0, " ", handle, callback, 0, vdp)) + != 0 && ret == 0) + ret = t_ret; +#ifdef HAVE_COMPRESSION + } +#endif + break; + default: + /* + * We should never get here; __db_vrfy_inpitem should + * not be returning 0 if bk->type is unrecognizable. + */ + t_ret = __db_unknown_path(env, "__bam_salvage"); + if (ret == 0) + ret = t_ret; + goto err; + } + + /* + * If we're being aggressive, mark the beginning and end of + * the item; we'll come back and print whatever "junk" is in + * the gaps in case we had any bogus inp elements and thereby + * missed stuff. + */ + if (LF_ISSET(DB_AGGRESSIVE) && pgtype != P_IBTREE) { + pgmap[beg] = VRFY_ITEM_BEGIN; + pgmap[end] = VRFY_ITEM_END; + } + } + +err: if (pgmap != NULL) + __os_free(env, pgmap); + if (ovflbuf != NULL) + __os_free(env, ovflbuf); + if (repldbt.data != NULL) + __os_free(env, repldbt.data); +#ifdef HAVE_COMPRESSION + if (kcpy.data != NULL) + __os_free(env, kcpy.data); +#endif + + /* Mark this page as done. */ + if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __bam_salvage_walkdupint -- + * Walk a known-good btree or recno internal page which is part of + * a dup tree, calling __db_salvage_duptree on each child page. + * + * PUBLIC: int __bam_salvage_walkdupint __P((DB *, VRFY_DBINFO *, PAGE *, + * PUBLIC: DBT *, void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__bam_salvage_walkdupint(dbp, vdp, h, key, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + DBT *key; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + BINTERNAL *bi; + ENV *env; + RINTERNAL *ri; + int ret, t_ret; + db_indx_t i; + + env = dbp->env; + ret = 0; + + for (i = 0; i < NUM_ENT(h); i++) { + switch (TYPE(h)) { + case P_IBTREE: + bi = GET_BINTERNAL(dbp, h, i); + if ((t_ret = __db_salvage_duptree(dbp, + vdp, bi->pgno, key, handle, callback, flags)) != 0) + ret = t_ret; + break; + case P_IRECNO: + ri = GET_RINTERNAL(dbp, h, i); + if ((t_ret = __db_salvage_duptree(dbp, + vdp, ri->pgno, key, handle, callback, flags)) != 0) + ret = t_ret; + break; + default: + return (__db_unknown_path( + env, "__bam_salvage_walkdupint")); + } + /* Pass DB_SA_SKIPFIRSTKEY, if set, on to the 0th child only. 
*/
+		flags &= ~LF_ISSET(DB_SA_SKIPFIRSTKEY);
+	}
+
+	return (ret);
+}
+
+/*
+ * __bam_meta2pgset --
+ *	Given a known-good meta page, populate pgset with the db_pgno_t's
+ * corresponding to the pages in the btree.
+ *
+ * We do this by a somewhat sleazy method, to avoid having to traverse the
+ * btree structure neatly: we walk down the left side to the very
+ * first leaf page, then we mark all the pages in the chain of
+ * NEXT_PGNOs (being wary of cycles and invalid ones) in pgset, and
+ * return.  This avoids the memory management hassles of recursion and the
+ * trouble of walking internal pages--they just don't matter, except
+ * for the left branch.
+ *
+ * PUBLIC: int __bam_meta2pgset __P((DB *, VRFY_DBINFO *, BTMETA *,
+ * PUBLIC:	u_int32_t, DB *));
+ */
+int
+__bam_meta2pgset(dbp, vdp, btmeta, flags, pgset)
+	DB *dbp;
+	VRFY_DBINFO *vdp;
+	BTMETA *btmeta;
+	u_int32_t flags;
+	DB *pgset;
+{
+	BINTERNAL *bi;
+	DB_MPOOLFILE *mpf;
+	PAGE *h;
+	RINTERNAL *ri;
+	db_pgno_t current, p;
+	int err_ret, ret;
+
+	DB_ASSERT(dbp->env, pgset != NULL);
+
+	mpf = dbp->mpf;
+	h = NULL;
+	ret = err_ret = 0;
+
+	for (current = btmeta->root;;) {
+		if (!IS_VALID_PGNO(current) || current == PGNO(btmeta)) {
+			err_ret = DB_VERIFY_BAD;
+			goto err;
+		}
+		if ((ret = __memp_fget(mpf, &current,
+		    vdp->thread_info, NULL, 0, &h)) != 0) {
+			err_ret = ret;
+			goto err;
+		}
+
+		switch (TYPE(h)) {
+		case P_IBTREE:
+		case P_IRECNO:
+			if ((ret = __bam_vrfy(dbp,
+			    vdp, h, current, flags | DB_NOORDERCHK)) != 0) {
+				err_ret = ret;
+				goto err;
+			}
+			if (TYPE(h) == P_IBTREE) {
+				bi = GET_BINTERNAL(dbp, h, 0);
+				current = bi->pgno;
+			} else {	/* P_IRECNO */
+				ri = GET_RINTERNAL(dbp, h, 0);
+				current = ri->pgno;
+			}
+			break;
+		case P_LBTREE:
+		case P_LRECNO:
+			goto traverse;
+		default:
+			err_ret = DB_VERIFY_BAD;
+			goto err;
+		}
+
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+			err_ret = ret;
+		h = NULL;
+	}
+
+	/*
+	 * At this point, current is the pgno of leaf page h, the 0th in the
+	 * tree we're concerned with.
+	 */
+traverse:
+	while (IS_VALID_PGNO(current) && current != PGNO_INVALID) {
+		if (h == NULL && (ret = __memp_fget(mpf,
+		    &current, vdp->thread_info, NULL, 0, &h)) != 0) {
+			err_ret = ret;
+			break;
+		}
+
+		if ((ret = __db_vrfy_pgset_get(pgset,
+		    vdp->thread_info, current, (int *)&p)) != 0)
+			goto err;
+
+		if (p != 0) {
+			/*
+			 * We've found a cycle.  Return success anyway--
+			 * our caller may as well use however much of
+			 * the pgset we've come up with.
+			 */
+			break;
+		}
+		if ((ret =
+		    __db_vrfy_pgset_inc(pgset, vdp->thread_info, current)) != 0)
+			goto err;
+
+		current = NEXT_PGNO(h);
+		if ((ret = __memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
+			err_ret = ret;
+		h = NULL;
+	}
+
+err:	if (h != NULL)
+		(void)__memp_fput(mpf,
+		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED);
+
+	return (ret == 0 ? err_ret : ret);
+}
+
+/*
+ * __bam_safe_getdata --
+ *
+ *	Utility function for __bam_vrfy_itemorder.  Safely gets the datum at
+ * index i, page h, and sticks it in DBT dbt.  If ovflok is 1 and i's an
+ * overflow item, we do a safe_goff to get the item and signal that we need
+ * to free dbt->data;  if ovflok is 0, we leave the DBT zeroed.
+ */ +static int +__bam_safe_getdata(dbp, ip, h, i, ovflok, dbt, freedbtp) + DB *dbp; + DB_THREAD_INFO *ip; + PAGE *h; + u_int32_t i; + int ovflok; + DBT *dbt; + int *freedbtp; +{ + BKEYDATA *bk; + BOVERFLOW *bo; + DBC *dbc; + int ret; + + memset(dbt, 0, sizeof(DBT)); + *freedbtp = 0; + + bk = GET_BKEYDATA(dbp, h, i); + if (B_TYPE(bk->type) == B_OVERFLOW) { + if (!ovflok) + return (0); + + if ((ret = __db_cursor_int(dbp, ip, NULL, DB_BTREE, + PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) + return (ret); + bo = (BOVERFLOW *)bk; + F_SET(dbt, DB_DBT_MALLOC); + + *freedbtp = 1; + return (__db_goff(dbc, dbt, bo->tlen, bo->pgno, NULL, NULL)); + } else { + dbt->data = bk->data; + dbt->size = bk->len; + } + + return (0); +} diff --git a/btree/btree.h b/btree/btree.h deleted file mode 100644 index 36d35c9..0000000 --- a/btree/btree.h +++ /dev/null @@ -1,383 +0,0 @@ -/*- - * Copyright (c) 1991, 1993, 1994 - * The Regents of the University of California. All rights reserved. - * - * This code is derived from software contributed to Berkeley by - * Mike Olson. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * @(#)btree.h 8.11 (Berkeley) 8/17/94 - */ - -/* Macros to set/clear/test flags. */ -#define F_SET(p, f) (p)->flags |= (f) -#define F_CLR(p, f) (p)->flags &= ~(f) -#define F_ISSET(p, f) ((p)->flags & (f)) - -#include <mpool.h> - -#define DEFMINKEYPAGE (2) /* Minimum keys per page */ -#define MINCACHE (5) /* Minimum cached pages */ -#define MINPSIZE (512) /* Minimum page size */ - -/* - * Page 0 of a btree file contains a copy of the meta-data. This page is also - * used as an out-of-band page, i.e. page pointers that point to nowhere point - * to page 0. Page 1 is the root of the btree. - */ -#define P_INVALID 0 /* Invalid tree page number. */ -#define P_META 0 /* Tree metadata page number. 
*/ -#define P_ROOT 1 /* Tree root page number. */ - -/* - * There are five page layouts in the btree: btree internal pages (BINTERNAL), - * btree leaf pages (BLEAF), recno internal pages (RINTERNAL), recno leaf pages - * (RLEAF) and overflow pages. All five page types have a page header (PAGE). - * This implementation requires that values within structures NOT be padded. - * (ANSI C permits random padding.) If your compiler pads randomly you'll have - * to do some work to get this package to run. - */ -typedef struct _page { - pgno_t pgno; /* this page's page number */ - pgno_t prevpg; /* left sibling */ - pgno_t nextpg; /* right sibling */ - -#define P_BINTERNAL 0x01 /* btree internal page */ -#define P_BLEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_RINTERNAL 0x08 /* recno internal page */ -#define P_RLEAF 0x10 /* leaf page */ -#define P_TYPE 0x1f /* type mask */ -#define P_PRESERVE 0x20 /* never delete this chain of pages */ - u_int32_t flags; - - indx_t lower; /* lower bound of free space on page */ - indx_t upper; /* upper bound of free space on page */ - indx_t linp[1]; /* indx_t-aligned VAR. LENGTH DATA */ -} PAGE; - -/* First and next index. */ -#define BTDATAOFF \ - (sizeof(pgno_t) + sizeof(pgno_t) + sizeof(pgno_t) + \ - sizeof(u_int32_t) + sizeof(indx_t) + sizeof(indx_t)) -#define NEXTINDEX(p) (((p)->lower - BTDATAOFF) / sizeof(indx_t)) - -/* - * For pages other than overflow pages, there is an array of offsets into the - * rest of the page immediately following the page header. Each offset is to - * an item which is unique to the type of page. The h_lower offset is just - * past the last filled-in index. The h_upper offset is the first item on the - * page. Offsets are from the beginning of the page. - * - * If an item is too big to store on a single page, a flag is set and the item - * is a { page, size } pair such that the page is the first page of an overflow - * chain with size bytes of item. Overflow pages are simply bytes without any - * external structure. - * - * The page number and size fields in the items are pgno_t-aligned so they can - * be manipulated without copying. (This presumes that 32 bit items can be - * manipulated on this system.) - */ -#define LALIGN(n) (((n) + sizeof(pgno_t) - 1) & ~(sizeof(pgno_t) - 1)) -#define NOVFLSIZE (sizeof(pgno_t) + sizeof(u_int32_t)) - -/* - * For the btree internal pages, the item is a key. BINTERNALs are {key, pgno} - * pairs, such that the key compares less than or equal to all of the records - * on that page. For a tree without duplicate keys, an internal page with two - * consecutive keys, a and b, will have all records greater than or equal to a - * and less than b stored on the page associated with a. Duplicate keys are - * somewhat special and can cause duplicate internal and leaf page records and - * some minor modifications of the above rule. - */ -typedef struct _binternal { - u_int32_t ksize; /* key size */ - pgno_t pgno; /* page number stored on */ -#define P_BIGDATA 0x01 /* overflow data */ -#define P_BIGKEY 0x02 /* overflow key */ - u_char flags; - char bytes[1]; /* data */ -} BINTERNAL; - -/* Get the page's BINTERNAL structure at index indx. */ -#define GETBINTERNAL(pg, indx) \ - ((BINTERNAL *)((char *)(pg) + (pg)->linp[indx])) - -/* Get the number of bytes in the entry. */ -#define NBINTERNAL(len) \ - LALIGN(sizeof(u_int32_t) + sizeof(pgno_t) + sizeof(u_char) + (len)) - -/* Copy a BINTERNAL entry to the page. 
*/ -#define WR_BINTERNAL(p, size, pgno, flags) { \ - *(u_int32_t *)p = size; \ - p += sizeof(u_int32_t); \ - *(pgno_t *)p = pgno; \ - p += sizeof(pgno_t); \ - *(u_char *)p = flags; \ - p += sizeof(u_char); \ -} - -/* - * For the recno internal pages, the item is a page number with the number of - * keys found on that page and below. - */ -typedef struct _rinternal { - recno_t nrecs; /* number of records */ - pgno_t pgno; /* page number stored below */ -} RINTERNAL; - -/* Get the page's RINTERNAL structure at index indx. */ -#define GETRINTERNAL(pg, indx) \ - ((RINTERNAL *)((char *)(pg) + (pg)->linp[indx])) - -/* Get the number of bytes in the entry. */ -#define NRINTERNAL \ - LALIGN(sizeof(recno_t) + sizeof(pgno_t)) - -/* Copy a RINTERAL entry to the page. */ -#define WR_RINTERNAL(p, nrecs, pgno) { \ - *(recno_t *)p = nrecs; \ - p += sizeof(recno_t); \ - *(pgno_t *)p = pgno; \ -} - -/* For the btree leaf pages, the item is a key and data pair. */ -typedef struct _bleaf { - u_int32_t ksize; /* size of key */ - u_int32_t dsize; /* size of data */ - u_char flags; /* P_BIGDATA, P_BIGKEY */ - char bytes[1]; /* data */ -} BLEAF; - -/* Get the page's BLEAF structure at index indx. */ -#define GETBLEAF(pg, indx) \ - ((BLEAF *)((char *)(pg) + (pg)->linp[indx])) - -/* Get the number of bytes in the entry. */ -#define NBLEAF(p) NBLEAFDBT((p)->ksize, (p)->dsize) - -/* Get the number of bytes in the user's key/data pair. */ -#define NBLEAFDBT(ksize, dsize) \ - LALIGN(sizeof(u_int32_t) + sizeof(u_int32_t) + sizeof(u_char) + \ - (ksize) + (dsize)) - -/* Copy a BLEAF entry to the page. */ -#define WR_BLEAF(p, key, data, flags) { \ - *(u_int32_t *)p = key->size; \ - p += sizeof(u_int32_t); \ - *(u_int32_t *)p = data->size; \ - p += sizeof(u_int32_t); \ - *(u_char *)p = flags; \ - p += sizeof(u_char); \ - memmove(p, key->data, key->size); \ - p += key->size; \ - memmove(p, data->data, data->size); \ -} - -/* For the recno leaf pages, the item is a data entry. */ -typedef struct _rleaf { - u_int32_t dsize; /* size of data */ - u_char flags; /* P_BIGDATA */ - char bytes[1]; -} RLEAF; - -/* Get the page's RLEAF structure at index indx. */ -#define GETRLEAF(pg, indx) \ - ((RLEAF *)((char *)(pg) + (pg)->linp[indx])) - -/* Get the number of bytes in the entry. */ -#define NRLEAF(p) NRLEAFDBT((p)->dsize) - -/* Get the number of bytes from the user's data. */ -#define NRLEAFDBT(dsize) \ - LALIGN(sizeof(u_int32_t) + sizeof(u_char) + (dsize)) - -/* Copy a RLEAF entry to the page. */ -#define WR_RLEAF(p, data, flags) { \ - *(u_int32_t *)p = data->size; \ - p += sizeof(u_int32_t); \ - *(u_char *)p = flags; \ - p += sizeof(u_char); \ - memmove(p, data->data, data->size); \ -} - -/* - * A record in the tree is either a pointer to a page and an index in the page - * or a page number and an index. These structures are used as a cursor, stack - * entry and search returns as well as to pass records to other routines. - * - * One comment about searches. Internal page searches must find the largest - * record less than key in the tree so that descents work. Leaf page searches - * must find the smallest record greater than key so that the returned index - * is the record's correct position for insertion. - */ -typedef struct _epgno { - pgno_t pgno; /* the page number */ - indx_t index; /* the index on the page */ -} EPGNO; - -typedef struct _epg { - PAGE *page; /* the (pinned) page */ - indx_t index; /* the index on the page */ -} EPG; - -/* - * About cursors. 
The cursor (and the page that contained the key/data pair - * that it referenced) can be deleted, which makes things a bit tricky. If - * there are no duplicates of the cursor key in the tree (i.e. B_NODUPS is set - * or there simply aren't any duplicates of the key) we copy the key that it - * referenced when it's deleted, and reacquire a new cursor key if the cursor - * is used again. If there are duplicates keys, we move to the next/previous - * key, and set a flag so that we know what happened. NOTE: if duplicate (to - * the cursor) keys are added to the tree during this process, it is undefined - * if they will be returned or not in a cursor scan. - * - * The flags determine the possible states of the cursor: - * - * CURS_INIT The cursor references *something*. - * CURS_ACQUIRE The cursor was deleted, and a key has been saved so that - * we can reacquire the right position in the tree. - * CURS_AFTER, CURS_BEFORE - * The cursor was deleted, and now references a key/data pair - * that has not yet been returned, either before or after the - * deleted key/data pair. - * XXX - * This structure is broken out so that we can eventually offer multiple - * cursors as part of the DB interface. - */ -typedef struct _cursor { - EPGNO pg; /* B: Saved tree reference. */ - DBT key; /* B: Saved key, or key.data == NULL. */ - recno_t rcursor; /* R: recno cursor (1-based) */ - -#define CURS_ACQUIRE 0x01 /* B: Cursor needs to be reacquired. */ -#define CURS_AFTER 0x02 /* B: Unreturned cursor after key. */ -#define CURS_BEFORE 0x04 /* B: Unreturned cursor before key. */ -#define CURS_INIT 0x08 /* RB: Cursor initialized. */ - u_int8_t flags; -} CURSOR; - -/* - * The metadata of the tree. The nrecs field is used only by the RECNO code. - * This is because the btree doesn't really need it and it requires that every - * put or delete call modify the metadata. - */ -typedef struct _btmeta { - u_int32_t magic; /* magic number */ - u_int32_t version; /* version */ - u_int32_t psize; /* page size */ - u_int32_t free; /* page number of first free page */ - u_int32_t nrecs; /* R: number of records */ - -#define SAVEMETA (B_NODUPS | R_RECNO) - u_int32_t flags; /* bt_flags & SAVEMETA */ -} BTMETA; - -/* The in-memory btree/recno data structure. */ -typedef struct _btree { - MPOOL *bt_mp; /* memory pool cookie */ - - DB *bt_dbp; /* pointer to enclosing DB */ - - EPG bt_cur; /* current (pinned) page */ - PAGE *bt_pinned; /* page pinned across calls */ - - CURSOR bt_cursor; /* cursor */ - -#define BT_PUSH(t, p, i) { \ - t->bt_sp->pgno = p; \ - t->bt_sp->index = i; \ - ++t->bt_sp; \ -} -#define BT_POP(t) (t->bt_sp == t->bt_stack ? 
NULL : --t->bt_sp) -#define BT_CLR(t) (t->bt_sp = t->bt_stack) - EPGNO bt_stack[50]; /* stack of parent pages */ - EPGNO *bt_sp; /* current stack pointer */ - - DBT bt_rkey; /* returned key */ - DBT bt_rdata; /* returned data */ - - int bt_fd; /* tree file descriptor */ - - pgno_t bt_free; /* next free page */ - u_int32_t bt_psize; /* page size */ - indx_t bt_ovflsize; /* cut-off for key/data overflow */ - int bt_lorder; /* byte order */ - /* sorted order */ - enum { NOT, BACK, FORWARD } bt_order; - EPGNO bt_last; /* last insert */ - - /* B: key comparison function */ - int (*bt_cmp) __P((const DBT *, const DBT *)); - /* B: prefix comparison function */ - size_t (*bt_pfx) __P((const DBT *, const DBT *)); - /* R: recno input function */ - int (*bt_irec) __P((struct _btree *, recno_t)); - - FILE *bt_rfp; /* R: record FILE pointer */ - int bt_rfd; /* R: record file descriptor */ - - caddr_t bt_cmap; /* R: current point in mapped space */ - caddr_t bt_smap; /* R: start of mapped space */ - caddr_t bt_emap; /* R: end of mapped space */ - size_t bt_msize; /* R: size of mapped region. */ - - recno_t bt_nrecs; /* R: number of records */ - size_t bt_reclen; /* R: fixed record length */ - u_char bt_bval; /* R: delimiting byte/pad character */ - -/* - * NB: - * B_NODUPS and R_RECNO are stored on disk, and may not be changed. - */ -#define B_INMEM 0x00001 /* in-memory tree */ -#define B_METADIRTY 0x00002 /* need to write metadata */ -#define B_MODIFIED 0x00004 /* tree modified */ -#define B_NEEDSWAP 0x00008 /* if byte order requires swapping */ -#define B_RDONLY 0x00010 /* read-only tree */ - -#define B_NODUPS 0x00020 /* no duplicate keys permitted */ -#define R_RECNO 0x00080 /* record oriented tree */ - -#define R_CLOSEFP 0x00040 /* opened a file pointer */ -#define R_EOF 0x00100 /* end of input file reached. */ -#define R_FIXLEN 0x00200 /* fixed length records */ -#define R_MEMMAPPED 0x00400 /* memory mapped file. */ -#define R_INMEM 0x00800 /* in-memory file */ -#define R_MODIFIED 0x01000 /* modified file */ -#define R_RDONLY 0x02000 /* read-only file */ - -#define B_DB_LOCK 0x04000 /* DB_LOCK specified. */ -#define B_DB_SHMEM 0x08000 /* DB_SHMEM specified. */ -#define B_DB_TXN 0x10000 /* DB_TXN specified. */ - u_int32_t flags; -} BTREE; - -#include "extern.h" diff --git a/btree/btree.src b/btree/btree.src new file mode 100644 index 0000000..b6198e1 --- /dev/null +++ b/btree/btree.src @@ -0,0 +1,291 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +DBPRIVATE +PREFIX __bam + +INCLUDE #include "db_int.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/btree.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/txn.h" +INCLUDE + +/* + * BTREE-split: used to log a page split. + * + * left: the page number for the low-order contents. + * llsn: the left page's original LSN. + * right: the page number for the high-order contents. + * rlsn: the right page's original LSN. + * indx: the number of entries that went to the left page. + * npgno: the next page number + * nlsn: the next page's original LSN (or 0 if no next page). + * pgno: the parent page number + * plsn: the parent page's original LSN. + * pg: the split page's contents before the split. + * opflags: SPL_NRECS: if splitting a tree that maintains a record count. + * pindx: index of new record in parent page. 
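+ *
+ * (Editorial sketch, not part of the generated description: with the
+ * argument order produced from the lines below, a hypothetical caller
+ * would look roughly like
+ *
+ *	ret = __bam_split_log(dbp, txn, &lsn, 0, left, &llsn,
+ *	    right, &rlsn, indx, npgno, &nlsn, ppgno, &plsn,
+ *	    pindx, &pgdbt, &pentry, &rentry, opflags);
+ *
+ * where the DB line supplies dbp and each ARG/POINTER/DBT/PGDBT line
+ * supplies one argument in order.)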
+ */ +BEGIN split 48 62 +DB fileid int32_t ld +ARG left db_pgno_t lu +POINTER llsn DB_LSN * lu +ARG right db_pgno_t lu +POINTER rlsn DB_LSN * lu +ARG indx u_int32_t lu +ARG npgno db_pgno_t lu +POINTER nlsn DB_LSN * lu +ARG ppgno db_pgno_t lu +POINTER plsn DB_LSN * lu +ARG pindx u_int32_t lu +PGDBT pg DBT s +DBT pentry DBT s +DBT rentry DBT s +ARG opflags u_int32_t lu +END + +BEGIN_COMPAT split 42 62 +DB fileid int32_t ld +ARG left db_pgno_t lu +POINTER llsn DB_LSN * lu +ARG right db_pgno_t lu +POINTER rlsn DB_LSN * lu +ARG indx u_int32_t lu +ARG npgno db_pgno_t lu +POINTER nlsn DB_LSN * lu +ARG root_pgno db_pgno_t lu +PGDBT pg DBT s +ARG opflags u_int32_t lu +END + +/* + * BTREE-rsplit: used to log a reverse-split + * + * pgno: the page number of the page copied over the root. + * pgdbt: the page being copied on the root page. + * root_pgno: the root page number. + * nrec: the tree's record count. + * rootent: last entry on the root page. + * rootlsn: the root page's original lsn. + */ +BEGIN rsplit 42 63 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +PGDBT pgdbt DBT s +ARG root_pgno db_pgno_t lu +ARG nrec db_pgno_t lu +DBT rootent DBT s +POINTER rootlsn DB_LSN * lu +END + +/* + * BTREE-adj: used to log the adjustment of an index. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index adjusted. + * indx_copy: the index to copy if inserting. + * is_insert: 0 if a delete, 1 if an insert. + */ +BEGIN adj 42 55 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG indx_copy u_int32_t lu +ARG is_insert u_int32_t lu +END + +/* + * BTREE-cadjust: used to adjust the count change in an internal page. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be adjusted. + * adjust: the signed adjustment. + * opflags: CAD_UPDATEROOT: if root page count was adjusted. + */ +BEGIN cadjust 42 56 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG adjust int32_t ld +ARG opflags u_int32_t lu +END + +/* + * BTREE-cdel: used to log the intent-to-delete of a cursor record. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be deleted. + */ +BEGIN cdel 42 57 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +END + +/* + * BTREE-repl: used to log the replacement of an item. + * + * pgno: the page modified. + * lsn: the page's original lsn. + * indx: the index to be replaced. + * isdeleted: set if the record was previously deleted. + * orig: the original data. + * repl: the replacement data. + * prefix: the prefix of the replacement that matches the original. + * suffix: the suffix of the replacement that matches the original. + */ +BEGIN repl 42 58 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG indx u_int32_t lu +ARG isdeleted u_int32_t lu +DBT orig DBT s +DBT repl DBT s +ARG prefix u_int32_t lu +ARG suffix u_int32_t lu +END + +/* + * BTREE-root: log the assignment of a root btree page. + */ +BEGIN root 42 59 +DB fileid int32_t ld +ARG meta_pgno db_pgno_t lu +ARG root_pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +END + +/* + * BTREE-curadj: undo cursor adjustments on txn abort. + * Should only be processed during DB_TXN_ABORT. + * NOTE: the first_indx field gets used to hold + * signed index adjustment in one case. + * care should be taken if its size is changed. + */ +BEGIN curadj 42 64 +/* Fileid of db affected. 
*/
+DB	fileid		int32_t		ld
+/* Which adjustment. */
+ARG	mode		db_ca_mode	ld
+/* Page entry is from. */
+ARG	from_pgno	db_pgno_t	lu
+/* Page entry went to. */
+ARG	to_pgno		db_pgno_t	lu
+/* Left page of root split. */
+ARG	left_pgno	db_pgno_t	lu
+/* First index of dup set. Also used as adjustment. */
+ARG	first_indx	u_int32_t	lu
+/* Index entry is from. */
+ARG	from_indx	u_int32_t	lu
+/* Index where entry went. */
+ARG	to_indx		u_int32_t	lu
+END
+
+/*
+ * BTREE-rcuradj: undo cursor adjustments on txn abort in
+ * renumbering recno trees.
+ * Should only be processed during DB_TXN_ABORT.
+ */
+BEGIN rcuradj 42 65
+/* Fileid of db affected. */
+DB	fileid		int32_t		ld
+/* Which adjustment. */
+ARG	mode		ca_recno_arg	ld
+/* Root page number. */
+ARG	root		db_pgno_t	ld
+/* Recno of the adjustment. */
+ARG	recno		db_recno_t	ld
+/* Order number of the adjustment. */
+ARG	order		u_int32_t	lu
+END
+
+/*
+ * BTREE-relink -- Handles relinking around a deleted leaf page.
+ *
+ */
+BEGIN_COMPAT relink 43 147
+/* Fileid of db affected. */
+DB	fileid		int32_t		ld
+/* The page being removed. */
+ARG	pgno		db_pgno_t	lu
+/* The page's original lsn. */
+POINTER	lsn		DB_LSN *	lu
+/* The previous page. */
+ARG	prev		db_pgno_t	lu
+/* The previous page's original lsn. */
+POINTER	lsn_prev	DB_LSN *	lu
+/* The next page. */
+ARG	next		db_pgno_t	lu
+/* The next page's original lsn. */
+POINTER	lsn_next	DB_LSN *	lu
+END
+
+BEGIN relink 44 147
+/* Fileid of db affected. */
+DB	fileid		int32_t		ld
+/* The page being removed. */
+ARG	pgno		db_pgno_t	lu
+/* The new page number, if any. */
+ARG	new_pgno	db_pgno_t	lu
+/* The previous page. */
+ARG	prev		db_pgno_t	lu
+/* The previous page's original lsn. */
+POINTER	lsn_prev	DB_LSN *	lu
+/* The next page. */
+ARG	next		db_pgno_t	lu
+/* The next page's original lsn. */
+POINTER	lsn_next	DB_LSN *	lu
+END
+
+/*
+ * BTREE-merge -- Handles merging of pages during a compaction.
+ */
+BEGIN_COMPAT merge 44 148
+DB	fileid		int32_t		ld
+ARG	pgno		db_pgno_t	lu
+POINTER	lsn		DB_LSN *	lu
+ARG	npgno		db_pgno_t	lu
+POINTER	nlsn		DB_LSN *	lu
+DBT	hdr		DBT		s
+DBT	data		DBT		s
+DBT	ind		DBT		s
+END
+
+BEGIN merge 47 148
+DB	fileid		int32_t		ld
+ARG	pgno		db_pgno_t	lu
+POINTER	lsn		DB_LSN *	lu
+ARG	npgno		db_pgno_t	lu
+POINTER	nlsn		DB_LSN *	lu
+PGDBT	hdr		DBT		s
+PGDDBT	data		DBT		s
+ARG	pg_copy		int32_t		lu
+END
+
+/*
+ * BTREE-pgno -- Handles replacing a page number in the record
+ * reference on pgno by indx.
+ */
+BEGIN pgno 44 149
+DB	fileid		int32_t		ld
+ARG	pgno		db_pgno_t	lu
+POINTER	lsn		DB_LSN *	lu
+ARG	indx		u_int32_t	lu
+ARG	opgno		db_pgno_t	lu
+ARG	npgno		db_pgno_t	lu
+END
diff --git a/btree/btree_auto.c b/btree/btree_auto.c
new file mode 100644
index 0000000..460f038
--- /dev/null
+++ b/btree/btree_auto.c
@@ -0,0 +1,3547 @@
+/* Do not edit: automatically built by gen_rec.awk.
*/ + +#include "db_config.h" +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/btree.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __bam_split_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_split_args **)); + */ +int +__bam_split_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_split_args **argpp; +{ + __bam_split_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_split_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->left = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->llsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->right = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->rlsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + LOGCOPY_32(env, &uinttmp, bp); + argp->npgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->nlsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->ppgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->plsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->pindx, bp); + bp += sizeof(argp->pindx); + + memset(&argp->pg, 0, sizeof(argp->pg)); + LOGCOPY_32(env,&argp->pg.size, bp); + bp += sizeof(u_int32_t); + argp->pg.data = bp; + bp += argp->pg.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->pg.data, + (size_t)argp->pg.size, NULL, 1)) != 0) + return (t_ret); + } + + memset(&argp->pentry, 0, sizeof(argp->pentry)); + LOGCOPY_32(env,&argp->pentry.size, bp); + bp += sizeof(u_int32_t); + argp->pentry.data = bp; + bp += argp->pentry.size; + + memset(&argp->rentry, 0, sizeof(argp->rentry)); + LOGCOPY_32(env,&argp->rentry.size, bp); + bp += sizeof(u_int32_t); + argp->rentry.data = bp; + bp += argp->rentry.size; + + LOGCOPY_32(env, &argp->opflags, bp); + bp += sizeof(argp->opflags); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_split_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, u_int32_t, + * PUBLIC: db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, u_int32_t, const DBT *, + * PUBLIC: const DBT *, const DBT *, u_int32_t)); + */ +int +__bam_split_log(dbp, txnp, ret_lsnp, flags, left, llsn, right, rlsn, indx, + npgno, nlsn, ppgno, plsn, pindx, pg, + pentry, rentry, opflags) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t left; + DB_LSN * llsn; + db_pgno_t right; + DB_LSN * rlsn; + u_int32_t indx; + db_pgno_t npgno; + DB_LSN * nlsn; + db_pgno_t ppgno; + DB_LSN * plsn; + u_int32_t pindx; + const DBT *pg; + const DBT *pentry; + const DBT *rentry; + u_int32_t opflags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, 
*rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_split; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*llsn) + + sizeof(u_int32_t) + + sizeof(*rlsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*nlsn) + + sizeof(u_int32_t) + + sizeof(*plsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (pg == NULL ? 0 : pg->size) + + sizeof(u_int32_t) + (pentry == NULL ? 0 : pentry->size) + + sizeof(u_int32_t) + (rentry == NULL ? 0 : rentry->size) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)left; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (llsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(llsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, llsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, llsn); + } else + memset(bp, 0, sizeof(*llsn)); + bp += sizeof(*llsn); + + uinttmp = (u_int32_t)right; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (rlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(rlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, rlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, rlsn); + } else + memset(bp, 0, sizeof(*rlsn)); + bp += sizeof(*rlsn); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + uinttmp = (u_int32_t)npgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (nlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(nlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, 
dbp, nlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, nlsn); + } else + memset(bp, 0, sizeof(*nlsn)); + bp += sizeof(*nlsn); + + uinttmp = (u_int32_t)ppgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (plsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(plsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, plsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, plsn); + } else + memset(bp, 0, sizeof(*plsn)); + bp += sizeof(*plsn); + + LOGCOPY_32(env, bp, &pindx); + bp += sizeof(pindx); + + if (pg == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &pg->size); + bp += sizeof(pg->size); + memcpy(bp, pg->data, pg->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)pg->size, (DBT *)NULL, 0)) != 0) + return (ret); + bp += pg->size; + } + + if (pentry == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &pentry->size); + bp += sizeof(pentry->size); + memcpy(bp, pentry->data, pentry->size); + bp += pentry->size; + } + + if (rentry == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &rentry->size); + bp += sizeof(rentry->size); + memcpy(bp, rentry->data, rentry->size); + bp += rentry->size; + } + + LOGCOPY_32(env, bp, &opflags); + bp += sizeof(opflags); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
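+		 *
+		 * (Editorial aside: every record assembled in this file
+		 * begins with the same fixed header before the per-record
+		 * fields, in this order:
+		 *
+		 *	u_int32_t rectype;	e.g. DB___bam_split
+		 *	u_int32_t txn_num;	0 if txnp == NULL
+		 *	DB_LSN	  prev_lsn;	this txn's previous record
+		 *	u_int32_t fileid;	dbp->log_filename->id
+		 *
+		 * as serialized by the LOGCOPY_* calls above.)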
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_split_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_split_42_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __bam_split_42_args **)); + */ +int +__bam_split_42_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_split_42_args **argpp; +{ + __bam_split_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_split_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->left = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->llsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->right = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->rlsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + LOGCOPY_32(env, &uinttmp, bp); + argp->npgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->nlsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->root_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->pg, 0, sizeof(argp->pg)); + LOGCOPY_32(env,&argp->pg.size, bp); + bp += sizeof(u_int32_t); + argp->pg.data = bp; + bp += argp->pg.size; + + LOGCOPY_32(env, &argp->opflags, bp); + bp += sizeof(argp->opflags); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_rsplit_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_rsplit_args **)); + */ +int +__bam_rsplit_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_rsplit_args **argpp; +{ + __bam_rsplit_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_rsplit_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + 
argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->pgdbt, 0, sizeof(argp->pgdbt)); + LOGCOPY_32(env,&argp->pgdbt.size, bp); + bp += sizeof(u_int32_t); + argp->pgdbt.data = bp; + bp += argp->pgdbt.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->pgdbt.data, + (size_t)argp->pgdbt.size, NULL, 1)) != 0) + return (t_ret); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->root_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->nrec = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->rootent, 0, sizeof(argp->rootent)); + LOGCOPY_32(env,&argp->rootent.size, bp); + bp += sizeof(u_int32_t); + argp->rootent.data = bp; + bp += argp->rootent.size; + + LOGCOPY_TOLSN(env, &argp->rootlsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_rsplit_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, const DBT *, db_pgno_t, db_pgno_t, + * PUBLIC: const DBT *, DB_LSN *)); + */ +int +__bam_rsplit_log(dbp, txnp, ret_lsnp, flags, pgno, pgdbt, root_pgno, nrec, rootent, + rootlsn) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + const DBT *pgdbt; + db_pgno_t root_pgno; + db_pgno_t nrec; + const DBT *rootent; + DB_LSN * rootlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_rsplit; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (pgdbt == NULL ? 0 : pgdbt->size) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (rootent == NULL ? 
0 : rootent->size) + + sizeof(*rootlsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (pgdbt == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &pgdbt->size); + bp += sizeof(pgdbt->size); + memcpy(bp, pgdbt->data, pgdbt->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)pgdbt->size, (DBT *)NULL, 0)) != 0) + return (ret); + bp += pgdbt->size; + } + + uinttmp = (u_int32_t)root_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)nrec; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (rootent == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &rootent->size); + bp += sizeof(rootent->size); + memcpy(bp, rootent->data, rootent->size); + bp += rootent->size; + } + + if (rootlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(rootlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, rootlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, rootlsn); + } else + memset(bp, 0, sizeof(*rootlsn)); + bp += sizeof(*rootlsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
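+		 *
+		 * (Editorial aside: optional DBT fields such as pgdbt and
+		 * rootent are length-prefixed on the wire--a u_int32_t size
+		 * followed by the raw bytes, with a NULL DBT written as
+		 * size 0--matching the "== NULL ? 0 : ...->size" terms in
+		 * the logrec.size computation above.)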
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_rsplit_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_adj_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_adj_args **)); + */ +int +__bam_adj_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_adj_args **argpp; +{ + __bam_adj_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_adj_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + LOGCOPY_32(env, &argp->indx_copy, bp); + bp += sizeof(argp->indx_copy); + + LOGCOPY_32(env, &argp->is_insert, bp); + bp += sizeof(argp->is_insert); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_adj_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, u_int32_t, + * PUBLIC: u_int32_t)); + */ +int +__bam_adj_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, indx_copy, is_insert) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + u_int32_t indx_copy; + u_int32_t is_insert; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_adj; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
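+		 *
+		 * (Editorial aside: DB_SET_TXN_LSNP points rlsnp and lsnp
+		 * at LSN storage inside the transaction, so the LSN that
+		 * __log_put fills in below can also serve as the
+		 * transaction's begin LSN.)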
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + LOGCOPY_32(env, bp, &indx_copy); + bp += sizeof(indx_copy); + + LOGCOPY_32(env, bp, &is_insert); + bp += sizeof(is_insert); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_adj_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_cadjust_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_cadjust_args **)); + */ +int +__bam_cadjust_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_cadjust_args **argpp; +{ + __bam_cadjust_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_cadjust_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + LOGCOPY_32(env, &uinttmp, bp); + argp->adjust = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->opflags, bp); + bp += sizeof(argp->opflags); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_cadjust_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, int32_t, u_int32_t)); + */ +int +__bam_cadjust_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, adjust, opflags) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + int32_t adjust; + u_int32_t opflags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_cadjust; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
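+		 *
+		 * (Editorial aside: the signed "adjust" argument is
+		 * serialized below through a u_int32_t cast and cast back
+		 * to int32_t in __bam_cadjust_read, so negative count
+		 * adjustments round-trip intact.)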
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + uinttmp = (u_int32_t)adjust; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &opflags); + bp += sizeof(opflags); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_cadjust_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_cdel_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_cdel_args **)); + */ +int +__bam_cdel_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_cdel_args **argpp; +{ + __bam_cdel_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_cdel_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_cdel_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t)); + */ +int +__bam_cdel_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_cdel; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_cdel_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_repl_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_repl_args **)); + */ +int +__bam_repl_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_repl_args **argpp; +{ + __bam_repl_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_repl_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + LOGCOPY_32(env, &argp->isdeleted, bp); + bp += sizeof(argp->isdeleted); + + memset(&argp->orig, 0, sizeof(argp->orig)); + LOGCOPY_32(env,&argp->orig.size, bp); + bp += sizeof(u_int32_t); + argp->orig.data = bp; + bp += argp->orig.size; + + memset(&argp->repl, 0, sizeof(argp->repl)); + LOGCOPY_32(env,&argp->repl.size, bp); + bp += sizeof(u_int32_t); + argp->repl.data = bp; + bp += argp->repl.size; + + LOGCOPY_32(env, &argp->prefix, bp); + bp += sizeof(argp->prefix); + + LOGCOPY_32(env, &argp->suffix, bp); + bp += sizeof(argp->suffix); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_repl_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, u_int32_t, + * PUBLIC: const DBT *, const DBT *, u_int32_t, u_int32_t)); + */ +int +__bam_repl_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, isdeleted, orig, + repl, prefix, suffix) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + u_int32_t isdeleted; + const DBT *orig; + const DBT *repl; + u_int32_t prefix; + u_int32_t suffix; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_repl; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = 
__txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (orig == NULL ? 0 : orig->size) + + sizeof(u_int32_t) + (repl == NULL ? 0 : repl->size) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + LOGCOPY_32(env, bp, &isdeleted); + bp += sizeof(isdeleted); + + if (orig == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &orig->size); + bp += sizeof(orig->size); + memcpy(bp, orig->data, orig->size); + bp += orig->size; + } + + if (repl == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &repl->size); + bp += sizeof(repl->size); + memcpy(bp, repl->data, repl->size); + bp += repl->size; + } + + LOGCOPY_32(env, bp, &prefix); + bp += sizeof(prefix); + + LOGCOPY_32(env, bp, &suffix); + bp += sizeof(suffix); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
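+	 * The record is also queued on the transaction's in-memory log
+	 * list below so an abort can still undo the operation.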
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_repl_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_root_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_root_args **)); + */ +int +__bam_root_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_root_args **argpp; +{ + __bam_root_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_root_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->root_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_root_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t, DB_LSN *)); + */ +int +__bam_root_log(dbp, txnp, ret_lsnp, flags, meta_pgno, root_pgno, meta_lsn) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t meta_pgno; + db_pgno_t root_pgno; + DB_LSN * meta_lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_root; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
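+	 * DB_SET_TXN_LSNP below supplies the addresses log_put should
+	 * fill in for exactly that purpose.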
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*meta_lsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)meta_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)root_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (meta_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, meta_lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, meta_lsn); + } else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
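+	 * Replication clients only apply records they receive from the
+	 * master, which is why the write below is skipped when
+	 * IS_REP_CLIENT is true.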
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_root_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_curadj_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_curadj_args **)); + */ +int +__bam_curadj_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_curadj_args **argpp; +{ + __bam_curadj_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_curadj_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->mode = (db_ca_mode)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->from_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->to_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->left_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->first_indx, bp); + bp += sizeof(argp->first_indx); + + LOGCOPY_32(env, &argp->from_indx, bp); + bp += sizeof(argp->from_indx); + + LOGCOPY_32(env, &argp->to_indx, bp); + bp += sizeof(argp->to_indx); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_curadj_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_ca_mode, db_pgno_t, db_pgno_t, db_pgno_t, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t)); + */ +int +__bam_curadj_log(dbp, txnp, ret_lsnp, flags, mode, from_pgno, to_pgno, left_pgno, first_indx, + from_indx, to_indx) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_ca_mode mode; + db_pgno_t from_pgno; + db_pgno_t to_pgno; + db_pgno_t left_pgno; + u_int32_t first_indx; + u_int32_t from_indx; + u_int32_t to_indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_curadj; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. 
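+	 * (Holding the mutex guarantees begin_lsn matches the record's
+	 * actual position in the log.)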
+ * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)mode; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)from_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)to_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)left_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &first_indx); + bp += sizeof(first_indx); + + LOGCOPY_32(env, bp, &from_indx); + bp += sizeof(from_indx); + + LOGCOPY_32(env, bp, &to_indx); + bp += sizeof(to_indx); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
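+	 * The caller still receives an LSN, but LSN_NOT_LOGGED below
+	 * marks it as a placeholder naming no real log record.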
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_curadj_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_rcuradj_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_rcuradj_args **)); + */ +int +__bam_rcuradj_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_rcuradj_args **argpp; +{ + __bam_rcuradj_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_rcuradj_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->mode = (ca_recno_arg)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->root = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->recno = (db_recno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->order, bp); + bp += sizeof(argp->order); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_rcuradj_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, ca_recno_arg, db_pgno_t, db_recno_t, u_int32_t)); + */ +int +__bam_rcuradj_log(dbp, txnp, ret_lsnp, flags, mode, root, recno, order) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + ca_recno_arg mode; + db_pgno_t root; + db_recno_t recno; + u_int32_t order; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_rcuradj; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)mode; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)root; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)recno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &order); + bp += sizeof(order); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
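+	 * Note the copy into lr->data is taken before the debug bit is
+	 * set, so the in-memory record keeps its real type.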
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_rcuradj_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_relink_43_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __bam_relink_43_args **)); + */ +int +__bam_relink_43_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_relink_43_args **argpp; +{ + __bam_relink_43_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_relink_43_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->prev = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn_prev, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn_next, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_relink_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_relink_args **)); + */ +int +__bam_relink_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_relink_args **argpp; +{ + __bam_relink_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_relink_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->new_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->prev = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn_prev, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = 
(db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn_next, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_relink_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t, db_pgno_t, DB_LSN *, db_pgno_t, + * PUBLIC: DB_LSN *)); + */ +int +__bam_relink_log(dbp, txnp, ret_lsnp, flags, pgno, new_pgno, prev, lsn_prev, next, + lsn_next) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + db_pgno_t new_pgno; + db_pgno_t prev; + DB_LSN * lsn_prev; + db_pgno_t next; + DB_LSN * lsn_next; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_relink; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn_prev) + + sizeof(u_int32_t) + + sizeof(*lsn_next); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)new_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)prev; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn_prev != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn_prev, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn_prev)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn_prev); + } else + memset(bp, 0, sizeof(*lsn_prev)); + bp += sizeof(*lsn_prev); + + uinttmp = (u_int32_t)next; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn_next != NULL) { + 
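+		/*
+		 * Sanity check: a page LSN may never point past the
+		 * current end of the log.
+		 */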
if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn_next, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn_next)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn_next); + } else + memset(bp, 0, sizeof(*lsn_next)); + bp += sizeof(*lsn_next); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_relink_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_merge_44_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __bam_merge_44_args **)); + */ +int +__bam_merge_44_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_merge_44_args **argpp; +{ + __bam_merge_44_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_merge_44_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->npgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->nlsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->hdr, 0, sizeof(argp->hdr)); + LOGCOPY_32(env,&argp->hdr.size, bp); + bp += sizeof(u_int32_t); + argp->hdr.data = bp; + bp += argp->hdr.size; + + memset(&argp->data, 0, sizeof(argp->data)); + LOGCOPY_32(env,&argp->data.size, bp); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + + memset(&argp->ind, 0, sizeof(argp->ind)); + LOGCOPY_32(env,&argp->ind.size, bp); + bp += sizeof(u_int32_t); + argp->ind.data = bp; + bp += argp->ind.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_merge_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_merge_args **)); + */ +int +__bam_merge_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_merge_args **argpp; +{ + __bam_merge_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = 
__os_malloc(env, + sizeof(__bam_merge_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->npgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->nlsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->hdr, 0, sizeof(argp->hdr)); + LOGCOPY_32(env,&argp->hdr.size, bp); + bp += sizeof(u_int32_t); + argp->hdr.data = bp; + bp += argp->hdr.size; + + memset(&argp->data, 0, sizeof(argp->data)); + LOGCOPY_32(env,&argp->data.size, bp); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, + (PAGE *)argp->hdr.data, (size_t)argp->hdr.size, + &argp->data, 1)) != 0) + return (t_ret); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pg_copy = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_merge_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, const DBT *, + * PUBLIC: const DBT *, int32_t)); + */ +int +__bam_merge_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, npgno, nlsn, hdr, + data, pg_copy) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * lsn; + db_pgno_t npgno; + DB_LSN * nlsn; + const DBT *hdr; + const DBT *data; + int32_t pg_copy; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_merge; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn) + + sizeof(u_int32_t) + + sizeof(*nlsn) + + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size) + + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + uinttmp = (u_int32_t)npgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (nlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(nlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, nlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, nlsn); + } else + memset(bp, 0, sizeof(*nlsn)); + bp += sizeof(*nlsn); + + if (hdr == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &hdr->size); + bp += sizeof(hdr->size); + memcpy(bp, hdr->data, hdr->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)hdr->size, (DBT *)data, 0)) != 0) + return (ret); + bp += hdr->size; + } + + if (data == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &data->size); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC)) + __os_free(env, data->data); + bp += data->size; + } + + uinttmp = (u_int32_t)pg_copy; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
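+	 * (Under DIAGNOSTIC, logrec.data was allocated separately from
+	 * lr, which is why it is unconditionally freed at the end of
+	 * this function.)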
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_merge_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_pgno_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __bam_pgno_args **)); + */ +int +__bam_pgno_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __bam_pgno_args **argpp; +{ + __bam_pgno_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__bam_pgno_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->indx, bp); + bp += sizeof(argp->indx); + + LOGCOPY_32(env, &uinttmp, bp); + argp->opgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->npgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __bam_pgno_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, u_int32_t, db_pgno_t, + * PUBLIC: db_pgno_t)); + */ +int +__bam_pgno_log(dbp, txnp, ret_lsnp, flags, pgno, lsn, indx, opgno, npgno) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * lsn; + u_int32_t indx; + db_pgno_t opgno; + db_pgno_t npgno; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___bam_pgno; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
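+	 * Note the file must also have a valid dbreg id before its
+	 * first record is logged; __dbreg_lazy_id assigns one on
+	 * demand below.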
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + uinttmp = (u_int32_t)opgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)npgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
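+	 * Without DIAGNOSTIC, ret simply remains 0 on this path:
+	 * queueing the record in memory cannot fail.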
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__bam_pgno_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __bam_init_recover __P((ENV *, DB_DISTAB *)); + */ +int +__bam_init_recover(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_split_recover, DB___bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_rsplit_recover, DB___bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_adj_recover, DB___bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_cadjust_recover, DB___bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_cdel_recover, DB___bam_cdel)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_repl_recover, DB___bam_repl)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_root_recover, DB___bam_root)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_curadj_recover, DB___bam_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_rcuradj_recover, DB___bam_rcuradj)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_relink_recover, DB___bam_relink)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_merge_recover, DB___bam_merge)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_pgno_recover, DB___bam_pgno)) != 0) + return (ret); + return (0); +} diff --git a/btree/btree_autop.c b/btree/btree_autop.c new file mode 100644 index 0000000..54cb501 --- /dev/null +++ b/btree/btree_autop.c @@ -0,0 +1,766 @@ +/* Do not edit: automatically built by gen_rec.awk. */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/btree.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __bam_split_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_split_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_split_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_split_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_split%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tleft: %lu\n", (u_long)argp->left); + (void)printf("\tllsn: [%lu][%lu]\n", + (u_long)argp->llsn.file, (u_long)argp->llsn.offset); + (void)printf("\tright: %lu\n", (u_long)argp->right); + (void)printf("\trlsn: [%lu][%lu]\n", + (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno); + (void)printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + (void)printf("\tppgno: %lu\n", (u_long)argp->ppgno); + (void)printf("\tplsn: [%lu][%lu]\n", + (u_long)argp->plsn.file, (u_long)argp->plsn.offset); + (void)printf("\tpindx: %lu\n", (u_long)argp->pindx); + (void)printf("\tpg: "); + for (i = 0; i < argp->pg.size; i++) { + ch = ((u_int8_t *)argp->pg.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tpentry: "); + for (i = 0; i < argp->pentry.size; i++) { + ch = ((u_int8_t *)argp->pentry.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\trentry: "); + for (i = 0; i < argp->rentry.size; i++) { + ch = ((u_int8_t *)argp->rentry.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\topflags: %lu\n", (u_long)argp->opflags); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_split_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_split_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_split_42_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_split_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_split_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tleft: %lu\n", (u_long)argp->left); + (void)printf("\tllsn: [%lu][%lu]\n", + (u_long)argp->llsn.file, (u_long)argp->llsn.offset); + (void)printf("\tright: %lu\n", (u_long)argp->right); + (void)printf("\trlsn: [%lu][%lu]\n", + (u_long)argp->rlsn.file, (u_long)argp->rlsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno); + (void)printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + (void)printf("\tpg: "); + for (i = 0; i < argp->pg.size; i++) { + ch = ((u_int8_t *)argp->pg.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\topflags: %lu\n", (u_long)argp->opflags); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_rsplit_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_rsplit_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_rsplit_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_rsplit_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_rsplit%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tpgdbt: "); + for (i = 0; i < argp->pgdbt.size; i++) { + ch = ((u_int8_t *)argp->pgdbt.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + (void)printf("\tnrec: %lu\n", (u_long)argp->nrec); + (void)printf("\trootent: "); + for (i = 0; i < argp->rootent.size; i++) { + ch = ((u_int8_t *)argp->rootent.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\trootlsn: [%lu][%lu]\n", + (u_long)argp->rootlsn.file, (u_long)argp->rootlsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_adj_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_adj_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_adj_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_adj_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_adj%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tindx_copy: %lu\n", (u_long)argp->indx_copy); + (void)printf("\tis_insert: %lu\n", (u_long)argp->is_insert); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_cadjust_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_cadjust_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_cadjust_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_cadjust_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_cadjust%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tadjust: %ld\n", (long)argp->adjust); + (void)printf("\topflags: %lu\n", (u_long)argp->opflags); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_cdel_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_cdel_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_cdel_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_cdel_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_cdel%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_repl_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_repl_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_repl_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_repl_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_repl%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tisdeleted: %lu\n", (u_long)argp->isdeleted); + (void)printf("\torig: "); + for (i = 0; i < argp->orig.size; i++) { + ch = ((u_int8_t *)argp->orig.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\trepl: "); + for (i = 0; i < argp->repl.size; i++) { + ch = ((u_int8_t *)argp->repl.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tprefix: %lu\n", (u_long)argp->prefix); + (void)printf("\tsuffix: %lu\n", (u_long)argp->suffix); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_root_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_root_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_root_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_root_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_root%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\troot_pgno: %lu\n", (u_long)argp->root_pgno); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_curadj_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_curadj_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_curadj_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_curadj_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_curadj%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmode: %ld\n", (long)argp->mode); + (void)printf("\tfrom_pgno: %lu\n", (u_long)argp->from_pgno); + (void)printf("\tto_pgno: %lu\n", (u_long)argp->to_pgno); + (void)printf("\tleft_pgno: %lu\n", (u_long)argp->left_pgno); + (void)printf("\tfirst_indx: %lu\n", (u_long)argp->first_indx); + (void)printf("\tfrom_indx: %lu\n", (u_long)argp->from_indx); + (void)printf("\tto_indx: %lu\n", (u_long)argp->to_indx); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_rcuradj_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_rcuradj_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_rcuradj_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_rcuradj_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_rcuradj%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmode: %ld\n", (long)argp->mode); + (void)printf("\troot: %ld\n", (long)argp->root); + (void)printf("\trecno: %ld\n", (long)argp->recno); + (void)printf("\torder: %lu\n", (u_long)argp->order); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_relink_43_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_relink_43_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_relink_43_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_relink_43_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_relink_43%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tprev: %lu\n", (u_long)argp->prev); + (void)printf("\tlsn_prev: [%lu][%lu]\n", + (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tlsn_next: [%lu][%lu]\n", + (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_relink_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_relink_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_relink_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_relink_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_relink%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tnew_pgno: %lu\n", (u_long)argp->new_pgno); + (void)printf("\tprev: %lu\n", (u_long)argp->prev); + (void)printf("\tlsn_prev: [%lu][%lu]\n", + (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tlsn_next: [%lu][%lu]\n", + (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_merge_44_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_merge_44_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_merge_44_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_merge_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_merge_44%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno); + (void)printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + (void)printf("\thdr: "); + for (i = 0; i < argp->hdr.size; i++) { + ch = ((u_int8_t *)argp->hdr.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tind: "); + for (i = 0; i < argp->ind.size; i++) { + ch = ((u_int8_t *)argp->ind.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_merge_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_merge_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_merge_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_merge_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_merge%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno); + (void)printf("\tnlsn: [%lu][%lu]\n", + (u_long)argp->nlsn.file, (u_long)argp->nlsn.offset); + (void)printf("\thdr: "); + for (i = 0; i < argp->hdr.size; i++) { + ch = ((u_int8_t *)argp->hdr.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tpg_copy: %lu\n", (u_long)argp->pg_copy); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_pgno_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__bam_pgno_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __bam_pgno_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __bam_pgno_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__bam_pgno%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\topgno: %lu\n", (u_long)argp->opgno); + (void)printf("\tnpgno: %lu\n", (u_long)argp->npgno); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __bam_init_print __P((ENV *, DB_DISTAB *)); + */ +int +__bam_init_print(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_split_print, DB___bam_split)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_rsplit_print, DB___bam_rsplit)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_adj_print, DB___bam_adj)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_cadjust_print, DB___bam_cadjust)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_cdel_print, DB___bam_cdel)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_repl_print, DB___bam_repl)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_root_print, DB___bam_root)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_curadj_print, DB___bam_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_rcuradj_print, DB___bam_rcuradj)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_relink_print, DB___bam_relink)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_merge_print, DB___bam_merge)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __bam_pgno_print, DB___bam_pgno)) != 0) + 
+	return (0);
+}
diff --git a/btree/extern.h b/btree/extern.h
deleted file mode 100644
index ebd9c54..0000000
--- a/btree/extern.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*-
- * Copyright (c) 1991, 1993, 1994
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * @(#)extern.h	8.10 (Berkeley) 7/20/94
- */
-
-int __bt_close __P((DB *));
-int __bt_cmp __P((BTREE *, const DBT *, EPG *));
-int __bt_crsrdel __P((BTREE *, EPGNO *));
-int __bt_defcmp __P((const DBT *, const DBT *));
-size_t __bt_defpfx __P((const DBT *, const DBT *));
-int __bt_delete __P((const DB *, const DBT *, u_int));
-int __bt_dleaf __P((BTREE *, const DBT *, PAGE *, u_int));
-int __bt_fd __P((const DB *));
-int __bt_free __P((BTREE *, PAGE *));
-int __bt_get __P((const DB *, const DBT *, DBT *, u_int));
-PAGE *__bt_new __P((BTREE *, pgno_t *));
-void __bt_pgin __P((void *, pgno_t, void *));
-void __bt_pgout __P((void *, pgno_t, void *));
-int __bt_push __P((BTREE *, pgno_t, int));
-int __bt_put __P((const DB *dbp, DBT *, const DBT *, u_int));
-int __bt_ret __P((BTREE *, EPG *, DBT *, DBT *, DBT *, DBT *, int));
-EPG *__bt_search __P((BTREE *, const DBT *, int *));
-int __bt_seq __P((const DB *, DBT *, DBT *, u_int));
-void __bt_setcur __P((BTREE *, pgno_t, u_int));
-int __bt_split __P((BTREE *, PAGE *,
-    const DBT *, const DBT *, int, size_t, u_int32_t));
-int __bt_sync __P((const DB *, u_int));
-
-int __ovfl_delete __P((BTREE *, void *));
-int __ovfl_get __P((BTREE *, void *, size_t *, void **, size_t *));
-int __ovfl_put __P((BTREE *, const DBT *, pgno_t *));
-
-#ifdef DEBUG
-void __bt_dnpage __P((DB *, pgno_t));
-void __bt_dpage __P((PAGE *));
-void __bt_dump __P((DB *));
-#endif
-#ifdef STATISTICS
-void __bt_stat __P((DB *));
-#endif
diff --git a/btree/tags b/btree/tags
deleted file mode 120000
index 7ab656b..0000000
--- a/btree/tags
+++ /dev/null
@@ -1 +0,0 @@
-../db/tags
\ No newline at end of file