diff options
author | Kim Kibum <kb0929.kim@samsung.com> | 2012-05-21 17:40:46 +0900 |
---|---|---|
committer | Kim Kibum <kb0929.kim@samsung.com> | 2012-05-21 17:40:46 +0900 |
commit | 2e082c838d2ca750f5daac6dcdabecc22dfd4e46 (patch) | |
tree | 01c1dd87d4cc0b62a655c0d768ff695d2d244728 /db | |
parent | a86e3ca152fb414b376e64c449c201d762e414dd (diff) | |
download | db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.gz db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.tar.bz2 db4-2e082c838d2ca750f5daac6dcdabecc22dfd4e46.zip |
Upload Tizen:Base source
Diffstat (limited to 'db')
-rw-r--r-- | db/crdel.src | 72 | ||||
-rw-r--r-- | db/crdel_auto.c | 945 | ||||
-rw-r--r-- | db/crdel_autop.c | 227 | ||||
-rw-r--r-- | db/crdel_rec.c | 298 | ||||
-rw-r--r-- | db/db.c | 1539 | ||||
-rw-r--r-- | db/db.src | 328 | ||||
-rw-r--r-- | db/db_am.c | 1015 | ||||
-rw-r--r-- | db/db_auto.c | 3267 | ||||
-rw-r--r-- | db/db_autop.c | 802 | ||||
-rw-r--r-- | db/db_cam.c | 3460 | ||||
-rw-r--r-- | db/db_cds.c | 177 | ||||
-rw-r--r-- | db/db_conv.c | 733 | ||||
-rw-r--r-- | db/db_dispatch.c | 953 | ||||
-rw-r--r-- | db/db_dup.c | 203 | ||||
-rw-r--r-- | db/db_iface.c | 2817 | ||||
-rw-r--r-- | db/db_join.c | 940 | ||||
-rw-r--r-- | db/db_meta.c | 1299 | ||||
-rw-r--r-- | db/db_method.c | 1052 | ||||
-rw-r--r-- | db/db_open.c | 628 | ||||
-rw-r--r-- | db/db_overflow.c | 706 | ||||
-rw-r--r-- | db/db_ovfl_vrfy.c | 409 | ||||
-rw-r--r-- | db/db_pr.c | 1659 | ||||
-rw-r--r-- | db/db_rec.c | 1859 | ||||
-rw-r--r-- | db/db_reclaim.c | 246 | ||||
-rw-r--r-- | db/db_remove.c | 492 | ||||
-rw-r--r-- | db/db_rename.c | 372 | ||||
-rw-r--r-- | db/db_ret.c | 156 | ||||
-rw-r--r-- | db/db_setid.c | 213 | ||||
-rw-r--r-- | db/db_setlsn.c | 137 | ||||
-rw-r--r-- | db/db_sort_multiple.c | 287 | ||||
-rw-r--r-- | db/db_stati.c | 494 | ||||
-rw-r--r-- | db/db_truncate.c | 225 | ||||
-rw-r--r-- | db/db_upg.c | 510 | ||||
-rw-r--r-- | db/db_upg_opd.c | 343 | ||||
-rw-r--r-- | db/db_vrfy.c | 2894 | ||||
-rw-r--r-- | db/db_vrfy_stub.c | 117 | ||||
-rw-r--r-- | db/db_vrfyutil.c | 916 | ||||
-rw-r--r-- | db/partition.c | 2048 |
38 files changed, 34838 insertions, 0 deletions
diff --git a/db/crdel.src b/db/crdel.src new file mode 100644 index 0000000..cd0b02f --- /dev/null +++ b/db/crdel.src @@ -0,0 +1,72 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +DBPRIVATE +PREFIX __crdel + +INCLUDE #include "db_int.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_dispatch.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/txn.h" +INCLUDE + +/* + * Metasub: log the creation of a subdatabase meta data page. + * + * fileid: identifies the file being acted upon. + * pgno: page number on which to write this meta-data page + * page: the actual meta-data page + * lsn: lsn of the page. + */ +BEGIN metasub 42 142 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +PGDBT page DBT s +POINTER lsn DB_LSN * lu +END + +/* + * Inmem_create: Log the creation of an in-memory database. + * + * name: Name of the database + * fid: File id of the database + */ +BEGIN inmem_create 44 138 +ARG fileid int32_t ld +DBT name DBT s +DBT fid DBT s +ARG pgsize u_int32_t lu +END + +/* + * Inmem_rename: Log the renaming of an in-memory only database. + * + * oldname: database's starting name + * newname: database's ending name + * fid: fileid + */ +BEGIN inmem_rename 44 139 +DBT oldname DBT s +DBT newname DBT s +DBT fid DBT s +END + +/* + * Inmem_remove: Log the removal of an in-memory only database. + * + * name: database's ending name + * fid: fileid + */ +BEGIN inmem_remove 44 140 +DBT name DBT s +DBT fid DBT s +END + diff --git a/db/crdel_auto.c b/db/crdel_auto.c new file mode 100644 index 0000000..801a0a5 --- /dev/null +++ b/db/crdel_auto.c @@ -0,0 +1,945 @@ +/* Do not edit: automatically built by gen_rec.awk. 
*/ + +#include "db_config.h" +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __crdel_metasub_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __crdel_metasub_args **)); + */ +int +__crdel_metasub_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __crdel_metasub_args **argpp; +{ + __crdel_metasub_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__crdel_metasub_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->page, 0, sizeof(argp->page)); + LOGCOPY_32(env,&argp->page.size, bp); + bp += sizeof(u_int32_t); + argp->page.data = bp; + bp += argp->page.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->page.data, + (size_t)argp->page.size, NULL, 1)) != 0) + return (t_ret); + } + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __crdel_metasub_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, const DBT *, DB_LSN *)); + */ +int +__crdel_metasub_log(dbp, txnp, ret_lsnp, flags, pgno, page, lsn) + DB *dbp; + DB_TXN *txnp; + 
DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + const DBT *page; + DB_LSN * lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___crdel_metasub; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (page == NULL ? 
0 : page->size) + + sizeof(*lsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (page == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &page->size); + bp += sizeof(page->size); + memcpy(bp, page->data, page->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)page->size, (DBT *)NULL, 0)) != 0) + return (ret); + bp += page->size; + } + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC 
+ /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__crdel_metasub_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __crdel_inmem_create_read __P((ENV *, void *, + * PUBLIC: __crdel_inmem_create_args **)); + */ +int +__crdel_inmem_create_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __crdel_inmem_create_args **argpp; +{ + __crdel_inmem_create_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__crdel_inmem_create_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->name, 0, sizeof(argp->name)); + LOGCOPY_32(env,&argp->name.size, bp); + bp += sizeof(u_int32_t); + argp->name.data = bp; + bp += argp->name.size; + + memset(&argp->fid, 0, sizeof(argp->fid)); + LOGCOPY_32(env,&argp->fid.size, bp); + bp += sizeof(u_int32_t); + argp->fid.data = bp; + bp += argp->fid.size; + + LOGCOPY_32(env, &argp->pgsize, bp); + bp += sizeof(argp->pgsize); + + 
*argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __crdel_inmem_create_log __P((ENV *, DB_TXN *, + * PUBLIC: DB_LSN *, u_int32_t, int32_t, const DBT *, const DBT *, + * PUBLIC: u_int32_t)); + */ +int +__crdel_inmem_create_log(env, txnp, ret_lsnp, flags, + fileid, name, fid, pgsize) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + const DBT *name; + const DBT *fid; + u_int32_t pgsize; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___crdel_inmem_create; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (name == NULL ? 0 : name->size) + + sizeof(u_int32_t) + (fid == NULL ? 
0 : fid->size) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)fileid; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (name == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &name->size); + bp += sizeof(name->size); + memcpy(bp, name->data, name->size); + bp += name->size; + } + + if (fid == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &fid->size); + bp += sizeof(fid->size); + memcpy(bp, fid->data, fid->size); + bp += fid->size; + } + + LOGCOPY_32(env, bp, &pgsize); + bp += sizeof(pgsize); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__crdel_inmem_create_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __crdel_inmem_rename_read __P((ENV *, void *, + * PUBLIC: __crdel_inmem_rename_args **)); + */ +int +__crdel_inmem_rename_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __crdel_inmem_rename_args **argpp; +{ + __crdel_inmem_rename_args *argp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__crdel_inmem_rename_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->oldname, 0, sizeof(argp->oldname)); + LOGCOPY_32(env,&argp->oldname.size, bp); + bp += sizeof(u_int32_t); + argp->oldname.data = bp; + bp += argp->oldname.size; + + memset(&argp->newname, 0, sizeof(argp->newname)); + LOGCOPY_32(env,&argp->newname.size, bp); + bp += sizeof(u_int32_t); + argp->newname.data = bp; + bp += argp->newname.size; + + memset(&argp->fid, 0, sizeof(argp->fid)); + LOGCOPY_32(env,&argp->fid.size, bp); + bp += sizeof(u_int32_t); + argp->fid.data = bp; + bp += argp->fid.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __crdel_inmem_rename_log __P((ENV *, DB_TXN *, + * 
PUBLIC: DB_LSN *, u_int32_t, const DBT *, const DBT *, const DBT *)); + */ +int +__crdel_inmem_rename_log(env, txnp, ret_lsnp, flags, + oldname, newname, fid) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + const DBT *oldname; + const DBT *newname; + const DBT *fid; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t zero, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___crdel_inmem_rename; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + (oldname == NULL ? 0 : oldname->size) + + sizeof(u_int32_t) + (newname == NULL ? 0 : newname->size) + + sizeof(u_int32_t) + (fid == NULL ? 
0 : fid->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + if (oldname == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &oldname->size); + bp += sizeof(oldname->size); + memcpy(bp, oldname->data, oldname->size); + bp += oldname->size; + } + + if (newname == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &newname->size); + bp += sizeof(newname->size); + memcpy(bp, newname->data, newname->size); + bp += newname->size; + } + + if (fid == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &fid->size); + bp += sizeof(fid->size); + memcpy(bp, fid->data, fid->size); + bp += fid->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__crdel_inmem_rename_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __crdel_inmem_remove_read __P((ENV *, void *, + * PUBLIC: __crdel_inmem_remove_args **)); + */ +int +__crdel_inmem_remove_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __crdel_inmem_remove_args **argpp; +{ + __crdel_inmem_remove_args *argp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__crdel_inmem_remove_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->name, 0, sizeof(argp->name)); + LOGCOPY_32(env,&argp->name.size, bp); + bp += sizeof(u_int32_t); + argp->name.data = bp; + bp += argp->name.size; + + memset(&argp->fid, 0, sizeof(argp->fid)); + LOGCOPY_32(env,&argp->fid.size, bp); + bp += sizeof(u_int32_t); + argp->fid.data = bp; + bp += argp->fid.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __crdel_inmem_remove_log __P((ENV *, DB_TXN *, + * PUBLIC: DB_LSN *, u_int32_t, const DBT *, const DBT *)); + */ +int +__crdel_inmem_remove_log(env, txnp, ret_lsnp, flags, + name, fid) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t 
flags; + const DBT *name; + const DBT *fid; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t zero, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___crdel_inmem_remove; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + (name == NULL ? 0 : name->size) + + sizeof(u_int32_t) + (fid == NULL ? 
0 : fid->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + if (name == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &name->size); + bp += sizeof(name->size); + memcpy(bp, name->data, name->size); + bp += name->size; + } + + if (fid == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &fid->size); + bp += sizeof(fid->size); + memcpy(bp, fid->data, fid->size); + bp += fid->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__crdel_inmem_remove_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __crdel_init_recover __P((ENV *, DB_DISTAB *)); + */ +int +__crdel_init_recover(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_metasub_recover, DB___crdel_metasub)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_inmem_create_recover, DB___crdel_inmem_create)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_inmem_rename_recover, DB___crdel_inmem_rename)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_inmem_remove_recover, DB___crdel_inmem_remove)) != 0) + return (ret); + return (0); +} diff --git a/db/crdel_autop.c b/db/crdel_autop.c new file mode 100644 index 0000000..6bf4bb6 --- /dev/null +++ b/db/crdel_autop.c @@ -0,0 +1,227 @@ +/* Do not edit: automatically built by gen_rec.awk. 
*/ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __crdel_metasub_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__crdel_metasub_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __crdel_metasub_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __crdel_metasub_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__crdel_metasub%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tpage: "); + for (i = 0; i < argp->page.size; i++) { + ch = ((u_int8_t *)argp->page.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __crdel_inmem_create_print __P((ENV *, DBT *, + * PUBLIC: DB_LSN *, db_recops, void *)); + */ +int +__crdel_inmem_create_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __crdel_inmem_create_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __crdel_inmem_create_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__crdel_inmem_create%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tname: "); + for (i = 0; i < argp->name.size; i++) { + ch = ((u_int8_t *)argp->name.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tfid: "); + for (i = 0; i < argp->fid.size; i++) { + ch = ((u_int8_t *)argp->fid.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tpgsize: %lu\n", (u_long)argp->pgsize); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __crdel_inmem_rename_print __P((ENV *, DBT *, + * PUBLIC: DB_LSN *, db_recops, void *)); + */ +int +__crdel_inmem_rename_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __crdel_inmem_rename_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __crdel_inmem_rename_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__crdel_inmem_rename%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\toldname: "); + for (i = 0; i < argp->oldname.size; i++) { + ch = ((u_int8_t *)argp->oldname.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tnewname: "); + for (i = 0; i < argp->newname.size; i++) { + ch = ((u_int8_t *)argp->newname.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tfid: "); + for (i = 0; i < argp->fid.size; i++) { + ch = ((u_int8_t *)argp->fid.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __crdel_inmem_remove_print __P((ENV *, DBT *, + * PUBLIC: DB_LSN *, db_recops, void *)); + */ +int +__crdel_inmem_remove_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __crdel_inmem_remove_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __crdel_inmem_remove_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__crdel_inmem_remove%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tname: "); + for (i = 0; i < argp->name.size; i++) { + ch = ((u_int8_t *)argp->name.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tfid: "); + for (i = 0; i < argp->fid.size; i++) { + ch = ((u_int8_t *)argp->fid.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __crdel_init_print __P((ENV *, DB_DISTAB *)); + */ +int +__crdel_init_print(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_metasub_print, DB___crdel_metasub)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_inmem_create_print, DB___crdel_inmem_create)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_inmem_rename_print, DB___crdel_inmem_rename)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __crdel_inmem_remove_print, DB___crdel_inmem_remove)) != 0) + return (ret); + return (0); +} diff --git a/db/crdel_rec.c b/db/crdel_rec.c new file mode 100644 index 0000000..285b965 --- /dev/null +++ b/db/crdel_rec.c @@ -0,0 +1,298 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/fop.h" +#include "dbinc/hash.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +/* + * __crdel_metasub_recover -- + * Recovery function for metasub. 
+ * + * PUBLIC: int __crdel_metasub_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__crdel_metasub_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __crdel_metasub_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_p, ret, t_ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__crdel_metasub_print); + REC_INTRO(__crdel_metasub_read, ip, 0); + + /* + * If we are undoing this operation, but the DB that we got back + * was never really opened, then this open was an in-memory open + * that did not finish. We can let the file creation take care + * of any necessary undo/cleanup. + */ + if (DB_UNDO(op) && !F_ISSET(file_dbp, DB_AM_OPEN_CALLED)) + goto done; + + if ((ret = __memp_fget(mpf, &argp->pgno, + ip, NULL, 0, &pagep)) != 0) { + /* If this is an in-memory file, this might be OK. */ + if (F_ISSET(file_dbp, DB_AM_INMEM) && + (ret = __memp_fget(mpf, &argp->pgno, ip, NULL, + DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &pagep)) == 0) { + LSN_NOT_LOGGED(LSN(pagep)); + } else { + *lsnp = argp->prev_lsn; + ret = 0; + goto out; + } + } + + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + + if (cmp_p == 0 && DB_REDO(op)) { + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + memcpy(pagep, argp->page.data, argp->page.size); + LSN(pagep) = *lsnp; + + /* + * If this was an in-memory database and we are re-creating + * and this is the meta-data page, then we need to set up a + * bunch of fields in the dbo as well. + */ + if (F_ISSET(file_dbp, DB_AM_INMEM) && + argp->pgno == PGNO_BASE_MD && + (ret = __db_meta_setup(file_dbp->env, file_dbp, + file_dbp->dname, (DBMETA *)pagep, 0, DB_CHK_META)) != 0) + goto out; + } else if (DB_UNDO(op)) { + /* + * We want to undo this page creation. The page creation + * happened in two parts. 
First, we called __db_pg_alloc which + * was logged separately. Then we wrote the meta-data onto + * the page. So long as we restore the LSN, then the recovery + * for __db_pg_alloc will do everything else. + * + * Don't bother checking the lsn on the page. If we are + * rolling back the next thing is that this page will get + * freed. Opening the subdb will have reinitialized the + * page, but not the lsn. + */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + LSN(pagep) = argp->lsn; + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL && (t_ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0 && + ret == 0) + ret = t_ret; + + REC_CLOSE; +} + +/* + * __crdel_inmem_create_recover -- + * Recovery function for inmem_create. + * + * PUBLIC: int __crdel_inmem_create_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__crdel_inmem_create_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __crdel_inmem_create_args *argp; + DB *dbp; + int do_close, ret, t_ret; + + COMPQUIET(info, NULL); + + dbp = NULL; + do_close = 0; + REC_PRINT(__crdel_inmem_create_print); + REC_NOOP_INTRO(__crdel_inmem_create_read); + + /* First, see if the DB handle already exists. */ + if (argp->fileid == DB_LOGFILEID_INVALID) { + if (DB_REDO(op)) + ret = ENOENT; + else + ret = 0; + } else + ret = __dbreg_id_to_db(env, argp->txnp, &dbp, argp->fileid, 0); + + if (DB_REDO(op)) { + /* + * If the dbreg failed, that means that we're creating a + * tmp file. + */ + if (ret != 0) { + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + goto out; + + F_SET(dbp, DB_AM_RECOVER | DB_AM_INMEM); + memcpy(dbp->fileid, argp->fid.data, DB_FILE_ID_LEN); + if (((ret = __os_strdup(env, + argp->name.data, &dbp->dname)) != 0)) + goto out; + + /* + * This DBP is never going to be entered into the + * dbentry table, so if we leave it open here, + * then we're going to lose it. 
+ */ + do_close = 1; + } + + /* Now, set the fileid. */ + memcpy(dbp->fileid, argp->fid.data, argp->fid.size); + if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0) + goto out; + dbp->preserve_fid = 1; + MAKE_INMEM(dbp); + if ((ret = __env_setup(dbp, + NULL, NULL, argp->name.data, TXN_INVALID, 0)) != 0) + goto out; + ret = __env_mpool(dbp, argp->name.data, 0); + + if (ret == ENOENT) { + dbp->pgsize = argp->pgsize; + if ((ret = __env_mpool(dbp, + argp->name.data, DB_CREATE)) != 0) + goto out; + } else if (ret != 0) + goto out; + } + + if (DB_UNDO(op)) { + if (ret == 0) + ret = __memp_nameop(env, argp->fid.data, NULL, + (const char *)argp->name.data, NULL, 1); + + if (ret == ENOENT || ret == DB_DELETED) + ret = 0; + else + goto out; + } + + *lsnp = argp->prev_lsn; + +out: if (dbp != NULL) { + t_ret = 0; + + if (do_close || ret != 0) + t_ret = __db_close(dbp, NULL, DB_NOSYNC); + if (t_ret != 0 && ret == 0) + ret = t_ret; + } + REC_NOOP_CLOSE; +} + +/* + * __crdel_inmem_rename_recover -- + * Recovery function for inmem_rename. + * + * PUBLIC: int __crdel_inmem_rename_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__crdel_inmem_rename_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __crdel_inmem_rename_args *argp; + u_int8_t *fileid; + int ret; + + COMPQUIET(info, NULL); + + REC_PRINT(__crdel_inmem_rename_print); + REC_NOOP_INTRO(__crdel_inmem_rename_read); + fileid = argp->fid.data; + + /* Void out errors because the files may or may not still exist. 
*/ + if (DB_REDO(op)) + (void)__memp_nameop(env, fileid, + (const char *)argp->newname.data, + (const char *)argp->oldname.data, + (const char *)argp->newname.data, 1); + + if (DB_UNDO(op)) + (void)__memp_nameop(env, fileid, + (const char *)argp->oldname.data, + (const char *)argp->newname.data, + (const char *)argp->oldname.data, 1); + + *lsnp = argp->prev_lsn; + ret = 0; + + REC_NOOP_CLOSE; +} + +/* + * __crdel_inmem_remove_recover -- + * Recovery function for inmem_remove. + * + * PUBLIC: int __crdel_inmem_remove_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__crdel_inmem_remove_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __crdel_inmem_remove_args *argp; + int ret; + + COMPQUIET(info, NULL); + + REC_PRINT(__crdel_inmem_remove_print); + REC_NOOP_INTRO(__crdel_inmem_remove_read); + + /* + * Since removes are delayed; there is no undo for a remove; only redo. + * The remove may fail, which is OK. + */ + if (DB_REDO(op)) { + (void)__memp_nameop(env, + argp->fid.data, NULL, argp->name.data, NULL, 1); + } + + *lsnp = argp->prev_lsn; + ret = 0; + + REC_NOOP_CLOSE; +} @@ -0,0 +1,1539 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/fop.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +static int __db_disassociate __P((DB *)); +static int __db_disassociate_foreign __P ((DB *)); + +#ifdef CONFIG_TEST +static int __db_makecopy __P((ENV *, const char *, const char *)); +static int __qam_testdocopy __P((DB *, const char *)); +#endif + +/* + * DB.C -- + * This file contains the utility functions for the DBP layer. + */ + +/* + * __db_master_open -- + * Open up a handle on a master database. 
+ * + * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, u_int32_t, int, DB **)); + */ +int +__db_master_open(subdbp, ip, txn, name, flags, mode, dbpp) + DB *subdbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name; + u_int32_t flags; + int mode; + DB **dbpp; +{ + DB *dbp; + int ret; + + *dbpp = NULL; + + /* Open up a handle on the main database. */ + if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0) + return (ret); + + /* + * It's always a btree. + * Run in the transaction we've created. + * Set the pagesize in case we're creating a new database. + * Flag that we're creating a database with subdatabases. + */ + dbp->pgsize = subdbp->pgsize; + F_SET(dbp, DB_AM_SUBDB); + F_SET(dbp, F_ISSET(subdbp, + DB_AM_RECOVER | DB_AM_SWAP | + DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE)); + + /* + * If there was a subdb specified, then we only want to apply + * DB_EXCL to the subdb, not the actual file. We only got here + * because there was a subdb specified. + */ + LF_CLR(DB_EXCL); + LF_SET(DB_RDWRMASTER); + if ((ret = __db_open(dbp, ip, + txn, name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0) + goto err; + + /* + * The items in dbp are initialized from the master file's meta page. + * Other items such as checksum and encryption are checked when we + * read the meta-page, so we do not check those here. However, if + * the meta-page caused checksumming to be turned on and it wasn't + * already, set it here. + */ + if (F_ISSET(dbp, DB_AM_CHKSUM)) + F_SET(subdbp, DB_AM_CHKSUM); + + /* + * The user may have specified a page size for an existing file, + * which we want to ignore. + */ + subdbp->pgsize = dbp->pgsize; + *dbpp = dbp; + + if (0) { +err: if (!F_ISSET(dbp, DB_AM_DISCARD)) + (void)__db_close(dbp, txn, 0); + } + + return (ret); +} + +/* + * __db_master_update -- + * Add/Open/Remove a subdatabase from a master database. 
+ * + * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: const char *, DBTYPE, mu_action, const char *, u_int32_t)); + */ +int +__db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags) + DB *mdbp, *sdbp; + DB_TXN *txn; + DB_THREAD_INFO *ip; + const char *subdb; + DBTYPE type; + mu_action action; + const char *newname; + u_int32_t flags; +{ + DBC *dbc, *ndbc; + DBT key, data, ndata; + ENV *env; + PAGE *p, *r; + db_pgno_t t_pgno; + int modify, ret, t_ret; + + env = mdbp->env; + dbc = ndbc = NULL; + p = NULL; + + /* + * Open up a cursor. If this is CDB and we're creating the database, + * make it an update cursor. + * + * Might we modify the master database? If so, we'll need to lock. + */ + modify = (action != MU_OPEN || LF_ISSET(DB_CREATE)) ? 1 : 0; + + if ((ret = __db_cursor(mdbp, ip, txn, &dbc, + (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0) + return (ret); + + /* + * Point the cursor at the record. + * + * If we're removing or potentially creating an entry, lock the page + * with DB_RMW. + * + * We do multiple cursor operations with the cursor in some cases and + * subsequently access the data DBT information. Set DB_DBT_MALLOC so + * we don't risk modification of the data between our uses of it. + * + * !!! + * We don't include the name's nul termination in the database. + */ + DB_INIT_DBT(key, subdb, strlen(subdb)); + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_MALLOC); + + ret = __dbc_get(dbc, &key, &data, + DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0)); + + /* + * What we do next--whether or not we found a record for the + * specified subdatabase--depends on what the specified action is. + * Handle ret appropriately as the first statement of each case. + */ + switch (action) { + case MU_REMOVE: + /* + * We should have found something if we're removing it. 
Note + * that in the common case where the DB we're asking to remove + * doesn't exist, we won't get this far; __db_subdb_remove + * will already have returned an error from __db_open. + */ + if (ret != 0) + goto err; + + /* + * Delete the subdatabase entry first; if this fails, + * we don't want to touch the actual subdb pages. + */ + if ((ret = __dbc_del(dbc, 0)) != 0) + goto err; + + /* + * We're handling actual data, not on-page meta-data, + * so it hasn't been converted to/from opposite + * endian architectures. Do it explicitly, now. + */ + memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t)); + DB_NTOHL_SWAP(env, &sdbp->meta_pgno); + if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno, + ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0) + goto err; + + /* Free the root on the master db if it was created. */ + if (TYPE(p) == P_BTREEMETA && + ((BTMETA *)p)->root != PGNO_INVALID) { + if ((ret = __memp_fget(mdbp->mpf, + &((BTMETA *)p)->root, ip, dbc->txn, + DB_MPOOL_DIRTY, &r)) != 0) + goto err; + + /* Free and put the page. */ + if ((ret = __db_free(dbc, r)) != 0) { + r = NULL; + goto err; + } + } + /* Free and put the page. */ + if ((ret = __db_free(dbc, p)) != 0) { + p = NULL; + goto err; + } + p = NULL; + break; + case MU_RENAME: + /* We should have found something if we're renaming it. */ + if (ret != 0) + goto err; + + /* + * Before we rename, we need to make sure we're not + * overwriting another subdatabase, or else this operation + * won't be undoable. Open a second cursor and check + * for the existence of newname; it shouldn't appear under + * us since we hold the metadata lock. + */ + if ((ret = __db_cursor(mdbp, ip, txn, &ndbc, + CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0) + goto err; + DB_SET_DBT(key, newname, strlen(newname)); + + /* + * We don't actually care what the meta page of the potentially- + * overwritten DB is; we just care about existence. 
+ */ + memset(&ndata, 0, sizeof(ndata)); + F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) { + /* A subdb called newname exists. Bail. */ + ret = EEXIST; + __db_errx(env, "rename: database %s exists", newname); + goto err; + } else if (ret != DB_NOTFOUND) + goto err; + + /* + * Now do the put first; we don't want to lose our only + * reference to the subdb. Use the second cursor so the + * first one continues to point to the old record. + */ + if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0) + goto err; + if ((ret = __dbc_del(dbc, 0)) != 0) { + /* + * If the delete fails, try to delete the record + * we just put, in case we're not txn-protected. + */ + (void)__dbc_del(ndbc, 0); + goto err; + } + + break; + case MU_OPEN: + /* + * Get the subdatabase information. If it already exists, + * copy out the page number and we're done. + */ + switch (ret) { + case 0: + if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) { + ret = EEXIST; + goto err; + } + memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t)); + DB_NTOHL_SWAP(env, &sdbp->meta_pgno); + goto done; + case DB_NOTFOUND: + if (LF_ISSET(DB_CREATE)) + break; + /* + * No db_err, it is reasonable to remove a + * nonexistent db. + */ + ret = ENOENT; + goto err; + default: + goto err; + } + + /* Create a subdatabase. */ + if ((ret = __db_new(dbc, + type == DB_HASH ? P_HASHMETA : P_BTREEMETA, NULL, &p)) != 0) + goto err; + sdbp->meta_pgno = PGNO(p); + + /* + * XXX + * We're handling actual data, not on-page meta-data, so it + * hasn't been converted to/from opposite endian architectures. + * Do it explicitly, now. 
+ */ + t_pgno = PGNO(p); + DB_HTONL_SWAP(env, &t_pgno); + memset(&ndata, 0, sizeof(ndata)); + ndata.data = &t_pgno; + ndata.size = sizeof(db_pgno_t); + if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0) + goto err; + F_SET(sdbp, DB_AM_CREATED); + break; + } + +err: +done: /* + * If we allocated a page: if we're successful, mark the page dirty + * and return it to the cache, otherwise, discard/free it. + */ + if (p != NULL && (t_ret = __memp_fput(mdbp->mpf, + dbc->thread_info, p, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + /* Discard the cursor(s) and data. */ + if (data.data != NULL) + __os_ufree(env, data.data); + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __env_setup -- + * Set up the underlying environment during a db_open. + * + * PUBLIC: int __env_setup __P((DB *, + * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t, u_int32_t)); + */ +int +__env_setup(dbp, txn, fname, dname, id, flags) + DB *dbp; + DB_TXN *txn; + const char *fname, *dname; + u_int32_t id, flags; +{ + DB *ldbp; + DB_ENV *dbenv; + ENV *env; + u_int32_t maxid; + int ret; + + env = dbp->env; + dbenv = env->dbenv; + + /* If we don't yet have an environment, it's time to create it. */ + if (!F_ISSET(env, ENV_OPEN_CALLED)) { + /* Make sure we have at least DB_MINCACHE pages in our cache. */ + if (dbenv->mp_gbytes == 0 && + dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE && + (ret = __memp_set_cachesize( + dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0) + return (ret); + + if ((ret = __env_open(dbenv, NULL, DB_CREATE | + DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0) + return (ret); + } + + /* Join the underlying cache. */ + if ((!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) && + (ret = __env_mpool(dbp, fname, flags)) != 0) + return (ret); + + /* We may need a per-thread mutex. 
*/ + if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc( + env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0) + return (ret); + + /* + * Set up a bookkeeping entry for this database in the log region, + * if such a region exists. Note that even if we're in recovery + * or a replication client, where we won't log registries, we'll + * still need an FNAME struct, so LOGGING_ON is the correct macro. + */ + if (LOGGING_ON(env) && dbp->log_filename == NULL +#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC) + && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER)) +#endif +#if !defined(DEBUG_ROP) + && !F_ISSET(dbp, DB_AM_RDONLY) +#endif + ) { + if ((ret = __dbreg_setup(dbp, + F_ISSET(dbp, DB_AM_INMEM) ? dname : fname, + F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0) + return (ret); + + /* + * If we're actively logging and our caller isn't a + * recovery function that already did so, then assign + * this dbp a log fileid. + */ + if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) && + (ret = __dbreg_new_id(dbp, txn)) != 0) + return (ret); + } + + /* + * Insert ourselves into the ENV's dblist. We allocate a + * unique ID to each {fileid, meta page number} pair, and to + * each temporary file (since they all have a zero fileid). + * This ID gives us something to use to tell which DB handles + * go with which databases in all the cursor adjustment + * routines, where we don't want to do a lot of ugly and + * expensive memcmps. + */ + MUTEX_LOCK(env, env->mtx_dblist); + maxid = 0; + TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) { + /* + * There are three cases: on-disk database (first clause), + * named in-memory database (second clause), temporary database + * (never matches; no clause). 
+ */ + if (!F_ISSET(dbp, DB_AM_INMEM)) { + if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN) + == 0 && ldbp->meta_pgno == dbp->meta_pgno) + break; + } else if (dname != NULL) { + if (F_ISSET(ldbp, DB_AM_INMEM) && + ldbp->dname != NULL && + strcmp(ldbp->dname, dname) == 0) + break; + } + if (ldbp->adj_fileid > maxid) + maxid = ldbp->adj_fileid; + } + + /* + * If ldbp is NULL, we didn't find a match. Assign the dbp an + * adj_fileid one higher than the largest we found, and + * insert it at the head of the master dbp list. + * + * If ldbp is not NULL, it is a match for our dbp. Give dbp + * the same ID that ldbp has, and add it after ldbp so they're + * together in the list. + */ + if (ldbp == NULL) { + dbp->adj_fileid = maxid + 1; + TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks); + } else { + dbp->adj_fileid = ldbp->adj_fileid; + TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + return (0); +} + +/* + * __env_mpool -- + * Set up the underlying environment cache during a db_open. + * + * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t)); + */ +int +__env_mpool(dbp, fname, flags) + DB *dbp; + const char *fname; + u_int32_t flags; +{ + DBT pgcookie; + DB_MPOOLFILE *mpf; + DB_PGINFO pginfo; + ENV *env; + int fidset, ftype, ret; + int32_t lsn_off; + u_int8_t nullfid[DB_FILE_ID_LEN]; + u_int32_t clear_len; + + env = dbp->env; + + /* The LSN is the first entry on a DB page, byte offset 0. */ + lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0; + + /* It's possible that this database is already open. */ + if (F_ISSET(dbp, DB_AM_OPEN_CALLED)) + return (0); + + /* + * If we need to pre- or post-process a file's pages on I/O, set the + * file type. If it's a hash file, always call the pgin and pgout + * routines. This means that hash files can never be mapped into + * process memory. If it's a btree file and requires swapping, we + * need to page the file in and out. 
This has to be right -- we can't + * mmap files that are being paged in and out. + */ + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) + ? DB_FTYPE_SET : DB_FTYPE_NOTSET; + clear_len = CRYPTO_ON(env) ? + (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) : + DB_PAGE_DB_LEN; + break; + case DB_HASH: + ftype = DB_FTYPE_SET; + clear_len = CRYPTO_ON(env) ? + (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) : + DB_PAGE_DB_LEN; + break; + case DB_QUEUE: + ftype = F_ISSET(dbp, + DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ? + DB_FTYPE_SET : DB_FTYPE_NOTSET; + + /* + * If we came in here without a pagesize set, then we need + * to mark the in-memory handle as having clear_len not + * set, because we don't really know the clear length or + * the page size yet (since the file doesn't yet exist). + */ + clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET; + break; + case DB_UNKNOWN: + /* + * If we're running in the verifier, our database might + * be corrupt and we might not know its type--but we may + * still want to be able to verify and salvage. + * + * If we can't identify the type, it's not going to be safe + * to call __db_pgin--we pretty much have to give up all + * hope of salvaging cross-endianness. Proceed anyway; + * at worst, the database will just appear more corrupt + * than it actually is, but at best, we may be able + * to salvage some data even with no metadata page. + */ + if (F_ISSET(dbp, DB_AM_VERIFYING)) { + ftype = DB_FTYPE_NOTSET; + clear_len = DB_PAGE_DB_LEN; + break; + } + + /* + * This might be an in-memory file and we won't know its + * file type until after we open it and read the meta-data + * page. 
+ */ + if (F_ISSET(dbp, DB_AM_INMEM)) { + clear_len = DB_CLEARLEN_NOTSET; + ftype = DB_FTYPE_NOTSET; + lsn_off = DB_LSN_OFF_NOTSET; + break; + } + /* FALLTHROUGH */ + default: + return (__db_unknown_type(env, "DB->open", dbp->type)); + } + + mpf = dbp->mpf; + + memset(nullfid, 0, DB_FILE_ID_LEN); + fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN); + if (fidset) + (void)__memp_set_fileid(mpf, dbp->fileid); + + (void)__memp_set_clear_len(mpf, clear_len); + (void)__memp_set_ftype(mpf, ftype); + (void)__memp_set_lsn_offset(mpf, lsn_off); + + pginfo.db_pagesize = dbp->pgsize; + pginfo.flags = + F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP)); + pginfo.type = dbp->type; + pgcookie.data = &pginfo; + pgcookie.size = sizeof(DB_PGINFO); + (void)__memp_set_pgcookie(mpf, &pgcookie); + +#ifndef DIAG_MVCC + if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION)) +#endif + if (F_ISSET(dbp, DB_AM_TXN) && + dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN) + LF_SET(DB_MULTIVERSION); + + if ((ret = __memp_fopen(mpf, NULL, fname, &dbp->dirname, + LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION | + DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) | + (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) | + (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0), + 0, dbp->pgsize)) != 0) { + /* + * The open didn't work; we need to reset the mpf, + * retaining the in-memory semantics (if any). + */ + (void)__memp_fclose(dbp->mpf, 0); + (void)__memp_fcreate(env, &dbp->mpf); + if (F_ISSET(dbp, DB_AM_INMEM)) + MAKE_INMEM(dbp); + return (ret); + } + + /* + * Set the open flag. We use it to mean that the dbp has gone + * through mpf setup, including dbreg_register. Also, below, + * the underlying access method open functions may want to do + * things like acquire cursors, so the open flag has to be set + * before calling them. 
+ */ + F_SET(dbp, DB_AM_OPEN_CALLED); + if (!fidset && fname != NULL) { + (void)__memp_get_fileid(dbp->mpf, dbp->fileid); + dbp->preserve_fid = 1; + } + + return (0); +} + +/* + * __db_close -- + * DB->close method. + * + * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t)); + */ +int +__db_close(dbp, txn, flags) + DB *dbp; + DB_TXN *txn; + u_int32_t flags; +{ + ENV *env; + int db_ref, deferred_close, ret, t_ret; + + env = dbp->env; + deferred_close = ret = 0; + + /* + * Validate arguments, but as a DB handle destructor, we can't fail. + * + * Check for consistent transaction usage -- ignore errors. Only + * internal callers specify transactions, so it's a serious problem + * if we get error messages. + */ + if (txn != NULL) + (void)__db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0); + + /* Refresh the structure and close any underlying resources. */ + ret = __db_refresh(dbp, txn, flags, &deferred_close, 0); + + /* + * If we've deferred the close because the logging of the close failed, + * return our failure right away without destroying the handle. + */ + if (deferred_close) + return (ret); + + /* !!! + * This code has an apparent race between the moment we read and + * decrement env->db_ref and the moment we check whether it's 0. + * However, if the environment is DBLOCAL, the user shouldn't have a + * reference to the env handle anyway; the only way we can get + * multiple dbps sharing a local env is if we open them internally + * during something like a subdatabase open. If any such thing is + * going on while the user is closing the original dbp with a local + * env, someone's already badly screwed up, so there's no reason + * to bother engineering around this possibility. + */ + MUTEX_LOCK(env, env->mtx_dblist); + db_ref = --env->db_ref; + MUTEX_UNLOCK(env, env->mtx_dblist); + if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 && + (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0) + ret = t_ret; + + /* Free the database handle. 
*/ + memset(dbp, CLEAR_BYTE, sizeof(*dbp)); + __os_free(env, dbp); + + return (ret); +} + +/* + * __db_refresh -- + * Refresh the DB structure, releasing any allocated resources. + * This does most of the work of closing files now because refresh + * is what is used during abort processing (since we can't destroy + * the actual handle) and during abort processing, we may have a + * fully opened handle. + * + * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int)); + */ +int +__db_refresh(dbp, txn, flags, deferred_closep, reuse) + DB *dbp; + DB_TXN *txn; + u_int32_t flags; + int *deferred_closep, reuse; +{ + DB *sdbp; + DBC *dbc; + DB_FOREIGN_INFO *f_info, *tmp; + DB_LOCKER *locker; + DB_LOCKREQ lreq; + ENV *env; + REGENV *renv; + REGINFO *infop; + u_int32_t save_flags; + int resync, ret, t_ret; + + ret = 0; + + env = dbp->env; + infop = env->reginfo; + if (infop != NULL) + renv = infop->primary; + else + renv = NULL; + + /* + * If this dbp is not completely open, avoid trapping by trying to + * sync without an mpool file. + */ + if (dbp->mpf == NULL) + LF_SET(DB_NOSYNC); + + /* If never opened, or not currently open, it's easy. */ + if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) + goto never_opened; + + /* + * If we have any secondary indices, disassociate them from us. + * We don't bother with the mutex here; it only protects some + * of the ops that will make us core-dump mid-close anyway, and + * if you're trying to do something with a secondary *while* you're + * closing the primary, you deserve what you get. The disassociation + * is mostly done just so we can close primaries and secondaries in + * any order--but within one thread of control. + */ + LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) { + LIST_REMOVE(sdbp, s_links); + if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0) + ret = t_ret; + } + + /* + * Disassociate ourself from any databases using us as a foreign key + * database by clearing the referring db's pointer. Reclaim memory. 
+ */ + f_info = LIST_FIRST(&dbp->f_primaries); + while (f_info != NULL) { + tmp = LIST_NEXT(f_info, f_links); + LIST_REMOVE(f_info, f_links); + f_info->dbp->s_foreign = NULL; + __os_free(env, f_info); + f_info = tmp; + } + + if (dbp->s_foreign != NULL && + (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0) + ret = t_ret; + + /* + * Sync the underlying access method. Do before closing the cursors + * because DB->sync allocates cursors in order to write Recno backing + * source text files. + * + * Sync is slow on some systems, notably Solaris filesystems where the + * entire buffer cache is searched. If we're in recovery, don't flush + * the file, it's not necessary. + */ + if (!LF_ISSET(DB_NOSYNC) && + !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) && + (t_ret = __db_sync(dbp)) != 0 && ret == 0) + ret = t_ret; + + /* + * Go through the active cursors and call the cursor recycle routine, + * which resolves pending operations and moves the cursors onto the + * free list. Then, walk the free list and call the cursor destroy + * routine. Note that any failure on a close is considered "really + * bad" and we just break out of the loop and force forward. + */ + resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1; + while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) + if ((t_ret = __dbc_close(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } + + while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) + if ((t_ret = __dbc_destroy(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } + + /* + * Close any outstanding join cursors. Join cursors destroy themselves + * on close and have no separate destroy routine. We don't have to set + * the resync flag here, because join cursors aren't write cursors. 
+ */ + while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL) + if ((t_ret = __db_join_close(dbc)) != 0) { + if (ret == 0) + ret = t_ret; + break; + } + + /* + * Sync the memory pool, even though we've already called DB->sync, + * because closing cursors can dirty pages by deleting items they + * referenced. + * + * Sync is slow on some systems, notably Solaris filesystems where the + * entire buffer cache is searched. If we're in recovery, don't flush + * the file, it's not necessary. + */ + if (resync && !LF_ISSET(DB_NOSYNC) && + !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) && + (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0) + ret = t_ret; + +never_opened: + /* + * At this point, we haven't done anything to render the DB handle + * unusable, at least by a transaction abort. Take the opportunity + * now to log the file close if we have initialized the logging + * information. If this log fails and we're in a transaction, + * we have to bail out of the attempted close; we'll need a dbp in + * order to successfully abort the transaction, and we can't conjure + * a new one up because we haven't gotten out the dbreg_register + * record that represents the close. In this case, we put off + * actually closing the dbp until we've performed the abort. + */ + if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) { + /* + * Discard the log file id, if any. We want to log the close + * if and only if this is not a recovery dbp or a client dbp, + * or a dead dbp handle. 
+ */ + DB_ASSERT(env, renv != NULL); + if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) || + dbp->timestamp != renv->rep_timestamp) { + if ((t_ret = __dbreg_revoke_id(dbp, + 0, DB_LOGFILEID_INVALID)) == 0 && ret == 0) + ret = t_ret; + if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0) + ret = t_ret; + } else { + if ((t_ret = __dbreg_close_id(dbp, + txn, DBREG_CLOSE)) != 0 && txn != NULL) { + /* + * We're in a txn and the attempt to log the + * close failed; let the txn subsystem know + * that we need to destroy this dbp once we're + * done with the abort, then bail from the + * close. + * + * Note that if the attempt to put off the + * close -also- fails--which it won't unless + * we're out of heap memory--we're really + * screwed. Panic. + */ + if ((ret = + __txn_closeevent(env, txn, dbp)) != 0) + return (__env_panic(env, ret)); + if (deferred_closep != NULL) + *deferred_closep = 1; + return (t_ret); + } + /* + * If dbreg_close_id failed and we were not in a + * transaction, then we need to finish this close + * because the caller can't do anything with the + * handle after we return an error. We rely on + * dbreg_close_id to mark the entry in some manner + * so that we do not do a clean shutdown of this + * environment. If shutdown isn't clean, then the + * application *must* run recovery and that will + * generate the RCLOSE record. + */ + } + + } + + /* Close any handle we've been holding since the open. */ + if (dbp->saved_open_fhp != NULL && + (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 && + ret == 0) + ret = t_ret; + + /* + * Remove this DB handle from the ENV's dblist, if it's been added. + * + * Close our reference to the underlying cache while locked, we don't + * want to race with a thread searching for our underlying cache link + * while opening a DB handle. + * + * The DB handle may not yet have been added to the ENV list, don't + * blindly call the underlying TAILQ_REMOVE macro. 
Explicitly reset + * the field values to NULL so that we can't call TAILQ_REMOVE twice. + */ + MUTEX_LOCK(env, env->mtx_dblist); + if (!reuse && + (dbp->dblistlinks.tqe_next != NULL || + dbp->dblistlinks.tqe_prev != NULL)) { + TAILQ_REMOVE(&env->dblist, dbp, dblistlinks); + dbp->dblistlinks.tqe_next = NULL; + dbp->dblistlinks.tqe_prev = NULL; + } + + /* Close the memory pool file handle. */ + if (dbp->mpf != NULL) { + if ((t_ret = __memp_fclose(dbp->mpf, + F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 && + ret == 0) + ret = t_ret; + dbp->mpf = NULL; + if (reuse && + (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 && + ret == 0) + ret = t_ret; + } + + MUTEX_UNLOCK(env, env->mtx_dblist); + + /* + * Call the access specific close function. + * + * We do this here rather than in __db_close as we need to do this when + * aborting an open so that file descriptors are closed and abort of + * renames can succeed on platforms that lock open files (such as + * Windows). In particular, we need to ensure that all the extents + * associated with a queue are closed so that queue renames can be + * aborted. + * + * It is also important that we do this before releasing the handle + * lock, because dbremove and dbrename assume that once they have the + * handle lock, it is safe to modify the underlying file(s). + * + * !!! + * Because of where these functions are called in the DB handle close + * process, these routines can't do anything that would dirty pages or + * otherwise affect closing down the database. Specifically, we can't + * abort and recover any of the information they control. + */ +#ifdef HAVE_PARTITION + if (dbp->p_internal != NULL && + (t_ret = __partition_close(dbp, txn, flags)) != 0 && ret == 0) + ret = t_ret; +#endif + if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0) + ret = t_ret; + + /* + * !!! 
+ * At this point, the access-method specific information has been + * freed. From now on, we can use the dbp, but not touch any + * access-method specific data. + */ + + if (!reuse && dbp->locker != NULL) { + /* We may have pending trade operations on this dbp. */ + if (txn == NULL) + txn = dbp->cur_txn; + if (IS_REAL_TXN(txn)) + __txn_remlock(env, + txn, &dbp->handle_lock, dbp->locker); + + /* We may be holding the handle lock; release it. */ + lreq.op = DB_LOCK_PUT_ALL; + lreq.obj = NULL; + if ((t_ret = __lock_vec(env, + dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = + __lock_id_free(env, dbp->locker)) != 0 && ret == 0) + ret = t_ret; + dbp->locker = NULL; + LOCK_INIT(dbp->handle_lock); + } + + /* + * If this is a temporary file (un-named in-memory file), then + * discard the locker ID allocated as the fileid. + */ + if (LOCKING_ON(env) && + F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid && + *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) { + if ((t_ret = __lock_getlocker(env->lk_handle, + *(u_int32_t *)dbp->fileid, 0, &locker)) == 0) + t_ret = __lock_id_free(env, locker); + if (ret == 0) + ret = t_ret; + } + + if (reuse) { + /* + * If we are reusing this dbp, then we're done now. Re-init + * the handle, preserving important flags, and then return. + * This code is borrowed from __db_init, which does more + * than we can do here. + */ + save_flags = F_ISSET(dbp, DB_AM_INMEM | DB_AM_TXN); + + if ((ret = __bam_db_create(dbp)) != 0) + return (ret); + if ((ret = __ham_db_create(dbp)) != 0) + return (ret); + if ((ret = __qam_db_create(dbp)) != 0) + return (ret); + + /* Restore flags */ + dbp->flags = dbp->orig_flags | save_flags; + + if (FLD_ISSET(save_flags, DB_AM_INMEM)) { + /* + * If this is inmem, then it may have a fileid + * even if it was never opened, and we need to + * clear out that fileid. 
+ */ + memset(dbp->fileid, 0, sizeof(dbp->fileid)); + MAKE_INMEM(dbp); + } + return (ret); + } + + dbp->type = DB_UNKNOWN; + + /* + * The thread mutex may have been invalidated in __dbreg_close_id if the + * fname refcount did not go to 0. If not, discard the thread mutex. + */ + if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0) + ret = t_ret; + + /* Discard any memory allocated for the file and database names. */ + if (dbp->fname != NULL) { + __os_free(dbp->env, dbp->fname); + dbp->fname = NULL; + } + if (dbp->dname != NULL) { + __os_free(dbp->env, dbp->dname); + dbp->dname = NULL; + } + + /* Discard any memory used to store returned data. */ + if (dbp->my_rskey.data != NULL) + __os_free(dbp->env, dbp->my_rskey.data); + if (dbp->my_rkey.data != NULL) + __os_free(dbp->env, dbp->my_rkey.data); + if (dbp->my_rdata.data != NULL) + __os_free(dbp->env, dbp->my_rdata.data); + + /* For safety's sake; we may refresh twice. */ + memset(&dbp->my_rskey, 0, sizeof(DBT)); + memset(&dbp->my_rkey, 0, sizeof(DBT)); + memset(&dbp->my_rdata, 0, sizeof(DBT)); + + /* Clear out fields that normally get set during open. */ + memset(dbp->fileid, 0, sizeof(dbp->fileid)); + dbp->adj_fileid = 0; + dbp->meta_pgno = 0; + dbp->cur_locker = NULL; + dbp->cur_txn = NULL; + dbp->associate_locker = NULL; + dbp->cl_id = 0; + dbp->open_flags = 0; + + /* + * If we are being refreshed with a txn specified, then we need + * to make sure that we clear out the lock handle field, because + * releasing all the locks for this transaction will release this + * lock and we don't want close to stumble upon this handle and + * try to close it. + */ + if (txn != NULL) + LOCK_INIT(dbp->handle_lock); + + /* Reset flags to whatever the user configured. */ + dbp->flags = dbp->orig_flags; + + return (ret); +} + +/* + * __db_disassociate -- + * Destroy the association between a given secondary and its primary. 
+ */ +static int +__db_disassociate(sdbp) + DB *sdbp; +{ + DBC *dbc; + int ret, t_ret; + + ret = 0; + + sdbp->s_callback = NULL; + sdbp->s_primary = NULL; + sdbp->get = sdbp->stored_get; + sdbp->close = sdbp->stored_close; + + /* + * Complain, but proceed, if we have any active cursors. (We're in + * the middle of a close, so there's really no turning back.) + */ + if (sdbp->s_refcnt != 1 || + TAILQ_FIRST(&sdbp->active_queue) != NULL || + TAILQ_FIRST(&sdbp->join_queue) != NULL) { + __db_errx(sdbp->env, + "Closing a primary DB while a secondary DB has active cursors is unsafe"); + ret = EINVAL; + } + sdbp->s_refcnt = 0; + + while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL) + if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0) + ret = t_ret; + + F_CLR(sdbp, DB_AM_SECONDARY); + return (ret); +} + +/* + * __db_disassociate_foreign -- + * Destroy the association between a given secondary and its foreign. + */ +static int +__db_disassociate_foreign(sdbp) + DB *sdbp; +{ + DB *fdbp; + DB_FOREIGN_INFO *f_info, *tmp; + int ret; + + if (sdbp->s_foreign == NULL) + return (0); + if ((ret = __os_malloc(sdbp->env, sizeof(DB_FOREIGN_INFO), &tmp)) != 0) + return (ret); + + fdbp = sdbp->s_foreign; + ret = 0; + f_info = LIST_FIRST(&fdbp->f_primaries); + while (f_info != NULL) { + tmp = LIST_NEXT(f_info, f_links); + if (f_info ->dbp == sdbp) { + LIST_REMOVE(f_info, f_links); + __os_free(sdbp->env, f_info); + } + f_info = tmp; + } + + return (ret); +} + +/* + * __db_log_page + * Log a meta-data or root page during a subdatabase create operation. 
+ * + * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *)); + */ +int +__db_log_page(dbp, txn, lsn, pgno, page) + DB *dbp; + DB_TXN *txn; + DB_LSN *lsn; + db_pgno_t pgno; + PAGE *page; +{ + DBT page_dbt; + DB_LSN new_lsn; + int ret; + + if (!LOGGING_ON(dbp->env) || txn == NULL) + return (0); + + memset(&page_dbt, 0, sizeof(page_dbt)); + page_dbt.size = dbp->pgsize; + page_dbt.data = page; + + ret = __crdel_metasub_log(dbp, txn, &new_lsn, 0, pgno, &page_dbt, lsn); + + if (ret == 0) + page->lsn = new_lsn; + return (ret); +} + +/* + * __db_backup_name + * Create the backup file name for a given file. + * + * PUBLIC: int __db_backup_name __P((ENV *, + * PUBLIC: const char *, DB_TXN *, char **)); + */ +#undef BACKUP_PREFIX +#define BACKUP_PREFIX "__db." + +#undef MAX_INT_TO_HEX +#define MAX_INT_TO_HEX 8 + +int +__db_backup_name(env, name, txn, backup) + ENV *env; + const char *name; + DB_TXN *txn; + char **backup; +{ + u_int32_t id; + size_t len; + int ret; + char *p, *retp; + + *backup = NULL; + + /* + * Part of the name may be a full path, so we need to make sure that + * we allocate enough space for it, even in the case where we don't + * use the entire filename for the backup name. + */ + len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1; + if ((ret = __os_malloc(env, len, &retp)) != 0) + return (ret); + + /* + * Create the name. Backup file names are in one of 2 forms: in a + * transactional env "__db.TXNID.ID", where ID is a random number, + * and in any other env "__db.FILENAME". + * + * In addition, the name passed may contain an env-relative path. + * In that case, put the "__db." in the right place (in the last + * component of the pathname). + * + * There are four cases here: + * 1. simple path w/out transaction + * 2. simple path + transaction + * 3. multi-component path w/out transaction + * 4. 
multi-component path + transaction + */ + p = __db_rpath(name); + if (IS_REAL_TXN(txn)) { + __os_unique_id(env, &id); + if (p == NULL) /* Case 2. */ + snprintf(retp, len, "%s%x.%x", + BACKUP_PREFIX, txn->txnid, id); + else /* Case 4. */ + snprintf(retp, len, "%.*s%x.%x", + (int)(p - name) + 1, name, txn->txnid, id); + } else { + if (p == NULL) /* Case 1. */ + snprintf(retp, len, "%s%s", BACKUP_PREFIX, name); + else /* Case 3. */ + snprintf(retp, len, "%.*s%s%s", + (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1); + } + + *backup = retp; + return (0); +} + +#ifdef CONFIG_TEST +/* + * __db_testcopy + * Create a copy of all backup files and our "main" DB. + * + * PUBLIC: #ifdef CONFIG_TEST + * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *)); + * PUBLIC: #endif + */ +int +__db_testcopy(env, dbp, name) + ENV *env; + DB *dbp; + const char *name; +{ + DB_MPOOL *dbmp; + DB_MPOOLFILE *mpf; + + DB_ASSERT(env, dbp != NULL || name != NULL); + + if (name == NULL) { + dbmp = env->mp_handle; + mpf = dbp->mpf; + name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off); + } + + if (dbp != NULL && dbp->type == DB_QUEUE) + return (__qam_testdocopy(dbp, name)); + else +#ifdef HAVE_PARTITION + if (dbp != NULL && DB_IS_PARTITIONED(dbp)) + return (__part_testdocopy(dbp, name)); + else +#endif + return (__db_testdocopy(env, name)); +} + +static int +__qam_testdocopy(dbp, name) + DB *dbp; + const char *name; +{ + DB_THREAD_INFO *ip; + QUEUE_FILELIST *filelist, *fp; + int ret; + char buf[DB_MAXPATHLEN], *dir; + + filelist = NULL; + if ((ret = __db_testdocopy(dbp->env, name)) != 0) + return (ret); + + /* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */ + ENV_GET_THREAD_INFO(dbp->env, ip); + if (dbp->mpf != NULL && + (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0) + goto done; + + if (filelist == NULL) + return (0); + dir = ((QUEUE *)dbp->q_internal)->dir; + for (fp = filelist; fp->mpf != NULL; fp++) { + snprintf(buf, sizeof(buf), + QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, 
fp->id); + if ((ret = __db_testdocopy(dbp->env, buf)) != 0) + return (ret); + } + +done: __os_free(dbp->env, filelist); + return (0); +} + +/* + * __db_testdocopy + * Create a copy of all backup files and our "main" DB. + * PUBLIC: int __db_testdocopy __P((ENV *, const char *)); + */ +int +__db_testdocopy(env, name) + ENV *env; + const char *name; +{ + size_t len; + int dircnt, i, ret; + char *copy, **namesp, *p, *real_name; + + dircnt = 0; + copy = NULL; + namesp = NULL; + + /* Create the real backing file name. */ + if ((ret = __db_appname(env, + DB_APP_DATA, name, NULL, &real_name)) != 0) + return (ret); + + /* + * !!! + * There are tests that attempt to copy non-existent files. I'd guess + * it's a testing bug, but I don't have time to figure it out. Block + * the case here. + */ + if (__os_exists(env, real_name, NULL) != 0) { + __os_free(env, real_name); + return (0); + } + + /* + * Copy the file itself. + * + * Allocate space for the file name, including adding an ".afterop" and + * trailing nul byte. + */ + len = strlen(real_name) + sizeof(".afterop"); + if ((ret = __os_malloc(env, len, ©)) != 0) + goto err; + snprintf(copy, len, "%s.afterop", real_name); + if ((ret = __db_makecopy(env, real_name, copy)) != 0) + goto err; + + /* + * Get the directory path to call __os_dirlist(). + */ + if ((p = __db_rpath(real_name)) != NULL) + *p = '\0'; + if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0) + goto err; + + /* + * Walk the directory looking for backup files. Backup file names in + * transactional environments are of the form: + * + * BACKUP_PREFIX.TXNID.ID + */ + for (i = 0; i < dircnt; i++) { + /* Check for a related backup file name. 
*/ + if (strncmp( + namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0) + continue; + p = namesp[i] + sizeof(BACKUP_PREFIX); + p += strspn(p, "0123456789ABCDEFabcdef"); + if (*p != '.') + continue; + ++p; + p += strspn(p, "0123456789ABCDEFabcdef"); + if (*p != '\0') + continue; + + /* + * Copy the backup file. + * + * Allocate space for the file name, including adding a + * ".afterop" and trailing nul byte. + */ + if (real_name != NULL) { + __os_free(env, real_name); + real_name = NULL; + } + if ((ret = __db_appname(env, + DB_APP_DATA, namesp[i], NULL, &real_name)) != 0) + goto err; + if (copy != NULL) { + __os_free(env, copy); + copy = NULL; + } + len = strlen(real_name) + sizeof(".afterop"); + if ((ret = __os_malloc(env, len, ©)) != 0) + goto err; + snprintf(copy, len, "%s.afterop", real_name); + if ((ret = __db_makecopy(env, real_name, copy)) != 0) + goto err; + } + +err: if (namesp != NULL) + __os_dirfree(env, namesp, dircnt); + if (copy != NULL) + __os_free(env, copy); + if (real_name != NULL) + __os_free(env, real_name); + return (ret); +} + +static int +__db_makecopy(env, src, dest) + ENV *env; + const char *src, *dest; +{ + DB_FH *rfhp, *wfhp; + size_t rcnt, wcnt; + int ret; + char *buf; + + rfhp = wfhp = NULL; + + if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0) + goto err; + + if ((ret = __os_open(env, src, 0, + DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0) + goto err; + if ((ret = __os_open(env, dest, 0, + DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0) + goto err; + + for (;;) { + if ((ret = + __os_read(env, rfhp, buf, sizeof(buf), &rcnt)) != 0) + goto err; + if (rcnt == 0) + break; + if ((ret = + __os_write(env, wfhp, buf, sizeof(buf), &wcnt)) != 0) + goto err; + } + + if (0) { +err: __db_err(env, ret, "__db_makecopy: %s -> %s", src, dest); + } + + if (buf != NULL) + __os_free(env, buf); + if (rfhp != NULL) + (void)__os_closehandle(env, rfhp); + if (wfhp != NULL) + (void)__os_closehandle(env, wfhp); + return (ret); +} +#endif diff --git 
a/db/db.src b/db/db.src new file mode 100644 index 0000000..2136b79 --- /dev/null +++ b/db/db.src @@ -0,0 +1,328 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +DBPRIVATE +PREFIX __db + +INCLUDE #include "db_int.h" +INCLUDE #include "dbinc/crypto.h" +INCLUDE #include "dbinc/db_page.h" +INCLUDE #include "dbinc/db_dispatch.h" +INCLUDE #include "dbinc/db_am.h" +INCLUDE #include "dbinc/log.h" +INCLUDE #include "dbinc/txn.h" +INCLUDE + +/* + * addrem -- Add or remove an entry from a duplicate page. + * + * opcode: identifies if this is an add or delete. + * fileid: file identifier of the file being modified. + * pgno: duplicate page number. + * indx: location at which to insert or delete. + * nbytes: number of bytes added/removed to/from the page. + * hdr: header for the data item. + * dbt: data that is deleted or is to be added. + * pagelsn: former lsn of the page. + * + * If the hdr was NULL then, the dbt is a regular B_KEYDATA. + * If the dbt was NULL then the hdr is a complete item to be + * pasted on the page. + */ +BEGIN addrem 42 41 +ARG opcode u_int32_t lu +DB fileid int32_t ld +ARG pgno db_pgno_t lu +ARG indx u_int32_t lu +ARG nbytes u_int32_t lu +DBT hdr DBT s +DBT dbt DBT s +POINTER pagelsn DB_LSN * lu +END + +/* + * big -- Handles addition and deletion of big key/data items. + * + * opcode: identifies get/put. + * fileid: file identifier of the file being modified. + * pgno: page onto which data is being added/removed. + * prev_pgno: the page before the one we are logging. + * next_pgno: the page after the one we are logging. + * dbt: data being written onto the page. + * pagelsn: former lsn of the orig_page. + * prevlsn: former lsn of the prev_pgno. + * nextlsn: former lsn of the next_pgno. This is not currently used, but + * may be used later if we actually do overwrites of big key/ + * data items in place. 
+ */ +BEGIN big 42 43 +ARG opcode u_int32_t lu +DB fileid int32_t ld +ARG pgno db_pgno_t lu +ARG prev_pgno db_pgno_t lu +ARG next_pgno db_pgno_t lu +DBT dbt DBT s +POINTER pagelsn DB_LSN * lu +POINTER prevlsn DB_LSN * lu +POINTER nextlsn DB_LSN * lu +END + +/* + * ovref -- Handles increment/decrement of overflow page reference count. + * + * fileid: identifies the file being modified. + * pgno: page number whose ref count is being incremented/decremented. + * adjust: the adjustment being made. + * lsn: the page's original lsn. + */ +BEGIN ovref 42 44 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +ARG adjust int32_t ld +POINTER lsn DB_LSN * lu +END + +/* + * relink -- Handles relinking around a page. + * + * opcode: indicates if this is an addpage or delete page + * pgno: the page being changed. + * lsn the page's original lsn. + * prev: the previous page. + * lsn_prev: the previous page's original lsn. + * next: the next page. + * lsn_next: the previous page's original lsn. + */ +BEGIN_COMPAT relink 42 45 +ARG opcode u_int32_t lu +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER lsn DB_LSN * lu +ARG prev db_pgno_t lu +POINTER lsn_prev DB_LSN * lu +ARG next db_pgno_t lu +POINTER lsn_next DB_LSN * lu +END + +/* + * Debug -- log an operation upon entering an access method. + * op: Operation (cursor, c_close, c_get, c_put, c_del, + * get, put, delete). + * fileid: identifies the file being acted upon. + * key: key paramater + * data: data parameter + * flags: flags parameter + */ +BEGIN debug 42 47 +DBT op DBT s +ARG fileid int32_t ld +DBT key DBT s +DBT data DBT s +ARG arg_flags u_int32_t lu +END + +/* + * noop -- do nothing, but get an LSN. + */ +BEGIN noop 42 48 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER prevlsn DB_LSN * lu +END + +/* + * pg_alloc: used to record allocating a new page. + * + * meta_lsn: the original lsn of the page reference by meta_pgno. + * meta_pgno the page pointing at the allocated page in the free list. 
+ * If the list is unsorted this is the metadata page. + * page_lsn: the allocated page's original lsn. + * pgno: the page allocated. + * ptype: the type of the page allocated. + * next: the next page on the free list. + * last_pgno: the last page in the file after this op (4.3+). + */ +BEGIN_COMPAT pg_alloc 42 49 +DB fileid int32_t ld +POINTER meta_lsn DB_LSN * lu +ARG meta_pgno db_pgno_t lu +POINTER page_lsn DB_LSN * lu +ARG pgno db_pgno_t lu +ARG ptype u_int32_t lu +ARG next db_pgno_t lu +END + +BEGIN pg_alloc 43 49 +DB fileid int32_t ld +POINTER meta_lsn DB_LSN * lu +ARG meta_pgno db_pgno_t lu +POINTER page_lsn DB_LSN * lu +ARG pgno db_pgno_t lu +ARG ptype u_int32_t lu +ARG next db_pgno_t lu +ARG last_pgno db_pgno_t lu +END + +/* + * pg_free: used to record freeing a page. + * If we are maintaining a sorted free list (during compact) meta_pgno + * will be non-zero and refer to the page that preceeds the one we are freeing + * in the free list. Meta_lsn will then be the lsn of that page. + * + * pgno: the page being freed. + * meta_lsn: the meta-data page's original lsn. + * meta_pgno: the meta-data page number. + * header: the header from the free'd page. + * next: the previous next pointer on the metadata page. + * last_pgno: the last page in the file before this op (4.3+). + */ +BEGIN_COMPAT pg_free 42 50 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +ARG meta_pgno db_pgno_t lu +PGDBT header DBT s +ARG next db_pgno_t lu +END + +BEGIN pg_free 43 50 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +ARG meta_pgno db_pgno_t lu +PGDBT header DBT s +ARG next db_pgno_t lu +ARG last_pgno db_pgno_t lu +END + +/* + * cksum -- + * This log record is written when we're unable to checksum a page, + * before returning DB_RUNRECOVERY. This log record causes normal + * recovery to itself return DB_RUNRECOVERY, as only catastrophic + * recovery can fix things. 
+ */ +BEGIN cksum 42 51 +END + +/* + * pg_freedata: used to record freeing a page with data on it. + * + * pgno: the page being freed. + * meta_lsn: the meta-data page's original lsn. + * meta_pgno: the meta-data page number. + * header: the header and index entries from the free'd page. + * data: the data from the free'd page. + * next: the previous next pointer on the metadata page. + * last_pgno: the last page in the file before this op (4.3+). + */ +BEGIN_COMPAT pg_freedata 42 52 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +ARG meta_pgno db_pgno_t lu +PGDBT header DBT s +ARG next db_pgno_t lu +PGDDBT data DBT s +END + +BEGIN pg_freedata 43 52 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +ARG meta_pgno db_pgno_t lu +PGDBT header DBT s +ARG next db_pgno_t lu +ARG last_pgno db_pgno_t lu +PGDDBT data DBT s +END + +/* + * pg_prepare: used to record an aborted page in a prepared transaction. + * + * pgno: the page being freed. + */ +X BEGIN pg_prepare 42 53 +X DB fileid int32_t ld +X ARG pgno db_pgno_t lu +X END + +/* + * pg_new: used to record a new page put on the free list. + * + * pgno: the page being freed. + * meta_lsn: the meta-data page's original lsn. + * meta_pgno: the meta-data page number. + * header: the header from the free'd page. + * next: the previous next pointer on the metadata page. + */ +X BEGIN pg_new 42 54 +X DB fileid int32_t ld +X ARG pgno db_pgno_t lu +X POINTER meta_lsn DB_LSN * lu +X ARG meta_pgno db_pgno_t lu +X PGDBT header DBT s +X ARG next db_pgno_t lu +X END + +/* + * pg_init: used to reinitialize a page during truncate. + * + * pgno: the page being initialized. + * header: the header from the page. + * data: data that used to be on the page. + */ +BEGIN pg_init 43 60 +DB fileid int32_t ld +ARG pgno db_pgno_t lu +PGDBT header DBT s +PGDDBT data DBT s +END + +/* + * pg_sort: sort the free list + * + * meta: meta page number + * meta_lsn: lsn on meta page. 
+ * last_free: page number of new last free page. + * last_lsn; lsn of last free page. + * last_pgno: current last page number. + * list: list of pages and lsns to sort. + */ +BEGIN_COMPAT pg_sort 44 61 +DB fileid int32_t ld +ARG meta db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +ARG last_free db_pgno_t lu +POINTER last_lsn DB_LSN * lu +ARG last_pgno db_pgno_t lu +DBT list DBT s +END + + +/* + * pg_truc: truncate the free list + * + * meta: meta page number + * meta_lsn: lsn on meta page. + * last_free: page number of new last free page. + * last_lsn; lsn of last free page. + * last_pgno: current last page number. + * list: list of pages and lsns on free list. + */ +BEGIN pg_trunc 49 66 +DB fileid int32_t ld +ARG meta db_pgno_t lu +POINTER meta_lsn DB_LSN * lu +ARG last_free db_pgno_t lu +POINTER last_lsn DB_LSN * lu +ARG next_free db_pgno_t lu +ARG last_pgno db_pgno_t lu +DBT list DBT s +END + diff --git a/db/db_am.c b/db/db_am.c new file mode 100644 index 0000000..c453ea9 --- /dev/null +++ b/db/db_am.c @@ -0,0 +1,1015 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); +static int __dbc_set_priority __P((DBC *, DB_CACHE_PRIORITY)); +static int __dbc_get_priority __P((DBC *, DB_CACHE_PRIORITY* )); + +/* + * __db_cursor_int -- + * Internal routine to create a cursor. 
+ * + * PUBLIC: int __db_cursor_int __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, DBTYPE, db_pgno_t, int, DB_LOCKER *, DBC **)); + */ +int +__db_cursor_int(dbp, ip, txn, dbtype, root, flags, locker, dbcp) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBTYPE dbtype; + db_pgno_t root; + int flags; + DB_LOCKER *locker; + DBC **dbcp; +{ + DBC *dbc; + DBC_INTERNAL *cp; + ENV *env; + db_threadid_t tid; + int allocated, ret; + pid_t pid; + + env = dbp->env; + allocated = 0; + + /* + * If dbcp is non-NULL it is assumed to point to an area to initialize + * as a cursor. + * + * Take one from the free list if it's available. Take only the + * right type. With off page dups we may have different kinds + * of cursors on the queue for a single database. + */ + MUTEX_LOCK(env, dbp->mutex); + +#ifndef HAVE_NO_DB_REFCOUNT + /* + * If this DBP is being logged then refcount the log filename + * relative to this transaction. We do this here because we have + * the dbp->mutex which protects the refcount. We want to avoid + * calling the function if we are duplicating a cursor. This includes + * the case of creating an off page duplicate cursor. If we know this + * cursor will not be used in an update, we could avoid this, + * but we don't have that information. + */ + if (txn != NULL && !LF_ISSET(DBC_OPD | DBC_DUPLICATE) + && !F_ISSET(dbp, DB_AM_RECOVER) && + dbp->log_filename != NULL && !IS_REP_CLIENT(env) && + (ret = __txn_record_fname(env, txn, dbp->log_filename)) != 0) { + MUTEX_UNLOCK(env, dbp->mutex); + return (ret); + } + +#endif + + TAILQ_FOREACH(dbc, &dbp->free_queue, links) + if (dbtype == dbc->dbtype) { + TAILQ_REMOVE(&dbp->free_queue, dbc, links); + F_CLR(dbc, ~DBC_OWN_LID); + break; + } + MUTEX_UNLOCK(env, dbp->mutex); + + if (dbc == NULL) { + if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0) + return (ret); + allocated = 1; + dbc->flags = 0; + + dbc->dbp = dbp; + dbc->dbenv = dbp->dbenv; + dbc->env = dbp->env; + + /* Set up locking information. 
*/ + if (LOCKING_ON(env)) { + /* + * If we are not threaded, we share a locker ID among + * all cursors opened in the environment handle, + * allocating one if this is the first cursor. + * + * This relies on the fact that non-threaded DB handles + * always have non-threaded environment handles, since + * we set DB_THREAD on DB handles created with threaded + * environment handles. + */ + if (!DB_IS_THREADED(dbp)) { + if (env->env_lref == NULL && (ret = + __lock_id(env, NULL, &env->env_lref)) != 0) + goto err; + dbc->lref = env->env_lref; + } else { + if ((ret = + __lock_id(env, NULL, &dbc->lref)) != 0) + goto err; + F_SET(dbc, DBC_OWN_LID); + } + + /* + * In CDB, secondary indices should share a lock file + * ID with the primary; otherwise we're susceptible + * to deadlocks. We also use __db_cursor_int rather + * than __db_cursor to create secondary update cursors + * in c_put and c_del; these won't acquire a new lock. + * + * !!! + * Since this is in the one-time cursor allocation + * code, we need to be sure to destroy, not just + * close, all cursors in the secondary when we + * associate. + */ + if (CDB_LOCKING(env) && + F_ISSET(dbp, DB_AM_SECONDARY)) + memcpy(dbc->lock.fileid, + dbp->s_primary->fileid, DB_FILE_ID_LEN); + else + memcpy(dbc->lock.fileid, + dbp->fileid, DB_FILE_ID_LEN); + + if (CDB_LOCKING(env)) { + if (F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) { + /* + * If we are doing a single lock per + * environment, set up the global + * lock object just like we do to + * single thread creates. + */ + DB_ASSERT(env, sizeof(db_pgno_t) == + sizeof(u_int32_t)); + dbc->lock_dbt.size = sizeof(u_int32_t); + dbc->lock_dbt.data = &dbc->lock.pgno; + dbc->lock.pgno = 0; + } else { + dbc->lock_dbt.size = DB_FILE_ID_LEN; + dbc->lock_dbt.data = dbc->lock.fileid; + } + } else { + dbc->lock.type = DB_PAGE_LOCK; + dbc->lock_dbt.size = sizeof(dbc->lock); + dbc->lock_dbt.data = &dbc->lock; + } + } + /* Init the DBC internal structure. 
*/ +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) { + if ((ret = __partc_init(dbc)) != 0) + goto err; + } else +#endif + switch (dbtype) { + case DB_BTREE: + case DB_RECNO: + if ((ret = __bamc_init(dbc, dbtype)) != 0) + goto err; + break; + case DB_HASH: + if ((ret = __hamc_init(dbc)) != 0) + goto err; + break; + case DB_QUEUE: + if ((ret = __qamc_init(dbc)) != 0) + goto err; + break; + case DB_UNKNOWN: + default: + ret = __db_unknown_type(env, "DB->cursor", dbtype); + goto err; + } + + cp = dbc->internal; + } + + /* Refresh the DBC structure. */ + dbc->dbtype = dbtype; + RESET_RET_MEM(dbc); + dbc->set_priority = __dbc_set_priority; + dbc->get_priority = __dbc_get_priority; + dbc->priority = dbp->priority; + + if ((dbc->txn = txn) != NULL) + dbc->locker = txn->locker; + else if (LOCKING_ON(env)) { + /* + * There are certain cases in which we want to create a + * new cursor with a particular locker ID that is known + * to be the same as (and thus not conflict with) an + * open cursor. + * + * The most obvious case is cursor duplication; when we + * call DBC->dup or __dbc_idup, we want to use the original + * cursor's locker ID. + * + * Another case is when updating secondary indices. Standard + * CDB locking would mean that we might block ourself: we need + * to open an update cursor in the secondary while an update + * cursor in the primary is open, and when the secondary and + * primary are subdatabases or we're using env-wide locking, + * this is disastrous. + * + * In these cases, our caller will pass a nonzero locker + * ID into this function. Use this locker ID instead of + * the default as the locker ID for our new cursor. + */ + if (locker != NULL) + dbc->locker = locker; + else { + /* + * If we are threaded then we need to set the + * proper thread id into the locker. 
+ */ + if (DB_IS_THREADED(dbp)) { + env->dbenv->thread_id(env->dbenv, &pid, &tid); + __lock_set_thread_id(dbc->lref, pid, tid); + } + dbc->locker = dbc->lref; + } + } + + /* + * These fields change when we are used as a secondary index, so + * if the DB is a secondary, make sure they're set properly just + * in case we opened some cursors before we were associated. + * + * __dbc_get is used by all access methods, so this should be safe. + */ + if (F_ISSET(dbp, DB_AM_SECONDARY)) + dbc->get = dbc->c_get = __dbc_secondary_get_pp; + + if (LF_ISSET(DB_CURSOR_BULK) && dbtype == DB_BTREE) + F_SET(dbc, DBC_BULK); + if (LF_ISSET(DB_CURSOR_TRANSIENT)) + F_SET(dbc, DBC_TRANSIENT); + if (LF_ISSET(DBC_OPD)) + F_SET(dbc, DBC_OPD); + if (F_ISSET(dbp, DB_AM_RECOVER)) + F_SET(dbc, DBC_RECOVER); + if (F_ISSET(dbp, DB_AM_COMPENSATE)) + F_SET(dbc, DBC_DONTLOCK); +#ifdef HAVE_REPLICATION + /* + * If we are replicating from a down rev version then we must + * use old locking protocols. + */ + if (LOGGING_ON(env) && + ((LOG *)env->lg_handle-> + reginfo.primary)->persist.version < DB_LOGVERSION_LATCHING) + F_SET(dbc, DBC_DOWNREV); +#endif + + /* Refresh the DBC internal structure. */ + cp = dbc->internal; + cp->opd = NULL; + cp->pdbc = NULL; + + cp->indx = 0; + cp->page = NULL; + cp->pgno = PGNO_INVALID; + cp->root = root; + cp->stream_start_pgno = cp->stream_curr_pgno = PGNO_INVALID; + cp->stream_off = 0; + + if (DB_IS_PARTITIONED(dbp)) { + DBC_PART_REFRESH(dbc); + } else switch (dbtype) { + case DB_BTREE: + case DB_RECNO: + if ((ret = __bamc_refresh(dbc)) != 0) + goto err; + break; + case DB_HASH: + case DB_QUEUE: + break; + case DB_UNKNOWN: + default: + ret = __db_unknown_type(env, "DB->cursor", dbp->type); + goto err; + } + + /* + * The transaction keeps track of how many cursors were opened within + * it to catch application errors where the cursor isn't closed when + * the transaction is resolved. 
+ */ + if (txn != NULL) + ++txn->cursors; + if (ip != NULL) + dbc->thread_info = ip; + else if (txn != NULL) + dbc->thread_info = txn->thread_info; + else + ENV_GET_THREAD_INFO(env, dbc->thread_info); + + MUTEX_LOCK(env, dbp->mutex); + TAILQ_INSERT_TAIL(&dbp->active_queue, dbc, links); + F_SET(dbc, DBC_ACTIVE); + MUTEX_UNLOCK(env, dbp->mutex); + + *dbcp = dbc; + return (0); + +err: if (allocated) + __os_free(env, dbc); + return (ret); +} + +/* + * __db_put -- + * Store a key/data pair. + * + * PUBLIC: int __db_put __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t)); + */ +int +__db_put(dbp, ip, txn, key, data, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + DBT tdata, tkey; + ENV *env; + void *bulk_kptr, *bulk_ptr; + db_recno_t recno; + u_int32_t cursor_flags; + int ret, t_ret; + + env = dbp->env; + + /* + * See the comment in __db_get() regarding DB_CURSOR_TRANSIENT. + * + * Note that the get in the DB_NOOVERWRITE case is safe to do with this + * flag set; if it errors in any way other than DB_NOTFOUND, we're + * going to close the cursor without doing anything else, and if it + * returns DB_NOTFOUND then it's safe to do a c_put(DB_KEYLAST) even if + * an access method moved the cursor, since that's not + * position-dependent. + */ + cursor_flags = DB_WRITELOCK; + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) + cursor_flags |= DB_CURSOR_BULK; + else + cursor_flags |= DB_CURSOR_TRANSIENT; + if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "DB->put", key, data, flags); + + SET_RET_MEM(dbc, dbp); + + if (flags == DB_APPEND && !DB_IS_PRIMARY(dbp)) { + /* + * If there is an append callback, the value stored in + * data->data may be replaced and then freed. To avoid + * passing a freed pointer back to the user, just operate + * on a copy of the data DBT. 
+ */ + tdata = *data; + + /* + * Append isn't a normal put operation; call the appropriate + * access method's append function. + */ + switch (dbp->type) { + case DB_QUEUE: + if ((ret = __qam_append(dbc, key, &tdata)) != 0) + goto err; + break; + case DB_RECNO: + if ((ret = __ram_append(dbc, key, &tdata)) != 0) + goto err; + break; + case DB_BTREE: + case DB_HASH: + case DB_UNKNOWN: + default: + /* The interface should prevent this. */ + DB_ASSERT(env, + dbp->type == DB_QUEUE || dbp->type == DB_RECNO); + + ret = __db_ferr(env, "DB->put", 0); + goto err; + } + + /* + * The append callback, if one exists, may have allocated + * a new tdata.data buffer. If so, free it. + */ + FREE_IF_NEEDED(env, &tdata); + + /* No need for a cursor put; we're done. */ +#ifdef HAVE_COMPRESSION + } else if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) && + !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) { + ret = __dbc_put(dbc, key, data, flags); +#endif + } else if (LF_ISSET(DB_MULTIPLE)) { + ret = 0; + memset(&tkey, 0, sizeof(tkey)); + if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) { + tkey.data = &recno; + tkey.size = sizeof(recno); + } + memset(&tdata, 0, sizeof(tdata)); + DB_MULTIPLE_INIT(bulk_kptr, key); + DB_MULTIPLE_INIT(bulk_ptr, data); + key->doff = 0; + while (ret == 0) { + if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) + DB_MULTIPLE_RECNO_NEXT(bulk_kptr, key, + recno, tdata.data, tdata.size); + else + DB_MULTIPLE_NEXT(bulk_kptr, key, + tkey.data, tkey.size); + DB_MULTIPLE_NEXT(bulk_ptr, data, + tdata.data, tdata.size); + if (bulk_kptr == NULL || bulk_ptr == NULL) + break; + ret = __dbc_put(dbc, &tkey, &tdata, + LF_ISSET(DB_OPFLAGS_MASK)); + if (ret == 0) + ++key->doff; + } + } else if (LF_ISSET(DB_MULTIPLE_KEY)) { + ret = 0; + memset(&tkey, 0, sizeof(tkey)); + if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) { + tkey.data = &recno; + tkey.size = sizeof(recno); + } + memset(&tdata, 0, sizeof(tdata)); + DB_MULTIPLE_INIT(bulk_ptr, key); + 
while (ret == 0) { + if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) + DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, recno, + tdata.data, tdata.size); + else + DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, tkey.data, + tkey.size, tdata.data, tdata.size); + if (bulk_ptr == NULL) + break; + ret = __dbc_put(dbc, &tkey, &tdata, + LF_ISSET(DB_OPFLAGS_MASK)); + if (ret == 0) + ++key->doff; + } + } else + ret = __dbc_put(dbc, key, data, flags); + +err: /* Close the cursor. */ + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_del -- + * Delete the items referenced by a key. + * + * PUBLIC: int __db_del __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, u_int32_t)); + */ +int +__db_del(dbp, ip, txn, key, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DBC *dbc; + DBT data, tkey; + void *bulk_ptr; + db_recno_t recno; + u_int32_t cursor_flags, f_init, f_next; + int ret, t_ret; + + COMPQUIET(bulk_ptr, NULL); + /* Allocate a cursor. */ + cursor_flags = DB_WRITELOCK; + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) + cursor_flags |= DB_CURSOR_BULK; + if ((ret = __db_cursor(dbp, ip, txn, &dbc, cursor_flags)) != 0) + goto err; + + DEBUG_LWRITE(dbc, txn, "DB->del", key, NULL, flags); + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp) && !F_ISSET(dbp, DB_AM_SECONDARY) && + !DB_IS_PRIMARY(dbp) && LIST_FIRST(&dbp->f_primaries) == NULL) { + F_SET(dbc, DBC_TRANSIENT); + ret = __dbc_bulk_del(dbc, key, flags); + goto err; + } +#endif + + /* + * Walk a cursor through the key/data pairs, deleting as we go. Set + * the DB_DBT_USERMEM flag, as this might be a threaded application + * and the flags checking will catch us. We don't actually want the + * keys or data, set DB_DBT_ISSET. We rely on __dbc_get to clear + * this. + */ + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_USERMEM); + tkey = *key; + + f_init = LF_ISSET(DB_MULTIPLE_KEY) ? 
DB_GET_BOTH : DB_SET; + f_next = DB_NEXT_DUP; + + /* + * If locking (and we haven't already acquired CDB locks), set the + * read-modify-write flag. + */ + if (STD_LOCKING(dbc)) { + f_init |= DB_RMW; + f_next |= DB_RMW; + } + + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) { + memset(&tkey, 0, sizeof(tkey)); + tkey.data = &recno; + tkey.size = sizeof(recno); + } + DB_MULTIPLE_INIT(bulk_ptr, key); + /* We return the number of keys deleted in doff. */ + key->doff = 0; +bulk_next: if (dbp->type == DB_QUEUE || dbp->type == DB_RECNO) + DB_MULTIPLE_RECNO_NEXT(bulk_ptr, key, + recno, data.data, data.size); + else if (LF_ISSET(DB_MULTIPLE)) + DB_MULTIPLE_NEXT(bulk_ptr, key, tkey.data, tkey.size); + else + DB_MULTIPLE_KEY_NEXT(bulk_ptr, key, + tkey.data, tkey.size, data.data, data.size); + if (bulk_ptr == NULL) + goto err; + } + + /* We're not interested in the data -- do not return it. */ + F_SET(&tkey, DB_DBT_ISSET); + F_SET(&data, DB_DBT_ISSET); + + /* + * Optimize the simple cases. For all AMs if we don't have secondaries + * and are not a secondary and we aren't a foreign database and there + * are no dups then we can avoid a bunch of overhead. For queue we + * don't need to fetch the record since we delete by direct calculation + * from the record number. + * + * Hash permits an optimization in DB->del: since on-page duplicates are + * stored in a single HKEYDATA structure, it's possible to delete an + * entire set of them at once, and as the HKEYDATA has to be rebuilt + * and re-put each time it changes, this is much faster than deleting + * the duplicates one by one. Thus, if not pointing at an off-page + * duplicate set, and we're not using secondary indices (in which case + * we'd have to examine the items one by one anyway), let hash do this + * "quick delete". + * + * !!! + * Note that this is the only application-executed delete call in + * Berkeley DB that does not go through the __dbc_del function. 
+ * If anything other than the delete itself (like a secondary index + * update) has to happen there in a particular situation, the + * conditions here should be modified not to use these optimizations. + * The ordinary AM-independent alternative will work just fine; + * it'll just be slower. + */ + if (!F_ISSET(dbp, DB_AM_SECONDARY) && !DB_IS_PRIMARY(dbp) && + LIST_FIRST(&dbp->f_primaries) == NULL) { +#ifdef HAVE_QUEUE + if (dbp->type == DB_QUEUE) { + ret = __qam_delete(dbc, &tkey, flags); + goto next; + } +#endif + + /* Fetch the first record. */ + if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0) + goto err; + +#ifdef HAVE_HASH + /* + * Hash "quick delete" removes all on-page duplicates. We + * can't do that if deleting specific key/data pairs. + */ + if (dbp->type == DB_HASH && !LF_ISSET(DB_MULTIPLE_KEY)) { + DBC *sdbc; + sdbc = dbc; +#ifdef HAVE_PARTITION + if (F_ISSET(dbc, DBC_PARTITIONED)) + sdbc = + ((PART_CURSOR*)dbc->internal)->sub_cursor; +#endif + if (sdbc->internal->opd == NULL) { + ret = __ham_quick_delete(sdbc); + goto next; + } + } +#endif + + if (!F_ISSET(dbp, DB_AM_DUP)) { + ret = dbc->am_del(dbc, 0); + goto next; + } + } else if ((ret = __dbc_get(dbc, &tkey, &data, f_init)) != 0) + goto err; + + /* Walk through the set of key/data pairs, deleting as we go. */ + for (;;) { + if ((ret = __dbc_del(dbc, flags)) != 0) + break; + /* + * With DB_MULTIPLE_KEY, the application has specified the + * exact records they want deleted. We don't need to walk + * through a set of duplicates. + */ + if (LF_ISSET(DB_MULTIPLE_KEY)) + break; + + F_SET(&tkey, DB_DBT_ISSET); + F_SET(&data, DB_DBT_ISSET); + if ((ret = __dbc_get(dbc, &tkey, &data, f_next)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + break; + } + } + +next: if (ret == 0 && LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + ++key->doff; + goto bulk_next; + } +err: /* Discard the cursor. 
 */
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __db_sync --
 *	Flush the database cache.
 *
 *	Write-back order: the Recno source file (if any) first, then the
 *	dirty cache pages.  The first error seen is remembered in ret and
 *	returned, but later steps still run so the flush is best-effort.
 *
 * PUBLIC: int __db_sync __P((DB *));
 */
int
__db_sync(dbp)
	DB *dbp;
{
	int ret, t_ret;

	ret = 0;

	/* If the database was read-only, we're done. */
	if (F_ISSET(dbp, DB_AM_RDONLY))
		return (0);

	/* If it's a Recno tree, write the backing source text file. */
	if (dbp->type == DB_RECNO)
		ret = __ram_writeback(dbp);

	/* If the database was never backed by a database file, we're done. */
	if (F_ISSET(dbp, DB_AM_INMEM))
		return (ret);
#ifdef HAVE_PARTITION
	if (DB_IS_PARTITIONED(dbp))
		/* Partitioned DBs sync each partition handle in turn. */
		ret = __partition_sync(dbp);
	else
#endif
	/*
	 * NOTE(review): queue (and partition above) delegate to an
	 * access-method-specific sync and overwrite any error from
	 * __ram_writeback; presumably those routines also flush the
	 * mpool pages -- confirm before relying on the error priority.
	 */
	if (dbp->type == DB_QUEUE)
		ret = __qam_sync(dbp);
	else
		/* Flush any dirty pages from the cache to the backing file. */
		if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
			ret = t_ret;

	return (ret);
}

/*
 * __db_associate --
 *	Associate another database as a secondary index to this one.
 *
 * PUBLIC: int __db_associate __P((DB *, DB_THREAD_INFO *, DB_TXN *, DB *,
 * PUBLIC:     int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
 */
int
__db_associate(dbp, ip, txn, sdbp, callback, flags)
	DB *dbp, *sdbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
	u_int32_t flags;
{
	DBC *pdbc, *sdbc;
	DBT key, data, skey, *tskeyp;
	ENV *env;
	int build, ret, t_ret;
	u_int32_t nskey;

	env = dbp->env;
	pdbc = sdbc = NULL;
	ret = 0;

	memset(&skey, 0, sizeof(DBT));
	nskey = 0;
	tskeyp = NULL;

	/*
	 * Check to see if the secondary is empty -- and thus if we should
	 * build it -- before we link it in and risk making it show up in other
	 * threads.  Do this first so that the databases remain unassociated on
	 * error.
+ */ + build = 0; + if (LF_ISSET(DB_CREATE)) { + if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, 0)) != 0) + goto err; + + /* + * We don't care about key or data; we're just doing + * an existence check. + */ + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM); + F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = __dbc_get(sdbc, &key, &data, + (STD_LOCKING(sdbc) ? DB_RMW : 0) | + DB_FIRST)) == DB_NOTFOUND) { + build = 1; + ret = 0; + } + + if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + + /* Reset for later error check. */ + sdbc = NULL; + + if (ret != 0) + goto err; + } + + /* + * Set up the database handle as a secondary. + */ + sdbp->s_callback = callback; + sdbp->s_primary = dbp; + + sdbp->stored_get = sdbp->get; + sdbp->get = __db_secondary_get; + + sdbp->stored_close = sdbp->close; + sdbp->close = __db_secondary_close_pp; + + F_SET(sdbp, DB_AM_SECONDARY); + + if (LF_ISSET(DB_IMMUTABLE_KEY)) + FLD_SET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY); + + /* + * Add the secondary to the list on the primary. Do it here + * so that we see any updates that occur while we're walking + * the primary. + */ + MUTEX_LOCK(env, dbp->mutex); + + /* See __db_s_next for an explanation of secondary refcounting. */ + DB_ASSERT(env, sdbp->s_refcnt == 0); + sdbp->s_refcnt = 1; + LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links); + MUTEX_UNLOCK(env, dbp->mutex); + + if (build) { + /* + * We loop through the primary, putting each item we + * find into the new secondary. + * + * If we're using CDB, opening these two cursors puts us + * in a bit of a locking tangle: CDB locks are done on the + * primary, so that we stay deadlock-free, but that means + * that updating the secondary while we have a read cursor + * open on the primary will self-block. To get around this, + * we force the primary cursor to use the same locker ID + * as the secondary, so they won't conflict. 
This should + * be harmless even if we're not using CDB. + */ + if ((ret = __db_cursor(sdbp, ip, txn, &sdbc, + CDB_LOCKING(sdbp->env) ? DB_WRITECURSOR : 0)) != 0) + goto err; + if ((ret = __db_cursor_int(dbp, ip, + txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0) + goto err; + + /* Lock out other threads, now that we have a locker. */ + dbp->associate_locker = sdbc->locker; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + while ((ret = __dbc_get(pdbc, &key, &data, DB_NEXT)) == 0) { + if ((ret = callback(sdbp, &key, &data, &skey)) != 0) { + if (ret == DB_DONOTINDEX) + continue; + goto err; + } + if (F_ISSET(&skey, DB_DBT_MULTIPLE)) { +#ifdef DIAGNOSTIC + __db_check_skeyset(sdbp, &skey); +#endif + nskey = skey.size; + tskeyp = (DBT *)skey.data; + } else { + nskey = 1; + tskeyp = &skey; + } + SWAP_IF_NEEDED(sdbp, &key); + for (; nskey > 0; nskey--, tskeyp++) { + if ((ret = __dbc_put(sdbc, + tskeyp, &key, DB_UPDATE_SECONDARY)) != 0) + goto err; + FREE_IF_NEEDED(env, tskeyp); + } + SWAP_IF_NEEDED(sdbp, &key); + FREE_IF_NEEDED(env, &skey); + } + if (ret == DB_NOTFOUND) + ret = 0; + } + +err: if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + + if (pdbc != NULL && (t_ret = __dbc_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + + dbp->associate_locker = NULL; + + for (; nskey > 0; nskey--, tskeyp++) + FREE_IF_NEEDED(env, tskeyp); + FREE_IF_NEEDED(env, &skey); + + return (ret); +} + +/* + * __db_secondary_get -- + * This wrapper function for DB->pget() is the DB->get() function + * on a database which has been made into a secondary index. 
 */
static int
__db_secondary_get(sdbp, txn, skey, data, flags)
	DB *sdbp;
	DB_TXN *txn;
	DBT *skey, *data;
	u_int32_t flags;
{
	/*
	 * Forward to DB->pget() with a NULL primary-key DBT: the caller
	 * asked for the primary data only, keyed by the secondary key.
	 */
	DB_ASSERT(sdbp->env, F_ISSET(sdbp, DB_AM_SECONDARY));
	return (__db_pget_pp(sdbp, txn, skey, NULL, data, flags));
}

/*
 * __db_secondary_close --
 *	Wrapper function for DB->close() which we use on secondaries to
 *	manage refcounting and make sure we don't close them underneath
 *	a primary that is updating.
 *
 * PUBLIC: int __db_secondary_close __P((DB *, u_int32_t));
 */
int
__db_secondary_close(sdbp, flags)
	DB *sdbp;
	u_int32_t flags;
{
	DB *primary;
	ENV *env;
	int doclose;

	doclose = 0;
	primary = sdbp->s_primary;
	env = primary->env;

	/* The refcount is protected by the primary's handle mutex. */
	MUTEX_LOCK(env, primary->mutex);
	/*
	 * Check the refcount--if it was at 1 when we were called, no
	 * thread is currently updating this secondary through the primary,
	 * so it's safe to close it for real.
	 *
	 * If it's not safe to do the close now, we do nothing; the
	 * database will actually be closed when the refcount is decremented,
	 * which can happen in either __db_s_next or __db_s_done.
	 */
	DB_ASSERT(env, sdbp->s_refcnt != 0);
	if (--sdbp->s_refcnt == 0) {
		/* Unlink from the primary's list of secondaries. */
		LIST_REMOVE(sdbp, s_links);
		/* We don't want to call close while the mutex is held. */
		doclose = 1;
	}
	MUTEX_UNLOCK(env, primary->mutex);

	/*
	 * sdbp->close is this function; call the real one explicitly if
	 * need be.
	 */
	return (doclose ? __db_close(sdbp, NULL, flags) : 0);
}

/*
 * __db_associate_foreign --
 *	Associate this database (fdbp) as a foreign constraint to another
 *	database (pdbp).  That is, dbp's keys appear as foreign key values in
 *	pdbp.
+ * + * PUBLIC: int __db_associate_foreign __P((DB *, DB *, + * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *), + * PUBLIC: u_int32_t)); + */ +int +__db_associate_foreign(fdbp, pdbp, callback, flags) + DB *fdbp, *pdbp; + int (*callback)(DB *, const DBT *, DBT *, const DBT *, int *); + u_int32_t flags; +{ + DB_FOREIGN_INFO *f_info; + ENV *env; + int ret; + + env = fdbp->env; + ret = 0; + + if ((ret = __os_malloc(env, sizeof(DB_FOREIGN_INFO), &f_info)) != 0) { + return ret; + } + memset(f_info, 0, sizeof(DB_FOREIGN_INFO)); + + f_info->dbp = pdbp; + f_info->callback = callback; + + /* + * It might be wise to filter this, but for now the flags only + * set the delete action type. + */ + FLD_SET(f_info->flags, flags); + + /* + * Add f_info to the foreign database's list of primaries. That is to + * say, fdbp->f_primaries lists all databases for which fdbp is a + * foreign constraint. + */ + MUTEX_LOCK(env, fdbp->mutex); + LIST_INSERT_HEAD(&fdbp->f_primaries, f_info, f_links); + MUTEX_UNLOCK(env, fdbp->mutex); + + /* + * Associate fdbp as pdbp's foreign db, for referential integrity + * checks. We don't allow the foreign db to be changed, because we + * currently have no way of removing pdbp from the old foreign db's list + * of primaries. + */ + if (pdbp->s_foreign != NULL) + return (EINVAL); + pdbp->s_foreign = fdbp; + + return (ret); +} + +static int +__dbc_set_priority(dbc, priority) + DBC *dbc; + DB_CACHE_PRIORITY priority; +{ + dbc->priority = priority; + return (0); +} + +static int +__dbc_get_priority(dbc, priority) + DBC *dbc; + DB_CACHE_PRIORITY *priority; +{ + *priority = dbc->priority; + return (0); +} diff --git a/db/db_auto.c b/db/db_auto.c new file mode 100644 index 0000000..2ce4199 --- /dev/null +++ b/db/db_auto.c @@ -0,0 +1,3267 @@ +/* Do not edit: automatically built by gen_rec.awk. 
 */

#include "db_config.h"
#include "db_int.h"
#include "dbinc/crypto.h"
#include "dbinc/db_page.h"
#include "dbinc/db_dispatch.h"
#include "dbinc/db_am.h"
#include "dbinc/log.h"
#include "dbinc/txn.h"

/*
 * __db_addrem_read --
 *	Unmarshal a __db_addrem log record from recbuf into a freshly
 *	allocated __db_addrem_args structure (a dummy DB_TXN is carved out
 *	of the same allocation).  On success *argpp is set and the caller
 *	owns the allocation.  The hdr/dbt DBT data pointers point INTO
 *	recbuf, so recbuf must outlive *argpp.
 *
 * PUBLIC: int __db_addrem_read __P((ENV *, DB **, void *, void *,
 * PUBLIC:     __db_addrem_args **));
 */
int
__db_addrem_read(env, dbpp, td, recbuf, argpp)
	ENV *env;
	DB **dbpp;
	void *td;
	void *recbuf;
	__db_addrem_args **argpp;
{
	__db_addrem_args *argp;
	u_int32_t uinttmp;
	u_int8_t *bp;
	int ret;

	/* One allocation holds both the args struct and its dummy txn. */
	if ((ret = __os_malloc(env,
	    sizeof(__db_addrem_args) + sizeof(DB_TXN), &argp)) != 0)
		return (ret);
	bp = recbuf;
	argp->txnp = (DB_TXN *)&argp[1];
	memset(argp->txnp, 0, sizeof(DB_TXN));

	argp->txnp->td = td;
	/* Fixed-size header: record type, txnid, previous LSN. */
	LOGCOPY_32(env, &argp->type, bp);
	bp += sizeof(argp->type);

	LOGCOPY_32(env, &argp->txnp->txnid, bp);
	bp += sizeof(argp->txnp->txnid);

	LOGCOPY_TOLSN(env, &argp->prev_lsn, bp);
	bp += sizeof(DB_LSN);

	LOGCOPY_32(env, &argp->opcode, bp);
	bp += sizeof(argp->opcode);

	LOGCOPY_32(env, &uinttmp, bp);
	argp->fileid = (int32_t)uinttmp;
	bp += sizeof(uinttmp);
	/* Optionally resolve the log file id to an open DB handle. */
	if (dbpp != NULL) {
		*dbpp = NULL;
		ret = __dbreg_id_to_db(
		    env, argp->txnp, dbpp, argp->fileid, 1);
	}

	LOGCOPY_32(env, &uinttmp, bp);
	argp->pgno = (db_pgno_t)uinttmp;
	bp += sizeof(uinttmp);

	LOGCOPY_32(env, &argp->indx, bp);
	bp += sizeof(argp->indx);

	LOGCOPY_32(env, &argp->nbytes, bp);
	bp += sizeof(argp->nbytes);

	/* Variable-size DBTs: 32-bit length followed by in-place data. */
	memset(&argp->hdr, 0, sizeof(argp->hdr));
	LOGCOPY_32(env,&argp->hdr.size, bp);
	bp += sizeof(u_int32_t);
	argp->hdr.data = bp;
	bp += argp->hdr.size;

	memset(&argp->dbt, 0, sizeof(argp->dbt));
	LOGCOPY_32(env,&argp->dbt.size, bp);
	bp += sizeof(u_int32_t);
	argp->dbt.data = bp;
	bp += argp->dbt.size;

	LOGCOPY_TOLSN(env, &argp->pagelsn, bp);
	bp += sizeof(DB_LSN);

	*argpp = argp;
	/* May carry a non-zero ret from __dbreg_id_to_db above. */
	return (ret);
}

/*
 * PUBLIC: int __db_addrem_log __P((DB *, DB_TXN *, DB_LSN *,
 * PUBLIC: u_int32_t,
u_int32_t, db_pgno_t, u_int32_t, u_int32_t, + * PUBLIC: const DBT *, const DBT *, DB_LSN *)); + */ +int +__db_addrem_log(dbp, txnp, ret_lsnp, flags, + opcode, pgno, indx, nbytes, hdr, + dbt, pagelsn) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + db_pgno_t pgno; + u_int32_t indx; + u_int32_t nbytes; + const DBT *hdr; + const DBT *dbt; + DB_LSN * pagelsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_addrem; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (hdr == NULL ? 0 : hdr->size) + + sizeof(u_int32_t) + (dbt == NULL ? 
0 : dbt->size) + + sizeof(*pagelsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, bp, &opcode); + bp += sizeof(opcode); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &indx); + bp += sizeof(indx); + + LOGCOPY_32(env, bp, &nbytes); + bp += sizeof(nbytes); + + if (hdr == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &hdr->size); + bp += sizeof(hdr->size); + memcpy(bp, hdr->data, hdr->size); + bp += hdr->size; + } + + if (dbt == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &dbt->size); + bp += sizeof(dbt->size); + memcpy(bp, dbt->data, dbt->size); + bp += dbt->size; + } + + if (pagelsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(pagelsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, pagelsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, pagelsn); + } else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + + DB_ASSERT(env, + (u_int32_t)(bp - 
(u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_addrem_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_big_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_big_args **)); + */ +int +__db_big_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_big_args **argpp; +{ + __db_big_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_big_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->opcode, bp); + bp += sizeof(argp->opcode); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = 
__dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->prev_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->dbt, 0, sizeof(argp->dbt)); + LOGCOPY_32(env,&argp->dbt.size, bp); + bp += sizeof(u_int32_t); + argp->dbt.data = bp; + bp += argp->dbt.size; + + LOGCOPY_TOLSN(env, &argp->pagelsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_TOLSN(env, &argp->prevlsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_TOLSN(env, &argp->nextlsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_big_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, u_int32_t, db_pgno_t, db_pgno_t, db_pgno_t, + * PUBLIC: const DBT *, DB_LSN *, DB_LSN *, DB_LSN *)); + */ +int +__db_big_log(dbp, txnp, ret_lsnp, flags, + opcode, pgno, prev_pgno, next_pgno, dbt, + pagelsn, prevlsn, nextlsn) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + db_pgno_t pgno; + db_pgno_t prev_pgno; + db_pgno_t next_pgno; + const DBT *dbt; + DB_LSN * pagelsn; + DB_LSN * prevlsn; + DB_LSN * nextlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_big; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign 
begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (dbt == NULL ? 0 : dbt->size) + + sizeof(*pagelsn) + + sizeof(*prevlsn) + + sizeof(*nextlsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, bp, &opcode); + bp += sizeof(opcode); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)prev_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)next_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (dbt == NULL) { + zero = 0; + LOGCOPY_32(env, bp, 
&zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &dbt->size); + bp += sizeof(dbt->size); + memcpy(bp, dbt->data, dbt->size); + bp += dbt->size; + } + + if (pagelsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(pagelsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, pagelsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, pagelsn); + } else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + + if (prevlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(prevlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, prevlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, prevlsn); + } else + memset(bp, 0, sizeof(*prevlsn)); + bp += sizeof(*prevlsn); + + if (nextlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(nextlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, nextlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, nextlsn); + } else + memset(bp, 0, sizeof(*nextlsn)); + bp += sizeof(*nextlsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_big_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_ovref_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_ovref_args **)); + */ +int +__db_ovref_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_ovref_args **argpp; +{ + __db_ovref_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_ovref_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->adjust = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_ovref_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, 
int32_t, DB_LSN *)); + */ +int +__db_ovref_log(dbp, txnp, ret_lsnp, flags, pgno, adjust, lsn) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + int32_t adjust; + DB_LSN * lsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_ovref; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*lsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)adjust; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, lsn); + } else + memset(bp, 0, sizeof(*lsn)); + bp += sizeof(*lsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp 
!= ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_ovref_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_relink_42_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __db_relink_42_args **)); + */ +int +__db_relink_42_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_relink_42_args **argpp; +{ + __db_relink_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_relink_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &argp->opcode, bp); + bp += sizeof(argp->opcode); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + 
LOGCOPY_TOLSN(env, &argp->lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->prev = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn_prev, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->lsn_next, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_debug_read __P((ENV *, void *, __db_debug_args **)); + */ +int +__db_debug_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __db_debug_args **argpp; +{ + __db_debug_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_debug_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + memset(&argp->op, 0, sizeof(argp->op)); + LOGCOPY_32(env,&argp->op.size, bp); + bp += sizeof(u_int32_t); + argp->op.data = bp; + bp += argp->op.size; + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->key, 0, sizeof(argp->key)); + LOGCOPY_32(env,&argp->key.size, bp); + bp += sizeof(u_int32_t); + argp->key.data = bp; + bp += argp->key.size; + + memset(&argp->data, 0, sizeof(argp->data)); + LOGCOPY_32(env,&argp->data.size, bp); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + + LOGCOPY_32(env, &argp->arg_flags, bp); + bp += sizeof(argp->arg_flags); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_debug_log __P((ENV *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, const DBT *, int32_t, const DBT *, const DBT *, + * PUBLIC: u_int32_t)); + */ +int +__db_debug_log(env, txnp, ret_lsnp, 
flags, + op, fileid, key, data, arg_flags) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + const DBT *op; + int32_t fileid; + const DBT *key; + const DBT *data; + u_int32_t arg_flags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___db_debug; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + (op == NULL ? 0 : op->size) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (key == NULL ? 0 : key->size) + + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + if (op == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &op->size); + bp += sizeof(op->size); + memcpy(bp, op->data, op->size); + bp += op->size; + } + + uinttmp = (u_int32_t)fileid; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (key == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &key->size); + bp += sizeof(key->size); + memcpy(bp, key->data, key->size); + bp += key->size; + } + + if (data == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &data->size); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + bp += data->size; + } + + LOGCOPY_32(env, bp, &arg_flags); + bp += sizeof(arg_flags); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the 
debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_debug_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_noop_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_noop_args **)); + */ +int +__db_noop_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_noop_args **argpp; +{ + __db_noop_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_noop_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->prevlsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_noop_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, 
DB_LSN *)); + */ +int +__db_noop_log(dbp, txnp, ret_lsnp, flags, pgno, prevlsn) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * prevlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_noop; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*prevlsn); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (prevlsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(prevlsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, prevlsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, prevlsn); + } else + memset(bp, 0, sizeof(*prevlsn)); + bp += sizeof(*prevlsn); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* 
+ * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_noop_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_pg_alloc_42_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __db_pg_alloc_42_args **)); + */ +int +__db_pg_alloc_42_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_alloc_42_args **argpp; +{ + __db_pg_alloc_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_alloc_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->page_lsn, bp); + bp += sizeof(DB_LSN); + + 
LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->ptype, bp); + bp += sizeof(argp->ptype); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_alloc_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_pg_alloc_args **)); + */ +int +__db_pg_alloc_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_alloc_args **argpp; +{ + __db_pg_alloc_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_alloc_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->page_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &argp->ptype, bp); + bp += sizeof(argp->ptype); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_alloc_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, 
DB_LSN *, db_pgno_t, DB_LSN *, db_pgno_t, u_int32_t, + * PUBLIC: db_pgno_t, db_pgno_t)); + */ +int +__db_pg_alloc_log(dbp, txnp, ret_lsnp, flags, meta_lsn, meta_pgno, page_lsn, pgno, ptype, + next, last_pgno) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + DB_LSN * meta_lsn; + db_pgno_t meta_pgno; + DB_LSN * page_lsn; + db_pgno_t pgno; + u_int32_t ptype; + db_pgno_t next; + db_pgno_t last_pgno; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_pg_alloc; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + + sizeof(*page_lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + if (meta_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, meta_lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, meta_lsn); + } else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + + uinttmp = (u_int32_t)meta_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (page_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(page_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, page_lsn)) != 0) + return 
(ret); + } + LOGCOPY_FROMLSN(env, bp, page_lsn); + } else + memset(bp, 0, sizeof(*page_lsn)); + bp += sizeof(*page_lsn); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + LOGCOPY_32(env, bp, &ptype); + bp += sizeof(ptype); + + uinttmp = (u_int32_t)next; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)last_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_pg_alloc_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_pg_free_42_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __db_pg_free_42_args **)); + */ +int +__db_pg_free_42_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_free_42_args **argpp; +{ + __db_pg_free_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_free_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = 
recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->header, 0, sizeof(argp->header)); + LOGCOPY_32(env,&argp->header.size, bp); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_free_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_pg_free_args **)); + */ +int +__db_pg_free_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_free_args **argpp; +{ + __db_pg_free_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_free_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; 
+ ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->header, 0, sizeof(argp->header)); + LOGCOPY_32(env,&argp->header.size, bp); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, (PAGE *)argp->header.data, + (size_t)argp->header.size, NULL, 1)) != 0) + return (t_ret); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_free_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, const DBT *, + * PUBLIC: db_pgno_t, db_pgno_t)); + */ +int +__db_pg_free_log(dbp, txnp, ret_lsnp, flags, pgno, meta_lsn, meta_pgno, header, next, + last_pgno) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * meta_lsn; + db_pgno_t meta_pgno; + const DBT *header; + db_pgno_t next; + db_pgno_t last_pgno; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_pg_free; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if 
(TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (header == NULL ? 0 : header->size) + + sizeof(u_int32_t) + + sizeof(u_int32_t); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (meta_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, meta_lsn)) != 0) + return 
(ret); + } + LOGCOPY_FROMLSN(env, bp, meta_lsn); + } else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + + uinttmp = (u_int32_t)meta_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (header == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &header->size); + bp += sizeof(header->size); + memcpy(bp, header->data, header->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)header->size, (DBT *)NULL, 0)) != 0) + return (ret); + bp += header->size; + } + + uinttmp = (u_int32_t)next; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)last_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_pg_free_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_cksum_read __P((ENV *, void *, __db_cksum_args **)); + */ +int +__db_cksum_read(env, recbuf, argpp) + ENV *env; + void *recbuf; + __db_cksum_args **argpp; +{ + __db_cksum_args *argp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_cksum_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_cksum_log __P((ENV *, DB_TXN *, DB_LSN *, u_int32_t)); + */ +int +__db_cksum_log(env, txnp, ret_lsnp, flags) + ENV *env; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + u_int32_t rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + rlsnp = ret_lsnp; + rectype = DB___db_cksum; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; 
+ } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_cksum_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_pg_freedata_42_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __db_pg_freedata_42_args **)); + */ +int +__db_pg_freedata_42_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_freedata_42_args **argpp; +{ + __db_pg_freedata_42_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_freedata_42_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->header, 0, sizeof(argp->header)); + 
LOGCOPY_32(env,&argp->header.size, bp); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->data, 0, sizeof(argp->data)); + LOGCOPY_32(env,&argp->data.size, bp); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_freedata_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __db_pg_freedata_args **)); + */ +int +__db_pg_freedata_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_freedata_args **argpp; +{ + __db_pg_freedata_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_freedata_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->header, 0, sizeof(argp->header)); + LOGCOPY_32(env,&argp->header.size, bp); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + + LOGCOPY_32(env, &uinttmp, bp); + argp->next = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_pgno 
= (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->data, 0, sizeof(argp->data)); + LOGCOPY_32(env,&argp->data.size, bp); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, + (PAGE *)argp->header.data, (size_t)argp->header.size, + &argp->data, 1)) != 0) + return (t_ret); + } + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_freedata_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, const DBT *, + * PUBLIC: db_pgno_t, db_pgno_t, const DBT *)); + */ +int +__db_pg_freedata_log(dbp, txnp, ret_lsnp, flags, pgno, meta_lsn, meta_pgno, header, next, + last_pgno, data) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + DB_LSN * meta_lsn; + db_pgno_t meta_pgno; + const DBT *header; + db_pgno_t next; + db_pgno_t last_pgno; + const DBT *data; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_pg_freedata; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (header == NULL ? 0 : header->size) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (data == NULL ? 0 : data->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (meta_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, meta_lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, meta_lsn); + } else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + + uinttmp = (u_int32_t)meta_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (header == NULL) { + zero = 0; + 
LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &header->size); + bp += sizeof(header->size); + memcpy(bp, header->data, header->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)header->size, (DBT *)data, 0)) != 0) + return (ret); + bp += header->size; + } + + uinttmp = (u_int32_t)next; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)last_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (data == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &data->size); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC)) + __os_free(env, data->data); + bp += data->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_pg_freedata_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_pg_init_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_pg_init_args **)); + */ +int +__db_pg_init_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_init_args **argpp; +{ + __db_pg_init_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_init_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->header, 0, sizeof(argp->header)); + LOGCOPY_32(env,&argp->header.size, bp); + bp += sizeof(u_int32_t); + argp->header.data = bp; + bp += argp->header.size; + + memset(&argp->data, 0, sizeof(argp->data)); + LOGCOPY_32(env,&argp->data.size, bp); + bp += 
sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + if (LOG_SWAPPED(env) && dbpp != NULL && *dbpp != NULL) { + int t_ret; + if ((t_ret = __db_pageswap(*dbpp, + (PAGE *)argp->header.data, (size_t)argp->header.size, + &argp->data, 1)) != 0) + return (t_ret); + } + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_init_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, const DBT *, const DBT *)); + */ +int +__db_pg_init_log(dbp, txnp, ret_lsnp, flags, pgno, header, data) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t pgno; + const DBT *header; + const DBT *data; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_pg_init; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. + */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (header == NULL ? 0 : header->size) + + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (header == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &header->size); + bp += sizeof(header->size); + memcpy(bp, header->data, header->size); + if (LOG_SWAPPED(env)) + if ((ret = __db_pageswap(dbp, + (PAGE *)bp, (size_t)header->size, (DBT *)data, 0)) != 0) + return (ret); + bp += header->size; + } + + if (data == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &data->size); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + if (LOG_SWAPPED(env) && F_ISSET(data, DB_DBT_APPMALLOC)) + __os_free(env, data->data); + bp += data->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; 
+#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. + */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_pg_init_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_pg_sort_44_read __P((ENV *, DB **, void *, + * PUBLIC: void *, __db_pg_sort_44_args **)); + */ +int +__db_pg_sort_44_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_sort_44_args **argpp; +{ + __db_pg_sort_44_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_sort_44_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_free = 
(db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->last_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->list, 0, sizeof(argp->list)); + LOGCOPY_32(env,&argp->list.size, bp); + bp += sizeof(u_int32_t); + argp->list.data = bp; + bp += argp->list.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_trunc_read __P((ENV *, DB **, void *, void *, + * PUBLIC: __db_pg_trunc_args **)); + */ +int +__db_pg_trunc_read(env, dbpp, td, recbuf, argpp) + ENV *env; + DB **dbpp; + void *td; + void *recbuf; + __db_pg_trunc_args **argpp; +{ + __db_pg_trunc_args *argp; + u_int32_t uinttmp; + u_int8_t *bp; + int ret; + + if ((ret = __os_malloc(env, + sizeof(__db_pg_trunc_args) + sizeof(DB_TXN), &argp)) != 0) + return (ret); + bp = recbuf; + argp->txnp = (DB_TXN *)&argp[1]; + memset(argp->txnp, 0, sizeof(DB_TXN)); + + argp->txnp->td = td; + LOGCOPY_32(env, &argp->type, bp); + bp += sizeof(argp->type); + + LOGCOPY_32(env, &argp->txnp->txnid, bp); + bp += sizeof(argp->txnp->txnid); + + LOGCOPY_TOLSN(env, &argp->prev_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->fileid = (int32_t)uinttmp; + bp += sizeof(uinttmp); + if (dbpp != NULL) { + *dbpp = NULL; + ret = __dbreg_id_to_db( + env, argp->txnp, dbpp, argp->fileid, 1); + } + + LOGCOPY_32(env, &uinttmp, bp); + argp->meta = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->meta_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_free = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_TOLSN(env, &argp->last_lsn, bp); + bp += sizeof(DB_LSN); + + LOGCOPY_32(env, &uinttmp, bp); + argp->next_free = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + LOGCOPY_32(env, &uinttmp, bp); + argp->last_pgno = (db_pgno_t)uinttmp; + bp += sizeof(uinttmp); + + memset(&argp->list, 0, sizeof(argp->list)); + 
LOGCOPY_32(env,&argp->list.size, bp); + bp += sizeof(u_int32_t); + argp->list.data = bp; + bp += argp->list.size; + + *argpp = argp; + return (ret); +} + +/* + * PUBLIC: int __db_pg_trunc_log __P((DB *, DB_TXN *, DB_LSN *, + * PUBLIC: u_int32_t, db_pgno_t, DB_LSN *, db_pgno_t, DB_LSN *, db_pgno_t, + * PUBLIC: db_pgno_t, const DBT *)); + */ +int +__db_pg_trunc_log(dbp, txnp, ret_lsnp, flags, meta, meta_lsn, last_free, last_lsn, next_free, + last_pgno, list) + DB *dbp; + DB_TXN *txnp; + DB_LSN *ret_lsnp; + u_int32_t flags; + db_pgno_t meta; + DB_LSN * meta_lsn; + db_pgno_t last_free; + DB_LSN * last_lsn; + db_pgno_t next_free; + db_pgno_t last_pgno; + const DBT *list; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn, *rlsnp; + DB_TXNLOGREC *lr; + ENV *env; + u_int32_t zero, uinttmp, rectype, txn_num; + u_int npad; + u_int8_t *bp; + int is_durable, ret; + + COMPQUIET(lr, NULL); + + env = dbp->env; + rlsnp = ret_lsnp; + rectype = DB___db_pg_trunc; + npad = 0; + ret = 0; + + if (LF_ISSET(DB_LOG_NOT_DURABLE) || + F_ISSET(dbp, DB_AM_NOT_DURABLE)) { + if (txnp == NULL) + return (0); + is_durable = 0; + } else + is_durable = 1; + + if (txnp == NULL) { + txn_num = 0; + lsnp = &null_lsn; + null_lsn.file = null_lsn.offset = 0; + } else { + if (TAILQ_FIRST(&txnp->kids) != NULL && + (ret = __txn_activekids(env, rectype, txnp)) != 0) + return (ret); + /* + * We need to assign begin_lsn while holding region mutex. + * That assignment is done inside the DbEnv->log_put call, + * so pass in the appropriate memory location to be filled + * in by the log_put code. 
+ */ + DB_SET_TXN_LSNP(txnp, &rlsnp, &lsnp); + txn_num = txnp->txnid; + } + + DB_ASSERT(env, dbp->log_filename != NULL); + if (dbp->log_filename->id == DB_LOGFILEID_INVALID && + (ret = __dbreg_lazy_id(dbp)) != 0) + return (ret); + + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(*meta_lsn) + + sizeof(u_int32_t) + + sizeof(*last_lsn) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + + sizeof(u_int32_t) + (list == NULL ? 0 : list->size); + if (CRYPTO_ON(env)) { + npad = env->crypto_handle->adj_size(logrec.size); + logrec.size += npad; + } + + if (is_durable || txnp == NULL) { + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) + return (ret); + } else { + if ((ret = __os_malloc(env, + logrec.size + sizeof(DB_TXNLOGREC), &lr)) != 0) + return (ret); +#ifdef DIAGNOSTIC + if ((ret = + __os_malloc(env, logrec.size, &logrec.data)) != 0) { + __os_free(env, lr); + return (ret); + } +#else + logrec.data = lr->data; +#endif + } + if (npad > 0) + memset((u_int8_t *)logrec.data + logrec.size - npad, 0, npad); + + bp = logrec.data; + + LOGCOPY_32(env, bp, &rectype); + bp += sizeof(rectype); + + LOGCOPY_32(env, bp, &txn_num); + bp += sizeof(txn_num); + + LOGCOPY_FROMLSN(env, bp, lsnp); + bp += sizeof(DB_LSN); + + uinttmp = (u_int32_t)dbp->log_filename->id; + LOGCOPY_32(env, bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)meta; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (meta_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = env->lg_handle->reginfo.primary; + if (LOG_COMPARE(meta_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, meta_lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, meta_lsn); + } else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + + uinttmp = (u_int32_t)last_free; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (last_lsn != NULL) { + if (txnp != NULL) { + LOG *lp = 
env->lg_handle->reginfo.primary; + if (LOG_COMPARE(last_lsn, &lp->lsn) >= 0 && (ret = + __log_check_page_lsn(env, dbp, last_lsn)) != 0) + return (ret); + } + LOGCOPY_FROMLSN(env, bp, last_lsn); + } else + memset(bp, 0, sizeof(*last_lsn)); + bp += sizeof(*last_lsn); + + uinttmp = (u_int32_t)next_free; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + uinttmp = (u_int32_t)last_pgno; + LOGCOPY_32(env,bp, &uinttmp); + bp += sizeof(uinttmp); + + if (list == NULL) { + zero = 0; + LOGCOPY_32(env, bp, &zero); + bp += sizeof(u_int32_t); + } else { + LOGCOPY_32(env, bp, &list->size); + bp += sizeof(list->size); + memcpy(bp, list->data, list->size); + bp += list->size; + } + + DB_ASSERT(env, + (u_int32_t)(bp - (u_int8_t *)logrec.data) <= logrec.size); + + if (is_durable || txnp == NULL) { + if ((ret = __log_put(env, rlsnp,(DBT *)&logrec, + flags | DB_LOG_NOCOPY)) == 0 && txnp != NULL) { + *lsnp = *rlsnp; + if (rlsnp != ret_lsnp) + *ret_lsnp = *rlsnp; + } + } else { + ret = 0; +#ifdef DIAGNOSTIC + /* + * Set the debug bit if we are going to log non-durable + * transactions so they will be ignored by recovery. 
+ */ + memcpy(lr->data, logrec.data, logrec.size); + rectype |= DB_debug_FLAG; + LOGCOPY_32(env, logrec.data, &rectype); + + if (!IS_REP_CLIENT(env)) + ret = __log_put(env, + rlsnp, (DBT *)&logrec, flags | DB_LOG_NOCOPY); +#endif + STAILQ_INSERT_HEAD(&txnp->logs, lr, links); + F_SET((TXN_DETAIL *)txnp->td, TXN_DTL_INMEMORY); + LSN_NOT_LOGGED(*ret_lsnp); + } + +#ifdef LOG_DIAGNOSTIC + if (ret != 0) + (void)__db_pg_trunc_print(env, + (DBT *)&logrec, ret_lsnp, DB_TXN_PRINT, NULL); +#endif + +#ifdef DIAGNOSTIC + __os_free(env, logrec.data); +#else + if (is_durable || txnp == NULL) + __os_free(env, logrec.data); +#endif + return (ret); +} + +/* + * PUBLIC: int __db_init_recover __P((ENV *, DB_DISTAB *)); + */ +int +__db_init_recover(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __db_addrem_recover, DB___db_addrem)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_big_recover, DB___db_big)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_ovref_recover, DB___db_ovref)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_debug_recover, DB___db_debug)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_noop_recover, DB___db_noop)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_alloc_recover, DB___db_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_free_recover, DB___db_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_cksum_recover, DB___db_cksum)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_freedata_recover, DB___db_pg_freedata)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_init_recover, DB___db_pg_init)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_trunc_recover, DB___db_pg_trunc)) != 0) + return 
(ret); + return (0); +} diff --git a/db/db_autop.c b/db/db_autop.c new file mode 100644 index 0000000..f3b0635 --- /dev/null +++ b/db/db_autop.c @@ -0,0 +1,802 @@ +/* Do not edit: automatically built by gen_rec.awk. */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/db_dispatch.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +/* + * PUBLIC: int __db_addrem_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_addrem_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_addrem_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_addrem_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_addrem%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tindx: %lu\n", (u_long)argp->indx); + (void)printf("\tnbytes: %lu\n", (u_long)argp->nbytes); + (void)printf("\thdr: "); + for (i = 0; i < argp->hdr.size; i++) { + ch = ((u_int8_t *)argp->hdr.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tdbt: "); + for (i = 0; i < argp->dbt.size; i++) { + ch = ((u_int8_t *)argp->dbt.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_big_print __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_big_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_big_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_big_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_big%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tprev_pgno: %lu\n", (u_long)argp->prev_pgno); + (void)printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno); + (void)printf("\tdbt: "); + for (i = 0; i < argp->dbt.size; i++) { + ch = ((u_int8_t *)argp->dbt.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + (void)printf("\tprevlsn: [%lu][%lu]\n", + (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset); + (void)printf("\tnextlsn: [%lu][%lu]\n", + (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_ovref_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_ovref_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_ovref_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_ovref_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_ovref%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tadjust: %ld\n", (long)argp->adjust); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_relink_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_relink_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_relink_42_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_relink_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_relink_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\topcode: %lu\n", (u_long)argp->opcode); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tlsn: [%lu][%lu]\n", + (u_long)argp->lsn.file, (u_long)argp->lsn.offset); + (void)printf("\tprev: %lu\n", (u_long)argp->prev); + (void)printf("\tlsn_prev: [%lu][%lu]\n", + (u_long)argp->lsn_prev.file, (u_long)argp->lsn_prev.offset); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tlsn_next: [%lu][%lu]\n", + (u_long)argp->lsn_next.file, (u_long)argp->lsn_next.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_debug_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_debug_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_debug_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __db_debug_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_debug%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\top: "); + for (i = 0; i < argp->op.size; i++) { + ch = ((u_int8_t *)argp->op.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tkey: "); + for (i = 0; i < argp->key.size; i++) { + ch = ((u_int8_t *)argp->key.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\targ_flags: %lu\n", (u_long)argp->arg_flags); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_noop_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_noop_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_noop_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_noop_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_noop%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tprevlsn: [%lu][%lu]\n", + (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_alloc_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_alloc_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_alloc_42_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_alloc_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_alloc_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\tpage_lsn: [%lu][%lu]\n", + (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tptype: %lu\n", (u_long)argp->ptype); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_alloc_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_alloc_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_alloc_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_alloc_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_alloc%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\tpage_lsn: [%lu][%lu]\n", + (u_long)argp->page_lsn.file, (u_long)argp->page_lsn.offset); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tptype: %lu\n", (u_long)argp->ptype); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_free_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_free_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_free_42_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_free_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_free_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_free_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_free_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_free_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_free_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_free%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_cksum_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_cksum_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_cksum_args *argp; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_cksum%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_freedata_42_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_freedata_42_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_freedata_42_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_freedata_42_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_freedata_42%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_freedata_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_freedata_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_freedata_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_freedata_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_freedata%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? 
"_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tmeta_pgno: %lu\n", (u_long)argp->meta_pgno); + (void)printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tnext: %lu\n", (u_long)argp->next); + (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno); + (void)printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_init_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_init_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_init_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_init_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_init%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tpgno: %lu\n", (u_long)argp->pgno); + (void)printf("\theader: "); + for (i = 0; i < argp->header.size; i++) { + ch = ((u_int8_t *)argp->header.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + printf(isprint(ch) || ch == 0x0a ? "%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_sort_44_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_sort_44_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_sort_44_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_sort_44_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_sort_44%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmeta: %lu\n", (u_long)argp->meta); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tlast_free: %lu\n", (u_long)argp->last_free); + (void)printf("\tlast_lsn: [%lu][%lu]\n", + (u_long)argp->last_lsn.file, (u_long)argp->last_lsn.offset); + (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno); + (void)printf("\tlist: "); + for (i = 0; i < argp->list.size; i++) { + ch = ((u_int8_t *)argp->list.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_pg_trunc_print __P((ENV *, DBT *, DB_LSN *, + * PUBLIC: db_recops, void *)); + */ +int +__db_pg_trunc_print(env, dbtp, lsnp, notused2, notused3) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __db_pg_trunc_args *argp; + u_int32_t i; + int ch; + int ret; + + notused2 = DB_TXN_PRINT; + notused3 = NULL; + + if ((ret = + __db_pg_trunc_read(env, NULL, NULL, dbtp->data, &argp)) != 0) + return (ret); + (void)printf( + "[%lu][%lu]__db_pg_trunc%s: rec: %lu txnp %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, (u_long)lsnp->offset, + (argp->type & DB_debug_FLAG) ? "_debug" : "", + (u_long)argp->type, + (u_long)argp->txnp->txnid, + (u_long)argp->prev_lsn.file, (u_long)argp->prev_lsn.offset); + (void)printf("\tfileid: %ld\n", (long)argp->fileid); + (void)printf("\tmeta: %lu\n", (u_long)argp->meta); + (void)printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + (void)printf("\tlast_free: %lu\n", (u_long)argp->last_free); + (void)printf("\tlast_lsn: [%lu][%lu]\n", + (u_long)argp->last_lsn.file, (u_long)argp->last_lsn.offset); + (void)printf("\tnext_free: %lu\n", (u_long)argp->next_free); + (void)printf("\tlast_pgno: %lu\n", (u_long)argp->last_pgno); + (void)printf("\tlist: "); + for (i = 0; i < argp->list.size; i++) { + ch = ((u_int8_t *)argp->list.data)[i]; + printf(isprint(ch) || ch == 0x0a ? 
"%c" : "%#x ", ch); + } + (void)printf("\n"); + (void)printf("\n"); + __os_free(env, argp); + return (0); +} + +/* + * PUBLIC: int __db_init_print __P((ENV *, DB_DISTAB *)); + */ +int +__db_init_print(env, dtabp) + ENV *env; + DB_DISTAB *dtabp; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, dtabp, + __db_addrem_print, DB___db_addrem)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_big_print, DB___db_big)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_ovref_print, DB___db_ovref)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_debug_print, DB___db_debug)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_noop_print, DB___db_noop)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_alloc_print, DB___db_pg_alloc)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_free_print, DB___db_pg_free)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_cksum_print, DB___db_cksum)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_freedata_print, DB___db_pg_freedata)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_init_print, DB___db_pg_init)) != 0) + return (ret); + if ((ret = __db_add_recovery_int(env, dtabp, + __db_pg_trunc_print, DB___db_pg_trunc)) != 0) + return (ret); + return (0); +} diff --git a/db/db_cam.c b/db/db_cam.c new file mode 100644 index 0000000..4c1322d --- /dev/null +++ b/db/db_cam.c @@ -0,0 +1,3460 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2000, 2010 Oracle and/or its affiliates. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +static int __db_s_count __P((DB *)); +static int __db_wrlock_err __P((ENV *)); +static int __dbc_del_foreign __P((DBC *)); +static int __dbc_del_oldskey __P((DB *, DBC *, DBT *, DBT *, DBT *)); +static int __dbc_del_secondary __P((DBC *)); +static int __dbc_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t)); +static inline int __dbc_put_append __P((DBC *, + DBT *, DBT *, u_int32_t *, u_int32_t)); +static inline int __dbc_put_fixed_len __P((DBC *, DBT *, DBT *)); +static inline int __dbc_put_partial __P((DBC *, + DBT *, DBT *, DBT *, DBT *, u_int32_t *, u_int32_t)); +static int __dbc_put_primary __P((DBC *, DBT *, DBT *, u_int32_t)); +static inline int __dbc_put_resolve_key __P((DBC *, + DBT *, DBT *, u_int32_t *, u_int32_t)); +static inline int __dbc_put_secondaries __P((DBC *, + DBT *, DBT *, DBT *, int, DBT *, u_int32_t *)); + +#define CDB_LOCKING_INIT(env, dbc) \ + /* \ + * If we are running CDB, this had better be either a write \ + * cursor or an immediate writer. If it's a regular writer, \ + * that means we have an IWRITE lock and we need to upgrade \ + * it to a write lock. \ + */ \ + if (CDB_LOCKING(env)) { \ + if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER)) \ + return (__db_wrlock_err(env)); \ + \ + if (F_ISSET(dbc, DBC_WRITECURSOR) && \ + (ret = __lock_get(env, \ + (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \ + DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \ + return (ret); \ + } +#define CDB_LOCKING_DONE(env, dbc) \ + /* Release the upgraded lock. 
*/ \ + if (F_ISSET(dbc, DBC_WRITECURSOR)) \ + (void)__lock_downgrade( \ + env, &(dbc)->mylock, DB_LOCK_IWRITE, 0); + +#define SET_READ_LOCKING_FLAGS(dbc, var) do { \ + var = 0; \ + if (!F_ISSET(dbc, DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED)) { \ + if (LF_ISSET(DB_READ_COMMITTED)) \ + var = DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED; \ + if (LF_ISSET(DB_READ_UNCOMMITTED)) \ + var = DBC_READ_UNCOMMITTED; \ + } \ + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); \ +} while (0) + +/* + * __dbc_close -- + * DBC->close. + * + * PUBLIC: int __dbc_close __P((DBC *)); + */ +int +__dbc_close(dbc) + DBC *dbc; +{ + DB *dbp; + DBC *opd; + DBC_INTERNAL *cp; + DB_TXN *txn; + ENV *env; + int ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + cp = dbc->internal; + opd = cp->opd; + ret = 0; + + /* + * Remove the cursor(s) from the active queue. We may be closing two + * cursors at once here, a top-level one and a lower-level, off-page + * duplicate one. The access-method specific cursor close routine must + * close both of them in a single call. + * + * !!! + * Cursors must be removed from the active queue before calling the + * access specific cursor close routine, btree depends on having that + * order of operations. + */ + MUTEX_LOCK(env, dbp->mutex); + + if (opd != NULL) { + DB_ASSERT(env, F_ISSET(opd, DBC_ACTIVE)); + F_CLR(opd, DBC_ACTIVE); + TAILQ_REMOVE(&dbp->active_queue, opd, links); + } + DB_ASSERT(env, F_ISSET(dbc, DBC_ACTIVE)); + F_CLR(dbc, DBC_ACTIVE); + TAILQ_REMOVE(&dbp->active_queue, dbc, links); + + MUTEX_UNLOCK(env, dbp->mutex); + + /* Call the access specific cursor close routine. */ + if ((t_ret = + dbc->am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0) + ret = t_ret; + + /* + * Release the lock after calling the access method specific close + * routine, a Btree cursor may have had pending deletes. 
+ */ + if (CDB_LOCKING(env)) { + /* + * Also, be sure not to free anything if mylock.off is + * INVALID; in some cases, such as idup'ed read cursors + * and secondary update cursors, a cursor in a CDB + * environment may not have a lock at all. + */ + if ((t_ret = __LPUT(dbc, dbc->mylock)) != 0 && ret == 0) + ret = t_ret; + + /* For safety's sake, since this is going on the free queue. */ + memset(&dbc->mylock, 0, sizeof(dbc->mylock)); + if (opd != NULL) + memset(&opd->mylock, 0, sizeof(opd->mylock)); + } + + if ((txn = dbc->txn) != NULL) + txn->cursors--; + + /* Move the cursor(s) to the free queue. */ + MUTEX_LOCK(env, dbp->mutex); + if (opd != NULL) { + if (txn != NULL) + txn->cursors--; + TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links); + opd = NULL; + } + TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links); + MUTEX_UNLOCK(env, dbp->mutex); + + if (txn != NULL && F_ISSET(txn, TXN_PRIVATE) && txn->cursors == 0 && + (t_ret = __txn_commit(txn, 0)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __dbc_destroy -- + * Destroy the cursor, called after DBC->close. + * + * PUBLIC: int __dbc_destroy __P((DBC *)); + */ +int +__dbc_destroy(dbc) + DBC *dbc; +{ + DB *dbp; + ENV *env; + int ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + + /* Remove the cursor from the free queue. */ + MUTEX_LOCK(env, dbp->mutex); + TAILQ_REMOVE(&dbp->free_queue, dbc, links); + MUTEX_UNLOCK(env, dbp->mutex); + + /* Free up allocated memory. */ + if (dbc->my_rskey.data != NULL) + __os_free(env, dbc->my_rskey.data); + if (dbc->my_rkey.data != NULL) + __os_free(env, dbc->my_rkey.data); + if (dbc->my_rdata.data != NULL) + __os_free(env, dbc->my_rdata.data); + + /* Call the access specific cursor destroy routine. */ + ret = dbc->am_destroy == NULL ? 0 : dbc->am_destroy(dbc); + + /* + * Release the lock id for this cursor. 
+ */ + if (LOCKING_ON(env) && + F_ISSET(dbc, DBC_OWN_LID) && + (t_ret = __lock_id_free(env, dbc->lref)) != 0 && ret == 0) + ret = t_ret; + + __os_free(env, dbc); + + return (ret); +} + +/* + * __dbc_cmp -- + * Compare the position of two cursors. Return whether two cursors are + * pointing to the same key/data pair. + * + * result == 0 if both cursors refer to the same item. + * result == 1 otherwise + * + * PUBLIC: int __dbc_cmp __P((DBC *, DBC *, int *)); + */ +int +__dbc_cmp(dbc, other_dbc, result) + DBC *dbc, *other_dbc; + int *result; +{ + DBC *curr_dbc, *curr_odbc; + DBC_INTERNAL *dbc_int, *odbc_int; + ENV *env; + int ret; + + env = dbc->env; + ret = 0; + +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbc->dbp)) { + dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor; + other_dbc = ((PART_CURSOR *)other_dbc->internal)->sub_cursor; + } + /* Both cursors must still be valid. */ + if (dbc == NULL || other_dbc == NULL) { + __db_errx(env, +"Both cursors must be initialized before calling DBC->cmp."); + return (EINVAL); + } + + if (dbc->dbp != other_dbc->dbp) { + *result = 1; + return (0); + } +#endif + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbc->dbp)) + return (__bamc_compress_cmp(dbc, other_dbc, result)); +#endif + + curr_dbc = dbc; + curr_odbc = other_dbc; + dbc_int = dbc->internal; + odbc_int = other_dbc->internal; + + /* Both cursors must be on valid positions. */ + if (dbc_int->pgno == PGNO_INVALID || odbc_int->pgno == PGNO_INVALID) { + __db_errx(env, +"Both cursors must be initialized before calling DBC->cmp."); + return (EINVAL); + } + + /* + * Use a loop since cursors can be nested. Off page duplicate + * sets can only be nested one level deep, so it is safe to use a + * while (true) loop. + */ + while (1) { + if (dbc_int->pgno == odbc_int->pgno && + dbc_int->indx == odbc_int->indx) { + /* + * If one cursor is sitting on an off page duplicate + * set, the other will be pointing to the same set. Be + * careful, and check anyway. 
+ */ + if (dbc_int->opd != NULL && odbc_int->opd != NULL) { + curr_dbc = dbc_int->opd; + curr_odbc = odbc_int->opd; + dbc_int = dbc_int->opd->internal; + odbc_int= odbc_int->opd->internal; + continue; + } else if (dbc_int->opd == NULL && + odbc_int->opd == NULL) + *result = 0; + else { + __db_errx(env, + "DBCursor->cmp mismatched off page duplicate cursor pointers."); + return (EINVAL); + } + + switch (curr_dbc->dbtype) { + case DB_HASH: + /* + * Make sure that on-page duplicate data + * indexes match, and that the deleted + * flags are consistent. + */ + ret = __hamc_cmp(curr_dbc, curr_odbc, result); + break; + case DB_BTREE: + case DB_RECNO: + /* + * Check for consisted deleted flags on btree + * specific cursors. + */ + ret = __bamc_cmp(curr_dbc, curr_odbc, result); + break; + default: + /* NO-OP break out. */ + break; + } + } else + *result = 1; + return (ret); + } + /* NOTREACHED. */ + return (ret); +} + +/* + * __dbc_count -- + * Return a count of duplicate data items. + * + * PUBLIC: int __dbc_count __P((DBC *, db_recno_t *)); + */ +int +__dbc_count(dbc, recnop) + DBC *dbc; + db_recno_t *recnop; +{ + ENV *env; + int ret; + + env = dbc->env; + +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbc->dbp)) + dbc = ((PART_CURSOR *)dbc->internal)->sub_cursor; +#endif + /* + * Cursor Cleanup Note: + * All of the cursors passed to the underlying access methods by this + * routine are not duplicated and will not be cleaned up on return. + * So, pages/locks that the cursor references must be resolved by the + * underlying functions. 
+ */ + switch (dbc->dbtype) { + case DB_QUEUE: + case DB_RECNO: + *recnop = 1; + break; + case DB_HASH: + if (dbc->internal->opd == NULL) { + if ((ret = __hamc_count(dbc, recnop)) != 0) + return (ret); + break; + } + /* FALLTHROUGH */ + case DB_BTREE: +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbc->dbp)) + return (__bamc_compress_count(dbc, recnop)); +#endif + if ((ret = __bamc_count(dbc, recnop)) != 0) + return (ret); + break; + case DB_UNKNOWN: + default: + return (__db_unknown_type(env, "__dbc_count", dbc->dbtype)); + } + return (0); +} + +/* + * __dbc_del -- + * DBC->del. + * + * PUBLIC: int __dbc_del __P((DBC *, u_int32_t)); + */ +int +__dbc_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + DB *dbp; + ENV *env; + int ret; + + dbp = dbc->dbp; + env = dbp->env; + + CDB_LOCKING_INIT(env, dbc); + + /* + * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set + * (which it only is if we're being called from a primary update), + * then we need to call through to the primary and delete the item. + * + * Note that this will delete the current item; we don't need to + * delete it ourselves as well, so we can just goto done. + */ + if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) { + ret = __dbc_del_secondary(dbc); + goto done; + } + + /* + * If we are a foreign db, go through and check any foreign key + * constraints first, which will make rolling back changes on an abort + * simpler. + */ + if (LIST_FIRST(&dbp->f_primaries) != NULL && + (ret = __dbc_del_foreign(dbc)) != 0) + goto done; + + /* + * If we are a primary and have secondary indices, go through + * and delete any secondary keys that point at the current record. 
 */
	if (DB_IS_PRIMARY(dbp) &&
	    (ret = __dbc_del_primary(dbc)) != 0)
		goto done;

#ifdef HAVE_COMPRESSION
	if (DB_IS_COMPRESSED(dbp))
		ret = __bamc_compress_del(dbc, flags);
	else
#endif
		ret = __dbc_idel(dbc, flags);

done:	CDB_LOCKING_DONE(env, dbc);

	return (ret);
}

/*
 * __dbc_idel --
 *	Implementation of DBC->del.  Performs the actual delete; the
 *	secondary-index and foreign-key processing is handled by the
 *	caller, __dbc_del, above.
 *
 * PUBLIC: int __dbc_idel __P((DBC *, u_int32_t));
 */
int
__dbc_idel(dbc, flags)
	DBC *dbc;
	u_int32_t flags;
{
	DB *dbp;
	DBC *opd;
	int ret, t_ret;

	COMPQUIET(flags, 0);

	dbp = dbc->dbp;

	/*
	 * Cursor Cleanup Note:
	 * All of the cursors passed to the underlying access methods by this
	 * routine are not duplicated and will not be cleaned up on return.
	 * So, pages/locks that the cursor references must be resolved by the
	 * underlying functions.
	 */

	/*
	 * Off-page duplicate trees are locked in the primary tree, that is,
	 * we acquire a write lock in the primary tree and no locks in the
	 * off-page dup tree.  If the del operation is done in an off-page
	 * duplicate tree, call the primary cursor's upgrade routine first.
	 */
	opd = dbc->internal->opd;
	if (opd == NULL)
		ret = dbc->am_del(dbc, flags);
	else if ((ret = dbc->am_writelock(dbc)) == 0)
		ret = opd->am_del(opd, flags);

	/*
	 * If this was an update that is supporting dirty reads
	 * then we may have just swapped our read for a write lock
	 * which is held by the surviving cursor.  We need
	 * to explicitly downgrade this lock.  The closed cursor
	 * may only have had a read lock.
	 */
	if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) &&
	    dbc->internal->lock_mode == DB_LOCK_WRITE) {
		if ((t_ret =
		    __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0)
			ret = t_ret;
		/* Only mark the downgrade if the lock release succeeded. */
		if (t_ret == 0)
			dbc->internal->lock_mode = DB_LOCK_WWRITE;
		if (dbc->internal->page != NULL && (t_ret =
		    __memp_shared(dbp->mpf, dbc->internal->page)) != 0 &&
		    ret == 0)
			ret = t_ret;
	}

	return (ret);
}

#ifdef HAVE_COMPRESSION
/*
 * __dbc_bulk_del --
 *	Bulk del for a cursor.
 *
 *	Only implemented for compressed BTrees.  In this file in order to
 *	use the CDB_LOCKING_* macros.
 *
 * PUBLIC: #ifdef HAVE_COMPRESSION
 * PUBLIC: int __dbc_bulk_del __P((DBC *, DBT *, u_int32_t));
 * PUBLIC: #endif
 */
int
__dbc_bulk_del(dbc, key, flags)
	DBC *dbc;
	DBT *key;
	u_int32_t flags;
{
	ENV *env;
	int ret;

	env = dbc->env;

	DB_ASSERT(env, DB_IS_COMPRESSED(dbc->dbp));

	CDB_LOCKING_INIT(env, dbc);

	ret = __bamc_compress_bulk_del(dbc, key, flags);

	CDB_LOCKING_DONE(env, dbc);

	return (ret);
}
#endif

/*
 * __dbc_dup --
 *	Duplicate a cursor.  On success, *dbcp is the new cursor; on error
 *	any partially-created cursors are closed before returning.
 *
 * PUBLIC: int __dbc_dup __P((DBC *, DBC **, u_int32_t));
 */
int
__dbc_dup(dbc_orig, dbcp, flags)
	DBC *dbc_orig;
	DBC **dbcp;
	u_int32_t flags;
{
	DBC *dbc_n, *dbc_nopd;
	int ret;

	dbc_n = dbc_nopd = NULL;

	/* Allocate a new cursor and initialize it. */
	if ((ret = __dbc_idup(dbc_orig, &dbc_n, flags)) != 0)
		goto err;
	*dbcp = dbc_n;

	/*
	 * If the cursor references an off-page duplicate tree, allocate a
	 * new cursor for that tree and initialize it.
	 */
	if (dbc_orig->internal->opd != NULL) {
		if ((ret =
		    __dbc_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0)
			goto err;
		dbc_n->internal->opd = dbc_nopd;
		dbc_nopd->internal->pdbc = dbc_n;
	}
	return (0);

err:	if (dbc_n != NULL)
		(void)__dbc_close(dbc_n);
	if (dbc_nopd != NULL)
		(void)__dbc_close(dbc_nopd);

	return (ret);
}

/*
 * __dbc_idup --
 *	Internal version of __dbc_dup.
+ * + * PUBLIC: int __dbc_idup __P((DBC *, DBC **, u_int32_t)); + */ +int +__dbc_idup(dbc_orig, dbcp, flags) + DBC *dbc_orig, **dbcp; + u_int32_t flags; +{ + DB *dbp; + DBC *dbc_n; + DBC_INTERNAL *int_n, *int_orig; + ENV *env; + int ret; + + dbp = dbc_orig->dbp; + dbc_n = *dbcp; + env = dbp->env; + + if ((ret = __db_cursor_int(dbp, dbc_orig->thread_info, + dbc_orig->txn, dbc_orig->dbtype, dbc_orig->internal->root, + F_ISSET(dbc_orig, DBC_OPD) | DBC_DUPLICATE, + dbc_orig->locker, &dbc_n)) != 0) + return (ret); + + /* Position the cursor if requested, acquiring the necessary locks. */ + if (LF_ISSET(DB_POSITION)) { + int_n = dbc_n->internal; + int_orig = dbc_orig->internal; + + dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID; + + int_n->indx = int_orig->indx; + int_n->pgno = int_orig->pgno; + int_n->root = int_orig->root; + int_n->lock_mode = int_orig->lock_mode; + + int_n->stream_start_pgno = int_orig->stream_start_pgno; + int_n->stream_off = int_orig->stream_off; + int_n->stream_curr_pgno = int_orig->stream_curr_pgno; + + switch (dbc_orig->dbtype) { + case DB_QUEUE: + if ((ret = __qamc_dup(dbc_orig, dbc_n)) != 0) + goto err; + break; + case DB_BTREE: + case DB_RECNO: + if ((ret = __bamc_dup(dbc_orig, dbc_n, flags)) != 0) + goto err; + break; + case DB_HASH: + if ((ret = __hamc_dup(dbc_orig, dbc_n)) != 0) + goto err; + break; + case DB_UNKNOWN: + default: + ret = __db_unknown_type(env, + "__dbc_idup", dbc_orig->dbtype); + goto err; + } + } else if (F_ISSET(dbc_orig, DBC_BULK)) { + /* + * For bulk cursors, remember what page were on, even if we + * don't know that the next operation will be nearby. + */ + dbc_n->internal->pgno = dbc_orig->internal->pgno; + } + + /* Copy the locking flags to the new cursor. */ + F_SET(dbc_n, F_ISSET(dbc_orig, DBC_BULK | + DBC_READ_COMMITTED | DBC_READ_UNCOMMITTED | DBC_WRITECURSOR)); + + /* + * If we're in CDB and this isn't an offpage dup cursor, then + * we need to get a lock for the duplicated cursor. 
+ */ + if (CDB_LOCKING(env) && !F_ISSET(dbc_n, DBC_OPD) && + (ret = __lock_get(env, dbc_n->locker, 0, + &dbc_n->lock_dbt, F_ISSET(dbc_orig, DBC_WRITECURSOR) ? + DB_LOCK_IWRITE : DB_LOCK_READ, &dbc_n->mylock)) != 0) + goto err; + + dbc_n->priority = dbc_orig->priority; + dbc_n->internal->pdbc = dbc_orig->internal->pdbc; + *dbcp = dbc_n; + return (0); + +err: (void)__dbc_close(dbc_n); + return (ret); +} + +/* + * __dbc_newopd -- + * Create a new off-page duplicate cursor. + * + * PUBLIC: int __dbc_newopd __P((DBC *, db_pgno_t, DBC *, DBC **)); + */ +int +__dbc_newopd(dbc_parent, root, oldopd, dbcp) + DBC *dbc_parent; + db_pgno_t root; + DBC *oldopd; + DBC **dbcp; +{ + DB *dbp; + DBC *opd; + DBTYPE dbtype; + int ret; + + dbp = dbc_parent->dbp; + dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE; + + /* + * On failure, we want to default to returning the old off-page dup + * cursor, if any; our caller can't be left with a dangling pointer + * to a freed cursor. On error the only allowable behavior is to + * close the cursor (and the old OPD cursor it in turn points to), so + * this should be safe. + */ + *dbcp = oldopd; + + if ((ret = __db_cursor_int(dbp, dbc_parent->thread_info, + dbc_parent->txn, + dbtype, root, DBC_OPD, dbc_parent->locker, &opd)) != 0) + return (ret); + + opd->priority = dbc_parent->priority; + opd->internal->pdbc = dbc_parent; + *dbcp = opd; + + /* + * Check to see if we already have an off-page dup cursor that we've + * passed in. If we do, close it. It'd be nice to use it again + * if it's a cursor belonging to the right tree, but if we're doing + * a cursor-relative operation this might not be safe, so for now + * we'll take the easy way out and always close and reopen. + * + * Note that under no circumstances do we want to close the old + * cursor without returning a valid new one; we don't want to + * leave the main cursor in our caller with a non-NULL pointer + * to a freed off-page dup cursor. 
 */
	if (oldopd != NULL && (ret = __dbc_close(oldopd)) != 0)
		return (ret);

	return (0);
}

/*
 * __dbc_get --
 *	Get using a cursor.  Dispatches to the partition or compression
 *	variants when the cursor/database is configured for them, and
 *	otherwise falls through to the generic __dbc_iget.
 *
 * PUBLIC: int __dbc_get __P((DBC *, DBT *, DBT *, u_int32_t));
 */
int
__dbc_get(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
#ifdef HAVE_PARTITION
	if (F_ISSET(dbc, DBC_PARTITIONED))
		return (__partc_get(dbc, key, data, flags));
#endif

#ifdef HAVE_COMPRESSION
	if (DB_IS_COMPRESSED(dbc->dbp))
		return (__bamc_compress_get(dbc, key, data, flags));
#endif

	return (__dbc_iget(dbc, key, data, flags));
}

/*
 * __dbc_iget --
 *	Implementation of get using a cursor.
 *
 * PUBLIC: int __dbc_iget __P((DBC *, DBT *, DBT *, u_int32_t));
 */
int
__dbc_iget(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
	DB *dbp;
	DBC *ddbc, *dbc_n, *opd;
	DBC_INTERNAL *cp, *cp_n;
	DB_MPOOLFILE *mpf;
	ENV *env;
	db_pgno_t pgno;
	db_indx_t indx_off;
	u_int32_t multi, orig_ulen, tmp_flags, tmp_read_locking, tmp_rmw;
	u_int8_t type;
	int key_small, ret, t_ret;

	COMPQUIET(orig_ulen, 0);

	key_small = 0;

	/*
	 * Cursor Cleanup Note:
	 * All of the cursors passed to the underlying access methods by this
	 * routine are duplicated cursors.  On return, any referenced pages
	 * will be discarded, and, if the cursor is not intended to be used
	 * again, the close function will be called.  So, pages/locks that
	 * the cursor references do not need to be resolved by the underlying
	 * functions.
	 */
	dbp = dbc->dbp;
	env = dbp->env;
	mpf = dbp->mpf;
	dbc_n = NULL;
	opd = NULL;

	/* Clear OR'd in additional bits so we can check for flag equality. */
	tmp_rmw = LF_ISSET(DB_RMW);
	LF_CLR(DB_RMW);

	SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);

	multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
	LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);

	/*
	 * Return a cursor's record number.
It has nothing to do with the + * cursor get code except that it was put into the interface. + */ + if (flags == DB_GET_RECNO) { + if (tmp_rmw) + F_SET(dbc, DBC_RMW); + F_SET(dbc, tmp_read_locking); + ret = __bamc_rget(dbc, data); + if (tmp_rmw) + F_CLR(dbc, DBC_RMW); + /* Clear the temp flags, but leave WAS_READ_COMMITTED. */ + F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED); + return (ret); + } + + if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) + CDB_LOCKING_INIT(env, dbc); + + /* Don't return the key or data if it was passed to us. */ + if (!DB_RETURNS_A_KEY(dbp, flags)) + F_SET(key, DB_DBT_ISSET); + if (flags == DB_GET_BOTH && + (dbp->dup_compare == NULL || dbp->dup_compare == __bam_defcmp)) + F_SET(data, DB_DBT_ISSET); + + /* + * If we have an off-page duplicates cursor, and the operation applies + * to it, perform the operation. Duplicate the cursor and call the + * underlying function. + * + * Off-page duplicate trees are locked in the primary tree, that is, + * we acquire a write lock in the primary tree and no locks in the + * off-page dup tree. If the DB_RMW flag was specified and the get + * operation is done in an off-page duplicate tree, call the primary + * cursor's upgrade routine first. + */ + cp = dbc->internal; + if (cp->opd != NULL && + (flags == DB_CURRENT || flags == DB_GET_BOTHC || + flags == DB_NEXT || flags == DB_NEXT_DUP || + flags == DB_PREV || flags == DB_PREV_DUP)) { + if (tmp_rmw && (ret = dbc->am_writelock(dbc)) != 0) + goto err; + if (F_ISSET(dbc, DBC_TRANSIENT)) + opd = cp->opd; + else if ((ret = __dbc_idup(cp->opd, &opd, DB_POSITION)) != 0) + goto err; + + if ((ret = opd->am_get(opd, key, data, flags, NULL)) == 0) + goto done; + /* + * Another cursor may have deleted all of the off-page + * duplicates, so for operations that are moving a cursor, we + * need to skip the empty tree and retry on the parent cursor. 
+ */ + if (ret == DB_NOTFOUND && + (flags == DB_PREV || flags == DB_NEXT)) { + ret = __dbc_close(opd); + opd = NULL; + if (F_ISSET(dbc, DBC_TRANSIENT)) + cp->opd = NULL; + } + if (ret != 0) + goto err; + } else if (cp->opd != NULL && F_ISSET(dbc, DBC_TRANSIENT)) { + if ((ret = __dbc_close(cp->opd)) != 0) + goto err; + cp->opd = NULL; + } + + /* + * Perform an operation on the main cursor. Duplicate the cursor, + * upgrade the lock as required, and call the underlying function. + */ + switch (flags) { + case DB_CURRENT: + case DB_GET_BOTHC: + case DB_NEXT: + case DB_NEXT_DUP: + case DB_NEXT_NODUP: + case DB_PREV: + case DB_PREV_DUP: + case DB_PREV_NODUP: + tmp_flags = DB_POSITION; + break; + default: + tmp_flags = 0; + break; + } + + /* + * If this cursor is going to be closed immediately, we don't + * need to take precautions to clean it up on error. + */ + if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED)) + dbc_n = dbc; + else { + ret = __dbc_idup(dbc, &dbc_n, tmp_flags); + + if (ret != 0) + goto err; + COPY_RET_MEM(dbc, dbc_n); + } + + if (tmp_rmw) + F_SET(dbc_n, DBC_RMW); + F_SET(dbc_n, tmp_read_locking); + + switch (multi) { + case DB_MULTIPLE: + F_SET(dbc_n, DBC_MULTIPLE); + break; + case DB_MULTIPLE_KEY: + F_SET(dbc_n, DBC_MULTIPLE_KEY); + break; + case DB_MULTIPLE | DB_MULTIPLE_KEY: + F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY); + break; + case 0: + default: + break; + } + +retry: pgno = PGNO_INVALID; + ret = dbc_n->am_get(dbc_n, key, data, flags, &pgno); + if (tmp_rmw) + F_CLR(dbc_n, DBC_RMW); + /* + * Clear the temporary locking flags in the new cursor. The user's + * (old) cursor needs to have the WAS_READ_COMMITTED flag because this + * is used on the next call on that cursor. + */ + F_CLR(dbc_n, tmp_read_locking); + F_SET(dbc, tmp_read_locking & DBC_WAS_READ_COMMITTED); + F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY); + if (ret != 0) + goto err; + + cp_n = dbc_n->internal; + + /* + * We may be referencing a new off-page duplicates tree. 
Acquire + * a new cursor and call the underlying function. + */ + if (pgno != PGNO_INVALID) { + if ((ret = __dbc_newopd(dbc, + pgno, cp_n->opd, &cp_n->opd)) != 0) + goto err; + + switch (flags) { + case DB_FIRST: + case DB_NEXT: + case DB_NEXT_NODUP: + case DB_SET: + case DB_SET_RECNO: + case DB_SET_RANGE: + tmp_flags = DB_FIRST; + break; + case DB_LAST: + case DB_PREV: + case DB_PREV_NODUP: + tmp_flags = DB_LAST; + break; + case DB_GET_BOTH: + case DB_GET_BOTHC: + case DB_GET_BOTH_RANGE: + tmp_flags = flags; + break; + default: + ret = __db_unknown_flag(env, "__dbc_get", flags); + goto err; + } + ret = cp_n->opd->am_get(cp_n->opd, key, data, tmp_flags, NULL); + /* + * Another cursor may have deleted all of the off-page + * duplicates, so for operations that are moving a cursor, we + * need to skip the empty tree and retry on the parent cursor. + */ + if (ret == DB_NOTFOUND) { + switch (flags) { + case DB_FIRST: + case DB_NEXT: + case DB_NEXT_NODUP: + flags = DB_NEXT; + break; + case DB_LAST: + case DB_PREV: + case DB_PREV_NODUP: + flags = DB_PREV; + break; + default: + goto err; + } + + ret = __dbc_close(cp_n->opd); + cp_n->opd = NULL; + if (ret == 0) + goto retry; + } + if (ret != 0) + goto err; + } + +done: /* + * Return a key/data item. The only exception is that we don't return + * a key if the user already gave us one, that is, if the DB_SET flag + * was set. The DB_SET flag is necessary. In a Btree, the user's key + * doesn't have to be the same as the key stored the tree, depending on + * the magic performed by the comparison function. As we may not have + * done any key-oriented operation here, the page reference may not be + * valid. Fill it in as necessary. We don't have to worry about any + * locks, the cursor must already be holding appropriate locks. + * + * XXX + * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key + * either, should we? + */ + cp_n = dbc_n == NULL ? 
dbc->internal : dbc_n->internal; + if (!F_ISSET(key, DB_DBT_ISSET)) { + if (cp_n->page == NULL && (ret = __memp_fget(mpf, &cp_n->pgno, + dbc->thread_info, dbc->txn, 0, &cp_n->page)) != 0) + goto err; + + if ((ret = __db_ret(dbc, cp_n->page, cp_n->indx, key, + &dbc->rkey->data, &dbc->rkey->ulen)) != 0) { + /* + * If the key DBT is too small, we still want to return + * the size of the data. Otherwise applications are + * forced to check each one with a separate call. We + * don't want to copy the data, so we set the ulen to + * zero before calling __db_ret. + */ + if (ret == DB_BUFFER_SMALL && + F_ISSET(data, DB_DBT_USERMEM)) { + key_small = 1; + orig_ulen = data->ulen; + data->ulen = 0; + } else + goto err; + } + } + if (multi != 0 && dbc->am_bulk != NULL) { + /* + * Even if fetching from the OPD cursor we need a duplicate + * primary cursor if we are going after multiple keys. + */ + if (dbc_n == NULL) { + /* + * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor, + * so it's safe to just use dbc, unless the cursor + * has an open off-page duplicate cursor whose state + * might need to be preserved. + */ + if ((!(multi & DB_MULTIPLE_KEY) && + dbc->internal->opd == NULL) || + F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED)) + dbc_n = dbc; + else { + if ((ret = __dbc_idup(dbc, + &dbc_n, DB_POSITION)) != 0) + goto err; + if ((ret = dbc_n->am_get(dbc_n, + key, data, DB_CURRENT, &pgno)) != 0) + goto err; + } + cp_n = dbc_n->internal; + } + + /* + * If opd is set then we dupped the opd that we came in with. + * When we return we may have a new opd if we went to another + * key. + */ + if (opd != NULL) { + DB_ASSERT(env, cp_n->opd == NULL); + cp_n->opd = opd; + opd = NULL; + } + + /* + * Bulk get doesn't use __db_retcopy, so data.size won't + * get set up unless there is an error. Assume success + * here. This is the only call to am_bulk, and it avoids + * setting it exactly the same everywhere. 
If we have an + * DB_BUFFER_SMALL error, it'll get overwritten with the + * needed value. + */ + data->size = data->ulen; + ret = dbc_n->am_bulk(dbc_n, data, flags | multi); + } else if (!F_ISSET(data, DB_DBT_ISSET)) { + ddbc = opd != NULL ? opd : + cp_n->opd != NULL ? cp_n->opd : dbc_n; + cp = ddbc->internal; + if (cp->page == NULL && + (ret = __memp_fget(mpf, &cp->pgno, + dbc->thread_info, ddbc->txn, 0, &cp->page)) != 0) + goto err; + + type = TYPE(cp->page); + indx_off = ((type == P_LBTREE || + type == P_HASH || type == P_HASH_UNSORTED) ? O_INDX : 0); + ret = __db_ret(ddbc, cp->page, cp->indx + indx_off, + data, &dbc->rdata->data, &dbc->rdata->ulen); + } + +err: /* Don't pass DB_DBT_ISSET back to application level, error or no. */ + F_CLR(key, DB_DBT_ISSET); + F_CLR(data, DB_DBT_ISSET); + + /* Cleanup and cursor resolution. */ + if (opd != NULL) { + /* + * To support dirty reads we must reget the write lock + * if we have just stepped off a deleted record. + * Since the OPD cursor does not know anything + * about the referencing page or cursor we need + * to peek at the OPD cursor and get the lock here. + */ + if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) && + F_ISSET((BTREE_CURSOR *) + dbc->internal->opd->internal, C_DELETED)) + if ((t_ret = + dbc->am_writelock(dbc)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __dbc_cleanup( + dbc->internal->opd, opd, ret)) != 0 && ret == 0) + ret = t_ret; + } + + if (key_small) { + data->ulen = orig_ulen; + if (ret == 0) + ret = DB_BUFFER_SMALL; + } + + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && + (ret == 0 || ret == DB_BUFFER_SMALL)) + ret = t_ret; + + if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT) + CDB_LOCKING_DONE(env, dbc); + return (ret); +} + +/* Internal flags shared by the dbc_put functions. 
 */
#define	DBC_PUT_RMW	0x001	/* Fetch old records with DB_RMW set. */
#define	DBC_PUT_NODEL	0x002	/* No existing record will be replaced. */
#define	DBC_PUT_HAVEREC	0x004	/* The old record has been looked up. */

/*
 * __dbc_put_resolve_key --
 *	Get the current key and data so that we can correctly update the
 *	secondary and foreign databases.
 */
static inline int
__dbc_put_resolve_key(dbc, oldkey, olddata, put_statep, flags)
	DBC *dbc;
	DBT *oldkey, *olddata;
	u_int32_t flags, *put_statep;
{
	DB *dbp;
	ENV *env;
	int ret, rmw;

	dbp = dbc->dbp;
	env = dbp->env;
	rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0;

	/* Only called for DB_CURRENT puts; flags is otherwise unused. */
	DB_ASSERT(env, flags == DB_CURRENT);
	COMPQUIET(flags, 0);

	/*
	 * This is safe to do on the cursor we already have;
	 * error or no, it won't move.
	 *
	 * We use DB_RMW for all of these gets because we'll be
	 * writing soon enough in the "normal" put code.  In
	 * transactional databases we'll hold those write locks
	 * even if we close the cursor we're reading with.
	 *
	 * The DB_KEYEMPTY return needs special handling -- if the
	 * cursor is on a deleted key, we return DB_NOTFOUND.
	 */
	memset(oldkey, 0, sizeof(DBT));
	if ((ret = __dbc_get(dbc, oldkey, olddata, rmw | DB_CURRENT)) != 0)
		return (ret == DB_KEYEMPTY ? DB_NOTFOUND : ret);

	/* Record that we've looked for the old record. */
	FLD_SET(*put_statep, DBC_PUT_HAVEREC);
	return (0);
}

/*
 * __dbc_put_append --
 *	Handle an append to a primary.
 */
static inline int
__dbc_put_append(dbc, key, data, put_statep, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags, *put_statep;
{
	DB *dbp;
	ENV *env;
	DBC *dbc_n;
	DBT tdata;
	int ret, t_ret;

	dbp = dbc->dbp;
	env = dbp->env;
	ret = 0;
	dbc_n = NULL;

	/* Only called for DB_APPEND puts; flags is otherwise unused. */
	DB_ASSERT(env, flags == DB_APPEND);
	COMPQUIET(flags, 0);

	/*
	 * With DB_APPEND, we need to do the insert to populate the key value.
	 * So we swap the 'normal' order of updating secondary / verifying
	 * foreign databases and inserting.
	 *
	 * If there is an append callback, the value stored in data->data may
	 * be replaced and then freed.
To avoid passing a freed pointer back + * to the user, just operate on a copy of the data DBT. + */ + tdata = *data; + + /* + * If this cursor is going to be closed immediately, we don't + * need to take precautions to clean it up on error. + */ + if (F_ISSET(dbc, DBC_TRANSIENT)) + dbc_n = dbc; + else if ((ret = __dbc_idup(dbc, &dbc_n, 0)) != 0) + goto err; + + /* + * Append isn't a normal put operation; call the appropriate access + * method's append function. + */ + switch (dbp->type) { + case DB_QUEUE: + if ((ret = __qam_append(dbc_n, key, &tdata)) != 0) + goto err; + break; + case DB_RECNO: + if ((ret = __ram_append(dbc_n, key, &tdata)) != 0) + goto err; + break; + default: + /* The interface should prevent this. */ + DB_ASSERT(env, + dbp->type == DB_QUEUE || dbp->type == DB_RECNO); + + ret = __db_ferr(env, "DBC->put", 0); + goto err; + } + + /* + * The append callback, if one exists, may have allocated a new + * tdata.data buffer. If so, free it. + */ + FREE_IF_NEEDED(env, &tdata); + + /* + * The key value may have been generated by the above operation, but + * not set in the data buffer. Make sure it is there so that secondary + * updates can complete. + */ + if ((ret = __dbt_usercopy(env, key)) != 0) + goto err; + + /* An append cannot be replacing an existing item. */ + FLD_SET(*put_statep, DBC_PUT_NODEL); + +err: if (dbc_n != NULL && + (t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __dbc_put_partial -- + * Ensure that the data item we are using is complete and correct. + * Otherwise we could break the secondary constraints. + */ +static inline int +__dbc_put_partial(dbc, pkey, data, orig_data, out_data, put_statep, flags) + DBC *dbc; + DBT *pkey, *data, *orig_data, *out_data; + u_int32_t *put_statep, flags; +{ + DB *dbp; + DBC *pdbc; + ENV *env; + int ret, rmw, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + ret = t_ret = 0; + rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? 
DB_RMW : 0; + + if (!FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) && + !FLD_ISSET(*put_statep, DBC_PUT_NODEL)) { + /* + * We're going to have to search the tree for the + * specified key. Dup a cursor (so we have the same + * locking info) and do a c_get. + */ + if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0) + return (ret); + + /* + * When doing a put with DB_CURRENT, partial data items have + * already been resolved. + */ + DB_ASSERT(env, flags != DB_CURRENT); + + F_SET(pkey, DB_DBT_ISSET); + ret = __dbc_get(pdbc, pkey, orig_data, rmw | DB_SET); + if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { + FLD_SET(*put_statep, DBC_PUT_NODEL); + ret = 0; + } + if ((t_ret = __dbc_close(pdbc)) != 0) + ret = t_ret; + if (ret != 0) + return (ret); + + FLD_SET(*put_statep, DBC_PUT_HAVEREC); + } + + COMPQUIET(flags, 0); + + /* + * Now build the new datum from orig_data and the partial data + * we were given. It's okay to do this if no record was + * returned above: a partial put on an empty record is allowed, + * if a little strange. The data is zero-padded. + */ + return (__db_buildpartial(dbp, orig_data, data, out_data)); +} + +/* + * __dbc_put_fixed_len -- + * Handle padding for fixed-length records. + */ +static inline int +__dbc_put_fixed_len(dbc, data, out_data) + DBC *dbc; + DBT *data, *out_data; +{ + DB *dbp; + ENV *env; + int re_pad, ret; + u_int32_t re_len, size; + + dbp = dbc->dbp; + env = dbp->env; + ret = 0; + + /* + * Handle fixed-length records. If the primary database has + * fixed-length records, we need to pad out the datum before + * we pass it into the callback function; we always index the + * "real" record. 
+ */ + if (dbp->type == DB_QUEUE) { + re_len = ((QUEUE *)dbp->q_internal)->re_len; + re_pad = ((QUEUE *)dbp->q_internal)->re_pad; + } else { + re_len = ((BTREE *)dbp->bt_internal)->re_len; + re_pad = ((BTREE *)dbp->bt_internal)->re_pad; + } + + size = data->size; + if (size > re_len) { + ret = __db_rec_toobig(env, size, re_len); + return (ret); + } else if (size < re_len) { + /* + * If we're not doing a partial put, copy data->data into + * out_data->data, then pad out out_data->data. This overrides + * the assignment made above, which is used in the more common + * case when padding is not needed. + * + * If we're doing a partial put, the data we want are already + * in out_data.data; we just need to pad. + */ + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if ((ret = __os_realloc( + env, re_len, &out_data->data)) != 0) + return (ret); + /* + * In the partial case, we have built the item into + * out_data already using __db_buildpartial. Just need + * to pad from the end of out_data, not from data->size. + */ + size = out_data->size; + } else { + if ((ret = __os_malloc( + env, re_len, &out_data->data)) != 0) + return (ret); + memcpy(out_data->data, data->data, size); + } + memset((u_int8_t *)out_data->data + size, re_pad, + re_len - size); + out_data->size = re_len; + } + + return (ret); +} + +/* + * __dbc_put_secondaries -- + * Insert the secondary keys, and validate the foreign key constraints. + */ +static inline int +__dbc_put_secondaries(dbc, + pkey, data, orig_data, s_count, s_keys_buf, put_statep) + DBC *dbc; + DBT *pkey, *data, *orig_data, *s_keys_buf; + int s_count; + u_int32_t *put_statep; +{ + DB *dbp, *sdbp; + DBC *fdbc, *sdbc; + DBT fdata, oldpkey, *skeyp, temppkey, tempskey, *tskeyp; + ENV *env; + int cmp, ret, rmw, t_ret; + u_int32_t nskey; + + dbp = dbc->dbp; + env = dbp->env; + fdbc = sdbc = NULL; + sdbp = NULL; + ret = t_ret = 0; + rmw = FLD_ISSET(*put_statep, DBC_PUT_RMW) ? DB_RMW : 0; + + /* + * Loop through the secondaries. (Step 3.) 
+ * + * Note that __db_s_first and __db_s_next will take care of + * thread-locking and refcounting issues. + */ + for (ret = __db_s_first(dbp, &sdbp), skeyp = s_keys_buf; + sdbp != NULL && ret == 0; + ret = __db_s_next(&sdbp, dbc->txn), ++skeyp) { + DB_ASSERT(env, skeyp - s_keys_buf < s_count); + /* + * Don't process this secondary if the key is immutable and we + * know that the old record exists. This optimization can't be + * used if we have not checked for the old record yet. + */ + if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC) && + !FLD_ISSET(*put_statep, DBC_PUT_NODEL) && + FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY)) + continue; + + /* + * Call the callback for this secondary, to get the + * appropriate secondary key. + */ + if ((ret = sdbp->s_callback(sdbp, + pkey, data, skeyp)) != 0) { + /* Not indexing is equivalent to an empty key set. */ + if (ret == DB_DONOTINDEX) { + F_SET(skeyp, DB_DBT_MULTIPLE); + skeyp->size = 0; + ret = 0; + } else + goto err; + } + + if (sdbp->s_foreign != NULL && + (ret = __db_cursor_int(sdbp->s_foreign, + dbc->thread_info, dbc->txn, sdbp->s_foreign->type, + PGNO_INVALID, 0, dbc->locker, &fdbc)) != 0) + goto err; + + /* + * Mark the secondary key DBT(s) as set -- that is, the + * callback returned at least one secondary key. + * + * Also, if this secondary index is associated with a foreign + * database, check that the foreign db contains the key(s) to + * maintain referential integrity. Set flags in fdata to avoid + * mem copying, we just need to know existence. We need to do + * this check before setting DB_DBT_ISSET, otherwise __dbc_get + * will overwrite the flag values. 
+ */ + if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) { +#ifdef DIAGNOSTIC + __db_check_skeyset(sdbp, skeyp); +#endif + for (tskeyp = (DBT *)skeyp->data, nskey = skeyp->size; + nskey > 0; nskey--, tskeyp++) { + if (fdbc != NULL) { + memset(&fdata, 0, sizeof(DBT)); + F_SET(&fdata, + DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = __dbc_get( + fdbc, tskeyp, &fdata, + DB_SET | rmw)) == DB_NOTFOUND || + ret == DB_KEYEMPTY) { + ret = DB_FOREIGN_CONFLICT; + break; + } + } + F_SET(tskeyp, DB_DBT_ISSET); + } + tskeyp = (DBT *)skeyp->data; + nskey = skeyp->size; + } else { + if (fdbc != NULL) { + memset(&fdata, 0, sizeof(DBT)); + F_SET(&fdata, DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = __dbc_get(fdbc, skeyp, &fdata, + DB_SET | rmw)) == DB_NOTFOUND || + ret == DB_KEYEMPTY) + ret = DB_FOREIGN_CONFLICT; + } + F_SET(skeyp, DB_DBT_ISSET); + tskeyp = skeyp; + nskey = 1; + } + if (fdbc != NULL && (t_ret = __dbc_close(fdbc)) != 0 && + ret == 0) + ret = t_ret; + fdbc = NULL; + if (ret != 0) + goto err; + + /* + * If we have the old record, we can generate and remove any + * old secondary key(s) now. We can also skip the secondary + * put if there is no change. + */ + if (FLD_ISSET(*put_statep, DBC_PUT_HAVEREC)) { + if ((ret = __dbc_del_oldskey(sdbp, dbc, + skeyp, pkey, orig_data)) == DB_KEYEXIST) + continue; + else if (ret != 0) + goto err; + } + if (nskey == 0) + continue; + + /* + * Open a cursor in this secondary. + * + * Use the same locker ID as our primary cursor, so that + * we're guaranteed that the locks don't conflict (e.g. in CDB + * or if we're subdatabases that share and want to lock a + * metadata page). + */ + if ((ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn, + sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) + goto err; + + /* + * If we're in CDB, updates will fail since the new cursor + * isn't a writer. 
However, we hold the WRITE lock in the + * primary and will for as long as our new cursor lasts, + * and the primary and secondary share a lock file ID, + * so it's safe to consider this a WRITER. The close + * routine won't try to put anything because we don't + * really have a lock. + */ + if (CDB_LOCKING(env)) { + DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * Swap the primary key to the byte order of this secondary, if + * necessary. By doing this now, we can compare directly + * against the data already in the secondary without having to + * swap it after reading. + */ + SWAP_IF_NEEDED(sdbp, pkey); + + for (; nskey > 0 && ret == 0; nskey--, tskeyp++) { + /* Skip this key if it is already in the database. */ + if (!F_ISSET(tskeyp, DB_DBT_ISSET)) + continue; + + /* + * There are three cases here-- + * 1) The secondary supports sorted duplicates. + * If we attempt to put a secondary/primary pair + * that already exists, that's a duplicate + * duplicate, and c_put will return DB_KEYEXIST + * (see __db_duperr). This will leave us with + * exactly one copy of the secondary/primary pair, + * and this is just right--we'll avoid deleting it + * later, as the old and new secondaries will + * match (since the old secondary is the dup dup + * that's already there). + * 2) The secondary supports duplicates, but they're not + * sorted. We need to avoid putting a duplicate + * duplicate, because the matching old and new + * secondaries will prevent us from deleting + * anything and we'll wind up with two secondary + * records that point to the same primary key. Do + * a c_get(DB_GET_BOTH); only do the put if the + * secondary doesn't exist. + * 3) The secondary doesn't support duplicates at all. + * In this case, secondary keys must be unique; + * if another primary key already exists for this + * secondary key, we have to either overwrite it + * or not put this one, and in either case we've + * corrupted the secondary index. 
Do a + * c_get(DB_SET). If the secondary/primary pair + * already exists, do nothing; if the secondary + * exists with a different primary, return an + * error; and if the secondary does not exist, + * put it. + */ + if (!F_ISSET(sdbp, DB_AM_DUP)) { + /* Case 3. */ + memset(&oldpkey, 0, sizeof(DBT)); + F_SET(&oldpkey, DB_DBT_MALLOC); + ret = __dbc_get(sdbc, + tskeyp, &oldpkey, rmw | DB_SET); + if (ret == 0) { + cmp = __bam_defcmp(sdbp, + &oldpkey, pkey); + __os_ufree(env, oldpkey.data); + /* + * If the secondary key is unchanged, + * skip the put and go on to the next + * one. + */ + if (cmp == 0) + continue; + + __db_errx(env, "%s%s", + "Put results in a non-unique secondary key in an ", + "index not configured to support duplicates"); + ret = EINVAL; + } + if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) + break; + } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) { + /* Case 2. */ + DB_INIT_DBT(tempskey, + tskeyp->data, tskeyp->size); + DB_INIT_DBT(temppkey, + pkey->data, pkey->size); + ret = __dbc_get(sdbc, &tempskey, &temppkey, + rmw | DB_GET_BOTH); + if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) + break; + } + + ret = __dbc_put(sdbc, tskeyp, pkey, + DB_UPDATE_SECONDARY); + + /* + * We don't know yet whether this was a put-overwrite + * that in fact changed nothing. If it was, we may get + * DB_KEYEXIST. This is not an error. + */ + if (ret == DB_KEYEXIST) + ret = 0; + } + + /* Make sure the primary key is back in native byte-order. */ + SWAP_IF_NEEDED(sdbp, pkey); + + if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + + if (ret != 0) + goto err; + + /* + * Mark that we have a key for this secondary so we can check + * it later before deleting the old one. We can't set it + * earlier or it would be cleared in the calls above. 
/*
 * __dbc_put_primary --
 *	Cursor put on a primary database that has associated secondary
 *	indices: update every secondary so it stays consistent with the
 *	new primary record.
 *
 *	The overall protocol (see the numbered steps below):
 *	1. Resolve the primary key (DB_CURRENT reads it from the cursor,
 *	   DB_APPEND inserts first so the generated key is known).
 *	2. For partial or fixed-length puts, build the full new datum.
 *	3. Insert the new secondary key(s) into each secondary.
 *	4. If not already fetched, read the old primary record.
 *	5. Delete the now-stale old secondary key(s).
 *
 *	Returns 0 on success or a Berkeley DB error code; DB_KEYEXIST is
 *	returned for DB_NOOVERWRITE when the key is already present.
 */
static int
__dbc_put_primary(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
	DB *dbp, *sdbp;
	DBC *dbc_n, *pdbc;
	DBT oldkey, olddata, newdata;
	DBT *all_skeys, *skeyp, *tskeyp;
	ENV *env;
	int ret, t_ret, s_count;
	u_int32_t nskey, put_state, rmw;

	dbp = dbc->dbp;
	env = dbp->env;
	ret = t_ret = s_count = 0;
	put_state = 0;
	sdbp = NULL;
	pdbc = dbc_n = NULL;
	all_skeys = NULL;
	memset(&newdata, 0, sizeof(DBT));
	memset(&olddata, 0, sizeof(DBT));

	/*
	 * We do multiple cursor operations and later re-use the old datum;
	 * DB_DBT_MALLOC gives us a private copy that cannot be clobbered
	 * between uses.
	 */
	F_SET(&olddata, DB_DBT_MALLOC);

	/*
	 * Locking order note: secondary gets acquire a read lock in the
	 * secondary, then in the primary.  To avoid deadlock we try to
	 * update secondaries before touching the primary; the gyrations
	 * in steps 1-5 below exist to preserve that ordering while still
	 * finding any old record we may be overwriting.
	 */

	/* One DBT per associated secondary, to collect the new skeys. */
	s_count = __db_s_count(dbp);
	if ((ret = __os_calloc(env,
	    (u_int)s_count, sizeof(DBT), &all_skeys)) != 0)
		goto err;

	/*
	 * Primaries can't have duplicates, so only these flags make sense;
	 * anything else should have been rejected by the checking routine.
	 */
	DB_ASSERT(env, flags == DB_APPEND || flags == DB_CURRENT ||
	    flags == DB_KEYFIRST || flags == DB_KEYLAST ||
	    flags == DB_NOOVERWRITE || flags == DB_OVERWRITE_DUP);

	/* DB_RMW is only legal when locking is on. */
	rmw = STD_LOCKING(dbc) ? DB_RMW : 0;
	if (rmw)
		FLD_SET(put_state, DBC_PUT_RMW);

	/* Resolve the primary key if required (Step 1). */
	if (flags == DB_CURRENT) {
		if ((ret = __dbc_put_resolve_key(dbc,
		    &oldkey, &olddata, &put_state, flags)) != 0)
			goto err;
		key = &oldkey;
	} else if (flags == DB_APPEND) {
		if ((ret = __dbc_put_append(dbc,
		    key, data, &put_state, flags)) != 0)
			goto err;
	}

	/*
	 * DB_NOOVERWRITE must be checked before any secondary is changed.
	 * Probe for the key without retrieving the datum (zero-length
	 * partial get).
	 */
	if (flags == DB_NOOVERWRITE) {
		F_SET(key, DB_DBT_ISSET);
		olddata.dlen = 0;
		olddata.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
		if (__dbc_get(dbc, key, &olddata, DB_SET) != DB_NOTFOUND) {
			ret = DB_KEYEXIST;
			goto done;
		}
	}

	/* Build the full new datum for DB_DBT_PARTIAL puts (Step 2). */
	if (F_ISSET(data, DB_DBT_PARTIAL)) {
		if ((ret = __dbc_put_partial(dbc,
		    key, data, &olddata, &newdata, &put_state, flags)) != 0)
			goto err;
	} else {
		newdata = *data;
	}

	/* Pad the datum for fixed-length record databases (Step 2). */
	if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) ||
	    (dbp->type == DB_QUEUE)) {
		if ((ret = __dbc_put_fixed_len(dbc, data, &newdata)) != 0)
			goto err;
	}

	/* Validate foreign databases and update secondaries (Step 3). */
	if ((ret = __dbc_put_secondaries(dbc, key, &newdata,
	    &olddata, s_count, all_skeys, &put_state))
	    != 0)
		goto err;
	/*
	 * If we already have the old primary record, the old secondary
	 * keys were deleted during Step 3; we're finished with secondaries.
	 */
	if (FLD_ISSET(put_state, DBC_PUT_HAVEREC))
		goto done;

	/* Fetch the old primary record, if any (Step 4). */
	if ((ret = __dbc_idup(dbc, &pdbc, 0)) != 0)
		goto err;
	DB_ASSERT(env, flags != DB_CURRENT);
	F_SET(key, DB_DBT_ISSET);
	ret = __dbc_get(pdbc, key, &olddata, rmw | DB_SET);
	if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) {
		/* No old record: nothing to delete from the secondaries. */
		FLD_SET(put_state, DBC_PUT_NODEL);
		ret = 0;
	}
	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
		ret = t_ret;
	if (ret != 0)
		goto err;

	if (FLD_ISSET(put_state, DBC_PUT_NODEL))
		goto done;

	/* Delete the stale secondary keys of the old record (Step 5). */
	for (ret = __db_s_first(dbp, &sdbp), skeyp = all_skeys;
	    sdbp != NULL && ret == 0;
	    ret = __db_s_next(&sdbp, dbc->txn), skeyp++) {
		DB_ASSERT(env, skeyp - all_skeys < s_count);
		/*
		 * Immutable-key secondaries can never go stale; the old
		 * record is known to exist, so skipping is always safe.
		 */
		if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY))
			continue;

		/* DB_KEYEXIST here means "old key == new key": keep it. */
		if ((ret = __dbc_del_oldskey(sdbp, dbc,
		    skeyp, key, &olddata)) != 0 && ret != DB_KEYEXIST)
			goto err;
	}
	if (ret != 0)
		goto err;

done:
err:
	if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
		ret = t_ret;

	/* If newdata or olddata were used, free their buffers. */
	if (newdata.data != NULL && newdata.data != data->data)
		__os_free(env, newdata.data);
	if (olddata.data != NULL)
		__os_ufree(env, olddata.data);

	CDB_LOCKING_DONE(env, dbc);

	if (sdbp != NULL &&
	    (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0)
		ret = t_ret;

	/* Free every collected secondary key, including multiple sets. */
	for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) {
		if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) {
			for (nskey = skeyp->size, tskeyp = (DBT *)skeyp->data;
			    nskey > 0;
			    nskey--, tskeyp++)
				FREE_IF_NEEDED(env, tskeyp);
		}
		FREE_IF_NEEDED(env, skeyp);
	}
	if (all_skeys != NULL)
		__os_free(env, all_skeys);
	return (ret);
}
+ */ + if (FLD_ISSET(sdbp->s_assoc_flags, DB_ASSOC_IMMUTABLE_KEY)) + continue; + + if ((ret = __dbc_del_oldskey(sdbp, dbc, + skeyp, key, &olddata)) != 0 && ret != DB_KEYEXIST) + goto err; + } + if (ret != 0) + goto err; + +done: +err: + if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0) + ret = t_ret; + + /* If newdata or olddata were used, free their buffers. */ + if (newdata.data != NULL && newdata.data != data->data) + __os_free(env, newdata.data); + if (olddata.data != NULL) + __os_ufree(env, olddata.data); + + CDB_LOCKING_DONE(env, dbc); + + if (sdbp != NULL && + (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0) + ret = t_ret; + + for (skeyp = all_skeys; skeyp - all_skeys < s_count; skeyp++) { + if (F_ISSET(skeyp, DB_DBT_MULTIPLE)) { + for (nskey = skeyp->size, tskeyp = (DBT *)skeyp->data; + nskey > 0; + nskey--, tskeyp++) + FREE_IF_NEEDED(env, tskeyp); + } + FREE_IF_NEEDED(env, skeyp); + } + if (all_skeys != NULL) + __os_free(env, all_skeys); + return (ret); +} + +/* + * __dbc_put -- + * Put using a cursor. + * + * PUBLIC: int __dbc_put __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_put(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + int ret; + + dbp = dbc->dbp; + ret = 0; + + /* + * Putting to secondary indices is forbidden; when we need to + * internally update one, we're called with a private flag, + * DB_UPDATE_SECONDARY, which does the right thing but won't return an + * error during flag checking. + * + * As a convenience, many places that want the default DB_KEYLAST + * behavior call DBC->put with flags == 0. Protect lower-level code + * here by translating that. + * + * Lastly, the DB_OVERWRITE_DUP flag is equivalent to DB_KEYLAST unless + * there are sorted duplicates. Limit the number of places that need + * to test for it explicitly. 
+ */ + if (flags == DB_UPDATE_SECONDARY || flags == 0 || + (flags == DB_OVERWRITE_DUP && !F_ISSET(dbp, DB_AM_DUPSORT))) + flags = DB_KEYLAST; + + CDB_LOCKING_INIT(dbc->env, dbc); + + /* + * Check to see if we are a primary and have secondary indices. + * If we are not, we save ourselves a good bit of trouble and + * just skip to the "normal" put. + */ + if (DB_IS_PRIMARY(dbp) && + ((ret = __dbc_put_primary(dbc, key, data, flags)) != 0)) + return (ret); + + /* + * If this is an append operation, the insert was done prior to the + * secondary updates, so we are finished. + */ + if (flags == DB_APPEND) + return (ret); + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) + return (__bamc_compress_put(dbc, key, data, flags)); +#endif + + return (__dbc_iput(dbc, key, data, flags)); +} + +/* + * __dbc_iput -- + * Implementation of put using a cursor. + * + * PUBLIC: int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_iput(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc_n, *oldopd, *opd; + db_pgno_t pgno; + int ret, t_ret; + u_int32_t tmp_flags; + + /* + * Cursor Cleanup Note: + * All of the cursors passed to the underlying access methods by this + * routine are duplicated cursors. On return, any referenced pages + * will be discarded, and, if the cursor is not intended to be used + * again, the close function will be called. So, pages/locks that + * the cursor references do not need to be resolved by the underlying + * functions. + */ + dbc_n = NULL; + ret = t_ret = 0; + + /* + * If we have an off-page duplicates cursor, and the operation applies + * to it, perform the operation. Duplicate the cursor and call the + * underlying function. + * + * Off-page duplicate trees are locked in the primary tree, that is, + * we acquire a write lock in the primary tree and no locks in the + * off-page dup tree. If the put operation is done in an off-page + * duplicate tree, call the primary cursor's upgrade routine first. 
/*
 * __dbc_iput --
 *	Implementation of put using a cursor.
 *
 *	All work is done on duplicated cursors so that pages/locks held by
 *	the application's cursor need not be resolved by the access-method
 *	code; __dbc_cleanup swaps the duplicate back in on success.
 *
 * PUBLIC: int __dbc_iput __P((DBC *, DBT *, DBT *, u_int32_t));
 */
int
__dbc_iput(dbc, key, data, flags)
	DBC *dbc;
	DBT *key, *data;
	u_int32_t flags;
{
	DBC *dbc_n, *oldopd, *opd;
	db_pgno_t pgno;
	int ret, t_ret;
	u_int32_t tmp_flags;

	dbc_n = NULL;
	ret = t_ret = 0;

	/*
	 * If the cursor has an off-page duplicates (opd) cursor and the
	 * operation is positional, the put happens in the off-page dup
	 * tree.  Off-page dup trees are locked in the primary tree, so
	 * upgrade the primary cursor's lock to a write lock first, then
	 * duplicate and put through the opd cursor.
	 */
	if (dbc->internal->opd != NULL &&
	    (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) {
		/*
		 * Hash does not support puts relative to an already-deleted
		 * item; apply the same rule to its off-page duplicates.
		 */
		if (dbc->dbtype == DB_HASH && F_ISSET(
		    ((BTREE_CURSOR *)(dbc->internal->opd->internal)),
		    C_DELETED)) {
			ret = DB_NOTFOUND;
			goto err;
		}

		if ((ret = dbc->am_writelock(dbc)) != 0 ||
		    (ret = __dbc_dup(dbc, &dbc_n, DB_POSITION)) != 0)
			goto err;
		opd = dbc_n->internal->opd;
		if ((ret = opd->am_put(
		    opd, key, data, flags, NULL)) != 0)
			goto err;
		goto done;
	}

	/* Positional operations need the duplicate to keep its position. */
	if (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)
		tmp_flags = DB_POSITION;
	else
		tmp_flags = 0;

	/*
	 * A transient or partitioned cursor is closed right after this
	 * call, so operate on it directly instead of duplicating.
	 */
	if (F_ISSET(dbc, DBC_TRANSIENT | DBC_PARTITIONED))
		dbc_n = dbc;
	else if ((ret = __dbc_idup(dbc, &dbc_n, tmp_flags)) != 0)
		goto err;

	pgno = PGNO_INVALID;
	if ((ret = dbc_n->am_put(dbc_n, key, data, flags, &pgno)) != 0)
		goto err;

	/*
	 * A returned page number means the put created (or moved us to) a
	 * new off-page duplicates tree; open an opd cursor there and put
	 * the datum through it.
	 */
	if (pgno != PGNO_INVALID) {
		oldopd = dbc_n->internal->opd;
		if ((ret = __dbc_newopd(dbc, pgno, oldopd, &opd)) != 0) {
			dbc_n->internal->opd = opd;
			goto err;
		}

		dbc_n->internal->opd = opd;
		opd->internal->pdbc = dbc_n;

		/* NOOVERWRITE was satisfied at the main-tree level. */
		if (flags == DB_NOOVERWRITE)
			flags = DB_KEYLAST;
		if ((ret = opd->am_put(
		    opd, key, data, flags, NULL)) != 0)
			goto err;
	}

done:
err:	/* Cleanup and cursor resolution. */
	if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
+ */ + for (i = 0, tskeyp = skey; i < nskey; i++, tskeyp++) + if (((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, + toldskeyp, tskeyp) == 0) { + nsame++; + F_CLR(tskeyp, DB_DBT_ISSET); + break; + } + + if (i < nskey) { + FREE_IF_NEEDED(env, toldskeyp); + continue; + } + + if (sdbc == NULL) { + if ((ret = __db_cursor_int(sdbp, + dbc->thread_info, dbc->txn, sdbp->type, + PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) + goto err; + if (CDB_LOCKING(env)) { + DB_ASSERT(env, + sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + } + + /* + * Don't let c_get(DB_GET_BOTH) stomp on our data. Use + * temporary DBTs instead. + */ + SWAP_IF_NEEDED(sdbp, pkey); + DB_INIT_DBT(temppkey, pkey->data, pkey->size); + DB_INIT_DBT(tempskey, toldskeyp->data, toldskeyp->size); + if ((ret = __dbc_get(sdbc, + &tempskey, &temppkey, rmw | DB_GET_BOTH)) == 0) + ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY); + else if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(dbp); + SWAP_IF_NEEDED(sdbp, pkey); + FREE_IF_NEEDED(env, toldskeyp); + } + +err: for (; noldskey > 0; noldskey--, toldskeyp++) + FREE_IF_NEEDED(env, toldskeyp); + FREE_IF_NEEDED(env, &oldskey); + if (sdbc != NULL && (t_ret = __dbc_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + if (ret == 0 && nsame == nskey) + return (DB_KEYEXIST); + return (ret); +} + +/* + * __db_duperr() + * Error message: we don't currently support sorted duplicate duplicates. + * PUBLIC: int __db_duperr __P((DB *, u_int32_t)); + */ +int +__db_duperr(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + /* + * If we run into this error while updating a secondary index, + * don't yell--there's no clean way to pass DB_NODUPDATA in along + * with DB_UPDATE_SECONDARY, but we may run into this problem + * in a normal, non-error course of events. + * + * !!! 
+ * If and when we ever permit duplicate duplicates in sorted-dup + * databases, we need to either change the secondary index code + * to check for dup dups, or we need to maintain the implicit + * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set. + */ + if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY)) + __db_errx(dbp->env, + "Duplicate data items are not supported with sorted data"); + return (DB_KEYEXIST); +} + +/* + * __dbc_cleanup -- + * Clean up duplicate cursors. + * + * PUBLIC: int __dbc_cleanup __P((DBC *, DBC *, int)); + */ +int +__dbc_cleanup(dbc, dbc_n, failed) + DBC *dbc, *dbc_n; + int failed; +{ + DB *dbp; + DBC *opd; + DBC_INTERNAL *internal; + DB_MPOOLFILE *mpf; + int ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + internal = dbc->internal; + ret = 0; + + /* Discard any pages we're holding. */ + if (internal->page != NULL) { + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + internal->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + internal->page = NULL; + } + opd = internal->opd; + if (opd != NULL && opd->internal->page != NULL) { + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + opd->internal->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + opd->internal->page = NULL; + } + + /* + * If dbc_n is NULL, there's no internal cursor swapping to be done + * and no dbc_n to close--we probably did the entire operation on an + * offpage duplicate cursor. Just return. + * + * If dbc and dbc_n are the same, we're either inside a DB->{put/get} + * operation, and as an optimization we performed the operation on + * the main cursor rather than on a duplicated one, or we're in a + * bulk get that can't have moved the cursor (DB_MULTIPLE with the + * initial c_get operation on an off-page dup cursor). Just + * return--either we know we didn't move the cursor, or we're going + * to close it before we return to application code, so we're sure + * not to visibly violate the "cursor stays put on error" rule. 
+ */ + if (dbc_n == NULL || dbc == dbc_n) + return (ret); + + if (dbc_n->internal->page != NULL) { + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + dbc_n->internal->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + dbc_n->internal->page = NULL; + } + opd = dbc_n->internal->opd; + if (opd != NULL && opd->internal->page != NULL) { + if ((t_ret = __memp_fput(mpf, dbc->thread_info, + opd->internal->page, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + opd->internal->page = NULL; + } + + /* + * If we didn't fail before entering this routine or just now when + * freeing pages, swap the interesting contents of the old and new + * cursors. + */ + if (!failed && ret == 0) { + if (opd != NULL) + opd->internal->pdbc = dbc; + if (internal->opd != NULL) + internal->opd->internal->pdbc = dbc_n; + dbc->internal = dbc_n->internal; + dbc_n->internal = internal; + } + + /* + * Close the cursor we don't care about anymore. The close can fail, + * but we only expect DB_LOCK_DEADLOCK failures. This violates our + * "the cursor is unchanged on error" semantics, but since all you can + * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe + * that's OK. + * + * XXX + * There's no way to recover from failure to close the old cursor. + * All we can do is move to the new position and return an error. + * + * XXX + * We might want to consider adding a flag to the cursor, so that any + * subsequent operations other than close just return an error? + */ + if ((t_ret = __dbc_close(dbc_n)) != 0 && ret == 0) + ret = t_ret; + + /* + * If this was an update that is supporting dirty reads + * then we may have just swapped our read for a write lock + * which is held by the surviving cursor. We need + * to explicitly downgrade this lock. The closed cursor + * may only have had a read lock. 
+ */ + if (F_ISSET(dbp, DB_AM_READ_UNCOMMITTED) && + dbc->internal->lock_mode == DB_LOCK_WRITE) { + if ((t_ret = + __TLPUT(dbc, dbc->internal->lock)) != 0 && ret == 0) + ret = t_ret; + if (t_ret == 0) + dbc->internal->lock_mode = DB_LOCK_WWRITE; + if (dbc->internal->page != NULL && (t_ret = + __memp_shared(dbp->mpf, dbc->internal->page)) != 0 && + ret == 0) + ret = t_ret; + } + + return (ret); +} + +/* + * __dbc_secondary_get_pp -- + * This wrapper function for DBC->pget() is the DBC->get() function + * for a secondary index cursor. + * + * PUBLIC: int __dbc_secondary_get_pp __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_secondary_get_pp(dbc, skey, data, flags) + DBC *dbc; + DBT *skey, *data; + u_int32_t flags; +{ + DB_ASSERT(dbc->env, F_ISSET(dbc->dbp, DB_AM_SECONDARY)); + return (__dbc_pget_pp(dbc, skey, NULL, data, flags)); +} + +/* + * __dbc_pget -- + * Get a primary key/data pair through a secondary index. + * + * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_pget(dbc, skey, pkey, data, flags) + DBC *dbc; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DB *pdbp, *sdbp; + DBC *dbc_n, *pdbc; + DBT nullpkey; + u_int32_t save_pkey_flags, tmp_flags, tmp_read_locking, tmp_rmw; + int pkeymalloc, ret, t_ret; + + sdbp = dbc->dbp; + pdbp = sdbp->s_primary; + dbc_n = NULL; + pkeymalloc = t_ret = 0; + + /* + * The challenging part of this function is getting the behavior + * right for all the various permutations of DBT flags. The + * next several blocks handle the various cases we need to + * deal with specially. + */ + + /* + * We may be called with a NULL pkey argument, if we've been + * wrapped by a 2-DBT get call. If so, we need to use our + * own DBT. + */ + if (pkey == NULL) { + memset(&nullpkey, 0, sizeof(DBT)); + pkey = &nullpkey; + } + + /* Clear OR'd in additional bits so we can check for flag equality. 
/*
 * __dbc_pget --
 *	Get a primary key/data pair through a secondary index.
 *
 *	Step 1 reads the primary key out of the secondary through a
 *	duplicated cursor; step 2 looks that key up in the primary with a
 *	cursor sharing our locker ID.  The rskey/rkey/rdata swap dance
 *	ensures DB-managed return memory always belongs to the secondary
 *	cursor, which outlives the transient primary cursor.
 *
 * PUBLIC: int __dbc_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t));
 */
int
__dbc_pget(dbc, skey, pkey, data, flags)
	DBC *dbc;
	DBT *skey, *pkey, *data;
	u_int32_t flags;
{
	DB *pdbp, *sdbp;
	DBC *dbc_n, *pdbc;
	DBT nullpkey;
	u_int32_t save_pkey_flags, tmp_flags, tmp_read_locking, tmp_rmw;
	int pkeymalloc, ret, t_ret;

	sdbp = dbc->dbp;
	pdbp = sdbp->s_primary;
	dbc_n = NULL;
	pkeymalloc = t_ret = 0;

	/*
	 * A NULL pkey argument means we were wrapped by a 2-DBT get call;
	 * substitute a local throwaway DBT.
	 */
	if (pkey == NULL) {
		memset(&nullpkey, 0, sizeof(DBT));
		pkey = &nullpkey;
	}

	/* Clear OR'd in additional bits so we can check for flag equality. */
	tmp_rmw = LF_ISSET(DB_RMW);
	LF_CLR(DB_RMW);

	SET_READ_LOCKING_FLAGS(dbc, tmp_read_locking);
	/*
	 * DB_GET_RECNO wants the primary's record number, not the
	 * key/data pair; hand off to the specialized routine with the
	 * temporary flags applied and then removed again.
	 */
	if (flags == DB_GET_RECNO) {
		if (tmp_rmw)
			F_SET(dbc, DBC_RMW);
		F_SET(dbc, tmp_read_locking);
		ret = __dbc_pget_recno(dbc, pkey, data, flags);
		if (tmp_rmw)
			F_CLR(dbc, DBC_RMW);
		/* Clear the temp flags, but leave WAS_READ_COMMITTED. */
		F_CLR(dbc, tmp_read_locking & ~DBC_WAS_READ_COMMITTED);
		return (ret);
	}

	/*
	 * A partial get of the primary key would never find the primary
	 * record; suppress DB_DBT_PARTIAL on pkey for the duration of the
	 * secondary lookup and restore the flags afterward.
	 */
	save_pkey_flags = pkey->flags;
	F_CLR(pkey, DB_DBT_PARTIAL);

	/* Positional operations need the duplicate to keep its position. */
	switch (flags) {
	case DB_CURRENT:
	case DB_GET_BOTHC:
	case DB_NEXT:
	case DB_NEXT_DUP:
	case DB_NEXT_NODUP:
	case DB_PREV:
	case DB_PREV_DUP:
	case DB_PREV_NODUP:
		tmp_flags = DB_POSITION;
		break;
	default:
		tmp_flags = 0;
		break;
	}

	if (F_ISSET(dbc, DBC_PARTITIONED | DBC_TRANSIENT))
		dbc_n = dbc;
	else if ((ret = __dbc_dup(dbc, &dbc_n, tmp_flags)) != 0)
		return (ret);

	F_SET(dbc_n, DBC_TRANSIENT);

	if (tmp_rmw)
		F_SET(dbc_n, DBC_RMW);
	F_SET(dbc_n, tmp_read_locking);

	/*
	 * A caller-supplied primary key arrives in native byte order;
	 * swap it before comparing against the secondary's contents.
	 */
	if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
	    flags == DB_GET_BOTH_RANGE)
		SWAP_IF_NEEDED(sdbp, pkey);

retry:	/* Step 1: read the primary key out of the secondary. */
	dbc_n->rdata = dbc->rkey;
	dbc_n->rkey = dbc->rskey;
	ret = __dbc_get(dbc_n, skey, pkey, flags);
	/* Restore pkey's flags in case we stomped the PARTIAL flag. */
	pkey->flags = save_pkey_flags;

	/*
	 * Swap the primary key back to native order if we read it
	 * successfully, or if we swapped it on entry above; we must not
	 * return with the application's data modified.
	 */
	if (ret == 0 || flags == DB_GET_BOTH || flags == DB_GET_BOTHC ||
	    flags == DB_GET_BOTH_RANGE)
		SWAP_IF_NEEDED(sdbp, pkey);

	if (ret != 0)
		goto err;

	/*
	 * Step 2: look the primary key up in the primary.  Use
	 * __db_cursor_int rather than pdbp->cursor so that in CDB we
	 * reuse our locker ID instead of allocating a new one and
	 * risking deadlock.
	 */
	if ((ret = __db_cursor_int(pdbp, dbc->thread_info,
	    dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0)
		goto err;

	F_SET(pdbc, tmp_read_locking |
	    F_ISSET(dbc, DBC_READ_UNCOMMITTED | DBC_READ_COMMITTED | DBC_RMW));

	/*
	 * pkey is about to be used a second time.  With DB_DBT_MALLOC the
	 * first buffer would leak, so temporarily switch to DB_DBT_REALLOC
	 * (this assumes a compatible realloc accompanies a user malloc).
	 * With DB_DBT_USERCOPY, take a fresh copy now; for
	 * DB_GET_BOTH_RANGE the value may have changed in the search, so
	 * drop the stale copy first.
	 */
	if (F_ISSET(pkey, DB_DBT_MALLOC)) {
		F_CLR(pkey, DB_DBT_MALLOC);
		F_SET(pkey, DB_DBT_REALLOC);
		pkeymalloc = 1;
	} else if (F_ISSET(pkey, DB_DBT_USERCOPY)) {
		if (flags == DB_GET_BOTH_RANGE)
			__dbt_userfree(sdbp->env, NULL, pkey, NULL);
		if ((ret = __dbt_usercopy(sdbp->env, pkey)) != 0)
			goto err;
	}

	/*
	 * DBC_TRANSIENT because we don't care about preserving the
	 * position on error; SET_RET_MEM so the secondary cursor owns
	 * any returned-data memory.
	 */
	F_SET(pdbc, DBC_TRANSIENT);
	SET_RET_MEM(pdbc, dbc);
	ret = __dbc_get(pdbc, pkey, data, DB_SET);

	/* Now close the primary cursor. */
	if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * DB_NOTFOUND from the primary means the secondary points at a
	 * nonexistent record: corruption, unless a dirty reader raced a
	 * writer, in which case positional operations simply retry.
	 *
	 * NOTE(review): pdbc's flags are inspected here after
	 * __dbc_close(pdbc); this relies on closed cursors remaining
	 * readable (they are recycled, not freed) -- confirm upstream.
	 */
	else if (ret == DB_NOTFOUND) {
		if (!F_ISSET(pdbc, DBC_READ_UNCOMMITTED))
			ret = __db_secondary_corrupt(pdbp);
		else switch (flags) {
		case DB_GET_BOTHC:
		case DB_NEXT:
		case DB_NEXT_DUP:
		case DB_NEXT_NODUP:
		case DB_PREV:
		case DB_PREV_DUP:
		case DB_PREV_NODUP:
			goto retry;
		default:
			break;
		}
	}

err:	/* Cleanup and cursor resolution. */
	if ((t_ret = __dbc_cleanup(dbc, dbc_n, ret)) != 0 && ret == 0)
		ret = t_ret;
	if (pkeymalloc) {
		/*
		 * Restore pkey's original MALLOC flag; otherwise a caller
		 * who frees the buffer and reuses the DBT unchanged could
		 * crash on a stale REALLOC pointer.
		 */
		F_CLR(pkey, DB_DBT_REALLOC);
		F_SET(pkey, DB_DBT_MALLOC);
	}

	return (ret);
}
+ */ + F_CLR(pkey, DB_DBT_REALLOC); + F_SET(pkey, DB_DBT_MALLOC); + } + + return (ret); +} + +/* + * __dbc_pget_recno -- + * Perform a DB_GET_RECNO c_pget on a secondary index. Returns + * the secondary's record number in the pkey field and the primary's + * in the data field. + */ +static int +__dbc_pget_recno(sdbc, pkey, data, flags) + DBC *sdbc; + DBT *pkey, *data; + u_int32_t flags; +{ + DB *pdbp, *sdbp; + DBC *pdbc; + DBT discardme, primary_key; + ENV *env; + db_recno_t oob; + u_int32_t rmw; + int ret, t_ret; + + sdbp = sdbc->dbp; + pdbp = sdbp->s_primary; + env = sdbp->env; + pdbc = NULL; + ret = t_ret = 0; + + rmw = LF_ISSET(DB_RMW); + + memset(&discardme, 0, sizeof(DBT)); + F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + oob = RECNO_OOB; + + /* + * If the primary is an rbtree, we want its record number, whether + * or not the secondary is one too. Fetch the recno into "data". + * + * If it's not an rbtree, return RECNO_OOB in "data". + */ + if (F_ISSET(pdbp, DB_AM_RECNUM)) { + /* + * Get the primary key, so we can find the record number + * in the primary. (We're uninterested in the secondary key.) + */ + memset(&primary_key, 0, sizeof(DBT)); + F_SET(&primary_key, DB_DBT_MALLOC); + if ((ret = __dbc_get(sdbc, + &discardme, &primary_key, rmw | DB_CURRENT)) != 0) + return (ret); + + /* + * Open a cursor on the primary, set it to the right record, + * and fetch its recno into "data". + * + * (See __dbc_pget for comments on the use of __db_cursor_int.) + * + * SET_RET_MEM so that the secondary DBC owns any returned-data + * memory. 
+ */ + if ((ret = __db_cursor_int(pdbp, sdbc->thread_info, sdbc->txn, + pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0) + goto perr; + SET_RET_MEM(pdbc, sdbc); + if ((ret = __dbc_get(pdbc, + &primary_key, &discardme, rmw | DB_SET)) != 0) + goto perr; + + ret = __dbc_get(pdbc, &discardme, data, rmw | DB_GET_RECNO); + +perr: __os_ufree(env, primary_key.data); + if (pdbc != NULL && + (t_ret = __dbc_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + } else if ((ret = __db_retcopy(env, data, &oob, + sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0) + return (ret); + + /* + * If the secondary is an rbtree, we want its record number, whether + * or not the primary is one too. Fetch the recno into "pkey". + * + * If it's not an rbtree, return RECNO_OOB in "pkey". + */ + if (F_ISSET(sdbp, DB_AM_RECNUM)) + return (__dbc_get(sdbc, &discardme, pkey, flags)); + else + return (__db_retcopy(env, pkey, &oob, + sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen)); +} + +/* + * __db_wrlock_err -- do not have a write lock. + */ +static int +__db_wrlock_err(env) + ENV *env; +{ + __db_errx(env, "Write attempted on read-only cursor"); + return (EPERM); +} + +/* + * __dbc_del_secondary -- + * Perform a delete operation on a secondary index: call through + * to the primary and delete the primary record that this record + * points to. + * + * Note that deleting the primary record will call c_del on all + * the secondaries, including this one; thus, it is not necessary + * to execute both this function and an actual delete. + */ +static int +__dbc_del_secondary(dbc) + DBC *dbc; +{ + DB *pdbp; + DBC *pdbc; + DBT skey, pkey; + ENV *env; + int ret, t_ret; + u_int32_t rmw; + + pdbp = dbc->dbp->s_primary; + env = pdbp->env; + rmw = STD_LOCKING(dbc) ? DB_RMW : 0; + + /* + * Get the current item that we're pointing at. + * We don't actually care about the secondary key, just + * the primary. 
+ */ + memset(&skey, 0, sizeof(DBT)); + memset(&pkey, 0, sizeof(DBT)); + F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = __dbc_get(dbc, &skey, &pkey, DB_CURRENT)) != 0) + return (ret); + + SWAP_IF_NEEDED(dbc->dbp, &pkey); + + /* + * Create a cursor on the primary with our locker ID, + * so that when it calls back, we don't conflict. + * + * We create a cursor explicitly because there's no + * way to specify the same locker ID if we're using + * locking but not transactions if we use the DB->del + * interface. This shouldn't be any less efficient + * anyway. + */ + if ((ret = __db_cursor_int(pdbp, dbc->thread_info, dbc->txn, + pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) + return (ret); + + /* + * See comment in __dbc_put--if we're in CDB, + * we already hold the locks we need, and we need to flag + * the cursor as a WRITER so we don't run into errors + * when we try to delete. + */ + if (CDB_LOCKING(env)) { + DB_ASSERT(env, pdbc->mylock.off == LOCK_INVALID); + F_SET(pdbc, DBC_WRITER); + } + + /* + * Set the new cursor to the correct primary key. Then + * delete it. We don't really care about the datum; + * just reuse our skey DBT. + * + * If the primary get returns DB_NOTFOUND, something is amiss-- + * every record in the secondary should correspond to some record + * in the primary. + */ + if ((ret = __dbc_get(pdbc, &pkey, &skey, DB_SET | rmw)) == 0) + ret = __dbc_del(pdbc, 0); + else if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(pdbp); + + if ((t_ret = __dbc_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __dbc_del_primary -- + * Perform a delete operation on a primary index. Loop through + * all the secondary indices which correspond to this primary + * database, and delete any secondary keys that point at the current + * record. 
+ * + * PUBLIC: int __dbc_del_primary __P((DBC *)); + */ +int +__dbc_del_primary(dbc) + DBC *dbc; +{ + DB *dbp, *sdbp; + DBC *sdbc; + DBT *tskeyp; + DBT data, pkey, skey, temppkey, tempskey; + ENV *env; + u_int32_t nskey, rmw; + int ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + sdbp = NULL; + rmw = STD_LOCKING(dbc) ? DB_RMW : 0; + + /* + * If we're called at all, we have at least one secondary. + * (Unfortunately, we can't assert this without grabbing the mutex.) + * Get the current record so that we can construct appropriate + * secondary keys as needed. + */ + memset(&pkey, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + if ((ret = __dbc_get(dbc, &pkey, &data, DB_CURRENT)) != 0) + return (ret); + + memset(&skey, 0, sizeof(DBT)); + for (ret = __db_s_first(dbp, &sdbp); + sdbp != NULL && ret == 0; + ret = __db_s_next(&sdbp, dbc->txn)) { + /* + * Get the secondary key for this secondary and the current + * item. + */ + if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) { + /* Not indexing is equivalent to an empty key set. */ + if (ret == DB_DONOTINDEX) { + F_SET(&skey, DB_DBT_MULTIPLE); + skey.size = 0; + } else /* We had a substantive error. Bail. */ + goto err; + } + +#ifdef DIAGNOSTIC + if (F_ISSET(&skey, DB_DBT_MULTIPLE)) + __db_check_skeyset(sdbp, &skey); +#endif + + if (F_ISSET(&skey, DB_DBT_MULTIPLE)) { + tskeyp = (DBT *)skey.data; + nskey = skey.size; + if (nskey == 0) + continue; + } else { + tskeyp = &skey; + nskey = 1; + } + + /* Open a secondary cursor. */ + if ((ret = __db_cursor_int(sdbp, + dbc->thread_info, dbc->txn, sdbp->type, + PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) + goto err; + /* See comment above and in __dbc_put. */ + if (CDB_LOCKING(env)) { + DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + for (; nskey > 0; nskey--, tskeyp++) { + /* + * Set the secondary cursor to the appropriate item. + * Delete it. + * + * We want to use DB_RMW if locking is on; it's only + * legal then, though. 
+ * + * !!! + * Don't stomp on any callback-allocated buffer in skey + * when we do a c_get(DB_GET_BOTH); use a temp DBT + * instead. Similarly, don't allow pkey to be + * invalidated when the cursor is closed. + */ + DB_INIT_DBT(tempskey, tskeyp->data, tskeyp->size); + SWAP_IF_NEEDED(sdbp, &pkey); + DB_INIT_DBT(temppkey, pkey.data, pkey.size); + if ((ret = __dbc_get(sdbc, &tempskey, &temppkey, + DB_GET_BOTH | rmw)) == 0) + ret = __dbc_del(sdbc, DB_UPDATE_SECONDARY); + else if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(dbp); + SWAP_IF_NEEDED(sdbp, &pkey); + FREE_IF_NEEDED(env, tskeyp); + } + + if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; + + /* + * In the common case where there is a single secondary key, we + * will have freed any application-allocated data in skey + * already. In the multiple key case, we need to free it here. + * It is safe to do this twice as the macro resets the data + * field. + */ + FREE_IF_NEEDED(env, &skey); + } + +err: if (sdbp != NULL && + (t_ret = __db_s_done(sdbp, dbc->txn)) != 0 && ret == 0) + ret = t_ret; + FREE_IF_NEEDED(env, &skey); + return (ret); +} + +/* + * __dbc_del_foreign -- + * Apply the foreign database constraints for a particular foreign + * database when an item is being deleted (dbc points at item being deleted + * in the foreign database.) + * + * Delete happens in dbp, check for occurrences of key in pdpb. + * Terminology: + * Foreign db = Where delete occurs (dbp). + * Secondary db = Where references to dbp occur (sdbp, a secondary) + * Primary db = sdbp's primary database, references to dbp are secondary + * keys here + * Foreign Key = Key being deleted in dbp (fkey) + * Primary Key = Key of the corresponding entry in sdbp's primary (pkey). 
+ */ +static int +__dbc_del_foreign(dbc) + DBC *dbc; +{ + DB_FOREIGN_INFO *f_info; + DB *dbp, *pdbp, *sdbp; + DBC *pdbc, *sdbc; + DBT data, fkey, pkey; + ENV *env; + u_int32_t flags, rmw; + int changed, ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + + memset(&fkey, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + if ((ret = __dbc_get(dbc, &fkey, &data, DB_CURRENT)) != 0) + return (ret); + + LIST_FOREACH(f_info, &(dbp->f_primaries), f_links) { + sdbp = f_info->dbp; + pdbp = sdbp->s_primary; + flags = f_info->flags; + + rmw = (STD_LOCKING(dbc) && + !LF_ISSET(DB_FOREIGN_ABORT)) ? DB_RMW : 0; + + /* + * Handle CDB locking. Some of this is copied from + * __dbc_del_primary, but a bit more acrobatics are required. + * If we're not going to abort, then we need to get a write + * cursor. If CDB_ALLDB is set, then only one write cursor is + * allowed and we hold it, so we fudge things and promote the + * cursor on the other DBs manually, it won't cause a problem. + * If CDB_ALLDB is not set, then we go through the usual route + * to make sure we block as necessary. If there are any open + * read cursors on sdbp, the delete or put call later will + * block. + * + * If NULLIFY is set, we'll need a cursor on the primary to + * update it with the nullified data. Because primary and + * secondary dbs share a lock file ID in CDB, we open a cursor + * on the secondary and then get another writeable cursor on the + * primary via __db_cursor_int to avoid deadlocking. 
+ */ + sdbc = pdbc = NULL; + if (!LF_ISSET(DB_FOREIGN_ABORT) && CDB_LOCKING(env) && + !F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) { + ret = __db_cursor(sdbp, + dbc->thread_info, dbc->txn, &sdbc, DB_WRITECURSOR); + if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) { + ret = __db_cursor_int(pdbp, + dbc->thread_info, dbc->txn, pdbp->type, + PGNO_INVALID, 0, dbc->locker, &pdbc); + F_SET(pdbc, DBC_WRITER); + } + } else { + ret = __db_cursor_int(sdbp, dbc->thread_info, dbc->txn, + sdbp->type, PGNO_INVALID, 0, dbc->locker, &sdbc); + if (LF_ISSET(DB_FOREIGN_NULLIFY) && ret == 0) + ret = __db_cursor_int(pdbp, dbc->thread_info, + dbc->txn, pdbp->type, PGNO_INVALID, 0, + dbc->locker, &pdbc); + } + if (ret != 0) { + if (sdbc != NULL) + (void)__dbc_close(sdbc); + return (ret); + } + if (CDB_LOCKING(env) && F_ISSET(env->dbenv, DB_ENV_CDB_ALLDB)) { + DB_ASSERT(env, sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + if (LF_ISSET(DB_FOREIGN_NULLIFY) && pdbc != NULL) { + DB_ASSERT(env, + pdbc->mylock.off == LOCK_INVALID); + F_SET(pdbc, DBC_WRITER); + } + } + + /* + * There are three actions possible when a foreign database has + * items corresponding to a deleted item: + * DB_FOREIGN_ABORT - The delete operation should be aborted. + * DB_FOREIGN_CASCADE - All corresponding foreign items should + * be deleted. + * DB_FOREIGN_NULLIFY - A callback needs to be made, allowing + * the application to modify the data DBT from the + * associated database. 
If the callback makes a + * modification, the updated item needs to replace the + * original item in the foreign db + */ + memset(&pkey, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + ret = __dbc_pget(sdbc, &fkey, &pkey, &data, DB_SET|rmw); + + if (ret == DB_NOTFOUND) { + /* No entry means no constraint */ + ret = __dbc_close(sdbc); + if (LF_ISSET(DB_FOREIGN_NULLIFY) && + (t_ret = __dbc_close(pdbc)) != 0) + ret = t_ret; + if (ret != 0) + return (ret); + continue; + } else if (ret != 0) { + /* Just return the error code from the pget */ + (void)__dbc_close(sdbc); + if (LF_ISSET(DB_FOREIGN_NULLIFY)) + (void)__dbc_close(pdbc); + return (ret); + } else if (LF_ISSET(DB_FOREIGN_ABORT)) { + /* If the record exists and ABORT is set, we're done */ + if ((ret = __dbc_close(sdbc)) != 0) + return (ret); + return (DB_FOREIGN_CONFLICT); + } + + /* + * There were matching items in the primary DB, and the action + * is either DB_FOREIGN_CASCADE or DB_FOREIGN_NULLIFY. + */ + while (ret == 0) { + if (LF_ISSET(DB_FOREIGN_CASCADE)) { + /* + * Don't use the DB_UPDATE_SECONDARY flag, + * since we want the delete to cascade into the + * secondary's primary. + */ + if ((ret = __dbc_del(sdbc, 0)) != 0) { + __db_err(env, ret, + "Attempt to execute cascading delete in a foreign index failed"); + break; + } + } else if (LF_ISSET(DB_FOREIGN_NULLIFY)) { + changed = 0; + if ((ret = f_info->callback(sdbp, + &pkey, &data, &fkey, &changed)) != 0) { + __db_err(env, ret, + "Foreign database application callback"); + break; + } + + /* + * If the user callback modified the DBT and + * a put on the primary failed. + */ + if (changed && (ret = __dbc_put(pdbc, + &pkey, &data, DB_KEYFIRST)) != 0) { + __db_err(env, ret, + "Attempt to overwrite item in foreign database with nullified value failed"); + break; + } + } + /* retrieve the next matching item from the prim. 
db */ + memset(&pkey, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + ret = __dbc_pget(sdbc, + &fkey, &pkey, &data, DB_NEXT_DUP|rmw); + } + + if (ret == DB_NOTFOUND) + ret = 0; + if ((t_ret = __dbc_close(sdbc)) != 0 && ret == 0) + ret = t_ret; + if (LF_ISSET(DB_FOREIGN_NULLIFY) && + (t_ret = __dbc_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + } + + return (ret); +} + +/* + * __db_s_first -- + * Get the first secondary, if any are present, from the primary. + * + * PUBLIC: int __db_s_first __P((DB *, DB **)); + */ +int +__db_s_first(pdbp, sdbpp) + DB *pdbp, **sdbpp; +{ + DB *sdbp; + + MUTEX_LOCK(pdbp->env, pdbp->mutex); + sdbp = LIST_FIRST(&pdbp->s_secondaries); + + /* See __db_s_next. */ + if (sdbp != NULL) + sdbp->s_refcnt++; + MUTEX_UNLOCK(pdbp->env, pdbp->mutex); + + *sdbpp = sdbp; + + return (0); +} + +/* + * __db_s_next -- + * Get the next secondary in the list. + * + * PUBLIC: int __db_s_next __P((DB **, DB_TXN *)); + */ +int +__db_s_next(sdbpp, txn) + DB **sdbpp; + DB_TXN *txn; +{ + DB *sdbp, *pdbp, *closeme; + ENV *env; + int ret; + + /* + * Secondary indices are kept in a linked list, s_secondaries, + * off each primary DB handle. If a primary is free-threaded, + * this list may only be traversed or modified while the primary's + * thread mutex is held. + * + * The tricky part is that we don't want to hold the thread mutex + * across the full set of secondary puts necessary for each primary + * put, or we'll wind up essentially single-threading all the puts + * to the handle; the secondary puts will each take about as + * long as the primary does, and may require I/O. So we instead + * hold the thread mutex only long enough to follow one link to the + * next secondary, and then we release it before performing the + * actual secondary put. 
+ * + * The only danger here is that we might legitimately close a + * secondary index in one thread while another thread is performing + * a put and trying to update that same secondary index. To + * prevent this from happening, we refcount the secondary handles. + * If close is called on a secondary index handle while we're putting + * to it, it won't really be closed--the refcount will simply drop, + * and we'll be responsible for closing it here. + */ + sdbp = *sdbpp; + pdbp = sdbp->s_primary; + env = pdbp->env; + closeme = NULL; + + MUTEX_LOCK(env, pdbp->mutex); + DB_ASSERT(env, sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + closeme = sdbp; + } + sdbp = LIST_NEXT(sdbp, s_links); + if (sdbp != NULL) + sdbp->s_refcnt++; + MUTEX_UNLOCK(env, pdbp->mutex); + + *sdbpp = sdbp; + + /* + * closeme->close() is a wrapper; call __db_close explicitly. + */ + if (closeme == NULL) + ret = 0; + else + ret = __db_close(closeme, txn, 0); + + return (ret); +} + +/* + * __db_s_done -- + * Properly decrement the refcount on a secondary database handle we're + * using, without calling __db_s_next. + * + * PUBLIC: int __db_s_done __P((DB *, DB_TXN *)); + */ +int +__db_s_done(sdbp, txn) + DB *sdbp; + DB_TXN *txn; +{ + DB *pdbp; + ENV *env; + int doclose, ret; + + pdbp = sdbp->s_primary; + env = pdbp->env; + doclose = 0; + + MUTEX_LOCK(env, pdbp->mutex); + DB_ASSERT(env, sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + doclose = 1; + } + MUTEX_UNLOCK(env, pdbp->mutex); + + if (doclose == 0) + ret = 0; + else + ret = __db_close(sdbp, txn, 0); + return (ret); +} + +/* + * __db_s_count -- + * Count the number of secondaries associated with a given primary. 
+ */ +static int +__db_s_count(pdbp) + DB *pdbp; +{ + DB *sdbp; + ENV *env; + int count; + + env = pdbp->env; + count = 0; + + MUTEX_LOCK(env, pdbp->mutex); + for (sdbp = LIST_FIRST(&pdbp->s_secondaries); + sdbp != NULL; + sdbp = LIST_NEXT(sdbp, s_links)) + ++count; + MUTEX_UNLOCK(env, pdbp->mutex); + + return (count); +} + +/* + * __db_buildpartial -- + * Build the record that will result after a partial put is applied to + * an existing record. + * + * This should probably be merged with __bam_build, but that requires + * a little trickery if we plan to keep the overflow-record optimization + * in that function. + * + * PUBLIC: int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *)); + */ +int +__db_buildpartial(dbp, oldrec, partial, newrec) + DB *dbp; + DBT *oldrec, *partial, *newrec; +{ + ENV *env; + u_int32_t len, nbytes; + u_int8_t *buf; + int ret; + + env = dbp->env; + + DB_ASSERT(env, F_ISSET(partial, DB_DBT_PARTIAL)); + + memset(newrec, 0, sizeof(DBT)); + + nbytes = __db_partsize(oldrec->size, partial); + newrec->size = nbytes; + + if ((ret = __os_malloc(env, nbytes, &buf)) != 0) + return (ret); + newrec->data = buf; + + /* Nul or pad out the buffer, for any part that isn't specified. */ + memset(buf, + F_ISSET(dbp, DB_AM_FIXEDLEN) ? ((BTREE *)dbp->bt_internal)->re_pad : + 0, nbytes); + + /* Copy in any leading data from the original record. */ + memcpy(buf, oldrec->data, + partial->doff > oldrec->size ? oldrec->size : partial->doff); + + /* Copy the data from partial. */ + memcpy(buf + partial->doff, partial->data, partial->size); + + /* Copy any trailing data from the original record. */ + len = partial->doff + partial->dlen; + if (oldrec->size > len) + memcpy(buf + partial->doff + partial->size, + (u_int8_t *)oldrec->data + len, oldrec->size - len); + + return (0); +} + +/* + * __db_partsize -- + * Given the number of bytes in an existing record and a DBT that + * is about to be partial-put, calculate the size of the record + * after the put. 
+ * + * This code is called from __bam_partsize. + * + * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *)); + */ +u_int32_t +__db_partsize(nbytes, data) + u_int32_t nbytes; + DBT *data; +{ + + /* + * There are really two cases here: + * + * Case 1: We are replacing some bytes that do not exist (i.e., they + * are past the end of the record). In this case the number of bytes + * we are replacing is irrelevant and all we care about is how many + * bytes we are going to add from offset. So, the new record length + * is going to be the size of the new bytes (size) plus wherever those + * new bytes begin (doff). + * + * Case 2: All the bytes we are replacing exist. Therefore, the new + * size is the oldsize (nbytes) minus the bytes we are replacing (dlen) + * plus the bytes we are adding (size). + */ + if (nbytes < data->doff + data->dlen) /* Case 1 */ + return (data->doff + data->size); + + return (nbytes + data->size - data->dlen); /* Case 2 */ +} + +#ifdef DIAGNOSTIC +/* + * __db_check_skeyset -- + * Diagnostic check that the application's callback returns a set of + * secondary keys without repeats. + * + * PUBLIC: #ifdef DIAGNOSTIC + * PUBLIC: void __db_check_skeyset __P((DB *, DBT *)); + * PUBLIC: #endif + */ +void +__db_check_skeyset(sdbp, skeyp) + DB *sdbp; + DBT *skeyp; +{ + DBT *firstkey, *lastkey, *key1, *key2; + ENV *env; + + env = sdbp->env; + + firstkey = (DBT *)skeyp->data; + lastkey = firstkey + skeyp->size; + for (key1 = firstkey; key1 < lastkey; key1++) + for (key2 = key1 + 1; key2 < lastkey; key2++) + DB_ASSERT(env, + ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, + key1, key2) != 0); +} +#endif diff --git a/db/db_cds.c b/db/db_cds.c new file mode 100644 index 0000000..5efda31 --- /dev/null +++ b/db/db_cds.c @@ -0,0 +1,177 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2000-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/lock.h" +#include "dbinc/txn.h" + +static int __cdsgroup_abort __P((DB_TXN *txn)); +static int __cdsgroup_commit __P((DB_TXN *txn, u_int32_t flags)); +static int __cdsgroup_discard __P((DB_TXN *txn, u_int32_t flags)); +static u_int32_t __cdsgroup_id __P((DB_TXN *txn)); +static int __cdsgroup_notsup __P((ENV *env, const char *meth)); +static int __cdsgroup_prepare __P((DB_TXN *txn, u_int8_t *gid)); +static int __cdsgroup_set_name __P((DB_TXN *txn, const char *name)); +static int __cdsgroup_set_timeout + __P((DB_TXN *txn, db_timeout_t timeout, u_int32_t flags)); + +/* + * __cdsgroup_notsup -- + * Error when CDS groups don't support a method. + */ +static int +__cdsgroup_notsup(env, meth) + ENV *env; + const char *meth; +{ + __db_errx(env, "CDS groups do not support %s", meth); + return (DB_OPNOTSUP); +} + +static int +__cdsgroup_abort(txn) + DB_TXN *txn; +{ + return (__cdsgroup_notsup(txn->mgrp->env, "abort")); +} + +static int +__cdsgroup_commit(txn, flags) + DB_TXN *txn; + u_int32_t flags; +{ + DB_LOCKER *locker; + DB_LOCKREQ lreq; + ENV *env; + int ret, t_ret; + + COMPQUIET(flags, 0); + env = txn->mgrp->env; + + /* Check for live cursors. */ + if (txn->cursors != 0) { + __db_errx(env, "CDS group has active cursors"); + return (EINVAL); + } + + /* We may be holding handle locks; release them. 
*/ + lreq.op = DB_LOCK_PUT_ALL; + lreq.obj = NULL; + ret = __lock_vec(env, txn->locker, 0, &lreq, 1, NULL); + + env = txn->mgrp->env; + locker = txn->locker; + __os_free(env, txn->mgrp); + __os_free(env, txn); + if ((t_ret = __lock_id_free(env, locker)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +static int __cdsgroup_discard(txn, flags) + DB_TXN *txn; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + return (__cdsgroup_notsup(txn->mgrp->env, "discard")); +} + +static u_int32_t __cdsgroup_id(txn) + DB_TXN *txn; +{ + return (txn->txnid); +} + +static int __cdsgroup_prepare(txn, gid) + DB_TXN *txn; + u_int8_t *gid; +{ + COMPQUIET(gid, NULL); + return (__cdsgroup_notsup(txn->mgrp->env, "prepare")); +} + +static int __cdsgroup_set_name(txn, name) + DB_TXN *txn; + const char *name; +{ + COMPQUIET(name, NULL); + return (__cdsgroup_notsup(txn->mgrp->env, "set_name")); +} + +static int __cdsgroup_set_timeout(txn, timeout, flags) + DB_TXN *txn; + db_timeout_t timeout; + u_int32_t flags; +{ + COMPQUIET(timeout, 0); + COMPQUIET(flags, 0); + return (__cdsgroup_notsup(txn->mgrp->env, "set_timeout")); +} + +/* + * __cds_txn_begin -- + * ENV->cdsgroup_begin + * + * PUBLIC: int __cdsgroup_begin __P((DB_ENV *, DB_TXN **)); + */ +int +__cdsgroup_begin(dbenv, txnpp) + DB_ENV *dbenv; + DB_TXN **txnpp; +{ + DB_THREAD_INFO *ip; + DB_TXN *txn; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_ILLEGAL_BEFORE_OPEN(env, "cdsgroup_begin"); + if (!CDB_LOCKING(env)) + return (__env_not_config(env, "cdsgroup_begin", DB_INIT_CDB)); + + ENV_ENTER(env, ip); + *txnpp = txn = NULL; + if ((ret = __os_calloc(env, 1, sizeof(DB_TXN), &txn)) != 0) + goto err; + /* + * We need a dummy DB_TXNMGR -- it's the only way to get from a + * transaction handle to the environment handle. 
+ */ + if ((ret = __os_calloc(env, 1, sizeof(DB_TXNMGR), &txn->mgrp)) != 0) + goto err; + txn->mgrp->env = env; + + if ((ret = __lock_id(env, &txn->txnid, &txn->locker)) != 0) + goto err; + + txn->flags = TXN_CDSGROUP; + txn->abort = __cdsgroup_abort; + txn->commit = __cdsgroup_commit; + txn->discard = __cdsgroup_discard; + txn->id = __cdsgroup_id; + txn->prepare = __cdsgroup_prepare; + txn->set_name = __cdsgroup_set_name; + txn->set_timeout = __cdsgroup_set_timeout; + + *txnpp = txn; + + if (0) { +err: if (txn != NULL) { + if (txn->mgrp != NULL) + __os_free(env, txn->mgrp); + __os_free(env, txn); + } + } + ENV_LEAVE(env, ip); + return (ret); +} diff --git a/db/db_conv.c b/db/db_conv.c new file mode 100644 index 0000000..4572683 --- /dev/null +++ b/db/db_conv.c @@ -0,0 +1,733 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/log.h" +#include "dbinc/qam.h" + +/* + * __db_pgin -- + * Primary page-swap routine. + * + * PUBLIC: int __db_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *)); + */ +int +__db_pgin(dbenv, pg, pp, cookie) + DB_ENV *dbenv; + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB dummydb, *dbp; + DB_CIPHER *db_cipher; + DB_LSN not_used; + DB_PGINFO *pginfo; + ENV *env; + PAGE *pagep; + size_t sum_len; + int is_hmac, ret; + u_int8_t *chksum; + + pginfo = (DB_PGINFO *)cookie->data; + env = dbenv->env; + pagep = (PAGE *)pp; + + ret = is_hmac = 0; + chksum = NULL; + memset(&dummydb, 0, sizeof(DB)); + dbp = &dummydb; + dbp->dbenv = dbenv; + dbp->env = env; + dbp->flags = pginfo->flags; + dbp->pgsize = pginfo->db_pagesize; + db_cipher = env->crypto_handle; + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * If checksumming is set on the meta-page, we must set + * it in the dbp. 
+ */ + if (FLD_ISSET(((DBMETA *)pp)->metaflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + else + F_CLR(dbp, DB_AM_CHKSUM); + if (((DBMETA *)pp)->encrypt_alg != 0 || + F_ISSET(dbp, DB_AM_ENCRYPT)) + is_hmac = 1; + /* + * !!! + * For all meta pages it is required that the chksum + * be at the same location. Use BTMETA to get to it + * for any meta type. + */ + chksum = ((BTMETA *)pp)->chksum; + sum_len = DBMETASIZE; + break; + case P_INVALID: + /* + * We assume that we've read a file hole if we have + * a zero LSN, zero page number and P_INVALID. Otherwise + * we have an invalid page that might contain real data. + */ + if (IS_ZERO_LSN(LSN(pagep)) && pagep->pgno == PGNO_INVALID) { + sum_len = 0; + break; + } + /* FALLTHROUGH */ + default: + chksum = P_CHKSUM(dbp, pagep); + sum_len = pginfo->db_pagesize; + /* + * If we are reading in a non-meta page, then if we have + * a db_cipher then we are using hmac. + */ + is_hmac = CRYPTO_ON(env) ? 1 : 0; + break; + } + + /* + * We expect a checksum error if there was a configuration problem. + * If there is no configuration problem and we don't get a match, + * it's fatal: panic the system. 
+ */ + if (F_ISSET(dbp, DB_AM_CHKSUM) && sum_len != 0) { + if (F_ISSET(dbp, DB_AM_SWAP) && is_hmac == 0) + P_32_SWAP(chksum); + switch (ret = __db_check_chksum( + env, NULL, db_cipher, chksum, pp, sum_len, is_hmac)) { + case 0: + break; + case -1: + if (DBENV_LOGGING(env)) + (void)__db_cksum_log( + env, NULL, ¬_used, DB_FLUSH); + __db_errx(env, + "checksum error: page %lu: catastrophic recovery required", + (u_long)pg); + return (__env_panic(env, DB_RUNRECOVERY)); + default: + return (ret); + } + } + if ((ret = __db_decrypt_pg(env, dbp, pagep)) != 0) + return (ret); + switch (pagep->type) { + case P_INVALID: + if (pginfo->type == DB_QUEUE) + return (__qam_pgin_out(env, pg, pp, cookie)); + else + return (__ham_pgin(dbp, pg, pp, cookie)); + case P_HASH_UNSORTED: + case P_HASH: + case P_HASHMETA: + return (__ham_pgin(dbp, pg, pp, cookie)); + case P_BTREEMETA: + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + case P_OVERFLOW: + return (__bam_pgin(dbp, pg, pp, cookie)); + case P_QAMMETA: + case P_QAMDATA: + return (__qam_pgin_out(env, pg, pp, cookie)); + default: + break; + } + return (__db_pgfmt(env, pg)); +} + +/* + * __db_pgout -- + * Primary page-swap routine. 
+ * + * PUBLIC: int __db_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *)); + */ +int +__db_pgout(dbenv, pg, pp, cookie) + DB_ENV *dbenv; + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB dummydb, *dbp; + DB_PGINFO *pginfo; + ENV *env; + PAGE *pagep; + int ret; + + pginfo = (DB_PGINFO *)cookie->data; + env = dbenv->env; + pagep = (PAGE *)pp; + + memset(&dummydb, 0, sizeof(DB)); + dbp = &dummydb; + dbp->dbenv = dbenv; + dbp->env = env; + dbp->flags = pginfo->flags; + dbp->pgsize = pginfo->db_pagesize; + ret = 0; + switch (pagep->type) { + case P_INVALID: + if (pginfo->type == DB_QUEUE) + ret = __qam_pgin_out(env, pg, pp, cookie); + else + ret = __ham_pgout(dbp, pg, pp, cookie); + break; + case P_HASH: + case P_HASH_UNSORTED: + /* + * Support pgout of unsorted hash pages - since online + * replication upgrade can cause pages of this type to be + * written out. + * + * FALLTHROUGH + */ + case P_HASHMETA: + ret = __ham_pgout(dbp, pg, pp, cookie); + break; + case P_BTREEMETA: + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + case P_OVERFLOW: + ret = __bam_pgout(dbp, pg, pp, cookie); + break; + case P_QAMMETA: + case P_QAMDATA: + ret = __qam_pgin_out(env, pg, pp, cookie); + break; + default: + return (__db_pgfmt(env, pg)); + } + if (ret) + return (ret); + + return (__db_encrypt_and_checksum_pg(env, dbp, pagep)); +} + +/* + * __db_decrypt_pg -- + * Utility function to decrypt a db page. 
+ * + * PUBLIC: int __db_decrypt_pg __P((ENV *, DB *, PAGE *)); + */ +int +__db_decrypt_pg (env, dbp, pagep) + ENV *env; + DB *dbp; + PAGE *pagep; +{ + DB_CIPHER *db_cipher; + size_t pg_len, pg_off; + u_int8_t *iv; + int ret; + + db_cipher = env->crypto_handle; + ret = 0; + iv = NULL; + if (F_ISSET(dbp, DB_AM_ENCRYPT)) { + DB_ASSERT(env, db_cipher != NULL); + DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM)); + + pg_off = P_OVERHEAD(dbp); + DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0); + + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * !!! + * For all meta pages it is required that the iv + * be at the same location. Use BTMETA to get to it + * for any meta type. + */ + iv = ((BTMETA *)pagep)->iv; + pg_len = DBMETASIZE; + break; + case P_INVALID: + if (IS_ZERO_LSN(LSN(pagep)) && + pagep->pgno == PGNO_INVALID) { + pg_len = 0; + break; + } + /* FALLTHROUGH */ + default: + iv = P_IV(dbp, pagep); + pg_len = dbp->pgsize; + break; + } + if (pg_len != 0) + ret = db_cipher->decrypt(env, db_cipher->data, + iv, ((u_int8_t *)pagep) + pg_off, + pg_len - pg_off); + } + return (ret); +} + +/* + * __db_encrypt_and_checksum_pg -- + * Utility function to encrypt and checksum a db page. + * + * PUBLIC: int __db_encrypt_and_checksum_pg + * PUBLIC: __P((ENV *, DB *, PAGE *)); + */ +int +__db_encrypt_and_checksum_pg (env, dbp, pagep) + ENV *env; + DB *dbp; + PAGE *pagep; +{ + DB_CIPHER *db_cipher; + int ret; + size_t pg_off, pg_len, sum_len; + u_int8_t *chksum, *iv, *key; + + chksum = iv = key = NULL; + db_cipher = env->crypto_handle; + + if (F_ISSET(dbp, DB_AM_ENCRYPT)) { + DB_ASSERT(env, db_cipher != NULL); + DB_ASSERT(env, F_ISSET(dbp, DB_AM_CHKSUM)); + + pg_off = P_OVERHEAD(dbp); + DB_ASSERT(env, db_cipher->adj_size(pg_off) == 0); + + key = db_cipher->mac_key; + + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * !!! + * For all meta pages it is required that the iv + * be at the same location. 
Use BTMETA to get to it + * for any meta type. + */ + iv = ((BTMETA *)pagep)->iv; + pg_len = DBMETASIZE; + break; + default: + iv = P_IV(dbp, pagep); + pg_len = dbp->pgsize; + break; + } + if ((ret = db_cipher->encrypt(env, db_cipher->data, + iv, ((u_int8_t *)pagep) + pg_off, pg_len - pg_off)) != 0) + return (ret); + } + if (F_ISSET(dbp, DB_AM_CHKSUM)) { + switch (pagep->type) { + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + /* + * !!! + * For all meta pages it is required that the chksum + * be at the same location. Use BTMETA to get to it + * for any meta type. + */ + chksum = ((BTMETA *)pagep)->chksum; + sum_len = DBMETASIZE; + break; + default: + chksum = P_CHKSUM(dbp, pagep); + sum_len = dbp->pgsize; + break; + } + __db_chksum(NULL, (u_int8_t *)pagep, sum_len, key, chksum); + if (F_ISSET(dbp, DB_AM_SWAP) && !F_ISSET(dbp, DB_AM_ENCRYPT)) + P_32_SWAP(chksum); + } + return (0); +} + +/* + * __db_metaswap -- + * Byteswap the common part of the meta-data page. + * + * PUBLIC: void __db_metaswap __P((PAGE *)); + */ +void +__db_metaswap(pg) + PAGE *pg; +{ + u_int8_t *p; + + p = (u_int8_t *)pg; + + /* Swap the meta-data information. */ + SWAP32(p); /* lsn.file */ + SWAP32(p); /* lsn.offset */ + SWAP32(p); /* pgno */ + SWAP32(p); /* magic */ + SWAP32(p); /* version */ + SWAP32(p); /* pagesize */ + p += 4; /* unused, page type, unused, unused */ + SWAP32(p); /* free */ + SWAP32(p); /* alloc_lsn part 1 */ + SWAP32(p); /* alloc_lsn part 2 */ + SWAP32(p); /* cached key count */ + SWAP32(p); /* cached record count */ + SWAP32(p); /* flags */ +} + +/* + * __db_byteswap -- + * Byteswap an ordinary database page. 
+ * + * PUBLIC: int __db_byteswap + * PUBLIC: __P((DB *, db_pgno_t, PAGE *, size_t, int)); + */ +int +__db_byteswap(dbp, pg, h, pagesize, pgin) + DB *dbp; + db_pgno_t pg; + PAGE *h; + size_t pagesize; + int pgin; +{ + ENV *env; + BINTERNAL *bi; + BKEYDATA *bk; + BOVERFLOW *bo; + RINTERNAL *ri; + db_indx_t i, *inp, len, tmp; + u_int8_t *end, *p, *pgend; + + if (pagesize == 0) + return (0); + + env = dbp->env; + + if (pgin) { + M_32_SWAP(h->lsn.file); + M_32_SWAP(h->lsn.offset); + M_32_SWAP(h->pgno); + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + M_16_SWAP(h->entries); + M_16_SWAP(h->hf_offset); + } + + pgend = (u_int8_t *)h + pagesize; + + inp = P_INP(dbp, h); + if ((u_int8_t *)inp >= pgend) + goto out; + + switch (TYPE(h)) { + case P_HASH_UNSORTED: + case P_HASH: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(inp[i]); + + if (P_ENTRY(dbp, h, i) >= pgend) + continue; + + switch (HPAGE_TYPE(dbp, h, i)) { + case H_KEYDATA: + break; + case H_DUPLICATE: + len = LEN_HKEYDATA(dbp, h, pagesize, i); + p = HKEYDATA_DATA(P_ENTRY(dbp, h, i)); + for (end = p + len; p < end;) { + if (pgin) { + P_16_SWAP(p); + memcpy(&tmp, + p, sizeof(db_indx_t)); + p += sizeof(db_indx_t); + } else { + memcpy(&tmp, + p, sizeof(db_indx_t)); + SWAP16(p); + } + p += tmp; + SWAP16(p); + } + break; + case H_OFFDUP: + p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i)); + SWAP32(p); /* pgno */ + break; + case H_OFFPAGE: + p = HOFFPAGE_PGNO(P_ENTRY(dbp, h, i)); + SWAP32(p); /* pgno */ + SWAP32(p); /* tlen */ + break; + default: + return (__db_pgfmt(env, pg)); + } + + } + + /* + * The offsets in the inp array are used to determine + * the size of entries on a page; therefore they + * cannot be converted until we've done all the + * entries. 
+ */ + if (!pgin) + for (i = 0; i < NUM_ENT(h); i++) + M_16_SWAP(inp[i]); + break; + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(inp[i]); + + /* + * In the case of on-page duplicates, key information + * should only be swapped once. + */ + if (h->type == P_LBTREE && i > 1) { + if (pgin) { + if (inp[i] == inp[i - 2]) + continue; + } else { + M_16_SWAP(inp[i]); + if (inp[i] == inp[i - 2]) + continue; + M_16_SWAP(inp[i]); + } + } + + bk = GET_BKEYDATA(dbp, h, i); + if ((u_int8_t *)bk >= pgend) + continue; + switch (B_TYPE(bk->type)) { + case B_KEYDATA: + M_16_SWAP(bk->len); + break; + case B_DUPLICATE: + case B_OVERFLOW: + bo = (BOVERFLOW *)bk; + M_32_SWAP(bo->pgno); + M_32_SWAP(bo->tlen); + break; + default: + return (__db_pgfmt(env, pg)); + } + + if (!pgin) + M_16_SWAP(inp[i]); + } + break; + case P_IBTREE: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(inp[i]); + + bi = GET_BINTERNAL(dbp, h, i); + if ((u_int8_t *)bi >= pgend) + continue; + + M_16_SWAP(bi->len); + M_32_SWAP(bi->pgno); + M_32_SWAP(bi->nrecs); + + switch (B_TYPE(bi->type)) { + case B_KEYDATA: + break; + case B_DUPLICATE: + case B_OVERFLOW: + bo = (BOVERFLOW *)bi->data; + M_32_SWAP(bo->pgno); + M_32_SWAP(bo->tlen); + break; + default: + return (__db_pgfmt(env, pg)); + } + + if (!pgin) + M_16_SWAP(inp[i]); + } + break; + case P_IRECNO: + for (i = 0; i < NUM_ENT(h); i++) { + if (pgin) + M_16_SWAP(inp[i]); + + ri = GET_RINTERNAL(dbp, h, i); + if ((u_int8_t *)ri >= pgend) + continue; + + M_32_SWAP(ri->pgno); + M_32_SWAP(ri->nrecs); + + if (!pgin) + M_16_SWAP(inp[i]); + } + break; + case P_INVALID: + case P_OVERFLOW: + case P_QAMDATA: + /* Nothing to do. */ + break; + default: + return (__db_pgfmt(env, pg)); + } + +out: if (!pgin) { + /* Swap the header information. 
*/ + M_32_SWAP(h->lsn.file); + M_32_SWAP(h->lsn.offset); + M_32_SWAP(h->pgno); + M_32_SWAP(h->prev_pgno); + M_32_SWAP(h->next_pgno); + M_16_SWAP(h->entries); + M_16_SWAP(h->hf_offset); + } + return (0); +} + +/* + * __db_pageswap -- + * Byteswap any database page. Normally, the page to be swapped will be + * referenced by the "pp" argument and the pdata argument will be NULL. + * This function is also called by automatically generated log functions, + * where the page may be split into separate header and data parts. In + * that case, pdata is not NULL we reconsitute + * + * PUBLIC: int __db_pageswap + * PUBLIC: __P((DB *, void *, size_t, DBT *, int)); + */ +int +__db_pageswap(dbp, pp, len, pdata, pgin) + DB *dbp; + void *pp; + size_t len; + DBT *pdata; + int pgin; +{ + ENV *env; + db_pgno_t pg; + size_t pgsize; + void *pgcopy; + int ret; + u_int16_t hoffset; + + env = dbp->env; + + switch (TYPE(pp)) { + case P_BTREEMETA: + return (__bam_mswap(env, pp)); + + case P_HASHMETA: + return (__ham_mswap(env, pp)); + + case P_QAMMETA: + return (__qam_mswap(env, pp)); + + case P_INVALID: + case P_OVERFLOW: + case P_QAMDATA: + /* + * We may have been passed an invalid page, or a queue data + * page, or an overflow page where fields like hoffset have a + * special meaning. In that case, no swapping of the page data + * is required, just the fields in the page header. 
+ */ + pdata = NULL; + break; + + default: + break; + } + + if (pgin) { + P_32_COPYSWAP(&PGNO(pp), &pg); + P_16_COPYSWAP(&HOFFSET(pp), &hoffset); + } else { + pg = PGNO(pp); + hoffset = HOFFSET(pp); + } + + if (pdata == NULL) + ret = __db_byteswap(dbp, pg, (PAGE *)pp, len, pgin); + else { + pgsize = hoffset + pdata->size; + if ((ret = __os_malloc(env, pgsize, &pgcopy)) != 0) + return (ret); + memset(pgcopy, 0, pgsize); + memcpy(pgcopy, pp, len); + memcpy((u_int8_t *)pgcopy + hoffset, pdata->data, pdata->size); + + ret = __db_byteswap(dbp, pg, (PAGE *)pgcopy, pgsize, pgin); + memcpy(pp, pgcopy, len); + + /* + * If we are swapping data to be written to the log, we can't + * overwrite the buffer that was passed in: it may be a pointer + * into a page in cache. We set DB_DBT_APPMALLOC here so that + * the calling code can free the memory we allocate here. + */ + if (!pgin) { + if ((ret = + __os_malloc(env, pdata->size, &pdata->data)) != 0) { + __os_free(env, pgcopy); + return (ret); + } + F_SET(pdata, DB_DBT_APPMALLOC); + } + memcpy(pdata->data, (u_int8_t *)pgcopy + hoffset, pdata->size); + __os_free(env, pgcopy); + } + + return (ret); +} diff --git a/db/db_dispatch.c b/db/db_dispatch.c new file mode 100644 index 0000000..65dc260 --- /dev/null +++ b/db/db_dispatch.c @@ -0,0 +1,953 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/hash.h" +#include "dbinc/fop.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __db_txnlist_find_internal __P((ENV *, DB_TXNHEAD *, + db_txnlist_type, u_int32_t, DB_TXNLIST **, + int, u_int32_t *)); + +/* + * __db_dispatch -- + * + * This is the transaction dispatch function used by the db access methods. + * It is designed to handle the record format used by all the access + * methods (the one automatically generated by the db_{h,log,read}.sh + * scripts in the tools directory). An application using a different + * recovery paradigm will supply a different dispatch function to txn_open. 
+ * + * PUBLIC: int __db_dispatch __P((ENV *, + * PUBLIC: DB_DISTAB *, DBT *, DB_LSN *, db_recops, DB_TXNHEAD *)); + */ +int +__db_dispatch(env, dtab, db, lsnp, redo, info) + ENV *env; /* The environment. */ + DB_DISTAB *dtab; + DBT *db; /* The log record upon which to dispatch. */ + DB_LSN *lsnp; /* The lsn of the record being dispatched. */ + db_recops redo; /* Redo this op (or undo it). */ + DB_TXNHEAD *info; /* Transaction list. */ +{ + DB_ENV *dbenv; + DB_LSN prev_lsn; + u_int32_t rectype, status, txnid, urectype; + int make_call, ret; + + dbenv = env->dbenv; + LOGCOPY_32(env, &rectype, db->data); + LOGCOPY_32(env, &txnid, (u_int8_t *)db->data + sizeof(rectype)); + + make_call = ret = 0; + + /* If we don't have a dispatch table, it's hard to dispatch. */ + DB_ASSERT(env, dtab != NULL); + + /* + * If we find a record that is in the user's number space and they + * have specified a recovery routine, let them handle it. If they + * didn't specify a recovery routine, then we expect that they've + * followed all our rules and registered new recovery functions. + */ + switch (redo) { + case DB_TXN_ABORT: + case DB_TXN_APPLY: + case DB_TXN_PRINT: + make_call = 1; + break; + case DB_TXN_OPENFILES: + /* + * We collect all the transactions that have + * "begin" records, those with no previous LSN, + * so that we do not abort partial transactions. + * These are known to be undone, otherwise the + * log would not have been freeable. 
+ */ + LOGCOPY_TOLSN(env, &prev_lsn, (u_int8_t *)db->data + + sizeof(rectype) + sizeof(txnid)); + if (txnid != 0 && prev_lsn.file == 0 && (ret = + __db_txnlist_add(env, info, txnid, TXN_OK, NULL)) != 0) + return (ret); + + /* FALLTHROUGH */ + case DB_TXN_POPENFILES: + if (rectype == DB___dbreg_register || + rectype == DB___txn_child || + rectype == DB___txn_ckp || rectype == DB___txn_recycle) + return ((dtab->int_dispatch[rectype])(env, + db, lsnp, redo, info)); + break; + case DB_TXN_BACKWARD_ROLL: + /* + * Running full recovery in the backward pass. In general, + * we only process records during this pass that belong + * to aborted transactions. Unfortunately, there are several + * exceptions: + * 1. If this is a meta-record, one not associated with + * a transaction, then we must always process it. + * 2. If this is a transaction commit/abort, we must + * always process it, so that we know the status of + * every transaction. + * 3. If this is a child commit, we need to process it + * because the outcome of the child transaction depends + * on the outcome of the parent. + * 4. If this is a dbreg_register record, we must always + * process is because they contain non-transactional + * closes that must be properly handled. + * 5. If this is a noop, we must always undo it so that we + * properly handle any aborts before a file was closed. + * 6. If this a file remove, we need to process it to + * determine if the on-disk file is the same as the + * one being described. + */ + switch (rectype) { + /* + * These either do not belong to a transaction or (regop) + * must be processed regardless of the status of the + * transaction. + */ + case DB___txn_regop: + case DB___txn_recycle: + case DB___txn_ckp: + make_call = 1; + break; + /* + * These belong to a transaction whose status must be + * checked. 
+ */ + case DB___txn_child: + case DB___db_noop: + case DB___fop_file_remove: + case DB___dbreg_register: + make_call = 1; + + /* FALLTHROUGH */ + default: + if (txnid == 0) + break; + + ret = __db_txnlist_find(env, info, txnid, &status); + + /* If not found, this is an incomplete abort. */ + if (ret == DB_NOTFOUND) + return (__db_txnlist_add(env, + info, txnid, TXN_IGNORE, lsnp)); + if (ret != 0) + return (ret); + + /* + * If we ignore the transaction, ignore the operation + * UNLESS this is a child commit in which case we need + * to make sure that the child also gets marked as + * ignore. + */ + if (status == TXN_IGNORE && rectype != DB___txn_child) { + make_call = 0; + break; + } + if (status == TXN_COMMIT) + break; + + /* Set make_call in case we came through default */ + make_call = 1; + if (status == TXN_OK && + (ret = __db_txnlist_update(env, + info, txnid, rectype == DB___txn_prepare ? + TXN_PREPARE : TXN_ABORT, NULL, &status, 0)) != 0) + return (ret); + } + break; + case DB_TXN_FORWARD_ROLL: + /* + * In the forward pass, if we haven't seen the transaction, + * do nothing, else recover it. + * + * We need to always redo DB___db_noop records, so that we + * properly handle any commits after the file was closed. + */ + switch (rectype) { + case DB___txn_recycle: + case DB___txn_ckp: + case DB___db_noop: + case DB___dbreg_register: + make_call = 1; + break; + + default: + if (txnid == 0) + status = 0; + else { + ret = __db_txnlist_find(env, + info, txnid, &status); + + if (ret == DB_NOTFOUND) + /* Break out out of if clause. */ + ; + else if (ret != 0) + return (ret); + else if (status == TXN_COMMIT) { + make_call = 1; + break; + } + } + + } + break; + default: + return (__db_unknown_flag( + env, "__db_dispatch", (u_int32_t)redo)); + } + + if (make_call) { + /* + * If the debug flag is set then we are logging + * records for a non-durable update so that they + * may be examined for diagnostic purposes. 
+ * So only make the call if we are printing, + * otherwise we need to extract the previous + * lsn so undo will work properly. + */ + if (rectype & DB_debug_FLAG) { + if (redo == DB_TXN_PRINT) + rectype &= ~DB_debug_FLAG; + else { + LOGCOPY_TOLSN(env, lsnp, + (u_int8_t *)db->data + + sizeof(rectype) + + sizeof(txnid)); + return (0); + } + } + if (rectype >= DB_user_BEGIN) { + if (dbenv->app_dispatch != NULL) + return (dbenv->app_dispatch(dbenv, + db, lsnp, redo)); + + /* No application-specific dispatch */ + urectype = rectype - DB_user_BEGIN; + if (urectype > dtab->ext_size || + dtab->ext_dispatch[urectype] == NULL) { + __db_errx(env, + "Illegal application-specific record type %lu in log", + (u_long)rectype); + return (EINVAL); + } + return ((dtab->ext_dispatch[urectype])(dbenv, + db, lsnp, redo)); + } else { + if (rectype > dtab->int_size || + dtab->int_dispatch[rectype] == NULL) { + __db_errx(env, + "Illegal record type %lu in log", + (u_long)rectype); + return (EINVAL); + } + return ((dtab->int_dispatch[rectype])(env, + db, lsnp, redo, info)); + } + } + + return (0); +} + +/* + * __db_add_recovery -- Add recovery functions to the dispatch table. + * + * We have two versions of this, an external one and an internal one, + * because application-specific functions take different arguments + * for dispatch (ENV versus DB_ENV). + * + * This is the external version. + * + * PUBLIC: int __db_add_recovery __P((DB_ENV *, DB_DISTAB *, + * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops), u_int32_t)); + */ +int +__db_add_recovery(dbenv, dtab, func, ndx) + DB_ENV *dbenv; + DB_DISTAB *dtab; + int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops)); + u_int32_t ndx; +{ + size_t i, nsize; + int ret; + + /* Make sure this is an application-specific record. 
*/ + if (ndx < DB_user_BEGIN) { + __db_errx(dbenv->env, + "Attempting to add application-specific record with invalid type %lu", + (u_long)ndx); + return (EINVAL); + } + ndx -= DB_user_BEGIN; + + /* Check if we have to grow the table. */ + if (ndx >= dtab->ext_size) { + nsize = ndx + 40; + if ((ret = + __os_realloc(dbenv->env, nsize * + sizeof((dtab->ext_dispatch)[0]), &dtab->ext_dispatch)) + != 0) + return (ret); + for (i = dtab->ext_size; i < nsize; ++i) + (dtab->ext_dispatch)[i] = NULL; + dtab->ext_size = nsize; + } + + (dtab->ext_dispatch)[ndx] = func; + return (0); +} + +/* + * __db_add_recovery_int -- + * + * Internal version of dispatch addition function. + * + * + * PUBLIC: int __db_add_recovery_int __P((ENV *, DB_DISTAB *, + * PUBLIC: int (*)(ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t)); + */ +int +__db_add_recovery_int(env, dtab, func, ndx) + ENV *env; + DB_DISTAB *dtab; + int (*func) __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + u_int32_t ndx; +{ + size_t i, nsize; + int ret; + + if (ndx >= DB_user_BEGIN) { + __db_errx(env, + "Attempting to add internal record with invalid type %lu", + (u_long)ndx); + return (EINVAL); + } + + /* Check if we have to grow the table. */ + if (ndx >= dtab->int_size) { + nsize = ndx + 40; + if ((ret = + __os_realloc(env, nsize * sizeof((dtab->int_dispatch)[0]), + &dtab->int_dispatch)) != 0) + return (ret); + for (i = dtab->int_size; i < nsize; ++i) + (dtab->int_dispatch)[i] = NULL; + dtab->int_size = nsize; + } + + (dtab->int_dispatch)[ndx] = func; + return (0); +} + +/* + * __db_txnlist_init -- + * Initialize transaction linked list. 
+ * + * PUBLIC: int __db_txnlist_init __P((ENV *, DB_THREAD_INFO *, + * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, DB_TXNHEAD **)); + */ +int +__db_txnlist_init(env, ip, low_txn, hi_txn, trunc_lsn, retp) + ENV *env; + DB_THREAD_INFO *ip; + u_int32_t low_txn, hi_txn; + DB_LSN *trunc_lsn; + DB_TXNHEAD **retp; +{ + DB_TXNHEAD *headp; + u_int32_t size, tmp; + int ret; + + /* + * Size a hash table. + * If low is zero then we are being called during rollback + * and we need only one slot. + * Hi maybe lower than low if we have recycled txnid's. + * The numbers here are guesses about txn density, we can afford + * to look at a few entries in each slot. + */ + if (low_txn == 0) + size = 1; + else { + if (hi_txn < low_txn) { + tmp = hi_txn; + hi_txn = low_txn; + low_txn = tmp; + } + tmp = hi_txn - low_txn; + /* See if we wrapped around. */ + if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2) + tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn); + size = tmp / 5; + if (size < 100) + size = 100; + } + if ((ret = __os_malloc(env, + sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0) + return (ret); + + memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head)); + headp->maxid = hi_txn; + headp->generation = 0; + headp->nslots = size; + headp->gen_alloc = 8; + headp->thread_info = ip; + if ((ret = __os_malloc(env, headp->gen_alloc * + sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) { + __os_free(env, headp); + return (ret); + } + headp->gen_array[0].generation = 0; + headp->gen_array[0].txn_min = TXN_MINIMUM; + headp->gen_array[0].txn_max = TXN_MAXIMUM; + if (trunc_lsn != NULL) { + headp->trunc_lsn = *trunc_lsn; + headp->maxlsn = *trunc_lsn; + } else { + ZERO_LSN(headp->trunc_lsn); + ZERO_LSN(headp->maxlsn); + } + ZERO_LSN(headp->ckplsn); + + *retp = headp; + return (0); +} + +#define FIND_GENERATION(hp, txnid, gen) do { \ + u_int32_t __i; \ + for (__i = 0; __i <= (hp)->generation; __i++) \ + /* The range may wrap around the end. 
*/ \ + if ((hp)->gen_array[__i].txn_min < \ + (hp)->gen_array[__i].txn_max ? \ + ((txnid) >= (hp)->gen_array[__i].txn_min && \ + (txnid) <= (hp)->gen_array[__i].txn_max) : \ + ((txnid) >= (hp)->gen_array[__i].txn_min || \ + (txnid) <= (hp)->gen_array[__i].txn_max)) \ + break; \ + DB_ASSERT(env, __i <= (hp)->generation); \ + gen = (hp)->gen_array[__i].generation; \ +} while (0) + +/* + * __db_txnlist_add -- + * Add an element to our transaction linked list. + * + * PUBLIC: int __db_txnlist_add __P((ENV *, + * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t, DB_LSN *)); + */ +int +__db_txnlist_add(env, hp, txnid, status, lsn) + ENV *env; + DB_TXNHEAD *hp; + u_int32_t txnid, status; + DB_LSN *lsn; +{ + DB_TXNLIST *elp; + int ret; + + if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0) + return (ret); + + LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links); + + /* Find the most recent generation containing this ID */ + FIND_GENERATION(hp, txnid, elp->u.t.generation); + elp->type = TXNLIST_TXNID; + elp->u.t.txnid = txnid; + elp->u.t.status = status; + if (txnid > hp->maxid) + hp->maxid = txnid; + if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT) + hp->maxlsn = *lsn; + + DB_ASSERT(env, lsn == NULL || + status != TXN_COMMIT || LOG_COMPARE(&hp->maxlsn, lsn) >= 0); + + return (0); +} + +/* + * __db_txnlist_remove -- + * Remove an element from our transaction linked list. + * + * PUBLIC: int __db_txnlist_remove __P((ENV *, DB_TXNHEAD *, u_int32_t)); + */ +int +__db_txnlist_remove(env, hp, txnid) + ENV *env; + DB_TXNHEAD *hp; + u_int32_t txnid; +{ + DB_TXNLIST *entry; + u_int32_t status; + + return (__db_txnlist_find_internal(env, + hp, TXNLIST_TXNID, txnid, &entry, 1, &status)); +} + +/* + * __db_txnlist_ckp -- + * Used to record the maximum checkpoint that will be retained + * after recovery. 
Typically this is simply the max checkpoint, but + * if we are doing client replication recovery or timestamp-based + * recovery, we are going to virtually truncate the log and we need + * to retain the last checkpoint before the truncation point. + * + * PUBLIC: void __db_txnlist_ckp __P((ENV *, DB_TXNHEAD *, DB_LSN *)); + */ +void +__db_txnlist_ckp(env, hp, ckp_lsn) + ENV *env; + DB_TXNHEAD *hp; + DB_LSN *ckp_lsn; +{ + + COMPQUIET(env, NULL); + + if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) && + LOG_COMPARE(&hp->maxlsn, ckp_lsn) >= 0) + hp->ckplsn = *ckp_lsn; +} + +/* + * __db_txnlist_end -- + * Discard transaction linked list. + * + * PUBLIC: void __db_txnlist_end __P((ENV *, DB_TXNHEAD *)); + */ +void +__db_txnlist_end(env, hp) + ENV *env; + DB_TXNHEAD *hp; +{ + u_int32_t i; + DB_TXNLIST *p; + + if (hp == NULL) + return; + + for (i = 0; i < hp->nslots; i++) + while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) { + switch (p->type) { + case TXNLIST_LSN: + __os_free(env, p->u.l.lsn_stack); + break; + case TXNLIST_DELETE: + case TXNLIST_TXNID: + default: + /* + * Possibly an incomplete DB_TXNLIST; just + * free it. + */ + break; + } + LIST_REMOVE(p, links); + __os_free(env, p); + } + + if (hp->gen_array != NULL) + __os_free(env, hp->gen_array); + __os_free(env, hp); +} + +/* + * __db_txnlist_find -- + * Checks to see if a txnid with the current generation is in the + * txnid list. This returns DB_NOTFOUND if the item isn't in the + * list otherwise it returns (like __db_txnlist_find_internal) + * the status of the transaction. A txnid of 0 means the record + * was generated while not in a transaction. 
+ * + * PUBLIC: int __db_txnlist_find __P((ENV *, + * PUBLIC: DB_TXNHEAD *, u_int32_t, u_int32_t *)); + */ +int +__db_txnlist_find(env, hp, txnid, statusp) + ENV *env; + DB_TXNHEAD *hp; + u_int32_t txnid, *statusp; +{ + DB_TXNLIST *entry; + + if (txnid == 0) + return (DB_NOTFOUND); + + return (__db_txnlist_find_internal(env, hp, + TXNLIST_TXNID, txnid, &entry, 0, statusp)); +} + +/* + * __db_txnlist_update -- + * Change the status of an existing transaction entry. + * Returns DB_NOTFOUND if no such entry exists. + * + * PUBLIC: int __db_txnlist_update __P((ENV *, DB_TXNHEAD *, + * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, u_int32_t *, int)); + */ +int +__db_txnlist_update(env, hp, txnid, status, lsn, ret_status, add_ok) + ENV *env; + DB_TXNHEAD *hp; + u_int32_t txnid, status; + DB_LSN *lsn; + u_int32_t *ret_status; + int add_ok; +{ + DB_TXNLIST *elp; + int ret; + + if (txnid == 0) + return (DB_NOTFOUND); + + ret = __db_txnlist_find_internal(env, + hp, TXNLIST_TXNID, txnid, &elp, 0, ret_status); + + if (ret == DB_NOTFOUND && add_ok) { + *ret_status = status; + return (__db_txnlist_add(env, hp, txnid, status, lsn)); + } + if (ret != 0) + return (ret); + + if (*ret_status == TXN_IGNORE) + return (0); + + elp->u.t.status = status; + + if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT) + hp->maxlsn = *lsn; + + return (ret); +} + +/* + * __db_txnlist_find_internal -- + * Find an entry on the transaction list. If the entry is not there or + * the list pointer is not initialized we return DB_NOTFOUND. If the + * item is found, we return the status. Currently we always call this + * with an initialized list pointer but checking for NULL keeps it general. 
+ */ +static int +__db_txnlist_find_internal(env, + hp, type, txnid, txnlistp, delete, statusp) + ENV *env; + DB_TXNHEAD *hp; + db_txnlist_type type; + u_int32_t txnid; + DB_TXNLIST **txnlistp; + int delete; + u_int32_t *statusp; +{ + struct __db_headlink *head; + DB_TXNLIST *p; + u_int32_t generation, hash; + int ret; + + ret = 0; + + if (hp == NULL) + return (DB_NOTFOUND); + + switch (type) { + case TXNLIST_TXNID: + hash = txnid; + FIND_GENERATION(hp, txnid, generation); + break; + case TXNLIST_DELETE: + case TXNLIST_LSN: + default: + return (__env_panic(env, EINVAL)); + } + + head = &hp->head[DB_TXNLIST_MASK(hp, hash)]; + LIST_FOREACH(p, head, links) { + if (p->type != type) + continue; + switch (type) { + case TXNLIST_TXNID: + if (p->u.t.txnid != txnid || + generation != p->u.t.generation) + continue; + *statusp = p->u.t.status; + break; + + case TXNLIST_DELETE: + case TXNLIST_LSN: + default: + return (__env_panic(env, EINVAL)); + } + if (delete == 1) { + LIST_REMOVE(p, links); + __os_free(env, p); + *txnlistp = NULL; + } else if (p != LIST_FIRST(head)) { + /* Move it to head of list. */ + LIST_REMOVE(p, links); + LIST_INSERT_HEAD(head, p, links); + *txnlistp = p; + } else + *txnlistp = p; + return (ret); + } + + return (DB_NOTFOUND); +} + +/* + * __db_txnlist_gen -- + * Change the current generation number. + * + * PUBLIC: int __db_txnlist_gen __P((ENV *, + * PUBLIC: DB_TXNHEAD *, int, u_int32_t, u_int32_t)); + */ +int +__db_txnlist_gen(env, hp, incr, min, max) + ENV *env; + DB_TXNHEAD *hp; + int incr; + u_int32_t min, max; +{ + int ret; + + /* + * During recovery generation numbers keep track of "restart" + * checkpoints and recycle records. Restart checkpoints occur + * whenever we take a checkpoint and there are no outstanding + * transactions. When that happens, we can reset transaction IDs + * back to TXNID_MINIMUM. Currently we only do the reset + * at then end of recovery. Recycle records occur when txnids + * are exhausted during runtime. 
A free range of ids is identified + * and logged. This code maintains a stack of ranges. A txnid + * is given the generation number of the first range it falls into + * in the stack. + */ + if (incr < 0) { + --hp->generation; + memmove(hp->gen_array, &hp->gen_array[1], + (hp->generation + 1) * sizeof(hp->gen_array[0])); + } else { + ++hp->generation; + if (hp->generation >= hp->gen_alloc) { + hp->gen_alloc *= 2; + if ((ret = __os_realloc(env, hp->gen_alloc * + sizeof(hp->gen_array[0]), &hp->gen_array)) != 0) + return (ret); + } + memmove(&hp->gen_array[1], &hp->gen_array[0], + hp->generation * sizeof(hp->gen_array[0])); + hp->gen_array[0].generation = hp->generation; + hp->gen_array[0].txn_min = min; + hp->gen_array[0].txn_max = max; + } + return (0); +} + +/* + * __db_txnlist_lsnadd -- + * Save the prev_lsn from a txn_child record. + * + * PUBLIC: int __db_txnlist_lsnadd __P((ENV *, DB_TXNHEAD *, DB_LSN *)); + */ +int +__db_txnlist_lsnadd(env, hp, lsnp) + ENV *env; + DB_TXNHEAD *hp; + DB_LSN *lsnp; +{ + DB_TXNLIST *elp; + int ret; + + if (IS_ZERO_LSN(*lsnp)) + return (0); + + LIST_FOREACH(elp, &hp->head[0], links) + if (elp->type == TXNLIST_LSN) + break; + + if (elp == NULL) { + if ((ret = __db_txnlist_lsninit(env, hp, lsnp)) != 0) + return (ret); + return (DB_SURPRISE_KID); + } + + if (elp->u.l.stack_indx == elp->u.l.stack_size) { + elp->u.l.stack_size <<= 1; + if ((ret = __os_realloc(env, sizeof(DB_LSN) * + elp->u.l.stack_size, &elp->u.l.lsn_stack)) != 0) { + __db_txnlist_end(env, hp); + return (ret); + } + } + elp->u.l.lsn_stack[elp->u.l.stack_indx++] = *lsnp; + + return (0); +} + +/* + * __db_txnlist_lsnget -- + * + * PUBLIC: int __db_txnlist_lsnget __P((ENV *, + * PUBLIC: DB_TXNHEAD *, DB_LSN *, u_int32_t)); + * Get the lsn saved from a txn_child record. 
+ */ +int +__db_txnlist_lsnget(env, hp, lsnp, flags) + ENV *env; + DB_TXNHEAD *hp; + DB_LSN *lsnp; + u_int32_t flags; +{ + DB_TXNLIST *elp; + + COMPQUIET(env, NULL); + COMPQUIET(flags, 0); + + LIST_FOREACH(elp, &hp->head[0], links) + if (elp->type == TXNLIST_LSN) + break; + + if (elp == NULL || elp->u.l.stack_indx == 0) { + ZERO_LSN(*lsnp); + return (0); + } + + *lsnp = elp->u.l.lsn_stack[--elp->u.l.stack_indx]; + + return (0); +} + +/* + * __db_txnlist_lsninit -- + * Initialize a transaction list with an lsn array entry. + * + * PUBLIC: int __db_txnlist_lsninit __P((ENV *, DB_TXNHEAD *, DB_LSN *)); + */ +int +__db_txnlist_lsninit(env, hp, lsnp) + ENV *env; + DB_TXNHEAD *hp; + DB_LSN *lsnp; +{ + DB_TXNLIST *elp; + int ret; + + elp = NULL; + + if ((ret = __os_malloc(env, sizeof(DB_TXNLIST), &elp)) != 0) + goto err; + LIST_INSERT_HEAD(&hp->head[0], elp, links); + + elp->type = TXNLIST_LSN; + if ((ret = __os_malloc(env, + sizeof(DB_LSN) * DB_LSN_STACK_SIZE, &elp->u.l.lsn_stack)) != 0) + goto err; + elp->u.l.stack_indx = 1; + elp->u.l.stack_size = DB_LSN_STACK_SIZE; + elp->u.l.lsn_stack[0] = *lsnp; + + return (0); + +err: __db_txnlist_end(env, hp); + return (ret); +} + +#ifdef DEBUG +/* + * __db_txnlist_print -- + * Print out the transaction list. 
+ * + * PUBLIC: void __db_txnlist_print __P((DB_TXNHEAD *)); + */ +void +__db_txnlist_print(hp) + DB_TXNHEAD *hp; +{ + DB_TXNLIST *p; + u_int32_t i; + char *txntype; + + printf("Maxid: %lu Generation: %lu\n", + (u_long)hp->maxid, (u_long)hp->generation); + for (i = 0; i < hp->nslots; i++) + LIST_FOREACH(p, &hp->head[i], links) { + if (p->type != TXNLIST_TXNID) { + printf("Unrecognized type: %d\n", p->type); + continue; + } + switch (p->u.t.status) { + case TXN_OK: + txntype = "OK"; + break; + case TXN_COMMIT: + txntype = "commit"; + break; + case TXN_PREPARE: + txntype = "prepare"; + break; + case TXN_ABORT: + txntype = "abort"; + break; + case TXN_IGNORE: + txntype = "ignore"; + break; + case TXN_EXPECTED: + txntype = "expected"; + break; + case TXN_UNEXPECTED: + txntype = "unexpected"; + break; + default: + txntype = "UNKNOWN"; + break; + } + printf("TXNID: %lx(%lu): %s\n", + (u_long)p->u.t.txnid, + (u_long)p->u.t.generation, txntype); + } +} +#endif diff --git a/db/db_dup.c b/db/db_dup.c new file mode 100644 index 0000000..b789e03 --- /dev/null +++ b/db/db_dup.c @@ -0,0 +1,203 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/mp.h" +#include "dbinc/db_am.h" + +/* + * __db_ditem_nolog -- + * Remove an item from a page without affecting its recoverability. + * + * PUBLIC: int __db_ditem_nolog __P((DBC *, PAGE *, u_int32_t, u_int32_t)); + */ +int +__db_ditem_nolog(dbc, pagep, indx, nbytes) + DBC *dbc; + PAGE *pagep; + u_int32_t indx, nbytes; +{ + DB *dbp; + db_indx_t cnt, *inp, offset; + u_int8_t *from; + + dbp = dbc->dbp; + DB_ASSERT(dbp->env, IS_DIRTY(pagep)); + DB_ASSERT(dbp->env, indx < NUM_ENT(pagep)); + + /* + * If there's only a single item on the page, we don't have to + * work hard. 
+ */ + if (NUM_ENT(pagep) == 1) { + NUM_ENT(pagep) = 0; + HOFFSET(pagep) = dbp->pgsize; + return (0); + } + + inp = P_INP(dbp, pagep); + /* + * Pack the remaining key/data items at the end of the page. Use + * memmove(3), the regions may overlap. + */ + from = (u_int8_t *)pagep + HOFFSET(pagep); + DB_ASSERT(dbp->env, inp[indx] >= HOFFSET(pagep)); + memmove(from + nbytes, from, inp[indx] - HOFFSET(pagep)); + HOFFSET(pagep) += nbytes; + + /* Adjust the indices' offsets. */ + offset = inp[indx]; + for (cnt = 0; cnt < NUM_ENT(pagep); ++cnt) + if (inp[cnt] < offset) + inp[cnt] += nbytes; + + /* Shift the indices down. */ + --NUM_ENT(pagep); + if (indx != NUM_ENT(pagep)) + memmove(&inp[indx], &inp[indx + 1], + sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); + + return (0); +} + +/* + * __db_ditem -- + * Remove an item from a page, logging it if enabled. + * + * PUBLIC: int __db_ditem __P((DBC *, PAGE *, u_int32_t, u_int32_t)); + */ +int +__db_ditem(dbc, pagep, indx, nbytes) + DBC *dbc; + PAGE *pagep; + u_int32_t indx, nbytes; +{ + DB *dbp; + DBT ldbt; + int ret; + + dbp = dbc->dbp; + + if (DBC_LOGGING(dbc)) { + ldbt.data = P_ENTRY(dbp, pagep, indx); + ldbt.size = nbytes; + if ((ret = __db_addrem_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_REM_DUP, PGNO(pagep), + (u_int32_t)indx, nbytes, &ldbt, NULL, &LSN(pagep))) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(pagep)); + + return (__db_ditem_nolog(dbc, pagep, indx, nbytes)); +} + +/* + * __db_pitem_nolog -- + * Put an item on a page without logging. 
+ * + * PUBLIC: int __db_pitem_nolog + * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); + */ +int +__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data) + DBC *dbc; + PAGE *pagep; + u_int32_t indx; + u_int32_t nbytes; + DBT *hdr, *data; +{ + BKEYDATA bk; + DB *dbp; + DBT thdr; + db_indx_t *inp; + u_int8_t *p; + + dbp = dbc->dbp; + + DB_ASSERT(dbp->env, IS_DIRTY(pagep)); + + if (nbytes > P_FREESPACE(dbp, pagep)) { + DB_ASSERT(dbp->env, nbytes <= P_FREESPACE(dbp, pagep)); + return (EINVAL); + } + + if (hdr == NULL) { + B_TSET(bk.type, B_KEYDATA); + bk.len = data == NULL ? 0 : data->size; + + thdr.data = &bk; + thdr.size = SSZA(BKEYDATA, data); + hdr = &thdr; + } + inp = P_INP(dbp, pagep); + + /* Adjust the index table, then put the item on the page. */ + if (indx != NUM_ENT(pagep)) + memmove(&inp[indx + 1], &inp[indx], + sizeof(db_indx_t) * (NUM_ENT(pagep) - indx)); + HOFFSET(pagep) -= nbytes; + inp[indx] = HOFFSET(pagep); + ++NUM_ENT(pagep); + + p = P_ENTRY(dbp, pagep, indx); + memcpy(p, hdr->data, hdr->size); + if (data != NULL) + memcpy(p + hdr->size, data->data, data->size); + + return (0); +} + +/* + * __db_pitem -- + * Put an item on a page. + * + * PUBLIC: int __db_pitem + * PUBLIC: __P((DBC *, PAGE *, u_int32_t, u_int32_t, DBT *, DBT *)); + */ +int +__db_pitem(dbc, pagep, indx, nbytes, hdr, data) + DBC *dbc; + PAGE *pagep; + u_int32_t indx; + u_int32_t nbytes; + DBT *hdr, *data; +{ + DB *dbp; + int ret; + + dbp = dbc->dbp; + /* + * Put a single item onto a page. The logic figuring out where to + * insert and whether it fits is handled in the caller. All we do + * here is manage the page shuffling. We cheat a little bit in that + * we don't want to copy the dbt on a normal put twice. If hdr is + * NULL, we create a BKEYDATA structure on the page, otherwise, just + * copy the caller's information onto the page. + * + * This routine is also used to put entries onto the page where the + * entry is pre-built, e.g., during recovery. 
In this case, the hdr + * will point to the entry, and the data argument will be NULL. + * + * !!! + * There's a tremendous potential for off-by-one errors here, since + * the passed in header sizes must be adjusted for the structure's + * placeholder for the trailing variable-length data field. + */ + if (DBC_LOGGING(dbc)) { + if ((ret = __db_addrem_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_ADD_DUP, PGNO(pagep), + (u_int32_t)indx, nbytes, hdr, data, &LSN(pagep))) != 0) + return (ret); + } else + LSN_NOT_LOGGED(LSN(pagep)); + + return (__db_pitem_nolog(dbc, pagep, indx, nbytes, hdr, data)); +} diff --git a/db/db_iface.c b/db/db_iface.c new file mode 100644 index 0000000..55f3e2a --- /dev/null +++ b/db/db_iface.c @@ -0,0 +1,2817 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#ifndef HAVE_QUEUE +#include "dbinc/qam.h" /* For __db_no_queue_am(). 
*/ +#endif +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/txn.h" + +static int __db_associate_arg __P((DB *, DB *, + int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t)); +static int __dbc_del_arg __P((DBC *, u_int32_t)); +static int __dbc_pget_arg __P((DBC *, DBT *, u_int32_t)); +static int __dbc_put_arg __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __db_curinval __P((const ENV *)); +static int __db_cursor_arg __P((DB *, u_int32_t)); +static int __db_del_arg __P((DB *, DBT *, u_int32_t)); +static int __db_get_arg __P((const DB *, DBT *, DBT *, u_int32_t)); +static int __db_join_arg __P((DB *, DBC **, u_int32_t)); +static int __db_open_arg __P((DB *, + DB_TXN *, const char *, const char *, DBTYPE, u_int32_t)); +static int __db_pget_arg __P((DB *, DBT *, u_int32_t)); +static int __db_put_arg __P((DB *, DBT *, DBT *, u_int32_t)); +static int __dbt_ferr __P((const DB *, const char *, const DBT *, int)); +static int __db_associate_foreign_arg __P((DB *, DB *, + int (*)(DB *, const DBT *, DBT *, const DBT *, int *), + u_int32_t)); + +/* + * These functions implement the Berkeley DB API. They are organized in a + * layered fashion. The interface functions (XXX_pp) perform all generic + * error checks (for example, PANIC'd region, replication state change + * in progress, inconsistent transaction usage), call function-specific + * check routines (_arg) to check for proper flag usage, etc., do pre-amble + * processing (incrementing handle counts, handling local transactions), + * call the function and then do post-amble processing (local transactions, + * decrement handle counts). + * + * The basic structure is: + * Check for simple/generic errors (PANIC'd region) + * Check if replication is changing state (increment handle count). 
+ * Call function-specific argument checking routine + * Create internal transaction if necessary + * Call underlying worker function + * Commit/abort internal transaction if necessary + * Decrement handle count + */ + +/* + * __db_associate_pp -- + * DB->associate pre/post processing. + * + * PUBLIC: int __db_associate_pp __P((DB *, DB_TXN *, DB *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t)); + */ +int +__db_associate_pp(dbp, txn, sdbp, callback, flags) + DB *dbp, *sdbp; + DB_TXN *txn; + int (*callback) __P((DB *, const DBT *, const DBT *, DBT *)); + u_int32_t flags; +{ + DBC *sdbc; + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret, txn_local; + + env = dbp->env; + txn_local = 0; + + STRIP_AUTO_COMMIT(flags); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + /* + * Secondary cursors may have the primary's lock file ID, so we need + * to make sure that no older cursors are lying around when we make + * the transition. + */ + if (TAILQ_FIRST(&sdbp->active_queue) != NULL || + TAILQ_FIRST(&sdbp->join_queue) != NULL) { + __db_errx(env, + "Databases may not become secondary indices while cursors are open"); + ret = EINVAL; + goto err; + } + + if ((ret = __db_associate_arg(dbp, sdbp, callback, flags)) != 0) + goto err; + + /* + * Create a local transaction as necessary, check for consistent + * transaction usage, and, if we have no transaction but do have + * locking on, acquire a locker id for the handle lock acquisition. + */ + if (IS_DB_AUTO_COMMIT(dbp, txn)) { + if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + goto err; + txn_local = 1; + } + + /* Check for consistent transaction usage. 
*/ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL) + if ((ret = __dbc_destroy(sdbc)) != 0) + goto err; + + ret = __db_associate(dbp, ip, txn, sdbp, callback, flags); + +err: if (txn_local && + (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0) + ret = t_ret; + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_associate_arg -- + * Check DB->associate arguments. + */ +static int +__db_associate_arg(dbp, sdbp, callback, flags) + DB *dbp, *sdbp; + int (*callback) __P((DB *, const DBT *, const DBT *, DBT *)); + u_int32_t flags; +{ + ENV *env; + int ret; + + env = dbp->env; + + if (F_ISSET(sdbp, DB_AM_SECONDARY)) { + __db_errx(env, + "Secondary index handles may not be re-associated"); + return (EINVAL); + } + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_errx(env, + "Secondary indices may not be used as primary databases"); + return (EINVAL); + } + if (F_ISSET(dbp, DB_AM_DUP)) { + __db_errx(env, + "Primary databases may not be configured with duplicates"); + return (EINVAL); + } + if (F_ISSET(dbp, DB_AM_RENUMBER)) { + __db_errx(env, + "Renumbering recno databases may not be used as primary databases"); + return (EINVAL); + } + + /* + * It's OK for the primary and secondary to not share an environment IFF + * the environments are local to the DB handle. (Specifically, cursor + * adjustment will work correctly in this case.) The environment being + * local implies the environment is not configured for either locking or + * transactions, as neither of those could work correctly. 
+ */ + if (dbp->env != sdbp->env && + (!F_ISSET(dbp->env, ENV_DBLOCAL) || + !F_ISSET(sdbp->env, ENV_DBLOCAL))) { + __db_errx(env, + "The primary and secondary must be opened in the same environment"); + return (EINVAL); + } + if ((DB_IS_THREADED(dbp) && !DB_IS_THREADED(sdbp)) || + (!DB_IS_THREADED(dbp) && DB_IS_THREADED(sdbp))) { + __db_errx(env, + "The DB_THREAD setting must be the same for primary and secondary"); + return (EINVAL); + } + if (callback == NULL && + (!F_ISSET(dbp, DB_AM_RDONLY) || !F_ISSET(sdbp, DB_AM_RDONLY))) { + __db_errx(env, + "Callback function may be NULL only when database handles are read-only"); + return (EINVAL); + } + + if ((ret = __db_fchk(env, "DB->associate", flags, DB_CREATE | + DB_IMMUTABLE_KEY)) != 0) + return (ret); + + return (0); +} + +/* + * __db_close_pp -- + * DB->close pre/post processing. + * + * PUBLIC: int __db_close_pp __P((DB *, u_int32_t)); + */ +int +__db_close_pp(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + ret = 0; + + /* + * Close a DB handle -- as a handle destructor, we can't fail. + * + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if (flags != 0 && flags != DB_NOSYNC) + ret = __db_ferr(env, "DB->close", 0); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) { + handle_check = 0; + if (ret == 0) + ret = t_ret; + } + + if ((t_ret = __db_close(dbp, NULL, flags)) != 0 && ret == 0) + ret = t_ret; + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_cursor_pp -- + * DB->cursor pre/post processing. 
+ * + * PUBLIC: int __db_cursor_pp __P((DB *, DB_TXN *, DBC **, u_int32_t)); + */ +int +__db_cursor_pp(dbp, txn, dbcp, flags) + DB *dbp; + DB_TXN *txn; + DBC **dbcp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + REGENV *renv; + int rep_blocked, ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor"); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + rep_blocked = 0; + if (txn == NULL && IS_ENV_REPLICATED(env)) { + if ((ret = __op_rep_enter(env)) != 0) + goto err; + rep_blocked = 1; + renv = env->reginfo->primary; + if (dbp->timestamp != renv->rep_timestamp) { + __db_errx(env, "%s %s", + "replication recovery unrolled committed transactions;", + "open DB and DBcursor handles must be closed"); + ret = DB_REP_HANDLE_DEAD; + goto err; + } + } + if ((ret = __db_cursor_arg(dbp, flags)) != 0) + goto err; + + /* + * Check for consistent transaction usage. For now, assume this + * cursor might be used for read operations only (in which case + * it may not require a txn). We'll check more stringently in + * c_del and c_put. (Note this means the read-op txn tests have + * to be a subset of the write-op ones.) + */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0) + goto err; + + ret = __db_cursor(dbp, ip, txn, dbcp, flags); + +err: /* Release replication block on error. */ + if (ret != 0 && rep_blocked) + (void)__op_rep_exit(env); + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_cursor -- + * DB->cursor. 
+ * + * PUBLIC: int __db_cursor __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBC **, u_int32_t)); + */ +int +__db_cursor(dbp, ip, txn, dbcp, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBC **dbcp; + u_int32_t flags; +{ + DBC *dbc; + ENV *env; + db_lockmode_t mode; + int ret; + + env = dbp->env; + + if (MULTIVERSION(dbp) && txn == NULL && (LF_ISSET(DB_TXN_SNAPSHOT) || + F_ISSET(env->dbenv, DB_ENV_TXN_SNAPSHOT))) { + if ((ret = + __txn_begin(env, ip, NULL, &txn, DB_TXN_SNAPSHOT)) != 0) + return (ret); + F_SET(txn, TXN_PRIVATE); + } + + if ((ret = __db_cursor_int(dbp, ip, txn, dbp->type, PGNO_INVALID, + LF_ISSET(DB_CURSOR_BULK | DB_CURSOR_TRANSIENT), NULL, &dbc)) != 0) + return (ret); + + /* + * If this is CDB, do all the locking in the interface, which is + * right here. + */ + if (CDB_LOCKING(env)) { + mode = (LF_ISSET(DB_WRITELOCK)) ? DB_LOCK_WRITE : + ((LF_ISSET(DB_WRITECURSOR) || txn != NULL) ? + DB_LOCK_IWRITE : DB_LOCK_READ); + if ((ret = __lock_get(env, dbc->locker, 0, + &dbc->lock_dbt, mode, &dbc->mylock)) != 0) + goto err; + if (LF_ISSET(DB_WRITECURSOR)) + F_SET(dbc, DBC_WRITECURSOR); + if (LF_ISSET(DB_WRITELOCK)) + F_SET(dbc, DBC_WRITER); + } + + if (LF_ISSET(DB_READ_UNCOMMITTED) || + (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED))) + F_SET(dbc, DBC_READ_UNCOMMITTED); + + if (LF_ISSET(DB_READ_COMMITTED) || + (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED))) + F_SET(dbc, DBC_READ_COMMITTED); + + *dbcp = dbc; + return (0); + +err: (void)__dbc_close(dbc); + return (ret); +} + +/* + * __db_cursor_arg -- + * Check DB->cursor arguments. + */ +static int +__db_cursor_arg(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + ENV *env; + + env = dbp->env; + + /* + * DB_READ_COMMITTED and DB_READ_UNCOMMITTED require locking. 
+ */ + if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED)) { + if (!LOCKING_ON(env)) + return (__db_fnl(env, "DB->cursor")); + } + + LF_CLR(DB_CURSOR_BULK | + DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_TXN_SNAPSHOT); + + /* Check for invalid function flags. */ + if (LF_ISSET(DB_WRITECURSOR)) { + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DB->cursor")); + if (!CDB_LOCKING(env)) + return (__db_ferr(env, "DB->cursor", 0)); + LF_CLR(DB_WRITECURSOR); + } else if (LF_ISSET(DB_WRITELOCK)) { + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DB->cursor")); + LF_CLR(DB_WRITELOCK); + } + + if (flags != 0) + return (__db_ferr(env, "DB->cursor", 0)); + + return (0); +} + +/* + * __db_del_pp -- + * DB->del pre/post processing. + * + * PUBLIC: int __db_del_pp __P((DB *, DB_TXN *, DBT *, u_int32_t)); + */ +int +__db_del_pp(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret, txn_local; + + env = dbp->env; + txn_local = 0; + + STRIP_AUTO_COMMIT(flags); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del"); + +#ifdef CONFIG_TEST + if (IS_REP_MASTER(env)) + DB_TEST_WAIT(env, env->test_check); +#endif + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + if ((ret = __db_del_arg(dbp, key, flags)) != 0) + goto err; + + /* Create local transaction as necessary. */ + if (IS_DB_AUTO_COMMIT(dbp, txn)) { + if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + goto err; + txn_local = 1; + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + ret = __db_del(dbp, ip, txn, key, flags); + +err: if (txn_local && + (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0) + ret = t_ret; + + /* Release replication block. 
*/ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, NULL); + return (ret); +} + +/* + * __db_del_arg -- + * Check DB->delete arguments. + */ +static int +__db_del_arg(dbp, key, flags) + DB *dbp; + DBT *key; + u_int32_t flags; +{ + ENV *env; + int ret; + + env = dbp->env; + + /* Check for changes to a read-only tree. */ + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DB->del")); + + /* Check for invalid function flags. */ + switch (flags) { + case DB_CONSUME: + if (dbp->type != DB_QUEUE) + return (__db_ferr(env, "DB->del", 0)); + goto copy; + case DB_MULTIPLE: + case DB_MULTIPLE_KEY: + if (!F_ISSET(key, DB_DBT_BULK)) { + __db_errx(env, + "DB->del with DB_MULTIPLE(_KEY) requires multiple key records"); + return (EINVAL); + } + /* FALL THROUGH */ + case 0: +copy: if ((ret = __dbt_usercopy(env, key)) != 0) + return (ret); + break; + default: + return (__db_ferr(env, "DB->del", 0)); + } + + return (0); +} + +/* + * __db_exists -- + * DB->exists implementation. + * + * PUBLIC: int __db_exists __P((DB *, DB_TXN *, DBT *, u_int32_t)); + */ +int +__db_exists(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DBT data; + int ret; + + /* + * Most flag checking is done in the DB->get call, we only check for + * specific incompatibilities here. This saves making __get_arg + * aware of the exist method's API constraints. + */ + STRIP_AUTO_COMMIT(flags); + if ((ret = __db_fchk(dbp->env, "DB->exists", flags, + DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) != 0) + return (ret); + + /* + * Configure a data DBT that returns no bytes so there's no copy + * of the data. + */ + memset(&data, 0, sizeof(data)); + data.dlen = 0; + data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM; + + return (dbp->get(dbp, txn, key, &data, flags)); +} + +/* + * db_fd_pp -- + * DB->fd pre/post processing. 
+ * + * PUBLIC: int __db_fd_pp __P((DB *, int *)); + */ +int +__db_fd_pp(dbp, fdp) + DB *dbp; + int *fdp; +{ + DB_FH *fhp; + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->fd"); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) + goto err; + + /* + * !!! + * There's no argument checking to be done. + * + * !!! + * The actual method call is simple, do it inline. + * + * XXX + * Truly spectacular layering violation. + */ + if ((ret = __mp_xxx_fh(dbp->mpf, &fhp)) == 0) { + if (fhp == NULL) { + *fdp = -1; + __db_errx(env, + "Database does not have a valid file handle"); + ret = ENOENT; + } else + *fdp = fhp->fd; + } + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_get_pp -- + * DB->get pre/post processing. + * + * PUBLIC: int __db_get_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); + */ +int +__db_get_pp(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + u_int32_t mode; + int handle_check, ignore_lease, ret, t_ret, txn_local; + + env = dbp->env; + mode = 0; + txn_local = 0; + + STRIP_AUTO_COMMIT(flags); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get"); + + ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0; + LF_CLR(DB_IGNORE_LEASE); + + if ((ret = __db_get_arg(dbp, key, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for replication block. 
*/ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + if (LF_ISSET(DB_READ_UNCOMMITTED)) + mode = DB_READ_UNCOMMITTED; + else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME || + (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT) { + mode = DB_WRITELOCK; + if (IS_DB_AUTO_COMMIT(dbp, txn)) { + if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + goto err; + txn_local = 1; + } + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, + mode == DB_WRITELOCK || LF_ISSET(DB_RMW) ? 0 : 1)) != 0) + goto err; + + ret = __db_get(dbp, ip, txn, key, data, flags); + /* + * Check for master leases. + */ + if (ret == 0 && + IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease) + ret = __rep_lease_check(env, 1); + +err: if (txn_local && + (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0) + ret = t_ret; + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, data); + return (ret); +} + +/* + * __db_get -- + * DB->get. + * + * PUBLIC: int __db_get __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, u_int32_t)); + */ +int +__db_get(dbp, ip, txn, key, data, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + u_int32_t mode; + int ret, t_ret; + + /* + * The DB_CURSOR_TRANSIENT flag indicates that we're just doing a single + * operation with this cursor, and that in case of error we don't need + * to restore it to its old position. Thus, we can perform the get + * without duplicating the cursor, saving some cycles in this common + * case. 
+ */ + mode = DB_CURSOR_TRANSIENT; + if (LF_ISSET(DB_READ_UNCOMMITTED)) { + mode |= DB_READ_UNCOMMITTED; + LF_CLR(DB_READ_UNCOMMITTED); + } else if (LF_ISSET(DB_READ_COMMITTED)) { + mode |= DB_READ_COMMITTED; + LF_CLR(DB_READ_COMMITTED); + } else if ((flags & DB_OPFLAGS_MASK) == DB_CONSUME || + (flags & DB_OPFLAGS_MASK) == DB_CONSUME_WAIT) + mode |= DB_WRITELOCK; + + if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0) + return (ret); + + DEBUG_LREAD(dbc, txn, "DB->get", key, NULL, flags); + + /* + * The semantics of bulk gets are different for DB->get vs DBC->get. + * Mark the cursor so the low-level bulk get routines know which + * behavior we want. + */ + F_SET(dbc, DBC_FROM_DB_GET); + + /* + * SET_RET_MEM indicates that if key and/or data have no DBT + * flags set and DB manages the returned-data memory, that memory + * will belong to this handle, not to the underlying cursor. + */ + SET_RET_MEM(dbc, dbp); + + if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0) + LF_SET(DB_SET); + +#ifdef HAVE_PARTITION + if (F_ISSET(dbc, DBC_PARTITIONED)) + ret = __partc_get(dbc, key, data, flags); + else +#endif + ret = __dbc_get(dbc, key, data, flags); + + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_get_arg -- + * DB->get argument checking, used by both DB->get and DB->pget. + */ +static int +__db_get_arg(dbp, key, data, flags) + const DB *dbp; + DBT *key, *data; + u_int32_t flags; +{ + ENV *env; + int dirty, multi, ret; + + env = dbp->env; + + /* + * Check for read-modify-write validity. DB_RMW doesn't make sense + * with CDB cursors since if you're going to write the cursor, you + * had to create it with DB_WRITECURSOR. Regardless, we check for + * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it. + * If this changes, confirm that DB does not itself set the DB_RMW + * flag in a path where CDB may have been configured. 
+ */ + dirty = 0; + if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) { + if (!LOCKING_ON(env)) + return (__db_fnl(env, "DB->get")); + if ((ret = __db_fcchk(env, "DB->get", + flags, DB_READ_UNCOMMITTED, DB_READ_COMMITTED)) != 0) + return (ret); + if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED)) + dirty = 1; + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW); + } + + multi = 0; + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + if (LF_ISSET(DB_MULTIPLE_KEY)) + goto multi_err; + multi = LF_ISSET(DB_MULTIPLE) ? 1 : 0; + LF_CLR(DB_MULTIPLE); + } + + /* Check for invalid function flags. */ + switch (flags) { + case DB_GET_BOTH: + if ((ret = __dbt_usercopy(env, data)) != 0) + return (ret); + /* FALLTHROUGH */ + case 0: + if ((ret = __dbt_usercopy(env, key)) != 0) { + __dbt_userfree(env, key, NULL, data); + return (ret); + } + break; + case DB_SET_RECNO: + if (!F_ISSET(dbp, DB_AM_RECNUM)) + goto err; + if ((ret = __dbt_usercopy(env, key)) != 0) + return (ret); + break; + case DB_CONSUME: + case DB_CONSUME_WAIT: + if (dirty) { + __db_errx(env, + "%s is not supported with DB_CONSUME or DB_CONSUME_WAIT", + LF_ISSET(DB_READ_UNCOMMITTED) ? + "DB_READ_UNCOMMITTED" : "DB_READ_COMMITTED"); + return (EINVAL); + } + if (multi) +multi_err: return (__db_ferr(env, "DB->get", 1)); + if (dbp->type == DB_QUEUE) + break; + /* FALLTHROUGH */ + default: +err: return (__db_ferr(env, "DB->get", 0)); + } + + /* + * Check for invalid key/data flags. 
+ */ + if ((ret = + __dbt_ferr(dbp, "key", key, DB_RETURNS_A_KEY(dbp, flags))) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 1)) != 0) + return (ret); + + if (multi) { + if (!F_ISSET(data, DB_DBT_USERMEM)) { + __db_errx(env, + "DB_MULTIPLE requires DB_DBT_USERMEM be set"); + return (EINVAL); + } + if (F_ISSET(key, DB_DBT_PARTIAL) || + F_ISSET(data, DB_DBT_PARTIAL)) { + __db_errx(env, + "DB_MULTIPLE does not support DB_DBT_PARTIAL"); + return (EINVAL); + } + if (data->ulen < 1024 || + data->ulen < dbp->pgsize || data->ulen % 1024 != 0) { + __db_errx(env, "%s%s", + "DB_MULTIPLE buffers must be ", + "aligned, at least page size and multiples of 1KB"); + return (EINVAL); + } + } + + return (0); +} + +/* + * __db_join_pp -- + * DB->join pre/post processing. + * + * PUBLIC: int __db_join_pp __P((DB *, DBC **, DBC **, u_int32_t)); + */ +int +__db_join_pp(primary, curslist, dbcp, flags) + DB *primary; + DBC **curslist, **dbcp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = primary->env; + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (ret = + __db_rep_enter(primary, 1, 0, curslist[0]->txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + if ((ret = __db_join_arg(primary, curslist, flags)) == 0) + ret = __db_join(primary, curslist, dbcp, flags); + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_join_arg -- + * Check DB->join arguments. 
+ */ +static int +__db_join_arg(primary, curslist, flags) + DB *primary; + DBC **curslist; + u_int32_t flags; +{ + DB_TXN *txn; + ENV *env; + int i; + + env = primary->env; + + switch (flags) { + case 0: + case DB_JOIN_NOSORT: + break; + default: + return (__db_ferr(env, "DB->join", 0)); + } + + if (curslist == NULL || curslist[0] == NULL) { + __db_errx(env, + "At least one secondary cursor must be specified to DB->join"); + return (EINVAL); + } + + txn = curslist[0]->txn; + for (i = 1; curslist[i] != NULL; i++) + if (curslist[i]->txn != txn) { + __db_errx(env, + "All secondary cursors must share the same transaction"); + return (EINVAL); + } + + return (0); +} + +/* + * __db_key_range_pp -- + * DB->key_range pre/post processing. + * + * PUBLIC: int __db_key_range_pp + * PUBLIC: __P((DB *, DB_TXN *, DBT *, DB_KEY_RANGE *, u_int32_t)); + */ +int +__db_key_range_pp(dbp, txn, key, kr, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + DB_KEY_RANGE *kr; + u_int32_t flags; +{ + DBC *dbc; + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->key_range"); + + /* + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if (flags != 0) + return (__db_ferr(env, "DB->key_range", 0)); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0) + goto err; + + /* + * !!! + * The actual method call is simple, do it inline. + */ + switch (dbp->type) { + case DB_BTREE: +#ifndef HAVE_BREW + if ((ret = __dbt_usercopy(env, key)) != 0) + goto err; + + /* Acquire a cursor. 
*/ + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + break; + + DEBUG_LWRITE(dbc, NULL, "bam_key_range", NULL, NULL, 0); +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) + ret = __part_key_range(dbc, key, kr, flags); + else +#endif + ret = __bam_key_range(dbc, key, kr, flags); + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + __dbt_userfree(env, key, NULL, NULL); + break; +#else + COMPQUIET(dbc, NULL); + COMPQUIET(key, NULL); + COMPQUIET(kr, NULL); + /* FALLTHROUGH */ +#endif + case DB_HASH: + case DB_QUEUE: + case DB_RECNO: + ret = __dbh_am_chk(dbp, DB_OK_BTREE); + break; + case DB_UNKNOWN: + default: + ret = __db_unknown_type(env, "DB->key_range", dbp->type); + break; + } + +err: /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_open_pp -- + * DB->open pre/post processing. + * + * PUBLIC: int __db_open_pp __P((DB *, DB_TXN *, + * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int)); + */ +int +__db_open_pp(dbp, txn, fname, dname, type, flags, mode) + DB *dbp; + DB_TXN *txn; + const char *fname, *dname; + DBTYPE type; + u_int32_t flags; + int mode; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, nosync, remove_me, ret, t_ret, txn_local; + + env = dbp->env; + nosync = 1; + handle_check = remove_me = txn_local = 0; + + ENV_ENTER(env, ip); + + /* + * Save the file and database names and flags. We do this here + * because we don't pass all of the flags down into the actual + * DB->open method call, we strip DB_AUTO_COMMIT at this layer. + */ + if ((fname != NULL && + (ret = __os_strdup(env, fname, &dbp->fname)) != 0)) + goto err; + if ((dname != NULL && + (ret = __os_strdup(env, dname, &dbp->dname)) != 0)) + goto err; + dbp->open_flags = flags; + + /* Save the current DB handle flags for refresh. */ + dbp->orig_flags = dbp->flags; + + /* Check for replication block. 
*/ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + /* + * Create local transaction as necessary, check for consistent + * transaction usage. + */ + if (IS_ENV_AUTO_COMMIT(env, txn, flags)) { + if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0) + goto err; + txn_local = 1; + } else if (txn != NULL && !TXN_ON(env) && + (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) { + ret = __db_not_txn_env(env); + goto err; + } + LF_CLR(DB_AUTO_COMMIT); + + /* + * We check arguments after possibly creating a local transaction, + * which is unusual -- the reason is some flags are illegal if any + * kind of transaction is in effect. + */ + if ((ret = __db_open_arg(dbp, txn, fname, dname, type, flags)) == 0) + if ((ret = __db_open(dbp, ip, txn, fname, dname, type, + flags, mode, PGNO_BASE_MD)) != 0) + goto txnerr; + + /* + * You can open the database that describes the subdatabases in the + * rest of the file read-only. The content of each key's data is + * unspecified and applications should never be adding new records + * or updating existing records. However, during recovery, we need + * to open these databases R/W so we can redo/undo changes in them. + * Likewise, we need to open master databases read/write during + * rename and remove so we can be sure they're fully sync'ed, so + * we provide an override flag for the purpose. + */ + if (dname == NULL && !IS_RECOVERING(env) && !LF_ISSET(DB_RDONLY) && + !LF_ISSET(DB_RDWRMASTER) && F_ISSET(dbp, DB_AM_SUBDB)) { + __db_errx(env, + "files containing multiple databases may only be opened read-only"); + ret = EINVAL; + goto txnerr; + } + + /* + * Success: file creations have to be synchronous, otherwise we don't + * care. + */ + if (F_ISSET(dbp, DB_AM_CREATED | DB_AM_CREATED_MSTR)) + nosync = 0; + + /* Success: don't discard the file on close. 
*/ + F_CLR(dbp, DB_AM_DISCARD | DB_AM_CREATED | DB_AM_CREATED_MSTR); + + /* + * If not transactional, remove the databases/subdatabases if it is + * persistent. If we're transactional, the child transaction abort + * cleans up. + */ +txnerr: if (ret != 0 && !IS_REAL_TXN(txn)) { + remove_me = (F_ISSET(dbp, DB_AM_CREATED) && + (fname != NULL || dname != NULL)) ? 1 : 0; + if (F_ISSET(dbp, DB_AM_CREATED_MSTR) || + (dname == NULL && remove_me)) + /* Remove file. */ + (void)__db_remove_int(dbp, + ip, txn, fname, NULL, DB_FORCE); + else if (remove_me) + /* Remove subdatabase. */ + (void)__db_remove_int(dbp, + ip, txn, fname, dname, DB_FORCE); + } + + if (txn_local && (t_ret = + __db_txn_auto_resolve(env, txn, nosync, ret)) && ret == 0) + ret = t_ret; + +err: /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_open_arg -- + * Check DB->open arguments. + */ +static int +__db_open_arg(dbp, txn, fname, dname, type, flags) + DB *dbp; + DB_TXN *txn; + const char *fname, *dname; + DBTYPE type; + u_int32_t flags; +{ + ENV *env; + u_int32_t ok_flags; + int ret; + + env = dbp->env; + + /* Validate arguments. 
*/ +#undef OKFLAGS +#define OKFLAGS \ + (DB_AUTO_COMMIT | DB_CREATE | DB_EXCL | DB_FCNTL_LOCKING | \ + DB_MULTIVERSION | DB_NOMMAP | DB_NO_AUTO_COMMIT | DB_RDONLY | \ + DB_RDWRMASTER | DB_READ_UNCOMMITTED | DB_THREAD | DB_TRUNCATE) + if ((ret = __db_fchk(env, "DB->open", flags, OKFLAGS)) != 0) + return (ret); + if (LF_ISSET(DB_EXCL) && !LF_ISSET(DB_CREATE)) + return (__db_ferr(env, "DB->open", 1)); + if (LF_ISSET(DB_RDONLY) && LF_ISSET(DB_CREATE)) + return (__db_ferr(env, "DB->open", 1)); + +#ifdef HAVE_VXWORKS + if (LF_ISSET(DB_TRUNCATE)) { + __db_errx(env, "DB_TRUNCATE not supported on VxWorks"); + return (DB_OPNOTSUP); + } +#endif + switch (type) { + case DB_UNKNOWN: + if (LF_ISSET(DB_CREATE|DB_TRUNCATE)) { + __db_errx(env, + "DB_UNKNOWN type specified with DB_CREATE or DB_TRUNCATE"); + return (EINVAL); + } + ok_flags = 0; + break; + case DB_BTREE: + ok_flags = DB_OK_BTREE; + break; + case DB_HASH: +#ifndef HAVE_HASH + return (__db_no_hash_am(env)); +#endif + ok_flags = DB_OK_HASH; + break; + case DB_QUEUE: +#ifndef HAVE_QUEUE + return (__db_no_queue_am(env)); +#endif + ok_flags = DB_OK_QUEUE; + break; + case DB_RECNO: + ok_flags = DB_OK_RECNO; + break; + default: + __db_errx(env, "unknown type: %lu", (u_long)type); + return (EINVAL); + } + if (ok_flags) + DB_ILLEGAL_METHOD(dbp, ok_flags); + + /* The environment may have been created, but never opened. */ + if (!F_ISSET(env, ENV_DBLOCAL | ENV_OPEN_CALLED)) { + __db_errx(env, "database environment not yet opened"); + return (EINVAL); + } + + /* + * Historically, you could pass in an environment that didn't have a + * mpool, and DB would create a private one behind the scenes. This + * no longer works. + */ + if (!F_ISSET(env, ENV_DBLOCAL) && !MPOOL_ON(env)) { + __db_errx(env, "environment did not include a memory pool"); + return (EINVAL); + } + + /* + * You can't specify threads during DB->open if subsystems in the + * environment weren't configured with them. 
+ */ + if (LF_ISSET(DB_THREAD) && !F_ISSET(env, ENV_DBLOCAL | ENV_THREAD)) { + __db_errx(env, "environment not created using DB_THREAD"); + return (EINVAL); + } + + /* DB_MULTIVERSION requires a database configured for transactions. */ + if (LF_ISSET(DB_MULTIVERSION) && !IS_REAL_TXN(txn)) { + __db_errx(env, + "DB_MULTIVERSION illegal without a transaction specified"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIVERSION) && type == DB_QUEUE) { + __db_errx(env, + "DB_MULTIVERSION illegal with queue databases"); + return (EINVAL); + } + + /* DB_TRUNCATE is neither transaction recoverable nor lockable. */ + if (LF_ISSET(DB_TRUNCATE) && (LOCKING_ON(env) || txn != NULL)) { + __db_errx(env, + "DB_TRUNCATE illegal with %s specified", + LOCKING_ON(env) ? "locking" : "transactions"); + return (EINVAL); + } + + /* Subdatabase checks. */ + if (dname != NULL) { + /* QAM can only be done on in-memory subdatabases. */ + if (type == DB_QUEUE && fname != NULL) { + __db_errx( + env, "Queue databases must be one-per-file"); + return (EINVAL); + } + + /* + * Named in-memory databases can't support certain flags, + * so check here. + */ + if (fname == NULL) + F_CLR(dbp, DB_AM_CHKSUM | DB_AM_ENCRYPT); + } + + return (0); +} + +/* + * __db_pget_pp -- + * DB->pget pre/post processing. + * + * PUBLIC: int __db_pget_pp + * PUBLIC: __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__db_pget_pp(dbp, txn, skey, pkey, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ignore_lease, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget"); + + ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0; + LF_CLR(DB_IGNORE_LEASE); + + if ((ret = __db_pget_arg(dbp, pkey, flags)) != 0 || + (ret = __db_get_arg(dbp, skey, data, flags)) != 0) { + __dbt_userfree(env, skey, pkey, data); + return (ret); + } + + ENV_ENTER(env, ip); + + /* Check for replication block. 
*/ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + ret = __db_pget(dbp, ip, txn, skey, pkey, data, flags); + /* + * Check for master leases. + */ + if (ret == 0 && + IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease) + ret = __rep_lease_check(env, 1); + +err: /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + __dbt_userfree(env, skey, pkey, data); + return (ret); +} + +/* + * __db_pget -- + * DB->pget. + * + * PUBLIC: int __db_pget __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__db_pget(dbp, ip, txn, skey, pkey, data, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DBC *dbc; + u_int32_t mode; + int ret, t_ret; + + mode = DB_CURSOR_TRANSIENT; + if (LF_ISSET(DB_READ_UNCOMMITTED)) { + mode |= DB_READ_UNCOMMITTED; + LF_CLR(DB_READ_UNCOMMITTED); + } else if (LF_ISSET(DB_READ_COMMITTED)) { + mode |= DB_READ_COMMITTED; + LF_CLR(DB_READ_COMMITTED); + } + + if ((ret = __db_cursor(dbp, ip, txn, &dbc, mode)) != 0) + return (ret); + + SET_RET_MEM(dbc, dbp); + + DEBUG_LREAD(dbc, txn, "__db_pget", skey, NULL, flags); + + /* + * !!! + * The actual method call is simple, do it inline. + * + * The underlying cursor pget will fill in a default DBT for null + * pkeys, and use the cursor's returned-key memory internally to + * store any intermediate primary keys. However, we've just set + * the returned-key memory to the DB handle's key memory, which + * is unsafe to use if the DB handle is threaded. If the pkey + * argument is NULL, use the DBC-owned returned-key memory + * instead; it'll go away when we close the cursor before we + * return, but in this case that's just fine, as we're not + * returning the primary key. 
+ */ + if (pkey == NULL) + dbc->rkey = &dbc->my_rkey; + + /* + * The cursor is just a perfectly ordinary secondary database cursor. + * Call its c_pget() method to do the dirty work. + */ + if (flags == 0 || flags == DB_RMW) + flags |= DB_SET; + + ret = __dbc_pget(dbc, skey, pkey, data, flags); + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_pget_arg -- + * Check DB->pget arguments. + */ +static int +__db_pget_arg(dbp, pkey, flags) + DB *dbp; + DBT *pkey; + u_int32_t flags; +{ + ENV *env; + int ret; + + env = dbp->env; + + if (!F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_errx(env, + "DB->pget may only be used on secondary indices"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + __db_errx(env, + "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"); + return (EINVAL); + } + + /* DB_CONSUME makes no sense on a secondary index. */ + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW); + switch (flags) { + case DB_CONSUME: + case DB_CONSUME_WAIT: + return (__db_ferr(env, "DB->pget", 0)); + default: + /* __db_get_arg will catch the rest. */ + break; + } + + /* + * We allow the pkey field to be NULL, so that we can make the + * two-DBT get calls into wrappers for the three-DBT ones. + */ + if (pkey != NULL && + (ret = __dbt_ferr(dbp, "primary key", pkey, 1)) != 0) + return (ret); + + if (flags == DB_GET_BOTH) { + /* The pkey field can't be NULL if we're doing a DB_GET_BOTH. */ + if (pkey == NULL) { + __db_errx(env, + "DB_GET_BOTH on a secondary index requires a primary key"); + return (EINVAL); + } + if ((ret = __dbt_usercopy(env, pkey)) != 0) + return (ret); + } + + return (0); +} + +/* + * __db_put_pp -- + * DB->put pre/post processing. 
+ * + * PUBLIC: int __db_put_pp __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t)); + */ +int +__db_put_pp(dbp, txn, key, data, flags) + DB *dbp; + DB_TXN *txn; + DBT *key, *data; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, txn_local, t_ret; + + env = dbp->env; + txn_local = 0; + + STRIP_AUTO_COMMIT(flags); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put"); + + if ((ret = __db_put_arg(dbp, key, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + /* Create local transaction as necessary. */ + if (IS_DB_AUTO_COMMIT(dbp, txn)) { + if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + goto err; + txn_local = 1; + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + ret = __db_put(dbp, ip, txn, key, data, flags); + +err: if (txn_local && + (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0) + ret = t_ret; + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, data); + return (ret); +} + +/* + * __db_put_arg -- + * Check DB->put arguments. + */ +static int +__db_put_arg(dbp, key, data, flags) + DB *dbp; + DBT *key, *data; + u_int32_t flags; +{ + ENV *env; + int ret, returnkey; + + env = dbp->env; + returnkey = 0; + + /* Check for changes to a read-only tree. */ + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DB->put")); + + /* Check for puts on a secondary. 
*/ + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_errx(env, "DB->put forbidden on secondary indices"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIPLE_KEY | DB_MULTIPLE)) { + if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY)) + goto err; + + switch (LF_ISSET(DB_OPFLAGS_MASK)) { + case 0: + case DB_OVERWRITE_DUP: + break; + default: + __db_errx(env, + "DB->put: DB_MULTIPLE(_KEY) can only be combined with DB_OVERWRITE_DUP"); + return (EINVAL); + } + + if (!F_ISSET(key, DB_DBT_BULK)) { + __db_errx(env, + "DB->put with DB_MULTIPLE(_KEY) requires a bulk key buffer"); + return (EINVAL); + } + } + if (LF_ISSET(DB_MULTIPLE)) { + if (!F_ISSET(data, DB_DBT_BULK)) { + __db_errx(env, + "DB->put with DB_MULTIPLE requires a bulk data buffer"); + return (EINVAL); + } + } + + /* Check for invalid function flags. */ + switch (LF_ISSET(DB_OPFLAGS_MASK)) { + case 0: + case DB_NOOVERWRITE: + case DB_OVERWRITE_DUP: + break; + case DB_APPEND: + if (dbp->type != DB_RECNO && dbp->type != DB_QUEUE) + goto err; + returnkey = 1; + break; + case DB_NODUPDATA: + if (F_ISSET(dbp, DB_AM_DUPSORT)) + break; + /* FALLTHROUGH */ + default: +err: return (__db_ferr(env, "DB->put", 0)); + } + + /* + * Check for invalid key/data flags. The key may reasonably be NULL + * if DB_APPEND is set and the application doesn't care about the + * returned key. + */ + if (((returnkey && key != NULL) || !returnkey) && + (ret = __dbt_ferr(dbp, "key", key, returnkey)) != 0) + return (ret); + if (!LF_ISSET(DB_MULTIPLE_KEY) && + (ret = __dbt_ferr(dbp, "data", data, 0)) != 0) + return (ret); + + /* + * The key parameter should not be NULL or have the "partial" flag set + * in a put call unless the user doesn't care about a key value we'd + * return. The user tells us they don't care about the returned key by + * setting the key parameter to NULL or configuring the key DBT to not + * return any information. 
(Returned keys from a put are always record + * numbers, and returning part of a record number doesn't make sense: + * only accept a partial return if the length returned is 0.) + */ + if ((returnkey && + key != NULL && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0) || + (!returnkey && F_ISSET(key, DB_DBT_PARTIAL))) + return (__db_ferr(env, "key DBT", 0)); + + /* Check for partial puts in the presence of duplicates. */ + if (data != NULL && F_ISSET(data, DB_DBT_PARTIAL) && + (F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) { + __db_errx(env, +"a partial put in the presence of duplicates requires a cursor operation"); + return (EINVAL); + } + + if ((flags != DB_APPEND && (ret = __dbt_usercopy(env, key)) != 0) || + (!LF_ISSET(DB_MULTIPLE_KEY) && + (ret = __dbt_usercopy(env, data)) != 0)) + return (ret); + + return (0); +} + +/* + * __db_compact_pp -- + * DB->compact pre/post processing. + * + * PUBLIC: int __db_compact_pp __P((DB *, DB_TXN *, + * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *)); + */ +int +__db_compact_pp(dbp, txn, start, stop, c_data, flags, end) + DB *dbp; + DB_TXN *txn; + DBT *start, *stop; + DB_COMPACT *c_data; + u_int32_t flags; + DBT *end; +{ + DB_COMPACT *dp, l_data; + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->compact"); + + /* + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if ((ret = __db_fchk( + env, "DB->compact", flags, DB_FREELIST_ONLY | DB_FREE_SPACE)) != 0) + return (ret); + + /* Check for changes to a read-only database. */ + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DB->compact")); + + if (start != NULL && (ret = __dbt_usercopy(env, start)) != 0) + return (ret); + if (stop != NULL && (ret = __dbt_usercopy(env, stop)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for replication block. 
*/ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, + txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + if (c_data == NULL) { + dp = &l_data; + memset(dp, 0, sizeof(*dp)); + } else + dp = c_data; +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) + ret = __part_compact(dbp, ip, txn, start, stop, dp, flags, end); + else +#endif + switch (dbp->type) { + case DB_HASH: + if (!LF_ISSET(DB_FREELIST_ONLY)) + goto err; + /* FALLTHROUGH */ + case DB_BTREE: + case DB_RECNO: + ret = __bam_compact(dbp, ip, txn, start, stop, dp, flags, end); + break; + + default: +err: ret = __dbh_am_chk(dbp, DB_OK_BTREE); + break; + } + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + __dbt_userfree(env, start, stop, NULL); + return (ret); +} + +/* + * __db_associate_foreign_pp -- + * DB->associate_foreign pre/post processing. + * + * PUBLIC: int __db_associate_foreign_pp __P((DB *, DB *, + * PUBLIC: int (*)(DB *, const DBT *, DBT *, const DBT *, int *), + * PUBLIC: u_int32_t)); + */ +int +__db_associate_foreign_pp(fdbp, dbp, callback, flags) + DB *dbp, *fdbp; + int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *)); + u_int32_t flags; +{ + /* Most of this is based on the implementation of associate */ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + PANIC_CHECK(env); + STRIP_AUTO_COMMIT(flags); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) { + handle_check = 0; + goto err; + } + + if ((ret = __db_associate_foreign_arg(fdbp, dbp, callback, flags)) != 0) + goto err; + + ret = __db_associate_foreign(fdbp, dbp, callback, flags); + +err: /* Release replication block. 
*/ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_associate_foreign_arg -- + * DB->associate_foreign argument checking. + */ +static int +__db_associate_foreign_arg(fdbp, dbp, callback, flags) + DB *dbp, *fdbp; + int (*callback) __P((DB *, const DBT *, DBT *, const DBT *, int *)); + u_int32_t flags; +{ + ENV *env; + + env = fdbp->env; + + if (F_ISSET(fdbp, DB_AM_SECONDARY)) { + __db_errx(env, + "Secondary indices may not be used as foreign databases"); + return (EINVAL); + } + if (F_ISSET(fdbp, DB_AM_DUP)) { + __db_errx(env, + "Foreign databases may not be configured with duplicates"); + return (EINVAL); + } + if (F_ISSET(fdbp, DB_AM_RENUMBER)) { + __db_errx(env, + "Renumbering recno databases may not be used as foreign databases"); + return (EINVAL); + } + if (!F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_errx(env, + "The associating database must be a secondary index."); + return (EINVAL); + } + if (LF_ISSET(DB_FOREIGN_NULLIFY) && callback == NULL) { + __db_errx(env, + "When specifying a delete action of nullify, a callback%s", + " function needs to be configured"); + return (EINVAL); + } else if (!LF_ISSET(DB_FOREIGN_NULLIFY) && callback != NULL) { + __db_errx(env, + "When not specifying a delete action of nullify, a%s", + " callback function cannot be configured"); + return (EINVAL); + } + + return (0); +} + +/* + * __db_sync_pp -- + * DB->sync pre/post processing. + * + * PUBLIC: int __db_sync_pp __P((DB *, u_int32_t)); + */ +int +__db_sync_pp(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->sync"); + + /* + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if (flags != 0) + return (__db_ferr(env, "DB->sync", 0)); + + ENV_ENTER(env, ip); + + /* Check for replication block. 
*/ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) { + handle_check = 0; + goto err; + } + + ret = __db_sync(dbp); + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_close_pp -- + * DBC->close pre/post processing. + * + * PUBLIC: int __dbc_close_pp __P((DBC *)); + */ +int +__dbc_close_pp(dbc) + DBC *dbc; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + + /* + * If the cursor is already closed we have a serious problem, and we + * assume that the cursor isn't on the active queue. Don't do any of + * the remaining cursor close processing. + */ + if (!F_ISSET(dbc, DBC_ACTIVE)) { + __db_errx(env, "Closing already-closed cursor"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = dbc->txn == NULL && IS_ENV_REPLICATED(env); + ret = __dbc_close(dbc); + + /* Release replication block. */ + if (handle_check && + (t_ret = __op_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_cmp_pp -- + * DBC->cmp pre/post processing. 
+ * + * PUBLIC: int __dbc_cmp_pp __P((DBC *, DBC *, int*, u_int32_t)); + */ +int +__dbc_cmp_pp(dbc, other_cursor, result, flags) + DBC *dbc, *other_cursor; + int *result; + u_int32_t flags; +{ + DB *dbp, *odbp; + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + dbp = dbc->dbp; + odbp = other_cursor->dbp; + env = dbp->env; + + if (flags != 0) + return (__db_ferr(env, "DBcursor->cmp", 0)); + + if (other_cursor == NULL) { + __db_errx(env, "DBcursor->cmp dbc pointer must not be null"); + return (EINVAL); + } + + if (dbp != odbp) { + __db_errx(env, +"DBcursor->cmp both cursors must refer to the same database."); + return (EINVAL); + } + + ENV_ENTER(env, ip); + ret = __dbc_cmp(dbc, other_cursor, result); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_count_pp -- + * DBC->count pre/post processing. + * + * PUBLIC: int __dbc_count_pp __P((DBC *, db_recno_t *, u_int32_t)); + */ +int +__dbc_count_pp(dbc, recnop, flags) + DBC *dbc; + db_recno_t *recnop; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + dbp = dbc->dbp; + env = dbp->env; + + /* + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + * + * The cursor must be initialized, return EINVAL for an invalid cursor. + */ + if (flags != 0) + return (__db_ferr(env, "DBcursor->count", 0)); + + if (!IS_INITIALIZED(dbc)) + return (__db_curinval(env)); + + ENV_ENTER(env, ip); + ret = __dbc_count(dbc, recnop); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_del_pp -- + * DBC->del pre/post processing. + * + * PUBLIC: int __dbc_del_pp __P((DBC *, u_int32_t)); + */ +int +__dbc_del_pp(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + dbp = dbc->dbp; + env = dbp->env; + + if ((ret = __dbc_del_arg(dbc, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for consistent transaction usage. 
*/ + if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0) + goto err; + + DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->del", NULL, NULL, flags); + ret = __dbc_del(dbc, flags); + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_del_arg -- + * Check DBC->del arguments. + */ +static int +__dbc_del_arg(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + DB *dbp; + ENV *env; + + dbp = dbc->dbp; + env = dbp->env; + + /* Check for changes to a read-only tree. */ + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DBcursor->del")); + + /* Check for invalid function flags. */ + switch (flags) { + case 0: + break; + case DB_CONSUME: + if (dbp->type != DB_QUEUE) + return (__db_ferr(env, "DBC->del", 0)); + break; + case DB_UPDATE_SECONDARY: + DB_ASSERT(env, F_ISSET(dbp, DB_AM_SECONDARY)); + break; + default: + return (__db_ferr(env, "DBcursor->del", 0)); + } + + /* + * The cursor must be initialized, return EINVAL for an invalid cursor, + * otherwise 0. + */ + if (!IS_INITIALIZED(dbc)) + return (__db_curinval(env)); + + return (0); +} + +/* + * __dbc_dup_pp -- + * DBC->dup pre/post processing. + * + * PUBLIC: int __dbc_dup_pp __P((DBC *, DBC **, u_int32_t)); + */ +int +__dbc_dup_pp(dbc, dbcp, flags) + DBC *dbc, **dbcp; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + dbp = dbc->dbp; + env = dbp->env; + + /* + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if (flags != 0 && flags != DB_POSITION) + return (__db_ferr(env, "DBcursor->dup", 0)); + + ENV_ENTER(env, ip); + ret = __dbc_dup(dbc, dbcp, flags); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_get_pp -- + * DBC->get pre/post processing. 
+ * + * PUBLIC: int __dbc_get_pp __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_get_pp(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ignore_lease, ret; + + dbp = dbc->dbp; + env = dbp->env; + + ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0; + LF_CLR(DB_IGNORE_LEASE); + if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get", + flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags); + ret = __dbc_get(dbc, key, data, flags); + + /* + * Check for master leases. + */ + if (ret == 0 && + IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease) + ret = __rep_lease_check(env, 1); + + ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, data); + return (ret); +} + +/* + * __dbc_get_arg -- + * Common DBC->get argument checking, used by both DBC->get and DBC->pget. + * PUBLIC: int __dbc_get_arg __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_get_arg(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + ENV *env; + int dirty, multi, ret; + + dbp = dbc->dbp; + env = dbp->env; + + /* + * Typically in checking routines that modify the flags, we have + * to save them and restore them, because the checking routine + * calls the work routine. However, this is a pure-checking + * routine which returns to a function that calls the work routine, + * so it's OK that we do not save and restore the flags, even though + * we modify them. + * + * Check for read-modify-write validity. DB_RMW doesn't make sense + * with CDB cursors since if you're going to write the cursor, you + * had to create it with DB_WRITECURSOR. Regardless, we check for + * LOCKING_ON and not STD_LOCKING, as we don't want to disallow it. + * If this changes, confirm that DB does not itself set the DB_RMW + * flag in a path where CDB may have been configured. 
+ */ + dirty = 0; + if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) { + if (!LOCKING_ON(env)) + return (__db_fnl(env, "DBcursor->get")); + if (LF_ISSET(DB_READ_UNCOMMITTED)) + dirty = 1; + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW); + } + + multi = 0; + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + multi = 1; + if (LF_ISSET(DB_MULTIPLE) && LF_ISSET(DB_MULTIPLE_KEY)) + goto multi_err; + LF_CLR(DB_MULTIPLE | DB_MULTIPLE_KEY); + } + + /* Check for invalid function flags. */ + switch (flags) { + case DB_CONSUME: + case DB_CONSUME_WAIT: + if (dirty) { + __db_errx(env, + "DB_READ_UNCOMMITTED is not supported with DB_CONSUME or DB_CONSUME_WAIT"); + return (EINVAL); + } + if (dbp->type != DB_QUEUE) + goto err; + break; + case DB_CURRENT: + case DB_FIRST: + case DB_NEXT: + case DB_NEXT_DUP: + case DB_NEXT_NODUP: + break; + case DB_LAST: + case DB_PREV: + case DB_PREV_DUP: + case DB_PREV_NODUP: + if (multi) +multi_err: return (__db_ferr(env, "DBcursor->get", 1)); + break; + case DB_GET_BOTHC: + if (dbp->type == DB_QUEUE) + goto err; + /* FALLTHROUGH */ + case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: + if ((ret = __dbt_usercopy(env, data)) != 0) + goto err; + /* FALLTHROUGH */ + case DB_SET: + case DB_SET_RANGE: + if ((ret = __dbt_usercopy(env, key)) != 0) + goto err; + break; + case DB_GET_RECNO: + /* + * The one situation in which this might be legal with a + * non-RECNUM dbp is if dbp is a secondary and its primary is + * DB_AM_RECNUM. + */ + if (!F_ISSET(dbp, DB_AM_RECNUM) && + (!F_ISSET(dbp, DB_AM_SECONDARY) || + !F_ISSET(dbp->s_primary, DB_AM_RECNUM))) + goto err; + break; + case DB_SET_RECNO: + if (!F_ISSET(dbp, DB_AM_RECNUM)) + goto err; + if ((ret = __dbt_usercopy(env, key)) != 0) + goto err; + break; + default: +err: __dbt_userfree(env, key, NULL, data); + return (__db_ferr(env, "DBcursor->get", 0)); + } + + /* Check for invalid key/data flags. 
*/ + if ((ret = __dbt_ferr(dbp, "key", key, 0)) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) + return (ret); + + if (multi) { + if (!F_ISSET(data, DB_DBT_USERMEM)) { + __db_errx(env, + "DB_MULTIPLE/DB_MULTIPLE_KEY require DB_DBT_USERMEM be set"); + return (EINVAL); + } + if (F_ISSET(key, DB_DBT_PARTIAL) || + F_ISSET(data, DB_DBT_PARTIAL)) { + __db_errx(env, + "DB_MULTIPLE/DB_MULTIPLE_KEY do not support DB_DBT_PARTIAL"); + return (EINVAL); + } + if (data->ulen < 1024 || + data->ulen < dbp->pgsize || data->ulen % 1024 != 0) { + __db_errx(env, "%s%s", + "DB_MULTIPLE/DB_MULTIPLE_KEY buffers must be ", + "aligned, at least page size and multiples of 1KB"); + return (EINVAL); + } + } + + /* + * The cursor must be initialized for DB_CURRENT, DB_GET_RECNO, + * DB_PREV_DUP and DB_NEXT_DUP. Return EINVAL for an invalid + * cursor, otherwise 0. + */ + if (!IS_INITIALIZED(dbc) && (flags == DB_CURRENT || + flags == DB_GET_RECNO || + flags == DB_NEXT_DUP || flags == DB_PREV_DUP)) + return (__db_curinval(env)); + + /* Check for consistent transaction usage. */ + if (LF_ISSET(DB_RMW) && + (ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0) + return (ret); + + return (0); +} + +/* + * __db_secondary_close_pp -- + * DB->close for secondaries + * + * PUBLIC: int __db_secondary_close_pp __P((DB *, u_int32_t)); + */ +int +__db_secondary_close_pp(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + ret = 0; + + /* + * As a DB handle destructor, we can't fail. + * + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if (flags != 0 && flags != DB_NOSYNC) + ret = __db_ferr(env, "DB->close", 0); + + ENV_ENTER(env, ip); + + /* Check for replication block. 
*/ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (t_ret = __db_rep_enter(dbp, 0, 0, 0)) != 0) { + handle_check = 0; + if (ret == 0) + ret = t_ret; + } + + if ((t_ret = __db_secondary_close(dbp, flags)) != 0 && ret == 0) + ret = t_ret; + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __dbc_pget_pp -- + * DBC->pget pre/post processing. + * + * PUBLIC: int __dbc_pget_pp __P((DBC *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_pget_pp(dbc, skey, pkey, data, flags) + DBC *dbc; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ignore_lease, ret; + + dbp = dbc->dbp; + env = dbp->env; + + ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0; + LF_CLR(DB_IGNORE_LEASE); + if ((ret = __dbc_pget_arg(dbc, pkey, flags)) != 0 || + (ret = __dbc_get_arg(dbc, skey, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + ret = __dbc_pget(dbc, skey, pkey, data, flags); + /* + * Check for master leases. + */ + if (ret == 0 && + IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease) + ret = __rep_lease_check(env, 1); + + ENV_LEAVE(env, ip); + + __dbt_userfree(env, skey, pkey, data); + return (ret); +} + +/* + * __dbc_pget_arg -- + * Check DBC->pget arguments. + */ +static int +__dbc_pget_arg(dbc, pkey, flags) + DBC *dbc; + DBT *pkey; + u_int32_t flags; +{ + DB *dbp; + ENV *env; + int ret; + + dbp = dbc->dbp; + env = dbp->env; + + if (!F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_errx(env, + "DBcursor->pget may only be used on secondary indices"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIPLE | DB_MULTIPLE_KEY)) { + __db_errx(env, + "DB_MULTIPLE and DB_MULTIPLE_KEY may not be used on secondary indices"); + return (EINVAL); + } + + switch (LF_ISSET(DB_OPFLAGS_MASK)) { + case DB_CONSUME: + case DB_CONSUME_WAIT: + /* These flags make no sense on a secondary index. 
*/ + return (__db_ferr(env, "DBcursor->pget", 0)); + case DB_GET_BOTH: + case DB_GET_BOTH_RANGE: + /* BOTH is "get both the primary and the secondary". */ + if (pkey == NULL) { + __db_errx(env, + "%s requires both a secondary and a primary key", + LF_ISSET(DB_GET_BOTH) ? + "DB_GET_BOTH" : "DB_GET_BOTH_RANGE"); + return (EINVAL); + } + if ((ret = __dbt_usercopy(env, pkey)) != 0) + return (ret); + break; + default: + /* __dbc_get_arg will catch the rest. */ + break; + } + + /* + * We allow the pkey field to be NULL, so that we can make the + * two-DBT get calls into wrappers for the three-DBT ones. + */ + if (pkey != NULL && + (ret = __dbt_ferr(dbp, "primary key", pkey, 0)) != 0) + return (ret); + + /* But the pkey field can't be NULL if we're doing a DB_GET_BOTH. */ + if (pkey == NULL && (flags & DB_OPFLAGS_MASK) == DB_GET_BOTH) { + __db_errx(env, + "DB_GET_BOTH on a secondary index requires a primary key"); + return (EINVAL); + } + return (0); +} + +/* + * __dbc_put_pp -- + * DBC->put pre/post processing. + * + * PUBLIC: int __dbc_put_pp __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__dbc_put_pp(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + dbp = dbc->dbp; + env = dbp->env; + + if ((ret = __dbc_put_arg(dbc, key, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0) + goto err; + + DEBUG_LWRITE(dbc, dbc->txn, "DBcursor->put", + flags == DB_KEYFIRST || flags == DB_KEYLAST || + flags == DB_NODUPDATA || flags == DB_UPDATE_SECONDARY ? + key : NULL, data, flags); + ret = __dbc_put(dbc, key, data, flags); + +err: ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, data); + return (ret); +} + +/* + * __dbc_put_arg -- + * Check DBC->put arguments. 
+ */ +static int +__dbc_put_arg(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + ENV *env; + int key_flags, ret; + + dbp = dbc->dbp; + env = dbp->env; + key_flags = 0; + + /* Check for changes to a read-only tree. */ + if (DB_IS_READONLY(dbp)) + return (__db_rdonly(env, "DBcursor->put")); + + /* Check for puts on a secondary. */ + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + if (flags == DB_UPDATE_SECONDARY) + flags = 0; + else { + __db_errx(env, + "DBcursor->put forbidden on secondary indices"); + return (EINVAL); + } + } + + if ((ret = __dbt_usercopy(env, data)) != 0) + return (ret); + + /* Check for invalid function flags. */ + switch (flags) { + case DB_AFTER: + case DB_BEFORE: + switch (dbp->type) { + case DB_BTREE: + case DB_HASH: /* Only with unsorted duplicates. */ + if (!F_ISSET(dbp, DB_AM_DUP)) + goto err; + if (dbp->dup_compare != NULL) + goto err; + break; + case DB_QUEUE: /* Not permitted. */ + goto err; + case DB_RECNO: /* Only with mutable record numbers. */ + if (!F_ISSET(dbp, DB_AM_RENUMBER)) + goto err; + key_flags = key == NULL ? 0 : 1; + break; + case DB_UNKNOWN: + default: + goto err; + } + break; + case DB_CURRENT: + /* + * If there is a comparison function, doing a DB_CURRENT + * must not change the part of the data item that is used + * for the comparison. + */ + break; + case DB_NODUPDATA: + if (!F_ISSET(dbp, DB_AM_DUPSORT)) + goto err; + /* FALLTHROUGH */ + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_OVERWRITE_DUP: + key_flags = 1; + if ((ret = __dbt_usercopy(env, key)) != 0) + return (ret); + break; + default: +err: return (__db_ferr(env, "DBcursor->put", 0)); + } + + /* + * Check for invalid key/data flags. The key may reasonably be NULL + * if DB_AFTER or DB_BEFORE is set and the application doesn't care + * about the returned key, or if the DB_CURRENT flag is set. 
+ */ + if (key_flags && (ret = __dbt_ferr(dbp, "key", key, 0)) != 0) + return (ret); + if ((ret = __dbt_ferr(dbp, "data", data, 0)) != 0) + return (ret); + + /* + * The key parameter should not be NULL or have the "partial" flag set + * in a put call unless the user doesn't care about a key value we'd + * return. The user tells us they don't care about the returned key by + * setting the key parameter to NULL or configuring the key DBT to not + * return any information. (Returned keys from a put are always record + * numbers, and returning part of a record number doesn't make sense: + * only accept a partial return if the length returned is 0.) + */ + if (key_flags && F_ISSET(key, DB_DBT_PARTIAL) && key->dlen != 0) + return (__db_ferr(env, "key DBT", 0)); + + /* + * The cursor must be initialized for anything other than DB_KEYFIRST, + * DB_KEYLAST or zero: return EINVAL for an invalid cursor, otherwise 0. + */ + if (!IS_INITIALIZED(dbc) && flags != 0 && flags != DB_KEYFIRST && + flags != DB_KEYLAST && flags != DB_NODUPDATA && + flags != DB_OVERWRITE_DUP) + return (__db_curinval(env)); + + return (0); +} + +/* + * __dbt_ferr -- + * Check a DBT for flag errors. + */ +static int +__dbt_ferr(dbp, name, dbt, check_thread) + const DB *dbp; + const char *name; + const DBT *dbt; + int check_thread; +{ + ENV *env; + int ret; + + env = dbp->env; + + /* + * Check for invalid DBT flags. We allow any of the flags to be + * specified to any DB or DBcursor call so that applications can + * set DB_DBT_MALLOC when retrieving a data item from a secondary + * database and then specify that same DBT as a key to a primary + * database, without having to clear flags. 
+ */ + if ((ret = __db_fchk(env, name, dbt->flags, DB_DBT_APPMALLOC | + DB_DBT_BULK | DB_DBT_DUPOK | DB_DBT_MALLOC | DB_DBT_REALLOC | + DB_DBT_USERCOPY | DB_DBT_USERMEM | DB_DBT_PARTIAL)) != 0) + return (ret); + switch (F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC | + DB_DBT_USERCOPY | DB_DBT_USERMEM)) { + case 0: + case DB_DBT_MALLOC: + case DB_DBT_REALLOC: + case DB_DBT_USERCOPY: + case DB_DBT_USERMEM: + break; + default: + return (__db_ferr(env, name, 1)); + } + + if (F_ISSET(dbt, DB_DBT_BULK) && F_ISSET(dbt, DB_DBT_PARTIAL)) { + __db_errx(env, + "Bulk and partial operations cannot be combined on %s DBT", name); + return (EINVAL); + } + + if (check_thread && DB_IS_THREADED(dbp) && + !F_ISSET(dbt, DB_DBT_MALLOC | DB_DBT_REALLOC | + DB_DBT_USERCOPY | DB_DBT_USERMEM)) { + __db_errx(env, + "DB_THREAD mandates memory allocation flag on %s DBT", + name); + return (EINVAL); + } + return (0); +} + +/* + * __db_curinval + * Report that a cursor is in an invalid state. + */ +static int +__db_curinval(env) + const ENV *env; +{ + __db_errx(env, + "Cursor position must be set before performing this operation"); + return (EINVAL); +} + +/* + * __db_txn_auto_init -- + * Handle DB_AUTO_COMMIT initialization. + * + * PUBLIC: int __db_txn_auto_init __P((ENV *, DB_THREAD_INFO *, DB_TXN **)); + */ +int +__db_txn_auto_init(env, ip, txnidp) + ENV *env; + DB_THREAD_INFO *ip; + DB_TXN **txnidp; +{ + /* + * Method calls where applications explicitly specify DB_AUTO_COMMIT + * require additional validation: the DB_AUTO_COMMIT flag cannot be + * specified if a transaction cookie is also specified, nor can the + * flag be specified in a non-transactional environment. 
+ */ + if (*txnidp != NULL) { + __db_errx(env, + "DB_AUTO_COMMIT may not be specified along with a transaction handle"); + return (EINVAL); + } + + if (!TXN_ON(env)) { + __db_errx(env, + "DB_AUTO_COMMIT may not be specified in non-transactional environment"); + return (EINVAL); + } + + /* + * Our caller checked to see if replication is making a state change. + * Don't call the user-level API (which would repeat that check). + */ + return (__txn_begin(env, ip, NULL, txnidp, 0)); +} + +/* + * __db_txn_auto_resolve -- + * Resolve local transactions. + * + * PUBLIC: int __db_txn_auto_resolve __P((ENV *, DB_TXN *, int, int)); + */ +int +__db_txn_auto_resolve(env, txn, nosync, ret) + ENV *env; + DB_TXN *txn; + int nosync, ret; +{ + int t_ret; + + /* + * We're resolving a transaction for the user, and must decrement the + * replication handle count. Call the user-level API. + */ + if (ret == 0) + return (__txn_commit(txn, nosync ? DB_TXN_NOSYNC : 0)); + + if ((t_ret = __txn_abort(txn)) != 0) + return (__env_panic(env, t_ret)); + + return (ret); +} diff --git a/db/db_join.c b/db/db_join.c new file mode 100644 index 0000000..05c11a4 --- /dev/null +++ b/db/db_join.c @@ -0,0 +1,940 @@ +/* + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1998-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_join.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" + +static int __db_join_close_pp __P((DBC *)); +static int __db_join_cmp __P((const void *, const void *)); +static int __db_join_del __P((DBC *, u_int32_t)); +static int __db_join_get __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __db_join_get_pp __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __db_join_getnext __P((DBC *, DBT *, DBT *, u_int32_t, u_int32_t)); +static int __db_join_primget __P((DB *, DB_THREAD_INFO *, + DB_TXN *, DB_LOCKER *, DBT *, DBT *, u_int32_t)); +static int __db_join_put __P((DBC *, DBT *, DBT *, u_int32_t)); + +/* + * Check to see if the Nth secondary cursor of join cursor jc is pointing + * to a sorted duplicate set. + */ +#define SORTED_SET(jc, n) ((jc)->j_curslist[(n)]->dbp->dup_compare != NULL) + +/* + * This is the duplicate-assisted join functionality. Right now we're + * going to write it such that we return one item at a time, although + * I think we may need to optimize it to return them all at once. + * It should be easier to get it working this way, and I believe that + * changing it should be fairly straightforward. + * + * We optimize the join by sorting cursors from smallest to largest + * cardinality. In most cases, this is indeed optimal. However, if + * a cursor with large cardinality has very few data in common with the + * first cursor, it is possible that the join will be made faster by + * putting it earlier in the cursor list. Since we have no way to detect + * cases like this, we simply provide a flag, DB_JOIN_NOSORT, which retains + * the sort order specified by the caller, who may know more about the + * structure of the data. + * + * The first cursor moves sequentially through the duplicate set while + * the others search explicitly for the duplicate in question. 
+ * + */ + +/* + * __db_join -- + * This is the interface to the duplicate-assisted join functionality. + * In the same way that cursors mark a position in a database, a cursor + * can mark a position in a join. While most cursors are created by the + * cursor method of a DB, join cursors are created through an explicit + * call to DB->join. + * + * The curslist is an array of existing, initialized cursors and primary + * is the DB of the primary file. The data item that joins all the + * cursors in the curslist is used as the key into the primary and that + * key and data are returned. When no more items are left in the join + * set, the c_next operation off the join cursor will return DB_NOTFOUND. + * + * PUBLIC: int __db_join __P((DB *, DBC **, DBC **, u_int32_t)); + */ +int +__db_join(primary, curslist, dbcp, flags) + DB *primary; + DBC **curslist, **dbcp; + u_int32_t flags; +{ + DBC *dbc; + ENV *env; + JOIN_CURSOR *jc; + size_t ncurs, nslots; + u_int32_t i; + int ret; + + env = primary->env; + dbc = NULL; + jc = NULL; + + if ((ret = __os_calloc(env, 1, sizeof(DBC), &dbc)) != 0) + goto err; + + if ((ret = __os_calloc(env, 1, sizeof(JOIN_CURSOR), &jc)) != 0) + goto err; + + if ((ret = __os_malloc(env, 256, &jc->j_key.data)) != 0) + goto err; + jc->j_key.ulen = 256; + F_SET(&jc->j_key, DB_DBT_USERMEM); + + F_SET(&jc->j_rdata, DB_DBT_REALLOC); + + for (jc->j_curslist = curslist; + *jc->j_curslist != NULL; jc->j_curslist++) + ; + + /* + * The number of cursor slots we allocate is one greater than + * the number of cursors involved in the join, because the + * list is NULL-terminated. + */ + ncurs = (size_t)(jc->j_curslist - curslist); + nslots = ncurs + 1; + + /* + * !!! -- A note on the various lists hanging off jc. + * + * j_curslist is the initial NULL-terminated list of cursors passed + * into __db_join. 
The original cursors are not modified; pristine + * copies are required because, in databases with unsorted dups, we + * must reset all of the secondary cursors after the first each + * time the first one is incremented, or else we will lose data + * which happen to be sorted differently in two different cursors. + * + * j_workcurs is where we put those copies that we're planning to + * work with. They're lazily c_dup'ed from j_curslist as we need + * them, and closed when the join cursor is closed or when we need + * to reset them to their original values (in which case we just + * c_dup afresh). + * + * j_fdupcurs is an array of cursors which point to the first + * duplicate in the duplicate set that contains the data value + * we're currently interested in. We need this to make + * __db_join_get correctly return duplicate duplicates; i.e., if a + * given data value occurs twice in the set belonging to cursor #2, + * and thrice in the set belonging to cursor #3, and once in all + * the other cursors, successive calls to __db_join_get need to + * return that data item six times. To make this happen, each time + * cursor N is allowed to advance to a new datum, all cursors M + * such that M > N have to be reset to the first duplicate with + * that datum, so __db_join_get will return all the dup-dups again. + * We could just reset them to the original cursor from j_curslist, + * but that would be a bit slower in the unsorted case and a LOT + * slower in the sorted one. + * + * j_exhausted is a list of boolean values which represent + * whether or not their corresponding cursors are "exhausted", + * i.e. whether the datum under the corresponding cursor has + * been found not to exist in any unreturned combinations of + * later secondary cursors, in which case they are ready to be + * incremented. + */ + + /* We don't want to free regions whose callocs have failed. 
*/ + jc->j_curslist = NULL; + jc->j_workcurs = NULL; + jc->j_fdupcurs = NULL; + jc->j_exhausted = NULL; + + if ((ret = __os_calloc(env, nslots, sizeof(DBC *), + &jc->j_curslist)) != 0) + goto err; + if ((ret = __os_calloc(env, nslots, sizeof(DBC *), + &jc->j_workcurs)) != 0) + goto err; + if ((ret = __os_calloc(env, nslots, sizeof(DBC *), + &jc->j_fdupcurs)) != 0) + goto err; + if ((ret = __os_calloc(env, nslots, sizeof(u_int8_t), + &jc->j_exhausted)) != 0) + goto err; + for (i = 0; curslist[i] != NULL; i++) { + jc->j_curslist[i] = curslist[i]; + jc->j_workcurs[i] = NULL; + jc->j_fdupcurs[i] = NULL; + jc->j_exhausted[i] = 0; + } + jc->j_ncurs = (u_int32_t)ncurs; + + /* + * If DB_JOIN_NOSORT is not set, optimize secondary cursors by + * sorting in order of increasing cardinality. + */ + if (!LF_ISSET(DB_JOIN_NOSORT)) + qsort(jc->j_curslist, ncurs, sizeof(DBC *), __db_join_cmp); + + /* + * We never need to reset the 0th cursor, so there's no + * solid reason to use workcurs[0] rather than curslist[0] in + * join_get. Nonetheless, it feels cleaner to do it for symmetry, + * and this is the most logical place to copy it. + * + * !!! + * There's no need to close the new cursor if we goto err only + * because this is the last thing that can fail. Modifier of this + * function beware! + */ + if ((ret = + __dbc_dup(jc->j_curslist[0], jc->j_workcurs, DB_POSITION)) != 0) + goto err; + + dbc->close = dbc->c_close = __db_join_close_pp; + dbc->del = dbc->c_del = __db_join_del; + dbc->get = dbc->c_get = __db_join_get_pp; + dbc->put = dbc->c_put = __db_join_put; + dbc->internal = (DBC_INTERNAL *)jc; + dbc->dbp = primary; + jc->j_primary = primary; + + /* Stash the first cursor's transaction here for easy access. 
*/ + dbc->txn = curslist[0]->txn; + + *dbcp = dbc; + + MUTEX_LOCK(env, primary->mutex); + TAILQ_INSERT_TAIL(&primary->join_queue, dbc, links); + MUTEX_UNLOCK(env, primary->mutex); + + return (0); + +err: if (jc != NULL) { + if (jc->j_curslist != NULL) + __os_free(env, jc->j_curslist); + if (jc->j_workcurs != NULL) { + if (jc->j_workcurs[0] != NULL) + (void)__dbc_close(jc->j_workcurs[0]); + __os_free(env, jc->j_workcurs); + } + if (jc->j_fdupcurs != NULL) + __os_free(env, jc->j_fdupcurs); + if (jc->j_exhausted != NULL) + __os_free(env, jc->j_exhausted); + __os_free(env, jc); + } + if (dbc != NULL) + __os_free(env, dbc); + return (ret); +} + +/* + * __db_join_close_pp -- + * DBC->close pre/post processing for join cursors. + */ +static int +__db_join_close_pp(dbc) + DBC *dbc; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + + ENV_ENTER(env, ip); + + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, dbc->txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + ret = __db_join_close(dbc); + + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + +err: ENV_LEAVE(env, ip); + return (ret); +} + +static int +__db_join_put(dbc, key, data, flags) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t flags; +{ + COMPQUIET(dbc, NULL); + COMPQUIET(key, NULL); + COMPQUIET(data, NULL); + COMPQUIET(flags, 0); + return (EINVAL); +} + +static int +__db_join_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + COMPQUIET(dbc, NULL); + COMPQUIET(flags, 0); + return (EINVAL); +} + +/* + * __db_join_get_pp -- + * DBjoin->get pre/post processing. + */ +static int +__db_join_get_pp(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + u_int32_t handle_check, save_flags; + int ret, t_ret; + + dbp = dbc->dbp; + env = dbp->env; + + /* Save the original flags value. 
 */
+	save_flags = flags;
+
+	if (LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW)) {
+		if (!LOCKING_ON(env))
+			return (__db_fnl(env, "DBC->get"));
+		LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW);
+	}
+
+	/* Only a plain get or DB_JOIN_ITEM is legal on a join cursor. */
+	switch (flags) {
+	case 0:
+	case DB_JOIN_ITEM:
+		break;
+	default:
+		return (__db_ferr(env, "DBC->get", 0));
+	}
+
+	/*
+	 * A partial get of the key of a join cursor doesn't make much sense;
+	 * the entire key is necessary to query the primary database
+	 * and find the datum, and so regardless of the size of the key
+	 * it would not be a performance improvement.  Since it would require
+	 * special handling, we simply disallow it.
+	 *
+	 * A partial get of the data, however, potentially makes sense (if
+	 * all possible data are a predictable large structure, for instance)
+	 * and causes us no headaches, so we permit it.
+	 */
+	if (F_ISSET(key, DB_DBT_PARTIAL)) {
+		__db_errx(env,
+		    "DB_DBT_PARTIAL may not be set on key during join_get");
+		return (EINVAL);
+	}
+
+	ENV_ENTER(env, ip);
+
+	/* Acquire the replication block, if the environment is replicated. */
+	handle_check = IS_ENV_REPLICATED(env);
+	if (handle_check &&
+	    (ret = __db_rep_enter(dbp, 1, 0, dbc->txn != NULL)) != 0) {
+		handle_check = 0;
+		goto err;
+	}
+
+	/* Restore the original flags value. */
+	flags = save_flags;
+
+	ret = __db_join_get(dbc, key, data, flags);
+
+	/* Release the replication block. */
+	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
+		ret = t_ret;
+
+err:	ENV_LEAVE(env, ip);
+	__dbt_userfree(env, key, NULL, NULL);
+	return (ret);
+}
+
+/*
+ * __db_join_get --
+ *	Worker for DBjoin->get: advance the secondary cursors to the next
+ *	datum common to all of them and, unless DB_JOIN_ITEM was specified,
+ *	use that datum as a key to fetch the corresponding primary record.
+ */
+static int
+__db_join_get(dbc, key_arg, data_arg, flags)
+	DBC *dbc;
+	DBT *key_arg, *data_arg;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DBC *cp;
+	DBT *key_n, key_n_mem;
+	ENV *env;
+	JOIN_CURSOR *jc;
+	int db_manage_data, ret;
+	u_int32_t i, j, operation, opmods;
+
+	dbp = dbc->dbp;
+	env = dbp->env;
+	jc = (JOIN_CURSOR *)dbc->internal;
+
+	operation = LF_ISSET(DB_OPFLAGS_MASK);
+
+	/* !!!
+	 * If the set of flags here changes, check that __db_join_primget
+	 * is updated to handle them properly.
+ */ + opmods = LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW); + + /* + * Since we are fetching the key as a datum in the secondary indices, + * we must be careful of caller-specified DB_DBT_* memory + * management flags. If necessary, use a stack-allocated DBT; + * we'll appropriately copy and/or allocate the data later. + */ + if (F_ISSET(key_arg, + DB_DBT_MALLOC | DB_DBT_USERCOPY | DB_DBT_USERMEM)) { + /* We just use the default buffer; no need to go malloc. */ + key_n = &key_n_mem; + memset(key_n, 0, sizeof(DBT)); + } else { + /* + * Either DB_DBT_REALLOC or the default buffer will work + * fine if we have to reuse it, as we do. + */ + key_n = key_arg; + } + if (F_ISSET(key_arg, DB_DBT_USERCOPY)) + key_arg->data = NULL; + + /* + * If our last attempt to do a get on the primary key failed, + * short-circuit the join and try again with the same key. + */ + if (F_ISSET(jc, JOIN_RETRY)) + goto samekey; + F_CLR(jc, JOIN_RETRY); + +retry: ret = __dbc_get(jc->j_workcurs[0], &jc->j_key, key_n, + opmods | (jc->j_exhausted[0] ? DB_NEXT_DUP : DB_CURRENT)); + + if (ret == DB_BUFFER_SMALL) { + jc->j_key.ulen <<= 1; + if ((ret = __os_realloc(env, + jc->j_key.ulen, &jc->j_key.data)) != 0) + goto mem_err; + goto retry; + } + + /* + * If ret == DB_NOTFOUND, we're out of elements of the first + * secondary cursor. This is how we finally finish the join + * if all goes well. + */ + if (ret != 0) + goto err; + + /* + * If jc->j_exhausted[0] == 1, we've just advanced the first cursor, + * and we're going to want to advance all the cursors that point to + * the first member of a duplicate duplicate set (j_fdupcurs[1..N]). + * Close all the cursors in j_fdupcurs; we'll reopen them the + * first time through the upcoming loop. + */ + for (i = 1; i < jc->j_ncurs; i++) { + if (jc->j_fdupcurs[i] != NULL && + (ret = __dbc_close(jc->j_fdupcurs[i])) != 0) + goto err; + jc->j_fdupcurs[i] = NULL; + } + + /* + * If jc->j_curslist[1] == NULL, we have only one cursor in the join. 
+ * Thus, we can safely increment that one cursor on each call + * to __db_join_get, and we signal this by setting jc->j_exhausted[0] + * right away. + * + * Otherwise, reset jc->j_exhausted[0] to 0, so that we don't + * increment it until we know we're ready to. + */ + if (jc->j_curslist[1] == NULL) + jc->j_exhausted[0] = 1; + else + jc->j_exhausted[0] = 0; + + /* We have the first element; now look for it in the other cursors. */ + for (i = 1; i < jc->j_ncurs; i++) { + DB_ASSERT(env, jc->j_curslist[i] != NULL); + if (jc->j_workcurs[i] == NULL) + /* If this is NULL, we need to dup curslist into it. */ + if ((ret = __dbc_dup(jc->j_curslist[i], + &jc->j_workcurs[i], DB_POSITION)) != 0) + goto err; + +retry2: cp = jc->j_workcurs[i]; + + if ((ret = __db_join_getnext(cp, &jc->j_key, key_n, + jc->j_exhausted[i], opmods)) == DB_NOTFOUND) { + /* + * jc->j_workcurs[i] has no more of the datum we're + * interested in. Go back one cursor and get + * a new dup. We can't just move to a new + * element of the outer relation, because that way + * we might miss duplicate duplicates in cursor i-1. + * + * If this takes us back to the first cursor, + * -then- we can move to a new element of the outer + * relation. + */ + --i; + jc->j_exhausted[i] = 1; + + if (i == 0) { + for (j = 1; jc->j_workcurs[j] != NULL; j++) { + /* + * We're moving to a new element of + * the first secondary cursor. If + * that cursor is sorted, then any + * other sorted cursors can be safely + * reset to the first duplicate + * duplicate in the current set if we + * have a pointer to it (we can't just + * leave them be, or we'll miss + * duplicate duplicates in the outer + * relation). + * + * If the first cursor is unsorted, or + * if cursor j is unsorted, we can + * make no assumptions about what + * we're looking for next or where it + * will be, so we reset to the very + * beginning (setting workcurs NULL + * will achieve this next go-round). 
+ * + * XXX: This is likely to break + * horribly if any two cursors are + * both sorted, but have different + * specified sort functions. For, + * now, we dismiss this as pathology + * and let strange things happen--we + * can't make rope childproof. + */ + if ((ret = __dbc_close( + jc->j_workcurs[j])) != 0) + goto err; + if (!SORTED_SET(jc, 0) || + !SORTED_SET(jc, j) || + jc->j_fdupcurs[j] == NULL) + /* + * Unsafe conditions; + * reset fully. + */ + jc->j_workcurs[j] = NULL; + else + /* Partial reset suffices. */ + if ((__dbc_dup( + jc->j_fdupcurs[j], + &jc->j_workcurs[j], + DB_POSITION)) != 0) + goto err; + jc->j_exhausted[j] = 0; + } + goto retry; + /* NOTREACHED */ + } + + /* + * We're about to advance the cursor and need to + * reset all of the workcurs[j] where j>i, so that + * we don't miss any duplicate duplicates. + */ + for (j = i + 1; + jc->j_workcurs[j] != NULL; + j++) { + if ((ret = + __dbc_close(jc->j_workcurs[j])) != 0) + goto err; + jc->j_exhausted[j] = 0; + if (jc->j_fdupcurs[j] == NULL) + jc->j_workcurs[j] = NULL; + else if ((ret = __dbc_dup(jc->j_fdupcurs[j], + &jc->j_workcurs[j], DB_POSITION)) != 0) + goto err; + } + goto retry2; + /* NOTREACHED */ + } + + if (ret == DB_BUFFER_SMALL) { + jc->j_key.ulen <<= 1; + if ((ret = __os_realloc(env, jc->j_key.ulen, + &jc->j_key.data)) != 0) { +mem_err: __db_errx(env, + "Allocation failed for join key, len = %lu", + (u_long)jc->j_key.ulen); + goto err; + } + goto retry2; + } + + if (ret != 0) + goto err; + + /* + * If we made it this far, we've found a matching + * datum in cursor i. Mark the current cursor + * unexhausted, so we don't miss any duplicate + * duplicates the next go-round--unless this is the + * very last cursor, in which case there are none to + * miss, and we'll need that exhausted flag to finally + * get a DB_NOTFOUND and move on to the next datum in + * the outermost cursor. 
+ */ + if (i + 1 != jc->j_ncurs) + jc->j_exhausted[i] = 0; + else + jc->j_exhausted[i] = 1; + + /* + * If jc->j_fdupcurs[i] is NULL and the ith cursor's dups are + * sorted, then we're here for the first time since advancing + * cursor 0, and we have a new datum of interest. + * jc->j_workcurs[i] points to the beginning of a set of + * duplicate duplicates; store this into jc->j_fdupcurs[i]. + */ + if (SORTED_SET(jc, i) && jc->j_fdupcurs[i] == NULL && (ret = + __dbc_dup(cp, &jc->j_fdupcurs[i], DB_POSITION)) != 0) + goto err; + } + +err: if (ret != 0) + return (ret); + + if (0) { +samekey: /* + * Get the key we tried and failed to return last time; + * it should be the current datum of all the secondary cursors. + */ + if ((ret = __dbc_get(jc->j_workcurs[0], + &jc->j_key, key_n, DB_CURRENT | opmods)) != 0) + return (ret); + F_CLR(jc, JOIN_RETRY); + } + + /* + * ret == 0; we have a key to return. + * + * If DB_DBT_USERMEM or DB_DBT_MALLOC is set, we need to copy the key + * back into the dbt we were given for the key; call __db_retcopy. + * Otherwise, assert that we do not need to copy anything and proceed. + */ + DB_ASSERT(env, F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC | + DB_DBT_USERCOPY) || key_n == key_arg); + + if ((F_ISSET(key_arg, DB_DBT_USERMEM | DB_DBT_MALLOC | + DB_DBT_USERCOPY)) && + (ret = __db_retcopy(env, + key_arg, key_n->data, key_n->size, NULL, NULL)) != 0) { + /* + * The retcopy failed, most commonly because we have a user + * buffer for the key which is too small. Set things up to + * retry next time, and return. + */ + F_SET(jc, JOIN_RETRY); + return (ret); + } + + /* + * If DB_JOIN_ITEM is set, we return it; otherwise we do the lookup + * in the primary and then return. 
+ */ + if (operation == DB_JOIN_ITEM) + return (0); + + /* + * If data_arg->flags == 0--that is, if DB is managing the + * data DBT's memory--it's not safe to just pass the DBT + * through to the primary get call, since we don't want that + * memory to belong to the primary DB handle (and if the primary + * is free-threaded, it can't anyway). + * + * Instead, use memory that is managed by the join cursor, in + * jc->j_rdata. + */ + if (!F_ISSET(data_arg, DB_DBT_MALLOC | DB_DBT_REALLOC | + DB_DBT_USERMEM | DB_DBT_USERCOPY)) + db_manage_data = 1; + else + db_manage_data = 0; + if ((ret = __db_join_primget(jc->j_primary, dbc->thread_info, + jc->j_curslist[0]->txn, jc->j_curslist[0]->locker, key_n, + db_manage_data ? &jc->j_rdata : data_arg, opmods)) != 0) { + if (ret == DB_NOTFOUND) { + if (LF_ISSET(DB_READ_UNCOMMITTED) || + (jc->j_curslist[0]->txn != NULL && F_ISSET( + jc->j_curslist[0]->txn, TXN_READ_UNCOMMITTED))) + goto retry; + /* + * If ret == DB_NOTFOUND, the primary and secondary + * are out of sync; every item in each secondary + * should correspond to something in the primary, + * or we shouldn't have done the join this way. + * Wail. + */ + ret = __db_secondary_corrupt(jc->j_primary); + } else + /* + * The get on the primary failed for some other + * reason, most commonly because we're using a user + * buffer that's not big enough. Flag our failure + * so we can return the same key next time. + */ + F_SET(jc, JOIN_RETRY); + } + if (db_manage_data && ret == 0) { + data_arg->data = jc->j_rdata.data; + data_arg->size = jc->j_rdata.size; + } + + return (ret); +} + +/* + * __db_join_close -- + * DBC->close for join cursors. + * + * PUBLIC: int __db_join_close __P((DBC *)); + */ +int +__db_join_close(dbc) + DBC *dbc; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + JOIN_CURSOR *jc; + int ret, t_ret; + u_int32_t i; + + jc = (JOIN_CURSOR *)dbc->internal; + dbp = dbc->dbp; + env = dbp->env; + ret = t_ret = 0; + + /* + * Remove from active list of join cursors. 
Note that this + * must happen before any action that can fail and return, or else + * __db_close may loop indefinitely. + */ + MUTEX_LOCK(env, dbp->mutex); + TAILQ_REMOVE(&dbp->join_queue, dbc, links); + MUTEX_UNLOCK(env, dbp->mutex); + + ENV_ENTER(env, ip); + /* + * Close any open scratch cursors. In each case, there may + * not be as many outstanding as there are cursors in + * curslist, but we want to close whatever's there. + * + * If any close fails, there's no reason not to close everything else; + * we'll just return the error code of the last one to fail. There's + * not much the caller can do anyway, since these cursors only exist + * hanging off a db-internal data structure that they shouldn't be + * mucking with. + */ + for (i = 0; i < jc->j_ncurs; i++) { + if (jc->j_workcurs[i] != NULL && + (t_ret = __dbc_close(jc->j_workcurs[i])) != 0) + ret = t_ret; + if (jc->j_fdupcurs[i] != NULL && + (t_ret = __dbc_close(jc->j_fdupcurs[i])) != 0) + ret = t_ret; + } + ENV_LEAVE(env, ip); + + __os_free(env, jc->j_exhausted); + __os_free(env, jc->j_curslist); + __os_free(env, jc->j_workcurs); + __os_free(env, jc->j_fdupcurs); + __os_free(env, jc->j_key.data); + if (jc->j_rdata.data != NULL) + __os_ufree(env, jc->j_rdata.data); + __os_free(env, jc); + __os_free(env, dbc); + + return (ret); +} + +/* + * __db_join_getnext -- + * This function replaces the DBC_CONTINUE and DBC_KEYSET + * functionality inside the various cursor get routines. + * + * If exhausted == 0, we're not done with the current datum; + * return it if it matches "matching", otherwise search + * using DB_GET_BOTHC (which is faster than iteratively doing + * DB_NEXT_DUP) forward until we find one that does. + * + * If exhausted == 1, we are done with the current datum, so just + * leap forward to searching NEXT_DUPs. + * + * If no matching datum exists, returns DB_NOTFOUND, else 0. 
+ */ +static int +__db_join_getnext(dbc, key, data, exhausted, opmods) + DBC *dbc; + DBT *key, *data; + u_int32_t exhausted, opmods; +{ + int ret, cmp; + DB *dbp; + DBT ldata; + int (*func) __P((DB *, const DBT *, const DBT *)); + + dbp = dbc->dbp; + func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; + + switch (exhausted) { + case 0: + /* + * We don't want to step on data->data; use a new + * DBT and malloc so we don't step on dbc's rdata memory. + */ + memset(&ldata, 0, sizeof(DBT)); + F_SET(&ldata, DB_DBT_MALLOC); + if ((ret = __dbc_get(dbc, + key, &ldata, opmods | DB_CURRENT)) != 0) + break; + cmp = func(dbp, data, &ldata); + if (cmp == 0) { + /* + * We have to return the real data value. Copy + * it into data, then free the buffer we malloc'ed + * above. + */ + if ((ret = __db_retcopy(dbp->env, data, ldata.data, + ldata.size, &data->data, &data->size)) != 0) + return (ret); + __os_ufree(dbp->env, ldata.data); + return (0); + } + + /* + * Didn't match--we want to fall through and search future + * dups. We just forget about ldata and free + * its buffer--data contains the value we're searching for. + */ + __os_ufree(dbp->env, ldata.data); + /* FALLTHROUGH */ + case 1: + ret = __dbc_get(dbc, key, data, opmods | DB_GET_BOTHC); + break; + default: + ret = EINVAL; + break; + } + + return (ret); +} + +/* + * __db_join_cmp -- + * Comparison function for sorting DBCs in cardinality order. + */ +static int +__db_join_cmp(a, b) + const void *a, *b; +{ + DBC *dbca, *dbcb; + db_recno_t counta, countb; + + dbca = *((DBC * const *)a); + dbcb = *((DBC * const *)b); + + if (__dbc_count(dbca, &counta) != 0 || + __dbc_count(dbcb, &countb) != 0) + return (0); + + return ((long)counta - (long)countb); +} + +/* + * __db_join_primget -- + * Perform a DB->get in the primary, being careful not to use a new + * locker ID if we're doing CDB locking. 
+ */ +static int +__db_join_primget(dbp, ip, txn, locker, key, data, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DB_LOCKER *locker; + DBT *key, *data; + u_int32_t flags; +{ + DBC *dbc; + u_int32_t rmw; + int ret, t_ret; + + if ((ret = __db_cursor_int(dbp, ip, + txn, dbp->type, PGNO_INVALID, 0, locker, &dbc)) != 0) + return (ret); + + /* + * The only allowable flags here are the two flags copied into "opmods" + * in __db_join_get, DB_RMW and DB_READ_UNCOMMITTED. The former is an + * op on the c_get call, the latter on the cursor call. It's a DB bug + * if we allow any other flags down in here. + */ + rmw = LF_ISSET(DB_RMW); + if (LF_ISSET(DB_READ_UNCOMMITTED) || + (txn != NULL && F_ISSET(txn, TXN_READ_UNCOMMITTED))) + F_SET(dbc, DBC_READ_UNCOMMITTED); + + if (LF_ISSET(DB_READ_COMMITTED) || + (txn != NULL && F_ISSET(txn, TXN_READ_COMMITTED))) + F_SET(dbc, DBC_READ_COMMITTED); + + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED | DB_RMW); + DB_ASSERT(dbp->env, flags == 0); + + F_SET(dbc, DBC_TRANSIENT); + + /* + * This shouldn't be necessary, thanks to the fact that join cursors + * swap in their own DB_DBT_REALLOC'ed buffers, but just for form's + * sake, we mirror what __db_get does. + */ + SET_RET_MEM(dbc, dbp); + + ret = __dbc_get(dbc, key, data, DB_SET | rmw); + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_secondary_corrupt -- + * Report primary/secondary inconsistencies. + * + * PUBLIC: int __db_secondary_corrupt __P((DB *)); + */ +int +__db_secondary_corrupt(dbp) + DB *dbp; +{ + __db_err(dbp->env, DB_SECONDARY_BAD, "%s%s%s", + dbp->fname == NULL ? "unnamed" : dbp->fname, + dbp->dname == NULL ? "" : "/", + dbp->dname == NULL ? "" : dbp->dname); + return (DB_SECONDARY_BAD); +} diff --git a/db/db_meta.c b/db/db_meta.c new file mode 100644 index 0000000..ef42e44 --- /dev/null +++ b/db/db_meta.c @@ -0,0 +1,1299 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" +#include "dbinc/db_am.h" +#include "dbinc/hash.h" + +static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t)); +#ifdef HAVE_FTRUNCATE +static int __db_pglistcmp __P((const void *, const void *)); +static int __db_truncate_freelist __P((DBC *, DBMETA *, + PAGE *, db_pgno_t *, u_int32_t, u_int32_t)); +#endif + +/* + * __db_init_meta -- + * Helper function for __db_new that initializes the important fields in + * a meta-data page (used instead of P_INIT). We need to make sure that we + * retain the page number and LSN of the existing page. + */ +static void +__db_init_meta(dbp, p, pgno, pgtype) + DB *dbp; + void *p; + db_pgno_t pgno; + u_int32_t pgtype; +{ + DBMETA *meta; + DB_LSN save_lsn; + + meta = (DBMETA *)p; + save_lsn = meta->lsn; + memset(meta, 0, sizeof(DBMETA)); + meta->lsn = save_lsn; + meta->pagesize = dbp->pgsize; + if (F_ISSET(dbp, DB_AM_CHKSUM)) + FLD_SET(meta->metaflags, DBMETA_CHKSUM); + meta->pgno = pgno; + meta->type = (u_int8_t)pgtype; +} + +/* + * __db_new -- + * Get a new page, preferably from the freelist. 
+ * + * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **)); + */ +int +__db_new(dbc, type, lockp, pagepp) + DBC *dbc; + u_int32_t type; + DB_LOCK *lockp; + PAGE **pagepp; +{ + DB *dbp; + DBMETA *meta; + DB_LOCK metalock; + DB_LSN lsn; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + db_pgno_t last, *list, pgno, newnext; + int extend, hash, ret, t_ret; + + meta = NULL; + dbp = dbc->dbp; + env = dbp->env; + mpf = dbp->mpf; + h = NULL; + newnext = PGNO_INVALID; + if (lockp != NULL) + LOCK_INIT(*lockp); + + hash = 0; + ret = 0; + LOCK_INIT(metalock); + +#ifdef HAVE_HASH + if (dbp->type == DB_HASH) { + if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0) + goto err; + if (meta != NULL) + hash = 1; + } +#endif + if (meta == NULL) { + pgno = PGNO_BASE_MD; + if ((ret = __db_lget(dbc, + LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, + DB_MPOOL_DIRTY, &meta)) != 0) + goto err; + } + + last = meta->last_pgno; + if (meta->free == PGNO_INVALID) { + if (FLD_ISSET(type, P_DONTEXTEND)) { + *pagepp = NULL; + goto err; + } + last = pgno = meta->last_pgno + 1; + ZERO_LSN(lsn); + extend = 1; + } else { + pgno = meta->free; + /* + * Lock the new page. Do this here because we must do it + * before getting the page and the caller may need the lock + * to keep readers from seeing the page before the transaction + * commits. We can do this because no one will hold a free + * page locked. + */ + if (lockp != NULL && (ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, + DB_MPOOL_DIRTY, &h)) != 0) + goto err; + + /* + * We want to take the first page off the free list and + * then set meta->free to the that page's next_pgno, but + * we need to log the change first. 
+ */ + newnext = h->next_pgno; + lsn = h->lsn; + extend = 0; + DB_ASSERT(env, TYPE(h) == P_INVALID); + + if (TYPE(h) != P_INVALID) { + __db_errx(env, + "%s page %lu is on free list with type %lu", + dbp->fname, (u_long)PGNO(h), (u_long)TYPE(h)); + return (__env_panic(env, EINVAL)); + } + + } + + FLD_CLR(type, P_DONTEXTEND); + + /* + * Log the allocation before fetching the new page. If we + * don't have room in the log then we don't want to tell + * mpool to extend the file. + */ + if (DBC_LOGGING(dbc)) { + if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0, + &LSN(meta), PGNO_BASE_MD, &lsn, + pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(meta)); + + meta->free = newnext; + + if (extend == 1) { + if (lockp != NULL && (ret = + __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, + DB_MPOOL_NEW, &h)) != 0) + goto err; + DB_ASSERT(env, last == pgno); + meta->last_pgno = pgno; + ZERO_LSN(h->lsn); + h->pgno = pgno; + } + LSN(h) = LSN(meta); + + if (hash == 0) + ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority); + meta = NULL; + if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err; + + switch (type) { + case P_BTREEMETA: + case P_HASHMETA: + case P_QAMMETA: + __db_init_meta(dbp, h, h->pgno, type); + break; + default: + P_INIT(h, dbp->pgsize, + h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type); + break; + } + + /* Fix up the sorted free list if necessary. 
*/ +#ifdef HAVE_FTRUNCATE + if (extend == 0) { + u_int32_t nelems = 0; + + if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0) + goto err; + if (nelems != 0) { + DB_ASSERT(env, h->pgno == list[0]); + memmove(list, &list[1], (nelems - 1) * sizeof(*list)); + if ((ret = __memp_extend_freelist( + dbp->mpf, nelems - 1, &list)) != 0) + goto err; + } + } +#else + COMPQUIET(list, NULL); +#endif + + *pagepp = h; + return (0); + +err: if (h != NULL) + (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority); + if (meta != NULL && hash == 0) + (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority); + (void)__TLPUT(dbc, metalock); + if (lockp != NULL) + (void)__LPUT(dbc, *lockp); + return (ret); +} + +/* + * __db_free -- + * Add a page to the head of the freelist. + * + * PUBLIC: int __db_free __P((DBC *, PAGE *)); + */ +int +__db_free(dbc, h) + DBC *dbc; + PAGE *h; +{ + DB *dbp; + DBMETA *meta; + DBT ddbt, ldbt; + DB_LOCK metalock; + DB_LSN *lsnp; + DB_MPOOLFILE *mpf; + PAGE *prev; + db_pgno_t last_pgno, next_pgno, pgno, prev_pgno; + u_int32_t lflag; + int hash, ret, t_ret; +#ifdef HAVE_FTRUNCATE + db_pgno_t *list, *lp; + u_int32_t nelem, position, start; + int do_truncate; +#endif + + dbp = dbc->dbp; + mpf = dbp->mpf; + prev_pgno = PGNO_INVALID; + meta = NULL; + prev = NULL; + LOCK_INIT(metalock); +#ifdef HAVE_FTRUNCATE + lp = NULL; + nelem = 0; + do_truncate = 0; +#endif + + /* + * Retrieve the metadata page. If we are not keeping a sorted + * free list put the page at the head of the the free list. + * If we are keeping a sorted free list, for truncation, + * then figure out where this page belongs and either + * link it in or truncate the file as much as possible. + * If either the lock get or page get routines + * fail, then we need to put the page with which we were called + * back because our caller assumes we take care of it. 
+ */ + hash = 0; + + pgno = PGNO_BASE_MD; +#ifdef HAVE_HASH + if (dbp->type == DB_HASH) { + if ((ret = __ham_return_meta(dbc, +#ifdef HAVE_FTRUNCATE + 0, +#else + DB_MPOOL_DIRTY, +#endif + &meta)) != 0) + goto err; + if (meta != NULL) + hash = 1; + } +#endif + if (meta == NULL) { + if ((ret = __db_lget(dbc, + LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + + /* If we support truncate, we might not dirty the meta page. */ + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, +#ifdef HAVE_FTRUNCATE + 0, +#else + DB_MPOOL_DIRTY, +#endif + &meta)) != 0) + goto err1; + } + + last_pgno = meta->last_pgno; + next_pgno = meta->free; + /* + * Assign lsnp here so it always initialized when + * HAVE_FTRUNCATE is not defined. + */ + lsnp = &LSN(meta); + + DB_ASSERT(dbp->env, h->pgno != next_pgno); + +#ifdef HAVE_FTRUNCATE + /* + * If we are maintaining a sorted free list see if we either have a + * new truncation point or the page goes somewhere in the middle of + * the list. If it goes in the middle of the list, we will drop the + * meta page and get the previous page. + */ + if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0) + goto err1; + if (list == NULL) + goto no_sort; + + if (h->pgno != last_pgno) { + /* + * Put the page number in the sorted list. + * Finds its position and the previous page, + * extend the list, make room and insert. + */ + position = 0; + if (nelem != 0) { + __db_freelist_pos(h->pgno, list, nelem, &position); + + DB_ASSERT(dbp->env, h->pgno != list[position]); + + /* Get the previous page if this is not the smallest. */ + if (position != 0 || h->pgno > list[0]) + prev_pgno = list[position]; + } + + } else if (nelem != 0) { + /* Find the truncation point. 
*/ + for (lp = &list[nelem - 1]; lp >= list; lp--) + if (--last_pgno != *lp) + break; + if (lp < list || last_pgno < h->pgno - 1) + do_truncate = 1; + last_pgno = meta->last_pgno; + } + +no_sort: + if (prev_pgno == PGNO_INVALID) { +#ifdef HAVE_HASH + if (hash) { + if ((ret = + __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0) + goto err1; + } else +#endif + if ((ret = __memp_dirty(mpf, + &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err1; + lsnp = &LSN(meta); + } else { + pgno = prev_pgno; + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0) + goto err1; + next_pgno = NEXT_PGNO(prev); + lsnp = &LSN(prev); + } +#endif + + /* + * Log the change. + * We are either logging an update to the metapage or to the + * previous page in the sorted list. + */ + if (DBC_LOGGING(dbc)) { + memset(&ldbt, 0, sizeof(ldbt)); + ldbt.data = h; + ldbt.size = P_OVERHEAD(dbp); + /* + * If we are truncating the file, we need to make sure + * the logging happens before the truncation. If we + * are truncating multiple pages we don't need to flush the + * log here as it will be flushed by __db_truncate_freelist. + * If we are zeroing pages rather than truncating we still + * need to flush since they will not have valid LSNs. 
+ */ + lflag = 0; + + if (h->pgno == last_pgno +#ifdef HAVE_FTRUNCATE + && do_truncate == 0 +#endif + ) + lflag = DB_FLUSH; + switch (h->type) { + case P_HASH: + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LRECNO: + case P_LDUP: + if (h->entries > 0) { + ldbt.size += h->entries * sizeof(db_indx_t); + ddbt.data = (u_int8_t *)h + HOFFSET(h); + ddbt.size = dbp->pgsize - HOFFSET(h); + if ((ret = __db_pg_freedata_log(dbp, dbc->txn, + lsnp, lflag, + h->pgno, lsnp, pgno, + &ldbt, next_pgno, last_pgno, &ddbt)) != 0) + goto err1; + goto logged; + } + break; + case P_HASHMETA: + ldbt.size = sizeof(HMETA); + break; + case P_BTREEMETA: + ldbt.size = sizeof(BTMETA); + break; + case P_OVERFLOW: + ldbt.size += OV_LEN(h); + break; + default: + DB_ASSERT(dbp->env, h->type != P_QAMDATA); + } + + if ((ret = __db_pg_free_log(dbp, + dbc->txn, lsnp, lflag, h->pgno, + lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0) + goto err1; + } else + LSN_NOT_LOGGED(*lsnp); + +logged: +#ifdef HAVE_FTRUNCATE + if (do_truncate) { + start = (u_int32_t) (lp - list) + 1; + meta->last_pgno--; + ret = __db_truncate_freelist( + dbc, meta, h, list, start, nelem); + h = NULL; + } else +#endif + if (h->pgno == last_pgno) { + /* + * We are going to throw this page away, but if we are + * using MVCC then this version may stick around and we + * might have to make a copy. + */ + if (mpf->mfp->multiversion && (ret = __memp_dirty(mpf, + &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err1; + LSN(h) = *lsnp; + P_INIT(h, dbp->pgsize, + h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID); + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0) + goto err1; + h = NULL; + /* Give the page back to the OS. 
*/ + if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info, + last_pgno, 0)) != 0) + goto err1; + DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD); + meta->last_pgno--; + h = NULL; + } else { +#ifdef HAVE_FTRUNCATE + if (list != NULL) { + /* Put the page number into the list. */ + if ((ret = + __memp_extend_freelist(mpf, nelem + 1, &list)) != 0) + goto err1; + if (prev_pgno != PGNO_INVALID) + lp = &list[position + 1]; + else + lp = list; + if (nelem != 0 && position != nelem) + memmove(lp + 1, lp, (size_t) + ((u_int8_t*)&list[nelem] - (u_int8_t*)lp)); + *lp = h->pgno; + } +#endif + /* + * If we are not truncating the page then we + * reinitialize it and put it at the head of + * the free list. + */ + if ((ret = __memp_dirty(mpf, + &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err1; + LSN(h) = *lsnp; + P_INIT(h, dbp->pgsize, + h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID); +#ifdef DIAGNOSTIC + memset((u_int8_t *) h + P_OVERHEAD(dbp), + CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp)); +#endif + if (prev_pgno == PGNO_INVALID) + meta->free = h->pgno; + else + NEXT_PGNO(prev) = h->pgno; + } + + /* Discard the metadata or previous page. */ +err1: if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + /* Discard the caller's page reference. */ +err: if (h != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + + /* + * XXX + * We have to unlock the caller's page in the caller! + */ + return (ret); +} + +#ifdef HAVE_FTRUNCATE +/* + * __db_freelist_pos -- find the position of a page in the freelist. + * The list is sorted, we do a binary search. 
+ * + * PUBLIC: #ifdef HAVE_FTRUNCATE + * PUBLIC: void __db_freelist_pos __P((db_pgno_t, + * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *)); + * PUBLIC: #endif + */ +void +__db_freelist_pos(pgno, list, nelem, posp) + db_pgno_t pgno; + db_pgno_t *list; + u_int32_t nelem; + u_int32_t *posp; +{ + u_int32_t base, indx, lim; + + indx = 0; + for (base = 0, lim = nelem; lim != 0; lim >>= 1) { + indx = base + (lim >> 1); + if (pgno == list[indx]) { + *posp = indx; + return; + } + if (pgno > list[indx]) { + base = indx + 1; + --lim; + } + } + if (base != 0) + base--; + *posp = base; + return; +} + +static int +__db_pglistcmp(a, b) + const void *a, *b; +{ + db_pglist_t *ap, *bp; + + ap = (db_pglist_t *)a; + bp = (db_pglist_t *)b; + + return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0); +} + +/* + * __db_freelist_sort -- sort a list of free pages. + * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t)); + */ +void +__db_freelist_sort(list, nelems) + db_pglist_t *list; + u_int32_t nelems; +{ + qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp); +} + +/* + * __db_pg_truncate -- find the truncation point in a sorted freelist. 
+ * + * PUBLIC: #ifdef HAVE_FTRUNCATE + * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *, + * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *, + * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int)); + * PUBLIC: #endif + */ +int +__db_pg_truncate(dbc, txn, + list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery) + DBC *dbc; + DB_TXN *txn; + db_pglist_t *list; + DB_COMPACT *c_data; + u_int32_t *nelemp; + db_pgno_t free_pgno, *last_pgno; + DB_LSN *lsnp; + int in_recovery; +{ + DB *dbp; + DBT ddbt; + DB_LSN null_lsn; + DB_MPOOLFILE *mpf; + PAGE *h; + db_pglist_t *lp, *slp; + db_pgno_t lpgno, pgno; + u_int32_t elems, log_size, tpoint; + int last, ret; + + ret = 0; + h = NULL; + + dbp = dbc->dbp; + mpf = dbp->mpf; + elems = tpoint = *nelemp; + + /* + * Figure out what (if any) pages can be truncated immediately and + * record the place from which we can truncate, so we can do the + * memp_ftruncate below. We also use this to avoid ever putting + * these pages on the freelist, which we are about to relink. + */ + pgno = *last_pgno; + lp = &list[elems - 1]; + last = 1; + while (tpoint != 0) { + if (lp->pgno != pgno) + break; + pgno--; + tpoint--; + lp--; + } + + lp = list; + slp = &list[elems]; + /* + * Log the sorted list. We log the whole list so it can be rebuilt. + * Don't overflow the log file. + */ +again: if (DBC_LOGGING(dbc)) { + last = 1; + lpgno = *last_pgno; + ddbt.size = elems * sizeof(*lp); + ddbt.data = lp; + log_size = ((LOG *)dbc->env-> + lg_handle->reginfo.primary)->log_size; + if (ddbt.size > log_size / 2) { + elems = (log_size / 2) / sizeof(*lp); + ddbt.size = elems * sizeof(*lp); + last = 0; + /* + * If we stopped after the truncation point + * then we need to truncate from here. + */ + if (lp + elems >= &list[tpoint]) + lpgno = lp[elems - 1].pgno; + } + /* + * If this is not the begining of the list fetch the end + * of the previous segment. This page becomes the last_free + * page and will link to this segment if it is not truncated. 
+ */ + if (lp != list) { + if ((ret = __memp_fget(mpf, &lp[-1].pgno, + dbc->thread_info, txn, 0, &h)) != 0) + goto err; + } + + slp = &lp[elems]; + + ZERO_LSN(null_lsn); + if ((ret = __db_pg_trunc_log(dbp, dbc->txn, + lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD, + lsnp, h != NULL ? PGNO(h) : PGNO_INVALID, + h != NULL ? &LSN(h) : &null_lsn, + free_pgno, lpgno, &ddbt)) != 0) + goto err; + if (h != NULL) { + LSN(h) = *lsnp; + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + } + h = NULL; + } else if (!in_recovery) + LSN_NOT_LOGGED(*lsnp); + + for (; lp < slp && lp < &list[tpoint]; lp++) { + if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info, + txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) { + /* Page may have been truncated later. */ + if (in_recovery && ret == DB_PAGE_NOTFOUND) { + ret = 0; + continue; + } + goto err; + } + if (in_recovery) { + if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) { + if ((ret = __memp_dirty(mpf, &h, + dbc->thread_info, + txn, dbp->priority, 0)) != 0) { + (void)__memp_fput(mpf, + dbc->thread_info, h, dbp->priority); + goto err; + } + } else + goto skip; + } + + if (lp == &list[tpoint - 1]) + NEXT_PGNO(h) = PGNO_INVALID; + else + NEXT_PGNO(h) = lp[1].pgno; + DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno); + + LSN(h) = *lsnp; +skip: if ((ret = __memp_fput(mpf, + dbc->thread_info, h, dbp->priority)) != 0) + goto err; + h = NULL; + } + + /* + * If we did not log everything try again. We start from slp and + * try to go to the end of the list. + */ + if (last == 0) { + elems = (u_int32_t)(&list[*nelemp] - slp); + lp = slp; + goto again; + } + + /* + * Truncate the file. Its possible that the last page is the + * only one that got truncated and that's done in the caller. + */ + if (pgno != *last_pgno) { + if (tpoint != *nelemp && + (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info, + pgno + 1, in_recovery ? 
MP_TRUNC_RECOVER : 0)) != 0) + goto err; + if (c_data) + c_data->compact_pages_truncated += *last_pgno - pgno; + *last_pgno = pgno; + } + *nelemp = tpoint; + + if (0) { +err: if (h != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + } + return (ret); +} + +/* + * __db_free_truncate -- + * Build a sorted free list and truncate free pages at the end + * of the file. + * + * PUBLIC: #ifdef HAVE_FTRUNCATE + * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *, + * PUBLIC: db_pgno_t *)); + * PUBLIC: #endif + */ +int +__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + u_int32_t flags; + DB_COMPACT *c_data; + db_pglist_t **listp; + u_int32_t *nelemp; + db_pgno_t *last_pgnop; +{ + DBC *dbc; + DBMETA *meta; + DB_LOCK metalock; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + db_pglist_t *list, *lp; + db_pgno_t pgno; + u_int32_t nelems; + int ret, t_ret; + size_t size; + + COMPQUIET(flags, 0); + list = NULL; + meta = NULL; + env = dbp->env; + mpf = dbp->mpf; + h = NULL; + nelems = 0; + if (listp != NULL) { + *listp = NULL; + DB_ASSERT(env, nelemp != NULL); + *nelemp = 0; + } + + if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); + + pgno = PGNO_BASE_MD; + if ((ret = __db_lget(dbc, + LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0, + &meta)) != 0) + goto err; + + if (last_pgnop != NULL) + *last_pgnop = meta->last_pgno; + if ((pgno = meta->free) == PGNO_INVALID) + goto done; + + size = 128; + if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0) + goto err; + lp = list; + + do { + if (lp == &list[size]) { + size *= 2; + if ((ret = __os_realloc(env, + size * sizeof(*list), &list)) != 0) + goto err; + lp = &list[size / 2]; + } + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, 
dbc->txn, 0, &h)) != 0) + goto err; + + lp->pgno = pgno; + lp->next_pgno = NEXT_PGNO(h); + lp->lsn = LSN(h); + pgno = NEXT_PGNO(h); + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, dbc->priority)) != 0) + goto err; + lp++; + } while (pgno != PGNO_INVALID); + nelems = (u_int32_t)(lp - list); + + if ((ret = __memp_dirty(mpf, + &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) + goto err; + + /* Sort the list */ + __db_freelist_sort(list, nelems); + + if ((ret = __db_pg_truncate(dbc, txn, list, c_data, + &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0) + goto err; + + if (nelems == 0) + meta->free = PGNO_INVALID; + else + meta->free = list[0].pgno; + +done: if (last_pgnop != NULL) + *last_pgnop = meta->last_pgno; + + /* + * The truncate point is the number of pages in the free + * list back from the last page. The number of pages + * in the free list are the number that we can swap in. + */ + if (c_data) + c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems; + + if (nelems != 0 && listp != NULL) { + *listp = list; + *nelemp = nelems; + list = NULL; + } + +err: if (list != NULL) + __os_free(env, list); + if (meta != NULL && (t_ret = __memp_fput(mpf, + dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +static int +__db_truncate_freelist(dbc, meta, h, list, start, nelem) + DBC *dbc; + DBMETA *meta; + PAGE *h; + db_pgno_t *list; + u_int32_t start, nelem; +{ + DB *dbp; + DBT ddbt; + DB_LSN null_lsn; + DB_MPOOLFILE *mpf; + PAGE *last_free, *pg; + db_pgno_t *lp, free_pgno, lpgno; + db_pglist_t *plist, *pp, *spp; + u_int32_t elem, log_size; + int last, ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + plist = NULL; + last_free = NULL; + pg = NULL; + + if (start != 0 && + (ret = __memp_fget(mpf, &list[start - 1], + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, 
&last_free)) != 0) + goto err; + + if (DBC_LOGGING(dbc)) { + if ((ret = __os_malloc(dbp->env, + (nelem - start) * sizeof(*pp), &plist)) != 0) + goto err; + + pp = plist; + for (lp = &list[start]; lp < &list[nelem]; lp++) { + pp->pgno = *lp; + if ((ret = __memp_fget(mpf, lp, + dbc->thread_info, dbc->txn, 0, &pg)) != 0) + goto err; + pp->lsn = LSN(pg); + pp->next_pgno = NEXT_PGNO(pg); + if ((ret = __memp_fput(mpf, + dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0) + goto err; + pg = NULL; + pp++; + } + ZERO_LSN(null_lsn); + pp = plist; + elem = nelem - start; + log_size = ((LOG *)dbc->env-> + lg_handle->reginfo.primary)->log_size; +again: ddbt.data = spp = pp; + free_pgno = pp->pgno; + lpgno = meta->last_pgno; + ddbt.size = elem * sizeof(*pp); + if (ddbt.size > log_size / 2) { + elem = (log_size / 2) / (u_int32_t)sizeof(*pp); + ddbt.size = elem * sizeof(*pp); + pp += elem; + elem = (nelem - start) - (u_int32_t)(pp - plist); + lpgno = pp[-1].pgno; + last = 0; + } else + last = 1; + /* + * Get the page which will link to this section if we abort. + * If this is the first segment then its last_free. + */ + if (spp == plist) + pg = last_free; + else if ((ret = __memp_fget(mpf, &spp[-1].pgno, + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0) + goto err; + + if ((ret = __db_pg_trunc_log(dbp, dbc->txn, + &LSN(meta), last == 1 ? DB_FLUSH : 0, + PGNO(meta), &LSN(meta), + pg != NULL ? PGNO(pg) : PGNO_INVALID, + pg != NULL ? 
&LSN(pg) : &null_lsn, + free_pgno, lpgno, &ddbt)) != 0) + goto err; + if (pg != NULL) { + LSN(pg) = LSN(meta); + if (pg != last_free && (ret = __memp_fput(mpf, + dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0) + goto err; + pg = NULL; + } + if (last == 0) + goto again; + } else + LSN_NOT_LOGGED(LSN(meta)); + + if ((ret = __memp_fput(mpf, + dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0) + goto err; + h = NULL; + if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info, + list[start], 0)) != 0) + goto err; + meta->last_pgno = list[start] - 1; + + if (start == 0) + meta->free = PGNO_INVALID; + else { + NEXT_PGNO(last_free) = PGNO_INVALID; + if ((ret = __memp_fput(mpf, + dbc->thread_info, last_free, dbc->priority)) != 0) + goto err; + last_free = NULL; + } + + /* Shrink the number of elements in the list. */ + ret = __memp_extend_freelist(mpf, start, &list); + +err: if (plist != NULL) + __os_free(dbp->env, plist); + + /* We need to put the page on error. */ + if (h != NULL) + (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority); + if (pg != NULL && pg != last_free) + (void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority); + if (last_free != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, last_free, dbc->priority); + + return (ret); +} +#endif + +#ifdef DEBUG +/* + * __db_lprint -- + * Print out the list of locks currently held by a cursor. + * + * PUBLIC: int __db_lprint __P((DBC *)); + */ +int +__db_lprint(dbc) + DBC *dbc; +{ + DB *dbp; + DB_LOCKREQ req; + ENV *env; + + dbp = dbc->dbp; + env = dbp->env; + + if (LOCKING_ON(env)) { + req.op = DB_LOCK_DUMP; + (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL); + } + return (0); +} +#endif + +/* + * __db_lget -- + * The standard lock get call. 
+ * + * PUBLIC: int __db_lget __P((DBC *, + * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *)); + */ +int +__db_lget(dbc, action, pgno, mode, lkflags, lockp) + DBC *dbc; + int action; + db_pgno_t pgno; + db_lockmode_t mode; + u_int32_t lkflags; + DB_LOCK *lockp; +{ + DB *dbp; + DB_LOCKREQ couple[3], *reqp; + DB_TXN *txn; + ENV *env; + int has_timeout, i, ret; + + dbp = dbc->dbp; + env = dbp->env; + txn = dbc->txn; + + /* + * We do not always check if we're configured for locking before + * calling __db_lget to acquire the lock. + */ + if (CDB_LOCKING(env) || !LOCKING_ON(env) || + (MULTIVERSION(dbp) && mode == DB_LOCK_READ && + dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) || + F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) && + (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) || + (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) { + LOCK_INIT(*lockp); + return (0); + } + + dbc->lock.pgno = pgno; + if (lkflags & DB_LOCK_RECORD) + dbc->lock.type = DB_RECORD_LOCK; + else + dbc->lock.type = DB_PAGE_LOCK; + lkflags &= ~DB_LOCK_RECORD; + + /* + * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set, + * pass that along to the lock call. + */ + if (DB_NONBLOCK(dbc)) + lkflags |= DB_LOCK_NOWAIT; + + if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ) + mode = DB_LOCK_READ_UNCOMMITTED; + + has_timeout = F_ISSET(dbc, DBC_RECOVER) || + (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT)); + + /* + * Transactional locking. + * Hold on to the previous read lock only if we are in full isolation. + * COUPLE_ALWAYS indicates we are holding an interior node which need + * not be isolated. + * Downgrade write locks if we are supporting dirty readers. 
+ */ + if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) || + !LOCK_ISSET(*lockp)) + action = 0; + else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS) + action = LCK_COUPLE; + else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) && + lockp->mode == DB_LOCK_READ) + action = LCK_COUPLE; + else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED) + action = LCK_COUPLE; + else if (F_ISSET(dbc->dbp, + DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE) + action = LCK_DOWNGRADE; + else + action = 0; + + i = 0; + switch (action) { + default: + if (has_timeout) + goto do_couple; + ret = __lock_get(env, + dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp); + break; + + case LCK_DOWNGRADE: + couple[0].op = DB_LOCK_GET; + couple[0].obj = NULL; + couple[0].lock = *lockp; + couple[0].mode = DB_LOCK_WWRITE; + UMRW_SET(couple[0].timeout); + i++; + /* FALLTHROUGH */ + case LCK_COUPLE: +do_couple: couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET; + couple[i].obj = &dbc->lock_dbt; + couple[i].mode = mode; + UMRW_SET(couple[i].timeout); + i++; + if (has_timeout) + couple[0].timeout = + F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout; + if (action == LCK_COUPLE || action == LCK_DOWNGRADE) { + couple[i].op = DB_LOCK_PUT; + couple[i].lock = *lockp; + i++; + } + + ret = __lock_vec(env, + dbc->locker, lkflags, couple, i, &reqp); + if (ret == 0 || reqp == &couple[i - 1]) + *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock; + break; + } + + if (txn != NULL && ret == DB_LOCK_DEADLOCK) + F_SET(txn, TXN_DEADLOCK); + return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv, + DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret); +} + +/* + * __db_lput -- + * The standard lock put call. + * + * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *)); + */ +int +__db_lput(dbc, lockp) + DBC *dbc; + DB_LOCK *lockp; +{ + DB_LOCKREQ couple[2], *reqp; + ENV *env; + int action, ret; + + /* + * Transactional locking. 
+ * Hold on to the read locks only if we are in full isolation. + * Downgrade write locks if we are supporting dirty readers. + */ + if (F_ISSET(dbc->dbp, + DB_AM_READ_UNCOMMITTED) && lockp->mode == DB_LOCK_WRITE) + action = LCK_DOWNGRADE; + else if (dbc->txn == NULL) + action = LCK_COUPLE; + else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) && + lockp->mode == DB_LOCK_READ) + action = LCK_COUPLE; + else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED) + action = LCK_COUPLE; + else + action = 0; + + env = dbc->env; + switch (action) { + case LCK_COUPLE: + ret = __lock_put(env, lockp); + break; + case LCK_DOWNGRADE: + couple[0].op = DB_LOCK_GET; + couple[0].obj = NULL; + couple[0].mode = DB_LOCK_WWRITE; + couple[0].lock = *lockp; + UMRW_SET(couple[0].timeout); + couple[1].op = DB_LOCK_PUT; + couple[1].lock = *lockp; + ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp); + if (ret == 0 || reqp == &couple[1]) + *lockp = couple[0].lock; + break; + default: + ret = 0; + break; + } + + return (ret); +} diff --git a/db/db_method.c b/db/db_method.c new file mode 100644 index 0000000..1182f97 --- /dev/null +++ b/db/db_method.c @@ -0,0 +1,1052 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +#ifdef HAVE_RPC +#ifdef HAVE_SYSTEM_INCLUDE_FILES +#include <rpc/rpc.h> +#endif +#include "db_server.h" +#include "dbinc_auto/rpc_client_ext.h" +#endif + +static int __db_get_byteswapped __P((DB *, int *)); +static int __db_get_dbname __P((DB *, const char **, const char **)); +static DB_ENV *__db_get_env __P((DB *)); +static void __db_get_msgcall + __P((DB *, void (**)(const DB_ENV *, const char *))); +static DB_MPOOLFILE *__db_get_mpf __P((DB *)); +static int __db_get_multiple __P((DB *)); +static int __db_get_transactional __P((DB *)); +static int __db_get_type __P((DB *, DBTYPE *dbtype)); +static int __db_init __P((DB *, u_int32_t)); +static int __db_get_alloc __P((DB *, void *(**)(size_t), + void *(**)(void *, size_t), void (**)(void *))); +static int __db_set_alloc __P((DB *, void *(*)(size_t), + void *(*)(void *, size_t), void (*)(void *))); +static int __db_get_append_recno __P((DB *, + int (**)(DB *, DBT *, db_recno_t))); +static int __db_set_append_recno __P((DB *, int (*)(DB *, DBT *, db_recno_t))); +static int __db_get_cachesize __P((DB *, u_int32_t *, u_int32_t *, int *)); +static int __db_set_cachesize __P((DB *, u_int32_t, u_int32_t, int)); +static int __db_get_create_dir __P((DB *, const char **)); +static int __db_set_create_dir __P((DB *, const char *)); +static int __db_get_dup_compare + __P((DB *, int (**)(DB *, const DBT *, const DBT *))); +static int __db_set_dup_compare + __P((DB *, int (*)(DB *, const DBT *, const DBT *))); +static int __db_get_encrypt_flags __P((DB *, u_int32_t *)); +static int __db_set_encrypt __P((DB *, const char *, u_int32_t)); +static int __db_get_feedback __P((DB *, void (**)(DB *, int, int))); +static int __db_set_feedback __P((DB *, void (*)(DB *, 
int, int))); +static void __db_map_flags __P((DB *, u_int32_t *, u_int32_t *)); +static int __db_get_pagesize __P((DB *, u_int32_t *)); +static int __db_set_paniccall __P((DB *, void (*)(DB_ENV *, int))); +static int __db_set_priority __P((DB *, DB_CACHE_PRIORITY)); +static int __db_get_priority __P((DB *, DB_CACHE_PRIORITY *)); +static void __db_get_errcall __P((DB *, + void (**)(const DB_ENV *, const char *, const char *))); +static void __db_set_errcall + __P((DB *, void (*)(const DB_ENV *, const char *, const char *))); +static void __db_get_errfile __P((DB *, FILE **)); +static void __db_set_errfile __P((DB *, FILE *)); +static void __db_get_errpfx __P((DB *, const char **)); +static void __db_set_errpfx __P((DB *, const char *)); +static void __db_set_msgcall + __P((DB *, void (*)(const DB_ENV *, const char *))); +static void __db_get_msgfile __P((DB *, FILE **)); +static void __db_set_msgfile __P((DB *, FILE *)); +static void __dbh_err __P((DB *, int, const char *, ...)); +static void __dbh_errx __P((DB *, const char *, ...)); + +/* + * db_create -- + * DB constructor. + * + * EXTERN: int db_create __P((DB **, DB_ENV *, u_int32_t)); + */ +int +db_create(dbpp, dbenv, flags) + DB **dbpp; + DB_ENV *dbenv; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + ip = NULL; + env = dbenv == NULL ? NULL : dbenv->env; + + /* Check for invalid function flags. */ + if (flags != 0) + return (__db_ferr(env, "db_create", 0)); + + if (env != NULL) + ENV_ENTER(env, ip); + ret = __db_create_internal(dbpp, env, flags); + if (env != NULL) + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __db_create_internal -- + * DB constructor internal routine. + * + * PUBLIC: int __db_create_internal __P((DB **, ENV *, u_int32_t)); + */ +int +__db_create_internal(dbpp, env, flags) + DB **dbpp; + ENV *env; + u_int32_t flags; +{ + DB *dbp; + DB_ENV *dbenv; + DB_REP *db_rep; + int ret; + + *dbpp = NULL; + + /* If we don't have an environment yet, allocate a local one. 
*/ + if (env == NULL) { + if ((ret = db_env_create(&dbenv, 0)) != 0) + return (ret); + env = dbenv->env; + F_SET(env, ENV_DBLOCAL); + } else + dbenv = env->dbenv; + + /* Allocate and initialize the DB handle. */ + if ((ret = __os_calloc(env, 1, sizeof(*dbp), &dbp)) != 0) + goto err; + + dbp->dbenv = env->dbenv; + dbp->env = env; + if ((ret = __db_init(dbp, flags)) != 0) + goto err; + + MUTEX_LOCK(env, env->mtx_dblist); + ++env->db_ref; + MUTEX_UNLOCK(env, env->mtx_dblist); + + /* + * Set the replication timestamp; it's 0 if we're not in a replicated + * environment. Don't acquire a lock to read the value, even though + * it's opaque: all we check later is value equality, nothing else. + */ + dbp->timestamp = REP_ON(env) ? + ((REGENV *)env->reginfo->primary)->rep_timestamp : 0; + /* + * Set the replication generation number for fid management; valid + * replication generations start at 1. Don't acquire a lock to + * read the value. All we check later is value equality. + */ + db_rep = env->rep_handle; + dbp->fid_gen = REP_ON(env) ? ((REP *)db_rep->region)->gen : 0; + + /* If not RPC, open a backing DB_MPOOLFILE handle in the memory pool. */ + if (!RPC_ON(dbenv) && (ret = __memp_fcreate(env, &dbp->mpf)) != 0) + goto err; + + dbp->type = DB_UNKNOWN; + + *dbpp = dbp; + return (0); + +err: if (dbp != NULL) { + if (dbp->mpf != NULL) + (void)__memp_fclose(dbp->mpf, 0); + __os_free(env, dbp); + } + + if (F_ISSET(env, ENV_DBLOCAL)) + (void)__env_close(dbp->dbenv, 0); + + return (ret); +} + +/* + * __db_init -- + * Initialize a DB structure. 
+ */ +static int +__db_init(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + int ret; + + dbp->locker = NULL; + LOCK_INIT(dbp->handle_lock); + + TAILQ_INIT(&dbp->free_queue); + TAILQ_INIT(&dbp->active_queue); + TAILQ_INIT(&dbp->join_queue); + LIST_INIT(&dbp->s_secondaries); + + FLD_SET(dbp->am_ok, + DB_OK_BTREE | DB_OK_HASH | DB_OK_QUEUE | DB_OK_RECNO); + + /* DB PUBLIC HANDLE LIST BEGIN */ + dbp->associate = __db_associate_pp; + dbp->associate_foreign = __db_associate_foreign_pp; + dbp->close = __db_close_pp; + dbp->compact = __db_compact_pp; + dbp->cursor = __db_cursor_pp; + dbp->del = __db_del_pp; + dbp->dump = __db_dump_pp; + dbp->err = __dbh_err; + dbp->errx = __dbh_errx; + dbp->exists = __db_exists; + dbp->fd = __db_fd_pp; + dbp->get = __db_get_pp; + dbp->get_alloc = __db_get_alloc; + dbp->get_append_recno = __db_get_append_recno; + dbp->get_byteswapped = __db_get_byteswapped; + dbp->get_cachesize = __db_get_cachesize; + dbp->get_create_dir = __db_get_create_dir; + dbp->get_dbname = __db_get_dbname; + dbp->get_dup_compare = __db_get_dup_compare; + dbp->get_encrypt_flags = __db_get_encrypt_flags; + dbp->get_env = __db_get_env; + dbp->get_errcall = __db_get_errcall; + dbp->get_errfile = __db_get_errfile; + dbp->get_errpfx = __db_get_errpfx; + dbp->get_feedback = __db_get_feedback; + dbp->get_flags = __db_get_flags; + dbp->get_lorder = __db_get_lorder; + dbp->get_mpf = __db_get_mpf; + dbp->get_msgcall = __db_get_msgcall; + dbp->get_msgfile = __db_get_msgfile; + dbp->get_multiple = __db_get_multiple; + dbp->get_open_flags = __db_get_open_flags; + dbp->get_partition_dirs = __partition_get_dirs; + dbp->get_partition_callback = __partition_get_callback; + dbp->get_partition_keys = __partition_get_keys; + dbp->get_pagesize = __db_get_pagesize; + dbp->get_priority = __db_get_priority; + dbp->get_transactional = __db_get_transactional; + dbp->get_type = __db_get_type; + dbp->join = __db_join_pp; + dbp->key_range = __db_key_range_pp; + dbp->open = __db_open_pp; + 
dbp->pget = __db_pget_pp; + dbp->put = __db_put_pp; + dbp->remove = __db_remove_pp; + dbp->rename = __db_rename_pp; + dbp->set_alloc = __db_set_alloc; + dbp->set_append_recno = __db_set_append_recno; + dbp->set_cachesize = __db_set_cachesize; + dbp->set_create_dir = __db_set_create_dir; + dbp->set_dup_compare = __db_set_dup_compare; + dbp->set_encrypt = __db_set_encrypt; + dbp->set_errcall = __db_set_errcall; + dbp->set_errfile = __db_set_errfile; + dbp->set_errpfx = __db_set_errpfx; + dbp->set_feedback = __db_set_feedback; + dbp->set_flags = __db_set_flags; + dbp->set_lorder = __db_set_lorder; + dbp->set_msgcall = __db_set_msgcall; + dbp->set_msgfile = __db_set_msgfile; + dbp->set_pagesize = __db_set_pagesize; + dbp->set_paniccall = __db_set_paniccall; + dbp->set_partition = __partition_set; + dbp->set_partition_dirs = __partition_set_dirs; + dbp->set_priority = __db_set_priority; + dbp->sort_multiple = __db_sort_multiple; + dbp->stat = __db_stat_pp; + dbp->stat_print = __db_stat_print_pp; + dbp->sync = __db_sync_pp; + dbp->truncate = __db_truncate_pp; + dbp->upgrade = __db_upgrade_pp; + dbp->verify = __db_verify_pp; + /* DB PUBLIC HANDLE LIST END */ + + /* Access method specific. */ + if ((ret = __bam_db_create(dbp)) != 0) + return (ret); + if ((ret = __ham_db_create(dbp)) != 0) + return (ret); + if ((ret = __qam_db_create(dbp)) != 0) + return (ret); + +#ifdef HAVE_RPC + /* + * RPC specific: must be last, as we replace methods set by the + * access methods. + */ + if (RPC_ON(dbp->dbenv)) { + __dbcl_dbp_init(dbp); + /* + * !!! + * We wrap the DB->open method for RPC, and the rpc.src file + * can't handle that. + */ + dbp->open = __dbcl_db_open_wrap; + if ((ret = __dbcl_db_create(dbp, dbp->dbenv, flags)) != 0) + return (ret); + } +#else + COMPQUIET(flags, 0); +#endif + + return (0); +} + +/* + * __dbh_am_chk -- + * Error if an unreasonable method is called. 
+ * + * PUBLIC: int __dbh_am_chk __P((DB *, u_int32_t)); + */ +int +__dbh_am_chk(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + /* + * We start out allowing any access methods to be called, and as the + * application calls the methods the options become restricted. The + * idea is to quit as soon as an illegal method combination is called. + */ + if ((LF_ISSET(DB_OK_BTREE) && FLD_ISSET(dbp->am_ok, DB_OK_BTREE)) || + (LF_ISSET(DB_OK_HASH) && FLD_ISSET(dbp->am_ok, DB_OK_HASH)) || + (LF_ISSET(DB_OK_QUEUE) && FLD_ISSET(dbp->am_ok, DB_OK_QUEUE)) || + (LF_ISSET(DB_OK_RECNO) && FLD_ISSET(dbp->am_ok, DB_OK_RECNO))) { + FLD_CLR(dbp->am_ok, ~flags); + return (0); + } + + __db_errx(dbp->env, + "call implies an access method which is inconsistent with previous calls"); + return (EINVAL); +} + +/* + * __dbh_err -- + * Db.err method. + */ +static void +#ifdef STDC_HEADERS +__dbh_err(DB *dbp, int error, const char *fmt, ...) +#else +__dbh_err(dbp, error, fmt, va_alist) + DB *dbp; + int error; + const char *fmt; + va_dcl +#endif +{ + /* Message with error string, to stderr by default. */ + DB_REAL_ERR(dbp->dbenv, error, DB_ERROR_SET, 1, fmt); +} + +/* + * __dbh_errx -- + * Db.errx method. + */ +static void +#ifdef STDC_HEADERS +__dbh_errx(DB *dbp, const char *fmt, ...) +#else +__dbh_errx(dbp, fmt, va_alist) + DB *dbp; + const char *fmt; + va_dcl +#endif +{ + /* Message without error string, to stderr by default. */ + DB_REAL_ERR(dbp->dbenv, 0, DB_ERROR_NOT_SET, 1, fmt); +} + +/* + * __db_get_byteswapped -- + * Return if database requires byte swapping. + */ +static int +__db_get_byteswapped(dbp, isswapped) + DB *dbp; + int *isswapped; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_byteswapped"); + + *isswapped = F_ISSET(dbp, DB_AM_SWAP) ? 1 : 0; + return (0); +} + +/* + * __db_get_dbname -- + * Get the name of the database as passed to DB->open. 
+ */ +static int +__db_get_dbname(dbp, fnamep, dnamep) + DB *dbp; + const char **fnamep, **dnamep; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_dbname"); + + if (fnamep != NULL) + *fnamep = dbp->fname; + if (dnamep != NULL) + *dnamep = dbp->dname; + return (0); +} + +/* + * __db_get_env -- + * Get the DB_ENV handle that was passed to db_create. + */ +static DB_ENV * +__db_get_env(dbp) + DB *dbp; +{ + return (dbp->dbenv); +} + +/* + * __db_get_mpf -- + * Get the underlying DB_MPOOLFILE handle. + */ +static DB_MPOOLFILE * +__db_get_mpf(dbp) + DB *dbp; +{ + return (dbp->mpf); +} + +/* + * get_multiple -- + * Return whether this DB handle references a physical file with multiple + * databases. + */ +static int +__db_get_multiple(dbp) + DB *dbp; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_multiple"); + + /* + * Only return TRUE if the handle is for the master database, not for + * any subdatabase in the physical file. If it's a Btree, with the + * subdatabases flag set, and the meta-data page has the right value, + * return TRUE. (We don't need to check it's a Btree, I suppose, but + * it doesn't hurt.) + */ + return (dbp->type == DB_BTREE && + F_ISSET(dbp, DB_AM_SUBDB) && + dbp->meta_pgno == PGNO_BASE_MD ? 1 : 0); +} + +/* + * get_transactional -- + * Return whether this database was created in a transaction. + */ +static int +__db_get_transactional(dbp) + DB *dbp; +{ + return (F_ISSET(dbp, DB_AM_TXN) ? 1 : 0); +} + +/* + * __db_get_type -- + * Return type of underlying database. + */ +static int +__db_get_type(dbp, dbtype) + DB *dbp; + DBTYPE *dbtype; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_type"); + + *dbtype = dbp->type; + return (0); +} + +/* + * __db_get_append_recno -- + * Get record number append routine. 
+ */ +static int +__db_get_append_recno(dbp, funcp) + DB *dbp; + int (**funcp) __P((DB *, DBT *, db_recno_t)); +{ + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + if (funcp) + *funcp = dbp->db_append_recno; + + return (0); +} +/* + * __db_set_append_recno -- + * Set record number append routine. + */ +static int +__db_set_append_recno(dbp, func) + DB *dbp; + int (*func) __P((DB *, DBT *, db_recno_t)); +{ + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_append_recno"); + DB_ILLEGAL_METHOD(dbp, DB_OK_QUEUE | DB_OK_RECNO); + + dbp->db_append_recno = func; + + return (0); +} + +/* + * __db_get_cachesize -- + * Get underlying cache size. + */ +static int +__db_get_cachesize(dbp, cache_gbytesp, cache_bytesp, ncachep) + DB *dbp; + u_int32_t *cache_gbytesp, *cache_bytesp; + int *ncachep; +{ + DB_ILLEGAL_IN_ENV(dbp, "DB->get_cachesize"); + + return (__memp_get_cachesize(dbp->dbenv, + cache_gbytesp, cache_bytesp, ncachep)); +} + +/* + * __db_set_cachesize -- + * Set underlying cache size. + */ +static int +__db_set_cachesize(dbp, cache_gbytes, cache_bytes, ncache) + DB *dbp; + u_int32_t cache_gbytes, cache_bytes; + int ncache; +{ + DB_ILLEGAL_IN_ENV(dbp, "DB->set_cachesize"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_cachesize"); + + return (__memp_set_cachesize( + dbp->dbenv, cache_gbytes, cache_bytes, ncache)); +} + +static int +__db_set_create_dir(dbp, dir) + DB *dbp; + const char *dir; +{ + DB_ENV *dbenv; + int i; + + dbenv = dbp->dbenv; + + for (i = 0; i < dbenv->data_next; i++) + if (strcmp(dir, dbenv->db_data_dir[i]) == 0) + break; + + if (i == dbenv->data_next) { + __db_errx(dbp->env, + "Directory %s not in environment list.", dir); + return (EINVAL); + } + + dbp->dirname = dbenv->db_data_dir[i]; + return (0); +} + +static int +__db_get_create_dir(dbp, dirp) + DB *dbp; + const char **dirp; +{ + *dirp = dbp->dirname; + return (0); +} + +/* + * __db_get_dup_compare -- + * Get duplicate comparison routine. 
+ */ +static int +__db_get_dup_compare(dbp, funcp) + DB *dbp; + int (**funcp) __P((DB *, const DBT *, const DBT *)); +{ + + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); + + if (funcp != NULL) { +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) { + *funcp = + ((BTREE *)dbp->bt_internal)->compress_dup_compare; + } else +#endif + *funcp = dbp->dup_compare; + } + + return (0); +} + +/* + * __db_set_dup_compare -- + * Set duplicate comparison routine. + */ +static int +__db_set_dup_compare(dbp, func) + DB *dbp; + int (*func) __P((DB *, const DBT *, const DBT *)); +{ + int ret; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_dup_compare"); + DB_ILLEGAL_METHOD(dbp, DB_OK_BTREE | DB_OK_HASH); + + if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0) + return (ret); + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(dbp)) { + dbp->dup_compare = __bam_compress_dupcmp; + ((BTREE *)dbp->bt_internal)->compress_dup_compare = func; + } else +#endif + dbp->dup_compare = func; + + return (0); +} + +/* + * __db_get_encrypt_flags -- + */ +static int +__db_get_encrypt_flags(dbp, flagsp) + DB *dbp; + u_int32_t *flagsp; +{ + DB_ILLEGAL_IN_ENV(dbp, "DB->get_encrypt_flags"); + + return (__env_get_encrypt_flags(dbp->dbenv, flagsp)); +} + +/* + * __db_set_encrypt -- + * Set database passwd. + */ +static int +__db_set_encrypt(dbp, passwd, flags) + DB *dbp; + const char *passwd; + u_int32_t flags; +{ + DB_CIPHER *db_cipher; + int ret; + + DB_ILLEGAL_IN_ENV(dbp, "DB->set_encrypt"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_encrypt"); + + if ((ret = __env_set_encrypt(dbp->dbenv, passwd, flags)) != 0) + return (ret); + + /* + * In a real env, this gets initialized with the region. In a local + * env, we must do it here. 
+ */ + db_cipher = dbp->env->crypto_handle; + if (!F_ISSET(db_cipher, CIPHER_ANY) && + (ret = db_cipher->init(dbp->env, db_cipher)) != 0) + return (ret); + + return (__db_set_flags(dbp, DB_ENCRYPT)); +} + +static void +__db_get_errcall(dbp, errcallp) + DB *dbp; + void (**errcallp) __P((const DB_ENV *, const char *, const char *)); +{ + __env_get_errcall(dbp->dbenv, errcallp); +} + +static void +__db_set_errcall(dbp, errcall) + DB *dbp; + void (*errcall) __P((const DB_ENV *, const char *, const char *)); +{ + __env_set_errcall(dbp->dbenv, errcall); +} + +static void +__db_get_errfile(dbp, errfilep) + DB *dbp; + FILE **errfilep; +{ + __env_get_errfile(dbp->dbenv, errfilep); +} + +static void +__db_set_errfile(dbp, errfile) + DB *dbp; + FILE *errfile; +{ + __env_set_errfile(dbp->dbenv, errfile); +} + +static void +__db_get_errpfx(dbp, errpfxp) + DB *dbp; + const char **errpfxp; +{ + __env_get_errpfx(dbp->dbenv, errpfxp); +} + +static void +__db_set_errpfx(dbp, errpfx) + DB *dbp; + const char *errpfx; +{ + __env_set_errpfx(dbp->dbenv, errpfx); +} + +static int +__db_get_feedback(dbp, feedbackp) + DB *dbp; + void (**feedbackp) __P((DB *, int, int)); +{ + if (feedbackp != NULL) + *feedbackp = dbp->db_feedback; + return (0); +} + +static int +__db_set_feedback(dbp, feedback) + DB *dbp; + void (*feedback) __P((DB *, int, int)); +{ + dbp->db_feedback = feedback; + return (0); +} + +/* + * __db_map_flags -- + * Maps between public and internal flag values. + * This function doesn't check for validity, so it can't fail. 
+ */ +static void +__db_map_flags(dbp, inflagsp, outflagsp) + DB *dbp; + u_int32_t *inflagsp, *outflagsp; +{ + COMPQUIET(dbp, NULL); + + if (FLD_ISSET(*inflagsp, DB_CHKSUM)) { + FLD_SET(*outflagsp, DB_AM_CHKSUM); + FLD_CLR(*inflagsp, DB_CHKSUM); + } + if (FLD_ISSET(*inflagsp, DB_ENCRYPT)) { + FLD_SET(*outflagsp, DB_AM_ENCRYPT | DB_AM_CHKSUM); + FLD_CLR(*inflagsp, DB_ENCRYPT); + } + if (FLD_ISSET(*inflagsp, DB_TXN_NOT_DURABLE)) { + FLD_SET(*outflagsp, DB_AM_NOT_DURABLE); + FLD_CLR(*inflagsp, DB_TXN_NOT_DURABLE); + } +} + +/* + * __db_get_flags -- + * The DB->get_flags method. + * + * PUBLIC: int __db_get_flags __P((DB *, u_int32_t *)); + */ +int +__db_get_flags(dbp, flagsp) + DB *dbp; + u_int32_t *flagsp; +{ + static const u_int32_t db_flags[] = { + DB_CHKSUM, + DB_DUP, + DB_DUPSORT, + DB_ENCRYPT, +#ifdef HAVE_QUEUE + DB_INORDER, +#endif + DB_RECNUM, + DB_RENUMBER, + DB_REVSPLITOFF, + DB_SNAPSHOT, + DB_TXN_NOT_DURABLE, + 0 + }; + u_int32_t f, flags, mapped_flag; + int i; + + flags = 0; + for (i = 0; (f = db_flags[i]) != 0; i++) { + mapped_flag = 0; + __db_map_flags(dbp, &f, &mapped_flag); + __bam_map_flags(dbp, &f, &mapped_flag); + __ram_map_flags(dbp, &f, &mapped_flag); +#ifdef HAVE_QUEUE + __qam_map_flags(dbp, &f, &mapped_flag); +#endif + DB_ASSERT(dbp->env, f == 0); + if (F_ISSET(dbp, mapped_flag) == mapped_flag) + LF_SET(db_flags[i]); + } + + *flagsp = flags; + return (0); +} + +/* + * __db_set_flags -- + * DB->set_flags. 
+ * + * PUBLIC: int __db_set_flags __P((DB *, u_int32_t)); + */ +int +__db_set_flags(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + ENV *env; + int ret; + + env = dbp->env; + + if (LF_ISSET(DB_ENCRYPT) && !CRYPTO_ON(env)) { + __db_errx(env, + "Database environment not configured for encryption"); + return (EINVAL); + } + if (LF_ISSET(DB_TXN_NOT_DURABLE)) + ENV_REQUIRES_CONFIG(env, + env->tx_handle, "DB_NOT_DURABLE", DB_INIT_TXN); + + __db_map_flags(dbp, &flags, &dbp->flags); + + if ((ret = __bam_set_flags(dbp, &flags)) != 0) + return (ret); + if ((ret = __ram_set_flags(dbp, &flags)) != 0) + return (ret); +#ifdef HAVE_QUEUE + if ((ret = __qam_set_flags(dbp, &flags)) != 0) + return (ret); +#endif + + return (flags == 0 ? 0 : __db_ferr(env, "DB->set_flags", 0)); +} + +/* + * __db_get_lorder -- + * Get whether lorder is swapped or not. + * + * PUBLIC: int __db_get_lorder __P((DB *, int *)); + */ +int +__db_get_lorder(dbp, db_lorderp) + DB *dbp; + int *db_lorderp; +{ + int ret; + + /* Flag if the specified byte order requires swapping. */ + switch (ret = __db_byteorder(dbp->env, 1234)) { + case 0: + *db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 4321 : 1234; + break; + case DB_SWAPBYTES: + *db_lorderp = F_ISSET(dbp, DB_AM_SWAP) ? 1234 : 4321; + break; + default: + return (ret); + /* NOTREACHED */ + } + + return (0); +} + +/* + * __db_set_lorder -- + * Set whether lorder is swapped or not. + * + * PUBLIC: int __db_set_lorder __P((DB *, int)); + */ +int +__db_set_lorder(dbp, db_lorder) + DB *dbp; + int db_lorder; +{ + int ret; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_lorder"); + + /* Flag if the specified byte order requires swapping. 
*/ + switch (ret = __db_byteorder(dbp->env, db_lorder)) { + case 0: + F_CLR(dbp, DB_AM_SWAP); + break; + case DB_SWAPBYTES: + F_SET(dbp, DB_AM_SWAP); + break; + default: + return (ret); + /* NOTREACHED */ + } + return (0); +} + +static int +__db_get_alloc(dbp, mal_funcp, real_funcp, free_funcp) + DB *dbp; + void *(**mal_funcp) __P((size_t)); + void *(**real_funcp) __P((void *, size_t)); + void (**free_funcp) __P((void *)); +{ + DB_ILLEGAL_IN_ENV(dbp, "DB->get_alloc"); + + return (__env_get_alloc(dbp->dbenv, mal_funcp, + real_funcp, free_funcp)); +} + +static int +__db_set_alloc(dbp, mal_func, real_func, free_func) + DB *dbp; + void *(*mal_func) __P((size_t)); + void *(*real_func) __P((void *, size_t)); + void (*free_func) __P((void *)); +{ + DB_ILLEGAL_IN_ENV(dbp, "DB->set_alloc"); + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_alloc"); + + return (__env_set_alloc(dbp->dbenv, mal_func, real_func, free_func)); +} + +static void +__db_get_msgcall(dbp, msgcallp) + DB *dbp; + void (**msgcallp) __P((const DB_ENV *, const char *)); +{ + __env_get_msgcall(dbp->dbenv, msgcallp); +} + +static void +__db_set_msgcall(dbp, msgcall) + DB *dbp; + void (*msgcall) __P((const DB_ENV *, const char *)); +{ + __env_set_msgcall(dbp->dbenv, msgcall); +} + +static void +__db_get_msgfile(dbp, msgfilep) + DB *dbp; + FILE **msgfilep; +{ + __env_get_msgfile(dbp->dbenv, msgfilep); +} + +static void +__db_set_msgfile(dbp, msgfile) + DB *dbp; + FILE *msgfile; +{ + __env_set_msgfile(dbp->dbenv, msgfile); +} + +static int +__db_get_pagesize(dbp, db_pagesizep) + DB *dbp; + u_int32_t *db_pagesizep; +{ + *db_pagesizep = dbp->pgsize; + return (0); +} + +/* + * __db_set_pagesize -- + * DB->set_pagesize + * + * PUBLIC: int __db_set_pagesize __P((DB *, u_int32_t)); + */ +int +__db_set_pagesize(dbp, db_pagesize) + DB *dbp; + u_int32_t db_pagesize; +{ + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_pagesize"); + + if (db_pagesize < DB_MIN_PGSIZE) { + __db_errx(dbp->env, "page sizes may not be smaller than %lu", + 
(u_long)DB_MIN_PGSIZE); + return (EINVAL); + } + if (db_pagesize > DB_MAX_PGSIZE) { + __db_errx(dbp->env, "page sizes may not be larger than %lu", + (u_long)DB_MAX_PGSIZE); + return (EINVAL); + } + + /* + * We don't want anything that's not a power-of-2, as we rely on that + * for alignment of various types on the pages. + */ + if (!POWER_OF_TWO(db_pagesize)) { + __db_errx(dbp->env, "page sizes must be a power-of-2"); + return (EINVAL); + } + + /* + * XXX + * Should we be checking for a page size that's not a multiple of 512, + * so that we never try and write less than a disk sector? + */ + dbp->pgsize = db_pagesize; + + return (0); +} + +static int +__db_set_paniccall(dbp, paniccall) + DB *dbp; + void (*paniccall) __P((DB_ENV *, int)); +{ + return (__env_set_paniccall(dbp->dbenv, paniccall)); +} + +static int +__db_set_priority(dbp, priority) + DB *dbp; + DB_CACHE_PRIORITY priority; +{ + dbp->priority = priority; + return (0); +} + +static int +__db_get_priority(dbp, priority) + DB *dbp; + DB_CACHE_PRIORITY *priority; +{ + *priority = dbp->priority; + return (0); +} diff --git a/db/db_open.c b/db/db_open.c new file mode 100644 index 0000000..5c5db09 --- /dev/null +++ b/db/db_open.c @@ -0,0 +1,628 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/fop.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +/* + * __db_open -- + * DB->open method. + * + * This routine gets called in three different ways: + * + * 1. It can be called to open a file/database. In this case, subdb will + * be NULL and meta_pgno will be PGNO_BASE_MD. + * 2. 
It can be called to open a subdatabase during normal operation. In + * this case, name and subname will both be non-NULL and meta_pgno will + * be PGNO_BASE_MD (also PGNO_INVALID). + * 3. It can be called to open an in-memory database (name == NULL; + * subname = name). + * 4. It can be called during recovery to open a file/database, in which case + * name will be non-NULL, subname will be NULL, and meta-pgno will be + * PGNO_BASE_MD. + * 5. It can be called during recovery to open a subdatabase, in which case + * name will be non-NULL, subname may be NULL and meta-pgno will be + * a valid pgno (i.e., not PGNO_BASE_MD). + * 6. It can be called during recovery to open an in-memory database. + * + * PUBLIC: int __db_open __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: const char *, const char *, DBTYPE, u_int32_t, int, db_pgno_t)); + */ +int +__db_open(dbp, ip, txn, fname, dname, type, flags, mode, meta_pgno) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *fname, *dname; + DBTYPE type; + u_int32_t flags; + int mode; + db_pgno_t meta_pgno; +{ + DB *tdbp; + ENV *env; + int ret; + u_int32_t id; + + env = dbp->env; + id = TXN_INVALID; + + /* + * We must flush any existing pages before truncating the file + * since they could age out of mpool and overwrite new pages. + */ + if (LF_ISSET(DB_TRUNCATE)) { + if ((ret = __db_create_internal(&tdbp, dbp->env, 0)) != 0) + goto err; + ret = __db_open(tdbp, ip, txn, fname, dname, DB_UNKNOWN, + DB_NOERROR | (flags & ~(DB_TRUNCATE|DB_CREATE)), + mode, meta_pgno); + if (ret == 0) + ret = __memp_ftruncate(tdbp->mpf, txn, ip, 0, 0); + (void)__db_close(tdbp, txn, DB_NOSYNC); + if (ret != 0 && ret != ENOENT && ret != EINVAL) + goto err; + ret = 0; + } + + DB_TEST_RECOVERY(dbp, DB_TEST_PREOPEN, ret, fname); + + /* + * If the environment was configured with threads, the DB handle + * must also be free-threaded, so we force the DB_THREAD flag on. 
+ * (See SR #2033 for why this is a requirement--recovery needs + * to be able to grab a dbp using __db_fileid_to_dbp, and it has + * no way of knowing which dbp goes with which thread, so whichever + * one it finds has to be usable in any of them.) + */ + if (F_ISSET(env, ENV_THREAD)) + LF_SET(DB_THREAD); + + /* Convert any DB->open flags. */ + if (LF_ISSET(DB_RDONLY)) + F_SET(dbp, DB_AM_RDONLY); + if (LF_ISSET(DB_READ_UNCOMMITTED)) + F_SET(dbp, DB_AM_READ_UNCOMMITTED); + + if (IS_REAL_TXN(txn)) + F_SET(dbp, DB_AM_TXN); + + /* Fill in the type. */ + dbp->type = type; + + /* + * If both fname and subname are NULL, it's always a create, so make + * sure that we have both DB_CREATE and a type specified. It would + * be nice if this checking were done in __db_open where most of the + * interface checking is done, but this interface (__db_dbopen) is + * used by the recovery and limbo system, so we need to safeguard + * this interface as well. + */ + if (fname == NULL) { + if (dbp->p_internal != NULL) { + __db_errx(env, + "Partitioned databases may not be in memory."); + return (ENOENT); + } + if (dname == NULL) { + if (!LF_ISSET(DB_CREATE)) { + __db_errx(env, + "DB_CREATE must be specified to create databases."); + return (ENOENT); + } + + F_SET(dbp, DB_AM_INMEM); + F_SET(dbp, DB_AM_CREATED); + + if (dbp->type == DB_UNKNOWN) { + __db_errx(env, + "DBTYPE of unknown without existing file"); + return (EINVAL); + } + + if (dbp->pgsize == 0) + dbp->pgsize = DB_DEF_IOSIZE; + + /* + * If the file is a temporary file and we're + * doing locking, then we have to create a + * unique file ID. We can't use our normal + * dev/inode pair (or whatever this OS uses + * in place of dev/inode pairs) because no + * backing file will be created until the + * mpool cache is filled forcing the buffers + * to disk. Grab a random locker ID to use + * as a file ID. 
The created ID must never + * match a potential real file ID -- we know + * it won't because real file IDs contain a + * time stamp after the dev/inode pair, and + * we're simply storing a 4-byte value. + + * !!! + * Store the locker in the file id structure + * -- we can get it from there as necessary, + * and it saves having two copies. + */ + if (LOCKING_ON(env) && (ret = __lock_id(env, + (u_int32_t *)dbp->fileid, NULL)) != 0) + return (ret); + } else + MAKE_INMEM(dbp); + + /* + * Normally we would do handle locking here, however, with + * in-memory files, we cannot do any database manipulation + * until the mpool is open, so it happens later. + */ + } else if (dname == NULL && meta_pgno == PGNO_BASE_MD) { + /* Open/create the underlying file. Acquire locks. */ + if ((ret = __fop_file_setup(dbp, ip, + txn, fname, mode, flags, &id)) != 0) + return (ret); + } else { + if (dbp->p_internal != NULL) { + __db_errx(env, + "Partitioned databases may not be included with multiple databases."); + return (ENOENT); + } + if ((ret = __fop_subdb_setup(dbp, ip, + txn, fname, dname, mode, flags)) != 0) + return (ret); + meta_pgno = dbp->meta_pgno; + } + + /* Set up the underlying environment. */ + if ((ret = __env_setup(dbp, txn, fname, dname, id, flags)) != 0) + return (ret); + + /* For in-memory databases, we now need to open/create the database. 
*/ + if (F_ISSET(dbp, DB_AM_INMEM)) { + if (dname == NULL) + ret = __db_new_file(dbp, ip, txn, NULL, NULL); + else { + id = TXN_INVALID; + if ((ret = __fop_file_setup(dbp, ip, + txn, dname, mode, flags, &id)) == 0 && + DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) +#if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC) + && txn != NULL +#endif +#if !defined(DEBUG_ROP) + && !F_ISSET(dbp, DB_AM_RDONLY) +#endif + ) + ret = __dbreg_log_id(dbp, + txn, dbp->log_filename->id, 1); + } + if (ret != 0) + goto err; + } + + switch (dbp->type) { + case DB_BTREE: + ret = __bam_open(dbp, ip, txn, fname, meta_pgno, flags); + break; + case DB_HASH: + ret = __ham_open(dbp, ip, txn, fname, meta_pgno, flags); + break; + case DB_RECNO: + ret = __ram_open(dbp, ip, txn, fname, meta_pgno, flags); + break; + case DB_QUEUE: + ret = __qam_open( + dbp, ip, txn, fname, meta_pgno, mode, flags); + break; + case DB_UNKNOWN: + return ( + __db_unknown_type(env, "__db_dbopen", dbp->type)); + } + if (ret != 0) + goto err; + +#ifdef HAVE_PARTITION + if (dbp->p_internal != NULL && (ret = + __partition_open(dbp, ip, txn, fname, type, flags, mode, 1)) != 0) + goto err; +#endif + DB_TEST_RECOVERY(dbp, DB_TEST_POSTOPEN, ret, fname); + + /* + * Temporary files don't need handle locks, so we only have to check + * for a handle lock downgrade or lockevent in the case of named + * files. + */ + if (!F_ISSET(dbp, DB_AM_RECOVER) && (fname != NULL || dname != NULL) && + LOCK_ISSET(dbp->handle_lock)) { + if (IS_REAL_TXN(txn)) + ret = __txn_lockevent(env, + txn, dbp, &dbp->handle_lock, dbp->locker); + else if (LOCKING_ON(env)) + /* Trade write handle lock for read handle lock. 
*/ + ret = __lock_downgrade(env, + &dbp->handle_lock, DB_LOCK_READ, 0); + } +DB_TEST_RECOVERY_LABEL +err: + return (ret); +} + +/* + * __db_get_open_flags -- + * Accessor for flags passed into DB->open call + * + * PUBLIC: int __db_get_open_flags __P((DB *, u_int32_t *)); + */ +int +__db_get_open_flags(dbp, flagsp) + DB *dbp; + u_int32_t *flagsp; +{ + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->get_open_flags"); + + *flagsp = dbp->open_flags; + return (0); +} + +/* + * __db_new_file -- + * Create a new database file. + * + * PUBLIC: int __db_new_file __P((DB *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_FH *, const char *)); + */ +int +__db_new_file(dbp, ip, txn, fhp, name) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DB_FH *fhp; + const char *name; +{ + int ret; + + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_new_file(dbp, ip, txn, fhp, name); + break; + case DB_HASH: + ret = __ham_new_file(dbp, ip, txn, fhp, name); + break; + case DB_QUEUE: + ret = __qam_new_file(dbp, ip, txn, fhp, name); + break; + case DB_UNKNOWN: + default: + __db_errx(dbp->env, + "%s: Invalid type %d specified", name, dbp->type); + ret = EINVAL; + break; + } + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, name); + /* Sync the file in preparation for moving it into place. */ + if (ret == 0 && fhp != NULL) + ret = __os_fsync(dbp->env, fhp); + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name); + +DB_TEST_RECOVERY_LABEL + return (ret); +} + +/* + * __db_init_subdb -- + * Initialize the dbp for a subdb. + * + * PUBLIC: int __db_init_subdb __P((DB *, + * PUBLIC: DB *, const char *, DB_THREAD_INFO *, DB_TXN *)); + */ +int +__db_init_subdb(mdbp, dbp, name, ip, txn) + DB *mdbp, *dbp; + const char *name; + DB_THREAD_INFO *ip; + DB_TXN *txn; +{ + DBMETA *meta; + DB_MPOOLFILE *mpf; + int ret, t_ret; + + ret = 0; + if (!F_ISSET(dbp, DB_AM_CREATED)) { + /* Subdb exists; read meta-data page and initialize. 
*/ + mpf = mdbp->mpf; + if ((ret = __memp_fget(mpf, &dbp->meta_pgno, + ip, txn, 0, &meta)) != 0) + goto err; + ret = __db_meta_setup(mdbp->env, dbp, name, meta, 0, 0); + if ((t_ret = __memp_fput(mpf, + ip, meta, dbp->priority)) != 0 && ret == 0) + ret = t_ret; + /* + * If __db_meta_setup found that the meta-page hadn't + * been written out during recovery, we can just return. + */ + if (ret == ENOENT) + ret = 0; + goto err; + } + + /* Handle the create case here. */ + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_new_subdb(mdbp, dbp, ip, txn); + break; + case DB_HASH: + ret = __ham_new_subdb(mdbp, dbp, ip, txn); + break; + case DB_QUEUE: + ret = EINVAL; + break; + case DB_UNKNOWN: + default: + __db_errx(dbp->env, + "Invalid subdatabase type %d specified", dbp->type); + return (EINVAL); + } + +err: return (ret); +} + +/* + * __db_chk_meta -- + * Take a buffer containing a meta-data page and check it for a valid LSN, + * checksum (and verify the checksum if necessary) and possibly decrypt it. + * + * Return 0 on success, >0 (errno) on error, -1 on checksum mismatch. + * + * PUBLIC: int __db_chk_meta __P((ENV *, DB *, DBMETA *, u_int32_t)); + */ +int +__db_chk_meta(env, dbp, meta, flags) + ENV *env; + DB *dbp; + DBMETA *meta; + u_int32_t flags; +{ + DB_LSN swap_lsn; + int is_hmac, ret, swapped; + u_int32_t magic, orig_chk; + u_int8_t *chksum; + + ret = 0; + swapped = 0; + + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) { + if (dbp != NULL) + F_SET(dbp, DB_AM_CHKSUM); + + is_hmac = meta->encrypt_alg == 0 ? 0 : 1; + chksum = ((BTMETA *)meta)->chksum; + + /* + * If we need to swap, the checksum function overwrites the + * original checksum with 0, so we need to save a copy of the + * original for swapping later. + */ + orig_chk = *(u_int32_t *)chksum; + + /* + * We cannot add this to __db_metaswap because that gets done + * later after we've verified the checksum or decrypted. 
+ */ + if (LF_ISSET(DB_CHK_META)) { + swapped = 0; +chk_retry: if ((ret = + __db_check_chksum(env, NULL, env->crypto_handle, + chksum, meta, DBMETASIZE, is_hmac)) != 0) { + if (is_hmac || swapped) + return (ret); + + M_32_SWAP(orig_chk); + swapped = 1; + *(u_int32_t *)chksum = orig_chk; + goto chk_retry; + } + } + } else if (dbp != NULL) + F_CLR(dbp, DB_AM_CHKSUM); + +#ifdef HAVE_CRYPTO + ret = __crypto_decrypt_meta(env, + dbp, (u_int8_t *)meta, LF_ISSET(DB_CHK_META)); +#endif + + /* Now that we're decrypted, we can check LSN. */ + if (LOGGING_ON(env) && !LF_ISSET(DB_CHK_NOLSN)) { + /* + * This gets called both before and after swapping, so we + * need to check ourselves. If we already swapped it above, + * we'll know that here. + */ + + swap_lsn = meta->lsn; + magic = meta->magic; +lsn_retry: + if (swapped) { + M_32_SWAP(swap_lsn.file); + M_32_SWAP(swap_lsn.offset); + M_32_SWAP(magic); + } + switch (magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + case DB_QAMMAGIC: + case DB_RENAMEMAGIC: + break; + default: + if (swapped) + return (EINVAL); + swapped = 1; + goto lsn_retry; + } + if (!IS_REP_CLIENT(env) && + !IS_NOT_LOGGED_LSN(swap_lsn) && !IS_ZERO_LSN(swap_lsn)) + /* Need to do check. */ + ret = __log_check_page_lsn(env, dbp, &swap_lsn); + } + return (ret); +} + +/* + * __db_meta_setup -- + * + * Take a buffer containing a meta-data page and figure out if it's + * valid, and if so, initialize the dbp from the meta-data page. + * + * PUBLIC: int __db_meta_setup __P((ENV *, + * PUBLIC: DB *, const char *, DBMETA *, u_int32_t, u_int32_t)); + */ +int +__db_meta_setup(env, dbp, name, meta, oflags, flags) + ENV *env; + DB *dbp; + const char *name; + DBMETA *meta; + u_int32_t oflags; + u_int32_t flags; +{ + u_int32_t magic; + int ret; + + ret = 0; + + /* + * Figure out what access method we're dealing with, and then + * call access method specific code to check error conditions + * based on conflicts between the found file and application + * arguments. 
A found file overrides some user information -- + * we don't consider it an error, for example, if the user set + * an expected byte order and the found file doesn't match it. + */ + F_CLR(dbp, DB_AM_SWAP | DB_AM_IN_RENAME); + magic = meta->magic; + +swap_retry: + switch (magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + case DB_QAMMAGIC: + case DB_RENAMEMAGIC: + break; + case 0: + /* + * The only time this should be 0 is if we're in the + * midst of opening a subdb during recovery and that + * subdatabase had its meta-data page allocated, but + * not yet initialized. + */ + if (F_ISSET(dbp, DB_AM_SUBDB) && ((IS_RECOVERING(env) && + F_ISSET(env->lg_handle, DBLOG_FORCE_OPEN)) || + meta->pgno != PGNO_INVALID)) + return (ENOENT); + + goto bad_format; + default: + if (F_ISSET(dbp, DB_AM_SWAP)) + goto bad_format; + + M_32_SWAP(magic); + F_SET(dbp, DB_AM_SWAP); + goto swap_retry; + } + + /* + * We can only check the meta page if we are sure we have a meta page. + * If it is random data, then this check can fail. So only now can we + * checksum and decrypt. Don't distinguish between configuration and + * checksum match errors here, because we haven't opened the database + * and even a checksum error isn't a reason to panic the environment. 
+ */ + if ((ret = __db_chk_meta(env, dbp, meta, flags)) != 0) { + if (ret == -1) + __db_errx(env, + "%s: metadata page checksum error", name); + goto bad_format; + } + + switch (magic) { + case DB_BTREEMAGIC: + if (dbp->type != DB_UNKNOWN && + dbp->type != DB_RECNO && dbp->type != DB_BTREE) + goto bad_format; + + flags = meta->flags; + if (F_ISSET(dbp, DB_AM_SWAP)) + M_32_SWAP(flags); + if (LF_ISSET(BTM_RECNO)) + dbp->type = DB_RECNO; + else + dbp->type = DB_BTREE; + if ((oflags & DB_TRUNCATE) == 0 && (ret = + __bam_metachk(dbp, name, (BTMETA *)meta)) != 0) + return (ret); + break; + case DB_HASHMAGIC: + if (dbp->type != DB_UNKNOWN && dbp->type != DB_HASH) + goto bad_format; + + dbp->type = DB_HASH; + if ((oflags & DB_TRUNCATE) == 0 && (ret = + __ham_metachk(dbp, name, (HMETA *)meta)) != 0) + return (ret); + break; + case DB_QAMMAGIC: + if (dbp->type != DB_UNKNOWN && dbp->type != DB_QUEUE) + goto bad_format; + dbp->type = DB_QUEUE; + if ((oflags & DB_TRUNCATE) == 0 && (ret = + __qam_metachk(dbp, name, (QMETA *)meta)) != 0) + return (ret); + break; + case DB_RENAMEMAGIC: + F_SET(dbp, DB_AM_IN_RENAME); + + /* Copy the file's ID. */ + memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN); + + break; + default: + goto bad_format; + } + + if (FLD_ISSET(meta->metaflags, + DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) + if ((ret = + __partition_init(dbp, meta->metaflags)) != 0) + return (ret); + return (0); + +bad_format: + if (F_ISSET(dbp, DB_AM_RECOVER)) + ret = ENOENT; + else + __db_errx(env, + "__db_meta_setup: %s: unexpected file type or format", + name); + return (ret == 0 ? EINVAL : ret); +} diff --git a/db/db_overflow.c b/db/db_overflow.c new file mode 100644 index 0000000..a718182 --- /dev/null +++ b/db/db_overflow.c @@ -0,0 +1,706 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. 
+ */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/mp.h" + +/* + * Big key/data code. + * + * Big key and data entries are stored on linked lists of pages. 
The initial + * reference is a structure with the total length of the item and the page + * number where it begins. Each entry in the linked list contains a pointer + * to the next page of data, and so on. + */ + +/* + * __db_goff -- + * Get an offpage item. + * + * PUBLIC: int __db_goff __P((DBC *, + * PUBLIC: DBT *, u_int32_t, db_pgno_t, void **, u_int32_t *)); + */ +int +__db_goff(dbc, dbt, tlen, pgno, bpp, bpsz) + DBC *dbc; + DBT *dbt; + u_int32_t tlen; + db_pgno_t pgno; + void **bpp; + u_int32_t *bpsz; +{ + DB *dbp; + DB_MPOOLFILE *mpf; + DB_TXN *txn; + DBC_INTERNAL *cp; + ENV *env; + PAGE *h; + DB_THREAD_INFO *ip; + db_indx_t bytes; + u_int32_t curoff, needed, start; + u_int8_t *p, *src; + int ret; + + dbp = dbc->dbp; + cp = dbc->internal; + env = dbp->env; + ip = dbc->thread_info; + mpf = dbp->mpf; + txn = dbc->txn; + + /* + * Check if the buffer is big enough; if it is not and we are + * allowed to malloc space, then we'll malloc it. If we are + * not (DB_DBT_USERMEM), then we'll set the dbt and return + * appropriately. + */ + if (F_ISSET(dbt, DB_DBT_PARTIAL)) { + start = dbt->doff; + if (start > tlen) + needed = 0; + else if (dbt->dlen > tlen - start) + needed = tlen - start; + else + needed = dbt->dlen; + } else { + start = 0; + needed = tlen; + } + + /* + * If the caller has not requested any data, return success. This + * "early-out" also avoids setting up the streaming optimization when + * no page would be retrieved. If it were removed, the streaming code + * should only initialize when needed is not 0. + */ + if (needed == 0) { + dbt->size = 0; + return (0); + } + + if (F_ISSET(dbt, DB_DBT_USERCOPY)) + goto skip_alloc; + + /* Allocate any necessary memory. 
*/ + if (F_ISSET(dbt, DB_DBT_USERMEM)) { + if (needed > dbt->ulen) { + dbt->size = needed; + return (DB_BUFFER_SMALL); + } + } else if (F_ISSET(dbt, DB_DBT_MALLOC)) { + if ((ret = __os_umalloc(env, needed, &dbt->data)) != 0) + return (ret); + } else if (F_ISSET(dbt, DB_DBT_REALLOC)) { + if ((ret = __os_urealloc(env, needed, &dbt->data)) != 0) + return (ret); + } else if (bpsz != NULL && (*bpsz == 0 || *bpsz < needed)) { + if ((ret = __os_realloc(env, needed, bpp)) != 0) + return (ret); + *bpsz = needed; + dbt->data = *bpp; + } else if (bpp != NULL) + dbt->data = *bpp; + else { + DB_ASSERT(env, + F_ISSET(dbt, + DB_DBT_USERMEM | DB_DBT_MALLOC | DB_DBT_REALLOC) || + bpsz != NULL || bpp != NULL); + return (DB_BUFFER_SMALL); + } + +skip_alloc: + /* Set up a start page in the overflow chain if streaming. */ + if (cp->stream_start_pgno != PGNO_INVALID && + pgno == cp->stream_start_pgno && start >= cp->stream_off && + start < cp->stream_off + P_MAXSPACE(dbp, dbp->pgsize)) { + pgno = cp->stream_curr_pgno; + curoff = cp->stream_off; + } else { + cp->stream_start_pgno = cp->stream_curr_pgno = pgno; + cp->stream_off = curoff = 0; + } + + /* + * Step through the linked list of pages, copying the data on each + * one into the buffer. Never copy more than the total data length. + */ + dbt->size = needed; + for (p = dbt->data; pgno != PGNO_INVALID && needed > 0;) { + if ((ret = __memp_fget(mpf, + &pgno, ip, txn, 0, &h)) != 0) + return (ret); + DB_ASSERT(env, TYPE(h) == P_OVERFLOW); + + /* Check if we need any bytes from this page. */ + if (curoff + OV_LEN(h) >= start) { + bytes = OV_LEN(h); + src = (u_int8_t *)h + P_OVERHEAD(dbp); + if (start > curoff) { + src += start - curoff; + bytes -= start - curoff; + } + if (bytes > needed) + bytes = needed; + if (F_ISSET(dbt, DB_DBT_USERCOPY)) { + /* + * The offset into the DBT is the total size + * less the amount of data still needed. Care + * needs to be taken if doing a partial copy + * beginning at an offset other than 0. 
+ */ + if ((ret = env->dbt_usercopy( + dbt, dbt->size - needed, + src, bytes, DB_USERCOPY_SETDATA)) != 0) { + (void)__memp_fput(mpf, + ip, h, dbp->priority); + return (ret); + } + } else + memcpy(p, src, bytes); + p += bytes; + needed -= bytes; + } + cp->stream_off = curoff; + curoff += OV_LEN(h); + cp->stream_curr_pgno = pgno; + pgno = h->next_pgno; + (void)__memp_fput(mpf, ip, h, dbp->priority); + } + + return (0); +} + +/* + * __db_poff -- + * Put an offpage item. + * + * PUBLIC: int __db_poff __P((DBC *, const DBT *, db_pgno_t *)); + */ +int +__db_poff(dbc, dbt, pgnop) + DBC *dbc; + const DBT *dbt; + db_pgno_t *pgnop; +{ + DB *dbp; + DBT tmp_dbt; + DB_LSN null_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep, *lastp; + db_indx_t pagespace; + db_pgno_t pgno; + u_int32_t space, sz, tlen; + u_int8_t *p; + int ret, t_ret; + + /* + * Allocate pages and copy the key/data item into them. Calculate the + * number of bytes we get for pages we fill completely with a single + * item. + */ + dbp = dbc->dbp; + lastp = NULL; + mpf = dbp->mpf; + pagespace = P_MAXSPACE(dbp, dbp->pgsize); + p = dbt->data; + sz = dbt->size; + + /* + * Check whether we are streaming at the end of the overflow item. + * If so, the last pgno and offset will be cached in the cursor. + */ + if (F_ISSET(dbt, DB_DBT_STREAMING)) { + tlen = dbt->size - dbt->dlen; + pgno = dbc->internal->stream_curr_pgno; + if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, + dbc->txn, DB_MPOOL_DIRTY, &lastp)) != 0) + return (ret); + + /* + * Calculate how much we can write on the last page of the + * overflow item. + */ + DB_ASSERT(dbp->env, + OV_LEN(lastp) == (tlen - dbc->internal->stream_off)); + space = pagespace - OV_LEN(lastp); + + /* Only copy as much data as we have. 
*/ + if (space > dbt->dlen) + space = dbt->dlen; + + if (DBC_LOGGING(dbc)) { + tmp_dbt.data = dbt->data; + tmp_dbt.size = space; + ZERO_LSN(null_lsn); + if ((ret = __db_big_log(dbp, dbc->txn, + &LSN(lastp), 0, DB_APPEND_BIG, pgno, + PGNO_INVALID, PGNO_INVALID, &tmp_dbt, + &LSN(lastp), &null_lsn, &null_lsn)) != 0) + goto err; + } else + LSN_NOT_LOGGED(LSN(lastp)); + + memcpy((u_int8_t *)lastp + P_OVERHEAD(dbp) + OV_LEN(lastp), + dbt->data, space); + OV_LEN(lastp) += space; + sz -= space + dbt->doff; + p += space; + *pgnop = dbc->internal->stream_start_pgno; + } + + ret = 0; + for (; sz > 0; p += pagespace, sz -= pagespace) { + /* + * Reduce pagespace so we terminate the loop correctly and + * don't copy too much data. + */ + if (sz < pagespace) + pagespace = sz; + + /* + * Allocate and initialize a new page and copy all or part of + * the item onto the page. If sz is less than pagespace, we + * have a partial record. + */ + if ((ret = __db_new(dbc, P_OVERFLOW, NULL, &pagep)) != 0) + break; + if (DBC_LOGGING(dbc)) { + tmp_dbt.data = p; + tmp_dbt.size = pagespace; + ZERO_LSN(null_lsn); + if ((ret = __db_big_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_ADD_BIG, PGNO(pagep), + lastp ? PGNO(lastp) : PGNO_INVALID, + PGNO_INVALID, &tmp_dbt, &LSN(pagep), + lastp == NULL ? &null_lsn : &LSN(lastp), + &null_lsn)) != 0) { + (void)__memp_fput(mpf, dbc->thread_info, + pagep, dbc->priority); + goto err; + } + } else + LSN_NOT_LOGGED(LSN(pagep)); + + /* Move LSN onto page. */ + if (lastp != NULL) + LSN(lastp) = LSN(pagep); + + OV_LEN(pagep) = pagespace; + OV_REF(pagep) = 1; + memcpy((u_int8_t *)pagep + P_OVERHEAD(dbp), p, pagespace); + + /* + * If this is the first entry, update the user's info and + * initialize the cursor to allow for streaming of subsequent + * updates. Otherwise, update the entry on the last page + * filled in and release that page. 
+ */ + if (lastp == NULL) { + *pgnop = PGNO(pagep); + dbc->internal->stream_start_pgno = + dbc->internal->stream_curr_pgno = *pgnop; + dbc->internal->stream_off = 0; + } else { + lastp->next_pgno = PGNO(pagep); + pagep->prev_pgno = PGNO(lastp); + if ((ret = __memp_fput(mpf, + dbc->thread_info, lastp, dbc->priority)) != 0) { + lastp = NULL; + goto err; + } + } + lastp = pagep; + } +err: if (lastp != NULL) { + if (ret == 0) { + dbc->internal->stream_curr_pgno = PGNO(lastp); + dbc->internal->stream_off = dbt->size - OV_LEN(lastp); + } + + if ((t_ret = __memp_fput(mpf, dbc->thread_info, lastp, + dbc->priority)) != 0 && ret == 0) + ret = t_ret; + } + return (ret); +} + +/* + * __db_ovref -- + * Decrement the reference count on an overflow page. + * + * PUBLIC: int __db_ovref __P((DBC *, db_pgno_t)); + */ +int +__db_ovref(dbc, pgno) + DBC *dbc; + db_pgno_t pgno; +{ + DB *dbp; + DB_MPOOLFILE *mpf; + PAGE *h; + int ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &h)) != 0) + return (ret); + + if (DBC_LOGGING(dbc)) { + if ((ret = __db_ovref_log(dbp, + dbc->txn, &LSN(h), 0, h->pgno, -1, &LSN(h))) != 0) { + (void)__memp_fput(mpf, + dbc->thread_info, h, dbc->priority); + return (ret); + } + } else + LSN_NOT_LOGGED(LSN(h)); + + /* + * In BDB releases before 4.5, the overflow reference counts were + * incremented when an overflow item was split onto an internal + * page. There was a lock race in that code, and rather than fix + * the race, we changed BDB to copy overflow items when splitting + * them onto internal pages. The code to decrement reference + * counts remains so databases already in the field continue to + * work. + */ + --OV_REF(h); + + return (__memp_fput(mpf, dbc->thread_info, h, dbc->priority)); +} + +/* + * __db_doff -- + * Delete an offpage chain of overflow pages. 
+ * + * PUBLIC: int __db_doff __P((DBC *, db_pgno_t)); + */ +int +__db_doff(dbc, pgno) + DBC *dbc; + db_pgno_t pgno; +{ + DB *dbp; + DBT tmp_dbt; + DB_LSN null_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + + do { + if ((ret = __memp_fget(mpf, &pgno, + dbc->thread_info, dbc->txn, 0, &pagep)) != 0) + return (ret); + + DB_ASSERT(dbp->env, TYPE(pagep) == P_OVERFLOW); + /* + * If it's referenced by more than one key/data item, + * decrement the reference count and return. + */ + if (OV_REF(pagep) > 1) { + (void)__memp_fput(mpf, + dbc->thread_info, pagep, dbc->priority); + return (__db_ovref(dbc, pgno)); + } + + if ((ret = __memp_dirty(mpf, &pagep, + dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0) { + if (pagep != NULL) + (void)__memp_fput(mpf, + dbc->thread_info, pagep, dbc->priority); + return (ret); + } + + if (DBC_LOGGING(dbc)) { + tmp_dbt.data = (u_int8_t *)pagep + P_OVERHEAD(dbp); + tmp_dbt.size = OV_LEN(pagep); + ZERO_LSN(null_lsn); + if ((ret = __db_big_log(dbp, dbc->txn, + &LSN(pagep), 0, DB_REM_BIG, + PGNO(pagep), PREV_PGNO(pagep), + NEXT_PGNO(pagep), &tmp_dbt, + &LSN(pagep), &null_lsn, &null_lsn)) != 0) { + (void)__memp_fput(mpf, + dbc->thread_info, pagep, dbc->priority); + return (ret); + } + } else + LSN_NOT_LOGGED(LSN(pagep)); + pgno = pagep->next_pgno; + OV_LEN(pagep) = 0; + if ((ret = __db_free(dbc, pagep)) != 0) + return (ret); + } while (pgno != PGNO_INVALID); + + return (0); +} + +/* + * __db_moff -- + * Match on overflow pages. + * + * Given a starting page number and a key, return <0, 0, >0 to indicate if the + * key on the page is less than, equal to or greater than the key specified. + * We optimize this by doing chunk at a time comparison unless the user has + * specified a comparison function. In this case, we need to materialize + * the entire object and call their comparison routine. + * + * __db_moff and __db_coff are generic functions useful in searching and + * ordering off page items. 
__db_moff matches an overflow DBT with an offpage + * item. __db_coff compares two offpage items for lexicographic sort order. + * + * PUBLIC: int __db_moff __P((DBC *, const DBT *, db_pgno_t, u_int32_t, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + */ +int +__db_moff(dbc, dbt, pgno, tlen, cmpfunc, cmpp) + DBC *dbc; + const DBT *dbt; + db_pgno_t pgno; + u_int32_t tlen; + int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; +{ + DB *dbp; + DBT local_dbt; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + PAGE *pagep; + void *buf; + u_int32_t bufsize, cmp_bytes, key_left; + u_int8_t *p1, *p2; + int ret; + + dbp = dbc->dbp; + ip = dbc->thread_info; + mpf = dbp->mpf; + + /* + * If there is a user-specified comparison function, build a + * contiguous copy of the key, and call it. + */ + if (cmpfunc != NULL) { + memset(&local_dbt, 0, sizeof(local_dbt)); + buf = NULL; + bufsize = 0; + + if ((ret = __db_goff(dbc, + &local_dbt, tlen, pgno, &buf, &bufsize)) != 0) + return (ret); + /* Pass the key as the first argument */ + *cmpp = cmpfunc(dbp, dbt, &local_dbt); + __os_free(dbp->env, buf); + return (0); + } + + /* While there are both keys to compare. */ + for (*cmpp = 0, p1 = dbt->data, + key_left = dbt->size; key_left > 0 && pgno != PGNO_INVALID;) { + if ((ret = + __memp_fget(mpf, &pgno, ip, dbc->txn, 0, &pagep)) != 0) + return (ret); + + cmp_bytes = OV_LEN(pagep) < key_left ? OV_LEN(pagep) : key_left; + tlen -= cmp_bytes; + key_left -= cmp_bytes; + for (p2 = (u_int8_t *)pagep + P_OVERHEAD(dbp); + cmp_bytes-- > 0; ++p1, ++p2) + if (*p1 != *p2) { + *cmpp = (long)*p1 - (long)*p2; + break; + } + pgno = NEXT_PGNO(pagep); + if ((ret = __memp_fput(mpf, ip, pagep, dbp->priority)) != 0) + return (ret); + if (*cmpp != 0) + return (0); + } + if (key_left > 0) /* DBT is longer than the page key. */ + *cmpp = 1; + else if (tlen > 0) /* DBT is shorter than the page key. 
*/ + *cmpp = -1; + else + *cmpp = 0; + + return (0); +} + +/* + * __db_coff -- + * Match two offpage dbts. + * + * The DBTs must both refer to offpage items. + * The match happens a chunk (page) at a time unless a user defined comparison + * function exists. It is not possible to optimize this comparison away when + * a lexicographic sort order is required on mismatch. + * + * NOTE: For now this function only works for H_OFFPAGE type items. It would + * be simple to extend it for use with B_OVERFLOW type items. It would only + * require extracting the total length, and page number, dependent on the + * DBT type. + * + * PUBLIC: int __db_coff __P((DBC *, const DBT *, const DBT *, + * PUBLIC: int (*)(DB *, const DBT *, const DBT *), int *)); + */ +int +__db_coff(dbc, dbt, match, cmpfunc, cmpp) + DBC *dbc; + const DBT *dbt, *match; + int (*cmpfunc) __P((DB *, const DBT *, const DBT *)), *cmpp; +{ + DB *dbp; + DB_THREAD_INFO *ip; + DB_MPOOLFILE *mpf; + DB_TXN *txn; + DBT local_key, local_match; + PAGE *dbt_pagep, *match_pagep; + db_pgno_t dbt_pgno, match_pgno; + u_int32_t cmp_bytes, dbt_bufsz, dbt_len, match_bufsz; + u_int32_t match_len, max_data, page_sz; + u_int8_t *p1, *p2; + int ret; + void *dbt_buf, *match_buf; + + dbp = dbc->dbp; + ip = dbc->thread_info; + txn = dbc->txn; + mpf = dbp->mpf; + page_sz = dbp->pgsize; + *cmpp = 0; + dbt_buf = match_buf = NULL; + + DB_ASSERT(dbp->env, HPAGE_PTYPE(dbt->data) == H_OFFPAGE); + DB_ASSERT(dbp->env, HPAGE_PTYPE(match->data) == H_OFFPAGE); + + /* Extract potentially unaligned length and pgno fields from DBTs */ + memcpy(&dbt_len, HOFFPAGE_TLEN(dbt->data), sizeof(u_int32_t)); + memcpy(&dbt_pgno, HOFFPAGE_PGNO(dbt->data), sizeof(db_pgno_t)); + memcpy(&match_len, HOFFPAGE_TLEN(match->data), sizeof(u_int32_t)); + memcpy(&match_pgno, HOFFPAGE_PGNO(match->data), sizeof(db_pgno_t)); + max_data = (dbt_len < match_len ? dbt_len : match_len); + + /* + * If there is a custom comparator, fully resolve both DBTs. 
+ * Then call the users comparator. + */ + if (cmpfunc != NULL) { + memset(&local_key, 0, sizeof(local_key)); + memset(&local_match, 0, sizeof(local_match)); + dbt_buf = match_buf = NULL; + dbt_bufsz = match_bufsz = 0; + + if ((ret = __db_goff(dbc, &local_key, dbt_len, + dbt_pgno, &dbt_buf, &dbt_bufsz)) != 0) + goto err1; + if ((ret = __db_goff(dbc, &local_match, match_len, + match_pgno, &match_buf, &match_bufsz)) != 0) + goto err1; + /* The key needs to be the first argument for sort order */ + *cmpp = cmpfunc(dbp, &local_key, &local_match); + +err1: if (dbt_buf != NULL) + __os_free(dbp->env, dbt_buf); + if (match_buf != NULL) + __os_free(dbp->env, match_buf); + return (ret); + } + + /* Match the offpage DBTs a page at a time. */ + while (dbt_pgno != PGNO_INVALID && match_pgno != PGNO_INVALID) { + if ((ret = + __memp_fget(mpf, &dbt_pgno, ip, txn, 0, &dbt_pagep)) != 0) + return (ret); + if ((ret = + __memp_fget(mpf, &match_pgno, + ip, txn, 0, &match_pagep)) != 0) { + (void)__memp_fput( + mpf, ip, dbt_pagep, DB_PRIORITY_UNCHANGED); + return (ret); + } + cmp_bytes = page_sz < max_data ? page_sz : max_data; + for (p1 = (u_int8_t *)dbt_pagep + P_OVERHEAD(dbp), + p2 = (u_int8_t *)match_pagep + P_OVERHEAD(dbp); + cmp_bytes-- > 0; ++p1, ++p2) + if (*p1 != *p2) { + *cmpp = (long)*p1 - (long)*p2; + break; + } + + dbt_pgno = NEXT_PGNO(dbt_pagep); + match_pgno = NEXT_PGNO(match_pagep); + max_data -= page_sz; + if ((ret = __memp_fput(mpf, + ip, dbt_pagep, DB_PRIORITY_UNCHANGED)) != 0) { + (void)__memp_fput(mpf, + ip, match_pagep, DB_PRIORITY_UNCHANGED); + return (ret); + } + if ((ret = __memp_fput(mpf, + ip, match_pagep, DB_PRIORITY_UNCHANGED)) != 0) + return (ret); + if (*cmpp != 0) + return (0); + } + + /* If a lexicographic mismatch was found, then the result has already + * been returned. If the DBTs matched, consider the lengths of the + * items, and return appropriately. + */ + if (dbt_len > match_len) /* DBT is longer than the match key. 
*/ + *cmpp = 1; + else if (match_len > dbt_len) /* DBT is shorter than the match key. */ + *cmpp = -1; + else + *cmpp = 0; + + return (0); + +} diff --git a/db/db_ovfl_vrfy.c b/db/db_ovfl_vrfy.c new file mode 100644 index 0000000..fdd0201 --- /dev/null +++ b/db/db_ovfl_vrfy.c @@ -0,0 +1,409 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/db_verify.h" +#include "dbinc/mp.h" + +/* + * __db_vrfy_overflow -- + * Verify overflow page. + * + * PUBLIC: int __db_vrfy_overflow __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__db_vrfy_overflow(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + VRFY_PAGEINFO *pip; + int isbad, ret, t_ret; + + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + pip->refcount = OV_REF(h); + if (pip->refcount < 1) { + EPRINT((dbp->env, + "Page %lu: overflow page has zero reference count", + (u_long)pgno)); + isbad = 1; + } + + /* Just store for now. */ + pip->olen = HOFFSET(h); + +err: if ((t_ret = __db_vrfy_putpageinfo(dbp->env, vdp, pip)) != 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __db_vrfy_ovfl_structure -- + * Walk a list of overflow pages, avoiding cycles and marking + * pages seen. 
 *
 * PUBLIC: int __db_vrfy_ovfl_structure
 * PUBLIC:     __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, u_int32_t));
 */
int
__db_vrfy_ovfl_structure(dbp, vdp, pgno, tlen, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	u_int32_t tlen;		/* Expected total length of the overflow item. */
	u_int32_t flags;
{
	DB *pgset;
	ENV *env;
	VRFY_PAGEINFO *pip;
	db_pgno_t next, prev;
	int isbad, ret, seen_cnt, t_ret;
	u_int32_t refcount;

	env = dbp->env;
	pgset = vdp->pgset;
	DB_ASSERT(env, pgset != NULL);
	isbad = 0;

	/* This shouldn't happen, but just to be sure. */
	if (!IS_VALID_PGNO(pgno))
		return (DB_VERIFY_BAD);

	/*
	 * Check the first prev_pgno; it ought to be PGNO_INVALID,
	 * since there's no prev page.
	 */
	if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0)
		return (ret);

	/* The refcount is stored on the first overflow page. */
	refcount = pip->refcount;

	if (pip->type != P_OVERFLOW) {
		EPRINT((env,
		    "Page %lu: overflow page of invalid type %lu",
		    (u_long)pgno, (u_long)pip->type));
		ret = DB_VERIFY_BAD;
		goto err;		/* Unsafe to continue. */
	}

	prev = pip->prev_pgno;
	if (prev != PGNO_INVALID) {
		EPRINT((env,
	    "Page %lu: first page in overflow chain has a prev_pgno %lu",
		    (u_long)pgno, (u_long)prev));
		isbad = 1;
	}

	for (;;) {
		/*
		 * We may have seen this page elsewhere, if the overflow entry
		 * has been promoted to an internal page; we just want to
		 * make sure that each overflow page is seen exactly as many
		 * times as its refcount dictates.
		 *
		 * Note that this code also serves to keep us from looping
		 * infinitely if there's a cycle in an overflow chain.
		 */
		if ((ret = __db_vrfy_pgset_get(pgset,
		    vdp->thread_info, pgno, &seen_cnt)) != 0)
			goto err;
		if ((u_int32_t)seen_cnt > refcount) {
			EPRINT((env,
	    "Page %lu: encountered too many times in overflow traversal",
			    (u_long)pgno));
			ret = DB_VERIFY_BAD;
			goto err;
		}
		if ((ret =
		    __db_vrfy_pgset_inc(pgset, vdp->thread_info, pgno)) != 0)
			goto err;

		/*
		 * Each overflow page can be referenced multiple times,
		 * because it's possible for overflow Btree keys to get
		 * promoted to internal pages.  We want to make sure that
		 * each page is referenced from a Btree leaf (or Hash data
		 * page, which we consider a "leaf" here) exactly once; if
		 * the parent was a leaf, set a flag to indicate that we've
		 * seen this page in a leaf context.
		 *
		 * If the parent is not a leaf--in which case it's a Btree
		 * internal page--we don't need to bother doing any further
		 * verification, as we'll do it when we hit the leaf (or
		 * complain that we never saw the leaf).  Only the first
		 * page in an overflow chain should ever have a refcount
		 * greater than 1, and the combination of the LEAFSEEN check
		 * and the fact that we bail after the first page for
		 * non-leaves should ensure this.
		 *
		 * Note that each "child" of a page, such as an overflow page,
		 * is stored and verified in a structure check exactly once,
		 * so this code does not need to contend with the fact that
		 * overflow chains used as Btree duplicate keys may be
		 * referenced multiply from a single Btree leaf page.
		 */
		if (LF_ISSET(DB_ST_OVFL_LEAF)) {
			if (F_ISSET(pip, VRFY_OVFL_LEAFSEEN)) {
				EPRINT((env,
	    "Page %lu: overflow page linked twice from leaf or data page",
				    (u_long)pgno));
				ret = DB_VERIFY_BAD;
				goto err;
			}
			F_SET(pip, VRFY_OVFL_LEAFSEEN);
		}

		/*
		 * We want to verify each overflow chain only once, and
		 * although no chain should be linked more than once from a
		 * leaf page, we can't guarantee that it'll be linked that
		 * once if it's linked from an internal page and the key
		 * is gone.
		 *
		 * seen_cnt is the number of times we'd encountered this page
		 * before calling this function.
		 */
		if (seen_cnt == 0) {
			/*
			 * Keep a running tab on how much of the item we've
			 * seen.
			 */
			tlen -= pip->olen;

			/* Send the application feedback about our progress. */
			if (!LF_ISSET(DB_SALVAGE))
				__db_vrfy_struct_feedback(dbp, vdp);
		} else
			goto done;

		next = pip->next_pgno;

		/* Are we there yet? */
		if (next == PGNO_INVALID)
			break;

		/*
		 * We've already checked this when we saved it, but just
		 * to be sure...
		 */
		if (!IS_VALID_PGNO(next)) {
			EPRINT((env,
			    "Page %lu: bad next_pgno %lu on overflow page",
			    (u_long)pgno, (u_long)next));
			ret = DB_VERIFY_BAD;
			goto err;
		}

		/*
		 * Release the current pageinfo and fetch the next one.
		 * NOTE(review): if the put succeeds but the get fails, we
		 * return with pip stale; the err-path put is skipped here
		 * by design (early return) -- confirm against upstream.
		 */
		if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 ||
		    (ret = __db_vrfy_getpageinfo(vdp, next, &pip)) != 0)
			return (ret);
		if (pip->prev_pgno != pgno) {
			EPRINT((env,
	    "Page %lu: bad prev_pgno %lu on overflow page (should be %lu)",
			    (u_long)next, (u_long)pip->prev_pgno,
			    (u_long)pgno));
			isbad = 1;
			/*
			 * It's safe to continue because we have separate
			 * cycle detection.
			 */
		}

		pgno = next;
	}

	/* Anything left over means the chain is shorter than the item. */
	if (tlen > 0) {
		isbad = 1;
		EPRINT((env,
		    "Page %lu: overflow item incomplete", (u_long)pgno));
	}

done:
err:	if ((t_ret =
	    __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
		ret = t_ret;
	return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret);
}

/*
 * __db_safe_goff --
 *	Get an overflow item, very carefully, from an untrusted database,
 *	in the context of the salvager.
 *
 * PUBLIC: int __db_safe_goff __P((DB *, VRFY_DBINFO *,
 * PUBLIC:     db_pgno_t, DBT *, void *, u_int32_t *, u_int32_t));
 */
int
__db_safe_goff(dbp, vdp, pgno, dbt, buf, bufsz, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	DBT *dbt;
	void *buf;		/* Actually a (u_int8_t **): realloc'd in place. */
	u_int32_t *bufsz;	/* In/out: current allocated size of *buf. */
	u_int32_t flags;
{
	DB_MPOOLFILE *mpf;
	PAGE *h;
	int ret, t_ret;
	u_int32_t bytesgot, bytes;
	u_int8_t *src, *dest;

	mpf = dbp->mpf;
	h = NULL;
	ret = t_ret = 0;
	bytesgot = bytes = 0;

	DB_ASSERT(dbp->env, bufsz != NULL);

	/*
	 * Back up to the start of the overflow chain (if necessary) via the
	 * prev pointer of the overflow page.  This guarantees we transverse
	 * the longest possible chains of overflow pages and won't be called
	 * again with a pgno earlier in the chain, stepping on ourselves.
	 */
	for (;;) {
		if ((ret = __memp_fget(
		    mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
			return (ret);

		if (PREV_PGNO(h) == PGNO_INVALID ||
		    !IS_VALID_PGNO(PREV_PGNO(h)))
			break;

		pgno = PREV_PGNO(h);

		/* Release the current page before stepping backwards. */
		if ((ret = __memp_fput(mpf,
		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
			return (ret);
	}
	if ((ret = __memp_fput(
	    mpf, vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
		return (ret);

	h = NULL;

	while ((pgno != PGNO_INVALID) && (IS_VALID_PGNO(pgno))) {
		/*
		 * Mark that we're looking at this page; if we've seen it
		 * already, quit.
		 */
		if ((ret = __db_salvage_markdone(vdp, pgno)) != 0)
			break;

		if ((ret = __memp_fget(mpf, &pgno,
		    vdp->thread_info, NULL, 0, &h)) != 0)
			break;

		/*
		 * Make sure it's really an overflow page, unless we're
		 * being aggressive, in which case we pretend it is.
		 */
		if (!LF_ISSET(DB_AGGRESSIVE) && TYPE(h) != P_OVERFLOW) {
			ret = DB_VERIFY_BAD;
			break;
		}

		src = (u_int8_t *)h + P_OVERHEAD(dbp);
		bytes = OV_LEN(h);

		/* Clamp a corrupt length to what fits on the page. */
		if (bytes + P_OVERHEAD(dbp) > dbp->pgsize)
			bytes = dbp->pgsize - P_OVERHEAD(dbp);

		/*
		 * Realloc if buf is too small
		 */
		if (bytesgot + bytes > *bufsz) {
			if ((ret =
			    __os_realloc(dbp->env, bytesgot + bytes, buf)) != 0)
				break;
			*bufsz = bytesgot + bytes;
		}

		dest = *(u_int8_t **)buf + bytesgot;
		bytesgot += bytes;

		memcpy(dest, src, bytes);

		pgno = NEXT_PGNO(h);

		if ((ret = __memp_fput(mpf,
		    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0)
			break;
		h = NULL;
	}

	/*
	 * If we're being aggressive, salvage a partial datum if there
	 * was an error somewhere along the way.
	 */
	if (ret == 0 || LF_ISSET(DB_AGGRESSIVE)) {
		dbt->size = bytesgot;
		dbt->data = *(void **)buf;
	}

	/* If we broke out on error, don't leave pages pinned. */
	if (h != NULL && (t_ret = __memp_fput(mpf,
	    vdp->thread_info, h, DB_PRIORITY_UNCHANGED)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
diff --git a/db/db_pr.c b/db/db_pr.c
new file mode 100644
index 0000000..69fb7c7
--- /dev/null
+++ b/db/db_pr.c
@@ -0,0 +1,1659 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2009 Oracle.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/hash.h"
#include "dbinc/mp.h"
#include "dbinc/partition.h"
#include "dbinc/qam.h"
#include "dbinc/db_verify.h"

/*
 * __db_loadme --
 *	A nice place to put a breakpoint.
 *
 * PUBLIC: void __db_loadme __P((void));
 */
void
__db_loadme()
{
	pid_t pid;

	/* Touch the thread-id API so this frame isn't optimized away. */
	__os_id(NULL, &pid, NULL);
}

#ifdef HAVE_STATISTICS
static int	 __db_bmeta __P((DB *, BTMETA *, u_int32_t));
static int	 __db_hmeta __P((DB *, HMETA *, u_int32_t));
static void	 __db_meta __P((DB *, DBMETA *, FN const *, u_int32_t));
static const char *__db_pagetype_to_string __P((u_int32_t));
static void	 __db_prdb __P((DB *, u_int32_t));
static void	 __db_proff __P((ENV *, DB_MSGBUF *, void *));
static int	 __db_prtree __P((DB *, DB_TXN *, u_int32_t));
static int	 __db_qmeta __P((DB *, QMETA *, u_int32_t));

/*
 * __db_dumptree --
 *	Dump the tree to a file.
 *
 *	"op" is a string of single-character options: 'a' dump every page,
 *	'h' dump the header only (no flag set), 'r' recovery-test mode.
 *	If "name" is non-NULL, output is redirected to that file for the
 *	duration of the dump.
 *
 * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *, char *, char *));
 */
int
__db_dumptree(dbp, txn, op, name)
	DB *dbp;
	DB_TXN *txn;
	char *op, *name;
{
	ENV *env;
	FILE *fp, *orig_fp;
	u_int32_t flags;
	int ret;

	env = dbp->env;

	for (flags = 0; *op != '\0'; ++op)
		switch (*op) {
		case 'a':
			LF_SET(DB_PR_PAGE);
			break;
		case 'h':
			break;
		case 'r':
			LF_SET(DB_PR_RECOVERYTEST);
			break;
		default:
			return (EINVAL);
		}

	if (name != NULL) {
		if ((fp = fopen(name, "w")) == NULL)
			return (__os_get_errno());

		/*
		 * NOTE(review): saved via dbp->dbenv but restored below via
		 * env->dbenv -- the same DB_ENV object, so behavior matches,
		 * but the asymmetry is worth confirming upstream.
		 */
		orig_fp = dbp->dbenv->db_msgfile;
		dbp->dbenv->db_msgfile = fp;
	} else
		fp = orig_fp = NULL;

	__db_prdb(dbp, flags);

	__db_msg(env, "%s", DB_GLOBAL(db_line));

	ret = __db_prtree(dbp, txn, flags);

	if (fp != NULL) {
		(void)fclose(fp);
		env->dbenv->db_msgfile = orig_fp;
	}

	return (ret);
}

static const FN __db_flags_fn[] = {
	{ DB_AM_CHKSUM,			"checksumming" },
	{ DB_AM_COMPENSATE,		"created by compensating transaction" },
	{ DB_AM_CREATED,		"database created" },
	{ DB_AM_CREATED_MSTR,		"encompassing file created" },
	{ DB_AM_DBM_ERROR,		"dbm/ndbm error" },
	{ DB_AM_DELIMITER,		"variable length" },
	{ DB_AM_DISCARD,		"discard cached pages" },
	{ DB_AM_DUP,			"duplicates" },
	{ DB_AM_DUPSORT,		"sorted duplicates" },
	{ DB_AM_ENCRYPT,		"encrypted" },
	{ DB_AM_FIXEDLEN,		"fixed-length records" },
	{ DB_AM_INMEM,			"in-memory" },
	{ DB_AM_IN_RENAME,		"file is being renamed" },
	{ DB_AM_NOT_DURABLE,		"changes not logged" },
	{ DB_AM_OPEN_CALLED,		"open called" },
	{ DB_AM_PAD,			"pad value" },
	{ DB_AM_PGDEF,			"default page size" },
	{ DB_AM_RDONLY,			"read-only" },
	{ DB_AM_READ_UNCOMMITTED,	"read-uncommitted" },
	{ DB_AM_RECNUM,			"Btree record numbers" },
	{ DB_AM_RECOVER,		"opened for recovery" },
	{ DB_AM_RENUMBER,		"renumber" },
	{ DB_AM_REVSPLITOFF,		"no reverse splits" },
	{ DB_AM_SECONDARY,		"secondary" },
	{ DB_AM_SNAPSHOT,		"load on open" },
	{ DB_AM_SUBDB,			"subdatabases" },
	{ DB_AM_SWAP,			"needswap" },
	{ DB_AM_TXN,			"transactional" },
	{ DB_AM_VERIFYING,		"verifier" },
	{ 0,				NULL }
};

/*
 * __db_get_flags_fn --
 *	Return the __db_flags_fn array.
 *
 * PUBLIC: const FN * __db_get_flags_fn __P((void));
 */
const FN *
__db_get_flags_fn()
{
	return (__db_flags_fn);
}

/*
 * __db_prdb --
 *	Print out the DB structure information.
 */
static void
__db_prdb(dbp, flags)
	DB *dbp;
	u_int32_t flags;
{
	BTREE *bt;
	DB_MSGBUF mb;
	ENV *env;
	HASH *h;
	QUEUE *q;

	env = dbp->env;

	DB_MSGBUF_INIT(&mb);
	__db_msg(env, "In-memory DB structure:");
	__db_msgadd(env, &mb, "%s: %#lx",
	    __db_dbtype_to_string(dbp->type), (u_long)dbp->flags);
	__db_prflags(env, &mb, dbp->flags, __db_flags_fn, " (", ")");
	DB_MSGBUF_FLUSH(env, &mb);

	switch (dbp->type) {
	case DB_BTREE:
	case DB_RECNO:
		bt = dbp->bt_internal;
		__db_msg(env, "bt_meta: %lu bt_root: %lu",
		    (u_long)bt->bt_meta, (u_long)bt->bt_root);
		__db_msg(env, "bt_minkey: %lu", (u_long)bt->bt_minkey);
		/* Function pointers change run to run; skip in diff mode. */
		if (!LF_ISSET(DB_PR_RECOVERYTEST))
			__db_msg(env, "bt_compare: %#lx bt_prefix: %#lx",
			    P_TO_ULONG(bt->bt_compare),
			    P_TO_ULONG(bt->bt_prefix));
#ifdef HAVE_COMPRESSION
		if (!LF_ISSET(DB_PR_RECOVERYTEST))
			__db_msg(env, "bt_compress: %#lx bt_decompress: %#lx",
			    P_TO_ULONG(bt->bt_compress),
			    P_TO_ULONG(bt->bt_decompress));
#endif
		__db_msg(env, "bt_lpgno: %lu", (u_long)bt->bt_lpgno);
		if (dbp->type == DB_RECNO) {
			__db_msg(env,
	    "re_pad: %#lx re_delim: %#lx re_len: %lu re_source: %s",
			    (u_long)bt->re_pad, (u_long)bt->re_delim,
			    (u_long)bt->re_len,
			    bt->re_source == NULL ? "" : bt->re_source);
			__db_msg(env,
			    "re_modified: %d re_eof: %d re_last: %lu",
			    bt->re_modified, bt->re_eof, (u_long)bt->re_last);
		}
		break;
	case DB_HASH:
		h = dbp->h_internal;
		__db_msg(env, "meta_pgno: %lu", (u_long)h->meta_pgno);
		__db_msg(env, "h_ffactor: %lu", (u_long)h->h_ffactor);
		__db_msg(env, "h_nelem: %lu", (u_long)h->h_nelem);
		if (!LF_ISSET(DB_PR_RECOVERYTEST))
			__db_msg(env, "h_hash: %#lx", P_TO_ULONG(h->h_hash));
		break;
	case DB_QUEUE:
		q = dbp->q_internal;
		__db_msg(env, "q_meta: %lu", (u_long)q->q_meta);
		__db_msg(env, "q_root: %lu", (u_long)q->q_root);
		__db_msg(env, "re_pad: %#lx re_len: %lu",
		    (u_long)q->re_pad, (u_long)q->re_len);
		__db_msg(env, "rec_page: %lu", (u_long)q->rec_page);
		__db_msg(env, "page_ext: %lu", (u_long)q->page_ext);
		break;
	case DB_UNKNOWN:
	default:
		break;
	}
}

/*
 * __db_prtree --
 *	Print out the entire tree.
 */
static int
__db_prtree(dbp, txn, flags)
	DB *dbp;
	DB_TXN *txn;
	u_int32_t flags;
{
	DB_MPOOLFILE *mpf;
	PAGE *h;
	db_pgno_t i, last;
	int ret;

	mpf = dbp->mpf;

	/* Queue databases have their own page-walk routine. */
	if (dbp->type == DB_QUEUE)
		return (__db_prqueue(dbp, flags));

	/*
	 * Find out the page number of the last page in the database, then
	 * dump each page.
	 */
	if ((ret = __memp_get_last_pgno(mpf, &last)) != 0)
		return (ret);
	for (i = 0; i <= last; ++i) {
		if ((ret = __memp_fget(mpf, &i, NULL, txn, 0, &h)) != 0)
			return (ret);
		/* Per-page errors are reported inline, not propagated. */
		(void)__db_prpage(dbp, h, flags);
		if ((ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0)
			return (ret);
	}

	return (0);
}

/*
 * __db_meta --
 *	Print out common metadata information.
 */
static void
__db_meta(dbp, dbmeta, fn, flags)
	DB *dbp;
	DBMETA *dbmeta;
	FN const *fn;		/* Access-method-specific flag names, or NULL. */
	u_int32_t flags;
{
	DB_MPOOLFILE *mpf;
	DB_MSGBUF mb;
	ENV *env;
	PAGE *h;
	db_pgno_t pgno;
	u_int8_t *p;
	int cnt, ret;
	const char *sep;

	env = dbp->env;
	mpf = dbp->mpf;
	DB_MSGBUF_INIT(&mb);

	__db_msg(env, "\tmagic: %#lx", (u_long)dbmeta->magic);
	__db_msg(env, "\tversion: %lu", (u_long)dbmeta->version);
	__db_msg(env, "\tpagesize: %lu", (u_long)dbmeta->pagesize);
	__db_msg(env, "\ttype: %lu", (u_long)dbmeta->type);
	__db_msg(env, "\tmetaflags %#lx", (u_long)dbmeta->metaflags);
	__db_msg(env, "\tkeys: %lu\trecords: %lu",
	    (u_long)dbmeta->key_count, (u_long)dbmeta->record_count);
	if (dbmeta->nparts)
		__db_msg(env, "\tnparts: %lu", (u_long)dbmeta->nparts);

	/*
	 * If we're doing recovery testing, don't display the free list,
	 * it may have changed and that makes the dump diff not work.
	 */
	if (!LF_ISSET(DB_PR_RECOVERYTEST)) {
		__db_msgadd(
		    env, &mb, "\tfree list: %lu", (u_long)dbmeta->free);
		/*
		 * Walk the free list.  Each iteration fetches the page
		 * for the current pgno and prints its successor, so the
		 * terminating PGNO_INVALID is printed last by design.
		 */
		for (pgno = dbmeta->free,
		    cnt = 0, sep = ", "; pgno != PGNO_INVALID;) {
			if ((ret = __memp_fget(mpf,
			    &pgno, NULL, NULL, 0, &h)) != 0) {
				DB_MSGBUF_FLUSH(env, &mb);
				__db_msg(env,
			    "Unable to retrieve free-list page: %lu: %s",
				    (u_long)pgno, db_strerror(ret));
				break;
			}
			pgno = h->next_pgno;
			(void)__memp_fput(mpf, NULL, h, dbp->priority);
			__db_msgadd(env, &mb, "%s%lu", sep, (u_long)pgno);
			/* Start a fresh line every ten page numbers. */
			if (++cnt % 10 == 0) {
				DB_MSGBUF_FLUSH(env, &mb);
				cnt = 0;
				sep = "\t";
			} else
				sep = ", ";
		}
		DB_MSGBUF_FLUSH(env, &mb);
		__db_msg(env, "\tlast_pgno: %lu", (u_long)dbmeta->last_pgno);
	}

	if (fn != NULL) {
		DB_MSGBUF_FLUSH(env, &mb);
		__db_msgadd(env, &mb, "\tflags: %#lx", (u_long)dbmeta->flags);
		__db_prflags(env, &mb, dbmeta->flags, fn, " (", ")");
	}

	/* Print the unique file ID, one hex byte at a time. */
	DB_MSGBUF_FLUSH(env, &mb);
	__db_msgadd(env, &mb, "\tuid: ");
	for (p = (u_int8_t *)dbmeta->uid,
	    cnt = 0; cnt < DB_FILE_ID_LEN; ++cnt) {
		__db_msgadd(env, &mb, "%x", *p++);
		if (cnt < DB_FILE_ID_LEN - 1)
			__db_msgadd(env, &mb, " ");
	}
	DB_MSGBUF_FLUSH(env, &mb);
}

/*
 * __db_bmeta --
 *	Print out the btree meta-data page.
 */
static int
__db_bmeta(dbp, h, flags)
	DB *dbp;
	BTMETA *h;
	u_int32_t flags;
{
	static const FN fn[] = {
		{ BTM_DUP,	"duplicates" },
		{ BTM_RECNO,	"recno" },
		{ BTM_RECNUM,	"btree:recnum" },
		{ BTM_FIXEDLEN,	"recno:fixed-length" },
		{ BTM_RENUMBER,	"recno:renumber" },
		{ BTM_SUBDB,	"multiple-databases" },
		{ BTM_DUPSORT,	"sorted duplicates" },
		{ BTM_COMPRESS,	"compressed" },
		{ 0,		NULL }
	};
	ENV *env;

	env = dbp->env;

	__db_meta(dbp, (DBMETA *)h, fn, flags);

	__db_msg(env, "\tminkey: %lu", (u_long)h->minkey);
	if (dbp->type == DB_RECNO)
		__db_msg(env, "\tre_len: %#lx re_pad: %#lx",
		    (u_long)h->re_len, (u_long)h->re_pad);
	__db_msg(env, "\troot: %lu", (u_long)h->root);

	return (0);
}

/*
 * __db_hmeta --
 *	Print out the hash meta-data page.
 */
static int
__db_hmeta(dbp, h, flags)
	DB *dbp;
	HMETA *h;
	u_int32_t flags;
{
	static const FN fn[] = {
		{ DB_HASH_DUP,		"duplicates" },
		{ DB_HASH_SUBDB,	"multiple-databases" },
		{ DB_HASH_DUPSORT,	"sorted duplicates" },
		{ 0,			NULL }
	};
	ENV *env;
	DB_MSGBUF mb;
	int i;

	env = dbp->env;
	DB_MSGBUF_INIT(&mb);

	__db_meta(dbp, (DBMETA *)h, fn, flags);

	__db_msg(env, "\tmax_bucket: %lu", (u_long)h->max_bucket);
	__db_msg(env, "\thigh_mask: %#lx", (u_long)h->high_mask);
	__db_msg(env, "\tlow_mask: %#lx", (u_long)h->low_mask);
	__db_msg(env, "\tffactor: %lu", (u_long)h->ffactor);
	__db_msg(env, "\tnelem: %lu", (u_long)h->nelem);
	__db_msg(env, "\th_charkey: %#lx", (u_long)h->h_charkey);
	__db_msgadd(env, &mb, "\tspare points: ");
	for (i = 0; i < NCACHED; i++)
		__db_msgadd(env, &mb, "%lu ", (u_long)h->spares[i]);
	DB_MSGBUF_FLUSH(env, &mb);

	return (0);
}

/*
 * __db_qmeta --
 *	Print out the queue meta-data page.
 */
static int
__db_qmeta(dbp, h, flags)
	DB *dbp;
	QMETA *h;
	u_int32_t flags;
{
	ENV *env;

	env = dbp->env;

	__db_meta(dbp, (DBMETA *)h, NULL, flags);

	__db_msg(env, "\tfirst_recno: %lu", (u_long)h->first_recno);
	__db_msg(env, "\tcur_recno: %lu", (u_long)h->cur_recno);
	__db_msg(env, "\tre_len: %#lx re_pad: %lu",
	    (u_long)h->re_len, (u_long)h->re_pad);
	__db_msg(env, "\trec_page: %lu", (u_long)h->rec_page);
	__db_msg(env, "\tpage_ext: %lu", (u_long)h->page_ext);

	return (0);
}

/*
 * __db_prnpage
 *	-- Print out a specific page.
 *
 * PUBLIC: int __db_prnpage __P((DB *, DB_TXN *, db_pgno_t));
 */
int
__db_prnpage(dbp, txn, pgno)
	DB *dbp;
	DB_TXN *txn;
	db_pgno_t pgno;
{
	DB_MPOOLFILE *mpf;
	PAGE *h;
	int ret, t_ret;

	mpf = dbp->mpf;

	if ((ret = __memp_fget(mpf, &pgno, NULL, txn, 0, &h)) != 0)
		return (ret);

	ret = __db_prpage(dbp, h, DB_PR_PAGE);

	/* Always unpin the page, preserving the first error. */
	if ((t_ret = __memp_fput(mpf, NULL, h, dbp->priority)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __db_prpage
 *	-- Print out a page.
 *
 * PUBLIC: int __db_prpage __P((DB *, PAGE *, u_int32_t));
 */
int
__db_prpage(dbp, h, flags)
	DB *dbp;
	PAGE *h;
	u_int32_t flags;
{
	BINTERNAL *bi;
	BKEYDATA *bk;
	DB_MSGBUF mb;
	ENV *env;
	HOFFPAGE a_hkd;
	QAMDATA *qp, *qep;
	RINTERNAL *ri;
	db_indx_t dlen, len, i, *inp;
	db_pgno_t pgno;
	db_recno_t recno;
	u_int32_t pagesize, qlen;
	u_int8_t *ep, *hk, *p;
	int deleted, ret;
	const char *s;
	void *sp;

	env = dbp->env;
	DB_MSGBUF_INIT(&mb);

	/*
	 * If we're doing recovery testing and this page is P_INVALID,
	 * assume it's a page that's on the free list, and don't display it.
	 */
	if (LF_ISSET(DB_PR_RECOVERYTEST) && TYPE(h) == P_INVALID)
		return (0);

	if ((s = __db_pagetype_to_string(TYPE(h))) == NULL) {
		__db_msg(env, "ILLEGAL PAGE TYPE: page: %lu type: %lu",
		    (u_long)h->pgno, (u_long)TYPE(h));
		return (EINVAL);
	}

	/*
	 * !!!
	 * Find out the page size.  We don't want to do it the "right" way,
	 * by reading the value from the meta-data page, that's going to be
	 * slow.  Reach down into the mpool region.
	 */
	pagesize = (u_int32_t)dbp->mpf->mfp->stat.st_pagesize;

	/* Page number, page type. */
	__db_msgadd(env, &mb, "page %lu: %s:", (u_long)h->pgno, s);

	/*
	 * LSNs on a metadata page will be different from the original after an
	 * abort, in some cases.  Don't display them if we're testing recovery.
	 */
	if (!LF_ISSET(DB_PR_RECOVERYTEST) ||
	    (TYPE(h) != P_BTREEMETA && TYPE(h) != P_HASHMETA &&
	    TYPE(h) != P_QAMMETA && TYPE(h) != P_QAMDATA))
		__db_msgadd(env, &mb, " LSN [%lu][%lu]:",
		    (u_long)LSN(h).file, (u_long)LSN(h).offset);

	/*
	 * Page level (only applicable for Btree/Recno, but we always display
	 * it, for no particular reason.
	 */
	__db_msgadd(env, &mb, " level %lu", (u_long)h->level);

	/* Record count. */
	if (TYPE(h) == P_IBTREE ||
	    TYPE(h) == P_IRECNO || (TYPE(h) == P_LRECNO &&
	    h->pgno == ((BTREE *)dbp->bt_internal)->bt_root))
		__db_msgadd(env, &mb, " records: %lu", (u_long)RE_NREC(h));
	DB_MSGBUF_FLUSH(env, &mb);

	switch (TYPE(h)) {
	case P_BTREEMETA:
		return (__db_bmeta(dbp, (BTMETA *)h, flags));
	case P_HASHMETA:
		return (__db_hmeta(dbp, (HMETA *)h, flags));
	case P_QAMMETA:
		return (__db_qmeta(dbp, (QMETA *)h, flags));
	case P_QAMDATA:			/* Should be meta->start. */
		if (!LF_ISSET(DB_PR_PAGE))
			return (0);

		qlen = ((QUEUE *)dbp->q_internal)->re_len;
		recno = (h->pgno - 1) * QAM_RECNO_PER_PAGE(dbp) + 1;
		i = 0;
		qep = (QAMDATA *)((u_int8_t *)h + pagesize - qlen);
		for (qp = QAM_GET_RECORD(dbp, h, i); qp < qep;
		    recno++, i++, qp = QAM_GET_RECORD(dbp, h, i)) {
			if (!F_ISSET(qp, QAM_SET))
				continue;

			/* 'D' marks a record that is set but not valid. */
			__db_msgadd(env, &mb, "%s",
			    F_ISSET(qp, QAM_VALID) ? "\t" : " D");
			__db_msgadd(env, &mb, "[%03lu] %4lu ", (u_long)recno,
			    (u_long)((u_int8_t *)qp - (u_int8_t *)h));
			__db_prbytes(env, &mb, qp->data, qlen);
		}
		return (0);
	default:
		break;
	}

	s = "\t";
	if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) {
		__db_msgadd(env, &mb, "%sprev: %4lu next: %4lu",
		    s, (u_long)PREV_PGNO(h), (u_long)NEXT_PGNO(h));
		s = " ";
	}
	if (TYPE(h) == P_OVERFLOW) {
		__db_msgadd(env, &mb,
		    "%sref cnt: %4lu ", s, (u_long)OV_REF(h));
		__db_prbytes(env,
		    &mb, (u_int8_t *)h + P_OVERHEAD(dbp), OV_LEN(h));
		return (0);
	}
	__db_msgadd(env, &mb, "%sentries: %4lu", s, (u_long)NUM_ENT(h));
	__db_msgadd(env, &mb, " offset: %4lu", (u_long)HOFFSET(h));
	DB_MSGBUF_FLUSH(env, &mb);

	if (TYPE(h) == P_INVALID || !LF_ISSET(DB_PR_PAGE))
		return (0);

	ret = 0;
	inp = P_INP(dbp, h);
	for (i = 0; i < NUM_ENT(h); i++) {
		/* Sanity-check the entry offset before dereferencing it. */
		if ((uintptr_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) <
		    (uintptr_t)(P_OVERHEAD(dbp)) ||
		    (size_t)(P_ENTRY(dbp, h, i) - (u_int8_t *)h) >= pagesize) {
			__db_msg(env,
			    "ILLEGAL PAGE OFFSET: indx: %lu of %lu",
			    (u_long)i, (u_long)inp[i]);
			ret = EINVAL;
			continue;
		}
		deleted = 0;
		switch (TYPE(h)) {
		case P_HASH_UNSORTED:
		case P_HASH:
		case P_IBTREE:
		case P_IRECNO:
			sp = P_ENTRY(dbp, h, i);
			break;
		case P_LBTREE:
			/* On leaf Btree pages, even indices are keys. */
			sp = P_ENTRY(dbp, h, i);
			deleted = i % 2 == 0 &&
			    B_DISSET(GET_BKEYDATA(dbp, h, i + O_INDX)->type);
			break;
		case P_LDUP:
		case P_LRECNO:
			sp = P_ENTRY(dbp, h, i);
			deleted = B_DISSET(GET_BKEYDATA(dbp, h, i)->type);
			break;
		default:
			goto type_err;
		}
		__db_msgadd(env, &mb, "%s", deleted ? " D" : "\t");
		__db_msgadd(
		    env, &mb, "[%03lu] %4lu ", (u_long)i, (u_long)inp[i]);
		switch (TYPE(h)) {
		case P_HASH_UNSORTED:
		case P_HASH:
			hk = sp;
			switch (HPAGE_PTYPE(hk)) {
			case H_OFFDUP:
				memcpy(&pgno,
				    HOFFDUP_PGNO(hk), sizeof(db_pgno_t));
				__db_msgadd(env, &mb,
				    "%4lu [offpage dups]", (u_long)pgno);
				DB_MSGBUF_FLUSH(env, &mb);
				break;
			case H_DUPLICATE:
				/*
				 * If this is the first item on a page, then
				 * we cannot figure out how long it is, so
				 * we only print the first one in the duplicate
				 * set.
				 */
				if (i != 0)
					len = LEN_HKEYDATA(dbp, h, 0, i);
				else
					len = 1;

				__db_msgadd(env, &mb, "Duplicates:");
				DB_MSGBUF_FLUSH(env, &mb);
				for (p = HKEYDATA_DATA(hk),
				    ep = p + len; p < ep;) {
					memcpy(&dlen, p, sizeof(db_indx_t));
					p += sizeof(db_indx_t);
					__db_msgadd(env, &mb, "\t\t");
					__db_prbytes(env, &mb, p, dlen);
					/* Skip data and the trailing length. */
					p += sizeof(db_indx_t) + dlen;
				}
				break;
			case H_KEYDATA:
				__db_prbytes(env, &mb, HKEYDATA_DATA(hk),
				    LEN_HKEYDATA(dbp, h, i == 0 ?
				    pagesize : 0, i));
				break;
			case H_OFFPAGE:
				memcpy(&a_hkd, hk, HOFFPAGE_SIZE);
				__db_msgadd(env, &mb,
				    "overflow: total len: %4lu page: %4lu",
				    (u_long)a_hkd.tlen, (u_long)a_hkd.pgno);
				DB_MSGBUF_FLUSH(env, &mb);
				break;
			default:
				DB_MSGBUF_FLUSH(env, &mb);
				__db_msg(env, "ILLEGAL HASH PAGE TYPE: %lu",
				    (u_long)HPAGE_PTYPE(hk));
				ret = EINVAL;
				break;
			}
			break;
		case P_IBTREE:
			bi = sp;

			if (F_ISSET(dbp, DB_AM_RECNUM))
				__db_msgadd(env, &mb,
				    "count: %4lu ", (u_long)bi->nrecs);
			__db_msgadd(env, &mb,
			    "pgno: %4lu type: %lu ",
			    (u_long)bi->pgno, (u_long)bi->type);
			switch (B_TYPE(bi->type)) {
			case B_KEYDATA:
				__db_prbytes(env, &mb, bi->data, bi->len);
				break;
			case B_DUPLICATE:
			case B_OVERFLOW:
				__db_proff(env, &mb, bi->data);
				break;
			default:
				DB_MSGBUF_FLUSH(env, &mb);
				__db_msg(env, "ILLEGAL BINTERNAL TYPE: %lu",
				    (u_long)B_TYPE(bi->type));
				ret = EINVAL;
				break;
			}
			break;
		case P_IRECNO:
			ri = sp;
			__db_msgadd(env, &mb, "entries %4lu pgno %4lu",
			    (u_long)ri->nrecs, (u_long)ri->pgno);
			DB_MSGBUF_FLUSH(env, &mb);
			break;
		case P_LBTREE:
		case P_LDUP:
		case P_LRECNO:
			bk = sp;
			switch (B_TYPE(bk->type)) {
			case B_KEYDATA:
				__db_prbytes(env, &mb, bk->data, bk->len);
				break;
			case B_DUPLICATE:
			case B_OVERFLOW:
				__db_proff(env, &mb, bk);
				break;
			default:
				DB_MSGBUF_FLUSH(env, &mb);
				__db_msg(env,
			    "ILLEGAL DUPLICATE/LBTREE/LRECNO TYPE: %lu",
				    (u_long)B_TYPE(bk->type));
				ret = EINVAL;
				break;
			}
			break;
		default:
type_err:		DB_MSGBUF_FLUSH(env, &mb);
			__db_msg(env,
			    "ILLEGAL PAGE TYPE: %lu", (u_long)TYPE(h));
			ret = EINVAL;
			continue;
		}
	}
	return (ret);
}

/*
 * __db_prbytes --
 *	Print out a data element.
 *
 * PUBLIC: void __db_prbytes __P((ENV *, DB_MSGBUF *, u_int8_t *, u_int32_t));
 */
void
__db_prbytes(env, mbp, bytes, len)
	ENV *env;
	DB_MSGBUF *mbp;
	u_int8_t *bytes;
	u_int32_t len;
{
	u_int8_t *p;
	u_int32_t i;
	int msg_truncated;

	__db_msgadd(env, mbp, "len: %3lu", (u_long)len);
	if (len != 0) {
		__db_msgadd(env, mbp, " data: ");

		/*
		 * Print the first 20 bytes of the data.   If that chunk is
		 * all printable characters, print it as text, else print it
		 * in hex.  We have this heuristic because we're displaying
		 * things like lock objects that could be either text or data.
		 */
		if (len > 20) {
			len = 20;
			msg_truncated = 1;
		} else
			msg_truncated = 0;
		/*
		 * NOTE(review): isprint() is passed a (possibly signed) char
		 * value; CERT STR37-C recommends casting through unsigned
		 * char.  u_int8_t is unsigned here, so this is safe as-is.
		 */
		for (p = bytes, i = len; i > 0; --i, ++p)
			if (!isprint((int)*p) && *p != '\t' && *p != '\n')
				break;
		if (i == 0)
			for (p = bytes, i = len; i > 0; --i, ++p)
				__db_msgadd(env, mbp, "%c", *p);
		else
			for (p = bytes, i = len; i > 0; --i, ++p)
				__db_msgadd(env, mbp, "%#.2x", (u_int)*p);
		if (msg_truncated)
			__db_msgadd(env, mbp, "...");
	}
	DB_MSGBUF_FLUSH(env, mbp);
}

/*
 * __db_proff --
 *	Print out an off-page element.
 */
static void
__db_proff(env, mbp, vp)
	ENV *env;
	DB_MSGBUF *mbp;
	void *vp;
{
	BOVERFLOW *bo;

	bo = vp;
	switch (B_TYPE(bo->type)) {
	case B_OVERFLOW:
		__db_msgadd(env, mbp, "overflow: total len: %4lu page: %4lu",
		    (u_long)bo->tlen, (u_long)bo->pgno);
		break;
	case B_DUPLICATE:
		__db_msgadd(
		    env, mbp, "duplicate: page: %4lu", (u_long)bo->pgno);
		break;
	default:
		/* NOTREACHED */
		break;
	}
	DB_MSGBUF_FLUSH(env, mbp);
}

/*
 * __db_prflags --
 *	Print out flags values.
 *
 * PUBLIC: void __db_prflags __P((ENV *, DB_MSGBUF *,
 * PUBLIC:     u_int32_t, const FN *, const char *, const char *));
 */
void
__db_prflags(env, mbp, flags, fn, prefix, suffix)
	ENV *env;
	DB_MSGBUF *mbp;		/* NULL for a standalone (self-flushed) line. */
	u_int32_t flags;
	FN const *fn;
	const char *prefix, *suffix;
{
	DB_MSGBUF mb;
	const FN *fnp;
	int found, standalone;
	const char *sep;

	if (fn == NULL)
		return;

	/*
	 * If it's a standalone message, output the suffix (which will be the
	 * label), regardless of whether we found anything or not, and flush
	 * the line.
	 */
	if (mbp == NULL) {
		standalone = 1;
		mbp = &mb;
		DB_MSGBUF_INIT(mbp);
	} else
		standalone = 0;

	sep = prefix == NULL ? "" : prefix;
	for (found = 0, fnp = fn; fnp->mask != 0; ++fnp)
		if (LF_ISSET(fnp->mask)) {
			__db_msgadd(env, mbp, "%s%s", sep, fnp->name);
			sep = ", ";
			found = 1;
		}

	if ((standalone || found) && suffix != NULL)
		__db_msgadd(env, mbp, "%s", suffix);
	if (standalone)
		DB_MSGBUF_FLUSH(env, mbp);
}

/*
 * __db_lockmode_to_string --
 *	Return the name of the lock mode.
 *
 * PUBLIC: const char * __db_lockmode_to_string __P((db_lockmode_t));
 */
const char *
__db_lockmode_to_string(mode)
	db_lockmode_t mode;
{
	switch (mode) {
	case DB_LOCK_NG:
		return ("Not granted");
	case DB_LOCK_READ:
		return ("Shared/read");
	case DB_LOCK_WRITE:
		return ("Exclusive/write");
	case DB_LOCK_WAIT:
		return ("Wait for event");
	case DB_LOCK_IWRITE:
		return ("Intent exclusive/write");
	case DB_LOCK_IREAD:
		return ("Intent shared/read");
	case DB_LOCK_IWR:
		return ("Intent to read/write");
	case DB_LOCK_READ_UNCOMMITTED:
		return ("Read uncommitted");
	case DB_LOCK_WWRITE:
		return ("Was written");
	default:
		break;
	}
	return ("UNKNOWN LOCK MODE");
}

/*
 * __db_pagetype_to_string --
 *	Return the name of the specified page type.
 *	Returns NULL for unknown types; callers treat that as an error.
 */
static const char *
__db_pagetype_to_string(type)
	u_int32_t type;
{
	char *s;

	s = NULL;
	switch (type) {
	case P_BTREEMETA:
		s = "btree metadata";
		break;
	case P_LDUP:
		s = "duplicate";
		break;
	case P_HASH_UNSORTED:
		s = "hash unsorted";
		break;
	case P_HASH:
		s = "hash";
		break;
	case P_HASHMETA:
		s = "hash metadata";
		break;
	case P_IBTREE:
		s = "btree internal";
		break;
	case P_INVALID:
		s = "invalid";
		break;
	case P_IRECNO:
		s = "recno internal";
		break;
	case P_LBTREE:
		s = "btree leaf";
		break;
	case P_LRECNO:
		s = "recno leaf";
		break;
	case P_OVERFLOW:
		s = "overflow";
		break;
	case P_QAMMETA:
		s = "queue metadata";
		break;
	case P_QAMDATA:
		s = "queue";
		break;
	default:
		/* Just return a NULL. */
		break;
	}
	return (s);
}

#else /* !HAVE_STATISTICS */

/*
 * __db_dumptree --
 *	Dump the tree to a file.
 *
 * PUBLIC: int __db_dumptree __P((DB *, DB_TXN *, char *, char *));
 */
int
__db_dumptree(dbp, txn, op, name)
	DB *dbp;
	DB_TXN *txn;
	char *op, *name;
{
	COMPQUIET(txn, NULL);
	COMPQUIET(op, NULL);
	COMPQUIET(name, NULL);

	return (__db_stat_not_built(dbp->env));
}

/*
 * __db_get_flags_fn --
 *	Return the __db_flags_fn array.
 *
 * PUBLIC: const FN * __db_get_flags_fn __P((void));
 */
const FN *
__db_get_flags_fn()
{
	/*
	 * !!!
	 * The Tcl API uses this interface, stub it off.
	 */
	return (NULL);
}
#endif

/*
 * __db_dump_pp --
 *	DB->dump pre/post processing.
 *
 * PUBLIC: int __db_dump_pp __P((DB *, const char *,
 * PUBLIC:     int (*)(void *, const void *), void *, int, int));
 */
int
__db_dump_pp(dbp, subname, callback, handle, pflag, keyflag)
	DB *dbp;
	const char *subname;
	int (*callback) __P((void *, const void *));
	void *handle;
	int pflag, keyflag;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret;

	env = dbp->env;

	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->dump");

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 1)) != 0) {
		handle_check = 0;
		goto err;
	}

	ret = __db_dump(dbp, subname, callback, handle, pflag, keyflag);

	/* Release replication block. */
	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

err:	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __db_dump --
 *	DB->dump.
 *
 *	Walks the database with a bulk-get cursor, emitting each key/data
 *	pair through the caller's callback in db_dump(1)/db_load(1) format.
 *
 * PUBLIC: int __db_dump __P((DB *, const char *,
 * PUBLIC:     int (*)(void *, const void *), void *, int, int));
 */
int
__db_dump(dbp, subname, callback, handle, pflag, keyflag)
	DB *dbp;
	const char *subname;
	int (*callback) __P((void *, const void *));
	void *handle;
	int pflag, keyflag;
{
	DBC *dbcp;
	DBT key, data;
	DBT keyret, dataret;
	ENV *env;
	db_recno_t recno;
	int is_recno, ret, t_ret;
	void *pointer;

	env = dbp->env;

	if ((ret = __db_prheader(
	    dbp, subname, pflag, keyflag, handle, callback, NULL, 0)) != 0)
		return (ret);

	/*
	 * Get a cursor and step through the database, printing out each
	 * key/data pair.
	 */
	if ((ret = __db_cursor(dbp, NULL, NULL, &dbcp, 0)) != 0)
		return (ret);

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	/* Start with a 1MB bulk buffer; grown below on DB_BUFFER_SMALL. */
	if ((ret = __os_malloc(env, 1024 * 1024, &data.data)) != 0)
		goto err;
	data.ulen = 1024 * 1024;
	data.flags = DB_DBT_USERMEM;
	is_recno = (dbp->type == DB_RECNO || dbp->type == DB_QUEUE);
	keyflag = is_recno ? keyflag : 1;
	if (is_recno) {
		keyret.data = &recno;
		keyret.size = sizeof(recno);
	}

retry:	while ((ret =
	    __dbc_get(dbcp, &key, &data, DB_NEXT | DB_MULTIPLE_KEY)) == 0) {
		DB_MULTIPLE_INIT(pointer, &data);
		for (;;) {
			if (is_recno)
				DB_MULTIPLE_RECNO_NEXT(pointer, &data,
				    recno, dataret.data, dataret.size);
			else
				DB_MULTIPLE_KEY_NEXT(pointer,
				    &data, keyret.data,
				    keyret.size, dataret.data, dataret.size);

			if (dataret.data == NULL)
				break;

			if ((keyflag &&
			    (ret = __db_prdbt(&keyret, pflag, " ",
			    handle, callback, is_recno)) != 0) ||
			    (ret = __db_prdbt(&dataret, pflag, " ",
			    handle, callback, 0)) != 0)
				goto err;
		}
	}
	/* Grow the bulk buffer and retry if it was too small. */
	if (ret == DB_BUFFER_SMALL) {
		data.size = (u_int32_t)DB_ALIGN(data.size, 1024);
		if ((ret = __os_realloc(env, data.size, &data.data)) != 0)
			goto err;
		data.ulen = data.size;
		goto retry;
	}
	if (ret == DB_NOTFOUND)
		ret = 0;

	if ((t_ret = __db_prfooter(handle, callback)) != 0 && ret == 0)
		ret = t_ret;

err:	if ((t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
		ret = t_ret;
	if (data.data != NULL)
		__os_free(env, data.data);

	return (ret);
}

/*
 * __db_prdbt --
 *	Print out a DBT data element.
 *
 * PUBLIC: int __db_prdbt __P((DBT *, int, const char *, void *,
 * PUBLIC:     int (*)(void *, const void *), int));
 */
int
__db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno)
	DBT *dbtp;
	int checkprint;		/* Non-zero: emit printable chars literally. */
	const char *prefix;
	void *handle;
	int (*callback) __P((void *, const void *));
	int is_recno;
{
	static const u_char hex[] = "0123456789abcdef";
	db_recno_t recno;
	size_t len;
	int ret;
#define	DBTBUFLEN	100
	u_int8_t *p, *hp;
	char buf[DBTBUFLEN], hbuf[DBTBUFLEN];

	/*
	 * !!!
	 * This routine is the routine that dumps out items in the format
	 * used by db_dump(1) and db_load(1).  This means that the format
	 * cannot change.
	 */
	if (prefix != NULL && (ret = callback(handle, prefix)) != 0)
		return (ret);
	if (is_recno) {
		/*
		 * We're printing a record number, and this has to be done
		 * in a platform-independent way.  So we use the numeral in
		 * straight ASCII.
		 */
		(void)__ua_memcpy(&recno, dbtp->data, sizeof(recno));
		snprintf(buf, DBTBUFLEN, "%lu", (u_long)recno);

		/* If we're printing data as hex, print keys as hex too. */
		if (!checkprint) {
			for (len = strlen(buf), p = (u_int8_t *)buf,
			    hp = (u_int8_t *)hbuf; len-- > 0; ++p) {
				*hp++ = hex[(u_int8_t)(*p & 0xf0) >> 4];
				*hp++ = hex[*p & 0x0f];
			}
			*hp = '\0';
			ret = callback(handle, hbuf);
		} else
			ret = callback(handle, buf);

		if (ret != 0)
			return (ret);
	} else if (checkprint) {
		/* Printable bytes pass through; others become \xx escapes. */
		for (len = dbtp->size, p = dbtp->data; len--; ++p)
			if (isprint((int)*p)) {
				if (*p == '\\' &&
				    (ret = callback(handle, "\\")) != 0)
					return (ret);
				snprintf(buf, DBTBUFLEN, "%c", *p);
				if ((ret = callback(handle, buf)) != 0)
					return (ret);
			} else {
				snprintf(buf, DBTBUFLEN, "\\%c%c",
				    hex[(u_int8_t)(*p & 0xf0) >> 4],
				    hex[*p & 0x0f]);
				if ((ret = callback(handle, buf)) != 0)
					return (ret);
			}
	} else
		for (len = dbtp->size, p = dbtp->data; len--; ++p) {
			snprintf(buf, DBTBUFLEN, "%c%c",
			    hex[(u_int8_t)(*p & 0xf0) >> 4],
			    hex[*p & 0x0f]);
			if ((ret = callback(handle, buf)) != 0)
				return (ret);
		}

	return (callback(handle, "\n"));
}

/*
 * __db_prheader --
 *	Write out header information in the format expected by db_load.
 *
 * PUBLIC: int __db_prheader __P((DB *, const char *, int, int, void *,
 * PUBLIC:     int (*)(void *, const void *), VRFY_DBINFO *, db_pgno_t));
 */
int
__db_prheader(dbp, subname, pflag, keyflag, handle, callback, vdp, meta_pgno)
	DB *dbp;
	const char *subname;
	int pflag, keyflag;
	void *handle;
	int (*callback) __P((void *, const void *));
	VRFY_DBINFO *vdp;
	db_pgno_t meta_pgno;
{
	DBT dbt;
	DBTYPE dbtype;
	ENV *env;
	VRFY_PAGEINFO *pip;
	u_int32_t flags, tmp_u_int32;
	size_t buflen;
	char *buf;
	int using_vdp, ret, t_ret, tmp_int;

	ret = 0;
	buf = NULL;
	COMPQUIET(buflen, 0);

	/*
	 * If dbp is NULL, then pip is guaranteed to be non-NULL; we only ever
	 * call __db_prheader with a NULL dbp from one case inside __db_prdbt,
	 * and this is a special subdatabase for "lost" items.  In this case
	 * we have a vdp (from which we'll get a pip).  In all other cases, we
	 * will have a non-NULL dbp (and vdp may or may not be NULL depending
	 * on whether we're salvaging).
	 */
	if (dbp == NULL)
		env = NULL;
	else
		env = dbp->env;
	DB_ASSERT(env, dbp != NULL || vdp != NULL);

	/*
	 * If we've been passed a verifier statistics object, use that; we're
	 * being called in a context where dbp->stat is unsafe.
	 *
	 * Also, the verifier may set the pflag on a per-salvage basis.  If so,
	 * respect that.
	 */
	if (vdp != NULL) {
		if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0)
			return (ret);

		if (F_ISSET(vdp, SALVAGE_PRINTABLE))
			pflag = 1;
		using_vdp = 1;
	} else {
		pip = NULL;
		using_vdp = 0;
	}

	/*
	 * If dbp is NULL, make it a btree.  Otherwise, set dbtype to whatever
	 * appropriate type for the specified meta page, or the type of the dbp.
	 */
	if (dbp == NULL)
		dbtype = DB_BTREE;
	else if (using_vdp)
		switch (pip->type) {
		case P_BTREEMETA:
			if (F_ISSET(pip, VRFY_IS_RECNO))
				dbtype = DB_RECNO;
			else
				dbtype = DB_BTREE;
			break;
		case P_HASHMETA:
			dbtype = DB_HASH;
			break;
		case P_QAMMETA:
			dbtype = DB_QUEUE;
			break;
		default:
			/*
			 * If the meta page is of a bogus type, it's because
			 * we have a badly corrupt database.  (We must be in
			 * the verifier for pip to be non-NULL.)  Pretend we're
			 * a Btree and salvage what we can.
			 */
			DB_ASSERT(env, F_ISSET(dbp, DB_AM_VERIFYING));
			dbtype = DB_BTREE;
			break;
		}
	else
		dbtype = dbp->type;

	if ((ret = callback(handle, "VERSION=3\n")) != 0)
		goto err;
	if (pflag) {
		if ((ret = callback(handle, "format=print\n")) != 0)
			goto err;
	} else if ((ret = callback(handle, "format=bytevalue\n")) != 0)
		goto err;

	/*
	 * 64 bytes is long enough, as a minimum bound, for any of the
	 * fields besides subname.  Subname uses __db_prdbt and therefore
	 * does not need buffer space here.
	 */
	buflen = 64;
	if ((ret = __os_malloc(env, buflen, &buf)) != 0)
		goto err;
	if (subname != NULL) {
		snprintf(buf, buflen, "database=");
		if ((ret = callback(handle, buf)) != 0)
			goto err;
		DB_INIT_DBT(dbt, subname, strlen(subname));
		if ((ret = __db_prdbt(&dbt, 1, NULL, handle, callback, 0)) != 0)
			goto err;
	}
	switch (dbtype) {
	case DB_BTREE:
		if ((ret = callback(handle, "type=btree\n")) != 0)
			goto err;
		if (using_vdp)
			tmp_int = F_ISSET(pip, VRFY_HAS_RECNUMS) ? 1 : 0;
		else {
			if ((ret = __db_get_flags(dbp, &flags)) != 0) {
				__db_err(env, ret, "DB->get_flags");
				goto err;
			}
			tmp_int = F_ISSET(dbp, DB_AM_RECNUM) ? 1 : 0;
		}
		if (tmp_int && (ret = callback(handle, "recnum=1\n")) != 0)
			goto err;

		if (using_vdp)
			tmp_u_int32 = pip->bt_minkey;
		else
			if ((ret =
			    __bam_get_bt_minkey(dbp, &tmp_u_int32)) != 0) {
				__db_err(env, ret, "DB->get_bt_minkey");
				goto err;
			}
		/* Only emit bt_minkey when it differs from the default. */
		if (tmp_u_int32 != 0 && tmp_u_int32 != DEFMINKEYPAGE) {
			snprintf(buf, buflen,
			    "bt_minkey=%lu\n", (u_long)tmp_u_int32);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
		}
		break;
	case DB_HASH:
#ifdef HAVE_HASH
		if ((ret = callback(handle, "type=hash\n")) != 0)
			goto err;
		if (using_vdp)
			tmp_u_int32 = pip->h_ffactor;
		else
			if ((ret =
			    __ham_get_h_ffactor(dbp, &tmp_u_int32)) != 0) {
				__db_err(env, ret, "DB->get_h_ffactor");
				goto err;
			}
		if (tmp_u_int32 != 0) {
			snprintf(buf, buflen,
			    "h_ffactor=%lu\n", (u_long)tmp_u_int32);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
		}

		if (using_vdp)
			tmp_u_int32 = pip->h_nelem;
		else
			if ((ret = __ham_get_h_nelem(dbp, &tmp_u_int32)) != 0) {
				__db_err(env, ret, "DB->get_h_nelem");
				goto err;
			}
		/*
		 * Hash databases have an h_nelem field of 0 or 1, neither
		 * of those values is interesting.
		 */
		if (tmp_u_int32 > 1) {
			snprintf(buf, buflen,
			    "h_nelem=%lu\n", (u_long)tmp_u_int32);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
		}
		break;
#else
		ret = __db_no_hash_am(env);
		goto err;
#endif
	case DB_QUEUE:
#ifdef HAVE_QUEUE
		if ((ret = callback(handle, "type=queue\n")) != 0)
			goto err;
		/*
		 * NOTE(review): the queue branch reads re_len/re_pad/page_ext
		 * from vdp, while the recno branch below reads re_len/re_pad
		 * from pip -- presumably intentional (queue stores these on
		 * the VRFY_DBINFO), but worth confirming.
		 */
		if (using_vdp)
			tmp_u_int32 = vdp->re_len;
		else
			if ((ret = __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
				__db_err(env, ret, "DB->get_re_len");
				goto err;
			}
		snprintf(buf, buflen, "re_len=%lu\n", (u_long)tmp_u_int32);
		if ((ret = callback(handle, buf)) != 0)
			goto err;

		if (using_vdp)
			tmp_int = (int)vdp->re_pad;
		else
			if ((ret = __ram_get_re_pad(dbp, &tmp_int)) != 0) {
				__db_err(env, ret, "DB->get_re_pad");
				goto err;
			}
		/* A zero or space pad byte is the default; don't emit it. */
		if (tmp_int != 0 && tmp_int != ' ') {
			snprintf(buf, buflen, "re_pad=%#x\n", tmp_int);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
		}

		if (using_vdp)
			tmp_u_int32 = vdp->page_ext;
		else
			if ((ret =
			    __qam_get_extentsize(dbp, &tmp_u_int32)) != 0) {
				__db_err(env, ret, "DB->get_q_extentsize");
				goto err;
			}
		if (tmp_u_int32 != 0) {
			snprintf(buf, buflen,
			    "extentsize=%lu\n", (u_long)tmp_u_int32);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
		}
		break;
#else
		ret = __db_no_queue_am(env);
		goto err;
#endif
	case DB_RECNO:
		if ((ret = callback(handle, "type=recno\n")) != 0)
			goto err;
		if (using_vdp)
			tmp_int = F_ISSET(pip, VRFY_IS_RRECNO) ? 1 : 0;
		else
			tmp_int = F_ISSET(dbp, DB_AM_RENUMBER) ? 1 : 0;
		if (tmp_int != 0 &&
		    (ret = callback(handle, "renumber=1\n")) != 0)
			goto err;

		if (using_vdp)
			tmp_int = F_ISSET(pip, VRFY_IS_FIXEDLEN) ? 1 : 0;
		else
			tmp_int = F_ISSET(dbp, DB_AM_FIXEDLEN) ? 1 : 0;
		/* Fixed-length records also carry re_len and re_pad. */
		if (tmp_int) {
			if (using_vdp)
				tmp_u_int32 = pip->re_len;
			else
				if ((ret =
				    __ram_get_re_len(dbp, &tmp_u_int32)) != 0) {
					__db_err(env, ret, "DB->get_re_len");
					goto err;
				}
			snprintf(buf, buflen,
			    "re_len=%lu\n", (u_long)tmp_u_int32);
			if ((ret = callback(handle, buf)) != 0)
				goto err;

			if (using_vdp)
				tmp_int = (int)pip->re_pad;
			else
				if ((ret =
				    __ram_get_re_pad(dbp, &tmp_int)) != 0) {
					__db_err(env, ret, "DB->get_re_pad");
					goto err;
				}
			if (tmp_int != 0 && tmp_int != ' ') {
				snprintf(buf,
				    buflen, "re_pad=%#x\n", (u_int)tmp_int);
				if ((ret = callback(handle, buf)) != 0)
					goto err;
			}
		}
		break;
	case DB_UNKNOWN:		/* Impossible. */
		ret = __db_unknown_path(env, "__db_prheader");
		goto err;
	}

	if (using_vdp) {
		if (F_ISSET(pip, VRFY_HAS_CHKSUM))
			if ((ret = callback(handle, "chksum=1\n")) != 0)
				goto err;
		if (F_ISSET(pip, VRFY_HAS_DUPS))
			if ((ret = callback(handle, "duplicates=1\n")) != 0)
				goto err;
		if (F_ISSET(pip, VRFY_HAS_DUPSORT))
			if ((ret = callback(handle, "dupsort=1\n")) != 0)
				goto err;
#ifdef HAVE_COMPRESSION
		if (F_ISSET(pip, VRFY_HAS_COMPRESS))
			if ((ret = callback(handle, "compressed=1\n")) != 0)
				goto err;
#endif
		/*
		 * !!!
		 * We don't know if the page size was the default if we're
		 * salvaging.  It doesn't seem that interesting to have, so
		 * we ignore it for now.
		 */
	} else {
		if (F_ISSET(dbp, DB_AM_CHKSUM))
			if ((ret = callback(handle, "chksum=1\n")) != 0)
				goto err;
		if (F_ISSET(dbp, DB_AM_DUP))
			if ((ret = callback(handle, "duplicates=1\n")) != 0)
				goto err;
		if (F_ISSET(dbp, DB_AM_DUPSORT))
			if ((ret = callback(handle, "dupsort=1\n")) != 0)
				goto err;
#ifdef HAVE_COMPRESSION
		if (DB_IS_COMPRESSED(dbp))
			if ((ret = callback(handle, "compressed=1\n")) != 0)
				goto err;
#endif
		if (!F_ISSET(dbp, DB_AM_PGDEF)) {
			snprintf(buf, buflen,
			    "db_pagesize=%lu\n", (u_long)dbp->pgsize);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
		}
	}

#ifdef HAVE_PARTITION
	/* Range-partitioned databases also dump their partition keys. */
	if (DB_IS_PARTITIONED(dbp) &&
	    F_ISSET((DB_PARTITION *)dbp->p_internal, PART_RANGE)) {
		DBT *keys;
		u_int32_t i;

		if ((ret = __partition_get_keys(dbp, &tmp_u_int32, &keys)) != 0)
			goto err;
		if (tmp_u_int32 != 0) {
			snprintf(buf,
			    buflen, "nparts=%lu\n", (u_long)tmp_u_int32);
			if ((ret = callback(handle, buf)) != 0)
				goto err;
			for (i = 0; i < tmp_u_int32 - 1; i++)
				if ((ret = __db_prdbt(&keys[i],
				    pflag, " ", handle, callback, 0)) != 0)
					goto err;
		}
	}
#endif

	if (keyflag && (ret = callback(handle, "keys=1\n")) != 0)
		goto err;

	ret = callback(handle, "HEADER=END\n");

err:	if (using_vdp &&
	    (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0)
		ret = t_ret;
	if (buf != NULL)
		__os_free(env, buf);

	return (ret);
}

/*
 * __db_prfooter --
 *	Print the footer that marks the end of a DB dump.  This is trivial,
 *	but for consistency's sake we don't want to put its literal contents
 *	in multiple places.
 *
 * PUBLIC: int __db_prfooter __P((void *, int (*)(void *, const void *)));
 */
int
__db_prfooter(handle, callback)
	void *handle;
	int (*callback) __P((void *, const void *));
{
	return (callback(handle, "DATA=END\n"));
}

/*
 * __db_pr_callback --
 *	Callback function for using pr_* functions from C.
+ * + * PUBLIC: int __db_pr_callback __P((void *, const void *)); + */ +int +__db_pr_callback(handle, str_arg) + void *handle; + const void *str_arg; +{ + char *str; + FILE *f; + + str = (char *)str_arg; + f = (FILE *)handle; + + if (fprintf(f, "%s", str) != (int)strlen(str)) + return (EIO); + + return (0); +} + +/* + * __db_dbtype_to_string -- + * Return the name of the database type. + * + * PUBLIC: const char * __db_dbtype_to_string __P((DBTYPE)); + */ +const char * +__db_dbtype_to_string(type) + DBTYPE type; +{ + switch (type) { + case DB_BTREE: + return ("btree"); + case DB_HASH: + return ("hash"); + case DB_RECNO: + return ("recno"); + case DB_QUEUE: + return ("queue"); + case DB_UNKNOWN: + default: + break; + } + return ("UNKNOWN TYPE"); +} diff --git a/db/db_rec.c b/db/db_rec.c new file mode 100644 index 0000000..02fe096 --- /dev/null +++ b/db/db_rec.c @@ -0,0 +1,1859 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2010 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/hash.h" + +static int __db_pg_free_recover_int __P((ENV *, DB_THREAD_INFO *, + __db_pg_freedata_args *, DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int)); +static int __db_pg_free_recover_42_int __P((ENV *, DB_THREAD_INFO *, + __db_pg_freedata_42_args *, + DB *, DB_LSN *, DB_MPOOLFILE *, db_recops, int)); + +/* + * PUBLIC: int __db_addrem_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + * + * This log message is generated whenever we add or remove a duplicate + * to/from a duplicate page. On recover, we just do the opposite. 
 */
int
__db_addrem_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__db_addrem_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	int cmp_n, cmp_p, modified, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	pagep = NULL;
	REC_PRINT(__db_addrem_print);
	REC_INTRO(__db_addrem_read, ip, 1);

	REC_FGET(mpf, ip, argp->pgno, &pagep, done);
	modified = 0;

	/*
	 * cmp_n compares the log record's LSN to the page (abort/undo
	 * check); cmp_p compares the page LSN to the pre-operation LSN
	 * (redo check).
	 */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_DUP) ||
	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_DUP)) {
		/* Need to redo an add, or undo a delete. */
		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
		if ((ret = __db_pitem(dbc, pagep, argp->indx, argp->nbytes,
		    argp->hdr.size == 0 ? NULL : &argp->hdr,
		    argp->dbt.size == 0 ? NULL : &argp->dbt)) != 0)
			goto out;
		modified = 1;

	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_DUP) ||
	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_DUP)) {
		/* Need to undo an add, or redo a delete. */
		REC_DIRTY(mpf, ip, dbc->priority, &pagep);
		if ((ret = __db_ditem(dbc,
		    pagep, argp->indx, argp->nbytes)) != 0)
			goto out;
		modified = 1;
	}

	/* Roll the page LSN forward (redo) or back (undo). */
	if (modified) {
		if (DB_REDO(op))
			LSN(pagep) = *lsnp;
		else
			LSN(pagep) = argp->pagelsn;
	}

	if ((ret = __memp_fput(mpf, ip, pagep, dbc->priority)) != 0)
		goto out;
	pagep = NULL;

done:	*lsnp = argp->prev_lsn;
	ret = 0;

out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, dbc->priority);
	REC_CLOSE;
}

/*
 * PUBLIC: int __db_big_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__db_big_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__db_big_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	int cmp_n, cmp_p, modified, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	pagep = NULL;
	REC_PRINT(__db_big_print);
	REC_INTRO(__db_big_read, ip, 0);

	REC_FGET(mpf, ip, argp->pgno, &pagep, ppage);
	modified = 0;

	/*
	 * There are three pages we need to check.  The one on which we are
	 * adding data, the previous one whose next_pointer may have
	 * been updated, and the next one whose prev_pointer may have
	 * been updated.
	 */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->pagelsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->pagelsn);
	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
	if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) ||
	    (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_REM_BIG)) {
		/* We are either redo-ing an add, or undoing a delete. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		P_INIT(pagep, file_dbp->pgsize, argp->pgno, argp->prev_pgno,
		    argp->next_pgno, 0, P_OVERFLOW);
		OV_LEN(pagep) = argp->dbt.size;
		OV_REF(pagep) = 1;
		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp), argp->dbt.data,
		    argp->dbt.size);
		PREV_PGNO(pagep) = argp->prev_pgno;
		modified = 1;
	} else if ((cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_ADD_BIG) ||
	    (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_REM_BIG)) {
		/*
		 * We are either undo-ing an add or redo-ing a delete.
		 * The page is about to be reclaimed in either case, so
		 * there really isn't anything to do here.
		 */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		modified = 1;
	} else if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_APPEND_BIG) {
		/* We are redoing an append. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		memcpy((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
		    OV_LEN(pagep), argp->dbt.data, argp->dbt.size);
		OV_LEN(pagep) += argp->dbt.size;
		modified = 1;
	} else if (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DB_APPEND_BIG) {
		/* We are undoing an append: shrink and zero the tail. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		OV_LEN(pagep) -= argp->dbt.size;
		memset((u_int8_t *)pagep + P_OVERHEAD(file_dbp) +
		    OV_LEN(pagep), 0, argp->dbt.size);
		modified = 1;
	}
	if (modified)
		LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn;

	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
	pagep = NULL;
	if (ret != 0)
		goto out;

	/*
	 * We only delete a whole chain of overflow items, and appends only
	 * apply to a single page.  Adding a page is the only case that
	 * needs to update the chain.
	 */
	if (argp->opcode != DB_ADD_BIG)
		goto done;

	/* Now check the previous page. */
ppage:	if (argp->prev_pgno != PGNO_INVALID) {
		REC_FGET(mpf, ip, argp->prev_pgno, &pagep, npage);
		modified = 0;

		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);

		if (cmp_p == 0 && DB_REDO(op) && argp->opcode == DB_ADD_BIG) {
			/* Redo add, undo delete. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			NEXT_PGNO(pagep) = argp->pgno;
			modified = 1;
		} else if (cmp_n == 0 &&
		    DB_UNDO(op) && argp->opcode == DB_ADD_BIG) {
			/* Redo delete, undo add. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			NEXT_PGNO(pagep) = argp->next_pgno;
			modified = 1;
		}
		if (modified)
			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn;
		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
		pagep = NULL;
		if (ret != 0)
			goto out;
	}
	pagep = NULL;

	/* Now check the next page.  Can only be set on a delete. */
npage:	if (argp->next_pgno != PGNO_INVALID) {
		REC_FGET(mpf, ip, argp->next_pgno, &pagep, done);
		modified = 0;

		cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
		cmp_p = LOG_COMPARE(&LSN(pagep), &argp->nextlsn);
		CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->nextlsn);
		CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
		if (cmp_p == 0 && DB_REDO(op)) {
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			PREV_PGNO(pagep) = PGNO_INVALID;
			modified = 1;
		} else if (cmp_n == 0 && DB_UNDO(op)) {
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			PREV_PGNO(pagep) = argp->pgno;
			modified = 1;
		}
		if (modified)
			LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn;
		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
		pagep = NULL;
		if (ret != 0)
			goto out;
	}
	pagep = NULL;

done:	*lsnp = argp->prev_lsn;
	ret = 0;

out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	REC_CLOSE;
}

/*
 * __db_ovref_recover --
 *	Recovery function for __db_ovref().
+ * + * PUBLIC: int __db_ovref_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_ovref_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_ovref_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__db_ovref_print); + REC_INTRO(__db_ovref_read, ip, 0); + + REC_FGET(mpf, ip, argp->pgno, &pagep, done); + + cmp = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp, &LSN(pagep), &argp->lsn); + if (cmp == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + OV_REF(pagep) += argp->adjust; + pagep->lsn = *lsnp; + } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + OV_REF(pagep) -= argp->adjust; + pagep->lsn = argp->lsn; + } + ret = __memp_fput(mpf, ip, pagep, file_dbp->priority); + pagep = NULL; + if (ret != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + REC_CLOSE; +} + +/* + * __db_debug_recover -- + * Recovery function for debug. + * + * PUBLIC: int __db_debug_recover __P((ENV *, + * PUBLIC: DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_debug_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_debug_args *argp; + int ret; + + COMPQUIET(op, DB_TXN_ABORT); + COMPQUIET(info, NULL); + + REC_PRINT(__db_debug_print); + REC_NOOP_INTRO(__db_debug_read); + + *lsnp = argp->prev_lsn; + ret = 0; + + REC_NOOP_CLOSE; +} + +/* + * __db_noop_recover -- + * Recovery function for noop. 
 *
 * PUBLIC: int __db_noop_recover __P((ENV *,
 * PUBLIC:     DBT *, DB_LSN *, db_recops, void *));
 */
int
__db_noop_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__db_noop_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	int cmp_n, cmp_p, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	pagep = NULL;
	REC_PRINT(__db_noop_print);
	REC_INTRO(__db_noop_read, ip, 0);

	REC_FGET(mpf, ip, argp->pgno, &pagep, done);

	/* A noop record only moves the page LSN forward or back. */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->prevlsn);
	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->prevlsn);
	CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp);
	if (cmp_p == 0 && DB_REDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		LSN(pagep) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		LSN(pagep) = argp->prevlsn;
	}
	ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
	pagep = NULL;

done:	*lsnp = argp->prev_lsn;
out:	if (pagep != NULL)
		(void)__memp_fput(mpf,
		    ip, pagep, file_dbp->priority);
	REC_CLOSE;
}

/*
 * __db_pg_alloc_recover --
 *	Recovery function for pg_alloc.
 *
 * PUBLIC: int __db_pg_alloc_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__db_pg_alloc_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__db_pg_alloc_args *argp;
	DB_THREAD_INFO *ip;
	DB *file_dbp;
	DBC *dbc;
	DBMETA *meta;
	DB_MPOOLFILE *mpf;
	PAGE *pagep;
	db_pgno_t pgno;
	int cmp_n, cmp_p, created, level, ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	meta = NULL;
	pagep = NULL;
	created = 0;
	REC_PRINT(__db_pg_alloc_print);
	REC_INTRO(__db_pg_alloc_read, ip, 0);

	/*
	 * Fix up the metadata page.  If we're redoing the operation, we have
	 * to get the metadata page and update its LSN and its free pointer.
	 * If we're undoing the operation and the page was ever created, we put
	 * it on the freelist.
	 */
	pgno = PGNO_BASE_MD;
	if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) {
		/* The metadata page must always exist on redo. */
		if (DB_REDO(op)) {
			ret = __db_pgerr(file_dbp, pgno, ret);
			goto out;
		} else
			goto done;
	}
	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
	CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);
	if (cmp_p == 0 && DB_REDO(op)) {
		/* Need to redo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
		LSN(meta) = *lsnp;
		meta->free = argp->next;
		if (argp->pgno > meta->last_pgno)
			meta->last_pgno = argp->pgno;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to undo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
		LSN(meta) = argp->meta_lsn;
		/*
		 * If the page has a zero LSN then its newly created and
		 * will be truncated rather than go on the free list.
		 */
		if (!IS_ZERO_LSN(argp->page_lsn))
			meta->free = argp->pgno;
		meta->last_pgno = argp->last_pgno;
	}

#ifdef HAVE_FTRUNCATE
	/*
	 * check to see if we are keeping a sorted freelist, if so put
	 * this back in the in memory list.  It must be the first element.
	 */
	if (op == DB_TXN_ABORT && !IS_ZERO_LSN(argp->page_lsn)) {
		db_pgno_t *list;
		u_int32_t nelem;

		if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
			goto out;
		if (list != NULL && (nelem == 0 || *list != argp->pgno)) {
			if ((ret =
			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
				goto out;
			if (nelem != 0)
				memmove(list + 1, list, nelem * sizeof(*list));
			*list = argp->pgno;
		}
	}
#endif

	/*
	 * Fix up the allocated page.  If the page does not exist
	 * and we can truncate it then don't create it.
	 * Otherwise if we're redoing the operation, we have
	 * to get the page (creating it if it doesn't exist), and update its
	 * LSN.  If we're undoing the operation, we have to reset the page's
	 * LSN and put it on the free list.
	 */
	if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) {
		/*
		 * We have to be able to identify if a page was newly
		 * created so we can recover it properly.  We cannot simply
		 * look for an empty header, because hash uses a pgin
		 * function that will set the header.  Instead, we explicitly
		 * try for the page without CREATE and if that fails, then
		 * create it.
		 */
		if (DB_UNDO(op))
			goto do_truncate;
		if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL,
		    DB_MPOOL_CREATE, &pagep)) != 0) {
			if (DB_UNDO(op) && ret == ENOSPC)
				goto do_truncate;
			ret = __db_pgerr(file_dbp, argp->pgno, ret);
			goto out;
		}
		created = 1;
	}

	/* Fix up the allocated page. */
	cmp_n = LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn);

	/*
	 * If an initial allocation is aborted and then reallocated during
	 * an archival restore the log record will have an LSN for the page
	 * but the page will be empty.
	 */
	if (IS_ZERO_LSN(LSN(pagep)))
		cmp_p = 0;

	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn);
	/*
	 * Another special case we have to handle is if we ended up with a
	 * page of all 0's which can happen if we abort between allocating a
	 * page in mpool and initializing it.  In that case, even if we're
	 * undoing, we need to re-initialize the page.
	 */
	if (DB_REDO(op) && cmp_p == 0) {
		/* Need to redo update described. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		switch (argp->ptype) {
		case P_LBTREE:
		case P_LRECNO:
		case P_LDUP:
			level = LEAFLEVEL;
			break;
		default:
			level = 0;
			break;
		}
		P_INIT(pagep, file_dbp->pgsize,
		    argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype);

		pagep->lsn = *lsnp;
	} else if (DB_UNDO(op) && (cmp_n == 0 || created)) {
		/*
		 * This is where we handle the case of a 0'd page (pagep->pgno
		 * is equal to PGNO_INVALID).
		 * Undo the allocation, reinitialize the page and
		 * link its next pointer to the free list.
		 */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		P_INIT(pagep, file_dbp->pgsize,
		    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);

		pagep->lsn = argp->page_lsn;
	}

do_truncate:
	/*
	 * If the page was newly created, give it back.
	 */
	if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) &&
	    IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) {
		/* Discard the page. */
		if (pagep != NULL) {
			if ((ret = __memp_fput(mpf, ip,
			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
				goto out;
			pagep = NULL;
		}
		/* Give the page back to the OS. */
		if (meta->last_pgno <= argp->pgno && (ret = __memp_ftruncate(
		    mpf, NULL, ip, argp->pgno, MP_TRUNC_RECOVER)) != 0)
			goto out;
	}

	if (pagep != NULL) {
		ret = __memp_fput(mpf, ip, pagep, file_dbp->priority);
		pagep = NULL;
		if (ret != 0)
			goto out;
	}

	ret = __memp_fput(mpf, ip, meta, file_dbp->priority);
	meta = NULL;
	if (ret != 0)
		goto out;

done:	*lsnp = argp->prev_lsn;
	ret = 0;

out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	if (meta != NULL)
		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);
	REC_CLOSE;
}

/*
 * __db_pg_free_recover_int --
 *	Shared body for pg_free/pg_freedata recovery; "data" selects whether
 *	the log record also carries the page's data area to restore on undo.
 */
static int
__db_pg_free_recover_int(env, ip, argp, file_dbp, lsnp, mpf, op, data)
	ENV *env;
	DB_THREAD_INFO *ip;
	__db_pg_freedata_args *argp;
	DB *file_dbp;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	db_recops op;
	int data;
{
	DBMETA *meta;
	DB_LSN copy_lsn;
	PAGE *pagep, *prevp;
	int cmp_n, cmp_p, is_meta, ret;

	meta = NULL;
	pagep = prevp = NULL;

	/*
	 * Get the "metapage".  This will either be the metapage
	 * or the previous page in the free list if we are doing
	 * sorted allocations.  If its a previous page then
	 * we will not be truncating.
	 */
	is_meta = argp->meta_pgno == PGNO_BASE_MD;

	REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta);

	if (argp->meta_pgno != PGNO_BASE_MD)
		prevp = (PAGE *)meta;

	cmp_n = LOG_COMPARE(lsnp, &LSN(meta));
	cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn);
	CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn);
	CHECK_ABORT(env, op, cmp_n, &LSN(meta), lsnp);

	/*
	 * Fix up the metadata page.  If we're redoing or undoing the operation
	 * we get the page and update its LSN, last and free pointer.
	 */
	if (cmp_p == 0 && DB_REDO(op)) {
		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
		/*
		 * If we are at the end of the file truncate, otherwise
		 * put on the free list.
		 */
		if (argp->pgno == argp->last_pgno)
			meta->last_pgno = argp->pgno - 1;
		else if (is_meta)
			meta->free = argp->pgno;
		else
			NEXT_PGNO(prevp) = argp->pgno;
		LSN(meta) = *lsnp;
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to undo the deallocation. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &meta);
		if (is_meta) {
			if (meta->last_pgno < argp->pgno)
				meta->last_pgno = argp->pgno;
			meta->free = argp->next;
		} else
			NEXT_PGNO(prevp) = argp->next;
		LSN(meta) = argp->meta_lsn;
	}

check_meta:
	if (ret != 0 && is_meta) {
		/* The metadata page must always exist. */
		ret = __db_pgerr(file_dbp, argp->meta_pgno, ret);
		goto out;
	}

	/*
	 * Get the freed page.  Don't create the page if we are going to
	 * free it.  If we're redoing the operation we get the page and
	 * explicitly discard its contents, then update its LSN.  If we're
	 * undoing the operation, we get the page and restore its header.
	 */
	if (DB_REDO(op) || (is_meta && meta->last_pgno < argp->pgno)) {
		if ((ret = __memp_fget(mpf, &argp->pgno,
		    ip, NULL, 0, &pagep)) != 0) {
			if (ret != DB_PAGE_NOTFOUND)
				goto out;
			if (is_meta &&
			    DB_REDO(op) && meta->last_pgno <= argp->pgno)
				goto trunc;
			goto done;
		}
	} else if ((ret = __memp_fget(mpf, &argp->pgno,
	    ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0)
		goto out;

	/* The logged page header carries the page's pre-free LSN. */
	(void)__ua_memcpy(&copy_lsn, &LSN(argp->header.data), sizeof(DB_LSN));
	cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep));
	cmp_p = LOG_COMPARE(&LSN(pagep), &copy_lsn);

	/*
	 * This page got extended by a later allocation,
	 * but its allocation was not in the scope of this
	 * recovery pass.
	 */
	if (IS_ZERO_LSN(LSN(pagep)))
		cmp_p = 0;

	CHECK_LSN(env, op, cmp_p, &LSN(pagep), &copy_lsn);
	if (DB_REDO(op) &&
	    (cmp_p == 0 ||
	    (IS_ZERO_LSN(copy_lsn) &&
	    LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) {
		/* Need to redo the deallocation. */
		/*
		 * The page can be truncated if it was truncated at runtime
		 * and the current metapage reflects the truncation.
		 */
		if (is_meta && meta->last_pgno <= argp->pgno &&
		    argp->last_pgno <= argp->pgno) {
			if ((ret = __memp_fput(mpf, ip,
			    pagep, DB_PRIORITY_VERY_LOW)) != 0)
				goto out;
			pagep = NULL;
trunc:			if ((ret = __memp_ftruncate(mpf, NULL, ip,
			    argp->pgno, MP_TRUNC_RECOVER)) != 0)
				goto out;
		} else if (argp->last_pgno == argp->pgno) {
			/* The page was truncated at runtime, zero it out. */
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			P_INIT(pagep, 0, PGNO_INVALID,
			    PGNO_INVALID, PGNO_INVALID, 0, P_INVALID);
			ZERO_LSN(pagep->lsn);
		} else {
			REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
			P_INIT(pagep, file_dbp->pgsize,
			    argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID);
			pagep->lsn = *lsnp;

		}
	} else if (cmp_n == 0 && DB_UNDO(op)) {
		/* Need to reallocate the page. */
		REC_DIRTY(mpf, ip, file_dbp->priority, &pagep);
		memcpy(pagep, argp->header.data, argp->header.size);
		if (data)
			memcpy((u_int8_t*)pagep + HOFFSET(pagep),
			    argp->data.data, argp->data.size);
	}
	if (pagep != NULL &&
	    (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0)
		goto out;

	pagep = NULL;
#ifdef HAVE_FTRUNCATE
	/*
	 * If we are keeping an in memory free list remove this
	 * element from the list.
	 */
	if (op == DB_TXN_ABORT && argp->pgno != argp->last_pgno) {
		db_pgno_t *lp;
		u_int32_t nelem, pos;

		if ((ret = __memp_get_freelist(mpf, &nelem, &lp)) != 0)
			goto out;
		if (lp != NULL) {
			pos = 0;
			if (!is_meta) {
				__db_freelist_pos(argp->pgno, lp, nelem, &pos);

				/*
				 * If we aborted after logging but before
				 * updating the free list don't do anything.
				 */
				if (argp->pgno != lp[pos]) {
					DB_ASSERT(env,
					    argp->meta_pgno == lp[pos]);
					goto done;
				}
				DB_ASSERT(env,
				    argp->meta_pgno == lp[pos - 1]);
			} else if (nelem != 0 && argp->pgno != lp[pos])
				goto done;

			if (pos < nelem)
				memmove(&lp[pos], &lp[pos + 1],
				    ((nelem - pos) - 1) * sizeof(*lp));

			/* Shrink the list */
			if ((ret =
			    __memp_extend_freelist(mpf, nelem - 1, &lp)) != 0)
				goto out;
		}
	}
#endif
done:
	if (meta != NULL &&
	    (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0)
		goto out;
	meta = NULL;
	ret = 0;

out:	if (pagep != NULL)
		(void)__memp_fput(mpf, ip, pagep, file_dbp->priority);
	if (meta != NULL)
		(void)__memp_fput(mpf, ip, meta, file_dbp->priority);

	return (ret);
}

/*
 * __db_pg_free_recover --
 *	Recovery function for pg_free.
 *
 * PUBLIC: int __db_pg_free_recover
 * PUBLIC:    __P((ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__db_pg_free_recover(env, dbtp, lsnp, op, info)
	ENV *env;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	__db_pg_free_args *argp;
	DB *file_dbp;
	DBC *dbc;
	DB_MPOOLFILE *mpf;
	DB_THREAD_INFO *ip;
	int ret;

	ip = ((DB_TXNHEAD *)info)->thread_info;
	REC_PRINT(__db_pg_free_print);
	REC_INTRO(__db_pg_free_read, ip, 0);

	/* pg_free args are a prefix of pg_freedata args; data == 0. */
	ret = __db_pg_free_recover_int(env, ip,
	    (__db_pg_freedata_args *)argp, file_dbp, lsnp, mpf, op, 0);

done:	*lsnp = argp->prev_lsn;
out:
	REC_CLOSE;
}

/*
 * __db_pg_freedata_recover --
 *	Recovery function for pg_freedata.
+ * + * PUBLIC: int __db_pg_freedata_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_freedata_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_freedata_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + int ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__db_pg_freedata_print); + REC_INTRO(__db_pg_freedata_read, ip, 0); + + ret = __db_pg_free_recover_int(env, + ip, argp, file_dbp, lsnp, mpf, op, 1); + +done: *lsnp = argp->prev_lsn; +out: + REC_CLOSE; +} + +/* + * __db_cksum_recover -- + * Recovery function for checksum failure log record. + * + * PUBLIC: int __db_cksum_recover __P((ENV *, + * PUBLIC: DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_cksum_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_cksum_args *argp; + int ret; + + COMPQUIET(info, NULL); + COMPQUIET(lsnp, NULL); + COMPQUIET(op, DB_TXN_ABORT); + + REC_PRINT(__db_cksum_print); + + if ((ret = __db_cksum_read(env, dbtp->data, &argp)) != 0) + return (ret); + + /* + * We had a checksum failure -- the only option is to run catastrophic + * recovery. + */ + if (F_ISSET(env, ENV_RECOVER_FATAL)) + ret = 0; + else { + __db_errx(env, + "Checksum failure requires catastrophic recovery"); + ret = __env_panic(env, DB_RUNRECOVERY); + } + + __os_free(env, argp); + return (ret); +} + +/* + * __db_pg_init_recover -- + * Recovery function to reinit pages after truncation. 
+ * + * PUBLIC: int __db_pg_init_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_init_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_init_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_LSN copy_lsn; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, ret, type; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__db_pg_init_print); + REC_INTRO(__db_pg_init_read, ip, 0); + + mpf = file_dbp->mpf; + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + if (ret == DB_PAGE_NOTFOUND) + goto done; + else { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } + } + + /* + * This page was truncated and may simply not have + * had an item written to it yet. This should only + * happen on hash databases, so confirm that. + */ + DB_ASSERT(env, file_dbp->type == DB_HASH); + if ((ret = __memp_fget(mpf, &argp->pgno, + ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } + } + + (void)__ua_memcpy(©_lsn, &LSN(argp->header.data), sizeof(DB_LSN)); + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), ©_lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), ©_lsn); + CHECK_ABORT(env, op, cmp_n, &LSN(pagep), lsnp); + + if (cmp_p == 0 && DB_REDO(op)) { + if (TYPE(pagep) == P_HASH) + type = P_HASH; + else + type = file_dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE; + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + P_INIT(pagep, file_dbp->pgsize, PGNO(pagep), PGNO_INVALID, + PGNO_INVALID, TYPE(pagep) == P_HASH ? 0 : 1, type); + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Put the data back on the page. 
*/ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + memcpy(pagep, argp->header.data, argp->header.size); + if (argp->data.size > 0) + memcpy((u_int8_t*)pagep + HOFFSET(pagep), + argp->data.data, argp->data.size); + } + if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; +out: + REC_CLOSE; +} + +/* + * __db_pg_trunc_recover -- + * Recovery function for pg_trunc. + * + * PUBLIC: int __db_pg_trunc_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_trunc_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ +#ifdef HAVE_FTRUNCATE + __db_pg_trunc_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DBMETA *meta; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pglist_t *pglist, *lp; + db_pgno_t last_pgno, *list; + u_int32_t felem, nelem, pos; + int ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__db_pg_trunc_print); + REC_INTRO(__db_pg_trunc_read, ip, 1); + + pglist = (db_pglist_t *) argp->list.data; + nelem = argp->list.size / sizeof(db_pglist_t); + if (DB_REDO(op)) { + /* + * First call __db_pg_truncate to find the truncation + * point, truncate the file and return the new last_pgno. + */ + last_pgno = argp->last_pgno; + if ((ret = __db_pg_truncate(dbc, NULL, pglist, + NULL, &nelem, argp->next_free, &last_pgno, lsnp, 1)) != 0) + goto out; + + if (argp->last_free != PGNO_INVALID) { + /* + * Update the next pointer of the last page in + * the freelist. If the truncation point is + * beyond next_free then this is still in the freelist + * otherwise the last_free page is at the end. 
+ */ + if ((ret = __memp_fget(mpf, + &argp->last_free, ip, NULL, 0, &meta)) == 0) { + if (LOG_COMPARE(&LSN(meta), + &argp->last_lsn) == 0) { + REC_DIRTY(mpf, + ip, dbc->priority, &meta); + if (pglist->pgno > last_pgno) + NEXT_PGNO(meta) = PGNO_INVALID; + else + NEXT_PGNO(meta) = pglist->pgno; + LSN(meta) = *lsnp; + } + if ((ret = __memp_fput(mpf, ip, + meta, file_dbp->priority)) != 0) + goto out; + meta = NULL; + } else if (ret != DB_PAGE_NOTFOUND) + goto out; + } + if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL, + 0, &meta)) != 0) + goto out; + if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) { + REC_DIRTY(mpf, ip, dbc->priority, &meta); + if (argp->last_free == PGNO_INVALID) { + if (nelem == 0) + meta->free = PGNO_INVALID; + else + meta->free = pglist->pgno; + } + meta->last_pgno = last_pgno; + LSN(meta) = *lsnp; + } + } else { + /* Put the free list back in its original order. */ + for (lp = pglist; lp < &pglist[nelem]; lp++) { + if ((ret = __memp_fget(mpf, &lp->pgno, ip, + NULL, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + if (IS_ZERO_LSN(LSN(pagep)) || + LOG_COMPARE(&LSN(pagep), lsnp) == 0) { + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + P_INIT(pagep, file_dbp->pgsize, lp->pgno, + PGNO_INVALID, lp->next_pgno, 0, P_INVALID); + LSN(pagep) = lp->lsn; + } + if ((ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0) + goto out; + } + /* + * Link the truncated part back into the free list. + * Its either after the last_free page or direclty + * linked to the metadata page. 
+ */ + if (argp->last_free != PGNO_INVALID) { + if ((ret = __memp_fget(mpf, &argp->last_free, + ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) { + if (LOG_COMPARE(&LSN(meta), lsnp) == 0) { + NEXT_PGNO(meta) = argp->next_free; + LSN(meta) = argp->last_lsn; + } + if ((ret = __memp_fput(mpf, ip, + meta, file_dbp->priority)) != 0) + goto out; + } else if (ret != DB_PAGE_NOTFOUND) + goto out; + meta = NULL; + } + if ((ret = __memp_fget(mpf, &argp->meta, + ip, NULL, DB_MPOOL_EDIT, &meta)) != 0) + goto out; + if (LOG_COMPARE(&LSN(meta), lsnp) == 0) { + REC_DIRTY(mpf, ip, dbc->priority, &meta); + /* + * If we had to break up the list last_pgno + * may only represent the end of the block. + */ + if (meta->last_pgno < argp->last_pgno) + meta->last_pgno = argp->last_pgno; + if (argp->last_free == PGNO_INVALID) + meta->free = argp->next_free; + LSN(meta) = argp->meta_lsn; + } + } + + if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0) + goto out; + + if (op == DB_TXN_ABORT) { + /* + * Put the pages back on the in memory free list. + * If this is part of a multi-record truncate then + * we need to find this batch, it may not be at the end. + * If we aborted while writing one of the log records + * then this set may still be in the list. 
+ */ + if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0) + goto out; + if (list != NULL) { + if (felem != 0 && list[felem - 1] > pglist->pgno) { + __db_freelist_pos( + pglist->pgno, list, felem, &pos); + DB_ASSERT(env, pos < felem); + if (pglist->pgno == list[pos]) + goto done; + pos++; + } else if (felem != 0 && + list[felem - 1] == pglist->pgno) + goto done; + else + pos = felem; + if ((ret = __memp_extend_freelist( + mpf, felem + nelem, &list)) != 0) + goto out; + if (pos != felem) + memmove(&list[nelem + pos], &list[pos], + sizeof(*list) * (felem - pos)); + for (lp = pglist; lp < &pglist[nelem]; lp++) + list[pos++] = lp->pgno; + } + } + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +#else + /* + * If HAVE_FTRUNCATE is not defined, we'll never see pg_trunc records + * to recover. + */ + COMPQUIET(env, NULL); + COMPQUIET(dbtp, NULL); + COMPQUIET(lsnp, NULL); + COMPQUIET(op, DB_TXN_ABORT); + COMPQUIET(info, NULL); + return (EINVAL); +#endif +} +/* + * __db_pg_sort_44_recover -- + * Recovery function for pg_sort. + * This is deprecated and kept for replication upgrades. 
+ * + * PUBLIC: int __db_pg_sort_44_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_sort_44_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ +#ifdef HAVE_FTRUNCATE + __db_pg_sort_44_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DBMETA *meta; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pglist_t *pglist, *lp; + db_pgno_t pgno, *list; + u_int32_t felem, nelem; + int ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__db_pg_sort_44_print); + REC_INTRO(__db_pg_sort_44_read, ip, 1); + + pglist = (db_pglist_t *) argp->list.data; + nelem = argp->list.size / sizeof(db_pglist_t); + if (DB_REDO(op)) { + pgno = argp->last_pgno; + __db_freelist_sort(pglist, nelem); + if ((ret = __db_pg_truncate(dbc, NULL, + pglist, NULL, &nelem, PGNO_INVALID, &pgno, lsnp, 1)) != 0) + goto out; + + if (argp->last_free != PGNO_INVALID) { + if ((ret = __memp_fget(mpf, + &argp->last_free, ip, NULL, 0, &meta)) == 0) { + if (LOG_COMPARE(&LSN(meta), + &argp->last_lsn) == 0) { + REC_DIRTY(mpf, + ip, dbc->priority, &meta); + NEXT_PGNO(meta) = PGNO_INVALID; + LSN(meta) = *lsnp; + } + if ((ret = __memp_fput(mpf, ip, + meta, file_dbp->priority)) != 0) + goto out; + meta = NULL; + } else if (ret != DB_PAGE_NOTFOUND) + goto out; + } + if ((ret = __memp_fget(mpf, &argp->meta, ip, NULL, + 0, &meta)) != 0) + goto out; + if (LOG_COMPARE(&LSN(meta), &argp->meta_lsn) == 0) { + REC_DIRTY(mpf, ip, dbc->priority, &meta); + if (argp->last_free == PGNO_INVALID) { + if (nelem == 0) + meta->free = PGNO_INVALID; + else + meta->free = pglist->pgno; + } + meta->last_pgno = pgno; + LSN(meta) = *lsnp; + } + } else { + /* Put the free list back in its original order. 
*/ + for (lp = pglist; lp < &pglist[nelem]; lp++) { + if ((ret = __memp_fget(mpf, &lp->pgno, ip, + NULL, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + if (IS_ZERO_LSN(LSN(pagep)) || + LOG_COMPARE(&LSN(pagep), lsnp) == 0) { + REC_DIRTY(mpf, ip, dbc->priority, &pagep); + if (lp == &pglist[nelem - 1]) + pgno = PGNO_INVALID; + else + pgno = lp[1].pgno; + + P_INIT(pagep, file_dbp->pgsize, + lp->pgno, PGNO_INVALID, pgno, 0, P_INVALID); + LSN(pagep) = lp->lsn; + } + if ((ret = __memp_fput(mpf, + ip, pagep, file_dbp->priority)) != 0) + goto out; + } + if (argp->last_free != PGNO_INVALID) { + if ((ret = __memp_fget(mpf, &argp->last_free, + ip, NULL, DB_MPOOL_EDIT, &meta)) == 0) { + if (LOG_COMPARE(&LSN(meta), lsnp) == 0) { + NEXT_PGNO(meta) = pglist->pgno; + LSN(meta) = argp->last_lsn; + } + if ((ret = __memp_fput(mpf, ip, + meta, file_dbp->priority)) != 0) + goto out; + } else if (ret != DB_PAGE_NOTFOUND) + goto out; + meta = NULL; + } + if ((ret = __memp_fget(mpf, &argp->meta, + ip, NULL, DB_MPOOL_EDIT, &meta)) != 0) + goto out; + if (LOG_COMPARE(&LSN(meta), lsnp) == 0) { + REC_DIRTY(mpf, ip, dbc->priority, &meta); + meta->last_pgno = argp->last_pgno; + if (argp->last_free == PGNO_INVALID) + meta->free = pglist->pgno; + LSN(meta) = argp->meta_lsn; + } + } + if (op == DB_TXN_ABORT) { + if ((ret = __memp_get_freelist(mpf, &felem, &list)) != 0) + goto out; + if (list != NULL) { + DB_ASSERT(env, felem == 0 || + argp->last_free == list[felem - 1]); + if ((ret = __memp_extend_freelist( + mpf, felem + nelem, &list)) != 0) + goto out; + for (lp = pglist; lp < &pglist[nelem]; lp++) + list[felem++] = lp->pgno; + } + } + + if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +#else + /* + * If HAVE_FTRUNCATE is not defined, we'll never see pg_sort records + * to recover. 
+ */ + COMPQUIET(env, NULL); + COMPQUIET(dbtp, NULL); + COMPQUIET(lsnp, NULL); + COMPQUIET(op, DB_TXN_ABORT); + COMPQUIET(info, NULL); + return (EINVAL); +#endif +} + +/* + * __db_pg_alloc_42_recover -- + * Recovery function for pg_alloc. + * + * PUBLIC: int __db_pg_alloc_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_alloc_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_alloc_42_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DBMETA *meta; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int cmp_n, cmp_p, created, level, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + meta = NULL; + pagep = NULL; + created = 0; + REC_PRINT(__db_pg_alloc_42_print); + REC_INTRO(__db_pg_alloc_42_read, ip, 0); + + /* + * Fix up the metadata page. If we're redoing the operation, we have + * to get the metadata page and update its LSN and its free pointer. + * If we're undoing the operation and the page was ever created, we put + * it on the freelist. + */ + pgno = PGNO_BASE_MD; + if ((ret = __memp_fget(mpf, &pgno, ip, NULL, 0, &meta)) != 0) { + /* The metadata page must always exist on redo. */ + if (DB_REDO(op)) { + ret = __db_pgerr(file_dbp, pgno, ret); + goto out; + } else + goto done; + } + cmp_n = LOG_COMPARE(lsnp, &LSN(meta)); + cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn); + CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &meta); + LSN(meta) = *lsnp; + meta->free = argp->next; + if (argp->pgno > meta->last_pgno) + meta->last_pgno = argp->pgno; + } else if (cmp_n == 0 && DB_UNDO(op)) { + goto no_rollback; + } + + /* + * Fix up the allocated page. If the page does not exist + * and we can truncate it then don't create it. 
+ * Otherwise if we're redoing the operation, we have + * to get the page (creating it if it doesn't exist), and update its + * LSN. If we're undoing the operation, we have to reset the page's + * LSN and put it on the free list, or truncate it. + */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + /* + * We have to be able to identify if a page was newly + * created so we can recover it properly. We cannot simply + * look for an empty header, because hash uses a pgin + * function that will set the header. Instead, we explicitly + * try for the page without CREATE and if that fails, then + * create it. + */ + if ((ret = __memp_fget(mpf, &argp->pgno, + ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) { + if (DB_UNDO(op) && ret == ENOSPC) + goto do_truncate; + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } + created = 1; + } + + /* Fix up the allocated page. */ + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->page_lsn); + + /* + * If an initial allocation is aborted and then reallocated during + * an archival restore the log record will have an LSN for the page + * but the page will be empty. + */ + if (IS_ZERO_LSN(LSN(pagep)) || + (IS_ZERO_LSN(argp->page_lsn) && IS_INIT_LSN(LSN(pagep)))) + cmp_p = 0; + + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->page_lsn); + /* + * Another special case we have to handle is if we ended up with a + * page of all 0's which can happen if we abort between allocating a + * page in mpool and initializing it. In that case, even if we're + * undoing, we need to re-initialize the page. + */ + if (DB_REDO(op) && cmp_p == 0) { + /* Need to redo update described. 
*/ + switch (argp->ptype) { + case P_LBTREE: + case P_LRECNO: + case P_LDUP: + level = LEAFLEVEL; + break; + default: + level = 0; + break; + } + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, PGNO_INVALID, level, argp->ptype); + + pagep->lsn = *lsnp; + } else if (DB_UNDO(op) && (cmp_n == 0 || created)) { + /* + * This is where we handle the case of a 0'd page (pagep->pgno + * is equal to PGNO_INVALID). + * Undo the allocation, reinitialize the page and + * link its next pointer to the free list. + */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); + + pagep->lsn = argp->page_lsn; + } + +do_truncate: + /* + * We cannot undo things from 4.2 land, because we nolonger + * have limbo processing. + */ + if ((pagep == NULL || IS_ZERO_LSN(LSN(pagep))) && + IS_ZERO_LSN(argp->page_lsn) && DB_UNDO(op)) { +no_rollback: __db_errx(env, +"Cannot replicate prepared transactions from master running release 4.2 "); + ret = __env_panic(env, EINVAL); + } + + if (pagep != NULL && + (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + + if ((ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0) + goto out; + meta = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + if (meta != NULL) + (void)__memp_fput(mpf, ip, meta, file_dbp->priority); + REC_CLOSE; +} + +/* + * __db_pg_free_recover_42_int -- + */ +static int +__db_pg_free_recover_42_int(env, ip, argp, file_dbp, lsnp, mpf, op, data) + ENV *env; + DB_THREAD_INFO *ip; + __db_pg_freedata_42_args *argp; + DB *file_dbp; + DB_LSN *lsnp; + DB_MPOOLFILE *mpf; + db_recops op; + int data; +{ + DBMETA *meta; + DB_LSN copy_lsn; + PAGE *pagep, *prevp; + int cmp_n, cmp_p, is_meta, ret; + + meta = NULL; + pagep = NULL; + prevp = NULL; + + /* + * Get the "metapage". 
This will either be the metapage + * or the previous page in the free list if we are doing + * sorted allocations. If its a previous page then + * we will not be truncating. + */ + is_meta = argp->meta_pgno == PGNO_BASE_MD; + + REC_FGET(mpf, ip, argp->meta_pgno, &meta, check_meta); + + if (argp->meta_pgno != PGNO_BASE_MD) + prevp = (PAGE *)meta; + + cmp_n = LOG_COMPARE(lsnp, &LSN(meta)); + cmp_p = LOG_COMPARE(&LSN(meta), &argp->meta_lsn); + CHECK_LSN(env, op, cmp_p, &LSN(meta), &argp->meta_lsn); + + /* + * Fix up the metadata page. If we're redoing or undoing the operation + * we get the page and update its LSN, last and free pointer. + */ + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo the deallocation. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &meta); + if (prevp == NULL) + meta->free = argp->pgno; + else + NEXT_PGNO(prevp) = argp->pgno; + /* + * If this was a compensating transaction and + * we are a replica, then we never executed the + * original allocation which incremented meta->free. + */ + if (prevp == NULL && meta->last_pgno < meta->free) + meta->last_pgno = meta->free; + LSN(meta) = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo the deallocation. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &meta); + if (prevp == NULL) + meta->free = argp->next; + else + NEXT_PGNO(prevp) = argp->next; + LSN(meta) = argp->meta_lsn; + if (prevp == NULL && meta->last_pgno < argp->pgno) + meta->last_pgno = argp->pgno; + } + +check_meta: + if (ret != 0 && is_meta) { + /* The metadata page must always exist. */ + ret = __db_pgerr(file_dbp, argp->meta_pgno, ret); + goto out; + } + + /* + * Get the freed page. If we support truncate then don't + * create the page if we are going to free it. If we're + * redoing the operation we get the page and explicitly discard + * its contents, then update its LSN. If we're undoing the + * operation, we get the page and restore its header. 
+ * If we don't support truncate, then we must create the page + * and roll it back. + */ + if ((ret = __memp_fget(mpf, &argp->pgno, + ip, NULL, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + (void)__ua_memcpy(©_lsn, &LSN(argp->header.data), sizeof(DB_LSN)); + cmp_n = IS_ZERO_LSN(LSN(pagep)) ? 0 : LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), ©_lsn); + + CHECK_LSN(env, op, cmp_p, &LSN(pagep), ©_lsn); + if (DB_REDO(op) && + (cmp_p == 0 || + (IS_ZERO_LSN(copy_lsn) && + LOG_COMPARE(&LSN(pagep), &argp->meta_lsn) <= 0))) { + /* Need to redo the deallocation. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + P_INIT(pagep, file_dbp->pgsize, + argp->pgno, PGNO_INVALID, argp->next, 0, P_INVALID); + pagep->lsn = *lsnp; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to reallocate the page. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + memcpy(pagep, argp->header.data, argp->header.size); + if (data) + memcpy((u_int8_t*)pagep + HOFFSET(pagep), + argp->data.data, argp->data.size); + } + if (pagep != NULL && + (ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + + pagep = NULL; + if (meta != NULL && + (ret = __memp_fput(mpf, ip, meta, file_dbp->priority)) != 0) + goto out; + meta = NULL; + + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + if (meta != NULL) + (void)__memp_fput(mpf, ip, meta, file_dbp->priority); + + return (ret); +} + +/* + * __db_pg_free_42_recover -- + * Recovery function for pg_free. 
+ * + * PUBLIC: int __db_pg_free_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_free_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_free_42_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + int ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__db_pg_free_42_print); + REC_INTRO(__db_pg_free_42_read, ip, 0); + + ret = __db_pg_free_recover_42_int(env, ip, + (__db_pg_freedata_42_args *)argp, file_dbp, lsnp, mpf, op, 0); + +done: *lsnp = argp->prev_lsn; +out: + REC_CLOSE; +} + +/* + * __db_pg_freedata_42_recover -- + * Recovery function for pg_freedata. + * + * PUBLIC: int __db_pg_freedata_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_pg_freedata_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_pg_freedata_42_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; + int ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + REC_PRINT(__db_pg_freedata_42_print); + REC_INTRO(__db_pg_freedata_42_read, ip, 0); + + ret = __db_pg_free_recover_42_int( + env, ip, argp, file_dbp, lsnp, mpf, op, 1); + +done: *lsnp = argp->prev_lsn; +out: + REC_CLOSE; +} + +/* + * __db_relink_42_recover -- + * Recovery function for relink. 
+ * + * PUBLIC: int __db_relink_42_recover + * PUBLIC: __P((ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__db_relink_42_recover(env, dbtp, lsnp, op, info) + ENV *env; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __db_relink_42_args *argp; + DB_THREAD_INFO *ip; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, modified, ret; + + ip = ((DB_TXNHEAD *)info)->thread_info; + pagep = NULL; + REC_PRINT(__db_relink_42_print); + REC_INTRO(__db_relink_42_read, ip, 0); + + /* + * There are up to three pages we need to check -- the page, and the + * previous and next pages, if they existed. For a page add operation, + * the current page is the result of a split and is being recovered + * elsewhere, so all we need do is recover the next page. + */ + if ((ret = __memp_fget(mpf, &argp->pgno, ip, NULL, 0, &pagep)) != 0) { + if (DB_REDO(op)) { + ret = __db_pgerr(file_dbp, argp->pgno, ret); + goto out; + } + goto next2; + } + if (argp->opcode == DB_ADD_PAGE_COMPAT) + goto next1; + + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->lsn = *lsnp; + } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) { + /* Undo the relink. 
*/ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->next; + pagep->prev_pgno = argp->prev; + pagep->lsn = argp->lsn; + } +next1: if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +next2: if ((ret = __memp_fget(mpf, &argp->next, ip, NULL, 0, &pagep)) != 0) { + if (DB_REDO(op)) { + ret = __db_pgerr(file_dbp, argp->next, ret); + goto out; + } + goto prev; + } + modified = 0; + cmp_n = LOG_COMPARE(lsnp, &LSN(pagep)); + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_next); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_next); + if ((argp->opcode == DB_REM_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op)) || + (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_n == 0 && DB_UNDO(op))) { + /* Redo the remove or undo the add. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->prev_pgno = argp->prev; + modified = 1; + } else if ((argp->opcode == DB_REM_PAGE_COMPAT && + cmp_n == 0 && DB_UNDO(op)) || + (argp->opcode == DB_ADD_PAGE_COMPAT && cmp_p == 0 && DB_REDO(op))) { + /* Undo the remove or redo the add. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->prev_pgno = argp->pgno; + modified = 1; + } + if (modified) { + if (DB_UNDO(op)) + pagep->lsn = argp->lsn_next; + else + pagep->lsn = *lsnp; + } + if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + if (argp->opcode == DB_ADD_PAGE_COMPAT) + goto done; + +prev: if ((ret = __memp_fget(mpf, &argp->prev, ip, NULL, 0, &pagep)) != 0) { + if (DB_REDO(op)) { + ret = __db_pgerr(file_dbp, argp->prev, ret); + goto out; + } + goto done; + } + modified = 0; + cmp_p = LOG_COMPARE(&LSN(pagep), &argp->lsn_prev); + CHECK_LSN(env, op, cmp_p, &LSN(pagep), &argp->lsn_prev); + if (cmp_p == 0 && DB_REDO(op)) { + /* Redo the relink. */ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->next; + modified = 1; + } else if (LOG_COMPARE(lsnp, &LSN(pagep)) == 0 && DB_UNDO(op)) { + /* Undo the relink. 
*/ + REC_DIRTY(mpf, ip, file_dbp->priority, &pagep); + pagep->next_pgno = argp->pgno; + modified = 1; + } + if (modified) { + if (DB_UNDO(op)) + pagep->lsn = argp->lsn_prev; + else + pagep->lsn = *lsnp; + } + if ((ret = __memp_fput(mpf, ip, pagep, file_dbp->priority)) != 0) + goto out; + pagep = NULL; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (pagep != NULL) + (void)__memp_fput(mpf, ip, pagep, file_dbp->priority); + REC_CLOSE; +} diff --git a/db/db_reclaim.c b/db/db_reclaim.c new file mode 100644 index 0000000..a44d054 --- /dev/null +++ b/db/db_reclaim.c @@ -0,0 +1,246 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/mp.h" + +/* + * __db_traverse_big + * Traverse a chain of overflow pages and call the callback routine + * on each one. The calling convention for the callback is: + * callback(dbc, page, cookie, did_put), + * where did_put is a return value indicating if the page in question has + * already been returned to the mpool. + * + * PUBLIC: int __db_traverse_big __P((DBC *, db_pgno_t, + * PUBLIC: int (*)(DBC *, PAGE *, void *, int *), void *)); + */ +int +__db_traverse_big(dbc, pgno, callback, cookie) + DBC *dbc; + db_pgno_t pgno; + int (*callback) __P((DBC *, PAGE *, void *, int *)); + void *cookie; +{ + DB_MPOOLFILE *mpf; + PAGE *p; + int did_put, ret; + + mpf = dbc->dbp->mpf; + + do { + did_put = 0; + if ((ret = __memp_fget(mpf, + &pgno, dbc->thread_info, dbc->txn, 0, &p)) != 0) + return (ret); + /* + * If we are freeing pages only process the overflow + * chain if the head of the chain has a refcount of 1. 
+ */ + pgno = NEXT_PGNO(p); + if (callback == __db_truncate_callback && OV_REF(p) != 1) + pgno = PGNO_INVALID; + if ((ret = callback(dbc, p, cookie, &did_put)) == 0 && + !did_put) + ret = __memp_fput(mpf, + dbc->thread_info, p, dbc->priority); + } while (ret == 0 && pgno != PGNO_INVALID); + + return (ret); +} + +/* + * __db_reclaim_callback + * This is the callback routine used during a delete of a subdatabase. + * we are traversing a btree or hash table and trying to free all the + * pages. Since they share common code for duplicates and overflow + * items, we traverse them identically and use this routine to do the + * actual free. The reason that this is callback is because hash uses + * the same traversal code for statistics gathering. + * + * PUBLIC: int __db_reclaim_callback __P((DBC *, PAGE *, void *, int *)); + */ +int +__db_reclaim_callback(dbc, p, cookie, putp) + DBC *dbc; + PAGE *p; + void *cookie; + int *putp; +{ + DB *dbp; + int ret; + + COMPQUIET(cookie, NULL); + dbp = dbc->dbp; + + /* + * We don't want to log the free of the root with the subdb. + * If we abort then the subdb may not be openable to undo + * the free. + */ + if ((dbp->type == DB_BTREE || dbp->type == DB_RECNO) && + PGNO(p) == ((BTREE *)dbp->bt_internal)->bt_root) + return (0); + if ((ret = __db_free(dbc, p)) != 0) + return (ret); + *putp = 1; + + return (0); +} + +/* + * __db_truncate_callback + * This is the callback routine used during a truncate. + * we are traversing a btree or hash table and trying to free all the + * pages. 
/*
 * __db_truncate_callback
 *	This is the callback routine used during a truncate.
 *	We are traversing a btree or hash table and trying to free all the
 *	pages, counting the records discarded as we go.
 *
 *	cookie points at a u_int32_t running record count; putp is set to 1
 *	when this routine has already disposed of the page (freed it), and
 *	to 0 when the caller must keep traversing from a page we re-used.
 *
 * PUBLIC: int __db_truncate_callback __P((DBC *, PAGE *, void *, int *));
 */
int
__db_truncate_callback(dbc, p, cookie, putp)
	DBC *dbc;
	PAGE *p;
	void *cookie;
	int *putp;
{
	DB *dbp;
	DBT ddbt, ldbt;
	DB_MPOOLFILE *mpf;
	db_indx_t indx, len, off, tlen, top;
	u_int8_t *hk, type;
	u_int32_t *countp;
	int ret;

	top = NUM_ENT(p);
	dbp = dbc->dbp;
	mpf = dbp->mpf;
	/* The cookie is the running count of records removed. */
	countp = cookie;
	*putp = 1;

	switch (TYPE(p)) {
	case P_LBTREE:
		/* Skip for off-page duplicates and deleted items. */
		for (indx = 0; indx < top; indx += P_INDX) {
			type = GET_BKEYDATA(dbp, p, indx + O_INDX)->type;
			if (!B_DISSET(type) && B_TYPE(type) != B_DUPLICATE)
				++*countp;
		}
		/* FALLTHROUGH */
	case P_IBTREE:
	case P_IRECNO:
	case P_INVALID:
		/* The root page is re-initialized, not freed. */
		if (dbp->type != DB_HASH &&
		    ((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
			type = dbp->type == DB_RECNO ? P_LRECNO : P_LBTREE;
			goto reinit;
		}
		break;
	case P_OVERFLOW:
		/*
		 * Drop one reference on the overflow page; only free it
		 * (leave *putp set) when the last reference goes away.
		 */
		if ((ret = __memp_dirty(mpf,
		    &p, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			return (ret);
		if (DBC_LOGGING(dbc)) {
			if ((ret = __db_ovref_log(dbp, dbc->txn,
			    &LSN(p), 0, p->pgno, -1, &LSN(p))) != 0)
				return (ret);
		} else
			LSN_NOT_LOGGED(LSN(p));
		if (--OV_REF(p) != 0)
			*putp = 0;
		break;
	case P_LRECNO:
		for (indx = 0; indx < top; indx += O_INDX) {
			type = GET_BKEYDATA(dbp, p, indx)->type;
			if (!B_DISSET(type))
				++*countp;
		}

		if (((BTREE *)dbp->bt_internal)->bt_root == PGNO(p)) {
			type = P_LRECNO;
			goto reinit;
		}
		break;
	case P_LDUP:
		/* Correct for deleted items. */
		for (indx = 0; indx < top; indx += O_INDX)
			if (!B_DISSET(GET_BKEYDATA(dbp, p, indx)->type))
				++*countp;

		break;
	case P_HASH:
		/* Correct for on-page duplicates and deleted items. */
		for (indx = 0; indx < top; indx += P_INDX) {
			switch (*H_PAIRDATA(dbp, p, indx)) {
			case H_OFFDUP:
				break;
			case H_OFFPAGE:
			case H_KEYDATA:
				++*countp;
				break;
			case H_DUPLICATE:
				/*
				 * Walk the on-page duplicate set: each
				 * element is length-prefixed and -suffixed,
				 * hence the 2 * sizeof(db_indx_t) stride.
				 */
				tlen = LEN_HDATA(dbp, p, 0, indx);
				hk = H_PAIRDATA(dbp, p, indx);
				for (off = 0; off < tlen;
				    off += len + 2 * sizeof(db_indx_t)) {
					++*countp;
					memcpy(&len,
					    HKEYDATA_DATA(hk) +
					    off, sizeof(db_indx_t));
				}
				break;
			default:
				return (__db_pgfmt(dbp->env, p->pgno));
			}
		}
		/* Don't free the head of the bucket. */
		if (PREV_PGNO(p) == PGNO_INVALID) {
			type = P_HASH;

			/*
			 * reinit: shared by btree/recno root pages and hash
			 * bucket heads -- log the old image and re-create
			 * the page empty in place instead of freeing it.
			 */
reinit:			if ((ret = __memp_dirty(mpf, &p,
			    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
				return (ret);
			*putp = 0;
			if (DBC_LOGGING(dbc)) {
				memset(&ldbt, 0, sizeof(ldbt));
				memset(&ddbt, 0, sizeof(ddbt));
				ldbt.data = p;
				ldbt.size = P_OVERHEAD(dbp);
				ldbt.size += p->entries * sizeof(db_indx_t);
				ddbt.data = (u_int8_t *)p + HOFFSET(p);
				ddbt.size = dbp->pgsize - HOFFSET(p);
				if ((ret = __db_pg_init_log(dbp,
				    dbc->txn, &LSN(p), 0,
				    p->pgno, &ldbt, &ddbt)) != 0)
					return (ret);
			} else
				LSN_NOT_LOGGED(LSN(p));

			P_INIT(p, dbp->pgsize, PGNO(p), PGNO_INVALID,
			    PGNO_INVALID, type == P_HASH ? 0 : 1, type);
		}
		break;
	default:
		return (__db_pgfmt(dbp->env, p->pgno));
	}

	if (*putp == 1) {
		if ((ret = __db_free(dbc, p)) != 0)
			return (ret);
	} else {
		if ((ret = __memp_fput(mpf, dbc->thread_info, p,
		    dbc->priority)) != 0)
			return (ret);
		*putp = 1;
	}

	return (0);
}
/*
 * __env_dbremove_pp
 *	ENV->dbremove pre/post processing.
 *
 *	Validates arguments, handles replication entry/exit and an
 *	auto-commit transaction, then removes the database via a
 *	locally-created DB handle.
 *
 * PUBLIC: int __env_dbremove_pp __P((DB_ENV *,
 * PUBLIC:     DB_TXN *, const char *, const char *, u_int32_t));
 */
int
__env_dbremove_pp(dbenv, txn, name, subdb, flags)
	DB_ENV *dbenv;
	DB_TXN *txn;
	const char *name, *subdb;
	u_int32_t flags;
{
	DB *dbp;
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret, txn_local;

	dbp = NULL;
	env = dbenv->env;
	txn_local = 0;

	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbremove");

	/*
	 * The actual argument checking is simple, do it inline, outside of
	 * the replication block.
	 */
	if ((ret = __db_fchk(env,
	    "DB->remove", flags, DB_AUTO_COMMIT | DB_TXN_NOT_DURABLE)) != 0)
		return (ret);

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
		handle_check = 0;
		goto err;
	}

	/*
	 * Create local transaction as necessary, check for consistent
	 * transaction usage.
	 */
	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
			goto err;
		txn_local = 1;
	} else
		if (txn != NULL && !TXN_ON(env) &&
		    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
			ret = __db_not_txn_env(env);
			goto err;
		}
	LF_CLR(DB_AUTO_COMMIT);

	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
		goto err;
	if (LF_ISSET(DB_TXN_NOT_DURABLE) &&
	    (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
		goto err;
	LF_CLR(DB_TXN_NOT_DURABLE);

	ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);

	if (txn_local) {
		/*
		 * We created the DBP here and when we commit/abort, we'll
		 * release all the transactional locks, including the handle
		 * lock; mark the handle cleared explicitly.
		 */
		LOCK_INIT(dbp->handle_lock);
		dbp->locker = NULL;
	} else if (txn != NULL) {
		/*
		 * We created this handle locally so we need to close it
		 * and clean it up.  Unfortunately, it's holding transactional
		 * locks that need to persist until the end of transaction.
		 * If we invalidate the locker id (dbp->locker), then the close
		 * won't free these locks prematurely.
		 */
		dbp->locker = NULL;
	}

err:	if (txn_local && (t_ret =
	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * We never opened this dbp for real, so don't include a transaction
	 * handle, and use NOSYNC to avoid calling into mpool.
	 *
	 * !!!
	 * Note we're reversing the order of operations: we started the txn and
	 * then opened the DB handle; we're resolving the txn and then closing
	 * the DB handle -- a DB handle cannot be closed before resolving the
	 * txn.
	 */
	if (dbp != NULL &&
	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}
/*
 * __db_remove_pp
 *	DB->remove pre/post processing.
 *
 * PUBLIC: int __db_remove_pp
 * PUBLIC:     __P((DB *, const char *, const char *, u_int32_t));
 */
int
__db_remove_pp(dbp, name, subdb, flags)
	DB *dbp;
	const char *name, *subdb;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret;

	env = dbp->env;

	/*
	 * Validate arguments, continuing to destroy the handle on failure.
	 *
	 * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
	 *
	 * !!!
	 * We have a serious problem if we're here with a handle used to open
	 * a database -- we'll destroy the handle, and the application won't
	 * ever be able to close the database.
	 */
	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
		return (__db_mi_open(env, "DB->remove", 1));

	/* Validate arguments. */
	if ((ret = __db_fchk(env, "DB->remove", flags, 0)) != 0)
		return (ret);

	/* Check for consistent transaction usage. */
	if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
		return (ret);

	ENV_ENTER(env, ip);

	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
		handle_check = 0;
		goto err;
	}

	/* Remove the file. */
	ret = __db_remove(dbp, ip, NULL, name, subdb, flags);

	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

err:	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __db_remove
 *	DB->remove method.
 *
 *	Thin wrapper: does the remove and then closes the (never-opened)
 *	handle, since DB->remove always consumes the DB handle.
 *
 * PUBLIC: int __db_remove __P((DB *, DB_THREAD_INFO *,
 * PUBLIC:     DB_TXN *, const char *, const char *, u_int32_t));
 */
int
__db_remove(dbp, ip, txn, name, subdb, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb;
	u_int32_t flags;
{
	int ret, t_ret;

	ret = __db_remove_int(dbp, ip, txn, name, subdb, flags);

	if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_remove_int
 *	Worker function for the DB->remove method.
 *
 *	Dispatches to the subdatabase remove or the transactional remove
 *	when applicable; otherwise performs a plain, non-transactional
 *	file (or named in-memory database) remove.
 *
 * PUBLIC: int __db_remove_int __P((DB *, DB_THREAD_INFO *,
 * PUBLIC:     DB_TXN *, const char *, const char *, u_int32_t));
 */
int
__db_remove_int(dbp, ip, txn, name, subdb, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb;
	u_int32_t flags;
{
	ENV *env;
	int ret;
	char *real_name, *tmpname;

	env = dbp->env;
	real_name = tmpname = NULL;

	if (name == NULL && subdb == NULL) {
		__db_errx(env, "Remove on temporary files invalid");
		ret = EINVAL;
		goto err;
	}

	if (name == NULL) {
		/* No file name: a named in-memory database. */
		MAKE_INMEM(dbp);
		real_name = (char *)subdb;
	} else if (subdb != NULL) {
		ret = __db_subdb_remove(dbp, ip, txn, name, subdb);
		goto err;
	}

	/* Handle transactional file removes separately. */
	if (IS_REAL_TXN(txn)) {
		ret = __db_dbtxn_remove(dbp, ip, txn, name, subdb);
		goto err;
	}

	/*
	 * The remaining case is a non-transactional file remove.
	 *
	 * Find the real name of the file.
	 */
	if (!F_ISSET(dbp, DB_AM_INMEM) && (ret = __db_appname(env,
	    DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0)
		goto err;

	/*
	 * If this is a file and force is set, remove the temporary file, which
	 * may have been left around.  Ignore errors because the temporary file
	 * might not exist.
	 */
	if (!F_ISSET(dbp, DB_AM_INMEM) && LF_ISSET(DB_FORCE) &&
	    (ret = __db_backup_name(env, real_name, NULL, &tmpname)) == 0)
		(void)__os_unlink(env, tmpname, 0);

	if ((ret = __fop_remove_setup(dbp, NULL, real_name, 0)) != 0)
		goto err;

	/* Let the access method do its own cleanup first (if any). */
	if (dbp->db_am_remove != NULL &&
	    (ret = dbp->db_am_remove(dbp, ip, NULL, name, subdb, flags)) != 0)
		goto err;

	ret = F_ISSET(dbp, DB_AM_INMEM) ?
	    __db_inmem_remove(dbp, NULL, real_name) :
	    __fop_remove(env,
	    NULL, dbp->fileid, name, &dbp->dirname, DB_APP_DATA,
	    F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);

	/* For in-memory databases real_name aliases subdb; don't free it. */
err:	if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
		__os_free(env, real_name);
	if (tmpname != NULL)
		__os_free(env, tmpname);

	return (ret);
}
/*
 * __db_inmem_remove --
 *	Removal of a named in-memory database.
 *
 *	Connects the handle to the in-memory file in the cache, write-locks
 *	the handle, and either removes the name immediately (no transaction)
 *	or logs the remove and schedules it as a commit-time event.
 *
 * PUBLIC: int __db_inmem_remove __P((DB *, DB_TXN *, const char *));
 */
int
__db_inmem_remove(dbp, txn, name)
	DB *dbp;
	DB_TXN *txn;
	const char *name;
{
	DBT fid_dbt, name_dbt;
	DB_LOCKER *locker;
	DB_LSN lsn;
	ENV *env;
	int ret;

	env = dbp->env;
	locker = NULL;

	DB_ASSERT(env, name != NULL);

	/* This had better exist if we are trying to do a remove. */
	(void)__memp_set_flags(dbp->mpf, DB_MPOOL_NOFILE, 1);
	if ((ret = __memp_fopen(dbp->mpf, NULL,
	    name, &dbp->dirname, 0, 0, 0)) != 0)
		return (ret);
	if ((ret = __memp_get_fileid(dbp->mpf, dbp->fileid)) != 0)
		return (ret);
	dbp->preserve_fid = 1;

	if (LOCKING_ON(env)) {
		if (dbp->locker == NULL &&
		    (ret = __lock_id(env, NULL, &dbp->locker)) != 0)
			return (ret);
		/* Lock in the transaction's name when one is supplied. */
		locker = txn == NULL ? dbp->locker : txn->locker;
	}

	/*
	 * In a transactional environment, we'll play the same game we play
	 * for databases in the file system -- create a temporary database
	 * and put it in with the current name and then rename this one to
	 * another name.  We'll then use a commit-time event to remove the
	 * entry.
	 */
	if ((ret =
	    __fop_lock_handle(env, dbp, locker, DB_LOCK_WRITE, NULL, 0)) != 0)
		return (ret);

	if (!IS_REAL_TXN(txn))
		ret = __memp_nameop(env, dbp->fileid, NULL, name, NULL, 1);
	else if (LOGGING_ON(env)) {
		/* Register the commit-time remove event, then log it. */
		if (txn != NULL && (ret =
		    __txn_remevent(env, txn, name, dbp->fileid, 1)) != 0)
			return (ret);

		DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
		DB_INIT_DBT(fid_dbt, dbp->fileid, DB_FILE_ID_LEN);
		ret = __crdel_inmem_remove_log(
		    env, txn, &lsn, 0, &name_dbt, &fid_dbt);
	}

	return (ret);
}
/*
 * __db_subdb_remove --
 *	Remove a subdatabase.
 *
 *	Opens the subdatabase, frees all of its pages via the access
 *	method's reclaim routine, then deletes its entry (and meta page)
 *	from the master database.
 */
static int
__db_subdb_remove(dbp, ip, txn, name, subdb)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb;
{
	DB *mdbp, *sdbp;
	int ret, t_ret;

	mdbp = sdbp = NULL;

	/* Open the subdatabase. */
	if ((ret = __db_create_internal(&sdbp, dbp->env, 0)) != 0)
		goto err;
	if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
	    (ret = __db_set_flags(sdbp, DB_TXN_NOT_DURABLE)) != 0)
		goto err;
	if ((ret = __db_open(sdbp, ip,
	    txn, name, subdb, DB_UNKNOWN, DB_WRITEOPEN, 0, PGNO_BASE_MD)) != 0)
		goto err;

	DB_TEST_RECOVERY(sdbp, DB_TEST_PREDESTROY, ret, name);

	/* Free up the pages in the subdatabase. */
	switch (sdbp->type) {
	case DB_BTREE:
	case DB_RECNO:
		if ((ret = __bam_reclaim(sdbp, ip, txn)) != 0)
			goto err;
		break;
	case DB_HASH:
		if ((ret = __ham_reclaim(sdbp, ip, txn)) != 0)
			goto err;
		break;
	case DB_QUEUE:
	case DB_UNKNOWN:
	default:
		/* Queue does not support subdatabases. */
		ret = __db_unknown_type(
		    sdbp->env, "__db_subdb_remove", sdbp->type);
		goto err;
	}

	/*
	 * Remove the entry from the main database and free the subdatabase
	 * metadata page.
	 */
	if ((ret = __db_master_open(sdbp, ip, txn, name, 0, 0, &mdbp)) != 0)
		goto err;

	if ((ret = __db_master_update(mdbp,
	    sdbp, ip, txn, subdb, sdbp->type, MU_REMOVE, NULL, 0)) != 0)
		goto err;

	DB_TEST_RECOVERY(sdbp, DB_TEST_POSTDESTROY, ret, name);

DB_TEST_RECOVERY_LABEL
err:
	/* Close the main and subdatabases. */
	if ((t_ret = __db_close(sdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	if (mdbp != NULL &&
	    (t_ret = __db_close(mdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_dbtxn_remove --
 *	Transactional remove of a file or named in-memory database: the
 *	file is renamed to a backup name now and the actual delete is
 *	deferred to transaction commit.
 */
static int
__db_dbtxn_remove(dbp, ip, txn, name, subdb)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb;
{
	ENV *env;
	int ret;
	char *tmpname;

	env = dbp->env;
	tmpname = NULL;

	/*
	 * This is a transactional remove, so we have to keep the name
	 * of the file locked until the transaction commits.  As a result,
	 * we implement remove by renaming the file to some other name
	 * (which creates a dummy named file as a placeholder for the
	 * file being renamed/removed) and then deleting that file as
	 * a delayed remove at commit.
	 */
	if ((ret = __db_backup_name(env,
	    F_ISSET(dbp, DB_AM_INMEM) ? subdb : name, txn, &tmpname)) != 0)
		return (ret);

	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);

	if ((ret = __db_rename_int(dbp,
	    txn->thread_info, txn, name, subdb, tmpname)) != 0)
		goto err;

	/*
	 * The internal removes will also translate into delayed removes.
	 */
	if (dbp->db_am_remove != NULL &&
	    (ret = dbp->db_am_remove(dbp, ip, txn, tmpname, NULL, 0)) != 0)
		goto err;

	ret = F_ISSET(dbp, DB_AM_INMEM) ?
	    __db_inmem_remove(dbp, txn, tmpname) :
	    __fop_remove(env,
	    txn, dbp->fileid, tmpname, &dbp->dirname, DB_APP_DATA,
	    F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0);

	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);

err:
DB_TEST_RECOVERY_LABEL
	if (tmpname != NULL)
		__os_free(env, tmpname);

	return (ret);
}
/*
 * __env_dbrename_pp
 *	ENV->dbrename pre/post processing.
 *
 *	Validates arguments, handles replication entry/exit and an
 *	auto-commit transaction, then renames the database via a
 *	locally-created DB handle.
 *
 * PUBLIC: int __env_dbrename_pp __P((DB_ENV *, DB_TXN *,
 * PUBLIC:     const char *, const char *, const char *, u_int32_t));
 */
int
__env_dbrename_pp(dbenv, txn, name, subdb, newname, flags)
	DB_ENV *dbenv;
	DB_TXN *txn;
	const char *name, *subdb, *newname;
	u_int32_t flags;
{
	DB *dbp;
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret, txn_local;

	env = dbenv->env;
	dbp = NULL;
	txn_local = 0;

	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->dbrename");

	/*
	 * The actual argument checking is simple, do it inline, outside of
	 * the replication block.
	 */
	if ((ret = __db_fchk(env, "DB->rename", flags, DB_AUTO_COMMIT)) != 0)
		return (ret);

	ENV_ENTER(env, ip);

	/* Check for replication block. */
	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (ret = __env_rep_enter(env, 1)) != 0) {
		handle_check = 0;
		goto err;
	}

	/*
	 * Create local transaction as necessary, check for consistent
	 * transaction usage.
	 */
	if (IS_ENV_AUTO_COMMIT(env, txn, flags)) {
		if ((ret = __db_txn_auto_init(env, ip, &txn)) != 0)
			goto err;
		txn_local = 1;
	} else
		if (txn != NULL && !TXN_ON(env) &&
		    (!CDB_LOCKING(env) || !F_ISSET(txn, TXN_CDSGROUP))) {
			ret = __db_not_txn_env(env);
			goto err;
		}

	LF_CLR(DB_AUTO_COMMIT);

	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
		goto err;

	ret = __db_rename_int(dbp, ip, txn, name, subdb, newname);

	if (txn_local) {
		/*
		 * We created the DBP here and when we commit/abort, we'll
		 * release all the transactional locks, including the handle
		 * lock; mark the handle cleared explicitly.
		 */
		LOCK_INIT(dbp->handle_lock);
		dbp->locker = NULL;
	} else if (txn != NULL) {
		/*
		 * We created this handle locally so we need to close it and
		 * clean it up.  Unfortunately, it's holding transactional
		 * or CDS group locks that need to persist until the end of
		 * transaction.  If we invalidate the locker (dbp->locker),
		 * then the close won't free these locks prematurely.
		 */
		dbp->locker = NULL;
	}

err:	if (txn_local && (t_ret =
	    __db_txn_auto_resolve(env, txn, 0, ret)) != 0 && ret == 0)
		ret = t_ret;

	/*
	 * We never opened this dbp for real, so don't include a transaction
	 * handle, and use NOSYNC to avoid calling into mpool.
	 *
	 * !!!
	 * Note we're reversing the order of operations: we started the txn and
	 * then opened the DB handle; we're resolving the txn and then closing
	 * the DB handle -- it's safer.
	 */
	if (dbp != NULL &&
	    (t_ret = __db_close(dbp, NULL, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;

	ENV_LEAVE(env, ip);
	return (ret);
}
/*
 * __db_rename_pp
 *	DB->rename pre/post processing.
 *
 * PUBLIC: int __db_rename_pp __P((DB *,
 * PUBLIC:     const char *, const char *, const char *, u_int32_t));
 */
int
__db_rename_pp(dbp, name, subdb, newname, flags)
	DB *dbp;
	const char *name, *subdb, *newname;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int handle_check, ret, t_ret;

	env = dbp->env;
	handle_check = 0;

	/*
	 * Validate arguments, continuing to destroy the handle on failure.
	 *
	 * Cannot use DB_ILLEGAL_AFTER_OPEN directly because it returns.
	 *
	 * !!!
	 * We have a serious problem if we're here with a handle used to open
	 * a database -- we'll destroy the handle, and the application won't
	 * ever be able to close the database.
	 */
	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
		return (__db_mi_open(env, "DB->rename", 1));

	/* Validate arguments. */
	if ((ret = __db_fchk(env, "DB->rename", flags, 0)) != 0)
		return (ret);

	/* Check for consistent transaction usage. */
	if ((ret = __db_check_txn(dbp, NULL, DB_LOCK_INVALIDID, 0)) != 0)
		return (ret);

	ENV_ENTER(env, ip);

	handle_check = IS_ENV_REPLICATED(env);
	if (handle_check && (ret = __db_rep_enter(dbp, 1, 1, 0)) != 0) {
		handle_check = 0;
		goto err;
	}

	/* Rename the file. */
	ret = __db_rename(dbp, ip, NULL, name, subdb, newname);

	if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0)
		ret = t_ret;
err:	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __db_rename
 *	DB->rename method.
 *
 *	Thin wrapper: does the rename and then closes the (never-opened)
 *	handle, since DB->rename always consumes the DB handle.
 */
static int
__db_rename(dbp, ip, txn, name, subdb, newname)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb, *newname;
{
	int ret, t_ret;

	ret = __db_rename_int(dbp, ip, txn, name, subdb, newname);

	if ((t_ret = __db_close(dbp, txn, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_rename_int
 *	Worker function for DB->rename method; the close of the dbp is
 *	left in the wrapper routine.
 *
 *	Dispatches to the subdatabase rename when a subdb is named;
 *	otherwise renames a file or named in-memory database, using a
 *	fop-layer placeholder when the rename is transactional.
 *
 * PUBLIC: int __db_rename_int __P((DB *, DB_THREAD_INFO *,
 * PUBLIC:     DB_TXN *, const char *, const char *, const char *));
 */
int
__db_rename_int(dbp, ip, txn, name, subdb, newname)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb, *newname;
{
	ENV *env;
	int ret;
	char *old, *real_name;

	env = dbp->env;
	real_name = NULL;

	DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, name);

	if (name == NULL && subdb == NULL) {
		__db_errx(env, "Rename on temporary files invalid");
		ret = EINVAL;
		goto err;
	}

	if (name == NULL)
		MAKE_INMEM(dbp);
	else if (subdb != NULL) {
		ret = __db_subdb_rename(dbp, ip, txn, name, subdb, newname);
		goto err;
	}

	/*
	 * From here on down, this pertains to files or in-memory databases.
	 *
	 * Find the real name of the file.
	 */
	if (F_ISSET(dbp, DB_AM_INMEM)) {
		/* In-memory: real_name aliases subdb; do not free it. */
		old = (char *)subdb;
		real_name = (char *)subdb;
	} else {
		if ((ret = __db_appname(env, DB_APP_DATA,
		    name, &dbp->dirname, &real_name)) != 0)
			goto err;
		old = (char *)name;
	}
	DB_ASSERT(env, old != NULL);

	if ((ret = __fop_remove_setup(dbp, txn, real_name, 0)) != 0)
		goto err;

	/* Let the access method do its own cleanup first (if any). */
	if (dbp->db_am_rename != NULL &&
	    (ret = dbp->db_am_rename(dbp, ip, txn, name, subdb, newname)) != 0)
		goto err;

	/*
	 * The transactional case and non-transactional case are
	 * quite different.  In the non-transactional case, we simply
	 * do the rename.  In the transactional case, since we need
	 * the ability to back out and maintain locking, we have to
	 * create a temporary object as a placeholder.  This is all
	 * taken care of in the fop layer.
	 */
	if (IS_REAL_TXN(txn)) {
		if ((ret = __fop_dummy(dbp, txn, old, newname)) != 0)
			goto err;
	} else {
		if ((ret = __fop_dbrename(dbp, old, newname)) != 0)
			goto err;
	}

	/*
	 * I am pretty sure that we haven't gotten a dbreg id, so calling
	 * dbreg_filelist_update is not necessary.
	 */
	DB_ASSERT(env, dbp->log_filename == NULL ||
	    dbp->log_filename->id == DB_LOGFILEID_INVALID);

	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, newname);

DB_TEST_RECOVERY_LABEL
err:	if (!F_ISSET(dbp, DB_AM_INMEM) && real_name != NULL)
		__os_free(env, real_name);

	return (ret);
}
/*
 * __db_subdb_rename --
 *	Rename a subdatabase.
 *
 *	Looks the subdatabase up in the master, obtains a write handle
 *	lock on its meta page, then updates the master's name entry.
 */
static int
__db_subdb_rename(dbp, ip, txn, name, subdb, newname)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name, *subdb, *newname;
{
	DB *mdbp;
	ENV *env;
	PAGE *meta;
	int ret, t_ret;

	mdbp = NULL;
	meta = NULL;
	env = dbp->env;

	/*
	 * We have not opened this dbp so it isn't marked as a subdb,
	 * but it ought to be.
	 */
	F_SET(dbp, DB_AM_SUBDB);

	/*
	 * Rename the entry in the main database.  We need to first
	 * get the meta-data page number (via MU_OPEN) so that we can
	 * read the meta-data page and obtain a handle lock.  Once we've
	 * done that, we can proceed to do the rename in the master.
	 */
	if ((ret = __db_master_open(dbp, ip, txn, name, 0, 0, &mdbp)) != 0)
		goto err;

	if ((ret = __db_master_update(mdbp, dbp, ip, txn, subdb, dbp->type,
	    MU_OPEN, NULL, 0)) != 0)
		goto err;

	if ((ret = __memp_fget(mdbp->mpf, &dbp->meta_pgno,
	    ip, txn, 0, &meta)) != 0)
		goto err;
	/* Adopt the subdb's file ID so the handle lock names the subdb. */
	memcpy(dbp->fileid, ((DBMETA *)meta)->uid, DB_FILE_ID_LEN);
	if ((ret = __fop_lock_handle(env,
	    dbp, mdbp->locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn))) != 0)
		goto err;

	/* Release the meta page before updating the master. */
	ret = __memp_fput(mdbp->mpf, ip, meta, dbp->priority);
	meta = NULL;
	if (ret != 0)
		goto err;

	if ((ret = __db_master_update(mdbp, dbp, ip, txn,
	    subdb, dbp->type, MU_RENAME, newname, 0)) != 0)
		goto err;

	DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, name);

DB_TEST_RECOVERY_LABEL
err:
	if (meta != NULL && (t_ret =
	    __memp_fput(mdbp->mpf, ip, meta, dbp->priority)) != 0 && ret == 0)
		ret = t_ret;

	if (mdbp != NULL &&
	    (t_ret = __db_close(mdbp, txn, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
/*
 * __db_ret --
 *	Build return DBT.
 *
 *	Extracts the item at index indx on page h; off-page (overflow)
 *	items are fetched via __db_goff, on-page items are copied out
 *	via __db_retcopy.  memp/memsize are the per-cursor scratch
 *	buffer used when the caller didn't request its own allocation.
 *
 * PUBLIC: int __db_ret __P((DBC *,
 * PUBLIC:     PAGE *, u_int32_t, DBT *, void **, u_int32_t *));
 */
int
__db_ret(dbc, h, indx, dbt, memp, memsize)
	DBC *dbc;
	PAGE *h;
	u_int32_t indx;
	DBT *dbt;
	void **memp;
	u_int32_t *memsize;
{
	BKEYDATA *bk;
	BOVERFLOW *bo;
	DB *dbp;
	HOFFPAGE ho;
	u_int32_t len;
	u_int8_t *hk;
	void *data;

	dbp = dbc->dbp;

	switch (TYPE(h)) {
	case P_HASH_UNSORTED:
	case P_HASH:
		hk = P_ENTRY(dbp, h, indx);
		if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
			/* Copy the header out: it may be unaligned on page. */
			memcpy(&ho, hk, sizeof(HOFFPAGE));
			return (__db_goff(dbc, dbt,
			    ho.tlen, ho.pgno, memp, memsize));
		}
		len = LEN_HKEYDATA(dbp, h, dbp->pgsize, indx);
		data = HKEYDATA_DATA(hk);
		break;
	case P_LBTREE:
	case P_LDUP:
	case P_LRECNO:
		bk = GET_BKEYDATA(dbp, h, indx);
		if (B_TYPE(bk->type) == B_OVERFLOW) {
			bo = (BOVERFLOW *)bk;
			return (__db_goff(dbc, dbt,
			    bo->tlen, bo->pgno, memp, memsize));
		}
		len = bk->len;
		data = bk->data;
		break;
	default:
		return (__db_pgfmt(dbp->env, h->pgno));
	}

	return (__db_retcopy(dbp->env, dbt, data, len, memp, memsize));
}
/*
 * __db_retcopy --
 *	Copy the returned data into the user's DBT, handling special flags.
 *
 *	Honors DB_DBT_PARTIAL, then picks the destination memory based on
 *	the DBT's allocation flags (USERCOPY, MALLOC, REALLOC, USERMEM) or
 *	falls back to the caller-supplied scratch buffer (memp/memsize).
 *	On return dbt->size is always the length of the (partial) record.
 *
 * PUBLIC: int __db_retcopy __P((ENV *, DBT *,
 * PUBLIC:     void *, u_int32_t, void **, u_int32_t *));
 */
int
__db_retcopy(env, dbt, data, len, memp, memsize)
	ENV *env;
	DBT *dbt;
	void *data;
	u_int32_t len;
	void **memp;
	u_int32_t *memsize;
{
	int ret;

	ret = 0;

	/* If returning a partial record, reset the length. */
	if (F_ISSET(dbt, DB_DBT_PARTIAL)) {
		data = (u_int8_t *)data + dbt->doff;
		if (len > dbt->doff) {
			len -= dbt->doff;
			if (len > dbt->dlen)
				len = dbt->dlen;
		} else
			len = 0;
	}

	/*
	 * Allocate memory to be owned by the application: DB_DBT_MALLOC,
	 * DB_DBT_REALLOC.
	 *
	 * !!!
	 * We always allocate memory, even if we're copying out 0 bytes. This
	 * guarantees consistency, i.e., the application can always free memory
	 * without concern as to how many bytes of the record were requested.
	 *
	 * Use the memory specified by the application: DB_DBT_USERMEM.
	 *
	 * !!!
	 * If the length we're going to copy is 0, the application-supplied
	 * memory pointer is allowed to be NULL.
	 */
	if (F_ISSET(dbt, DB_DBT_USERCOPY)) {
		dbt->size = len;
		return (len == 0 ? 0 : env->dbt_usercopy(dbt, 0, data,
		    len, DB_USERCOPY_SETDATA));

	} else if (F_ISSET(dbt, DB_DBT_MALLOC))
		ret = __os_umalloc(env, len, &dbt->data);
	else if (F_ISSET(dbt, DB_DBT_REALLOC)) {
		if (dbt->data == NULL || dbt->size == 0 || dbt->size < len)
			ret = __os_urealloc(env, len, &dbt->data);
	} else if (F_ISSET(dbt, DB_DBT_USERMEM)) {
		if (len != 0 && (dbt->data == NULL || dbt->ulen < len))
			ret = DB_BUFFER_SMALL;
	} else if (memp == NULL || memsize == NULL)
		ret = EINVAL;
	else {
		/* Default: grow the caller's scratch buffer as needed. */
		if (len != 0 && (*memsize == 0 || *memsize < len)) {
			if ((ret = __os_realloc(env, len, memp)) == 0)
				*memsize = len;
			else
				*memsize = 0;
		}
		if (ret == 0)
			dbt->data = *memp;
	}

	if (ret == 0 && len != 0)
		memcpy(dbt->data, data, len);

	/*
	 * Return the length of the returned record in the DBT size field.
	 * This satisfies the requirement that if we're using user memory
	 * and insufficient memory was provided, return the amount necessary
	 * in the size field.
	 */
	dbt->size = len;

	return (ret);
}
/*
 * __env_fileid_reset_pp --
 *	ENV->fileid_reset pre/post processing.
 *
 *	flags may be 0 or DB_ENCRYPT (the file is encrypted).
 *
 * PUBLIC: int __env_fileid_reset_pp __P((DB_ENV *, const char *, u_int32_t));
 */
int
__env_fileid_reset_pp(dbenv, name, flags)
	DB_ENV *dbenv;
	const char *name;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->fileid_reset");

	/*
	 * !!!
	 * The actual argument checking is simple, do it inline, outside of
	 * the replication block.
	 */
	if (flags != 0 && flags != DB_ENCRYPT)
		return (__db_ferr(env, "DB_ENV->fileid_reset", 0));

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env,
	    (__env_fileid_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)),
	    1, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}
/*
 * __env_fileid_reset --
 *	Reset the file IDs for every database in the file.
 *
 *	Generates a fresh file ID, rewrites page 0 directly through the
 *	file system (so the cache cannot connect us to the original file),
 *	then, for multi-database files, walks the master database updating
 *	every subdatabase's meta page through the cache.
 *
 * PUBLIC: int __env_fileid_reset
 * PUBLIC:     __P((ENV *, DB_THREAD_INFO *, const char *, int));
 */
int
__env_fileid_reset(env, ip, name, encrypted)
	ENV *env;
	DB_THREAD_INFO *ip;
	const char *name;
	int encrypted;
{
	DB *dbp;
	DBC *dbcp;
	DBMETA *meta;
	DBT key, data;
	DB_FH *fhp;
	DB_MPOOLFILE *mpf;
	DB_PGINFO cookie;
	db_pgno_t pgno;
	int t_ret, ret;
	size_t n;
	char *real_name;
	u_int8_t fileid[DB_FILE_ID_LEN], mbuf[DBMETASIZE];
	void *pagep;

	dbp = NULL;
	dbcp = NULL;
	fhp = NULL;
	real_name = NULL;

	/* Get the real backing file name. */
	if ((ret = __db_appname(env,
	    DB_APP_DATA, name, NULL, &real_name)) != 0)
		return (ret);

	/* Get a new file ID. */
	if ((ret = __os_fileid(env, real_name, 1, fileid)) != 0)
		goto err;

	/*
	 * The user may have physically copied a file currently open in the
	 * cache, which means if we open this file through the cache before
	 * updating the file ID on page 0, we might connect to the file from
	 * which the copy was made.
	 */
	if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) {
		__db_err(env, ret, "%s", real_name);
		goto err;
	}
	if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
		goto err;

	if (n != sizeof(mbuf)) {
		/* Too short to hold a meta page: not a database file. */
		ret = EINVAL;
		__db_errx(env,
		    "__env_fileid_reset: %s: unexpected file type or format",
		    real_name);
		goto err;
	}

	/*
	 * Create the DB object.
	 */
	if ((ret = __db_create_internal(&dbp, env, 0)) != 0)
		goto err;

	/* If configured with a password, the databases are encrypted. */
	if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0)
		goto err;

	/* Verify/decrypt the meta page and set dbp->type from it. */
	if ((ret = __db_meta_setup(env,
	    dbp, real_name, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0)
		goto err;

	meta = (DBMETA *)mbuf;
	/* Partitioned databases keep per-partition files; reset those too. */
	if (FLD_ISSET(meta->metaflags,
	    DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && (ret =
	    __part_fileid_reset(env, ip, name, meta->nparts, encrypted)) != 0)
		goto err;

	memcpy(meta->uid, fileid, DB_FILE_ID_LEN);
	cookie.db_pagesize = sizeof(mbuf);
	cookie.flags = dbp->flags;
	cookie.type = dbp->type;
	key.data = &cookie;

	/* Re-checksum/encrypt the page image before writing it back. */
	if ((ret = __db_pgout(env->dbenv, 0, mbuf, &key)) != 0)
		goto err;
	if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0)
		goto err;
	if ((ret = __os_write(env, fhp, mbuf, sizeof(mbuf), &n)) != 0)
		goto err;
	if ((ret = __os_fsync(env, fhp)) != 0)
		goto err;

	/*
	 * Page 0 of the file has an updated file ID, and we can open it in
	 * the cache without connecting to a different, existing file.  Open
	 * the file in the cache, and update the file IDs for subdatabases.
	 * (No existing code, as far as I know, actually uses the file ID of
	 * a subdatabase, but it's cleaner to get them all.)
	 */

	/*
	 * If the database file doesn't support subdatabases, we only have
	 * to update a single metadata page.  Otherwise, we have to open a
	 * cursor and step through the master database, and update all of
	 * the subdatabases' metadata pages.
	 */
	if (meta->type != P_BTREEMETA || !F_ISSET(meta, BTM_SUBDB))
		goto err;

	/*
	 * Open the DB file.
	 *
	 * !!!
	 * Note DB_RDWRMASTER flag, we need to open the master database file
	 * for writing in this case.
	 */
	if ((ret = __db_open(dbp, ip, NULL,
	    name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0)
		goto err;

	mpf = dbp->mpf;
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	if ((ret = __db_cursor(dbp, ip, NULL, &dbcp, 0)) != 0)
		goto err;
	while ((ret = __dbc_get(dbcp, &key, &data, DB_NEXT)) == 0) {
		/*
		 * XXX
		 * We're handling actual data, not on-page meta-data, so it
		 * hasn't been converted to/from opposite endian architectures.
		 * Do it explicitly, now.
		 */
		memcpy(&pgno, data.data, sizeof(db_pgno_t));
		DB_NTOHL_SWAP(env, &pgno);
		if ((ret = __memp_fget(mpf, &pgno, ip, NULL,
		    DB_MPOOL_DIRTY, &pagep)) != 0)
			goto err;
		memcpy(((DBMETA *)pagep)->uid, fileid, DB_FILE_ID_LEN);
		if ((ret = __memp_fput(mpf, ip, pagep, dbcp->priority)) != 0)
			goto err;
	}
	/* DB_NOTFOUND means we walked off the end of the master: success. */
	if (ret == DB_NOTFOUND)
		ret = 0;

err:	if (dbcp != NULL && (t_ret = __dbc_close(dbcp)) != 0 && ret == 0)
		ret = t_ret;
	if (dbp != NULL && (t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0)
		ret = t_ret;
	if (fhp != NULL &&
	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
		ret = t_ret;
	if (real_name != NULL)
		__os_free(env, real_name);

	return (ret);
}
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/qam.h" + +static int __env_lsn_reset __P((ENV *, DB_THREAD_INFO *, const char *, int)); + +/* + * __env_lsn_reset_pp -- + * ENV->lsn_reset pre/post processing. + * + * PUBLIC: int __env_lsn_reset_pp __P((DB_ENV *, const char *, u_int32_t)); + */ +int +__env_lsn_reset_pp(dbenv, name, flags) + DB_ENV *dbenv; + const char *name; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->lsn_reset"); + + /* + * !!! + * The actual argument checking is simple, do it inline, outside of + * the replication block. + */ + if (flags != 0 && flags != DB_ENCRYPT) + return (__db_ferr(env, "DB_ENV->lsn_reset", 0)); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, + (__env_lsn_reset(env, ip, name, LF_ISSET(DB_ENCRYPT) ? 1 : 0)), + 1, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __env_lsn_reset -- + * Reset the LSNs for every page in the file. + */ +static int +__env_lsn_reset(env, ip, name, encrypted) + ENV *env; + DB_THREAD_INFO *ip; + const char *name; + int encrypted; +{ + DB *dbp; + int t_ret, ret; + + /* Create the DB object. */ + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + return (ret); + + /* If configured with a password, the databases are encrypted. */ + if (encrypted && (ret = __db_set_flags(dbp, DB_ENCRYPT)) != 0) + goto err; + + /* + * Open the DB file. + * + * !!! + * Note DB_RDWRMASTER flag, we need to open the master database file + * for writing in this case. 
+ */ + if ((ret = __db_open(dbp, ip, NULL, + name, NULL, DB_UNKNOWN, DB_RDWRMASTER, 0, PGNO_BASE_MD)) != 0) { + __db_err(env, ret, "%s", name); + goto err; + } + + ret = __db_lsn_reset(dbp->mpf, ip); +#ifdef HAVE_PARTITION + if (ret == 0 && DB_IS_PARTITIONED(dbp)) + ret = __part_lsn_reset(dbp, ip); + else +#endif + if (ret == 0 && dbp->type == DB_QUEUE) +#ifdef HAVE_QUEUE + ret = __qam_lsn_reset(dbp, ip); +#else + ret = __db_no_queue_am(env); +#endif + +err: if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_lsn_reset -- reset the lsn for a db mpool handle. + * PUBLIC: int __db_lsn_reset __P((DB_MPOOLFILE *, DB_THREAD_INFO *)); + */ +int +__db_lsn_reset(mpf, ip) + DB_MPOOLFILE *mpf; + DB_THREAD_INFO *ip; +{ + PAGE *pagep; + db_pgno_t pgno; + int ret; + + /* Reset the LSN on every page of the database file. */ + for (pgno = 0; + (ret = __memp_fget(mpf, + &pgno, ip, NULL, DB_MPOOL_DIRTY, &pagep)) == 0; + ++pgno) { + LSN_NOT_LOGGED(pagep->lsn); + if ((ret = __memp_fput(mpf, + ip, pagep, DB_PRIORITY_UNCHANGED)) != 0) + break; + } + + if (ret == DB_PAGE_NOTFOUND) + ret = 0; + + return (ret); +} diff --git a/db/db_sort_multiple.c b/db/db_sort_multiple.c new file mode 100644 index 0000000..32ae2df --- /dev/null +++ b/db/db_sort_multiple.c @@ -0,0 +1,287 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" + +static int __db_quicksort __P((DB *, DBT *, DBT *, u_int32_t *, u_int32_t *, + u_int32_t *, u_int32_t *, u_int32_t)); + +/* + * __db_compare_both -- + * Use the comparison functions from db to compare akey and bkey, and if + * DB_DUPSORT adata and bdata. 
+ * + * PUBLIC: int __db_compare_both __P((DB *, const DBT *, const DBT *, + * PUBLIC: const DBT *, const DBT *)); + */ +int +__db_compare_both(db, akey, adata, bkey, bdata) + DB *db; + const DBT *akey; + const DBT *adata; + const DBT *bkey; + const DBT *bdata; +{ + BTREE *t; + int cmp; + + t = (BTREE *)db->bt_internal; + + cmp = t->bt_compare(db, akey, bkey); + if (cmp != 0) return cmp; + if (!F_ISSET(db, DB_AM_DUPSORT)) return 0; + + if (adata == 0) return bdata == 0 ? 0 : -1; + if (bdata == 0) return 1; + +#ifdef HAVE_COMPRESSION + if (DB_IS_COMPRESSED(db)) + return t->compress_dup_compare(db, adata, bdata); +#endif + return db->dup_compare(db, adata, bdata); +} + +#define DB_SORT_SWAP(a, ad, b, bd) \ +do { \ + tmp = (a)[0]; (a)[0] = (b)[0]; (b)[0] = tmp; \ + tmp = (a)[-1]; (a)[-1] = (b)[-1]; (b)[-1] = tmp; \ + if (data != NULL) { \ + tmp = (ad)[0]; (ad)[0] = (bd)[0]; (bd)[0] = tmp; \ + tmp = (ad)[-1]; (ad)[-1] = (bd)[-1]; (bd)[-1] = tmp; \ + } \ +} while (0) + +#define DB_SORT_LOAD_DBT(a, ad, aptr, adptr) \ +do { \ + (a).data = (u_int8_t*)key->data + (aptr)[0]; \ + (a).size = (aptr)[-1]; \ + if (data != NULL) { \ + (ad).data = (u_int8_t*)data->data + (adptr)[0]; \ + (ad).size = (adptr)[-1]; \ + } \ +} while (0) + +#define DB_SORT_COMPARE(a, ad, b, bd) (data != NULL ? \ + __db_compare_both(db, &(a), &(ad), &(b), &(bd)) : \ + __db_compare_both(db, &(a), 0, &(b), 0)) + +#define DB_SORT_STACKSIZE 32 + +/* + * __db_quicksort -- + * The quicksort implementation for __db_sort_multiple() and + * __db_sort_multiple_key(). 
+ */ +static int +__db_quicksort(db, key, data, kstart, kend, dstart, dend, size) + DB *db; + DBT *key, *data; + u_int32_t *kstart, *kend, *dstart, *dend; + u_int32_t size; +{ + int ret; + u_int32_t tmp; + u_int32_t *kmiddle, *dmiddle, *kptr, *dptr; + DBT a, ad, b, bd, m, md; + ENV *env; + + struct DB_SORT_quicksort_stack { + u_int32_t *kstart; + u_int32_t *kend; + u_int32_t *dstart; + u_int32_t *dend; + } stackbuf[DB_SORT_STACKSIZE], *stack; + u_int32_t soff, slen; + + ret = 0; + env = db->env; + + memset(&a, 0, sizeof(DBT)); + memset(&ad, 0, sizeof(DBT)); + memset(&b, 0, sizeof(DBT)); + memset(&bd, 0, sizeof(DBT)); + memset(&m, 0, sizeof(DBT)); + memset(&md, 0, sizeof(DBT)); + + /* NB end is smaller than start */ + + stack = stackbuf; + soff = 0; + slen = DB_SORT_STACKSIZE; + + start: + if (kend >= kstart) goto pop; + + /* If there's only one value, it's already sorted */ + tmp = (u_int32_t)(kstart - kend) / size; + if (tmp == 1) goto pop; + + DB_SORT_LOAD_DBT(a, ad, kstart, dstart); + DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size); + + if (tmp == 2) { + /* Special case the sorting of two value sequences */ + if (DB_SORT_COMPARE(a, ad, b, bd) > 0) { + DB_SORT_SWAP(kstart, dstart, kend + size, dend + size); + } + goto pop; + } + + kmiddle = kstart - (tmp / 2) * size; + dmiddle = dstart - (tmp / 2) * size; + DB_SORT_LOAD_DBT(m, md, kmiddle, dmiddle); + + /* Find the median of three */ + if (DB_SORT_COMPARE(a, ad, b, bd) < 0) { + if (DB_SORT_COMPARE(m, md, a, ad) < 0) { + /* m < a < b */ + DB_SORT_SWAP(kstart, dstart, kend + size, dend + size); + } else if (DB_SORT_COMPARE(m, md, b, bd) < 0) { + /* a < m < b */ + DB_SORT_SWAP(kmiddle, + dmiddle, kend + size, dend + size); + } else { + /* a < b < m */ + /* Do nothing */ + } + } else { + if (DB_SORT_COMPARE(a, ad, m, md) < 0) { + /* b < a < m */ + DB_SORT_SWAP(kstart, dstart, kend + size, dend + size); + } else if (DB_SORT_COMPARE(b, bd, m, md) < 0) { + /* b < m < a */ + DB_SORT_SWAP(kmiddle, + dmiddle, kend + 
size, dend + size); + } else { + /* m < b < a */ + /* Do nothing */ + } + } + + /* partition */ + DB_SORT_LOAD_DBT(b, bd, kend + size, dend + size); + kmiddle = kstart; + dmiddle = dstart; + for (kptr = kstart, dptr = dstart; kptr > kend; + kptr -= size, dptr -= size) { + DB_SORT_LOAD_DBT(a, ad, kptr, dptr); + if (DB_SORT_COMPARE(a, ad, b, bd) < 0) { + DB_SORT_SWAP(kmiddle, dmiddle, kptr, dptr); + kmiddle -= size; + dmiddle -= size; + } + } + + DB_SORT_SWAP(kmiddle, dmiddle, kend + size, dend + size); + + if (soff == slen) { + /* Grow the stack */ + slen = slen * 2; + if (stack == stackbuf) { + ret = __os_malloc(env, slen * + sizeof(struct DB_SORT_quicksort_stack), &stack); + if (ret != 0) goto error; + memcpy(stack, stackbuf, soff * + sizeof(struct DB_SORT_quicksort_stack)); + } else { + ret = __os_realloc(env, slen * + sizeof(struct DB_SORT_quicksort_stack), &stack); + if (ret != 0) goto error; + } + } + + /* divide and conquer */ + stack[soff].kstart = kmiddle - size; + stack[soff].kend = kend; + stack[soff].dstart = dmiddle - size; + stack[soff].dend = dend; + ++soff; + + kend = kmiddle; + dend = dmiddle; + + goto start; + + pop: + if (soff != 0) { + --soff; + kstart = stack[soff].kstart; + kend = stack[soff].kend; + dstart = stack[soff].dstart; + dend = stack[soff].dend; + goto start; + } + + error: + if (stack != stackbuf) + __os_free(env, stack); + + return ret; +} + +#undef DB_SORT_SWAP +#undef DB_SORT_LOAD_DBT + +/* + * __db_sort_multiple -- + * If flags == DB_MULTIPLE_KEY, sorts a DB_MULTIPLE_KEY format DBT using + * the BTree comparison function and duplicate comparison function. + * + * If flags == DB_MULTIPLE, sorts one or two DB_MULTIPLE format DBTs using + * the BTree comparison function and duplicate comparison function. Will + * assume key and data specifies pairs of key/data to sort together. If + * data is NULL, will just sort key according to the btree comparison + * function. 
+ * + * Uses an in-place quicksort algorithm, with median of three for the pivot + * point. + * + * PUBLIC: int __db_sort_multiple __P((DB *, DBT *, DBT *, u_int32_t)); + */ +int +__db_sort_multiple(db, key, data, flags) + DB *db; + DBT *key, *data; + u_int32_t flags; +{ + u_int32_t *kstart, *kend, *dstart, *dend; + + /* TODO: sanity checks on the DBTs */ + /* DB_ILLEGAL_METHOD(db, DB_OK_BTREE); */ + + kstart = (u_int32_t*)((u_int8_t *)key->data + key->ulen) - 1; + + switch (flags) { + case DB_MULTIPLE: + if (data != NULL) + dstart = (u_int32_t*)((u_int8_t *)data->data + + data->ulen) - 1; + else + dstart = kstart; + + /* Find the end */ + for (kend = kstart, dend = dstart; + *kend != (u_int32_t)-1 && *dend != (u_int32_t)-1; + kend -= 2, dend -= 2) + ; + + return (__db_quicksort(db, key, data, kstart, kend, dstart, + dend, 2)); + case DB_MULTIPLE_KEY: + /* Find the end */ + for (kend = kstart; *kend != (u_int32_t)-1; kend -= 4) + ; + + return (__db_quicksort(db, key, key, kstart, kend, kstart - 2, + kend - 2, 4)); + default: + return (__db_ferr(db->env, "DB->sort_multiple", 0)); + } +} diff --git a/db/db_stati.c b/db/db_stati.c new file mode 100644 index 0000000..b8d3a3f --- /dev/null +++ b/db/db_stati.c @@ -0,0 +1,494 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/qam.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" + +#ifdef HAVE_STATISTICS +static int __db_print_all __P((DB *, u_int32_t)); +static int __db_print_citem __P((DBC *)); +static int __db_print_cursor __P((DB *)); +static int __db_print_stats __P((DB *, DB_THREAD_INFO *, u_int32_t)); +static int __db_stat __P((DB *, DB_THREAD_INFO *, DB_TXN *, void *, u_int32_t)); +static int __db_stat_arg __P((DB *, u_int32_t)); + +/* + * __db_stat_pp -- + * DB->stat pre/post processing. + * + * PUBLIC: int __db_stat_pp __P((DB *, DB_TXN *, void *, u_int32_t)); + */ +int +__db_stat_pp(dbp, txn, spp, flags) + DB *dbp; + DB_TXN *txn; + void *spp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat"); + + if ((ret = __db_stat_arg(dbp, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, + txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + ret = __db_stat(dbp, ip, txn, spp, flags); + + /* Release replication block. */ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_stat -- + * DB->stat. + * + */ +static int +__db_stat(dbp, ip, txn, spp, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + void *spp; + u_int32_t flags; +{ + DBC *dbc; + ENV *env; + int ret, t_ret; + + env = dbp->env; + + /* Acquire a cursor. 
*/ + if ((ret = __db_cursor(dbp, ip, txn, + &dbc, LF_ISSET(DB_READ_COMMITTED | DB_READ_UNCOMMITTED))) != 0) + return (ret); + + DEBUG_LWRITE(dbc, NULL, "DB->stat", NULL, NULL, flags); + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) + ret = __partition_stat(dbc, spp, flags); + else +#endif + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_stat(dbc, spp, flags); + break; + case DB_HASH: + ret = __ham_stat(dbc, spp, flags); + break; + case DB_QUEUE: + ret = __qam_stat(dbc, spp, flags); + break; + case DB_UNKNOWN: + default: + ret = (__db_unknown_type(env, "DB->stat", dbp->type)); + break; + } + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_stat_arg -- + * Check DB->stat arguments. + */ +static int +__db_stat_arg(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + ENV *env; + + env = dbp->env; + + /* Check for invalid function flags. */ + LF_CLR(DB_READ_COMMITTED | DB_READ_UNCOMMITTED); + switch (flags) { + case 0: + case DB_FAST_STAT: + break; + default: + return (__db_ferr(env, "DB->stat", 0)); + } + + return (0); +} + +/* + * __db_stat_print_pp -- + * DB->stat_print pre/post processing. + * + * PUBLIC: int __db_stat_print_pp __P((DB *, u_int32_t)); + */ +int +__db_stat_print_pp(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat_print"); + + /* + * !!! + * The actual argument checking is simple, do it inline. + */ + if ((ret = __db_fchk(env, + "DB->stat_print", flags, DB_FAST_STAT | DB_STAT_ALL)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && (ret = __db_rep_enter(dbp, 1, 0, 0)) != 0) { + handle_check = 0; + goto err; + } + + ret = __db_stat_print(dbp, ip, flags); + + /* Release replication block. 
*/ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + +err: ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_stat_print -- + * DB->stat_print. + * + * PUBLIC: int __db_stat_print __P((DB *, DB_THREAD_INFO *, u_int32_t)); + */ +int +__db_stat_print(dbp, ip, flags) + DB *dbp; + DB_THREAD_INFO *ip; + u_int32_t flags; +{ + time_t now; + int ret; + char time_buf[CTIME_BUFLEN]; + + (void)time(&now); + __db_msg(dbp->env, "%.24s\tLocal time", __os_ctime(&now, time_buf)); + + if (LF_ISSET(DB_STAT_ALL) && (ret = __db_print_all(dbp, flags)) != 0) + return (ret); + + if ((ret = __db_print_stats(dbp, ip, flags)) != 0) + return (ret); + + return (0); +} + +/* + * __db_print_stats -- + * Display default DB handle statistics. + */ +static int +__db_print_stats(dbp, ip, flags) + DB *dbp; + DB_THREAD_INFO *ip; + u_int32_t flags; +{ + DBC *dbc; + ENV *env; + int ret, t_ret; + + env = dbp->env; + + /* Acquire a cursor. */ + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, NULL, "DB->stat_print", NULL, NULL, 0); + + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_stat_print(dbc, flags); + break; + case DB_HASH: + ret = __ham_stat_print(dbc, flags); + break; + case DB_QUEUE: + ret = __qam_stat_print(dbc, flags); + break; + case DB_UNKNOWN: + default: + ret = (__db_unknown_type(env, "DB->stat_print", dbp->type)); + break; + } + + if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_print_all -- + * Display debugging DB handle statistics. 
+ */ +static int +__db_print_all(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + static const FN fn[] = { + { DB_AM_CHKSUM, "DB_AM_CHKSUM" }, + { DB_AM_COMPENSATE, "DB_AM_COMPENSATE" }, + { DB_AM_CREATED, "DB_AM_CREATED" }, + { DB_AM_CREATED_MSTR, "DB_AM_CREATED_MSTR" }, + { DB_AM_DBM_ERROR, "DB_AM_DBM_ERROR" }, + { DB_AM_DELIMITER, "DB_AM_DELIMITER" }, + { DB_AM_DISCARD, "DB_AM_DISCARD" }, + { DB_AM_DUP, "DB_AM_DUP" }, + { DB_AM_DUPSORT, "DB_AM_DUPSORT" }, + { DB_AM_ENCRYPT, "DB_AM_ENCRYPT" }, + { DB_AM_FIXEDLEN, "DB_AM_FIXEDLEN" }, + { DB_AM_INMEM, "DB_AM_INMEM" }, + { DB_AM_IN_RENAME, "DB_AM_IN_RENAME" }, + { DB_AM_NOT_DURABLE, "DB_AM_NOT_DURABLE" }, + { DB_AM_OPEN_CALLED, "DB_AM_OPEN_CALLED" }, + { DB_AM_PAD, "DB_AM_PAD" }, + { DB_AM_PGDEF, "DB_AM_PGDEF" }, + { DB_AM_RDONLY, "DB_AM_RDONLY" }, + { DB_AM_READ_UNCOMMITTED, "DB_AM_READ_UNCOMMITTED" }, + { DB_AM_RECNUM, "DB_AM_RECNUM" }, + { DB_AM_RECOVER, "DB_AM_RECOVER" }, + { DB_AM_RENUMBER, "DB_AM_RENUMBER" }, + { DB_AM_REVSPLITOFF, "DB_AM_REVSPLITOFF" }, + { DB_AM_SECONDARY, "DB_AM_SECONDARY" }, + { DB_AM_SNAPSHOT, "DB_AM_SNAPSHOT" }, + { DB_AM_SUBDB, "DB_AM_SUBDB" }, + { DB_AM_SWAP, "DB_AM_SWAP" }, + { DB_AM_TXN, "DB_AM_TXN" }, + { DB_AM_VERIFYING, "DB_AM_VERIFYING" }, + { 0, NULL } + }; + ENV *env; + char time_buf[CTIME_BUFLEN]; + + env = dbp->env; + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB handle information:"); + STAT_ULONG("Page size", dbp->pgsize); + STAT_ISSET("Append recno", dbp->db_append_recno); + STAT_ISSET("Feedback", dbp->db_feedback); + STAT_ISSET("Dup compare", dbp->dup_compare); + STAT_ISSET("App private", dbp->app_private); + STAT_ISSET("DbEnv", dbp->env); + STAT_STRING("Type", __db_dbtype_to_string(dbp->type)); + + __mutex_print_debug_single(env, "Thread mutex", dbp->mutex, flags); + + STAT_STRING("File", dbp->fname); + STAT_STRING("Database", dbp->dname); + STAT_HEX("Open flags", dbp->open_flags); + + __db_print_fileid(env, dbp->fileid, "\tFile ID"); + + 
STAT_ULONG("Cursor adjust ID", dbp->adj_fileid); + STAT_ULONG("Meta pgno", dbp->meta_pgno); + if (dbp->locker != NULL) + STAT_ULONG("Locker ID", dbp->locker->id); + if (dbp->cur_locker != NULL) + STAT_ULONG("Handle lock", dbp->cur_locker->id); + if (dbp->associate_locker != NULL) + STAT_ULONG("Associate lock", dbp->associate_locker->id); + STAT_ULONG("RPC remote ID", dbp->cl_id); + + __db_msg(env, + "%.24s\tReplication handle timestamp", + dbp->timestamp == 0 ? "0" : __os_ctime(&dbp->timestamp, time_buf)); + + STAT_ISSET("Secondary callback", dbp->s_callback); + STAT_ISSET("Primary handle", dbp->s_primary); + + STAT_ISSET("api internal", dbp->api_internal); + STAT_ISSET("Btree/Recno internal", dbp->bt_internal); + STAT_ISSET("Hash internal", dbp->h_internal); + STAT_ISSET("Queue internal", dbp->q_internal); + + __db_prflags(env, NULL, dbp->flags, fn, NULL, "\tFlags"); + + if (dbp->log_filename == NULL) + STAT_ISSET("File naming information", dbp->log_filename); + else + __dbreg_print_fname(env, dbp->log_filename); + + (void)__db_print_cursor(dbp); + + return (0); +} + +/* + * __db_print_cursor -- + * Display the cursor active and free queues. 
+ */ +static int +__db_print_cursor(dbp) + DB *dbp; +{ + DBC *dbc; + ENV *env; + int ret, t_ret; + + env = dbp->env; + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB handle cursors:"); + + ret = 0; + MUTEX_LOCK(dbp->env, dbp->mutex); + __db_msg(env, "Active queue:"); + TAILQ_FOREACH(dbc, &dbp->active_queue, links) + if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0) + ret = t_ret; + __db_msg(env, "Join queue:"); + TAILQ_FOREACH(dbc, &dbp->join_queue, links) + if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0) + ret = t_ret; + __db_msg(env, "Free queue:"); + TAILQ_FOREACH(dbc, &dbp->free_queue, links) + if ((t_ret = __db_print_citem(dbc)) != 0 && ret == 0) + ret = t_ret; + MUTEX_UNLOCK(dbp->env, dbp->mutex); + + return (ret); +} + +static int +__db_print_citem(dbc) + DBC *dbc; +{ + static const FN fn[] = { + { DBC_ACTIVE, "DBC_ACTIVE" }, + { DBC_DONTLOCK, "DBC_DONTLOCK" }, + { DBC_MULTIPLE, "DBC_MULTIPLE" }, + { DBC_MULTIPLE_KEY, "DBC_MULTIPLE_KEY" }, + { DBC_OPD, "DBC_OPD" }, + { DBC_OWN_LID, "DBC_OWN_LID" }, + { DBC_READ_COMMITTED, "DBC_READ_COMMITTED" }, + { DBC_READ_UNCOMMITTED, "DBC_READ_UNCOMMITTED" }, + { DBC_RECOVER, "DBC_RECOVER" }, + { DBC_RMW, "DBC_RMW" }, + { DBC_TRANSIENT, "DBC_TRANSIENT" }, + { DBC_WAS_READ_COMMITTED,"DBC_WAS_READ_COMMITTED" }, + { DBC_WRITECURSOR, "DBC_WRITECURSOR" }, + { DBC_WRITER, "DBC_WRITER" }, + { 0, NULL } + }; + DB *dbp; + DBC_INTERNAL *cp; + ENV *env; + + dbp = dbc->dbp; + env = dbp->env; + cp = dbc->internal; + + STAT_POINTER("DBC", dbc); + STAT_POINTER("Associated dbp", dbc->dbp); + STAT_POINTER("Associated txn", dbc->txn); + STAT_POINTER("Internal", cp); + STAT_HEX("Default locker ID", dbc->lref == NULL ? 
0 : dbc->lref->id); + STAT_HEX("Locker", P_TO_ULONG(dbc->locker)); + STAT_STRING("Type", __db_dbtype_to_string(dbc->dbtype)); + + STAT_POINTER("Off-page duplicate cursor", cp->opd); + STAT_POINTER("Referenced page", cp->page); + STAT_ULONG("Root", cp->root); + STAT_ULONG("Page number", cp->pgno); + STAT_ULONG("Page index", cp->indx); + STAT_STRING("Lock mode", __db_lockmode_to_string(cp->lock_mode)); + __db_prflags(env, NULL, dbc->flags, fn, NULL, "\tFlags"); + + switch (dbc->dbtype) { + case DB_BTREE: + case DB_RECNO: + __bam_print_cursor(dbc); + break; + case DB_HASH: + __ham_print_cursor(dbc); + break; + case DB_UNKNOWN: + DB_ASSERT(env, dbp->type != DB_UNKNOWN); + /* FALLTHROUGH */ + case DB_QUEUE: + default: + break; + } + return (0); +} + +#else /* !HAVE_STATISTICS */ + +int +__db_stat_pp(dbp, txn, spp, flags) + DB *dbp; + DB_TXN *txn; + void *spp; + u_int32_t flags; +{ + COMPQUIET(spp, NULL); + COMPQUIET(txn, NULL); + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbp->env)); +} + +int +__db_stat_print_pp(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbp->env)); +} +#endif diff --git a/db/db_truncate.c b/db/db_truncate.c new file mode 100644 index 0000000..66f4180 --- /dev/null +++ b/db/db_truncate.c @@ -0,0 +1,225 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/qam.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/partition.h" +#include "dbinc/txn.h" + +static int __db_cursor_check __P((DB *)); + +/* + * __db_truncate_pp + * DB->truncate pre/post processing. 
+ * + * PUBLIC: int __db_truncate_pp __P((DB *, DB_TXN *, u_int32_t *, u_int32_t)); + */ +int +__db_truncate_pp(dbp, txn, countp, flags) + DB *dbp; + DB_TXN *txn; + u_int32_t *countp, flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int handle_check, ret, t_ret, txn_local; + + env = dbp->env; + handle_check = txn_local = 0; + + STRIP_AUTO_COMMIT(flags); + + /* Check for invalid flags. */ + if (F_ISSET(dbp, DB_AM_SECONDARY)) { + __db_errx(env, "DB->truncate forbidden on secondary indices"); + return (EINVAL); + } + if ((ret = __db_fchk(env, "DB->truncate", flags, 0)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + /* + * Make sure there are no active cursors on this db. Since we drop + * pages we cannot really adjust cursors. + */ + if ((ret = __db_cursor_check(dbp)) != 0) { + __db_errx(env, + "DB->truncate not permitted with active cursors"); + goto err; + } + +#ifdef CONFIG_TEST + if (IS_REP_MASTER(env)) + DB_TEST_WAIT(env, env->test_check); +#endif + /* Check for replication block. */ + handle_check = IS_ENV_REPLICATED(env); + if (handle_check && + (ret = __db_rep_enter(dbp, 1, 0, txn != NULL)) != 0) { + handle_check = 0; + goto err; + } + + /* + * Check for changes to a read-only database. This must be after the + * replication block so that we cannot race master/client state changes. + */ + if (DB_IS_READONLY(dbp)) { + ret = __db_rdonly(env, "DB->truncate"); + goto err; + } + + /* + * Create local transaction as necessary, check for consistent + * transaction usage. + */ + if (IS_DB_AUTO_COMMIT(dbp, txn)) { + if ((ret = __txn_begin(env, ip, NULL, &txn, 0)) != 0) + goto err; + txn_local = 1; + } + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0) + goto err; + + ret = __db_truncate(dbp, ip, txn, countp); + +err: if (txn_local && + (t_ret = __db_txn_auto_resolve(env, txn, 0, ret)) && ret == 0) + ret = t_ret; + + /* Release replication block. 
*/ + if (handle_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_truncate + * DB->truncate. + * + * PUBLIC: int __db_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: u_int32_t *)); + */ +int +__db_truncate(dbp, ip, txn, countp) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + u_int32_t *countp; +{ + DB *sdbp; + DBC *dbc; + ENV *env; + u_int32_t scount; + int ret, t_ret; + + env = dbp->env; + dbc = NULL; + ret = 0; + + /* + * Run through all secondaries and truncate them first. The count + * returned is the count of the primary only. QUEUE uses normal + * processing to truncate so it will update the secondaries normally. + */ + if (dbp->type != DB_QUEUE && DB_IS_PRIMARY(dbp)) { + if ((ret = __db_s_first(dbp, &sdbp)) != 0) + return (ret); + for (; sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp, txn)) + if ((ret = __db_truncate(sdbp, ip, txn, &scount)) != 0) + break; + if (sdbp != NULL) + (void)__db_s_done(sdbp, txn); + if (ret != 0) + return (ret); + } + + DB_TEST_RECOVERY(dbp, DB_TEST_PREDESTROY, ret, NULL); + + /* Acquire a cursor. */ + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "DB->truncate", NULL, NULL, 0); +#ifdef HAVE_PARTITION + if (DB_IS_PARTITIONED(dbp)) + ret = __part_truncate(dbc, countp); + else +#endif + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_truncate(dbc, countp); + break; + case DB_HASH: + ret = __ham_truncate(dbc, countp); + break; + case DB_QUEUE: + ret = __qam_truncate(dbc, countp); + break; + case DB_UNKNOWN: + default: + ret = __db_unknown_type(env, "DB->truncate", dbp->type); + break; + } + + /* Discard the cursor. 
*/ + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTDESTROY, ret, NULL); + +DB_TEST_RECOVERY_LABEL + + return (ret); +} + +/* + * __db_cursor_check -- + * See if there are any active cursors on this db. + */ +static int +__db_cursor_check(dbp) + DB *dbp; +{ + DB *ldbp; + DBC *dbc; + ENV *env; + int found; + + env = dbp->env; + + MUTEX_LOCK(env, env->mtx_dblist); + FIND_FIRST_DB_MATCH(env, dbp, ldbp); + for (found = 0; + !found && ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = TAILQ_NEXT(ldbp, dblistlinks)) { + MUTEX_LOCK(env, dbp->mutex); + TAILQ_FOREACH(dbc, &ldbp->active_queue, links) + if (IS_INITIALIZED(dbc)) { + found = 1; + break; + } + MUTEX_UNLOCK(env, dbp->mutex); + } + MUTEX_UNLOCK(env, env->mtx_dblist); + + return (found ? EINVAL : 0); +} diff --git a/db/db_upg.c b/db/db_upg.c new file mode 100644 index 0000000..5a6db94 --- /dev/null +++ b/db/db_upg.c @@ -0,0 +1,510 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/qam.h" + +/* + * __db_upgrade_pp -- + * DB->upgrade pre/post processing. + * + * PUBLIC: int __db_upgrade_pp __P((DB *, const char *, u_int32_t)); + */ +int +__db_upgrade_pp(dbp, fname, flags) + DB *dbp; + const char *fname; + u_int32_t flags; +{ +#ifdef HAVE_UPGRADE_SUPPORT + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbp->env; + + /* + * !!! + * The actual argument checking is simple, do it inline. 
+ */ + if ((ret = __db_fchk(env, "DB->upgrade", flags, DB_DUPSORT)) != 0) + return (ret); + + ENV_ENTER(env, ip); + ret = __db_upgrade(dbp, fname, flags); + ENV_LEAVE(env, ip); + return (ret); +#else + COMPQUIET(dbp, NULL); + COMPQUIET(fname, NULL); + COMPQUIET(flags, 0); + + __db_errx(dbp->env, "upgrade not supported"); + return (EINVAL); +#endif +} + +#ifdef HAVE_UPGRADE_SUPPORT +static int (* const func_31_list[P_PAGETYPE_MAX]) + __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = { + NULL, /* P_INVALID */ + NULL, /* __P_DUPLICATE */ + __ham_31_hash, /* P_HASH_UNSORTED */ + NULL, /* P_IBTREE */ + NULL, /* P_IRECNO */ + __bam_31_lbtree, /* P_LBTREE */ + NULL, /* P_LRECNO */ + NULL, /* P_OVERFLOW */ + __ham_31_hashmeta, /* P_HASHMETA */ + __bam_31_btreemeta, /* P_BTREEMETA */ + NULL, /* P_QAMMETA */ + NULL, /* P_QAMDATA */ + NULL, /* P_LDUP */ + NULL, /* P_HASH */ +}; + +static int (* const func_46_list[P_PAGETYPE_MAX]) + __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)) = { + NULL, /* P_INVALID */ + NULL, /* __P_DUPLICATE */ + __ham_46_hash, /* P_HASH_UNSORTED */ + NULL, /* P_IBTREE */ + NULL, /* P_IRECNO */ + NULL, /* P_LBTREE */ + NULL, /* P_LRECNO */ + NULL, /* P_OVERFLOW */ + __ham_46_hashmeta, /* P_HASHMETA */ + NULL, /* P_BTREEMETA */ + NULL, /* P_QAMMETA */ + NULL, /* P_QAMDATA */ + NULL, /* P_LDUP */ + NULL, /* P_HASH */ +}; + +static int __db_page_pass __P((DB *, char *, u_int32_t, int (* const []) + (DB *, char *, u_int32_t, DB_FH *, PAGE *, int *), DB_FH *)); +static int __db_set_lastpgno __P((DB *, char *, DB_FH *)); + +/* + * __db_upgrade -- + * Upgrade an existing database. 
+ * + * PUBLIC: int __db_upgrade __P((DB *, const char *, u_int32_t)); + */ +int +__db_upgrade(dbp, fname, flags) + DB *dbp; + const char *fname; + u_int32_t flags; +{ + DBMETA *meta; + DB_FH *fhp; + ENV *env; + size_t n; + int ret, t_ret, use_mp_open; + u_int8_t mbuf[256], tmpflags; + char *real_name; + + use_mp_open = 0; + env = dbp->env; + fhp = NULL; + + /* Get the real backing file name. */ + if ((ret = __db_appname(env, + DB_APP_DATA, fname, NULL, &real_name)) != 0) + return (ret); + + /* Open the file. */ + if ((ret = __os_open(env, real_name, 0, 0, 0, &fhp)) != 0) { + __db_err(env, ret, "%s", real_name); + return (ret); + } + + /* Initialize the feedback. */ + if (dbp->db_feedback != NULL) + dbp->db_feedback(dbp, DB_UPGRADE, 0); + + /* + * Read the metadata page. We read 256 bytes, which is larger than + * any access method's metadata page and smaller than any disk sector. + */ + if ((ret = __os_read(env, fhp, mbuf, sizeof(mbuf), &n)) != 0) + goto err; + + switch (((DBMETA *)mbuf)->magic) { + case DB_BTREEMAGIC: + switch (((DBMETA *)mbuf)->version) { + case 6: + /* + * Before V7 not all pages had page types, so we do the + * single meta-data page by hand. + */ + if ((ret = + __bam_30_btreemeta(dbp, real_name, mbuf)) != 0) + goto err; + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + goto err; + if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0) + goto err; + /* FALLTHROUGH */ + case 7: + /* + * We need the page size to do more. Rip it out of + * the meta-data page. 
+ */ + memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t)); + + if ((ret = __db_page_pass( + dbp, real_name, flags, func_31_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 8: + if ((ret = + __db_set_lastpgno(dbp, real_name, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 9: + break; + default: + __db_errx(env, "%s: unsupported btree version: %lu", + real_name, (u_long)((DBMETA *)mbuf)->version); + ret = DB_OLD_VERSION; + goto err; + } + break; + case DB_HASHMAGIC: + switch (((DBMETA *)mbuf)->version) { + case 4: + case 5: + /* + * Before V6 not all pages had page types, so we do the + * single meta-data page by hand. + */ + if ((ret = + __ham_30_hashmeta(dbp, real_name, mbuf)) != 0) + goto err; + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + goto err; + if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0) + goto err; + + /* + * Before V6, we created hash pages one by one as they + * were needed, using hashhdr.ovfl_point to reserve + * a block of page numbers for them. A consequence + * of this was that, if no overflow pages had been + * created, the current doubling might extend past + * the end of the database file. + * + * In DB 3.X, we now create all the hash pages + * belonging to a doubling atomically; it's not + * safe to just save them for later, because when + * we create an overflow page we'll just create + * a new last page (whatever that may be). Grow + * the database to the end of the current doubling. + */ + if ((ret = + __ham_30_sizefix(dbp, fhp, real_name, mbuf)) != 0) + goto err; + /* FALLTHROUGH */ + case 6: + /* + * We need the page size to do more. Rip it out of + * the meta-data page. 
+ */ + memcpy(&dbp->pgsize, mbuf + 20, sizeof(u_int32_t)); + + if ((ret = __db_page_pass( + dbp, real_name, flags, func_31_list, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 7: + if ((ret = + __db_set_lastpgno(dbp, real_name, fhp)) != 0) + goto err; + /* FALLTHROUGH */ + case 8: + /* + * Any upgrade that has proceeded this far has metadata + * pages compatible with hash version 8 metadata pages, + * so casting mbuf to a dbmeta is safe. + * If a newer revision moves the pagesize, checksum or + * encrypt_alg flags in the metadata, then the + * extraction of the fields will need to use hard coded + * offsets. + */ + meta = (DBMETA*)mbuf; + /* + * We need the page size to do more. Extract it from + * the meta-data page. + */ + memcpy(&dbp->pgsize, &meta->pagesize, + sizeof(u_int32_t)); + /* + * Rip out metadata and encrypt_alg fields from the + * metadata page. So the upgrade can know how big + * the page metadata pre-amble is. Any upgrade that has + * proceeded this far has metadata pages compatible + * with hash version 8 metadata pages, so extracting + * the fields is safe. + */ + memcpy(&tmpflags, &meta->metaflags, sizeof(u_int8_t)); + if (FLD_ISSET(tmpflags, DBMETA_CHKSUM)) + F_SET(dbp, DB_AM_CHKSUM); + memcpy(&tmpflags, &meta->encrypt_alg, sizeof(u_int8_t)); + if (tmpflags != 0) { + if (!CRYPTO_ON(dbp->env)) { + __db_errx(env, +"Attempt to upgrade an encrypted database without providing a password."); + ret = EINVAL; + goto err; + } + F_SET(dbp, DB_AM_ENCRYPT); + } + + /* + * This is ugly. It is necessary to have a usable + * mpool in the dbp to upgrade from an unsorted + * to a sorted hash database. The mpool file is used + * to resolve offpage key items, which are needed to + * determine sort order. Having mpool open and access + * the file does not affect the page pass, since the + * page pass only updates DB_HASH_UNSORTED pages + * in-place, and the mpool file is only used to read + * OFFPAGE items. 
+ */ + use_mp_open = 1; + if ((ret = __os_closehandle(env, fhp)) != 0) + return (ret); + dbp->type = DB_HASH; + if ((ret = __env_mpool(dbp, fname, + DB_AM_NOT_DURABLE | DB_AM_VERIFYING)) != 0) + return (ret); + fhp = dbp->mpf->fhp; + + /* Do the actual conversion pass. */ + if ((ret = __db_page_pass( + dbp, real_name, flags, func_46_list, fhp)) != 0) + goto err; + + /* FALLTHROUGH */ + case 9: + break; + default: + __db_errx(env, "%s: unsupported hash version: %lu", + real_name, (u_long)((DBMETA *)mbuf)->version); + ret = DB_OLD_VERSION; + goto err; + } + break; + case DB_QAMMAGIC: + switch (((DBMETA *)mbuf)->version) { + case 1: + /* + * If we're in a Queue database, the only page that + * needs upgrading is the meta-database page, don't + * bother with a full pass. + */ + if ((ret = __qam_31_qammeta(dbp, real_name, mbuf)) != 0) + return (ret); + /* FALLTHROUGH */ + case 2: + if ((ret = __qam_32_qammeta(dbp, real_name, mbuf)) != 0) + return (ret); + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + goto err; + if ((ret = __os_write(env, fhp, mbuf, 256, &n)) != 0) + goto err; + /* FALLTHROUGH */ + case 3: + case 4: + break; + default: + __db_errx(env, "%s: unsupported queue version: %lu", + real_name, (u_long)((DBMETA *)mbuf)->version); + ret = DB_OLD_VERSION; + goto err; + } + break; + default: + M_32_SWAP(((DBMETA *)mbuf)->magic); + switch (((DBMETA *)mbuf)->magic) { + case DB_BTREEMAGIC: + case DB_HASHMAGIC: + case DB_QAMMAGIC: + __db_errx(env, + "%s: DB->upgrade only supported on native byte-order systems", + real_name); + break; + default: + __db_errx(env, + "%s: unrecognized file type", real_name); + break; + } + ret = EINVAL; + goto err; + } + + ret = __os_fsync(env, fhp); + + /* + * If mp_open was used, then rely on the database close to clean up + * any file handles. + */ +err: if (use_mp_open == 0 && fhp != NULL && + (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + __os_free(env, real_name); + + /* We're done. 
*/ + if (dbp->db_feedback != NULL) + dbp->db_feedback(dbp, DB_UPGRADE, 100); + + return (ret); +} + +/* + * __db_page_pass -- + * Walk the pages of the database, upgrading whatever needs it. + */ +static int +__db_page_pass(dbp, real_name, flags, fl, fhp) + DB *dbp; + char *real_name; + u_int32_t flags; + int (* const fl[P_PAGETYPE_MAX]) + __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + DB_FH *fhp; +{ + ENV *env; + PAGE *page; + db_pgno_t i, pgno_last; + size_t n; + int dirty, ret; + + env = dbp->env; + + /* Determine the last page of the file. */ + if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0) + return (ret); + + /* Allocate memory for a single page. */ + if ((ret = __os_malloc(env, dbp->pgsize, &page)) != 0) + return (ret); + + /* Walk the file, calling the underlying conversion functions. */ + for (i = 0; i < pgno_last; ++i) { + if (dbp->db_feedback != NULL) + dbp->db_feedback( + dbp, DB_UPGRADE, (int)((i * 100)/pgno_last)); + if ((ret = __os_seek(env, fhp, i, dbp->pgsize, 0)) != 0) + break; + if ((ret = __os_read(env, fhp, page, dbp->pgsize, &n)) != 0) + break; + dirty = 0; + /* Always decrypt the page. */ + if ((ret = __db_decrypt_pg(env, dbp, page)) != 0) + break; + if (fl[TYPE(page)] != NULL && (ret = fl[TYPE(page)] + (dbp, real_name, flags, fhp, page, &dirty)) != 0) + break; + if (dirty) { + if ((ret = __db_encrypt_and_checksum_pg( + env, dbp, page)) != 0) + break; + if ((ret = + __os_seek(env, fhp, i, dbp->pgsize, 0)) != 0) + break; + if ((ret = __os_write(env, + fhp, page, dbp->pgsize, &n)) != 0) + break; + } + } + + __os_free(dbp->env, page); + return (ret); +} + +/* + * __db_lastpgno -- + * Return the current last page number of the file. 
+ * + * PUBLIC: int __db_lastpgno __P((DB *, char *, DB_FH *, db_pgno_t *)); + */ +int +__db_lastpgno(dbp, real_name, fhp, pgno_lastp) + DB *dbp; + char *real_name; + DB_FH *fhp; + db_pgno_t *pgno_lastp; +{ + ENV *env; + db_pgno_t pgno_last; + u_int32_t mbytes, bytes; + int ret; + + env = dbp->env; + + if ((ret = __os_ioinfo(env, + real_name, fhp, &mbytes, &bytes, NULL)) != 0) { + __db_err(env, ret, "%s", real_name); + return (ret); + } + + /* Page sizes have to be a power-of-two. */ + if (bytes % dbp->pgsize != 0) { + __db_errx(env, + "%s: file size not a multiple of the pagesize", real_name); + return (EINVAL); + } + pgno_last = mbytes * (MEGABYTE / dbp->pgsize); + pgno_last += bytes / dbp->pgsize; + + *pgno_lastp = pgno_last; + return (0); +} + +/* + * __db_set_lastpgno -- + * Update the meta->last_pgno field. + * + * Code assumes that we do not have checksums/crypto on the page. + */ +static int +__db_set_lastpgno(dbp, real_name, fhp) + DB *dbp; + char *real_name; + DB_FH *fhp; +{ + DBMETA meta; + ENV *env; + int ret; + size_t n; + + env = dbp->env; + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + return (ret); + if ((ret = __os_read(env, fhp, &meta, sizeof(meta), &n)) != 0) + return (ret); + dbp->pgsize = meta.pagesize; + if ((ret = __db_lastpgno(dbp, real_name, fhp, &meta.last_pgno)) != 0) + return (ret); + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + return (ret); + if ((ret = __os_write(env, fhp, &meta, sizeof(meta), &n)) != 0) + return (ret); + + return (0); +} +#endif /* HAVE_UPGRADE_SUPPORT */ diff --git a/db/db_upg_opd.c b/db/db_upg_opd.c new file mode 100644 index 0000000..ea143cf --- /dev/null +++ b/db/db_upg_opd.c @@ -0,0 +1,343 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" + +static int __db_build_bi __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *)); +static int __db_build_ri __P((DB *, DB_FH *, PAGE *, PAGE *, u_int32_t, int *)); +static int __db_up_ovref __P((DB *, DB_FH *, db_pgno_t)); + +#define GET_PAGE(dbp, fhp, pgno, page) { \ + if ((ret = __os_seek( \ + dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0) \ + goto err; \ + if ((ret = __os_read(dbp->env, \ + fhp, page, (dbp)->pgsize, &n)) != 0) \ + goto err; \ +} +#define PUT_PAGE(dbp, fhp, pgno, page) { \ + if ((ret = __os_seek( \ + dbp->env, fhp, pgno, (dbp)->pgsize, 0)) != 0) \ + goto err; \ + if ((ret = __os_write(dbp->env, \ + fhp, page, (dbp)->pgsize, &n)) != 0) \ + goto err; \ +} + +/* + * __db_31_offdup -- + * Convert 3.0 off-page duplicates to 3.1 off-page duplicates. + * + * PUBLIC: int __db_31_offdup __P((DB *, char *, DB_FH *, int, db_pgno_t *)); + */ +int +__db_31_offdup(dbp, real_name, fhp, sorted, pgnop) + DB *dbp; + char *real_name; + DB_FH *fhp; + int sorted; + db_pgno_t *pgnop; +{ + PAGE *ipage, *page; + db_indx_t indx; + db_pgno_t cur_cnt, i, next_cnt, pgno, *pgno_cur, pgno_last; + db_pgno_t *pgno_next, pgno_max, *tmp; + db_recno_t nrecs; + size_t n; + int level, nomem, ret; + + ipage = page = NULL; + pgno_cur = pgno_next = NULL; + + /* Allocate room to hold a page. */ + if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0) + goto err; + + /* + * Walk the chain of 3.0 off-page duplicates. Each one is converted + * in place to a 3.1 off-page duplicate page. If the duplicates are + * sorted, they are converted to a Btree leaf page, otherwise to a + * Recno leaf page. 
+ */ + for (nrecs = 0, cur_cnt = pgno_max = 0, + pgno = *pgnop; pgno != PGNO_INVALID;) { + if (pgno_max == cur_cnt) { + pgno_max += 20; + if ((ret = __os_realloc(dbp->env, pgno_max * + sizeof(db_pgno_t), &pgno_cur)) != 0) + goto err; + } + pgno_cur[cur_cnt++] = pgno; + + GET_PAGE(dbp, fhp, pgno, page); + nrecs += NUM_ENT(page); + LEVEL(page) = LEAFLEVEL; + TYPE(page) = sorted ? P_LDUP : P_LRECNO; + /* + * !!! + * DB didn't zero the LSNs on off-page duplicates pages. + */ + ZERO_LSN(LSN(page)); + PUT_PAGE(dbp, fhp, pgno, page); + + pgno = NEXT_PGNO(page); + } + + /* If we only have a single page, it's easy. */ + if (cur_cnt <= 1) + goto done; + + /* + * pgno_cur is the list of pages we just converted. We're + * going to walk that list, but we'll need to create a new + * list while we do so. + */ + if ((ret = __os_malloc(dbp->env, + cur_cnt * sizeof(db_pgno_t), &pgno_next)) != 0) + goto err; + + /* Figure out where we can start allocating new pages. */ + if ((ret = __db_lastpgno(dbp, real_name, fhp, &pgno_last)) != 0) + goto err; + + /* Allocate room for an internal page. */ + if ((ret = __os_malloc(dbp->env, dbp->pgsize, &ipage)) != 0) + goto err; + PGNO(ipage) = PGNO_INVALID; + + /* + * Repeatedly walk the list of pages, building internal pages, until + * there's only one page at a level. + */ + for (level = LEAFLEVEL + 1; cur_cnt > 1; ++level) { + for (indx = 0, i = next_cnt = 0; i < cur_cnt;) { + if (indx == 0) { + P_INIT(ipage, dbp->pgsize, pgno_last, + PGNO_INVALID, PGNO_INVALID, + level, sorted ? P_IBTREE : P_IRECNO); + ZERO_LSN(LSN(ipage)); + + pgno_next[next_cnt++] = pgno_last++; + } + + GET_PAGE(dbp, fhp, pgno_cur[i], page); + + /* + * If the duplicates are sorted, put the first item on + * the lower-level page onto a Btree internal page. If + * the duplicates are not sorted, create an internal + * Recno structure on the page. If either case doesn't + * fit, push out the current page and start a new one. 
+ */ + nomem = 0; + if (sorted) { + if ((ret = __db_build_bi( + dbp, fhp, ipage, page, indx, &nomem)) != 0) + goto err; + } else + if ((ret = __db_build_ri( + dbp, fhp, ipage, page, indx, &nomem)) != 0) + goto err; + if (nomem) { + indx = 0; + PUT_PAGE(dbp, fhp, PGNO(ipage), ipage); + } else { + ++indx; + ++NUM_ENT(ipage); + ++i; + } + } + + /* + * Push out the last internal page. Set the top-level record + * count if we've reached the top. + */ + if (next_cnt == 1) + RE_NREC_SET(ipage, nrecs); + PUT_PAGE(dbp, fhp, PGNO(ipage), ipage); + + /* Swap the current and next page number arrays. */ + cur_cnt = next_cnt; + tmp = pgno_cur; + pgno_cur = pgno_next; + pgno_next = tmp; + } + +done: *pgnop = pgno_cur[0]; + +err: if (pgno_cur != NULL) + __os_free(dbp->env, pgno_cur); + if (pgno_next != NULL) + __os_free(dbp->env, pgno_next); + if (ipage != NULL) + __os_free(dbp->env, ipage); + if (page != NULL) + __os_free(dbp->env, page); + + return (ret); +} + +/* + * __db_build_bi -- + * Build a BINTERNAL entry for a parent page. + */ +static int +__db_build_bi(dbp, fhp, ipage, page, indx, nomemp) + DB *dbp; + DB_FH *fhp; + PAGE *ipage, *page; + u_int32_t indx; + int *nomemp; +{ + BINTERNAL bi, *child_bi; + BKEYDATA *child_bk; + u_int8_t *p; + int ret; + db_indx_t *inp; + + inp = P_INP(dbp, ipage); + switch (TYPE(page)) { + case P_IBTREE: + child_bi = GET_BINTERNAL(dbp, page, 0); + if (P_FREESPACE(dbp, ipage) < BINTERNAL_PSIZE(child_bi->len)) { + *nomemp = 1; + return (0); + } + inp[indx] = + HOFFSET(ipage) -= BINTERNAL_SIZE(child_bi->len); + p = P_ENTRY(dbp, ipage, indx); + + bi.len = child_bi->len; + B_TSET(bi.type, child_bi->type); + bi.pgno = PGNO(page); + bi.nrecs = __bam_total(dbp, page); + memcpy(p, &bi, SSZA(BINTERNAL, data)); + p += SSZA(BINTERNAL, data); + memcpy(p, child_bi->data, child_bi->len); + + /* Increment the overflow ref count. 
*/ + if (B_TYPE(child_bi->type) == B_OVERFLOW) + if ((ret = __db_up_ovref(dbp, fhp, + ((BOVERFLOW *)(child_bi->data))->pgno)) != 0) + return (ret); + break; + case P_LDUP: + child_bk = GET_BKEYDATA(dbp, page, 0); + switch (B_TYPE(child_bk->type)) { + case B_KEYDATA: + if (P_FREESPACE(dbp, ipage) < + BINTERNAL_PSIZE(child_bk->len)) { + *nomemp = 1; + return (0); + } + inp[indx] = + HOFFSET(ipage) -= BINTERNAL_SIZE(child_bk->len); + p = P_ENTRY(dbp, ipage, indx); + + bi.len = child_bk->len; + B_TSET(bi.type, child_bk->type); + bi.pgno = PGNO(page); + bi.nrecs = __bam_total(dbp, page); + memcpy(p, &bi, SSZA(BINTERNAL, data)); + p += SSZA(BINTERNAL, data); + memcpy(p, child_bk->data, child_bk->len); + break; + case B_OVERFLOW: + if (P_FREESPACE(dbp, ipage) < + BINTERNAL_PSIZE(BOVERFLOW_SIZE)) { + *nomemp = 1; + return (0); + } + inp[indx] = + HOFFSET(ipage) -= BINTERNAL_SIZE(BOVERFLOW_SIZE); + p = P_ENTRY(dbp, ipage, indx); + + bi.len = BOVERFLOW_SIZE; + B_TSET(bi.type, child_bk->type); + bi.pgno = PGNO(page); + bi.nrecs = __bam_total(dbp, page); + memcpy(p, &bi, SSZA(BINTERNAL, data)); + p += SSZA(BINTERNAL, data); + memcpy(p, child_bk, BOVERFLOW_SIZE); + + /* Increment the overflow ref count. */ + if ((ret = __db_up_ovref(dbp, fhp, + ((BOVERFLOW *)child_bk)->pgno)) != 0) + return (ret); + break; + default: + return (__db_pgfmt(dbp->env, PGNO(page))); + } + break; + default: + return (__db_pgfmt(dbp->env, PGNO(page))); + } + + return (0); +} + +/* + * __db_build_ri -- + * Build a RINTERNAL entry for an internal parent page. 
+ */ +static int +__db_build_ri(dbp, fhp, ipage, page, indx, nomemp) + DB *dbp; + DB_FH *fhp; + PAGE *ipage, *page; + u_int32_t indx; + int *nomemp; +{ + RINTERNAL ri; + db_indx_t *inp; + + COMPQUIET(fhp, NULL); + inp = P_INP(dbp, ipage); + if (P_FREESPACE(dbp, ipage) < RINTERNAL_PSIZE) { + *nomemp = 1; + return (0); + } + + ri.pgno = PGNO(page); + ri.nrecs = __bam_total(dbp, page); + inp[indx] = HOFFSET(ipage) -= RINTERNAL_SIZE; + memcpy(P_ENTRY(dbp, ipage, indx), &ri, RINTERNAL_SIZE); + + return (0); +} + +/* + * __db_up_ovref -- + * Increment/decrement the reference count on an overflow page. + */ +static int +__db_up_ovref(dbp, fhp, pgno) + DB *dbp; + DB_FH *fhp; + db_pgno_t pgno; +{ + PAGE *page; + size_t n; + int ret; + + /* Allocate room to hold a page. */ + if ((ret = __os_malloc(dbp->env, dbp->pgsize, &page)) != 0) + return (ret); + + GET_PAGE(dbp, fhp, pgno, page); + ++OV_REF(page); + PUT_PAGE(dbp, fhp, pgno, page); + +err: __os_free(dbp->env, page); + + return (ret); +} diff --git a/db/db_vrfy.c b/db/db_vrfy.c new file mode 100644 index 0000000..7ea9c62 --- /dev/null +++ b/db/db_vrfy.c @@ -0,0 +1,2894 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2000-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_swap.h" +#include "dbinc/db_verify.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +/* + * This is the code for DB->verify, the DB database consistency checker. + * For now, it checks all subdatabases in a database, and verifies + * everything it knows how to (i.e. it's all-or-nothing, and one can't + * check only for a subset of possible problems). 
+ */ + +static u_int __db_guesspgsize __P((ENV *, DB_FH *)); +static int __db_is_valid_magicno __P((u_int32_t, DBTYPE *)); +static int __db_meta2pgset + __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t, DB *)); +static int __db_salvage __P((DB *, VRFY_DBINFO *, + db_pgno_t, void *, int (*)(void *, const void *), u_int32_t)); +static int __db_salvage_subdbpg __P((DB *, VRFY_DBINFO *, + PAGE *, void *, int (*)(void *, const void *), u_int32_t)); +static int __db_salvage_all __P((DB *, VRFY_DBINFO *, void *, + int(*)(void *, const void *), u_int32_t, int *)); +static int __db_salvage_unknowns __P((DB *, VRFY_DBINFO *, void *, + int (*)(void *, const void *), u_int32_t)); +static int __db_verify_arg __P((DB *, const char *, void *, u_int32_t)); +static int __db_vrfy_freelist + __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t)); +static int __db_vrfy_invalid + __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); +static int __db_vrfy_orderchkonly __P((DB *, + VRFY_DBINFO *, const char *, const char *, u_int32_t)); +static int __db_vrfy_pagezero __P((DB *, VRFY_DBINFO *, DB_FH *, u_int32_t)); +static int __db_vrfy_subdbs + __P((DB *, VRFY_DBINFO *, const char *, u_int32_t)); +static int __db_vrfy_structure __P((DB *, VRFY_DBINFO *, + const char *, db_pgno_t, void *, void *, u_int32_t)); +static int __db_vrfy_walkpages __P((DB *, VRFY_DBINFO *, + void *, int (*)(void *, const void *), u_int32_t)); + +#define VERIFY_FLAGS \ + (DB_AGGRESSIVE | \ + DB_NOORDERCHK | DB_ORDERCHKONLY | DB_PRINTABLE | DB_SALVAGE | DB_UNREF) + +/* + * __db_verify_pp -- + * DB->verify public interface. + * + * PUBLIC: int __db_verify_pp + * PUBLIC: __P((DB *, const char *, const char *, FILE *, u_int32_t)); + */ +int +__db_verify_pp(dbp, file, database, outfile, flags) + DB *dbp; + const char *file, *database; + FILE *outfile; + u_int32_t flags; +{ + /* + * __db_verify_pp is a wrapper to __db_verify_internal, which lets + * us pass appropriate equivalents to FILE * in from the non-C APIs. 
+ * That's why the usual ENV_ENTER macros are in __db_verify_internal, + * not here. + */ + return (__db_verify_internal(dbp, + file, database, outfile, __db_pr_callback, flags)); +} + +/* + * __db_verify_internal -- + * + * PUBLIC: int __db_verify_internal __P((DB *, const char *, + * PUBLIC: const char *, void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__db_verify_internal(dbp, fname, dname, handle, callback, flags) + DB *dbp; + const char *fname, *dname; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret, t_ret; + + env = dbp->env; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->verify"); + + if (!LF_ISSET(DB_SALVAGE)) + LF_SET(DB_UNREF); + + ENV_ENTER(env, ip); + + if ((ret = __db_verify_arg(dbp, dname, handle, flags)) == 0) + ret = __db_verify(dbp, ip, + fname, dname, handle, callback, NULL, NULL, flags); + + /* Db.verify is a DB handle destructor. */ + if ((t_ret = __db_close(dbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __db_verify_arg -- + * Check DB->verify arguments. + */ +static int +__db_verify_arg(dbp, dname, handle, flags) + DB *dbp; + const char *dname; + void *handle; + u_int32_t flags; +{ + ENV *env; + int ret; + + env = dbp->env; + + if ((ret = __db_fchk(env, "DB->verify", flags, VERIFY_FLAGS)) != 0) + return (ret); + + /* + * DB_SALVAGE is mutually exclusive with the other flags except + * DB_AGGRESSIVE, DB_PRINTABLE. + * + * DB_AGGRESSIVE and DB_PRINTABLE are only meaningful when salvaging. + * + * DB_SALVAGE requires an output stream. 
+ */ + if (LF_ISSET(DB_SALVAGE)) { + if (LF_ISSET(~(DB_AGGRESSIVE | DB_PRINTABLE | DB_SALVAGE))) + return (__db_ferr(env, "DB->verify", 1)); + if (handle == NULL) { + __db_errx(env, + "DB_SALVAGE requires a an output handle"); + return (EINVAL); + } + } else + if (LF_ISSET(DB_AGGRESSIVE | DB_PRINTABLE)) + return (__db_ferr(env, "DB->verify", 1)); + + /* + * DB_ORDERCHKONLY is mutually exclusive with DB_SALVAGE and + * DB_NOORDERCHK, and requires a database name. + */ + if ((ret = __db_fcchk(env, "DB->verify", flags, + DB_ORDERCHKONLY, DB_SALVAGE | DB_NOORDERCHK)) != 0) + return (ret); + if (LF_ISSET(DB_ORDERCHKONLY) && dname == NULL) { + __db_errx(env, "DB_ORDERCHKONLY requires a database name"); + return (EINVAL); + } + return (0); +} + +/* + * __db_verify -- + * Walk the entire file page-by-page, either verifying with or without + * dumping in db_dump -d format, or DB_SALVAGE-ing whatever key/data + * pairs can be found and dumping them in standard (db_load-ready) + * dump format. + * + * (Salvaging isn't really a verification operation, but we put it + * here anyway because it requires essentially identical top-level + * code.) + * + * flags may be 0, DB_NOORDERCHK, DB_ORDERCHKONLY, or DB_SALVAGE + * (and optionally DB_AGGRESSIVE). + * PUBLIC: int __db_verify __P((DB *, DB_THREAD_INFO *, const char *, + * PUBLIC: const char *, void *, int (*)(void *, const void *), + * PUBLIC: void *, void *, u_int32_t)); + */ +int +__db_verify(dbp, ip, name, subdb, handle, callback, lp, rp, flags) + DB *dbp; + DB_THREAD_INFO *ip; + const char *name, *subdb; + void *handle; + int (*callback) __P((void *, const void *)); + void *lp, *rp; + u_int32_t flags; +{ + DB_FH *fhp; + ENV *env; + VRFY_DBINFO *vdp; + u_int32_t sflags; + int has_subdbs, isbad, ret, t_ret; + char *real_name; + + env = dbp->env; + fhp = NULL; + vdp = NULL; + real_name = NULL; + has_subdbs = isbad = ret = t_ret = 0; + + F_SET(dbp, DB_AM_VERIFYING); + + /* Initialize any feedback function. 
*/ + if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL) + dbp->db_feedback(dbp, DB_VERIFY, 0); + + /* + * We don't know how large the cache is, and if the database + * in question uses a small page size--which we don't know + * yet!--it may be uncomfortably small for the default page + * size [#2143]. However, the things we need temporary + * databases for in dbinfo are largely tiny, so using a + * 1024-byte pagesize is probably not going to be a big hit, + * and will make us fit better into small spaces. + */ + if ((ret = __db_vrfy_dbinfo_create(env, ip, 1024, &vdp)) != 0) + goto err; + + /* + * Note whether the user has requested that we use printable + * chars where possible. We won't get here with this flag if + * we're not salvaging. + */ + if (LF_ISSET(DB_PRINTABLE)) + F_SET(vdp, SALVAGE_PRINTABLE); + + /* Find the real name of the file. */ + if ((ret = __db_appname(env, + DB_APP_DATA, name, &dbp->dirname, &real_name)) != 0) + goto err; + + /* + * Our first order of business is to verify page 0, which is + * the metadata page for the master database of subdatabases + * or of the only database in the file. We want to do this by hand + * rather than just calling __db_open in case it's corrupt--various + * things in __db_open might act funny. + * + * Once we know the metadata page is healthy, I believe that it's + * safe to open the database normally and then use the page swapping + * code, which makes life easier. + */ + if ((ret = __os_open(env, real_name, 0, DB_OSO_RDONLY, 0, &fhp)) != 0) + goto err; + + /* Verify the metadata page 0; set pagesize and type. */ + if ((ret = __db_vrfy_pagezero(dbp, vdp, fhp, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * We can assume at this point that dbp->pagesize and dbp->type are + * set correctly, or at least as well as they can be, and that + * locking, logging, and txns are not in use. 
Thus we can trust + * the memp code not to look at the page, and thus to be safe + * enough to use. + * + * The dbp is not open, but the file is open in the fhp, and we + * cannot assume that __db_open is safe. Call __env_setup, + * the [safe] part of __db_open that initializes the environment-- + * and the mpool--manually. + */ + if ((ret = __env_setup(dbp, NULL, + name, subdb, TXN_INVALID, DB_ODDFILESIZE | DB_RDONLY)) != 0) + goto err; + + /* + * Set our name in the Queue subsystem; we may need it later + * to deal with extents. + */ + if (dbp->type == DB_QUEUE && + (ret = __qam_set_ext_data(dbp, name)) != 0) + goto err; + + /* Mark the dbp as opened, so that we correctly handle its close. */ + F_SET(dbp, DB_AM_OPEN_CALLED); + + /* Find out the page number of the last page in the database. */ + if ((ret = __memp_get_last_pgno(dbp->mpf, &vdp->last_pgno)) != 0) + goto err; + + /* + * DB_ORDERCHKONLY is a special case; our file consists of + * several subdatabases, which use different hash, bt_compare, + * and/or dup_compare functions. Consequently, we couldn't verify + * sorting and hashing simply by calling DB->verify() on the file. + * DB_ORDERCHKONLY allows us to come back and check those things; it + * requires a subdatabase, and assumes that everything but that + * database's sorting/hashing is correct. + */ + if (LF_ISSET(DB_ORDERCHKONLY)) { + ret = __db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags); + goto done; + } + + sflags = flags; + if (dbp->p_internal != NULL) + LF_CLR(DB_SALVAGE); + + /* + * When salvaging, we use a db to keep track of whether we've seen a + * given overflow or dup page in the course of traversing normal data. + * If in the end we have not, we assume its key got lost and print it + * with key "UNKNOWN". + */ + if (LF_ISSET(DB_SALVAGE)) { + if ((ret = __db_salvage_init(vdp)) != 0) + goto err; + + /* + * If we're not being aggressive, salvage by walking the tree + * and only printing the leaves we find. 
"has_subdbs" will + * indicate whether we found subdatabases. + */ + if (!LF_ISSET(DB_AGGRESSIVE) && __db_salvage_all( + dbp, vdp, handle, callback, flags, &has_subdbs) != 0) + isbad = 1; + + /* + * If we have subdatabases, flag if any keys are found that + * don't belong to a subdatabase -- they'll need to have an + * "__OTHER__" subdatabase header printed first. + */ + if (has_subdbs) { + F_SET(vdp, SALVAGE_PRINTHEADER); + F_SET(vdp, SALVAGE_HASSUBDBS); + } + } + + /* Walk all the pages, if a page cannot be read, verify structure. */ + if ((ret = + __db_vrfy_walkpages(dbp, vdp, handle, callback, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else if (ret != DB_PAGE_NOTFOUND) + goto err; + } + + /* If we're verifying, verify inter-page structure. */ + if (!LF_ISSET(DB_SALVAGE) && isbad == 0) + if ((t_ret = __db_vrfy_structure(dbp, + vdp, name, 0, lp, rp, flags)) != 0) { + if (t_ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * If we're salvaging, output with key UNKNOWN any overflow or dup pages + * we haven't been able to put in context. Then destroy the salvager's + * state-saving database. + */ + if (LF_ISSET(DB_SALVAGE)) { + if ((ret = __db_salvage_unknowns(dbp, + vdp, handle, callback, flags)) != 0) + isbad = 1; + } + + flags = sflags; + +#ifdef HAVE_PARTITION + if (t_ret == 0 && dbp->p_internal != NULL) + t_ret = __part_verify(dbp, vdp, name, handle, callback, flags); +#endif + + if (ret == 0) + ret = t_ret; + + /* Don't display a footer for a database holding other databases. */ + if (LF_ISSET(DB_SALVAGE | DB_VERIFY_PARTITION) == DB_SALVAGE && + (!has_subdbs || F_ISSET(vdp, SALVAGE_PRINTFOOTER))) + (void)__db_prfooter(handle, callback); + +done: err: + /* Send feedback that we're done. 
*/ + if (!LF_ISSET(DB_SALVAGE) && dbp->db_feedback != NULL) + dbp->db_feedback(dbp, DB_VERIFY, 100); + + if (LF_ISSET(DB_SALVAGE) && + (t_ret = __db_salvage_destroy(vdp)) != 0 && ret == 0) + ret = t_ret; + if (fhp != NULL && + (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + if (vdp != NULL && + (t_ret = __db_vrfy_dbinfo_destroy(env, vdp)) != 0 && ret == 0) + ret = t_ret; + if (real_name != NULL) + __os_free(env, real_name); + + /* + * DB_VERIFY_FATAL is a private error, translate to a public one. + * + * If we didn't find a page, it's probably a page number was corrupted. + * Return the standard corruption error. + * + * Otherwise, if we found corruption along the way, set the return. + */ + if (ret == DB_VERIFY_FATAL || + ret == DB_PAGE_NOTFOUND || (ret == 0 && isbad == 1)) + ret = DB_VERIFY_BAD; + + /* Make sure there's a public complaint if we found corruption. */ + if (ret != 0) + __db_err(env, ret, "%s", name); + + return (ret); +} + +/* + * __db_vrfy_pagezero -- + * Verify the master metadata page. Use seek, read, and a local buffer + * rather than the DB paging code, for safety. + * + * Must correctly (or best-guess) set dbp->type and dbp->pagesize. + */ +static int +__db_vrfy_pagezero(dbp, vdp, fhp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + DB_FH *fhp; + u_int32_t flags; +{ + DBMETA *meta; + ENV *env; + VRFY_PAGEINFO *pip; + db_pgno_t freelist; + size_t nr; + int isbad, ret, swapped; + u_int8_t mbuf[DBMETASIZE]; + + isbad = ret = swapped = 0; + freelist = 0; + env = dbp->env; + meta = (DBMETA *)mbuf; + dbp->type = DB_UNKNOWN; + + if ((ret = __db_vrfy_getpageinfo(vdp, PGNO_BASE_MD, &pip)) != 0) + return (ret); + + /* + * Seek to the metadata page. + * Note that if we're just starting a verification, dbp->pgsize + * may be zero; this is okay, as we want page zero anyway and + * 0*0 == 0. 
+ */ + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0 || + (ret = __os_read(env, fhp, mbuf, DBMETASIZE, &nr)) != 0) { + __db_err(env, ret, + "Metadata page %lu cannot be read", (u_long)PGNO_BASE_MD); + return (ret); + } + + if (nr != DBMETASIZE) { + EPRINT((env, + "Page %lu: Incomplete metadata page", + (u_long)PGNO_BASE_MD)); + return (DB_VERIFY_FATAL); + } + + if ((ret = __db_chk_meta(env, dbp, meta, 1)) != 0) { + EPRINT((env, + "Page %lu: metadata page corrupted", (u_long)PGNO_BASE_MD)); + isbad = 1; + if (ret != -1) { + EPRINT((env, + "Page %lu: could not check metadata page", + (u_long)PGNO_BASE_MD)); + return (DB_VERIFY_FATAL); + } + } + + /* + * Check all of the fields that we can. + * + * 08-11: Current page number. Must == pgno. + * Note that endianness doesn't matter--it's zero. + */ + if (meta->pgno != PGNO_BASE_MD) { + isbad = 1; + EPRINT((env, "Page %lu: pgno incorrectly set to %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->pgno)); + } + + /* 12-15: Magic number. Must be one of valid set. */ + if (__db_is_valid_magicno(meta->magic, &dbp->type)) + swapped = 0; + else { + M_32_SWAP(meta->magic); + if (__db_is_valid_magicno(meta->magic, + &dbp->type)) + swapped = 1; + else { + isbad = 1; + EPRINT((env, + "Page %lu: bad magic number %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->magic)); + } + } + + /* + * 16-19: Version. Must be current; for now, we + * don't support verification of old versions. + */ + if (swapped) + M_32_SWAP(meta->version); + if ((dbp->type == DB_BTREE && + (meta->version > DB_BTREEVERSION || + meta->version < DB_BTREEOLDVER)) || + (dbp->type == DB_HASH && + (meta->version > DB_HASHVERSION || + meta->version < DB_HASHOLDVER)) || + (dbp->type == DB_QUEUE && + (meta->version > DB_QAMVERSION || + meta->version < DB_QAMOLDVER))) { + isbad = 1; + EPRINT((env, + "Page %lu: unsupported DB version %lu; extraneous errors may result", + (u_long)PGNO_BASE_MD, (u_long)meta->version)); + } + + /* + * 20-23: Pagesize. 
Must be power of two, + * greater than 512, and less than 64K. + */ + if (swapped) + M_32_SWAP(meta->pagesize); + if (IS_VALID_PAGESIZE(meta->pagesize)) + dbp->pgsize = meta->pagesize; + else { + isbad = 1; + EPRINT((env, "Page %lu: bad page size %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->pagesize)); + + /* + * Now try to settle on a pagesize to use. + * If the user-supplied one is reasonable, + * use it; else, guess. + */ + if (!IS_VALID_PAGESIZE(dbp->pgsize)) + dbp->pgsize = __db_guesspgsize(env, fhp); + } + + /* + * 25: Page type. Must be correct for dbp->type, + * which is by now set as well as it can be. + */ + /* Needs no swapping--only one byte! */ + if ((dbp->type == DB_BTREE && meta->type != P_BTREEMETA) || + (dbp->type == DB_HASH && meta->type != P_HASHMETA) || + (dbp->type == DB_QUEUE && meta->type != P_QAMMETA)) { + isbad = 1; + EPRINT((env, "Page %lu: bad page type %lu", + (u_long)PGNO_BASE_MD, (u_long)meta->type)); + } + + /* + * 26: Meta-flags. + */ + if (meta->metaflags != 0) { + if (FLD_ISSET(meta->metaflags, + ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) { + isbad = 1; + EPRINT((env, + "Page %lu: bad meta-data flags value %#lx", + (u_long)PGNO_BASE_MD, (u_long)meta->metaflags)); + } + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(pip, VRFY_HAS_CHKSUM); + if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE)) + F_SET(pip, VRFY_HAS_PART_RANGE); + if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) + F_SET(pip, VRFY_HAS_PART_CALLBACK); + + if (FLD_ISSET(meta->metaflags, + DBMETA_PART_RANGE | DBMETA_PART_CALLBACK) && + (ret = __partition_init(dbp, meta->metaflags)) != 0) + return (ret); + } + + /* + * 28-31: Free list page number. + * 32-35: Last page in database file. + * We'll verify its sensibility when we do inter-page + * verification later; for now, just store it. 
+ */ + if (swapped) + M_32_SWAP(meta->free); + freelist = meta->free; + if (swapped) + M_32_SWAP(meta->last_pgno); + vdp->meta_last_pgno = meta->last_pgno; + + /* + * Initialize vdp->pages to fit a single pageinfo structure for + * this one page. We'll realloc later when we know how many + * pages there are. + */ + pip->pgno = PGNO_BASE_MD; + pip->type = meta->type; + + /* + * Signal that we still have to check the info specific to + * a given type of meta page. + */ + F_SET(pip, VRFY_INCOMPLETE); + + pip->free = freelist; + + if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + return (ret); + + /* Set up the dbp's fileid. We don't use the regular open path. */ + memcpy(dbp->fileid, meta->uid, DB_FILE_ID_LEN); + + if (swapped == 1) + F_SET(dbp, DB_AM_SWAP); + + return (isbad ? DB_VERIFY_BAD : 0); +} + +/* + * __db_vrfy_walkpages -- + * Main loop of the verifier/salvager. Walks through, + * page by page, and verifies all pages and/or prints all data pages. + */ +static int +__db_vrfy_walkpages(dbp, vdp, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + VRFY_PAGEINFO *pip; + db_pgno_t i; + int ret, t_ret, isbad; + + env = dbp->env; + mpf = dbp->mpf; + h = NULL; + ret = isbad = t_ret = 0; + + for (i = 0; i <= vdp->last_pgno; i++) { + /* + * If DB_SALVAGE is set, we inspect our database of completed + * pages, and skip any we've already printed in the subdb pass. + */ + if (LF_ISSET(DB_SALVAGE) && (__db_salvage_isdone(vdp, i) != 0)) + continue; + + /* + * An individual page get can fail if: + * * This is a hash database, it is expected to find + * empty buckets, which don't have allocated pages. Create + * a dummy page so the verification can proceed. + * * We are salvaging, flag the error and continue. 
+ */ + if ((t_ret = __memp_fget(mpf, &i, + vdp->thread_info, NULL, 0, &h)) != 0) { + if (dbp->type == DB_HASH) { + if ((t_ret = + __db_vrfy_getpageinfo(vdp, i, &pip)) != 0) + goto err1; + pip->type = P_INVALID; + pip->pgno = i; + F_CLR(pip, VRFY_IS_ALLZEROES); + if ((t_ret = __db_vrfy_putpageinfo( + env, vdp, pip)) != 0) + goto err1; + continue; + } + if (t_ret == DB_PAGE_NOTFOUND) { + EPRINT((env, + "Page %lu: beyond the end of the file, metadata page has last page as %lu", + (u_long)i, (u_long)vdp->last_pgno)); + if (ret == 0) + return (t_ret); + } + +err1: if (ret == 0) + ret = t_ret; + if (LF_ISSET(DB_SALVAGE)) + continue; + return (ret); + } + + if (LF_ISSET(DB_SALVAGE)) { + /* + * We pretty much don't want to quit unless a + * bomb hits. May as well return that something + * was screwy, however. + */ + if ((t_ret = __db_salvage_pg(dbp, + vdp, i, h, handle, callback, flags)) != 0) { + if (ret == 0) + ret = t_ret; + isbad = 1; + } + } else { + /* + * If we are not salvaging, and we get any error + * other than DB_VERIFY_BAD, return immediately; + * it may not be safe to proceed. If we get + * DB_VERIFY_BAD, keep going; listing more errors + * may make it easier to diagnose problems and + * determine the magnitude of the corruption. + * + * Verify info common to all page types. 
+ */ + if (i != PGNO_BASE_MD) { + ret = __db_vrfy_common(dbp, vdp, h, i, flags); + if (ret == DB_VERIFY_BAD) + isbad = 1; + else if (ret != 0) + goto err; + } + + switch (TYPE(h)) { + case P_INVALID: + ret = __db_vrfy_invalid(dbp, vdp, h, i, flags); + break; + case __P_DUPLICATE: + isbad = 1; + EPRINT((env, + "Page %lu: old-style duplicate page", + (u_long)i)); + break; + case P_HASH_UNSORTED: + case P_HASH: + ret = __ham_vrfy(dbp, vdp, h, i, flags); + break; + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LDUP: + ret = __bam_vrfy(dbp, vdp, h, i, flags); + break; + case P_LRECNO: + ret = __ram_vrfy_leaf(dbp, vdp, h, i, flags); + break; + case P_OVERFLOW: + ret = __db_vrfy_overflow(dbp, vdp, h, i, flags); + break; + case P_HASHMETA: + ret = __ham_vrfy_meta(dbp, + vdp, (HMETA *)h, i, flags); + break; + case P_BTREEMETA: + ret = __bam_vrfy_meta(dbp, + vdp, (BTMETA *)h, i, flags); + break; + case P_QAMMETA: + ret = __qam_vrfy_meta(dbp, + vdp, (QMETA *)h, i, flags); + break; + case P_QAMDATA: + ret = __qam_vrfy_data(dbp, + vdp, (QPAGE *)h, i, flags); + break; + default: + EPRINT((env, + "Page %lu: unknown page type %lu", + (u_long)i, (u_long)TYPE(h))); + isbad = 1; + break; + } + + /* + * Set up error return. + */ + if (ret == DB_VERIFY_BAD) + isbad = 1; + else if (ret != 0) + goto err; + + /* + * Provide feedback to the application about our + * progress. The range 0-50% comes from the fact + * that this is the first of two passes through the + * database (front-to-back, then top-to-bottom). + */ + if (dbp->db_feedback != NULL) + dbp->db_feedback(dbp, DB_VERIFY, + (int)((i + 1) * 50 / (vdp->last_pgno + 1))); + } + + /* + * Just as with the page get, bail if and only if we're + * not salvaging. 
+ */ + if ((t_ret = __memp_fput(mpf, + vdp->thread_info, h, dbp->priority)) != 0) { + if (ret == 0) + ret = t_ret; + if (!LF_ISSET(DB_SALVAGE)) + return (ret); + } + } + + /* + * If we've seen a Queue metadata page, we may need to walk Queue + * extent pages that won't show up between 0 and vdp->last_pgno. + */ + if (F_ISSET(vdp, VRFY_QMETA_SET) && (t_ret = + __qam_vrfy_walkqueue(dbp, vdp, handle, callback, flags)) != 0) { + if (ret == 0) + ret = t_ret; + if (t_ret == DB_VERIFY_BAD) + isbad = 1; + else if (!LF_ISSET(DB_SALVAGE)) + return (ret); + } + + if (0) { +err: if (h != NULL && (t_ret = __memp_fput(mpf, + vdp->thread_info, h, dbp->priority)) != 0) + return (ret == 0 ? t_ret : ret); + } + + return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD : ret); +} + +/* + * __db_vrfy_structure-- + * After a beginning-to-end walk through the database has been + * completed, put together the information that has been collected + * to verify the overall database structure. + * + * Should only be called if we want to do a database verification, + * i.e. if DB_SALVAGE is not set. + */ +static int +__db_vrfy_structure(dbp, vdp, dbname, meta_pgno, lp, rp, flags) + DB *dbp; + VRFY_DBINFO *vdp; + const char *dbname; + db_pgno_t meta_pgno; + void *lp, *rp; + u_int32_t flags; +{ + DB *pgset; + ENV *env; + VRFY_PAGEINFO *pip; + db_pgno_t i; + int ret, isbad, hassubs, p; + + isbad = 0; + pip = NULL; + env = dbp->env; + pgset = vdp->pgset; + + /* + * Providing feedback here is tricky; in most situations, + * we fetch each page one more time, but we do so in a top-down + * order that depends on the access method. Worse, we do this + * recursively in btree, such that on any call where we're traversing + * a subtree we don't know where that subtree is in the whole database; + * worse still, any given database may be one of several subdbs. + * + * The solution is to decrement a counter vdp->pgs_remaining each time + * we verify (and call feedback on) a page. 
We may over- or + * under-count, but the structure feedback function will ensure that we + * never give a percentage under 50 or over 100. (The first pass + * covered the range 0-50%.) + */ + if (dbp->db_feedback != NULL) + vdp->pgs_remaining = vdp->last_pgno + 1; + + /* + * Call the appropriate function to downwards-traverse the db type. + */ + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + if ((ret = + __bam_vrfy_structure(dbp, vdp, 0, lp, rp, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * If we have subdatabases and we know that the database is, + * thus far, sound, it's safe to walk the tree of subdatabases. + * Do so, and verify the structure of the databases within. + */ + if ((ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) != 0) + goto err; + hassubs = F_ISSET(pip, VRFY_HAS_SUBDBS) ? 1 : 0; + if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + goto err; + pip = NULL; + + if (isbad == 0 && hassubs) + if ((ret = + __db_vrfy_subdbs(dbp, vdp, dbname, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + break; + case DB_HASH: + if ((ret = __ham_vrfy_structure(dbp, vdp, 0, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + break; + case DB_QUEUE: + if ((ret = __qam_vrfy_structure(dbp, vdp, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + } + + /* + * Queue pages may be unreferenced and totally zeroed, if + * they're empty; queue doesn't have much structure, so + * this is unlikely to be wrong in any troublesome sense. + * Skip to "err". + */ + goto err; + case DB_UNKNOWN: + default: + ret = __db_unknown_path(env, "__db_vrfy_structure"); + goto err; + } + + /* Walk free list. 
*/ + if ((ret = + __db_vrfy_freelist(dbp, vdp, meta_pgno, flags)) == DB_VERIFY_BAD) + isbad = 1; + + /* + * If structure checks up until now have failed, it's likely that + * checking what pages have been missed will result in oodles of + * extraneous error messages being EPRINTed. Skip to the end + * if this is the case; we're going to be printing at least one + * error anyway, and probably all the more salient ones. + */ + if (ret != 0 || isbad == 1) + goto err; + + /* + * Make sure no page has been missed and that no page is still marked + * "all zeroes" (only certain hash pages can be, and they're unmarked + * in __ham_vrfy_structure). + */ + for (i = 0; i < vdp->last_pgno + 1; i++) { + if ((ret = __db_vrfy_getpageinfo(vdp, i, &pip)) != 0) + goto err; + if ((ret = __db_vrfy_pgset_get(pgset, + vdp->thread_info, i, &p)) != 0) + goto err; + if (pip->type == P_OVERFLOW) { + if ((u_int32_t)p != pip->refcount) { + EPRINT((env, + "Page %lu: overflow refcount %lu, referenced %lu times", + (u_long)i, + (u_long)pip->refcount, (u_long)p)); + isbad = 1; + } + } else if (p == 0 && +#ifndef HAVE_FTRUNCATE + !(i > vdp->meta_last_pgno && + (F_ISSET(pip, VRFY_IS_ALLZEROES) || pip->type == P_HASH)) && +#endif + !(dbp->type == DB_HASH && pip->type == P_INVALID)) { + /* + * It is OK for unreferenced hash buckets to be + * marked invalid and unreferenced. + */ + EPRINT((env, + "Page %lu: unreferenced page", (u_long)i)); + isbad = 1; + } + + if (F_ISSET(pip, VRFY_IS_ALLZEROES) +#ifndef HAVE_FTRUNCATE + && i <= vdp->meta_last_pgno +#endif + ) { + EPRINT((env, + "Page %lu: totally zeroed page", (u_long)i)); + isbad = 1; + } + if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + goto err; + pip = NULL; + } + +err: if (pip != NULL) + (void)__db_vrfy_putpageinfo(env, vdp, pip); + + return ((isbad == 1 && ret == 0) ? 
DB_VERIFY_BAD : ret); +} + +/* + * __db_is_valid_magicno + */ +static int +__db_is_valid_magicno(magic, typep) + u_int32_t magic; + DBTYPE *typep; +{ + switch (magic) { + case DB_BTREEMAGIC: + *typep = DB_BTREE; + return (1); + case DB_HASHMAGIC: + *typep = DB_HASH; + return (1); + case DB_QAMMAGIC: + *typep = DB_QUEUE; + return (1); + default: + break; + } + *typep = DB_UNKNOWN; + return (0); +} + +/* + * __db_vrfy_common -- + * Verify info common to all page types. + * + * PUBLIC: int __db_vrfy_common + * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); + */ +int +__db_vrfy_common(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + int ret, t_ret; + u_int8_t *p; + + env = dbp->env; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + pip->pgno = pgno; + F_CLR(pip, VRFY_IS_ALLZEROES); + + /* + * Hash expands the table by leaving some pages between the + * old last and the new last totally zeroed. These pages may + * not be all zero if they were used, freed and then reallocated. + * + * Queue will create sparse files if sparse record numbers are used. + */ + if (pgno != 0 && PGNO(h) == 0) { + F_SET(pip, VRFY_IS_ALLZEROES); + for (p = (u_int8_t *)h; p < (u_int8_t *)h + dbp->pgsize; p++) + if (*p != 0) { + F_CLR(pip, VRFY_IS_ALLZEROES); + break; + } + /* + * Mark it as a hash, and we'll + * check that that makes sense structurally later. + * (The queue verification doesn't care, since queues + * don't really have much in the way of structure.) + */ + pip->type = P_HASH; + ret = 0; + goto err; /* well, not really an err. */ + } + + if (PGNO(h) != pgno) { + EPRINT((env, "Page %lu: bad page number %lu", + (u_long)pgno, (u_long)h->pgno)); + ret = DB_VERIFY_BAD; + } + + switch (h->type) { + case P_INVALID: /* Order matches ordinal value. 
*/ + case P_HASH_UNSORTED: + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LRECNO: + case P_OVERFLOW: + case P_HASHMETA: + case P_BTREEMETA: + case P_QAMMETA: + case P_QAMDATA: + case P_LDUP: + case P_HASH: + break; + default: + EPRINT((env, "Page %lu: bad page type %lu", + (u_long)pgno, (u_long)h->type)); + ret = DB_VERIFY_BAD; + } + pip->type = h->type; + +err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_vrfy_invalid -- + * Verify P_INVALID page. + * (Yes, there's not much to do here.) + */ +static int +__db_vrfy_invalid(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + int ret, t_ret; + + env = dbp->env; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + pip->next_pgno = pip->prev_pgno = 0; + + if (!IS_VALID_PGNO(NEXT_PGNO(h))) { + EPRINT((env, "Page %lu: invalid next_pgno %lu", + (u_long)pgno, (u_long)NEXT_PGNO(h))); + ret = DB_VERIFY_BAD; + } else + pip->next_pgno = NEXT_PGNO(h); + + if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_vrfy_datapage -- + * Verify elements common to data pages (P_HASH, P_LBTREE, + * P_IBTREE, P_IRECNO, P_LRECNO, P_OVERFLOW, P_DUPLICATE)--i.e., + * those defined in the PAGE structure. + * + * Called from each of the per-page routines, after the + * all-page-type-common elements of pip have been verified and filled + * in. 
+ * + * PUBLIC: int __db_vrfy_datapage + * PUBLIC: __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, u_int32_t)); + */ +int +__db_vrfy_datapage(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + u_int32_t smallest_entry; + int isbad, ret, t_ret; + + env = dbp->env; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + isbad = 0; + + /* + * prev_pgno and next_pgno: store for inter-page checks, + * verify that they point to actual pages and not to self. + * + * !!! + * Internal btree pages do not maintain these fields (indeed, + * they overload them). Skip. + */ + if (TYPE(h) != P_IBTREE && TYPE(h) != P_IRECNO) { + if (!IS_VALID_PGNO(PREV_PGNO(h)) || PREV_PGNO(h) == pip->pgno) { + isbad = 1; + EPRINT((env, "Page %lu: invalid prev_pgno %lu", + (u_long)pip->pgno, (u_long)PREV_PGNO(h))); + } + if (!IS_VALID_PGNO(NEXT_PGNO(h)) || NEXT_PGNO(h) == pip->pgno) { + isbad = 1; + EPRINT((env, "Page %lu: invalid next_pgno %lu", + (u_long)pip->pgno, (u_long)NEXT_PGNO(h))); + } + pip->prev_pgno = PREV_PGNO(h); + pip->next_pgno = NEXT_PGNO(h); + } + + /* + * Verify the number of entries on the page: there's no good way to + * determine if this is accurate. The best we can do is verify that + * it's not more than can, in theory, fit on the page. Then, we make + * sure there are at least this many valid elements in inp[], and + * hope the test catches most cases. 
+ */ + switch (TYPE(h)) { + case P_HASH_UNSORTED: + case P_HASH: + smallest_entry = HKEYDATA_PSIZE(0); + break; + case P_IBTREE: + smallest_entry = BINTERNAL_PSIZE(0); + break; + case P_IRECNO: + smallest_entry = RINTERNAL_PSIZE; + break; + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + smallest_entry = BKEYDATA_PSIZE(0); + break; + default: + smallest_entry = 0; + break; + } + if (smallest_entry * NUM_ENT(h) / 2 > dbp->pgsize) { + isbad = 1; + EPRINT((env, "Page %lu: too many entries: %lu", + (u_long)pgno, (u_long)NUM_ENT(h))); + } + + if (TYPE(h) != P_OVERFLOW) + pip->entries = NUM_ENT(h); + + /* + * btree level. Should be zero unless we're a btree; + * if we are a btree, should be between LEAFLEVEL and MAXBTREELEVEL, + * and we need to save it off. + */ + switch (TYPE(h)) { + case P_IBTREE: + case P_IRECNO: + if (LEVEL(h) < LEAFLEVEL + 1) { + isbad = 1; + EPRINT((env, "Page %lu: bad btree level %lu", + (u_long)pgno, (u_long)LEVEL(h))); + } + pip->bt_level = LEVEL(h); + break; + case P_LBTREE: + case P_LDUP: + case P_LRECNO: + if (LEVEL(h) != LEAFLEVEL) { + isbad = 1; + EPRINT((env, + "Page %lu: btree leaf page has incorrect level %lu", + (u_long)pgno, (u_long)LEVEL(h))); + } + break; + default: + if (LEVEL(h) != 0) { + isbad = 1; + EPRINT((env, + "Page %lu: nonzero level %lu in non-btree database", + (u_long)pgno, (u_long)LEVEL(h))); + } + break; + } + + /* + * Even though inp[] occurs in all PAGEs, we look at it in the + * access-method-specific code, since btree and hash treat + * item lengths very differently, and one of the most important + * things we want to verify is that the data--as specified + * by offset and length--cover the right part of the page + * without overlaps, gaps, or violations of the page boundary. + */ + if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + + return ((ret == 0 && isbad == 1) ? 
DB_VERIFY_BAD : ret); +} + +/* + * __db_vrfy_meta-- + * Verify the access-method common parts of a meta page, using + * normal mpool routines. + * + * PUBLIC: int __db_vrfy_meta + * PUBLIC: __P((DB *, VRFY_DBINFO *, DBMETA *, db_pgno_t, u_int32_t)); + */ +int +__db_vrfy_meta(dbp, vdp, meta, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + DBMETA *meta; + db_pgno_t pgno; + u_int32_t flags; +{ + DBTYPE dbtype, magtype; + ENV *env; + VRFY_PAGEINFO *pip; + int isbad, ret, t_ret; + + isbad = 0; + env = dbp->env; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + /* type plausible for a meta page */ + switch (meta->type) { + case P_BTREEMETA: + dbtype = DB_BTREE; + break; + case P_HASHMETA: + dbtype = DB_HASH; + break; + case P_QAMMETA: + dbtype = DB_QUEUE; + break; + default: + ret = __db_unknown_path(env, "__db_vrfy_meta"); + goto err; + } + + /* magic number valid */ + if (!__db_is_valid_magicno(meta->magic, &magtype)) { + isbad = 1; + EPRINT((env, + "Page %lu: invalid magic number", (u_long)pgno)); + } + if (magtype != dbtype) { + isbad = 1; + EPRINT((env, + "Page %lu: magic number does not match database type", + (u_long)pgno)); + } + + /* version */ + if ((dbtype == DB_BTREE && + (meta->version > DB_BTREEVERSION || + meta->version < DB_BTREEOLDVER)) || + (dbtype == DB_HASH && + (meta->version > DB_HASHVERSION || + meta->version < DB_HASHOLDVER)) || + (dbtype == DB_QUEUE && + (meta->version > DB_QAMVERSION || + meta->version < DB_QAMOLDVER))) { + isbad = 1; + EPRINT((env, + "Page %lu: unsupported database version %lu; extraneous errors may result", + (u_long)pgno, (u_long)meta->version)); + } + + /* pagesize */ + if (meta->pagesize != dbp->pgsize) { + isbad = 1; + EPRINT((env, "Page %lu: invalid pagesize %lu", + (u_long)pgno, (u_long)meta->pagesize)); + } + + /* Flags */ + if (meta->metaflags != 0) { + if (FLD_ISSET(meta->metaflags, + ~(DBMETA_CHKSUM|DBMETA_PART_RANGE|DBMETA_PART_CALLBACK))) { + isbad = 1; + EPRINT((env, + "Page %lu: bad 
meta-data flags value %#lx", + (u_long)PGNO_BASE_MD, (u_long)meta->metaflags)); + } + if (FLD_ISSET(meta->metaflags, DBMETA_CHKSUM)) + F_SET(pip, VRFY_HAS_CHKSUM); + if (FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE)) + F_SET(pip, VRFY_HAS_PART_RANGE); + if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) + F_SET(pip, VRFY_HAS_PART_CALLBACK); + } + + /* + * Free list. + * + * If this is not the main, master-database meta page, it + * should not have a free list. + */ + if (pgno != PGNO_BASE_MD && meta->free != PGNO_INVALID) { + isbad = 1; + EPRINT((env, + "Page %lu: nonempty free list on subdatabase metadata page", + (u_long)pgno)); + } + + /* Can correctly be PGNO_INVALID--that's just the end of the list. */ + if (meta->free != PGNO_INVALID && IS_VALID_PGNO(meta->free)) + pip->free = meta->free; + else if (!IS_VALID_PGNO(meta->free)) { + isbad = 1; + EPRINT((env, + "Page %lu: nonsensical free list pgno %lu", + (u_long)pgno, (u_long)meta->free)); + } + + /* + * Check that the meta page agrees with what we got from mpool. + * If we don't have FTRUNCATE then mpool could include some + * zeroed pages at the end of the file, we assume the meta page + * is correct. + */ + if (pgno == PGNO_BASE_MD && meta->last_pgno != vdp->last_pgno) { +#ifdef HAVE_FTRUNCATE + isbad = 1; + EPRINT((env, + "Page %lu: last_pgno is not correct: %lu != %lu", + (u_long)pgno, + (u_long)meta->last_pgno, (u_long)vdp->last_pgno)); +#endif + vdp->meta_last_pgno = meta->last_pgno; + } + + /* + * We have now verified the common fields of the metadata page. + * Clear the flag that told us they had been incompletely checked. + */ + F_CLR(pip, VRFY_INCOMPLETE); + +err: if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __db_vrfy_freelist -- + * Walk free list, checking off pages and verifying absence of + * loops. 
+ */ +static int +__db_vrfy_freelist(dbp, vdp, meta, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t meta; + u_int32_t flags; +{ + DB *pgset; + ENV *env; + VRFY_PAGEINFO *pip; + db_pgno_t cur_pgno, next_pgno; + int p, ret, t_ret; + + env = dbp->env; + pgset = vdp->pgset; + DB_ASSERT(env, pgset != NULL); + + if ((ret = __db_vrfy_getpageinfo(vdp, meta, &pip)) != 0) + return (ret); + for (next_pgno = pip->free; + next_pgno != PGNO_INVALID; next_pgno = pip->next_pgno) { + cur_pgno = pip->pgno; + if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + return (ret); + + /* This shouldn't happen, but just in case. */ + if (!IS_VALID_PGNO(next_pgno)) { + EPRINT((env, + "Page %lu: invalid next_pgno %lu on free list page", + (u_long)cur_pgno, (u_long)next_pgno)); + return (DB_VERIFY_BAD); + } + + /* Detect cycles. */ + if ((ret = __db_vrfy_pgset_get(pgset, + vdp->thread_info, next_pgno, &p)) != 0) + return (ret); + if (p != 0) { + EPRINT((env, + "Page %lu: page %lu encountered a second time on free list", + (u_long)cur_pgno, (u_long)next_pgno)); + return (DB_VERIFY_BAD); + } + if ((ret = __db_vrfy_pgset_inc(pgset, + vdp->thread_info, next_pgno)) != 0) + return (ret); + + if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0) + return (ret); + + if (pip->type != P_INVALID) { + EPRINT((env, + "Page %lu: non-invalid page %lu on free list", + (u_long)cur_pgno, (u_long)next_pgno)); + ret = DB_VERIFY_BAD; /* unsafe to continue */ + break; + } + } + + if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + ret = t_ret; + return (ret); +} + +/* + * __db_vrfy_subdbs -- + * Walk the known-safe master database of subdbs with a cursor, + * verifying the structure of each subdatabase we encounter. 
+ */ +static int +__db_vrfy_subdbs(dbp, vdp, dbname, flags) + DB *dbp; + VRFY_DBINFO *vdp; + const char *dbname; + u_int32_t flags; +{ + DB *mdbp; + DBC *dbc; + DBT key, data; + ENV *env; + VRFY_PAGEINFO *pip; + db_pgno_t meta_pgno; + int ret, t_ret, isbad; + u_int8_t type; + + isbad = 0; + dbc = NULL; + env = dbp->env; + + if ((ret = __db_master_open(dbp, + vdp->thread_info, NULL, dbname, DB_RDONLY, 0, &mdbp)) != 0) + return (ret); + + if ((ret = __db_cursor_int(mdbp, NULL, + NULL, DB_BTREE, PGNO_INVALID, 0, DB_LOCK_INVALIDID, &dbc)) != 0) + goto err; + + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + while ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) == 0) { + if (data.size != sizeof(db_pgno_t)) { + EPRINT((env, + "Subdatabase entry not page-number size")); + isbad = 1; + goto err; + } + memcpy(&meta_pgno, data.data, data.size); + /* + * Subdatabase meta pgnos are stored in network byte + * order for cross-endian compatibility. Swap if appropriate. + */ + DB_NTOHL_SWAP(env, &meta_pgno); + if (meta_pgno == PGNO_INVALID || meta_pgno > vdp->last_pgno) { + EPRINT((env, + "Subdatabase entry references invalid page %lu", + (u_long)meta_pgno)); + isbad = 1; + goto err; + } + if ((ret = __db_vrfy_getpageinfo(vdp, meta_pgno, &pip)) != 0) + goto err; + type = pip->type; + if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + goto err; + switch (type) { + case P_BTREEMETA: + if ((ret = __bam_vrfy_structure( + dbp, vdp, meta_pgno, NULL, NULL, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + break; + case P_HASHMETA: + if ((ret = __ham_vrfy_structure( + dbp, vdp, meta_pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + break; + case P_QAMMETA: + default: + EPRINT((env, + "Subdatabase entry references page %lu of invalid type %lu", + (u_long)meta_pgno, (u_long)type)); + ret = DB_VERIFY_BAD; + goto err; + } + } + + if (ret == DB_NOTFOUND) + ret = 0; + +err: if (dbc != NULL && (t_ret = 
__dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = __db_close(mdbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __db_vrfy_struct_feedback -- + * Provide feedback during top-down database structure traversal. + * (See comment at the beginning of __db_vrfy_structure.) + * + * PUBLIC: void __db_vrfy_struct_feedback __P((DB *, VRFY_DBINFO *)); + */ +void +__db_vrfy_struct_feedback(dbp, vdp) + DB *dbp; + VRFY_DBINFO *vdp; +{ + int progress; + + if (dbp->db_feedback == NULL) + return; + + if (vdp->pgs_remaining > 0) + vdp->pgs_remaining--; + + /* Don't allow a feedback call of 100 until we're really done. */ + progress = 100 - (int)(vdp->pgs_remaining * 50 / (vdp->last_pgno + 1)); + dbp->db_feedback(dbp, DB_VERIFY, progress == 100 ? 99 : progress); +} + +/* + * __db_vrfy_orderchkonly -- + * Do an sort-order/hashing check on a known-otherwise-good subdb. + */ +static int +__db_vrfy_orderchkonly(dbp, vdp, name, subdb, flags) + DB *dbp; + VRFY_DBINFO *vdp; + const char *name, *subdb; + u_int32_t flags; +{ + BTMETA *btmeta; + DB *mdbp, *pgset; + DBC *pgsc; + DBT key, data; + DB_MPOOLFILE *mpf; + ENV *env; + HASH *h_internal; + HMETA *hmeta; + PAGE *h, *currpg; + db_pgno_t meta_pgno, p, pgno; + u_int32_t bucket; + int t_ret, ret; + + pgset = NULL; + pgsc = NULL; + env = dbp->env; + mpf = dbp->mpf; + currpg = h = NULL; + + LF_CLR(DB_NOORDERCHK); + + /* Open the master database and get the meta_pgno for the subdb. 
*/ + if ((ret = __db_master_open(dbp, + vdp->thread_info, NULL, name, DB_RDONLY, 0, &mdbp)) != 0) + goto err; + + DB_INIT_DBT(key, subdb, strlen(subdb)); + memset(&data, 0, sizeof(data)); + if ((ret = __db_get(mdbp, + vdp->thread_info, NULL, &key, &data, 0)) != 0) { + if (ret == DB_NOTFOUND) + ret = ENOENT; + goto err; + } + + if (data.size != sizeof(db_pgno_t)) { + EPRINT((env, "Subdatabase entry of invalid size")); + ret = DB_VERIFY_BAD; + goto err; + } + + memcpy(&meta_pgno, data.data, data.size); + + /* + * Subdatabase meta pgnos are stored in network byte + * order for cross-endian compatibility. Swap if appropriate. + */ + DB_NTOHL_SWAP(env, &meta_pgno); + + if ((ret = __memp_fget(mpf, + &meta_pgno, vdp->thread_info, NULL, 0, &h)) != 0) + goto err; + + if ((ret = __db_vrfy_pgset(env, + vdp->thread_info, dbp->pgsize, &pgset)) != 0) + goto err; + + switch (TYPE(h)) { + case P_BTREEMETA: + btmeta = (BTMETA *)h; + if (F_ISSET(&btmeta->dbmeta, BTM_RECNO)) { + /* Recnos have no order to check. */ + ret = 0; + goto err; + } + if ((ret = + __db_meta2pgset(dbp, vdp, meta_pgno, flags, pgset)) != 0) + goto err; + if ((ret = __db_cursor_int(pgset, NULL, NULL, dbp->type, + PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0) + goto err; + while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) { + if ((ret = __memp_fget(mpf, &p, + vdp->thread_info, NULL, 0, &currpg)) != 0) + goto err; + if ((ret = __bam_vrfy_itemorder(dbp, NULL, + vdp->thread_info, currpg, p, NUM_ENT(currpg), 1, + F_ISSET(&btmeta->dbmeta, BTM_DUP), flags)) != 0) + goto err; + if ((ret = __memp_fput(mpf, + vdp->thread_info, currpg, dbp->priority)) != 0) + goto err; + currpg = NULL; + } + + /* + * The normal exit condition for the loop above is DB_NOTFOUND. + * If we see that, zero it and continue on to cleanup. + * Otherwise, it's a real error and will be returned. 
+ */ + if (ret == DB_NOTFOUND) + ret = 0; + break; + case P_HASHMETA: + hmeta = (HMETA *)h; + h_internal = (HASH *)dbp->h_internal; + /* + * Make sure h_charkey is right. + */ + if (h_internal == NULL) { + EPRINT((env, + "Page %lu: DB->h_internal field is NULL", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + if (h_internal->h_hash == NULL) + h_internal->h_hash = hmeta->dbmeta.version < 5 + ? __ham_func4 : __ham_func5; + if (hmeta->h_charkey != + h_internal->h_hash(dbp, CHARKEY, sizeof(CHARKEY))) { + EPRINT((env, + "Page %lu: incorrect hash function for database", + (u_long)meta_pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Foreach bucket, verify hashing on each page in the + * corresponding chain of pages. + */ + if ((ret = __db_cursor_int(dbp, NULL, NULL, dbp->type, + PGNO_INVALID, 0, DB_LOCK_INVALIDID, &pgsc)) != 0) + goto err; + for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) { + pgno = BS_TO_PAGE(bucket, hmeta->spares); + while (pgno != PGNO_INVALID) { + if ((ret = __memp_fget(mpf, &pgno, + vdp->thread_info, NULL, 0, &currpg)) != 0) + goto err; + if ((ret = __ham_vrfy_hashing(pgsc, + NUM_ENT(currpg), hmeta, bucket, pgno, + flags, h_internal->h_hash)) != 0) + goto err; + pgno = NEXT_PGNO(currpg); + if ((ret = __memp_fput(mpf, vdp->thread_info, + currpg, dbp->priority)) != 0) + goto err; + currpg = NULL; + } + } + break; + default: + EPRINT((env, "Page %lu: database metapage of bad type %lu", + (u_long)meta_pgno, (u_long)TYPE(h))); + ret = DB_VERIFY_BAD; + break; + } + +err: if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0) + ret = t_ret; + if (pgset != NULL && + (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + if (h != NULL && (t_ret = __memp_fput(mpf, + vdp->thread_info, h, dbp->priority)) != 0) + ret = t_ret; + if (currpg != NULL && + (t_ret = __memp_fput(mpf, + vdp->thread_info, currpg, dbp->priority)) != 0) + ret = t_ret; + if ((t_ret = __db_close(mdbp, NULL, 0)) != 0) + ret = t_ret; 
+ return (ret); +} + +/* + * __db_salvage_pg -- + * Walk through a page, salvaging all likely or plausible (w/ + * DB_AGGRESSIVE) key/data pairs and marking seen pages in vdp. + * + * PUBLIC: int __db_salvage_pg __P((DB *, VRFY_DBINFO *, db_pgno_t, + * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__db_salvage_pg(dbp, vdp, pgno, h, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + PAGE *h; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + int keyflag, ret, t_ret; + + env = dbp->env; + DB_ASSERT(env, LF_ISSET(DB_SALVAGE)); + + /* + * !!! + * We dump record numbers when salvaging Queue databases, but not for + * immutable Recno databases. The problem is we can't figure out the + * record number from the database page in the Recno case, while the + * offset in the file is sufficient for Queue. + */ + keyflag = 0; + + /* If we got this page in the subdb pass, we can safely skip it. */ + if (__db_salvage_isdone(vdp, pgno)) + return (0); + + switch (TYPE(h)) { + case P_BTREEMETA: + ret = __bam_vrfy_meta(dbp, vdp, (BTMETA *)h, pgno, flags); + break; + case P_HASH: + case P_HASH_UNSORTED: + case P_LBTREE: + case P_QAMDATA: + return (__db_salvage_leaf(dbp, + vdp, pgno, h, handle, callback, flags)); + case P_HASHMETA: + ret = __ham_vrfy_meta(dbp, vdp, (HMETA *)h, pgno, flags); + break; + case P_IBTREE: + /* + * We need to mark any overflow keys on internal pages as seen, + * so we don't print them out in __db_salvage_unknowns. But if + * we're an upgraded database, a P_LBTREE page may very well + * have a reference to the same overflow pages (this practice + * stopped somewhere around db4.5). To give P_LBTREEs a chance + * to print out any keys on shared pages, mark the page now and + * deal with it at the end. 
+ */ + return (__db_salvage_markneeded(vdp, pgno, SALVAGE_IBTREE)); + case P_LDUP: + return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LDUP)); + case P_LRECNO: + /* + * Recno leaves are tough, because the leaf could be (1) a dup + * page, or it could be (2) a regular database leaf page. + * Fortunately, RECNO databases are not allowed to have + * duplicates. + * + * If there are no subdatabases, dump the page immediately if + * it's a leaf in a RECNO database, otherwise wait and hopefully + * it will be dumped by the leaf page that refers to it, + * otherwise we'll get it with the unknowns. + * + * If there are subdatabases, there might be mixed types and + * dbp->type can't be trusted. We'll only get here after + * salvaging each database, though, so salvaging this page + * immediately isn't important. If this page is a dup, it might + * get salvaged later on, otherwise the unknowns pass will pick + * it up. Note that SALVAGE_HASSUBDBS won't get set if we're + * salvaging aggressively. + * + * If we're salvaging aggressively, we don't know whether or not + * there's subdatabases, so we wait on all recno pages. + */ + if (!LF_ISSET(DB_AGGRESSIVE) && + !F_ISSET(vdp, SALVAGE_HASSUBDBS) && dbp->type == DB_RECNO) + return (__db_salvage_leaf(dbp, + vdp, pgno, h, handle, callback, flags)); + return (__db_salvage_markneeded(vdp, pgno, SALVAGE_LRECNODUP)); + case P_OVERFLOW: + return (__db_salvage_markneeded(vdp, pgno, SALVAGE_OVERFLOW)); + case P_QAMMETA: + keyflag = 1; + ret = __qam_vrfy_meta(dbp, vdp, (QMETA *)h, pgno, flags); + break; + case P_INVALID: + case P_IRECNO: + case __P_DUPLICATE: + default: + /* + * There's no need to display an error, the page type was + * already checked and reported on. + */ + return (0); + } + if (ret != 0) + return (ret); + + /* + * We have to display the dump header if it's a metadata page. It's + * our last chance as the page was marked "seen" in the vrfy routine, + * and we won't see the page again. 
We don't display headers for + * the first database in a multi-database file, that database simply + * contains a list of subdatabases. + */ + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + if (!F_ISSET(pip, VRFY_HAS_SUBDBS) && !LF_ISSET(DB_VERIFY_PARTITION)) + ret = __db_prheader( + dbp, NULL, 0, keyflag, handle, callback, vdp, pgno); + if ((t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_salvage_leaf -- + * Walk through a leaf, salvaging all likely key/data pairs and marking + * seen pages in vdp. + * + * PUBLIC: int __db_salvage_leaf __P((DB *, VRFY_DBINFO *, db_pgno_t, + * PUBLIC: PAGE *, void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__db_salvage_leaf(dbp, vdp, pgno, h, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + PAGE *h; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + ENV *env; + + env = dbp->env; + DB_ASSERT(env, LF_ISSET(DB_SALVAGE)); + + /* If we got this page in the subdb pass, we can safely skip it. */ + if (__db_salvage_isdone(vdp, pgno)) + return (0); + + switch (TYPE(h)) { + case P_HASH_UNSORTED: + case P_HASH: + return (__ham_salvage(dbp, vdp, + pgno, h, handle, callback, flags)); + case P_LBTREE: + case P_LRECNO: + return (__bam_salvage(dbp, vdp, + pgno, TYPE(h), h, handle, callback, NULL, flags)); + case P_QAMDATA: + return (__qam_salvage(dbp, vdp, + pgno, h, handle, callback, flags)); + default: + /* + * There's no need to display an error, the page type was + * already checked and reported on. + */ + return (0); + } +} + +/* + * __db_salvage_unknowns -- + * Walk through the salvager database, printing with key "UNKNOWN" + * any pages we haven't dealt with. 
 */
static int
__db_salvage_unknowns(dbp, vdp, handle, callback, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	void *handle;
	int (*callback) __P((void *, const void *));
	u_int32_t flags;
{
	DBC *dbc;
	DBT unkdbt, key, *dbt;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pgno_t pgno;
	u_int32_t pgtype, ovfl_bufsz, tmp_flags;
	int ret, t_ret;
	void *ovflbuf;

	dbc = NULL;
	env = dbp->env;
	mpf = dbp->mpf;

	/* Constant key used for items whose real key was never found. */
	DB_INIT_DBT(unkdbt, "UNKNOWN", sizeof("UNKNOWN") - 1);

	/* Scratch buffer for reassembling overflow chains; grows on demand. */
	if ((ret = __os_malloc(env, dbp->pgsize, &ovflbuf)) != 0)
		return (ret);
	ovfl_bufsz = dbp->pgsize;

	/*
	 * We make two passes -- in the first pass, skip SALVAGE_OVERFLOW
	 * pages, because they may be referenced by the standard database
	 * pages that we're resolving.
	 *
	 * (The final argument to __db_salvage_getnext selects whether
	 * overflow pages are skipped -- 1 here, 0 in the second pass.)
	 */
	while ((t_ret =
	    __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 1)) == 0) {
		/* A page we can't read is recorded but doesn't stop us. */
		if ((t_ret = __memp_fget(mpf,
		    &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
			if (ret == 0)
				ret = t_ret;
			continue;
		}

		dbt = NULL;
		tmp_flags = 0;
		switch (pgtype) {
		case SALVAGE_LDUP:
		case SALVAGE_LRECNODUP:
			/* Dup leaves get the "UNKNOWN" key marker. */
			dbt = &unkdbt;
			tmp_flags = DB_SA_UNKNOWNKEY;
			/* FALLTHROUGH */
		case SALVAGE_IBTREE:
		case SALVAGE_LBTREE:
		case SALVAGE_LRECNO:
			if ((t_ret = __bam_salvage(
			    dbp, vdp, pgno, pgtype, h, handle,
			    callback, dbt, tmp_flags | flags)) != 0 && ret == 0)
				ret = t_ret;
			break;
		case SALVAGE_OVERFLOW:
			DB_ASSERT(env, 0);	/* Shouldn't ever happen. */
			break;
		case SALVAGE_HASH:
			if ((t_ret = __ham_salvage(dbp, vdp,
			    pgno, h, handle, callback, flags)) != 0 && ret == 0)
				ret = t_ret;
			break;
		case SALVAGE_INVALID:
		case SALVAGE_IGNORE:
		default:
			/*
			 * Shouldn't happen, but if it does, just do what the
			 * nice man says.
			 */
			DB_ASSERT(env, 0);
			break;
		}
		if ((t_ret = __memp_fput(mpf,
		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
			ret = t_ret;
	}

	/* We should have reached the end of the database. */
	if (t_ret == DB_NOTFOUND)
		t_ret = 0;
	if (t_ret != 0 && ret == 0)
		ret = t_ret;

	/* Re-open the cursor so we traverse the database again. */
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	dbc = NULL;

	/* Now, deal with any remaining overflow pages. */
	while ((t_ret =
	    __db_salvage_getnext(vdp, &dbc, &pgno, &pgtype, 0)) == 0) {
		if ((t_ret = __memp_fget(mpf,
		    &pgno, vdp->thread_info, NULL, 0, &h)) != 0) {
			if (ret == 0)
				ret = t_ret;
			continue;
		}

		switch (pgtype) {
		case SALVAGE_OVERFLOW:
			/*
			 * XXX:
			 * This may generate multiple "UNKNOWN" keys in
			 * a database with no dups.  What to do?
			 *
			 * NOTE(review): "key" is assumed to be fully
			 * initialized by __db_safe_goff before it is
			 * printed -- confirm against that helper.
			 */
			if ((t_ret = __db_safe_goff(dbp, vdp,
			    pgno, &key, &ovflbuf, &ovfl_bufsz, flags)) != 0 ||
			    ((vdp->type == DB_BTREE || vdp->type == DB_HASH) &&
			    (t_ret = __db_vrfy_prdbt(&unkdbt,
			    0, " ", handle, callback, 0, vdp)) != 0) ||
			    (t_ret = __db_vrfy_prdbt(
			    &key, 0, " ", handle, callback, 0, vdp)) != 0)
				if (ret == 0)
					ret = t_ret;
			break;
		default:
			DB_ASSERT(env, 0);	/* Shouldn't ever happen. */
			break;
		}
		if ((t_ret = __memp_fput(mpf,
		    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
			ret = t_ret;
	}

	/* We should have reached the end of the database. */
	if (t_ret == DB_NOTFOUND)
		t_ret = 0;
	if (t_ret != 0 && ret == 0)
		ret = t_ret;

	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;

	__os_free(env, ovflbuf);

	return (ret);
}

/*
 * Offset of the ith inp array entry, which we can compare to the offset
 * the entry stores.
 */
#define	INP_OFFSET(dbp, h, i)	\
    ((db_indx_t)((u_int8_t *)((P_INP(dbp,(h))) + (i)) - (u_int8_t *)(h)))

/*
 * __db_vrfy_inpitem --
 *	Verify that a single entry in the inp array is sane, and update
 * the high water mark and current item offset.  (The former of these is
 * used for state information between calls, and is required;  it must
 * be initialized to the pagesize before the first call.)
 *
 * Returns DB_VERIFY_FATAL if inp has collided with the data,
 * since verification can't continue from there;  returns DB_VERIFY_BAD
 * if anything else is wrong.
 *
 * dbp      -- database handle, used for page-layout macros and pgsize.
 * h        -- the page whose inp array is being checked.
 * pgno     -- page number of h, used in error messages only.
 * i        -- index of the inp entry to verify.
 * is_btree -- nonzero if the entry is a BKEYDATA item whose length can
 *	       be validated against the page boundary.
 * flags    -- verifier flags; not examined in this function.
 * himarkp  -- in/out high-water mark (lowest item offset seen so far).
 * offsetp  -- if non-NULL, set to this entry's offset on success.
 *
 * PUBLIC: int __db_vrfy_inpitem __P((DB *, PAGE *,
 * PUBLIC:     db_pgno_t, u_int32_t, int, u_int32_t, u_int32_t *, u_int32_t *));
 */
int
__db_vrfy_inpitem(dbp, h, pgno, i, is_btree, flags, himarkp, offsetp)
	DB *dbp;
	PAGE *h;
	db_pgno_t pgno;
	u_int32_t i;
	int is_btree;
	u_int32_t flags, *himarkp, *offsetp;
{
	BKEYDATA *bk;
	ENV *env;
	db_indx_t *inp, offset, len;

	env = dbp->env;

	DB_ASSERT(env, himarkp != NULL);
	inp = P_INP(dbp, h);

	/*
	 * Check that the inp array, which grows from the beginning of the
	 * page forward, has not collided with the data, which grow from the
	 * end of the page backward.
	 */
	if (inp + i >= (db_indx_t *)((u_int8_t *)h + *himarkp)) {
		/* We've collided with the data.  We need to bail. */
		EPRINT((env, "Page %lu: entries listing %lu overlaps data",
		    (u_long)pgno, (u_long)i));
		return (DB_VERIFY_FATAL);
	}

	offset = inp[i];

	/*
	 * Check that the item offset is reasonable:  it points somewhere
	 * after the inp array and before the end of the page.
	 */
	if (offset <= INP_OFFSET(dbp, h, i) || offset >= dbp->pgsize) {
		EPRINT((env, "Page %lu: bad offset %lu at page index %lu",
		    (u_long)pgno, (u_long)offset, (u_long)i));
		return (DB_VERIFY_BAD);
	}

	/* Update the high-water mark (what HOFFSET should be) */
	if (offset < *himarkp)
		*himarkp = offset;

	if (is_btree) {
		/*
		 * Check alignment;  if it's unaligned, it's unsafe to
		 * manipulate this item.
		 */
		if (offset != DB_ALIGN(offset, sizeof(u_int32_t))) {
			EPRINT((env,
			    "Page %lu: unaligned offset %lu at page index %lu",
			    (u_long)pgno, (u_long)offset, (u_long)i));
			return (DB_VERIFY_BAD);
		}

		/*
		 * Check that the item length remains on-page.
		 */
		bk = GET_BKEYDATA(dbp, h, i);

		/*
		 * We need to verify the type of the item here;
		 * we can't simply assume that it will be one of the
		 * expected three.  If it's not a recognizable type,
		 * it can't be considered to have a verifiable
		 * length, so it's not possible to certify it as safe.
		 */
		switch (B_TYPE(bk->type)) {
		case B_KEYDATA:
			len = bk->len;
			break;
		case B_DUPLICATE:
		case B_OVERFLOW:
			/* Both are stored as fixed-size BOVERFLOW items. */
			len = BOVERFLOW_SIZE;
			break;
		default:
			EPRINT((env,
			    "Page %lu: item %lu of unrecognizable type",
			    (u_long)pgno, (u_long)i));
			return (DB_VERIFY_BAD);
		}

		/*
		 * offset and len are 16-bit db_indx_t values, so the sum
		 * is computed after integer promotion and cannot wrap.
		 */
		if ((size_t)(offset + len) > dbp->pgsize) {
			EPRINT((env,
			    "Page %lu: item %lu extends past page boundary",
			    (u_long)pgno, (u_long)i));
			return (DB_VERIFY_BAD);
		}
	}

	if (offsetp != NULL)
		*offsetp = offset;
	return (0);
}

/*
 * __db_vrfy_duptype--
 *	Given a page number and a set of flags to __bam_vrfy_subtree,
 * verify that the dup tree type is correct--i.e., it's a recno
 * if DUPSORT is not set and a btree if it is.
+ * + * PUBLIC: int __db_vrfy_duptype + * PUBLIC: __P((DB *, VRFY_DBINFO *, db_pgno_t, u_int32_t)); + */ +int +__db_vrfy_duptype(dbp, vdp, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + VRFY_PAGEINFO *pip; + int ret, isbad; + + env = dbp->env; + isbad = 0; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (pip->type) { + case P_IBTREE: + case P_LDUP: + if (!LF_ISSET(DB_ST_DUPSORT)) { + EPRINT((env, + "Page %lu: sorted duplicate set in unsorted-dup database", + (u_long)pgno)); + isbad = 1; + } + break; + case P_IRECNO: + case P_LRECNO: + if (LF_ISSET(DB_ST_DUPSORT)) { + EPRINT((env, + "Page %lu: unsorted duplicate set in sorted-dup database", + (u_long)pgno)); + isbad = 1; + } + break; + default: + /* + * If the page is entirely zeroed, its pip->type will be a lie + * (we assumed it was a hash page, as they're allowed to be + * zeroed); handle this case specially. + */ + if (F_ISSET(pip, VRFY_IS_ALLZEROES)) + ZEROPG_ERR_PRINT(env, pgno, "duplicate page"); + else + EPRINT((env, + "Page %lu: duplicate page of inappropriate type %lu", + (u_long)pgno, (u_long)pip->type)); + isbad = 1; + break; + } + + if ((ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0) + return (ret); + return (isbad == 1 ? DB_VERIFY_BAD : 0); +} + +/* + * __db_salvage_duptree -- + * Attempt to salvage a given duplicate tree, given its alleged root. + * + * The key that corresponds to this dup set has been passed to us + * in DBT *key. Because data items follow keys, though, it has been + * printed once already. + * + * The basic idea here is that pgno ought to be a P_LDUP, a P_LRECNO, a + * P_IBTREE, or a P_IRECNO. If it's an internal page, use the verifier + * functions to make sure it's safe; if it's not, we simply bail and the + * data will have to be printed with no key later on. if it is safe, + * recurse on each of its children. + * + * Whether or not it's safe, if it's a leaf page, __bam_salvage it. 
 *
 * At all times, use the DB hanging off vdp to mark and check what we've
 * done, so each page gets printed exactly once and we don't get caught
 * in any cycles.
 *
 * PUBLIC: int __db_salvage_duptree __P((DB *, VRFY_DBINFO *, db_pgno_t,
 * PUBLIC:     DBT *, void *, int (*)(void *, const void *), u_int32_t));
 */
int
__db_salvage_duptree(dbp, vdp, pgno, key, handle, callback, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	DBT *key;
	void *handle;
	int (*callback) __P((void *, const void *));
	u_int32_t flags;
{
	DB_MPOOLFILE *mpf;
	PAGE *h;
	int ret, t_ret;

	mpf = dbp->mpf;

	/* An invalid root page number can't possibly be a dup tree. */
	if (pgno == PGNO_INVALID || !IS_VALID_PGNO(pgno))
		return (DB_VERIFY_BAD);

	/* We have a plausible page.  Try it. */
	if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
		return (ret);

	switch (TYPE(h)) {
	case P_IBTREE:
	case P_IRECNO:
		/*
		 * Internal page:  verify it before trusting its child
		 * pointers, and mark it done so it isn't printed again
		 * in the unknowns pass.
		 */
		if ((ret = __db_vrfy_common(dbp, vdp, h, pgno, flags)) != 0)
			goto err;
		if ((ret = __bam_vrfy(dbp,
		    vdp, h, pgno, flags | DB_NOORDERCHK)) != 0 ||
		    (ret = __db_salvage_markdone(vdp, pgno)) != 0)
			goto err;
		/*
		 * We have a known-healthy internal page.  Walk it.
		 */
		if ((ret = __bam_salvage_walkdupint(dbp, vdp, h, key,
		    handle, callback, flags)) != 0)
			goto err;
		break;
	case P_LRECNO:
	case P_LDUP:
		/* Leaf page:  salvage it directly under the given key. */
		if ((ret = __bam_salvage(dbp,
		    vdp, pgno, TYPE(h), h, handle, callback, key, flags)) != 0)
			goto err;
		break;
	default:
		ret = DB_VERIFY_BAD;
		goto err;
	}

	/* Success falls through to the common page release below. */
err:	if ((t_ret = __memp_fput(mpf,
	    vdp->thread_info, h, dbp->priority)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}

/*
 * __db_salvage_all --
 *	Salvage only the leaves we find by walking the tree.  If we have subdbs,
 * salvage each of them individually.
+ */ +static int +__db_salvage_all(dbp, vdp, handle, callback, flags, hassubsp) + DB *dbp; + VRFY_DBINFO *vdp; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; + int *hassubsp; +{ + DB *pgset; + DBC *pgsc; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *h; + VRFY_PAGEINFO *pip; + db_pgno_t p, meta_pgno; + int ret, t_ret; + + *hassubsp = 0; + + env = dbp->env; + pgset = NULL; + pgsc = NULL; + mpf = dbp->mpf; + h = NULL; + pip = NULL; + ret = 0; + + /* + * Check to make sure the page is OK and find out if it contains + * subdatabases. + */ + meta_pgno = PGNO_BASE_MD; + if ((t_ret = __memp_fget(mpf, + &meta_pgno, vdp->thread_info, NULL, 0, &h)) == 0 && + (t_ret = __db_vrfy_common(dbp, vdp, h, PGNO_BASE_MD, flags)) == 0 && + (t_ret = __db_salvage_pg( + dbp, vdp, PGNO_BASE_MD, h, handle, callback, flags)) == 0 && + (t_ret = __db_vrfy_getpageinfo(vdp, 0, &pip)) == 0) + if (F_ISSET(pip, VRFY_HAS_SUBDBS)) + *hassubsp = 1; + if (pip != NULL && + (t_ret = __db_vrfy_putpageinfo(env, vdp, pip)) != 0 && ret == 0) + ret = t_ret; + if (h != NULL) { + if ((t_ret = __memp_fput(mpf, + vdp->thread_info, h, dbp->priority)) != 0 && ret == 0) + ret = t_ret; + h = NULL; + } + if (ret != 0) + return (ret); + + /* Without subdatabases, we can just dump from the meta pgno. */ + if (*hassubsp == 0) + return (__db_salvage(dbp, + vdp, PGNO_BASE_MD, handle, callback, flags)); + + /* + * We have subdbs. Try to crack them. + * + * To do so, get a set of leaf pages in the master database, and then + * walk each of the valid ones, salvaging subdbs as we go. If any + * prove invalid, just drop them; we'll pick them up on a later pass. 
+ */ + if ((ret = __db_vrfy_pgset(env, + vdp->thread_info, dbp->pgsize, &pgset)) != 0) + goto err; + if ((ret = __db_meta2pgset(dbp, vdp, PGNO_BASE_MD, flags, pgset)) != 0) + goto err; + if ((ret = __db_cursor(pgset, vdp->thread_info, NULL, &pgsc, 0)) != 0) + goto err; + while ((t_ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) { + if ((t_ret = __memp_fget(mpf, + &p, vdp->thread_info, NULL, 0, &h)) == 0 && + (t_ret = __db_vrfy_common(dbp, vdp, h, p, flags)) == 0 && + (t_ret = + __bam_vrfy(dbp, vdp, h, p, flags | DB_NOORDERCHK)) == 0) + t_ret = __db_salvage_subdbpg( + dbp, vdp, h, handle, callback, flags); + if (t_ret != 0 && ret == 0) + ret = t_ret; + if (h != NULL) { + if ((t_ret = __memp_fput(mpf, vdp->thread_info, + h, dbp->priority)) != 0 && ret == 0) + ret = t_ret; + h = NULL; + } + } + + if (t_ret != DB_NOTFOUND && ret == 0) + ret = t_ret; + +err: if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0 && ret == 0) + ret = t_ret; + if (pgset != NULL && + (t_ret = __db_close(pgset, NULL, 0)) != 0 && ret ==0) + ret = t_ret; + if (h != NULL && + (t_ret = __memp_fput(mpf, + vdp->thread_info, h, dbp->priority)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __db_salvage_subdbpg -- + * Given a known-good leaf page in the master database, salvage all + * leaf pages corresponding to each subdb. + */ +static int +__db_salvage_subdbpg(dbp, vdp, master, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *master; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + BKEYDATA *bkkey, *bkdata; + BOVERFLOW *bo; + DB *pgset; + DBC *pgsc; + DBT key; + DB_MPOOLFILE *mpf; + ENV *env; + PAGE *subpg; + db_indx_t i; + db_pgno_t meta_pgno; + int ret, err_ret, t_ret; + char *subdbname; + u_int32_t ovfl_bufsz; + + env = dbp->env; + mpf = dbp->mpf; + ret = err_ret = 0; + subdbname = NULL; + pgsc = NULL; + pgset = NULL; + ovfl_bufsz = 0; + + /* + * For each entry, get and salvage the set of pages + * corresponding to that entry. 
+ */ + for (i = 0; i < NUM_ENT(master); i += P_INDX) { + bkkey = GET_BKEYDATA(dbp, master, i); + bkdata = GET_BKEYDATA(dbp, master, i + O_INDX); + + /* Get the subdatabase name. */ + if (B_TYPE(bkkey->type) == B_OVERFLOW) { + /* + * We can, in principle anyway, have a subdb + * name so long it overflows. Ick. + */ + bo = (BOVERFLOW *)bkkey; + if ((ret = __db_safe_goff(dbp, vdp, bo->pgno, + &key, &subdbname, &ovfl_bufsz, flags)) != 0) { + err_ret = DB_VERIFY_BAD; + continue; + } + + /* Nul-terminate it. */ + if (ovfl_bufsz < key.size + 1) { + if ((ret = __os_realloc(env, + key.size + 1, &subdbname)) != 0) + goto err; + ovfl_bufsz = key.size + 1; + } + subdbname[key.size] = '\0'; + } else if (B_TYPE(bkkey->type) == B_KEYDATA) { + if (ovfl_bufsz < (u_int32_t)bkkey->len + 1) { + if ((ret = __os_realloc(env, + bkkey->len + 1, &subdbname)) != 0) + goto err; + ovfl_bufsz = bkkey->len + 1; + } + DB_ASSERT(env, subdbname != NULL); + memcpy(subdbname, bkkey->data, bkkey->len); + subdbname[bkkey->len] = '\0'; + } + + /* Get the corresponding pgno. */ + if (bkdata->len != sizeof(db_pgno_t)) { + err_ret = DB_VERIFY_BAD; + continue; + } + memcpy(&meta_pgno, + (db_pgno_t *)bkdata->data, sizeof(db_pgno_t)); + + /* + * Subdatabase meta pgnos are stored in network byte + * order for cross-endian compatibility. Swap if appropriate. + */ + DB_NTOHL_SWAP(env, &meta_pgno); + + /* If we can't get the subdb meta page, just skip the subdb. */ + if (!IS_VALID_PGNO(meta_pgno) || (ret = __memp_fget(mpf, + &meta_pgno, vdp->thread_info, NULL, 0, &subpg)) != 0) { + err_ret = ret; + continue; + } + + /* + * Verify the subdatabase meta page. This has two functions. + * First, if it's bad, we have no choice but to skip the subdb + * and let the pages just get printed on a later pass. Second, + * the access-method-specific meta verification routines record + * the various state info (such as the presence of dups) + * that we need for __db_prheader(). 
+ */ + if ((ret = + __db_vrfy_common(dbp, vdp, subpg, meta_pgno, flags)) != 0) { + err_ret = ret; + (void)__memp_fput(mpf, + vdp->thread_info, subpg, dbp->priority); + continue; + } + switch (TYPE(subpg)) { + case P_BTREEMETA: + if ((ret = __bam_vrfy_meta(dbp, + vdp, (BTMETA *)subpg, meta_pgno, flags)) != 0) { + err_ret = ret; + (void)__memp_fput(mpf, + vdp->thread_info, subpg, dbp->priority); + continue; + } + break; + case P_HASHMETA: + if ((ret = __ham_vrfy_meta(dbp, + vdp, (HMETA *)subpg, meta_pgno, flags)) != 0) { + err_ret = ret; + (void)__memp_fput(mpf, + vdp->thread_info, subpg, dbp->priority); + continue; + } + break; + default: + /* This isn't an appropriate page; skip this subdb. */ + err_ret = DB_VERIFY_BAD; + continue; + } + + if ((ret = __memp_fput(mpf, + vdp->thread_info, subpg, dbp->priority)) != 0) { + err_ret = ret; + continue; + } + + /* Print a subdatabase header. */ + if ((ret = __db_prheader(dbp, + subdbname, 0, 0, handle, callback, vdp, meta_pgno)) != 0) + goto err; + + /* Salvage meta_pgno's tree. */ + if ((ret = __db_salvage(dbp, + vdp, meta_pgno, handle, callback, flags)) != 0) + err_ret = ret; + + /* Print a subdatabase footer. */ + if ((ret = __db_prfooter(handle, callback)) != 0) + goto err; + } + +err: if (subdbname) + __os_free(env, subdbname); + + if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0) + ret = t_ret; + + if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0) + ret = t_ret; + + if ((t_ret = __db_salvage_markdone(vdp, PGNO(master))) != 0) + return (t_ret); + + return ((err_ret != 0) ? err_ret : ret); +} + +/* + * __db_salvage -- + * Given a meta page number, salvage all data from leaf pages found by + * walking the meta page's tree. 
 */
static int
__db_salvage(dbp, vdp, meta_pgno, handle, callback, flags)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t meta_pgno;
	void *handle;
	int (*callback) __P((void *, const void *));
	u_int32_t flags;

{
	DB *pgset;
	DBC *dbc, *pgsc;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *subpg;
	db_pgno_t p;
	int err_ret, ret, t_ret;

	env = dbp->env;
	mpf = dbp->mpf;
	err_ret = ret = t_ret = 0;
	pgsc = NULL;
	pgset = NULL;
	dbc = NULL;

	/* Temporary pgno set collecting every page this meta page reaches. */
	if ((ret = __db_vrfy_pgset(env,
	    vdp->thread_info, dbp->pgsize, &pgset)) != 0)
		goto err;

	/* Get all page numbers referenced from this meta page. */
	if ((ret = __db_meta2pgset(dbp, vdp, meta_pgno,
	    flags, pgset)) != 0) {
		err_ret = ret;
		goto err;
	}

	if ((ret = __db_cursor(pgset,
	    vdp->thread_info, NULL, &pgsc, 0)) != 0)
		goto err;

	/* Queue page access goes through a cursor on dbp, not the mpool. */
	if (dbp->type == DB_QUEUE &&
	    (ret = __db_cursor(dbp, vdp->thread_info, NULL, &dbc, 0)) != 0)
		goto err;

	/* Salvage every page in pgset. */
	while ((ret = __db_vrfy_pgset_next(pgsc, &p)) == 0) {
		if (dbp->type == DB_QUEUE) {
#ifdef HAVE_QUEUE
			ret = __qam_fget(dbc, &p, 0, &subpg);
#else
			ret = __db_no_queue_am(env);
#endif
			/* Don't report an error for pages not found in a queue.
			 * The pgset is a best guess, it doesn't know about
			 * deleted extents which leads to this error.
			 */
			if (ret == ENOENT || ret == DB_PAGE_NOTFOUND)
				continue;
		} else
			ret = __memp_fget(mpf,
			    &p, vdp->thread_info, NULL, 0, &subpg);
		/* Record the failure and keep salvaging the other pages. */
		if (ret != 0) {
			err_ret = ret;
			continue;
		}

		if ((ret = __db_salvage_pg(dbp, vdp, p, subpg,
		    handle, callback, flags)) != 0)
			err_ret = ret;

		if (dbp->type == DB_QUEUE)
#ifdef HAVE_QUEUE
			ret = __qam_fput(dbc, p, subpg, dbp->priority);
#else
			ret = __db_no_queue_am(env);
#endif
		else
			ret = __memp_fput(mpf,
			    vdp->thread_info, subpg, dbp->priority);
		if (ret != 0)
			err_ret = ret;
	}

	/* DB_NOTFOUND from the pgset cursor just means end-of-set. */
	if (ret == DB_NOTFOUND)
		ret = 0;

err:
	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0)
		ret = t_ret;
	if (pgsc != NULL && (t_ret = __dbc_close(pgsc)) != 0)
		ret = t_ret;
	if (pgset != NULL && (t_ret = __db_close(pgset, NULL, 0)) != 0)
		ret = t_ret;

	/* The first per-page error, if any, takes priority. */
	return ((err_ret != 0) ? err_ret : ret);
}

/*
 * __db_meta2pgset --
 *	Given a known-safe meta page number, return the set of pages
 * corresponding to the database it represents.  Return DB_VERIFY_BAD if
 * it's not a suitable meta page or is invalid.
 */
static int
__db_meta2pgset(dbp, vdp, pgno, flags, pgset)
	DB *dbp;
	VRFY_DBINFO *vdp;
	db_pgno_t pgno;
	u_int32_t flags;
	DB *pgset;
{
	DB_MPOOLFILE *mpf;
	PAGE *h;
	int ret, t_ret;

	mpf = dbp->mpf;

	if ((ret = __memp_fget(mpf, &pgno, vdp->thread_info, NULL, 0, &h)) != 0)
		return (ret);

	switch (TYPE(h)) {
	case P_BTREEMETA:
		ret = __bam_meta2pgset(dbp, vdp, (BTMETA *)h, flags, pgset);
		break;
	case P_HASHMETA:
		ret = __ham_meta2pgset(dbp, vdp, (HMETA *)h, flags, pgset);
		break;
	case P_QAMMETA:
#ifdef HAVE_QUEUE
		ret = __qam_meta2pgset(dbp, vdp, pgset);
		break;
#endif
	/*
	 * Deliberate fallthrough when queue support isn't compiled in:
	 * a P_QAMMETA page is then an unusable meta page.
	 */
	default:
		ret = DB_VERIFY_BAD;
		break;
	}

	/* NOTE(review): a put failure here masks ret -- matches upstream. */
	if ((t_ret = __memp_fput(mpf, vdp->thread_info, h, dbp->priority)) != 0)
		return (t_ret);
	return (ret);
}

/*
 * __db_guesspgsize --
 *	Try to guess what the pagesize is if the one on the meta page
 * and the one in the db are invalid.
+ */ +static u_int +__db_guesspgsize(env, fhp) + ENV *env; + DB_FH *fhp; +{ + db_pgno_t i; + size_t nr; + u_int32_t guess; + u_int8_t type; + + for (guess = DB_MAX_PGSIZE; guess >= DB_MIN_PGSIZE; guess >>= 1) { + /* + * We try to read three pages ahead after the first one + * and make sure we have plausible types for all of them. + * If the seeks fail, continue with a smaller size; + * we're probably just looking past the end of the database. + * If they succeed and the types are reasonable, also continue + * with a size smaller; we may be looking at pages N, + * 2N, and 3N for some N > 1. + * + * As soon as we hit an invalid type, we stop and return + * our previous guess; that last one was probably the page size. + */ + for (i = 1; i <= 3; i++) { + if (__os_seek( + env, fhp, i, guess, SSZ(DBMETA, type)) != 0) + break; + if (__os_read(env, + fhp, &type, 1, &nr) != 0 || nr == 0) + break; + if (type == P_INVALID || type >= P_PAGETYPE_MAX) + return (guess << 1); + } + } + + /* + * If we're just totally confused--the corruption takes up most of the + * beginning pages of the database--go with the default size. + */ + return (DB_DEF_IOSIZE); +} diff --git a/db/db_vrfy_stub.c b/db/db_vrfy_stub.c new file mode 100644 index 0000000..9ed5acd --- /dev/null +++ b/db/db_vrfy_stub.c @@ -0,0 +1,117 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#ifndef HAVE_VERIFY +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/db_verify.h" + +/* + * If the library wasn't compiled with the verification support, various + * routines aren't available. Stub them here, returning an appropriate + * error. + */ + +static int __db_novrfy __P((ENV *)); + +/* + * __db_novrfy -- + * Error when a Berkeley DB build doesn't include the access method. 
+ */ +static int +__db_novrfy(env) + ENV *env; +{ + __db_errx(env, + "library build did not include support for database verification"); + return (DB_OPNOTSUP); +} + +int +__db_verify_pp(dbp, file, database, outfile, flags) + DB *dbp; + const char *file, *database; + FILE *outfile; + u_int32_t flags; +{ + int ret; + + COMPQUIET(file, NULL); + COMPQUIET(database, NULL); + COMPQUIET(outfile, NULL); + COMPQUIET(flags, 0); + + ret = __db_novrfy(dbp->env); + + /* The verify method is a destructor. */ + (void)__db_close(dbp, NULL, 0); + + return (ret); +} + +int +__db_verify_internal(dbp, name, subdb, handle, callback, flags) + DB *dbp; + const char *name, *subdb; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + COMPQUIET(dbp, NULL); + COMPQUIET(name, NULL); + COMPQUIET(subdb, NULL); + COMPQUIET(handle, NULL); + COMPQUIET(callback, NULL); + COMPQUIET(flags, 0); + return (0); +} + +int +__db_vrfy_getpageinfo(vdp, pgno, pipp) + VRFY_DBINFO *vdp; + db_pgno_t pgno; + VRFY_PAGEINFO **pipp; +{ + COMPQUIET(pgno, 0); + COMPQUIET(pipp, NULL); + return (__db_novrfy(vdp->pgdbp->env)); +} + +int +__db_vrfy_putpageinfo(env, vdp, pip) + ENV *env; + VRFY_DBINFO *vdp; + VRFY_PAGEINFO *pip; +{ + COMPQUIET(vdp, NULL); + COMPQUIET(pip, NULL); + return (__db_novrfy(env)); +} + +int +__db_vrfy_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp) + DBT *dbtp; + int checkprint; + const char *prefix; + void *handle; + int (*callback) __P((void *, const void *)); + int is_recno; + VRFY_DBINFO *vdp; +{ + COMPQUIET(dbtp, NULL); + COMPQUIET(checkprint, 0); + COMPQUIET(prefix, NULL); + COMPQUIET(handle, NULL); + COMPQUIET(callback, NULL); + COMPQUIET(is_recno, 0); + return (__db_novrfy(vdp->pgdbp->env)); +} +#endif /* !HAVE_VERIFY */ diff --git a/db/db_vrfyutil.c b/db/db_vrfyutil.c new file mode 100644 index 0000000..04d73d9 --- /dev/null +++ b/db/db_vrfyutil.c @@ -0,0 +1,916 @@ +/*- + * See the file LICENSE for redistribution information. 
+ * + * Copyright (c) 2000-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_verify.h" +#include "dbinc/db_am.h" + +static int __db_vrfy_childinc __P((DBC *, VRFY_CHILDINFO *)); +static int __db_vrfy_pageinfo_create __P((ENV *, VRFY_PAGEINFO **)); + +/* + * __db_vrfy_dbinfo_create -- + * Allocate and initialize a VRFY_DBINFO structure. + * + * PUBLIC: int __db_vrfy_dbinfo_create + * PUBLIC: __P((ENV *, DB_THREAD_INFO *, u_int32_t, VRFY_DBINFO **)); + */ +int +__db_vrfy_dbinfo_create(env, ip, pgsize, vdpp) + ENV *env; + DB_THREAD_INFO *ip; + u_int32_t pgsize; + VRFY_DBINFO **vdpp; +{ + DB *cdbp, *pgdbp, *pgset; + VRFY_DBINFO *vdp; + int ret; + + vdp = NULL; + cdbp = pgdbp = pgset = NULL; + + if ((ret = __os_calloc(NULL, 1, sizeof(VRFY_DBINFO), &vdp)) != 0) + goto err; + + if ((ret = __db_create_internal(&cdbp, env, 0)) != 0) + goto err; + + if ((ret = __db_set_flags(cdbp, DB_DUP)) != 0) + goto err; + + if ((ret = __db_set_pagesize(cdbp, pgsize)) != 0) + goto err; + + /* If transactional, make sure we don't log. */ + if (TXN_ON(env) && + (ret = __db_set_flags(cdbp, DB_TXN_NOT_DURABLE)) != 0) + goto err; + if ((ret = __db_open(cdbp, ip, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0) + goto err; + + if ((ret = __db_create_internal(&pgdbp, env, 0)) != 0) + goto err; + + if ((ret = __db_set_pagesize(pgdbp, pgsize)) != 0) + goto err; + + /* If transactional, make sure we don't log. 
*/ + if (TXN_ON(env) && + (ret = __db_set_flags(pgdbp, DB_TXN_NOT_DURABLE)) != 0) + goto err; + + if ((ret = __db_open(pgdbp, ip, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) != 0) + goto err; + + if ((ret = __db_vrfy_pgset(env, ip, pgsize, &pgset)) != 0) + goto err; + + LIST_INIT(&vdp->subdbs); + LIST_INIT(&vdp->activepips); + + vdp->cdbp = cdbp; + vdp->pgdbp = pgdbp; + vdp->pgset = pgset; + vdp->thread_info = ip; + *vdpp = vdp; + return (0); + +err: if (cdbp != NULL) + (void)__db_close(cdbp, NULL, 0); + if (pgdbp != NULL) + (void)__db_close(pgdbp, NULL, 0); + if (vdp != NULL) + __os_free(env, vdp); + return (ret); +} + +/* + * __db_vrfy_dbinfo_destroy -- + * Destructor for VRFY_DBINFO. Destroys VRFY_PAGEINFOs and deallocates + * structure. + * + * PUBLIC: int __db_vrfy_dbinfo_destroy __P((ENV *, VRFY_DBINFO *)); + */ +int +__db_vrfy_dbinfo_destroy(env, vdp) + ENV *env; + VRFY_DBINFO *vdp; +{ + VRFY_CHILDINFO *c; + int t_ret, ret; + + ret = 0; + + /* + * Discard active page structures. Ideally there wouldn't be any, + * but in some error cases we may not have cleared them all out. + */ + while (LIST_FIRST(&vdp->activepips) != NULL) + if ((t_ret = __db_vrfy_putpageinfo( + env, vdp, LIST_FIRST(&vdp->activepips))) != 0) { + if (ret == 0) + ret = t_ret; + break; + } + + /* Discard subdatabase list structures. */ + while ((c = LIST_FIRST(&vdp->subdbs)) != NULL) { + LIST_REMOVE(c, links); + __os_free(NULL, c); + } + + if ((t_ret = __db_close(vdp->pgdbp, NULL, 0)) != 0) + ret = t_ret; + + if ((t_ret = __db_close(vdp->cdbp, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + + if ((t_ret = __db_close(vdp->pgset, NULL, 0)) != 0 && ret == 0) + ret = t_ret; + + if (vdp->extents != NULL) + __os_free(env, vdp->extents); + __os_free(env, vdp); + return (ret); +} + +/* + * __db_vrfy_getpageinfo -- + * Get a PAGEINFO structure for a given page, creating it if necessary. 
+ * + * PUBLIC: int __db_vrfy_getpageinfo + * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_PAGEINFO **)); + */ +int +__db_vrfy_getpageinfo(vdp, pgno, pipp) + VRFY_DBINFO *vdp; + db_pgno_t pgno; + VRFY_PAGEINFO **pipp; +{ + DB *pgdbp; + DBT key, data; + ENV *env; + VRFY_PAGEINFO *pip; + int ret; + + /* + * We want a page info struct. There are three places to get it from, + * in decreasing order of preference: + * + * 1. vdp->activepips. If it's already "checked out", we're + * already using it, we return the same exact structure with a + * bumped refcount. This is necessary because this code is + * replacing array accesses, and it's common for f() to make some + * changes to a pip, and then call g() and h() which each make + * changes to the same pip. vdps are never shared between threads + * (they're never returned to the application), so this is safe. + * 2. The pgdbp. It's not in memory, but it's in the database, so + * get it, give it a refcount of 1, and stick it on activepips. + * 3. malloc. It doesn't exist yet; create it, then stick it on + * activepips. We'll put it in the database when we putpageinfo + * later. + */ + + /* Case 1. */ + LIST_FOREACH(pip, &vdp->activepips, links) + if (pip->pgno == pgno) + goto found; + + /* Case 2. */ + pgdbp = vdp->pgdbp; + env = pgdbp->env; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + F_SET(&data, DB_DBT_MALLOC); + key.data = &pgno; + key.size = sizeof(db_pgno_t); + + if ((ret = __db_get(pgdbp, + vdp->thread_info, NULL, &key, &data, 0)) == 0) { + /* Found it. */ + DB_ASSERT(env, data.size == sizeof(VRFY_PAGEINFO)); + pip = data.data; + LIST_INSERT_HEAD(&vdp->activepips, pip, links); + goto found; + } else if (ret != DB_NOTFOUND) /* Something nasty happened. 
*/ + return (ret); + + /* Case 3 */ + if ((ret = __db_vrfy_pageinfo_create(env, &pip)) != 0) + return (ret); + + LIST_INSERT_HEAD(&vdp->activepips, pip, links); +found: pip->pi_refcount++; + + *pipp = pip; + return (0); +} + +/* + * __db_vrfy_putpageinfo -- + * Put back a VRFY_PAGEINFO that we're done with. + * + * PUBLIC: int __db_vrfy_putpageinfo __P((ENV *, + * PUBLIC: VRFY_DBINFO *, VRFY_PAGEINFO *)); + */ +int +__db_vrfy_putpageinfo(env, vdp, pip) + ENV *env; + VRFY_DBINFO *vdp; + VRFY_PAGEINFO *pip; +{ + DB *pgdbp; + DBT key, data; + VRFY_PAGEINFO *p; + int ret; + + if (--pip->pi_refcount > 0) + return (0); + + pgdbp = vdp->pgdbp; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + key.data = &pip->pgno; + key.size = sizeof(db_pgno_t); + data.data = pip; + data.size = sizeof(VRFY_PAGEINFO); + + if ((ret = __db_put(pgdbp, + vdp->thread_info, NULL, &key, &data, 0)) != 0) + return (ret); + + LIST_FOREACH(p, &vdp->activepips, links) + if (p == pip) + break; + if (p != NULL) + LIST_REMOVE(p, links); + + __os_ufree(env, p); + return (0); +} + +/* + * __db_vrfy_pgset -- + * Create a temporary database for the storing of sets of page numbers. + * (A mapping from page number to int, used by the *_meta2pgset functions, + * as well as for keeping track of which pages the verifier has seen.) + * + * PUBLIC: int __db_vrfy_pgset __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, u_int32_t, DB **)); + */ +int +__db_vrfy_pgset(env, ip, pgsize, dbpp) + ENV *env; + DB_THREAD_INFO *ip; + u_int32_t pgsize; + DB **dbpp; +{ + DB *dbp; + int ret; + + if ((ret = __db_create_internal(&dbp, env, 0)) != 0) + return (ret); + if ((ret = __db_set_pagesize(dbp, pgsize)) != 0) + goto err; + + /* If transactional, make sure we don't log. 
*/ + if (TXN_ON(env) && + (ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0) + goto err; + if ((ret = __db_open(dbp, ip, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0600, PGNO_BASE_MD)) == 0) + *dbpp = dbp; + else +err: (void)__db_close(dbp, NULL, 0); + + return (ret); +} + +/* + * __db_vrfy_pgset_get -- + * Get the value associated in a page set with a given pgno. Return + * a 0 value (and succeed) if we've never heard of this page. + * + * PUBLIC: int __db_vrfy_pgset_get __P((DB *, DB_THREAD_INFO *, db_pgno_t, + * PUBLIC: int *)); + */ +int +__db_vrfy_pgset_get(dbp, ip, pgno, valp) + DB *dbp; + DB_THREAD_INFO *ip; + db_pgno_t pgno; + int *valp; +{ + DBT key, data; + int ret, val; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + data.data = &val; + data.ulen = sizeof(int); + F_SET(&data, DB_DBT_USERMEM); + + if ((ret = __db_get(dbp, ip, NULL, &key, &data, 0)) == 0) { + DB_ASSERT(dbp->env, data.size == sizeof(int)); + } else if (ret == DB_NOTFOUND) + val = 0; + else + return (ret); + + *valp = val; + return (0); +} + +/* + * __db_vrfy_pgset_inc -- + * Increment the value associated with a pgno by 1. + * + * PUBLIC: int __db_vrfy_pgset_inc __P((DB *, DB_THREAD_INFO *, db_pgno_t)); + */ +int +__db_vrfy_pgset_inc(dbp, ip, pgno) + DB *dbp; + DB_THREAD_INFO *ip; + db_pgno_t pgno; +{ + DBT key, data; + int ret; + int val; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + val = 0; + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + data.data = &val; + data.ulen = sizeof(int); + F_SET(&data, DB_DBT_USERMEM); + + if ((ret = __db_get(dbp, ip, NULL, &key, &data, 0)) == 0) { + DB_ASSERT(dbp->env, data.size == sizeof(int)); + } else if (ret != DB_NOTFOUND) + return (ret); + + data.size = sizeof(int); + ++val; + + return (__db_put(dbp, ip, NULL, &key, &data, 0)); +} + +/* + * __db_vrfy_pgset_next -- + * Given a cursor open in a pgset database, get the next page in the + * set. 
+ * + * PUBLIC: int __db_vrfy_pgset_next __P((DBC *, db_pgno_t *)); + */ +int +__db_vrfy_pgset_next(dbc, pgnop) + DBC *dbc; + db_pgno_t *pgnop; +{ + DBT key, data; + db_pgno_t pgno; + int ret; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + /* We don't care about the data, just the keys. */ + F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); + F_SET(&key, DB_DBT_USERMEM); + key.data = &pgno; + key.ulen = sizeof(db_pgno_t); + + if ((ret = __dbc_get(dbc, &key, &data, DB_NEXT)) != 0) + return (ret); + + DB_ASSERT(dbc->env, key.size == sizeof(db_pgno_t)); + *pgnop = pgno; + + return (0); +} + +/* + * __db_vrfy_childcursor -- + * Create a cursor to walk the child list with. Returns with a nonzero + * final argument if the specified page has no children. + * + * PUBLIC: int __db_vrfy_childcursor __P((VRFY_DBINFO *, DBC **)); + */ +int +__db_vrfy_childcursor(vdp, dbcp) + VRFY_DBINFO *vdp; + DBC **dbcp; +{ + DB *cdbp; + DBC *dbc; + int ret; + + cdbp = vdp->cdbp; + + if ((ret = __db_cursor(cdbp, vdp->thread_info, NULL, &dbc, 0)) == 0) + *dbcp = dbc; + + return (ret); +} + +/* + * __db_vrfy_childput -- + * Add a child structure to the set for a given page. + * + * PUBLIC: int __db_vrfy_childput + * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, VRFY_CHILDINFO *)); + */ +int +__db_vrfy_childput(vdp, pgno, cip) + VRFY_DBINFO *vdp; + db_pgno_t pgno; + VRFY_CHILDINFO *cip; +{ + DB *cdbp; + DBC *cc; + DBT key, data; + VRFY_CHILDINFO *oldcip; + int ret; + + cdbp = vdp->cdbp; + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + + /* + * We want to avoid adding multiple entries for a single child page; + * we only need to verify each child once, even if a child (such + * as an overflow key) is multiply referenced. + * + * However, we also need to make sure that when walking the list + * of children, we encounter them in the order they're referenced + * on a page. 
(This permits us, for example, to verify the + * prev_pgno/next_pgno chain of Btree leaf pages.) + * + * Check the child database to make sure that this page isn't + * already a child of the specified page number. If it's not, + * put it at the end of the duplicate set. + */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + return (ret); + for (ret = __db_vrfy_ccset(cc, pgno, &oldcip); ret == 0; + ret = __db_vrfy_ccnext(cc, &oldcip)) + if (oldcip->pgno == cip->pgno) { + /* + * Found a matching child. Increment its reference + * count--we've run into it again--but don't put it + * again. + */ + if ((ret = __db_vrfy_childinc(cc, oldcip)) != 0 || + (ret = __db_vrfy_ccclose(cc)) != 0) + return (ret); + return (0); + } + if (ret != DB_NOTFOUND) { + (void)__db_vrfy_ccclose(cc); + return (ret); + } + if ((ret = __db_vrfy_ccclose(cc)) != 0) + return (ret); + + cip->refcnt = 1; + data.data = cip; + data.size = sizeof(VRFY_CHILDINFO); + + return (__db_put(cdbp, vdp->thread_info, NULL, &key, &data, 0)); +} + +/* + * __db_vrfy_childinc -- + * Increment the refcount of the VRFY_CHILDINFO struct that the child + * cursor is pointing to. (The caller has just retrieved this struct, and + * passes it in as cip to save us a get.) + */ +static int +__db_vrfy_childinc(dbc, cip) + DBC *dbc; + VRFY_CHILDINFO *cip; +{ + DBT key, data; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + cip->refcnt++; + data.data = cip; + data.size = sizeof(VRFY_CHILDINFO); + + return (__dbc_put(dbc, &key, &data, DB_CURRENT)); +} + +/* + * __db_vrfy_ccset -- + * Sets a cursor created with __db_vrfy_childcursor to the first + * child of the given pgno, and returns it in the third arg. 
+ * + * PUBLIC: int __db_vrfy_ccset __P((DBC *, db_pgno_t, VRFY_CHILDINFO **)); + */ +int +__db_vrfy_ccset(dbc, pgno, cipp) + DBC *dbc; + db_pgno_t pgno; + VRFY_CHILDINFO **cipp; +{ + DBT key, data; + int ret; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + + if ((ret = __dbc_get(dbc, &key, &data, DB_SET)) != 0) + return (ret); + + DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO)); + *cipp = (VRFY_CHILDINFO *)data.data; + + return (0); +} + +/* + * __db_vrfy_ccnext -- + * Gets the next child of the given cursor created with + * __db_vrfy_childcursor, and returns it in the memory provided in the + * second arg. + * + * PUBLIC: int __db_vrfy_ccnext __P((DBC *, VRFY_CHILDINFO **)); + */ +int +__db_vrfy_ccnext(dbc, cipp) + DBC *dbc; + VRFY_CHILDINFO **cipp; +{ + DBT key, data; + int ret; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + if ((ret = __dbc_get(dbc, &key, &data, DB_NEXT_DUP)) != 0) + return (ret); + + DB_ASSERT(dbc->env, data.size == sizeof(VRFY_CHILDINFO)); + *cipp = (VRFY_CHILDINFO *)data.data; + + return (0); +} + +/* + * __db_vrfy_ccclose -- + * Closes the cursor created with __db_vrfy_childcursor. + * + * This doesn't actually do anything interesting now, but it's + * not inconceivable that we might change the internal database usage + * and keep the interfaces the same, and a function call here or there + * seldom hurts anyone. + * + * PUBLIC: int __db_vrfy_ccclose __P((DBC *)); + */ +int +__db_vrfy_ccclose(dbc) + DBC *dbc; +{ + + return (__dbc_close(dbc)); +} + +/* + * __db_vrfy_pageinfo_create -- + * Constructor for VRFY_PAGEINFO; allocates and initializes. + */ +static int +__db_vrfy_pageinfo_create(env, pipp) + ENV *env; + VRFY_PAGEINFO **pipp; +{ + VRFY_PAGEINFO *pip; + int ret; + + /* + * pageinfo structs are sometimes allocated here and sometimes + * allocated by fetching them from a database with DB_DBT_MALLOC. 
+ * There's no easy way for the destructor to tell which was + * used, and so we always allocate with __os_umalloc so we can free + * with __os_ufree. + */ + if ((ret = __os_umalloc(env, sizeof(VRFY_PAGEINFO), &pip)) != 0) + return (ret); + memset(pip, 0, sizeof(VRFY_PAGEINFO)); + + *pipp = pip; + return (0); +} + +/* + * __db_salvage_init -- + * Set up salvager database. + * + * PUBLIC: int __db_salvage_init __P((VRFY_DBINFO *)); + */ +int +__db_salvage_init(vdp) + VRFY_DBINFO *vdp; +{ + DB *dbp; + int ret; + + if ((ret = __db_create_internal(&dbp, NULL, 0)) != 0) + return (ret); + + if ((ret = __db_set_pagesize(dbp, 1024)) != 0) + goto err; + + if ((ret = __db_open(dbp, vdp->thread_info, + NULL, NULL, NULL, DB_BTREE, DB_CREATE, 0, PGNO_BASE_MD)) != 0) + goto err; + + vdp->salvage_pages = dbp; + return (0); + +err: (void)__db_close(dbp, NULL, 0); + return (ret); +} + +/* + * __db_salvage_destroy -- + * Close salvager database. + * PUBLIC: int __db_salvage_destroy __P((VRFY_DBINFO *)); + */ +int +__db_salvage_destroy(vdp) + VRFY_DBINFO *vdp; +{ + return (vdp->salvage_pages == NULL ? 0 : + __db_close(vdp->salvage_pages, NULL, 0)); +} + +/* + * __db_salvage_getnext -- + * Get the next (first) unprinted page in the database of pages we need to + * print still. Delete entries for any already-printed pages we encounter + * in this search, as well as the page we're returning. 
+ * + * PUBLIC: int __db_salvage_getnext + * PUBLIC: __P((VRFY_DBINFO *, DBC **, db_pgno_t *, u_int32_t *, int)); + */ +int +__db_salvage_getnext(vdp, dbcp, pgnop, pgtypep, skip_overflow) + VRFY_DBINFO *vdp; + DBC **dbcp; + db_pgno_t *pgnop; + u_int32_t *pgtypep; + int skip_overflow; +{ + DB *dbp; + DBT key, data; + int ret; + u_int32_t pgtype; + + dbp = vdp->salvage_pages; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + if (*dbcp == NULL && + (ret = __db_cursor(dbp, vdp->thread_info, NULL, dbcp, 0)) != 0) + return (ret); + + while ((ret = __dbc_get(*dbcp, &key, &data, DB_NEXT)) == 0) { + DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t)); + memcpy(&pgtype, data.data, sizeof(pgtype)); + + if (skip_overflow && pgtype == SALVAGE_OVERFLOW) + continue; + + if ((ret = __dbc_del(*dbcp, 0)) != 0) + return (ret); + if (pgtype != SALVAGE_IGNORE) { + DB_ASSERT(dbp->env, key.size == sizeof(db_pgno_t)); + DB_ASSERT(dbp->env, data.size == sizeof(u_int32_t)); + + *pgnop = *(db_pgno_t *)key.data; + *pgtypep = *(u_int32_t *)data.data; + break; + } + } + + return (ret); +} + +/* + * __db_salvage_isdone -- + * Return whether or not the given pgno is already marked + * SALVAGE_IGNORE (meaning that we don't need to print it again). + * + * Returns DB_KEYEXIST if it is marked, 0 if not, or another error on + * error. + * + * PUBLIC: int __db_salvage_isdone __P((VRFY_DBINFO *, db_pgno_t)); + */ +int +__db_salvage_isdone(vdp, pgno) + VRFY_DBINFO *vdp; + db_pgno_t pgno; +{ + DB *dbp; + DBT key, data; + int ret; + u_int32_t currtype; + + dbp = vdp->salvage_pages; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + currtype = SALVAGE_INVALID; + data.data = &currtype; + data.ulen = sizeof(u_int32_t); + data.flags = DB_DBT_USERMEM; + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + + /* + * Put an entry for this page, with pgno as key and type as data, + * unless it's already there and is marked done. 
+ * If it's there and is marked anything else, that's fine--we + * want to mark it done. + */ + if ((ret = __db_get(dbp, + vdp->thread_info, NULL, &key, &data, 0)) == 0) { + /* + * The key's already here. Check and see if it's already + * marked done. If it is, return DB_KEYEXIST. If it's not, + * return 0. + */ + if (currtype == SALVAGE_IGNORE) + return (DB_KEYEXIST); + else + return (0); + } else if (ret != DB_NOTFOUND) + return (ret); + + /* The pgno is not yet marked anything; return 0. */ + return (0); +} + +/* + * __db_salvage_markdone -- + * Mark as done a given page. + * + * PUBLIC: int __db_salvage_markdone __P((VRFY_DBINFO *, db_pgno_t)); + */ +int +__db_salvage_markdone(vdp, pgno) + VRFY_DBINFO *vdp; + db_pgno_t pgno; +{ + DB *dbp; + DBT key, data; + int pgtype, ret; + u_int32_t currtype; + + pgtype = SALVAGE_IGNORE; + dbp = vdp->salvage_pages; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + currtype = SALVAGE_INVALID; + data.data = &currtype; + data.ulen = sizeof(u_int32_t); + data.flags = DB_DBT_USERMEM; + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + + /* + * Put an entry for this page, with pgno as key and type as data, + * unless it's already there and is marked done. + * If it's there and is marked anything else, that's fine--we + * want to mark it done, but db_salvage_isdone only lets + * us know if it's marked IGNORE. + * + * We don't want to return DB_KEYEXIST, though; this will + * likely get passed up all the way and make no sense to the + * application. Instead, use DB_VERIFY_BAD to indicate that + * we've seen this page already--it probably indicates a + * multiply-linked page. + */ + if ((ret = __db_salvage_isdone(vdp, pgno)) != 0) + return (ret == DB_KEYEXIST ? 
DB_VERIFY_BAD : ret); + + data.size = sizeof(u_int32_t); + data.data = &pgtype; + + return (__db_put(dbp, vdp->thread_info, NULL, &key, &data, 0)); +} + +/* + * __db_salvage_markneeded -- + * If it has not yet been printed, make note of the fact that a page + * must be dealt with later. + * + * PUBLIC: int __db_salvage_markneeded + * PUBLIC: __P((VRFY_DBINFO *, db_pgno_t, u_int32_t)); + */ +int +__db_salvage_markneeded(vdp, pgno, pgtype) + VRFY_DBINFO *vdp; + db_pgno_t pgno; + u_int32_t pgtype; +{ + DB *dbp; + DBT key, data; + int ret; + + dbp = vdp->salvage_pages; + + memset(&key, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + + key.data = &pgno; + key.size = sizeof(db_pgno_t); + + data.data = &pgtype; + data.size = sizeof(u_int32_t); + + /* + * Put an entry for this page, with pgno as key and type as data, + * unless it's already there, in which case it's presumably + * already been marked done. + */ + ret = __db_put(dbp, + vdp->thread_info, NULL, &key, &data, DB_NOOVERWRITE); + return (ret == DB_KEYEXIST ? 0 : ret); +} + +/* + * __db_vrfy_prdbt -- + * Print out a DBT data element from a verification routine. + * + * PUBLIC: int __db_vrfy_prdbt __P((DBT *, int, const char *, void *, + * PUBLIC: int (*)(void *, const void *), int, VRFY_DBINFO *)); + */ +int +__db_vrfy_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno, vdp) + DBT *dbtp; + int checkprint; + const char *prefix; + void *handle; + int (*callback) __P((void *, const void *)); + int is_recno; + VRFY_DBINFO *vdp; +{ + if (vdp != NULL) { + /* + * If vdp is non-NULL, we might be the first key in the + * "fake" subdatabase used for key/data pairs we can't + * associate with a known subdb. + * + * Check and clear the SALVAGE_PRINTHEADER flag; if + * it was set, print a subdatabase header. 
+ */ + if (F_ISSET(vdp, SALVAGE_PRINTHEADER)) { + (void)__db_prheader( + NULL, "__OTHER__", 0, 0, handle, callback, vdp, 0); + F_CLR(vdp, SALVAGE_PRINTHEADER); + F_SET(vdp, SALVAGE_PRINTFOOTER); + } + + /* + * Even if the printable flag wasn't set by our immediate + * caller, it may be set on a salvage-wide basis. + */ + if (F_ISSET(vdp, SALVAGE_PRINTABLE)) + checkprint = 1; + } + return ( + __db_prdbt(dbtp, checkprint, prefix, handle, callback, is_recno)); +} diff --git a/db/partition.c b/db/partition.c new file mode 100644 index 0000000..4e89ede --- /dev/null +++ b/db/partition.c @@ -0,0 +1,2048 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2001, 2010 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_verify.h" +#include "dbinc/btree.h" +#ifdef HAVE_HASH +#include "dbinc/hash.h" +#endif +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/partition.h" +#include "dbinc/txn.h" +#ifdef HAVE_PARTITION + +static int __part_rr __P((DB *, DB_THREAD_INFO *, DB_TXN *, + const char *, const char *, const char *, u_int32_t)); +static int __partc_close __P((DBC *, db_pgno_t, int *)); +static int __partc_del __P((DBC*, u_int32_t)); +static int __partc_destroy __P((DBC*)); +static int __partc_get_pp __P((DBC*, DBT *, DBT *, u_int32_t)); +static int __partc_put __P((DBC*, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __partc_writelock __P((DBC*)); +static int __partition_chk_meta __P((DB *, + DB_THREAD_INFO *, DB_TXN *, u_int32_t)); +static int __partition_setup_keys __P((DBC *, + DB_PARTITION *, DBMETA *, u_int32_t)); +static int __part_key_cmp __P((const void *, const void *)); +static inline void __part_search __P((DB *, + DB_PARTITION *, DBT *, u_int32_t *)); + +static char *Alloc_err = "Partition open failed to allocate %d bytes"; + +/* + * Allocate a partition cursor and copy 
flags to the partition cursor.
 * Not passed:
 *	DBC_PARTITIONED -- the subcursors are not.
 *	DBC_OWN_LID -- the arg dbc owns the lock id.
 *	DBC_WRITECURSOR DBC_WRITER -- CDS locking happens on
 *	the whole DB, not the partition.
 */
#define	GET_PART_CURSOR(dbc, new_dbc, part_id) do {			     \
	DB *__part_dbp;							     \
	__part_dbp = part->handles[part_id];				     \
	if ((ret = __db_cursor_int(__part_dbp,				     \
	     (dbc)->thread_info, (dbc)->txn, __part_dbp->type,		     \
	     PGNO_INVALID, 0, (dbc)->locker, &new_dbc)) != 0)		     \
		goto err;						     \
	(new_dbc)->flags = (dbc)->flags &				     \
	    ~(DBC_PARTITIONED|DBC_OWN_LID|DBC_WRITECURSOR|DBC_WRITER);	     \
} while (0)

/*
 * Search for the correct partition.
 *
 * Binary-searches part->keys with the tree's bt_compare function and
 * stores the chosen partition index through part_idp.  On an exact match
 * the matching index is used; otherwise the search falls back to the
 * partition whose boundary key precedes the search key (index 0 catches
 * everything below the first boundary).
 * NOTE(review): relies on the macro expanding "part" and "ret" from the
 * caller's scope -- the GET_PART_CURSOR macro above does the same.
 */
static inline void __part_search(dbp, part, key, part_idp)
	DB *dbp;
	DB_PARTITION *part;
	DBT *key;
	u_int32_t *part_idp;
{
	db_indx_t base, indx, limit;
	int cmp;
	int (*func) __P((DB *, const DBT *, const DBT *));

	DB_ASSERT(dbp->env, part->nparts != 0);
	COMPQUIET(cmp, 0);
	COMPQUIET(indx, 0);

	/* Compare with the btree comparator so ranges match insert order. */
	func = ((BTREE *)dbp->bt_internal)->bt_compare;
	DB_BINARY_SEARCH_FOR(base, limit, part->nparts, O_INDX) {
		DB_BINARY_SEARCH_INCR(indx, base, limit, O_INDX);
		cmp = func(dbp, key, &part->keys[indx]);
		if (cmp == 0)
			break;
		if (cmp > 0)
			DB_BINARY_SEARCH_SHIFT_BASE(indx, base, limit, O_INDX);
	}
	if (cmp == 0)
		*part_idp = indx;
	else if ((*part_idp = base) != 0)
		(*part_idp)--;	/* Key sorts below base: use the slot before. */
}

/*
 * __partition_init --
 *	Initialize the partition structure.
 *	Called when the meta data page is read in during database open or
 *	when partition keys or a callback are set.
+ * + * PUBLIC: int __partition_init __P((DB *, u_int32_t)); + */ +int +__partition_init(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + DB_PARTITION *part; + int ret; + + if ((part = dbp->p_internal) != NULL) { + if ((LF_ISSET(DBMETA_PART_RANGE) && + F_ISSET(part, PART_CALLBACK)) || + (LF_ISSET(DBMETA_PART_CALLBACK) && + F_ISSET(part, PART_RANGE))) { + __db_errx(dbp->env, + "Cannot specify callback and range keys."); + return (EINVAL); + } + } else if ((ret = __os_calloc(dbp->env, 1, sizeof(*part), &part)) != 0) + return (ret); + + if (LF_ISSET(DBMETA_PART_RANGE)) + F_SET(part, PART_RANGE); + if (LF_ISSET(DBMETA_PART_CALLBACK)) + F_SET(part, PART_CALLBACK); + dbp->p_internal = part; + /* Set up AM-specific methods that do not require an open. */ + dbp->db_am_rename = __part_rename; + dbp->db_am_remove = __part_remove; + return (0); +} +/* + * __partition_set -- + * Set the partitioning keys or callback function. + * This routine must be called prior to creating the database. + * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *, + * PUBLIC: u_int32_t (*callback)(DB *, DBT *key))); + */ + +int +__partition_set(dbp, parts, keys, callback) + DB *dbp; + u_int32_t parts; + DBT *keys; + u_int32_t (*callback)(DB *, DBT *key); +{ + DB_PARTITION *part; + ENV *env; + int ret; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition"); + env = dbp->dbenv->env; + + if (parts < 2) { + __db_errx(env, "Must specify at least 2 partitions."); + return (EINVAL); + } + + if (keys == NULL && callback == NULL) { + __db_errx(env, "Must specify either keys or a callback."); + return (EINVAL); + } + if (keys != NULL && callback != NULL) { +bad: __db_errx(env, "May not specify both keys and a callback."); + return (EINVAL); + } + + if ((part = dbp->p_internal) == NULL) { + if ((ret = __partition_init(dbp, + keys != NULL ? 
+ DBMETA_PART_RANGE : DBMETA_PART_CALLBACK)) != 0) + return (ret); + part = dbp->p_internal; + } else if ((part->keys != NULL && callback != NULL) || + (part->callback != NULL && keys != NULL)) + goto bad; + + part->nparts = parts; + part->keys = keys; + part->callback = callback; + + return (0); +} + +/* + * __partition_set_dirs -- + * Set the directories for creating the partition databases. + * They must be in the environment. + * PUBLIC: int __partition_set_dirs __P((DB *, const char **)); + */ +int +__partition_set_dirs(dbp, dirp) + DB *dbp; + const char **dirp; +{ + DB_ENV *dbenv; + DB_PARTITION *part; + ENV *env; + u_int32_t ndirs, slen; + int i, ret; + const char **dir; + char *cp, **part_dirs, **pd; + + DB_ILLEGAL_AFTER_OPEN(dbp, "DB->set_partition_dirs"); + dbenv = dbp->dbenv; + env = dbp->env; + + ndirs = 1; + slen = 0; + for (dir = dirp; *dir != NULL; dir++) { + if (F_ISSET(env, ENV_DBLOCAL)) + slen += (u_int32_t)strlen(*dir) + 1; + ndirs++; + } + + slen += sizeof(char *) * ndirs; + if ((ret = __os_malloc(env, slen, &part_dirs)) != 0) + return (EINVAL); + memset(part_dirs, 0, slen); + + cp = (char *) part_dirs + (sizeof(char *) * ndirs); + pd = part_dirs; + for (dir = dirp; *dir != NULL; dir++, pd++) { + if (F_ISSET(env, ENV_DBLOCAL)) { + (void)strcpy(cp, *dir); + *pd = cp; + cp += strlen(*dir) + 1; + continue; + } + for (i = 0; i < dbenv->data_next; i++) + if (strcmp(*dir, dbenv->db_data_dir[i]) == 0) + break; + if (i == dbenv->data_next) { + __db_errx(dbp->env, + "Directory not in environment list %s", *dir); + __os_free(env, part_dirs); + return (EINVAL); + } + *pd = dbenv->db_data_dir[i]; + } + + if ((part = dbp->p_internal) == NULL) { + if ((ret = __partition_init(dbp, 0)) != 0) + return (ret); + part = dbp->p_internal; + } + + part->dirs = (const char **)part_dirs; + + return (0); +} + +/* + * __partition_open -- + * Open/create a partitioned database. 
+ * PUBLIC: int __partition_open __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, DBTYPE, u_int32_t, int, int)); + */ +int +__partition_open(dbp, ip, txn, fname, type, flags, mode, do_open) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *fname; + DBTYPE type; + u_int32_t flags; + int mode, do_open; +{ + DB *part_db; + DB_PARTITION *part; + DBC *dbc; + ENV *env; + u_int32_t part_id; + int ret; + char *name, *sp; + const char **dirp, *np; + + part = dbp->p_internal; + env = dbp->dbenv->env; + name = NULL; + + if ((ret = __partition_chk_meta(dbp, ip, txn, flags)) != 0 && do_open) + goto err; + + if ((ret = __os_calloc(env, + part->nparts, sizeof(*part->handles), &part->handles)) != 0) { + __db_errx(env, + Alloc_err, part->nparts * sizeof(*part->handles)); + goto err; + } + + DB_ASSERT(env, fname != NULL); + if ((ret = __os_malloc(env, + strlen(fname) + PART_LEN + 1, &name)) != 0) { + __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + goto err; + } + + sp = name; + np = __db_rpath(fname); + if (np == NULL) + np = fname; + else { + np++; + (void)strncpy(name, fname, (size_t)(np - fname)); + sp = name + (np - fname); + } + + if (F_ISSET(dbp, DB_AM_RECOVER)) + goto done; + dirp = part->dirs; + for (part_id = 0; part_id < part->nparts; part_id++) { + if ((ret = __db_create_internal( + &part->handles[part_id], dbp->env, 0)) != 0) + goto err; + + part_db = part->handles[part_id]; + part_db->flags = F_ISSET(dbp, + ~(DB_AM_CREATED | DB_AM_CREATED_MSTR | DB_AM_OPEN_CALLED)); + part_db->adj_fileid = dbp->adj_fileid; + part_db->pgsize = dbp->pgsize; + part_db->priority = dbp->priority; + part_db->db_append_recno = dbp->db_append_recno; + part_db->db_feedback = dbp->db_feedback; + part_db->dup_compare = dbp->dup_compare; + part_db->app_private = dbp->app_private; + part_db->api_internal = dbp->api_internal; + + if (dbp->type == DB_BTREE) + __bam_copy_config(dbp, part_db, part->nparts); +#ifdef HAVE_HASH + if (dbp->type == DB_HASH) + 
__ham_copy_config(dbp, part_db, part->nparts); +#endif + + (void)sprintf(sp, PART_NAME, np, part_id); + if ((ret = __os_strdup(env, name, &part_db->fname)) != 0) + goto err; + if (do_open) { + /* + * Cycle through the directory names passed in, + * if any. + */ + if (dirp != NULL && + (part_db->dirname = *dirp++) == NULL) + part_db->dirname = *(dirp = part->dirs); + if ((ret = __db_open(part_db, ip, txn, + name, NULL, type, flags, mode, PGNO_BASE_MD)) != 0) + goto err; + } + } + + /* Get rid of the cursor used to open the database its the wrong type */ +done: while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL) + if ((ret = __dbc_destroy(dbc)) != 0) + break; + + if (0) { +err: (void)__partition_close(dbp, txn, 0); + } + if (name != NULL) + __os_free(env, name); + return (ret); +} + +/* + * __partition_chk_meta -- + * Check for a consistent meta data page and parameters when opening a + * partitioned database. + */ +static int +__partition_chk_meta(dbp, ip, txn, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + u_int32_t flags; +{ + DBMETA *meta; + DB_PARTITION *part; + DBC *dbc; + DB_LOCK metalock; + DB_MPOOLFILE *mpf; + ENV *env; + db_pgno_t base_pgno; + int ret, t_ret; + + dbc = NULL; + meta = NULL; + LOCK_INIT(metalock); + part = dbp->p_internal; + mpf = dbp->mpf; + env = dbp->env; + ret = 0; + + /* Get a cursor on the main db. */ + dbp->p_internal = NULL; + if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0) + goto err; + + /* Get the metadata page. 
*/ + base_pgno = PGNO_BASE_MD; + if ((ret = + __db_lget(dbc, 0, base_pgno, DB_LOCK_READ, 0, &metalock)) != 0) + goto err; + if ((ret = __memp_fget(mpf, &base_pgno, ip, dbc->txn, 0, &meta)) != 0) + goto err; + + if (meta->magic != DB_HASHMAGIC && + (meta->magic != DB_BTREEMAGIC || F_ISSET(meta, BTM_RECNO))) { + __db_errx(env, + "Partitioning may only specified on BTREE and HASH databases."); + ret = EINVAL; + goto err; + } + if (!FLD_ISSET(meta->metaflags, + DBMETA_PART_RANGE | DBMETA_PART_CALLBACK)) { + __db_errx(env, + "Partitioning specified on a non-partitioned database."); + ret = EINVAL; + goto err; + } + + if ((F_ISSET(part, PART_RANGE) && + FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK)) || + (F_ISSET(part, PART_CALLBACK) && + FLD_ISSET(meta->metaflags, DBMETA_PART_RANGE))) { + __db_errx(env, "Incompatible partitioning specified."); + ret = EINVAL; + goto err; + } + + if (FLD_ISSET(meta->metaflags, DBMETA_PART_CALLBACK) && + part->callback == NULL && !IS_RECOVERING(env) && + !F_ISSET(dbp, DB_AM_RECOVER) && !LF_ISSET(DB_RDWRMASTER)) { + __db_errx(env, "Partition callback not specified."); + ret = EINVAL; + goto err; + } + + if (F_ISSET(dbp, DB_AM_RECNUM)) { + __db_errx(env, + "Record numbers are not supported in partitioned databases."); + ret = EINVAL; + goto err; + } + + if (part->nparts == 0) { + if (LF_ISSET(DB_CREATE) && meta->nparts == 0) { + __db_errx(env, "Zero paritions specified."); + ret = EINVAL; + goto err; + } else + part->nparts = meta->nparts; + } else if (meta->nparts != 0 && part->nparts != meta->nparts) { + __db_errx(env, "Number of partitions does not match."); + ret = EINVAL; + goto err; + } + + if (meta->magic == DB_HASHMAGIC) { + if (!F_ISSET(part, PART_CALLBACK)) { + __db_errx(env, + "Hash database must specify a partition callback."); + ret = EINVAL; + } + } else if (meta->magic != DB_BTREEMAGIC) { + __db_errx(env, + "Partitioning only supported on BTREE nad HASH."); + ret = EINVAL; + } else + ret = __partition_setup_keys(dbc, part, 
meta, flags); + +err: /* Put the metadata page back. */ + if (meta != NULL && (t_ret = __memp_fput(mpf, + ip, meta, dbc->priority)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __LPUT(dbc, metalock)) != 0 && ret == 0) + ret = t_ret; + + if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + dbp->p_internal = part; + return (ret); +} + +/* + * Support for sorting keys. Keys must be sorted using the btree + * compare function so if we call qsort in __partiton_setup_keys + * we use this structure to pass the DBP and compare function. + */ +struct key_sort { + DB *dbp; + DBT *key; + int (*compare) __P((DB *, const DBT *, const DBT *)); +}; + +static int __part_key_cmp(a, b) + const void *a, *b; +{ + const struct key_sort *ka, *kb; + + ka = a; + kb = b; + return (ka->compare(ka->dbp, ka->key, kb->key)); +} +/* + * __partition_setup_keys -- + * Get the partition keys into memory, or put them to disk if we + * are creating a partitioned database. + */ +static int +__partition_setup_keys(dbc, part, meta, flags) + DBC *dbc; + DB_PARTITION *part; + DBMETA *meta; + u_int32_t flags; +{ + BTREE *t; + DB *dbp; + DBT data, key, *keys, *kp; + ENV *env; + u_int32_t ds, i, j; + u_int8_t *dd; + struct key_sort *ks; + int have_keys, ret; + int (*compare) __P((DB *, const DBT *, const DBT *)); + void *dp; + + COMPQUIET(dd, NULL); + COMPQUIET(ds, 0); + memset(&data, 0, sizeof(data)); + memset(&key, 0, sizeof(key)); + ks = NULL; + + dbp = dbc->dbp; + env = dbp->env; + + /* Need to just read the main database. */ + dbp->p_internal = NULL; + have_keys = 0; + + /* First verify that things what we expect. 
*/ + if ((ret = __dbc_get(dbc, &key, &data, DB_FIRST)) != 0) { + if (ret != DB_NOTFOUND) + goto err; + if (F_ISSET(part, PART_CALLBACK)) { + ret = 0; + goto done; + } + if (!LF_ISSET(DB_CREATE) && !F_ISSET(dbp, DB_AM_RECOVER) && + !LF_ISSET(DB_RDWRMASTER)) { + __db_errx(env, "No range keys found."); + ret = EINVAL; + goto err; + } + } else { + if (F_ISSET(part, PART_CALLBACK)) { + __db_errx(env, "Keys found and callback set."); + ret = EINVAL; + goto err; + } + if (key.size != 0) { + __db_errx(env, "Partition key 0 is not empty."); + ret = EINVAL; + goto err; + } + have_keys = 1; + } + + if (LF_ISSET(DB_CREATE) && have_keys == 0) { + /* Insert the keys into the master database. */ + for (i = 0; i < part->nparts - 1; i++) { + if ((ret = __db_put(dbp, dbc->thread_info, + dbc->txn, &part->keys[i], &data, 0)) != 0) + goto err; + } + + /* + * Insert the "0" pointer. All records less than the first + * given key go into this partition. We must use the default + * compare to insert this key, otherwise it might not be first. + */ + t = dbc->dbp->bt_internal; + compare = t->bt_compare; + t->bt_compare = __bam_defcmp; + memset(&key, 0, sizeof(key)); + ret = __db_put(dbp, dbc->thread_info, dbc->txn, &key, &data, 0); + t->bt_compare = compare; + if (ret != 0) + goto err; + } +done: if (F_ISSET(part, PART_RANGE)) { + /* + * Allocate one page to hold the keys plus space at the + * end of the buffer to put an array of DBTs. If there + * is not enough space __dbc_get will return how much + * is needed and we realloc. 
+ */ + if ((ret = __os_malloc(env, + meta->pagesize + (sizeof(DBT) * part->nparts), + &part->data)) != 0) { + __db_errx(env, Alloc_err, meta->pagesize); + goto err; + } + memset(&key, 0, sizeof(key)); + memset(&data, 0, sizeof(data)); + data.data = part->data; + data.ulen = meta->pagesize; + data.flags = DB_DBT_USERMEM; +again: if ((ret = __dbc_get(dbc, &key, &data, + DB_FIRST | DB_MULTIPLE_KEY)) == DB_BUFFER_SMALL) { + if ((ret = __os_realloc(env, + data.size + (sizeof(DBT) * part->nparts), + &part->data)) != 0) + goto err; + data.data = part->data; + data.ulen = data.size; + goto again; + } + if (ret == 0) { + /* + * They passed in keys, they must match. + */ + keys = NULL; + compare = NULL; + if (have_keys == 1 && (keys = part->keys) != NULL) { + t = dbc->dbp->bt_internal; + compare = t->bt_compare; + if ((ret = __os_malloc(env, (part->nparts - 1) + * sizeof(struct key_sort), &ks)) != 0) + goto err; + for (j = 0; j < part->nparts - 1; j++) { + ks[j].dbp = dbc->dbp; + ks[j].compare = compare; + ks[j].key = &keys[j]; + } + + qsort(ks, (size_t)part->nparts - 1, + sizeof(struct key_sort), __part_key_cmp); + } + DB_MULTIPLE_INIT(dp, &data); + part->keys = (DBT *) + ((u_int8_t *)part->data + data.size); + j = 0; + for (kp = part->keys; + kp < &part->keys[part->nparts]; kp++, j++) { + DB_MULTIPLE_KEY_NEXT(dp, + &data, kp->data, kp->size, dd, ds); + if (dp == NULL) { + ret = DB_NOTFOUND; + break; + } + if (keys != NULL && j != 0 && + compare(dbc->dbp, ks[j - 1].key, kp) != 0) { + if (kp->data == NULL && + F_ISSET(dbp, DB_AM_RECOVER)) + goto err; + __db_errx(env, + "Partition key %d does not match", j); + ret = EINVAL; + goto err; + } + } + } + } + if (ret == DB_NOTFOUND && F_ISSET(dbp, DB_AM_RECOVER)) + ret = 0; + +err: dbp->p_internal = part; + if (ks != NULL) + __os_free(env, ks); + return (ret); +} + +/* + * __partition_get_callback -- + * Get the partition callback function. 
+ * PUBLIC: int __partition_get_callback __P((DB *, + * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key))); + */ +int +__partition_get_callback(dbp, parts, callback) + DB *dbp; + u_int32_t *parts; + u_int32_t (**callback)(DB *, DBT *key); +{ + DB_PARTITION *part; + + part = dbp->p_internal; + /* Only return populated results if partitioned using callbacks. */ + if (part != NULL && !F_ISSET(part, PART_CALLBACK)) + part = NULL; + if (parts != NULL) + *parts = (part != NULL ? part->nparts : 0); + if (callback != NULL) + *callback = (part != NULL ? part->callback : NULL); + + return (0); +} + +/* + * __partition_get_keys -- + * Get partition keys. + * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **)); + */ +int +__partition_get_keys(dbp, parts, keys) + DB *dbp; + u_int32_t *parts; + DBT **keys; +{ + DB_PARTITION *part; + + part = dbp->p_internal; + /* Only return populated results if partitioned using ranges. */ + if (part != NULL && !F_ISSET(part, PART_RANGE)) + part = NULL; + if (parts != NULL) + *parts = (part != NULL ? part->nparts : 0); + if (keys != NULL) + *keys = (part != NULL ? &part->keys[1] : NULL); + + return (0); +} + +/* + * __partition_get_dirs -- + * Get partition dirs. + * PUBLIC: int __partition_get_dirs __P((DB *, const char ***)); + */ +int +__partition_get_dirs(dbp, dirpp) + DB *dbp; + const char ***dirpp; +{ + DB_PARTITION *part; + ENV *env; + u_int32_t i; + int ret; + + env = dbp->env; + if ((part = dbp->p_internal) == NULL) { + *dirpp = NULL; + return (0); + } + if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) { + *dirpp = part->dirs; + return (0); + } + + /* + * We build a list once when asked. The original directory list, + * if any, was discarded at open time. 
+ */ + if ((*dirpp = part->dirs) != NULL) + return (0); + + if ((ret = __os_calloc(env, + sizeof(char *), part->nparts + 1, (char **)&part->dirs)) != 0) + return (ret); + + for (i = 0; i < part->nparts; i++) + part->dirs[i] = part->handles[i]->dirname; + + *dirpp = part->dirs; + return (0); +} + +/* + * __partc_init -- + * Initialize the access private portion of a cursor + * + * PUBLIC: int __partc_init __P((DBC *)); + */ +int +__partc_init(dbc) + DBC *dbc; +{ + ENV *env; + int ret; + + env = dbc->env; + + /* Allocate/initialize the internal structure. */ + if (dbc->internal == NULL && (ret = + __os_calloc(env, 1, sizeof(PART_CURSOR), &dbc->internal)) != 0) + return (ret); + + /* Initialize methods. */ + dbc->close = dbc->c_close = __dbc_close_pp; + dbc->cmp = __dbc_cmp_pp; + dbc->count = dbc->c_count = __dbc_count_pp; + dbc->del = dbc->c_del = __dbc_del_pp; + dbc->dup = dbc->c_dup = __dbc_dup_pp; + dbc->get = dbc->c_get = __partc_get_pp; + dbc->pget = dbc->c_pget = __dbc_pget_pp; + dbc->put = dbc->c_put = __dbc_put_pp; + dbc->am_bulk = NULL; + dbc->am_close = __partc_close; + dbc->am_del = __partc_del; + dbc->am_destroy = __partc_destroy; + dbc->am_get = NULL; + dbc->am_put = __partc_put; + dbc->am_writelock = __partc_writelock; + + /* We avoid swapping partition cursors since we swap the sub cursors */ + F_SET(dbc, DBC_PARTITIONED); + + return (0); +} +/* + * __partc_get_pp -- + * cursor get opeartion on a partitioned database. + */ +static int +__partc_get_pp(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DB_THREAD_INFO *ip; + ENV *env; + int ignore_lease, ret; + + dbp = dbc->dbp; + env = dbp->env; + + ignore_lease = LF_ISSET(DB_IGNORE_LEASE) ? 1 : 0; + LF_CLR(DB_IGNORE_LEASE); + if ((ret = __dbc_get_arg(dbc, key, data, flags)) != 0) + return (ret); + + ENV_ENTER(env, ip); + + DEBUG_LREAD(dbc, dbc->txn, "DBcursor->get", + flags == DB_SET || flags == DB_SET_RANGE ? 
key : NULL, NULL, flags); + + ret = __partc_get(dbc, key, data, flags); + /* + * Check for master leases. + */ + if (ret == 0 && + IS_REP_MASTER(env) && IS_USING_LEASES(env) && !ignore_lease) + ret = __rep_lease_check(env, 1); + + ENV_LEAVE(env, ip); + __dbt_userfree(env, key, NULL, data); + return (ret); +} +/* + * __partiton_get -- + * cursor get opeartion on a partitioned database. + * + * PUBLIC: int __partc_get __P((DBC*, DBT *, DBT *, u_int32_t)); + */ +int +__partc_get(dbc, key, data, flags) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; +{ + DB *dbp; + DBC *orig_dbc, *new_dbc; + DB_PARTITION *part; + PART_CURSOR *cp; + u_int32_t multi, part_id; + int ret, retry, search; + + dbp = dbc->dbp; + cp = (PART_CURSOR*)dbc->internal; + orig_dbc = cp->sub_cursor; + part = dbp->p_internal; + + new_dbc = NULL; + retry = search = 0; + part_id = cp->part_id; + multi = flags & ~DB_OPFLAGS_MASK; + + switch (flags & DB_OPFLAGS_MASK) { + case DB_CURRENT: + break; + case DB_FIRST: + part_id = 0; + retry = 1; + break; + case DB_GET_BOTH: + case DB_GET_BOTHC: + case DB_GET_BOTH_RANGE: + search = 1; + break; + case DB_SET_RANGE: + search = 1; + retry = 1; + break; + case DB_LAST: + part_id = part->nparts - 1; + retry = 1; + break; + case DB_NEXT: + case DB_NEXT_NODUP: + if (orig_dbc == NULL) + part_id = 0; + else + part_id = cp->part_id; + retry = 1; + break; + case DB_NEXT_DUP: + break; + case DB_PREV: + case DB_PREV_NODUP: + if (orig_dbc == NULL) + part_id = part->nparts - 1; + else + part_id = cp->part_id; + retry = 1; + break; + case DB_PREV_DUP: + break; + case DB_SET: + search = 1; + break; + default: + return (__db_unknown_flag(dbp->env, "__partc_get", flags)); + } + + /* + * If we need to find the partition to start on, then + * do a binary search of the in memory partition table. 
+ */ + if (search == 1 && F_ISSET(part, PART_CALLBACK)) + part_id = part->callback(dbp, key) % part->nparts; + else if (search == 1) + __part_search(dbp, part, key, &part_id); + + /* Get a new cursor if necessary */ + if (orig_dbc == NULL || cp->part_id != part_id) { + GET_PART_CURSOR(dbc, new_dbc, part_id); + } else + new_dbc = orig_dbc; + + while ((ret = __dbc_get(new_dbc, + key, data, flags)) == DB_NOTFOUND && retry == 1) { + switch (flags & DB_OPFLAGS_MASK) { + case DB_FIRST: + case DB_NEXT: + case DB_NEXT_NODUP: + case DB_SET_RANGE: + if (++part_id < part->nparts) { + flags = DB_FIRST | multi; + break; + } + goto err; + case DB_LAST: + case DB_PREV: + case DB_PREV_NODUP: + if (part_id-- > 0) { + flags = DB_LAST | multi; + break; + } + goto err; + default: + goto err; + } + + if (new_dbc != orig_dbc && (ret = __dbc_close(new_dbc)) != 0) + goto err; + GET_PART_CURSOR(dbc, new_dbc, part_id); + } + + if (ret != 0) + goto err; + + /* Success: swap original and new cursors. */ + if (new_dbc != orig_dbc) { + if (orig_dbc != NULL) { + cp->sub_cursor = NULL; + if ((ret = __dbc_close(orig_dbc)) != 0) + goto err; + } + cp->sub_cursor = new_dbc; + cp->part_id = part_id; + } + + return (0); + +err: if (new_dbc != NULL && new_dbc != orig_dbc) + (void)__dbc_close(new_dbc); + return (ret); +} + +/* + * __partc_put -- + * cursor put opeartion on a partitioned cursor. 
+ * + */ +static int +__partc_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key, *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + DB *dbp; + DB_PARTITION *part; + DBC *new_dbc; + PART_CURSOR *cp; + u_int32_t part_id; + int ret; + + dbp = dbc->dbp; + cp = (PART_CURSOR*)dbc->internal; + part_id = cp->part_id; + part = dbp->p_internal; + *pgnop = PGNO_INVALID; + + switch (flags) { + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_NODUPDATA: + case DB_NOOVERWRITE: + case DB_OVERWRITE_DUP: + if (F_ISSET(part, PART_CALLBACK)) { + part_id = part->callback(dbp, key) % part->nparts; + break; + } + __part_search(dbp, part, key, &part_id); + break; + default: + break; + } + + if ((new_dbc = cp->sub_cursor) == NULL || cp->part_id != part_id) { + if ((ret = __db_cursor_int(part->handles[part_id], + dbc->thread_info, dbc->txn, part->handles[part_id]->type, + PGNO_INVALID, 0, dbc->locker, &new_dbc)) != 0) + goto err; + } + + if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR)) + F_SET(new_dbc, DBC_WRITER); + if ((ret = __dbc_put(new_dbc, key, data, flags)) != 0) + goto err; + + if (new_dbc != cp->sub_cursor) { + if (cp->sub_cursor != NULL) { + if ((ret = __dbc_close(cp->sub_cursor)) != 0) + goto err; + cp->sub_cursor = NULL; + } + cp->sub_cursor = new_dbc; + cp->part_id = part_id; + } + + return (0); + +err: if (new_dbc != NULL && cp->sub_cursor != new_dbc) + (void)__dbc_close(new_dbc); + return (ret); +} + +/* + * __partc_del + * Delete interface to partitioned cursors. + * + */ +static int +__partc_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + PART_CURSOR *cp; + cp = (PART_CURSOR*)dbc->internal; + + if (F_ISSET(dbc, DBC_WRITER | DBC_WRITECURSOR)) + F_SET(cp->sub_cursor, DBC_WRITER); + return (__dbc_del(cp->sub_cursor, flags)); +} + +/* + * __partc_writelock + * Writelock interface to partitioned cursors. 
+ * + */ +static int +__partc_writelock(dbc) + DBC *dbc; +{ + PART_CURSOR *cp; + cp = (PART_CURSOR*)dbc->internal; + + return (cp->sub_cursor->am_writelock(cp->sub_cursor)); +} + +/* + * __partc_close + * Close interface to partitioned cursors. + * + */ +static int +__partc_close(dbc, root_pgno, rmroot) + DBC *dbc; + db_pgno_t root_pgno; + int *rmroot; +{ + PART_CURSOR *cp; + int ret; + + COMPQUIET(root_pgno, 0); + COMPQUIET(rmroot, NULL); + + cp = (PART_CURSOR*)dbc->internal; + + if (cp->sub_cursor == NULL) + return (0); + ret = __dbc_close(cp->sub_cursor); + cp->sub_cursor = NULL; + return (ret); +} + +/* + * __partc_destroy -- + * Destroy a single cursor. + */ +static int +__partc_destroy(dbc) + DBC *dbc; +{ + PART_CURSOR *cp; + ENV *env; + + cp = (PART_CURSOR *)dbc->internal; + env = dbc->env; + + /* Discard the structure. Don't recurse. */ + __os_free(env, cp); + + return (0); +} + +/* + * __partiton_close + * Close a partitioned database. + * + * PUBLIC: int __partition_close __P((DB *, DB_TXN *, u_int32_t)); + */ +int +__partition_close(dbp, txn, flags) + DB *dbp; + DB_TXN *txn; + u_int32_t flags; +{ + DB **pdbp; + DB_PARTITION *part; + ENV *env; + u_int32_t i; + int ret, t_ret; + + if ((part = dbp->p_internal) == NULL) + return (0); + + env = dbp->env; + ret = 0; + + if ((pdbp = part->handles) != NULL) { + for (i = 0; i < part->nparts; i++, pdbp++) + if (*pdbp != NULL && (t_ret = + __db_close(*pdbp, txn, flags)) != 0 && ret == 0) + ret = t_ret; + __os_free(env, part->handles); + } + if (part->dirs != NULL) + __os_free(env, (char **)part->dirs); + if (part->data != NULL) + __os_free(env, (char **)part->data); + __os_free(env, part); + dbp->p_internal = NULL; + + return (ret); +} + +/* + * __partiton_sync + * Sync a partitioned database. 
+ * + * PUBLIC: int __partition_sync __P((DB *)); + */ +int +__partition_sync(dbp) + DB *dbp; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret, t_ret; + + ret = 0; + part = dbp->p_internal; + + if ((pdbp = part->handles) != NULL) { + for (i = 0; i < part->nparts; i++, pdbp++) + if (*pdbp != NULL && + F_ISSET(*pdbp, DB_AM_OPEN_CALLED) && (t_ret = + __memp_fsync((*pdbp)->mpf)) != 0 && ret == 0) + ret = t_ret; + } + if ((t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __partiton_stat + * Stat a partitioned database. + * + * PUBLIC: int __partition_stat __P((DBC *, void *, u_int32_t)); + */ +int +__partition_stat(dbc, spp, flags) + DBC *dbc; + void *spp; + u_int32_t flags; +{ + DB *dbp, **pdbp; + DB_BTREE_STAT *fsp, *bsp; +#ifdef HAVE_HASH + DB_HASH_STAT *hfsp, *hsp; +#endif + DB_PARTITION *part; + DBC *new_dbc; + ENV *env; + u_int32_t i; + int ret; + + dbp = dbc->dbp; + part = dbp->p_internal; + env = dbp->env; + fsp = NULL; +#ifdef HAVE_HASH + hfsp = NULL; +#endif + + pdbp = part->handles; + for (i = 0; i < part->nparts; i++, pdbp++) { + if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn, + (*pdbp)->type, PGNO_INVALID, + 0, dbc->locker, &new_dbc)) != 0) + goto err; + switch (new_dbc->dbtype) { + case DB_BTREE: + if ((ret = __bam_stat(new_dbc, &bsp, flags)) != 0) + goto err; + if (fsp == NULL) { + fsp = bsp; + *(DB_BTREE_STAT **)spp = fsp; + } else { + fsp->bt_nkeys += bsp->bt_nkeys; + fsp->bt_ndata += bsp->bt_ndata; + fsp->bt_pagecnt += bsp->bt_pagecnt; + if (fsp->bt_levels < bsp->bt_levels) + fsp->bt_levels = bsp->bt_levels; + fsp->bt_int_pg += bsp->bt_int_pg; + fsp->bt_leaf_pg += bsp->bt_leaf_pg; + fsp->bt_dup_pg += bsp->bt_dup_pg; + fsp->bt_over_pg += bsp->bt_over_pg; + fsp->bt_free += bsp->bt_free; + fsp->bt_int_pgfree += bsp->bt_int_pgfree; + fsp->bt_leaf_pgfree += bsp->bt_leaf_pgfree; + fsp->bt_dup_pgfree += bsp->bt_dup_pgfree; + fsp->bt_over_pgfree += bsp->bt_over_pgfree; + __os_ufree(env, 
bsp); + } + break; +#ifdef HAVE_HASH + case DB_HASH: + if ((ret = __ham_stat(new_dbc, &hsp, flags)) != 0) + goto err; + if (hfsp == NULL) { + hfsp = hsp; + *(DB_HASH_STAT **)spp = hfsp; + } else { + hfsp->hash_nkeys += hsp->hash_nkeys; + hfsp->hash_ndata += hsp->hash_ndata; + hfsp->hash_pagecnt += hsp->hash_pagecnt; + hfsp->hash_ffactor += hsp->hash_ffactor; + hfsp->hash_buckets += hsp->hash_buckets; + hfsp->hash_free += hsp->hash_free; + hfsp->hash_bfree += hsp->hash_bfree; + hfsp->hash_bigpages += hsp->hash_bigpages; + hfsp->hash_big_bfree += hsp->hash_big_bfree; + hfsp->hash_overflows += hsp->hash_overflows; + hfsp->hash_ovfl_free += hsp->hash_ovfl_free; + hfsp->hash_dup += hsp->hash_dup; + hfsp->hash_dup_free += hsp->hash_dup_free; + __os_ufree(env, hsp); + } + break; +#endif + default: + break; + } + if ((ret = __dbc_close(new_dbc)) != 0) + goto err; + } + return (0); + +err: + if (fsp != NULL) + __os_ufree(env, fsp); + *(DB_BTREE_STAT **)spp = NULL; + return (ret); +} + +/* + * __part_truncate -- + * Truncate a database. 
+ * + * PUBLIC: int __part_truncate __P((DBC *, u_int32_t *)); + */ +int +__part_truncate(dbc, countp) + DBC *dbc; + u_int32_t *countp; +{ + DB *dbp, **pdbp; + DB_PARTITION *part; + DBC *new_dbc; + u_int32_t count, i; + int ret, t_ret; + + dbp = dbc->dbp; + part = dbp->p_internal; + pdbp = part->handles; + ret = 0; + + if (countp != NULL) + *countp = 0; + for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) { + if ((ret = __db_cursor_int(*pdbp, dbc->thread_info, dbc->txn, + (*pdbp)->type, PGNO_INVALID, + 0, dbc->locker, &new_dbc)) != 0) + break; + switch (dbp->type) { + case DB_BTREE: + case DB_RECNO: + ret = __bam_truncate(new_dbc, &count); + break; + case DB_HASH: +#ifdef HAVE_HASH + ret = __ham_truncate(new_dbc, &count); + break; +#endif + case DB_QUEUE: + case DB_UNKNOWN: + default: + ret = __db_unknown_type(dbp->env, + "DB->truncate", dbp->type); + count = 0; + break; + } + if ((t_ret = __dbc_close(new_dbc)) != 0 && ret == 0) + ret = t_ret; + if (countp != NULL) + *countp += count; + } + + return (ret); +} +/* + * __part_compact -- compact a partitioned database. + * + * PUBLIC: int __part_compact __P((DB *, DB_THREAD_INFO *, DB_TXN *, + * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *)); + */ +int +__part_compact(dbp, ip, txn, start, stop, c_data, flags, end) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DBT *start, *stop; + DB_COMPACT *c_data; + u_int32_t flags; + DBT *end; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret; + + part = dbp->p_internal; + pdbp = part->handles; + ret = 0; + + for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) { + switch (dbp->type) { + case DB_HASH: + if (!LF_ISSET(DB_FREELIST_ONLY)) + goto err; + /* FALLTHROUGH */ + case DB_BTREE: + case DB_RECNO: + ret = __bam_compact(*pdbp, + ip, txn, start, stop, c_data, flags, end); + break; + + default: + err: ret = __dbh_am_chk(dbp, DB_OK_BTREE); + break; + } + } + return (ret); +} + +/* + * __part_lsn_reset -- + * reset the lsns on each partition. 
+ * + * PUBLIC: int __part_lsn_reset __P((DB *, DB_THREAD_INFO *)); + */ +int +__part_lsn_reset(dbp, ip) + DB *dbp; + DB_THREAD_INFO *ip; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret; + + part = dbp->p_internal; + pdbp = part->handles; + ret = 0; + + for (i = 0; ret == 0 && i < part->nparts; i++, pdbp++) + ret = __db_lsn_reset((*pdbp)->mpf, ip); + + return (ret); +} + +/* + * __part_fileid_reset -- + * reset the fileid on each partition. + * + * PUBLIC: int __part_fileid_reset + * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int)); + */ +int +__part_fileid_reset(env, ip, fname, nparts, encrypted) + ENV *env; + DB_THREAD_INFO *ip; + const char *fname; + u_int32_t nparts; + int encrypted; +{ + int ret; + u_int32_t part_id; + char *name, *sp; + const char *np; + + if ((ret = __os_malloc(env, + strlen(fname) + PART_LEN + 1, &name)) != 0) { + __db_errx(env, Alloc_err, strlen(fname) + PART_LEN + 1); + return (ret); + } + + sp = name; + np = __db_rpath(fname); + if (np == NULL) + np = fname; + else { + np++; + (void)strncpy(name, fname, (size_t)(np - fname)); + sp = name + (np - fname); + } + + for (part_id = 0; ret == 0 && part_id < nparts; part_id++) { + (void)sprintf(sp, PART_NAME, np, part_id); + ret = __env_fileid_reset(env, ip, sp, encrypted); + } + + __os_free(env, name); + return (ret); +} +#ifndef HAVE_BREW +/* + * __part_key_range -- + * Return proportion of keys relative to given key. 
+ * + * PUBLIC: int __part_key_range __P((DBC *, DBT *, DB_KEY_RANGE *, u_int32_t)); + */ +int +__part_key_range(dbc, dbt, kp, flags) + DBC *dbc; + DBT *dbt; + DB_KEY_RANGE *kp; + u_int32_t flags; +{ + BTREE_CURSOR *cp; + DBC *new_dbc; + DB_PARTITION *part; + PAGE *h; + u_int32_t id, part_id; + u_int32_t elems, empty, less_elems, my_elems, greater_elems; + u_int32_t levels, max_levels, my_levels; + int ret; + double total_elems; + + COMPQUIET(flags, 0); + + part = dbc->dbp->p_internal; + + /* + * First we find the key range for the partition that contains the + * key. Then we scale based on estimates of the other partitions. + */ + if (F_ISSET(part, PART_CALLBACK)) + part_id = part->callback(dbc->dbp, dbt) % part->nparts; + else + __part_search(dbc->dbp, part, dbt, &part_id); + GET_PART_CURSOR(dbc, new_dbc, part_id); + + if ((ret = __bam_key_range(new_dbc, dbt, kp, flags)) != 0) + goto err; + + cp = (BTREE_CURSOR *)new_dbc->internal; + + if ((ret = __memp_fget(new_dbc->dbp->mpf, + &cp->root, new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0) + goto c_err; + + my_elems = NUM_ENT(h); + my_levels = LEVEL(h); + max_levels = my_levels; + + if ((ret = __memp_fput(new_dbc->dbp->mpf, + new_dbc->thread_info, h, new_dbc->priority)) != 0) + goto c_err; + + if ((ret = __dbc_close(new_dbc)) != 0) + goto err; + /* + * We have the range within one subtree. Now estimate + * what part of the whole range that subtree is. Figure + * out how many levels each part has and how many entries + * in the level below the root. 
+ */ + empty = less_elems = greater_elems = 0; + for (id = 0; id < part->nparts; id++) { + if (id == part_id) { + empty = 0; + continue; + } + GET_PART_CURSOR(dbc, new_dbc, id); + cp = (BTREE_CURSOR *)new_dbc->internal; + if ((ret = __memp_fget(new_dbc->dbp->mpf, &cp->root, + new_dbc->thread_info, new_dbc->txn, 0, &h)) != 0) + goto c_err; + + elems = NUM_ENT(h); + levels = LEVEL(h); + if (levels == 1) + elems /= 2; + + if ((ret = __memp_fput(new_dbc->dbp->mpf, + new_dbc->thread_info, h, new_dbc->priority)) != 0) + goto c_err; + + if ((ret = __dbc_close(new_dbc)) != 0) + goto err; + + /* If the tree is empty, ignore it. */ + if (elems == 0) { + empty++; + continue; + } + + /* + * If a tree has fewer levels than the max just count + * it as a single element in the higher level. + */ + if (id < part_id) { + if (levels > max_levels) { + max_levels = levels; + less_elems = id + elems - empty; + } else if (levels < max_levels) + less_elems++; + else + less_elems += elems; + } else { + if (levels > max_levels) { + max_levels = levels; + greater_elems = (id - part_id) + elems - empty; + } else if (levels < max_levels) + greater_elems++; + else + greater_elems += elems; + } + + } + + if (my_levels < max_levels) { + /* + * The subtree containing the key is not the tallest one. + * Reduce its share by the number of records at the highest + * level. Scale the greater and lesser components up + * by the number of records on either side of this + * subtree. + */ + total_elems = 1 + greater_elems + less_elems; + kp->equal /= total_elems; + kp->less /= total_elems; + kp->less += less_elems/total_elems; + kp->greater /= total_elems; + kp->greater += greater_elems/total_elems; + } else if (my_levels == max_levels) { + /* + * The key is in one of the tallest subtrees. We will + * scale the values by the ratio of the records at the + * top of this stubtree to the number of records at the + * highest level. 
+ */ + total_elems = greater_elems + less_elems; + if (total_elems != 0) { + /* + * First scale down by the fraction of elements + * in this subtree. + */ + total_elems += my_elems; + kp->equal *= my_elems; + kp->equal /= total_elems; + kp->less *= my_elems; + kp->less /= total_elems; + kp->greater *= my_elems; + kp->greater /= total_elems; + /* + * Proportially add weight from the subtrees to the + * left and right of this one. + */ + kp->less += less_elems / total_elems; + kp->greater += greater_elems / total_elems; + } + } + + if (0) { +c_err: (void)__dbc_close(new_dbc); + } + +err: return (ret); +} +#endif + +/* + * __part_remove -- + * Remove method for a partitioned database. + * + * PUBLIC: int __part_remove __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, const char *, u_int32_t)); + */ +int +__part_remove(dbp, ip, txn, name, subdb, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name, *subdb; + u_int32_t flags; +{ + return (__part_rr(dbp, ip, txn, name, subdb, NULL, flags)); +} + +/* + * __part_rename -- + * Rename method for a partitioned database. + * + * PUBLIC: int __part_rename __P((DB *, DB_THREAD_INFO *, + * PUBLIC: DB_TXN *, const char *, const char *, const char *)); + */ +int +__part_rename(dbp, ip, txn, name, subdb, newname) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name, *subdb, *newname; +{ + return (__part_rr(dbp, ip, txn, name, subdb, newname, 0)); +} + +/* + * __part_rr -- + * Remove/Rename method for a partitioned database. 
+ */ +static int +__part_rr(dbp, ip, txn, name, subdb, newname, flags) + DB *dbp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + const char *name, *subdb, *newname; + u_int32_t flags; +{ + DB **pdbp, *ptmpdbp, *tmpdbp; + DB_PARTITION *part; + ENV *env; + u_int32_t i; + int ret, t_ret; + char *np; + + env = dbp->env; + ret = 0; + + if (subdb != NULL && name != NULL) { + __db_errx(env, + "A partitioned database can not be in a multiple databases file"); + return (EINVAL); + } + ENV_GET_THREAD_INFO(env, ip); + + /* + * Since rename no longer opens the database, we have + * to do it here. + */ + if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0) + return (ret); + + /* + * We need to make sure we don't self-deadlock, so give + * this dbp the same locker as the incoming one. + */ + tmpdbp->locker = dbp->locker; + if ((ret = __db_open(tmpdbp, ip, txn, name, NULL, dbp->type, + DB_RDWRMASTER | DB_RDONLY, 0, PGNO_BASE_MD)) != 0) + goto err; + + part = tmpdbp->p_internal; + pdbp = part->handles; + COMPQUIET(np, NULL); + if (newname != NULL && (ret = __os_malloc(env, + strlen(newname) + PART_LEN + 1, &np)) != 0) { + __db_errx(env, Alloc_err, strlen(newname) + PART_LEN + 1); + goto err; + } + for (i = 0; i < part->nparts; i++, pdbp++) { + if ((ret = __db_create_internal(&ptmpdbp, env, 0)) != 0) + break; + ptmpdbp->locker = (*pdbp)->locker; + if (newname == NULL) + ret = __db_remove_int(ptmpdbp, + ip, txn, (*pdbp)->fname, NULL, flags); + else { + DB_ASSERT(env, np != NULL); + (void)sprintf(np, PART_NAME, newname, i); + ret = __db_rename_int(ptmpdbp, + ip, txn, (*pdbp)->fname, NULL, np); + } + ptmpdbp->locker = NULL; + (void)__db_close(ptmpdbp, NULL, DB_NOSYNC); + if (ret != 0) + break; + } + + if (newname != NULL) + __os_free(env, np); + + if (!F_ISSET(dbp, DB_AM_OPEN_CALLED)) { +err: /* + * Since we copied the locker ID from the dbp, we'd better not + * free it here. + */ + tmpdbp->locker = NULL; + + /* We need to remove the lock event we associated with this. 
*/ + if (txn != NULL) + __txn_remlock(env, + txn, &tmpdbp->handle_lock, DB_LOCK_INVALIDID); + + if ((t_ret = __db_close(tmpdbp, + txn, DB_NOSYNC)) != 0 && ret == 0) + ret = t_ret; + } + return (ret); +} +#ifdef HAVE_VERIFY +/* + * __part_verify -- + * Verify a partitioned database. + * + * PUBLIC: int __part_verify __P((DB *, VRFY_DBINFO *, const char *, + * PUBLIC: void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__part_verify(dbp, vdp, fname, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + const char *fname; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + BINTERNAL *lp, *rp; + DB **pdbp; + DB_PARTITION *part; + DBC *dbc; + DBT *key; + ENV *env; + DB_THREAD_INFO *ip; + u_int32_t i; + int ret, t_ret; + + env = dbp->env; + lp = rp = NULL; + dbc = NULL; + ip = vdp->thread_info; + + if (dbp->type == DB_BTREE) { + if ((ret = __bam_open(dbp, ip, + NULL, fname, PGNO_BASE_MD, flags)) != 0) + goto err; + } +#ifdef HAVE_HASH + else if ((ret = __ham_open(dbp, ip, + NULL, fname, PGNO_BASE_MD, flags)) != 0) + goto err; +#endif + + /* + * Initalize partition db handles and get the names. Set DB_RDWRMASTER + * because we may not have the partition callback, but we can still + * look at the structure of the tree. + */ + if ((ret = __partition_open(dbp, + ip, NULL, fname, dbp->type, flags | DB_RDWRMASTER, 0, 0)) != 0) + goto err; + part = dbp->p_internal; + + if (LF_ISSET(DB_SALVAGE)) { + /* If we are being aggressive we don't want to dump the keys. 
*/ + if (LF_ISSET(DB_AGGRESSIVE)) + dbp->p_internal = NULL; + ret = __db_prheader(dbp, + NULL, 0, 0, handle, callback, vdp, PGNO_BASE_MD); + dbp->p_internal = part; + if (ret != 0) + goto err; + } + + if ((ret = __db_cursor(dbp, ip, NULL, &dbc, 0)) != 0) + goto err; + + pdbp = part->handles; + for (i = 0; i < part->nparts; i++, pdbp++) { + if (!F_ISSET(part, PART_RANGE) || part->keys == NULL) + goto vrfy; + if (lp != NULL) + __os_free(env, lp); + lp = rp; + rp = NULL; + if (i + 1 < part->nparts) { + key = &part->keys[i + 1]; + if ((ret = __os_malloc(env, + BINTERNAL_SIZE(key->size), &rp)) != 0) + goto err; + rp->len = key->size; + memcpy(rp->data, key->data, key->size); + B_TSET(rp->type, B_KEYDATA); + } +vrfy: if ((t_ret = __db_verify(*pdbp, ip, (*pdbp)->fname, + NULL, handle, callback, + lp, rp, flags | DB_VERIFY_PARTITION)) != 0 && ret == 0) + ret = t_ret; + } + +err: if (lp != NULL) + __os_free(env, lp); + if (rp != NULL) + __os_free(env, rp); + return (ret); +} +#endif + +#ifdef CONFIG_TEST +/* + * __part_testdocopy -- copy all partitions for testing purposes. + * + * PUBLIC: int __part_testdocopy __P((DB *, const char *)); + */ +int +__part_testdocopy(dbp, name) + DB *dbp; + const char *name; +{ + DB **pdbp; + DB_PARTITION *part; + u_int32_t i; + int ret; + + if ((ret = __db_testdocopy(dbp->env, name)) != 0) + return (ret); + + part = dbp->p_internal; + pdbp = part->handles; + for (i = 0; i < part->nparts; i++, pdbp++) + if ((ret = __db_testdocopy(dbp->env, (*pdbp)->fname)) != 0) + return (ret); + + return (0); +} +#endif +#else +/* + * __db_nopartition -- + * Error when a Berkeley DB build doesn't include partitioning. + * + * PUBLIC: int __db_no_partition __P((ENV *)); + */ +int +__db_no_partition(env) + ENV *env; +{ + __db_errx(env, + "library build did not include support for the database partitioning"); + return (DB_OPNOTSUP); +} +/* + * __partition_set -- + * Set the partitioning keys or callback function. 
+ * This routine must be called prior to creating the database. + * PUBLIC: int __partition_set __P((DB *, u_int32_t, DBT *, + * PUBLIC: u_int32_t (*callback)(DB *, DBT *key))); + */ + +int +__partition_set(dbp, parts, keys, callback) + DB *dbp; + u_int32_t parts; + DBT *keys; + u_int32_t (*callback)(DB *, DBT *key); +{ + COMPQUIET(parts, 0); + COMPQUIET(keys, NULL); + COMPQUIET(callback, NULL); + + return (__db_no_partition(dbp->env)); +} + +/* + * __partition_get_callback -- + * Set the partition callback function. This routine must be called + * prior to opening a partition database that requires a function. + * PUBLIC: int __partition_get_callback __P((DB *, + * PUBLIC: u_int32_t *, u_int32_t (**callback)(DB *, DBT *key))); + */ +int +__partition_get_callback(dbp, parts, callback) + DB *dbp; + u_int32_t *parts; + u_int32_t (**callback)(DB *, DBT *key); +{ + COMPQUIET(parts, NULL); + COMPQUIET(callback, NULL); + + return (__db_no_partition(dbp->env)); +} + +/* + * __partition_get_dirs -- + * Get partition dirs. + * PUBLIC: int __partition_get_dirs __P((DB *, const char ***)); + */ +int +__partition_get_dirs(dbp, dirpp) + DB *dbp; + const char ***dirpp; +{ + COMPQUIET(dirpp, NULL); + return (__db_no_partition(dbp->env)); +} + +/* + * __partition_get_keys -- + * Get partition keys. + * PUBLIC: int __partition_get_keys __P((DB *, u_int32_t *, DBT **)); + */ +int +__partition_get_keys(dbp, parts, keys) + DB *dbp; + u_int32_t *parts; + DBT **keys; +{ + COMPQUIET(parts, NULL); + COMPQUIET(keys, NULL); + + return (__db_no_partition(dbp->env)); +} +/* + * __partition_init -- + * Initialize the partition structure. + * Called when the meta data page is read in during database open or + * when partition keys or a callback are set. 
+ * + * PUBLIC: int __partition_init __P((DB *, u_int32_t)); + */ +int +__partition_init(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_no_partition(dbp->env)); +} +/* + * __part_fileid_reset -- + * reset the fileid on each partition. + * + * PUBLIC: int __part_fileid_reset + * PUBLIC: __P((ENV *, DB_THREAD_INFO *, const char *, u_int32_t, int)); + */ +int +__part_fileid_reset(env, ip, fname, nparts, encrypted) + ENV *env; + DB_THREAD_INFO *ip; + const char *fname; + u_int32_t nparts; + int encrypted; +{ + COMPQUIET(ip, NULL); + COMPQUIET(fname, NULL); + COMPQUIET(nparts, 0); + COMPQUIET(encrypted, 0); + + return (__db_no_partition(env)); +} +/* + * __partition_set_dirs -- + * Set the directories for creating the partition databases. + * They must be in the environment. + * PUBLIC: int __partition_set_dirs __P((DB *, const char **)); + */ +int +__partition_set_dirs(dbp, dirp) + DB *dbp; + const char **dirp; +{ + COMPQUIET(dirp, NULL); + + return (__db_no_partition(dbp->env)); +} +#endif |