/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2001
 *	Sleepycat Software. All rights reserved.
 */
/*
 * Copyright (c) 1995, 1996
 *	The President and Fellows of Harvard University. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "db_config.h"

#ifndef lint
static const char revid[] = "Id: log_rec.c,v 11.69 2001/11/02 16:04:02 margo Exp ";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <string.h>
#endif

#include "db_int.h"
#include "db_page.h"
#include "db_am.h"
#include "log.h"

static int __log_check_master __P((DB_ENV *, u_int8_t *, char *));
static int __log_do_open __P((DB_ENV *, DB_LOG *,
    u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t, u_int32_t));
static int __log_open_file
    __P((DB_ENV *, DB_LOG *, __log_register_args *, u_int32_t));

/*
 * __log_register_recover --
 *	Recovery function for log_register records.  Depending on the
 *	record's opcode and the recovery operation, this opens the file the
 *	record refers to, closes it, or does nothing.
 *
 *	Returns 0 on success, non-zero on error.
 *
 * PUBLIC: int __log_register_recover
 * PUBLIC:     __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
 */
int
__log_register_recover(dbenv, dbtp, lsnp, op, info)
	DB_ENV *dbenv;
	DBT *dbtp;
	DB_LSN *lsnp;
	db_recops op;
	void *info;
{
	DB_ENTRY *dbe;
	DB_LOG *logp;
	DB *dbp;
	__log_register_args *argp;
	int do_rem, ret, t_ret;
	u_int32_t flags;

	logp = dbenv->lg_handle;
	dbp = NULL;
	/*
	 * The exit path tests argp against NULL, so it must have a defined
	 * value even when reading the record fails.
	 */
	argp = NULL;

#ifdef DEBUG_RECOVER
	REC_PRINT(__log_register_print);
#endif
	/*
	 * Note: lsnp is used in the diagnostic message below, so it must
	 * not be overwritten here to quiet the compiler.
	 */
	flags = 0;

	if ((ret = __log_register_read(dbenv, dbtp->data, &argp)) != 0)
		goto out;

	if ((argp->opcode == LOG_OPEN &&
	    (DB_REDO(op) ||
	    op == DB_TXN_OPENFILES || op == DB_TXN_POPENFILES)) ||
	    ((argp->opcode == LOG_CLOSE || argp->opcode == LOG_RCLOSE) &&
	    DB_UNDO(op))) {
		/*
		 * If we are redoing an open or undoing a close, then we need
		 * to open a file.  We must open the file even if
		 * the meta page is not yet written as we may be creating it.
		 */
		if (op == DB_TXN_OPENFILES)
			F_SET(logp, DBLOG_FORCE_OPEN);

		/*
		 * If we are applying a log_register record in replication,
		 * we may be doing so out-of-order with respect to
		 * a crdel_fileopen record that came before us in the log
		 * but was in a still-uncommitted transaction.  Let the
		 * underlying log_register know, so that it can postpone
		 * opening the file until it actually exists.
		 */
		if (F_ISSET(dbenv, DB_ENV_REP_CLIENT))
			flags = DB_APPLY_LOGREG;
		ret = __log_open_file(dbenv, logp, argp, flags);
		F_CLR(logp, DBLOG_FORCE_OPEN);
		if (ret == ENOENT || ret == EINVAL) {
			if ((op == DB_TXN_OPENFILES ||
			    op == DB_TXN_POPENFILES) &&
			    argp->name.size != 0 &&
			    (ret = __db_txnlist_delete(dbenv, info,
			    argp->name.data, argp->fileid, 0)) != 0)
				goto out;
			ret = 0;
		}
	} else if (argp->opcode == LOG_OPEN || argp->opcode == LOG_CLOSE ||
	    (argp->opcode == LOG_RCLOSE && op != DB_TXN_POPENFILES)) {
		/*
		 * If we are undoing an open, then we need to close the file.
		 *
		 * If the file is deleted, then we can just ignore this close.
		 * Otherwise, we should usually have a valid dbp we should
		 * close or whose reference count should be decremented.
		 * However, if we shut down without closing a file, we may, in
		 * fact, not have the file open, and that's OK.
		 */
		do_rem = 0;
		MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
		if (argp->fileid < logp->dbentry_cnt) {
			dbe = &logp->dbentry[argp->fileid];

			if (dbe->refcount != 1) {
				__db_err(dbenv,
				    "Improper file close. LSN: %lu/%lu.",
				    (u_long)lsnp->file, (u_long)lsnp->offset);
				ret = EINVAL;
				/* Don't leave with the thread mutex held. */
				MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
				goto out;
			}

			ret = __db_txnlist_close(info,
			    argp->fileid, dbe->count);
			if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL)
				(void)dbenv->log_unregister(dbenv, dbp);
			do_rem = 1;
		}
		MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
		if (do_rem) {
			(void)__log_rem_logid(logp, dbp, argp->fileid);
			/*
			 * If remove or rename has closed the file, don't
			 * sync.
			 */
			if (dbp != NULL &&
			    (t_ret = dbp->close(dbp,
			    dbp->mpf == NULL ? DB_NOSYNC : 0)) != 0 &&
			    ret == 0)
				ret = t_ret;
		}
	} else if (argp->opcode == LOG_CHECKPOINT &&
	    (DB_UNDO(op) ||
	    op == DB_TXN_OPENFILES || op == DB_TXN_POPENFILES)) {
		/*
		 * It's a checkpoint and we are rolling backward.  It
		 * is possible that the system was shut down and thus
		 * ended with a stable checkpoint; this file was never
		 * closed and has therefore not been reopened yet.  If
		 * so, we need to try to open it.
		 */
		ret = __log_open_file(dbenv, logp, argp, 0);
		if (ret == ENOENT || ret == EINVAL) {
			if (argp->name.size != 0 &&
			    (ret = __db_txnlist_delete(dbenv, info,
			    argp->name.data, argp->fileid, 0)) != 0)
				goto out;
			ret = 0;
		}
	}

out:	if (argp != NULL)
		__os_free(dbenv, argp, 0);
	return (ret);
}

/*
 * __log_open_file --
 *	Called during log_register recovery.  Make sure that we have an
 *	entry in the dbentry table for this ndx.  Returns 0 on success,
 *	non-zero on error.
 */
static int
__log_open_file(dbenv, lp, argp, flags)
	DB_ENV *dbenv;
	DB_LOG *lp;
	__log_register_args *argp;
	u_int32_t flags;
{
	DB_ENTRY *dbe;
	DB *dbp;

	/*
	 * We never re-open temporary files.  Temp files are only
	 * useful during aborts in which case the dbp was entered
	 * when the file was registered.  During recovery, we treat
	 * temp files as properly deleted files, allowing the open to
	 * fail and not reporting any errors when recovery fails to
	 * get a valid dbp from db_fileid_to_db.
	 */
	if (argp->name.size == 0) {
		(void)__log_add_logid(dbenv, lp, NULL, argp->fileid);
		return (ENOENT);
	}

	/*
	 * Because of reference counting, we cannot automatically close files
	 * during recovery, so when we're opening, we have to check that the
	 * name we are opening is what we expect.  If it's not, then we close
	 * the old file and open the new one.
	 */
	MUTEX_THREAD_LOCK(dbenv, lp->mutexp);
	if (argp->fileid < lp->dbentry_cnt)
		dbe = &lp->dbentry[argp->fileid];
	else
		dbe = NULL;

	if (dbe != NULL) {
		dbe->deleted = 0;
		if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) {
			if (dbp->meta_pgno != argp->meta_pgno ||
			    memcmp(dbp->fileid,
			    argp->uid.data, DB_FILE_ID_LEN) != 0) {
				MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
				goto reopen;
			}
			if (!F_ISSET(lp, DBLOG_RECOVER))
				dbe->refcount++;
			MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
			return (0);
		}
	}

	MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);

	/* Only reached via the goto above: drop the mismatched handle. */
	if (0) {
reopen:		(void)dbenv->log_unregister(dbenv, dbp);
		(void)__log_rem_logid(lp, dbp, argp->fileid);
		(void)dbp->close(dbp, 0);
	}

	return (__log_do_open(dbenv, lp,
	    argp->uid.data, argp->name.data,
	    argp->ftype, argp->fileid, argp->meta_pgno, flags));
}

/*
 * log_reopen_file -- close and reopen a db file.
 *	Must be called when a metadata page changes.
 *
 *	If name is NULL, the name is looked up in the log region by the
 *	unique file id and copied, so it can be used after the region lock
 *	is released.
 *
 * PUBLIC: int __log_reopen_file __P((DB_ENV *,
 * PUBLIC:     char *, int32_t, u_int8_t *, db_pgno_t, u_int32_t));
 *
 */
int
__log_reopen_file(dbenv, name, ndx, fileid, meta_pgno, flags)
	DB_ENV *dbenv;
	char *name;
	int32_t ndx;
	u_int8_t *fileid;
	db_pgno_t meta_pgno;
	u_int32_t flags;
{
	DB *dbp;
	DB_LOG *logp;
	DBTYPE ftype;
	FNAME *fnp;
	LOG *lp;
	char *tmp_name;
	int ret;

	logp = dbenv->lg_handle;
	tmp_name = NULL;

	if (name == NULL) {
		R_LOCK(dbenv, &logp->reginfo);

		lp = logp->reginfo.primary;

		for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
		    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
			if (fnp->ref == 0)	/* Entry not in use. */
				continue;
			if (memcmp(fnp->ufid, fileid, DB_FILE_ID_LEN) == 0)
				break;
		}

		if (fnp == NULL || fnp->name_off == INVALID_ROFF) {
			/* Release the region lock before reporting. */
			R_UNLOCK(dbenv, &logp->reginfo);
			__db_err(dbenv,
			    "metasub recover: non-existent file id");
			return (EINVAL);
		}

		name = R_ADDR(&logp->reginfo, fnp->name_off);
		ret = __os_strdup(dbenv, name, &tmp_name);
		R_UNLOCK(dbenv, &logp->reginfo);
		if (ret != 0)
			return (ret);
		name = tmp_name;
	}

	if ((ret = __db_fileid_to_db(dbenv, &dbp, ndx, 0)) != 0)
		goto out;
	ftype = dbp->type;
	(void)dbenv->log_unregister(dbenv, dbp);
	(void)__log_rem_logid(logp, dbp, ndx);
	(void)dbp->close(dbp, 0);

	ret = __log_do_open(dbenv,
	    logp, fileid, name, ftype, ndx, meta_pgno, flags);

	/* The private copy of the name is released on every exit path. */
out:	if (tmp_name != NULL)
		__os_free(dbenv, tmp_name, 0);
	return (ret);
}

/*
 * __log_do_open --
 *	Open files referenced in the log.  This is the part of the open that
 *	is not protected by the thread mutex.
 *
 *	Returns 0 on success, the db_create error if the handle cannot be
 *	created, and ENOENT otherwise (after recording the slot through
 *	__log_add_logid with a NULL handle).
 */
static int
__log_do_open(dbenv, lp, uid, name, ftype, ndx, meta_pgno, flags)
	DB_ENV *dbenv;
	DB_LOG *lp;
	u_int8_t *uid;
	char *name;
	DBTYPE ftype;
	int32_t ndx;
	db_pgno_t meta_pgno;
	u_int32_t flags;
{
	DB *dbp;
	int ret;
	u_int8_t zeroid[DB_FILE_ID_LEN];

	if ((ret = db_create(&dbp, lp->dbenv, 0)) != 0)
		return (ret);

	dbp->log_fileid = ndx;

	/*
	 * This is needed to signal to the locking routines called while
	 * opening databases that we are potentially undoing a transaction
	 * from an XA process.  Since the XA process does not share
	 * locks with the aborting transaction this prevents us from
	 * deadlocking during the open during rollback.
	 * Because this routine is called either during recovery or during an
	 * XA_ABORT, we can safely set DB_AM_RECOVER in the dbp since it
	 * will not be shared with other threads.
	 */
	F_SET(dbp, DB_AM_RECOVER);
	if (meta_pgno != PGNO_BASE_MD)
		memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
	dbp->type = ftype;
	if ((ret = __db_dbopen(dbp, name,
	    flags | DB_ODDFILESIZE, __db_omode("rw----"), meta_pgno)) == 0) {
		/*
		 * Verify that we are opening the same file that we were
		 * referring to when we wrote this log record.
		 */
		if (meta_pgno != PGNO_BASE_MD &&
		    __log_check_master(dbenv, uid, name) != 0)
			goto not_right;

		if (memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) {
			/* An all-zero file id in the handle is accepted. */
			memset(zeroid, 0, DB_FILE_ID_LEN);
			if (memcmp(dbp->fileid, zeroid, DB_FILE_ID_LEN) != 0)
				goto not_right;
			/*
			 * The ENOENT/DB_APPLY_LOGREG case below also enters
			 * here, to register the handle without an open file.
			 */
skipopen:		memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
		}
		if (IS_RECOVERING(dbenv) || LF_ISSET(DB_APPLY_LOGREG)) {
			/*
			 * If DB_APPLY_LOGREG is set, we want to register this
			 * log file with a specific fileid, but we don't want
			 * to log anything.  Pass the flag down into
			 * the log_register code.
			 */
			(void)__log_register_int(dbp->dbenv, dbp, name, flags);
			(void)__log_add_logid(dbenv, lp, dbp, ndx);
		}
		return (0);
	} else if (ret == ENOENT && LF_ISSET(DB_APPLY_LOGREG))
		goto skipopen;

not_right:
	(void)dbp->close(dbp, 0);
	(void)__log_add_logid(dbenv, lp, NULL, ndx);

	return (ENOENT);
}

/*
 * __log_check_master --
 *	Open the named file at PGNO_BASE_MD and compare its file id with
 *	uid.  Returns 0 if they match, EINVAL if they differ, or the error
 *	from creating/opening the handle.
 */
static int
__log_check_master(dbenv, uid, name)
	DB_ENV *dbenv;
	u_int8_t *uid;
	char *name;
{
	DB *dbp;
	int ret;

	if ((ret = db_create(&dbp, dbenv, 0)) != 0)
		return (ret);
	dbp->type = DB_BTREE;
	ret = __db_dbopen(dbp, name, 0, __db_omode("rw----"), PGNO_BASE_MD);

	if (ret == 0 && memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
		ret = EINVAL;

	(void)dbp->close(dbp, 0);
	return (ret);
}

/*
 * __log_add_logid --
 *	Adds a DB entry to the log's DB entry table.
 *
 *	A NULL dbp marks a previously empty slot as deleted.
 *
 * PUBLIC: int __log_add_logid __P((DB_ENV *, DB_LOG *, DB *, int32_t));
 */
int
__log_add_logid(dbenv, logp, dbp, ndx)
	DB_ENV *dbenv;
	DB_LOG *logp;
	DB *dbp;
	int32_t ndx;
{
	DB *dbtmp;
	int32_t i;
	int ret;

	ret = 0;

	MUTEX_THREAD_LOCK(dbenv, logp->mutexp);

	/*
	 * Check if we need to grow the table.  Note, ndx is 0-based (the
	 * index into the DB entry table) and dbentry_cnt is 1-based, the
	 * number of available slots.
	 */
	if (logp->dbentry_cnt <= ndx) {
		if ((ret = __os_realloc(dbenv,
		    (ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY),
		    &logp->dbentry)) != 0)
			goto err;

		/*
		 * We have moved the head of the queue.
		 * Fix up the queue header of an empty queue or the previous
		 * pointer of the first element.
		 */
		for (i = 0; i < logp->dbentry_cnt; i++) {
			if ((dbtmp =
			    TAILQ_FIRST(&logp->dbentry[i].dblist)) == NULL)
				TAILQ_INIT(&logp->dbentry[i].dblist);
			else
				TAILQ_REINSERT_HEAD(
				    &logp->dbentry[i].dblist, dbtmp, links);
		}

		/* Initialize the new entries. */
		for (i = logp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) {
			logp->dbentry[i].count = 0;
			TAILQ_INIT(&logp->dbentry[i].dblist);
			logp->dbentry[i].deleted = 0;
			logp->dbentry[i].refcount = 0;
		}

		logp->dbentry_cnt = i;
	}

	if (logp->dbentry[ndx].deleted == 0 &&
	    TAILQ_FIRST(&logp->dbentry[ndx].dblist) == NULL) {
		/* First use of this slot. */
		logp->dbentry[ndx].count = 0;
		if (dbp != NULL)
			TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist,
			    dbp, links);
		logp->dbentry[ndx].deleted = dbp == NULL;
		logp->dbentry[ndx].refcount = 1;
	} else if (!F_ISSET(logp, DBLOG_RECOVER)) {
		/* Slot already in use: add another reference. */
		if (dbp != NULL)
			TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist,
			    dbp, links);
		logp->dbentry[ndx].refcount++;
	}

err:	MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
	return (ret);
}

/*
 * __db_fileid_to_db --
 *	Return the DB corresponding to the specified fileid.
 *
 *	Returns 0 with *dbpp set on success, DB_DELETED if the entry is
 *	marked deleted (incrementing its count when inc is non-zero), and
 *	ENOENT or EINVAL on failure.
 *
 * PUBLIC: int __db_fileid_to_db __P((DB_ENV *, DB **, int32_t, int));
 */
int
__db_fileid_to_db(dbenv, dbpp, ndx, inc)
	DB_ENV *dbenv;
	DB **dbpp;
	int32_t ndx;
	int inc;
{
	DB_LOG *logp;
	FNAME *fname;
	int ret;
	char *name;

	ret = 0;
	logp = dbenv->lg_handle;

	MUTEX_THREAD_LOCK(dbenv, logp->mutexp);

	/*
	 * Under XA, a process different than the one issuing DB operations
	 * may abort a transaction.  In this case, recovery routines are run
	 * by a process that does not necessarily have the file open, so we
	 * must open the file explicitly.
	 */
	if (ndx >= logp->dbentry_cnt ||
	    (!logp->dbentry[ndx].deleted &&
	    TAILQ_FIRST(&logp->dbentry[ndx].dblist) == NULL)) {
		if (F_ISSET(logp, DBLOG_RECOVER)) {
			ret = ENOENT;
			goto err;
		}
		if (__log_lid_to_fname(logp, ndx, &fname) != 0) {
			/* Couldn't find entry; this is a fatal error. */
			__db_err(dbenv, "Missing log fileid entry");
			ret = EINVAL;
			goto err;
		}
		name = R_ADDR(&logp->reginfo, fname->name_off);

		/*
		 * __log_do_open is called without protection of the
		 * log thread lock.
		 */
		MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);

		/*
		 * At this point, we are not holding the thread lock, so exit
		 * directly instead of going through the exit code at the
		 * bottom.  If the __log_do_open succeeded, then we don't need
		 * to do any of the remaining error checking at the end of this
		 * routine.
		 */
		if ((ret = __log_do_open(dbenv, logp,
		    fname->ufid, name, fname->s_type,
		    ndx, fname->meta_pgno, 0)) != 0)
			return (ret);

		*dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist);
		return (0);
	}

	/*
	 * Return DB_DELETED if the file has been deleted (it's not an error).
	 */
	if (logp->dbentry[ndx].deleted) {
		ret = DB_DELETED;
		if (inc)
			logp->dbentry[ndx].count++;
		goto err;
	}

	/*
	 * Otherwise return 0, but if we don't have a corresponding DB
	 * that's not read-only, it's an error.
	 */
	if ((*dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL)
		ret = ENOENT;
	while (ret == 0 && F_ISSET(*dbpp, DB_AM_RDONLY))
		if ((*dbpp = TAILQ_NEXT(*dbpp, links)) == NULL)
			ret = ENOENT;

err:	MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
	return (ret);
}

/*
 * __log_close_files --
 *	Close files that were opened by the recovery daemon.  We sync the
 *	file, unless its mpf pointer has been NULLed by a db_remove or
 *	db_rename.  We may not have flushed the log_register record that
 *	closes the file.
 *
 * PUBLIC: void __log_close_files __P((DB_ENV *));
 */
void
__log_close_files(dbenv)
	DB_ENV *dbenv;
{
	DB_ENTRY *dbe;
	DB_LOG *logp;
	DB *dbp;
	int32_t i;

	logp = dbenv->lg_handle;
	MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
	for (i = 0; i < logp->dbentry_cnt; i++) {
		dbe = &logp->dbentry[i];
		/* Drain every handle queued on this entry. */
		while ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) {
			(void)dbenv->log_unregister(dbenv, dbp);
			TAILQ_REMOVE(&dbe->dblist, dbp, links);
			(void)dbp->close(dbp,
			    dbp->mpf == NULL ? DB_NOSYNC : 0);
		}
		dbe->deleted = 0;
		dbe->refcount = 0;
	}
	MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
}

/*
 * __log_rem_logid
 *	Remove an entry from the log table.  Find the appropriate DB and
 * unlink it from the linked list off the table.  If the DB is NULL, treat
 * this as a simple refcount decrement.
 *
 * PUBLIC: void __log_rem_logid __P((DB_LOG *, DB *, int32_t));
 */
void
__log_rem_logid(logp, dbp, ndx)
	DB_LOG *logp;
	DB *dbp;
	int32_t ndx;
{
	DB *xdbp;

	MUTEX_THREAD_LOCK(logp->dbenv, logp->mutexp);
	if (--logp->dbentry[ndx].refcount == 0) {
		/*
		 * Last reference: if the caller passed no handle, close
		 * the first one still queued, then reset the entry.
		 */
		if (dbp == NULL &&
		    (xdbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) != NULL)
			(void)xdbp->close(xdbp, 0);

		TAILQ_INIT(&logp->dbentry[ndx].dblist);
		logp->dbentry[ndx].deleted = 0;
	} else if (dbp != NULL)
		for (xdbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist);
		    xdbp != NULL;
		    xdbp = TAILQ_NEXT(xdbp, links))
			if (xdbp == dbp) {
				TAILQ_REMOVE(&logp->dbentry[ndx].dblist,
				    xdbp, links);
				break;
			}

	MUTEX_THREAD_UNLOCK(logp->dbenv, logp->mutexp);
}

/*
 * __log_lid_to_fname --
 *	Traverse the shared-memory region looking for the entry that
 *	matches the passed log fileid.  Returns 0 on success; -1 on error.
 *
 *	The region lock is released before returning, so *fnamep points
 *	into the region without the lock held.
 *
 * PUBLIC: int __log_lid_to_fname __P((DB_LOG *, int32_t, FNAME **));
 */
int
__log_lid_to_fname(dblp, lid, fnamep)
	DB_LOG *dblp;
	int32_t lid;
	FNAME **fnamep;
{
	DB_ENV *dbenv;
	FNAME *fnp;
	LOG *lp;

	dbenv = dblp->dbenv;
	lp = dblp->reginfo.primary;

	R_LOCK(dbenv, &dblp->reginfo);

	for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
	    fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
		if (fnp->ref == 0)	/* Entry not in use. */
			continue;
		if (fnp->id == lid) {
			*fnamep = fnp;
			R_UNLOCK(dbenv, &dblp->reginfo);
			return (0);
		}
	}

	R_UNLOCK(dbenv, &dblp->reginfo);
	return (-1);
}