summaryrefslogtreecommitdiff
path: root/db
diff options
context:
space:
mode:
author: jbj <devnull@localhost> 2002-08-13 21:30:18 +0000
committer: jbj <devnull@localhost> 2002-08-13 21:30:18 +0000
commit: ee6c74d4c244b4d19372d119ad61a9116dcde07d (patch)
tree: 64ca3a30b21fe2cee1bbeddbaa4ea2648a16425c /db
parent: 48a2529ce4b3ab1f677d3de9b70e8cbe14d910c6 (diff)
download: rpm-ee6c74d4c244b4d19372d119ad61a9116dcde07d.tar.gz
rpm-ee6c74d4c244b4d19372d119ad61a9116dcde07d.tar.bz2
rpm-ee6c74d4c244b4d19372d119ad61a9116dcde07d.zip
Avoid DB_RECOVER deadlock (#70362).
CVS patchset: 5626 CVS date: 2002/08/13 21:30:18
Diffstat (limited to 'db')
-rw-r--r-- db/mp/mp_bh.c 224
1 files changed, 126 insertions, 98 deletions
diff --git a/db/mp/mp_bh.c b/db/mp/mp_bh.c
index e802b165b..5c438b202 100644
--- a/db/mp/mp_bh.c
+++ b/db/mp/mp_bh.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2001
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_bh.c,v 11.25 2001/01/10 04:50:53 ubell Exp $";
+static const char revid[] = "Id: mp_bh.c,v 11.45 2001/07/26 19:53:31 bostic Exp ";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -30,19 +30,22 @@ static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
* Write the page associated with a given bucket header.
*
* PUBLIC: int __memp_bhwrite
- * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int *, int *));
+ * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int, int *, int *));
*/
int
-__memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
+__memp_bhwrite(dbmp, mfp, bhp, open_extents, restartp, wrotep)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
BH *bhp;
- int *restartp, *wrotep;
+ int open_extents, *restartp, *wrotep;
{
+ DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
DB_MPREG *mpreg;
int incremented, ret;
+ dbenv = dbmp->dbenv;
+
if (restartp != NULL)
*restartp = 0;
if (wrotep != NULL)
@@ -66,13 +69,13 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* If we find a descriptor on the file that's not open for writing, we
* try and upgrade it to make it writeable. If that fails, we're done.
*/
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
if (dbmfp->mfp == mfp) {
if (F_ISSET(dbmfp, MP_READONLY) &&
__memp_upgrade(dbmp, dbmfp, mfp)) {
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (0);
}
@@ -84,7 +87,7 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
incremented = 1;
break;
}
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (dbmfp != NULL)
goto found;
@@ -94,21 +97,22 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* trying to do that. First, if we have different privileges than the
* process that "owns" the temporary file, we might create the backing
* disk file such that the owning process couldn't read/write its own
- * buffers, e.g., memp_trickle() running as root creating a file owned
+ * buffers, e.g., memp_trickle running as root creating a file owned
* as root, mode 600. Second, if the temporary file has already been
* created, we don't have any way of finding out what its real name is,
* and, even if we did, it was already unlinked (so that it won't be
* left if the process dies horribly). This decision causes a problem,
* however: if the temporary file consumes the entire buffer cache,
* and the owner doesn't flush the buffers to disk, we could end up
- * with resource starvation, and the memp_trickle() thread couldn't do
+ * with resource starvation, and the memp_trickle thread couldn't do
* anything about it. That's a pretty unlikely scenario, though.
*
* Note that we should never get here when the temporary file
* in question has already been closed in another process, in which
* case it should be marked MP_DEADFILE.
*/
- if (F_ISSET(mfp, MP_TEMP)) {
+ if (F_ISSET(mfp, MP_TEMP)
+ || (F_ISSET(mfp, MP_EXTENT) && !open_extents)) {
DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE));
return (0);
}
@@ -120,12 +124,12 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* nothing we can do.
*/
if (mfp->ftype != 0) {
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
for (mpreg = LIST_FIRST(&dbmp->dbregq);
mpreg != NULL; mpreg = LIST_NEXT(mpreg, q))
if (mpreg->ftype == mfp->ftype)
break;
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (mpreg == NULL)
return (0);
}
@@ -138,16 +142,24 @@ __memp_bhwrite(dbmp, mfp, bhp, restartp, wrotep)
* There's no negative cache, so we may repeatedly try and open files
* that we have previously tried (and failed) to open.
*/
- if (__memp_fopen(dbmp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
- 0, 0, mfp->stat.st_pagesize, 0, NULL, &dbmfp) != 0)
+ if (dbenv->memp_fcreate(dbenv, &dbmfp, 0) != 0)
+ return (0);
+ if (__memp_fopen_int(dbmfp, mfp,
+ R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize, 0) != 0) {
+ (void)dbmfp->close(dbmfp, 0);
return (0);
+ }
+ F_SET(dbmfp, MP_FLUSH);
+ if (F_ISSET(mfp, MP_EXTENT))
+ dbmp->extents = 1;
found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
if (incremented) {
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
--dbmfp->ref;
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
}
return (ret);
@@ -177,6 +189,7 @@ __memp_pgread(dbmfp, bhp, can_create)
dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
pagesize = mfp->stat.st_pagesize;
+ created = 0;
F_SET(bhp, BH_LOCKED | BH_TRASH);
MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
@@ -187,59 +200,55 @@ __memp_pgread(dbmfp, bhp, can_create)
* them now, we create them when the pages have to be flushed.
*/
nr = 0;
- if (F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
- /*
- * Ignore read errors if we have permission to create the page.
- * Assume that the page doesn't exist, and that we'll create it
- * when we write it out.
- *
- * XXX
- * Theoretically, we could overwrite a page of data if it were
- * possible for a file to be successfully opened for reading
- * and then for the read to fail. Shouldn't ever happen, but
- * it might be worth checking to see if the offset is past the
- * known end-of-file.
- */
- db_io.fhp = &dbmfp->fh;
+ if (F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
+ db_io.fhp = dbmfp->fhp;
db_io.mutexp = dbmfp->mutexp;
db_io.pagesize = db_io.bytes = pagesize;
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
- ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr);
+ /*
+ * The page may not exist; if it doesn't, nr may well be 0,
+ * but we expect the underlying OS calls not to return an
+ * error code in this case.
+ */
+ if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0)
+ goto err;
} else
ret = 0;
- created = 0;
if (nr < pagesize) {
if (can_create)
created = 1;
else {
/*
- * If we had a short read, ret may be 0. This may not
- * be an error -- in particular DB recovery processing
- * may request pages that have never been written to
- * disk, in which case we won't find the page. So, the
- * caller must know how to handle the error.
+ * Don't output error messages for short reads. In
+ * particular, DB recovery processing may request pages
+ * that have never been written to disk or for which
+ * only some part has been written to disk, in which
+ * case we won't find the page. The caller must know
+ * how to handle the error.
*/
- if (ret == 0)
- ret = EIO;
+ ret = DB_PAGE_NOTFOUND;
goto err;
}
- }
-
- /*
- * Clear any bytes we didn't read that need to be cleared. If we're
- * running in diagnostic mode, smash any bytes on the page that are
- * unknown quantities for the caller.
- */
- if (nr != pagesize) {
+ /*
+ * Clear any bytes that need to be cleared -- if we did a short
+ * read, we assume that a page was not completely written and
+ * clear even the bytes that we read. This is so our caller
+ * isn't surprised (for example, if the first sector only of a
+ * DB page was written, the LSN will indicate that the page was
+ * updated, but the page contents will be wrong). Support for
+ * page checksums might make this unnecessary in the future --
+ * I would prefer not to discard data potentially written by
+ * the application, under any circumstances.
+ *
+ * If we're running in diagnostic mode, corrupt any bytes on
+ * the page that are unknown quantities for the caller.
+ */
len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
- if (nr < len)
- memset(bhp->buf + nr, 0, len - nr);
-#ifdef DIAGNOSTIC
- if (nr > len)
- len = nr;
+ memset(bhp->buf, 0, len);
+#if defined(DIAGNOSTIC) || defined(UMRW)
if (len < pagesize)
memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
#endif
@@ -303,34 +312,19 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
*wrotep = 0;
callpgin = 0;
- /*
- * Check the dirty bit -- this buffer may have been written since we
- * decided to write it.
- */
- if (!F_ISSET(bhp, BH_DIRTY)) {
- if (wrotep != NULL)
- *wrotep = 1;
- return (0);
- }
-
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
+ /* We should never be called with a clean or a locked buffer. */
+ DB_ASSERT(F_ISSET(bhp, BH_DIRTY));
+ DB_ASSERT(!F_ISSET(bhp, BH_LOCKED));
/*
- * If there were two writers, we may have just been waiting while the
- * other writer completed I/O on this buffer. Check the dirty bit one
- * more time.
+ * Lock the buffer, set the I/O in progress flag, and discard the
+ * region lock.
*/
- if (!F_ISSET(bhp, BH_DIRTY)) {
- MUTEX_UNLOCK(dbenv, &bhp->mutex);
-
- if (wrotep != NULL)
- *wrotep = 1;
- return (0);
- }
-
+ MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
F_SET(bhp, BH_LOCKED);
R_UNLOCK(dbenv, dbmp->reginfo);
+ /* Tell the caller that the region lock was discarded. */
if (restartp != NULL)
*restartp = 1;
@@ -347,20 +341,53 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
goto file_dead;
/*
- * Ensure the appropriate log records are on disk. If the page is
- * being written as part of a sync operation, the flush has already
- * been done, unless it was written by the application *after* the
- * sync was scheduled.
+ * If the page is in a file for which we have LSN information, we have
+ * to ensure the appropriate log records are on disk. If the page is
+ * being written as part of a sync operation, the flush has been done
+ * already, unless it was modified by the application *after* the sync
+ * was scheduled.
*/
- if (LOGGING_ON(dbenv) &&
+ if (LOGGING_ON(dbenv) && !IS_RECOVERING(dbenv) && mfp->lsn_off != -1 &&
(!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) {
memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
- if ((ret = log_flush(dbenv, &lsn)) != 0)
+ if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0)
goto err;
}
- DB_ASSERT(!LOGGING_ON(dbenv) ||
- log_compare(&((LOG *)((DB_LOG *)
- dbenv->lg_handle)->reginfo.primary)->s_lsn, &LSN(bhp->buf)) > 0);
+
+#ifdef DIAGNOSTIC
+ /*
+ * Verify write-ahead logging semantics.
+ *
+ * !!!
+ * One special case. There is a single field on the meta-data page,
+ * the last-page-number-in-the-file field, for which we do not log
+ * changes. So, if the page was originally created in a database that
+ * didn't have logging turned on, we can see a page marked dirty but
+ * for which no corresponding log record has been written. However,
+ * the only way that a page can be created for which there isn't a
+ * previous log record and valid LSN is when the page was created
+ * without logging turned on, and so we check for that special-case
+ * LSN value.
+ */
+ if (LOGGING_ON(dbenv) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf))) {
+ /*
+ * There is a potential race here. If we are in the midst of
+ * switching log files, it's possible we could test against the
+ * old file and the new offset in the log region's LSN. If we
+ * fail the first test, acquire the log mutex and check again.
+ */
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = dbenv->lg_handle;
+ lp = dblp->reginfo.primary;
+ if (log_compare(&lp->s_lsn, &LSN(bhp->buf)) <= 0) {
+ R_LOCK(dbenv, &dblp->reginfo);
+ DB_ASSERT(log_compare(&lp->s_lsn, &LSN(bhp->buf)) > 0);
+ R_UNLOCK(dbenv, &dblp->reginfo);
+ }
+ }
+#endif
/*
* Call any pgout function. We set the callpgin flag so that we flag
@@ -376,13 +403,13 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
}
/* Temporary files may not yet have been created. */
- if (!F_ISSET(&dbmfp->fh, DB_FH_VALID)) {
+ if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
- if (!F_ISSET(&dbmfp->fh, DB_FH_VALID) &&
+ if (!F_ISSET(dbmfp->fhp, DB_FH_VALID) &&
((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
- &dbmfp->fh, NULL)) != 0 ||
- !F_ISSET(&dbmfp->fh, DB_FH_VALID))) {
+ dbmfp->fhp, NULL)) != 0 ||
+ !F_ISSET(dbmfp->fhp, DB_FH_VALID))) {
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
__db_err(dbenv,
"unable to create temporary backing file");
@@ -392,13 +419,12 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
}
/* Write the page. */
- db_io.fhp = &dbmfp->fh;
+ db_io.fhp = dbmfp->fhp;
db_io.mutexp = dbmfp->mutexp;
db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
- ret = __db_panic(dbenv, ret);
fail = "write";
goto syserr;
}
@@ -427,7 +453,7 @@ file_dead:
*/
if (callpgin)
F_SET(bhp, BH_CALLPGIN);
- F_CLR(bhp, BH_DIRTY | BH_LOCKED);
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED);
/*
* If we write a buffer for which a checkpoint is waiting, update
@@ -447,6 +473,7 @@ file_dead:
/* Update the page clean/dirty statistics. */
c_mp = BH_TO_CACHE(dbmp, bhp);
++c_mp->stat.st_page_clean;
+ DB_ASSERT(c_mp->stat.st_page_dirty != 0);
--c_mp->stat.st_page_dirty;
/* Update I/O statistics. */
@@ -470,7 +497,7 @@ file_dead:
*/
if (dosync) {
R_UNLOCK(dbenv, dbmp->reginfo);
- syncfail = __os_fsync(dbenv, &dbmfp->fh) != 0;
+ syncfail = __os_fsync(dbenv, dbmfp->fhp) != 0;
R_LOCK(dbenv, dbmp->reginfo);
if (syncfail)
F_SET(mp, MP_LSN_RETRY);
@@ -602,14 +629,15 @@ __memp_bhfree(dbmp, bhp, free_mem)
if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
__memp_mf_discard(dbmp, mfp);
+ DB_ASSERT(c_mp->stat.st_page_clean != 0);
+ --c_mp->stat.st_page_clean;
+
/*
* If we're not reusing it immediately, free the buffer header
* and data for real.
*/
- if (free_mem) {
- --c_mp->stat.st_page_clean;
+ if (free_mem)
__db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp);
- }
}
/*
@@ -652,11 +680,11 @@ __memp_upgrade(dbmp, dbmfp, mfp)
ret = 1;
} else {
/* Swap the descriptors and set the upgrade flag. */
- (void)__os_closehandle(&dbmfp->fh);
- dbmfp->fh = fh;
+ (void)__os_closehandle(dbmfp->fhp);
+ *dbmfp->fhp = fh;
F_SET(dbmfp, MP_UPGRADE);
ret = 0;
}
- __os_freestr(rpath);
+ __os_freestr(dbmp->dbenv, rpath);
return (ret);
}