summary | refs | log | tree | commit | diff
path: root/db/mp
diff options
context:
space:
mode:
Diffstat (limited to 'db/mp')
-rw-r--r-- db/mp/mp_bh.c 479
-rw-r--r-- db/mp/mp_fopen.c 845
2 files changed, 696 insertions, 628 deletions
diff --git a/db/mp/mp_bh.c b/db/mp/mp_bh.c
index 5c438b202..24f14ab1d 100644
--- a/db/mp/mp_bh.c
+++ b/db/mp/mp_bh.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2001
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "Id: mp_bh.c,v 11.45 2001/07/26 19:53:31 bostic Exp ";
+static const char revid[] = "Id: mp_bh.c,v 11.68 2002/05/03 15:21:16 bostic Exp ";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -18,43 +18,41 @@ static const char revid[] = "Id: mp_bh.c,v 11.45 2001/07/26 19:53:31 bostic Exp
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
-#include "log.h"
-#include "db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
+#include "dbinc/log.h"
+#include "dbinc/db_page.h"
+static int __memp_pgwrite
+ __P((DB_MPOOL *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *));
static int __memp_upgrade __P((DB_MPOOL *, DB_MPOOLFILE *, MPOOLFILE *));
/*
* __memp_bhwrite --
- * Write the page associated with a given bucket header.
+ * Write the page associated with a given buffer header.
*
- * PUBLIC: int __memp_bhwrite
- * PUBLIC: __P((DB_MPOOL *, MPOOLFILE *, BH *, int, int *, int *));
+ * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *,
+ * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int));
*/
int
-__memp_bhwrite(dbmp, mfp, bhp, open_extents, restartp, wrotep)
+__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents)
DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
MPOOLFILE *mfp;
BH *bhp;
- int open_extents, *restartp, *wrotep;
+ int open_extents;
{
DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
DB_MPREG *mpreg;
- int incremented, ret;
+ int local_open, incremented, ret;
dbenv = dbmp->dbenv;
-
- if (restartp != NULL)
- *restartp = 0;
- if (wrotep != NULL)
- *wrotep = 0;
- incremented = 0;
+ local_open = incremented = 0;
/*
- * If the file has been removed or is a closed temporary file, Jump
- * right ahead and pretend that we've found the file we want-- the
+ * If the file has been removed or is a closed temporary file, jump
+ * right ahead and pretend that we've found the file we want -- the
* page-write function knows how to handle the fact that we don't have
* (or need!) any real file descriptor information.
*/
@@ -74,25 +72,35 @@ __memp_bhwrite(dbmp, mfp, bhp, open_extents, restartp, wrotep)
dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
if (dbmfp->mfp == mfp) {
if (F_ISSET(dbmfp, MP_READONLY) &&
- __memp_upgrade(dbmp, dbmfp, mfp)) {
+ !F_ISSET(dbmfp, MP_UPGRADE) &&
+ (F_ISSET(dbmfp, MP_UPGRADE_FAIL) ||
+ __memp_upgrade(dbmp, dbmfp, mfp))) {
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
- return (0);
+ return (EPERM);
}
/*
* Increment the reference count -- see the comment in
- * memp_fclose().
+ * __memp_fclose_int().
*/
++dbmfp->ref;
incremented = 1;
break;
}
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
if (dbmfp != NULL)
goto found;
/*
* !!!
+ * It's the caller's choice if we're going to open extent files.
+ */
+ if (!open_extents && F_ISSET(mfp, MP_EXTENT))
+ return (EPERM);
+
+ /*
+ * !!!
* Don't try to attach to temporary files. There are two problems in
* trying to do that. First, if we have different privileges than the
* process that "owns" the temporary file, we might create the backing
@@ -107,15 +115,12 @@ __memp_bhwrite(dbmp, mfp, bhp, open_extents, restartp, wrotep)
* with resource starvation, and the memp_trickle thread couldn't do
* anything about it. That's a pretty unlikely scenario, though.
*
- * Note that we should never get here when the temporary file
- * in question has already been closed in another process, in which
- * case it should be marked MP_DEADFILE.
+ * Note we should never get here when the temporary file in question
+ * has already been closed in another process, in which case it should
+ * be marked MP_DEADFILE.
*/
- if (F_ISSET(mfp, MP_TEMP)
- || (F_ISSET(mfp, MP_EXTENT) && !open_extents)) {
- DB_ASSERT(!F_ISSET(mfp, MP_DEADFILE));
- return (0);
- }
+ if (F_ISSET(mfp, MP_TEMP))
+ return (EPERM);
/*
* It's not a page from a file we've opened. If the file requires
@@ -131,7 +136,7 @@ __memp_bhwrite(dbmp, mfp, bhp, open_extents, restartp, wrotep)
break;
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (mpreg == NULL)
- return (0);
+ return (EPERM);
}
/*
@@ -142,25 +147,24 @@ __memp_bhwrite(dbmp, mfp, bhp, open_extents, restartp, wrotep)
* There's no negative cache, so we may repeatedly try and open files
* that we have previously tried (and failed) to open.
*/
- if (dbenv->memp_fcreate(dbenv, &dbmfp, 0) != 0)
- return (0);
- if (__memp_fopen_int(dbmfp, mfp,
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+ return (ret);
+ if ((ret = __memp_fopen_int(dbmfp, mfp,
R_ADDR(dbmp->reginfo, mfp->path_off),
- 0, 0, mfp->stat.st_pagesize, 0) != 0) {
+ 0, 0, mfp->stat.st_pagesize)) != 0) {
(void)dbmfp->close(dbmfp, 0);
- return (0);
+ return (ret);
}
- F_SET(dbmfp, MP_FLUSH);
- if (F_ISSET(mfp, MP_EXTENT))
- dbmp->extents = 1;
+ local_open = 1;
-found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
+found: ret = __memp_pgwrite(dbmp, dbmfp, hp, bhp) == 0 ? 0 : 1;
- if (incremented) {
- MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ if (incremented)
--dbmfp->ref;
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
- }
+ else if (local_open)
+ F_SET(dbmfp, MP_FLUSH);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (ret);
}
@@ -169,11 +173,12 @@ found: ret = __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep);
* __memp_pgread --
* Read a page from a file.
*
- * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int));
+ * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, DB_MUTEX *, BH *, int));
*/
int
-__memp_pgread(dbmfp, bhp, can_create)
+__memp_pgread(dbmfp, mutexp, bhp, can_create)
DB_MPOOLFILE *dbmfp;
+ DB_MUTEX *mutexp;
BH *bhp;
int can_create;
{
@@ -181,19 +186,21 @@ __memp_pgread(dbmfp, bhp, can_create)
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
- size_t len, pagesize;
- size_t nr;
- int created, ret;
+ size_t len, nr, pagesize;
+ int ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
pagesize = mfp->stat.st_pagesize;
- created = 0;
+ /* We should never be called with a dirty or a locked buffer. */
+ DB_ASSERT(!F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED));
+
+ /* Lock the buffer and swap the hash bucket lock for the buffer lock. */
F_SET(bhp, BH_LOCKED | BH_TRASH);
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
- R_UNLOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
+ MUTEX_UNLOCK(dbenv, mutexp);
/*
* Temporary files may not yet have been created. We don't create
@@ -208,125 +215,101 @@ __memp_pgread(dbmfp, bhp, can_create)
db_io.buf = bhp->buf;
/*
- * The page may not exist; if it doesn't, nr may well be 0,
+ * The page may not exist; if it doesn't, nr may well be 0,
* but we expect the underlying OS calls not to return an
* error code in this case.
*/
if ((ret = __os_io(dbenv, &db_io, DB_IO_READ, &nr)) != 0)
goto err;
- } else
- ret = 0;
+ }
if (nr < pagesize) {
- if (can_create)
- created = 1;
- else {
- /*
- * Don't output error messages for short reads. In
- * particular, DB recovery processing may request pages
- * that have never been written to disk or for which
- * only some part have been written to disk, in which
- * case we won't find the page. The caller must know
- * how to handle the error.
- */
+ /*
+ * Don't output error messages for short reads. In particular,
+ * DB recovery processing may request pages never written to
+ * disk or for which only some part has been written to disk,
+ * in which case we won't find the page. The caller must know
+ * how to handle the error.
+ */
+ if (can_create == 0) {
ret = DB_PAGE_NOTFOUND;
goto err;
}
+
+ /* Clear any bytes that need to be cleared. */
+ len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
+ memset(bhp->buf, 0, len);
+
+#if defined(DIAGNOSTIC) || defined(UMRW)
/*
- * Clear any bytes that need to be cleared -- if we did a short
- * read, we assume that a page was not completely written and
- * clear even the bytes that we read. This is so our caller
- * isn't surprised (for example, if the first sector only of a
- * DB page was written, the LSN will indicate that the page was
- * updated, but the page contents will be wrong). Support for
- * page checksums might make this unnecessary in the future --
- * I would prefer not to discard data potentially written by
- * the application, under any circumstances.
- *
* If we're running in diagnostic mode, corrupt any bytes on
* the page that are unknown quantities for the caller.
*/
- len = mfp->clear_len == 0 ? pagesize : mfp->clear_len;
- memset(bhp->buf, 0, len);
-#if defined(DIAGNOSTIC) || defined(UMRW)
if (len < pagesize)
memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
#endif
- }
+ ++mfp->stat.st_page_create;
+ } else
+ ++mfp->stat.st_page_in;
/* Call any pgin function. */
ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
- /* Unlock the buffer and reacquire the region lock. */
+ /* Unlock the buffer and reacquire the hash bucket lock. */
err: MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, mutexp);
/*
* If no errors occurred, the data is now valid, clear the BH_TRASH
* flag; regardless, clear the lock bit and let other threads proceed.
*/
F_CLR(bhp, BH_LOCKED);
- if (ret == 0) {
+ if (ret == 0)
F_CLR(bhp, BH_TRASH);
- /* Update the statistics. */
- if (created)
- ++mfp->stat.st_page_create;
- else
- ++mfp->stat.st_page_in;
- }
-
return (ret);
}
/*
* __memp_pgwrite --
* Write a page to a file.
- *
- * PUBLIC: int __memp_pgwrite
- * PUBLIC: __P((DB_MPOOL *, DB_MPOOLFILE *, BH *, int *, int *));
*/
-int
-__memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
+static int
+__memp_pgwrite(dbmp, dbmfp, hp, bhp)
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
BH *bhp;
- int *restartp, *wrotep;
{
DB_ENV *dbenv;
DB_IO db_io;
DB_LSN lsn;
- MPOOL *c_mp, *mp;
+ MPOOL *mp;
MPOOLFILE *mfp;
size_t nw;
- int callpgin, dosync, ret, syncfail;
- const char *fail;
+ int callpgin, ret;
dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
+ callpgin = ret = 0;
- if (restartp != NULL)
- *restartp = 0;
- if (wrotep != NULL)
- *wrotep = 0;
- callpgin = 0;
-
- /* We should never be called with a clean or a locked buffer. */
+ /*
+ * We should never be called with a clean or trash buffer.
+ * The sync code does call us with already locked buffers.
+ */
DB_ASSERT(F_ISSET(bhp, BH_DIRTY));
- DB_ASSERT(!F_ISSET(bhp, BH_LOCKED));
+ DB_ASSERT(!F_ISSET(bhp, BH_TRASH));
/*
- * Lock the buffer, set the I/O in progress flag, and discard the
- * region lock.
+ * If we have not already traded the hash bucket lock for the buffer
+ * lock, do so now.
*/
- MUTEX_LOCK(dbenv, &bhp->mutex, dbenv->lockfhp);
- F_SET(bhp, BH_LOCKED);
- R_UNLOCK(dbenv, dbmp->reginfo);
-
- /* Tell the caller that the region lock was discarded. */
- if (restartp != NULL)
- *restartp = 1;
+ if (!F_ISSET(bhp, BH_LOCKED)) {
+ F_SET(bhp, BH_LOCKED);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+ }
/*
* It's possible that the underlying file doesn't exist, either
@@ -342,13 +325,9 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
/*
* If the page is in a file for which we have LSN information, we have
- * to ensure the appropriate log records are on disk. If the page is
- * being written as part of a sync operation, the flush has been done
- * already, unless it was modified by the application *after* the sync
- * was scheduled.
+ * to ensure the appropriate log records are on disk.
*/
- if (LOGGING_ON(dbenv) && !IS_RECOVERING(dbenv) && mfp->lsn_off != -1 &&
- (!F_ISSET(bhp, BH_SYNC) || F_ISSET(bhp, BH_SYNC_LOGFLSH))) {
+ if (LOGGING_ON(dbenv) && mfp->lsn_off != -1) {
memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
if ((ret = dbenv->log_flush(dbenv, &lsn)) != 0)
goto err;
@@ -361,7 +340,7 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
* !!!
* One special case. There is a single field on the meta-data page,
* the last-page-number-in-the-file field, for which we do not log
- * changes. So, if the page was original created in a database that
+ * changes. If the page was originally created in a database that
* didn't have logging turned on, we can see a page marked dirty but
* for which no corresponding log record has been written. However,
* the only way that a page can be created for which there isn't a
@@ -394,9 +373,7 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
* that the contents of the buffer will need to be passed through pgin
* before they are reused.
*/
- if (mfp->ftype == 0)
- ret = 0;
- else {
+ if (mfp->ftype != 0) {
callpgin = 1;
if ((ret = __memp_pg(dbmfp, bhp, 0)) != 0)
goto err;
@@ -405,17 +382,16 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
/* Temporary files may not yet have been created. */
if (!F_ISSET(dbmfp->fhp, DB_FH_VALID)) {
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
- if (!F_ISSET(dbmfp->fhp, DB_FH_VALID) &&
- ((ret = __db_appname(dbenv, DB_APP_TMP, NULL, NULL,
- DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP,
- dbmfp->fhp, NULL)) != 0 ||
- !F_ISSET(dbmfp->fhp, DB_FH_VALID))) {
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ ret = F_ISSET(dbmfp->fhp, DB_FH_VALID) ? 0 :
+ __db_appname(dbenv, DB_APP_TMP, NULL,
+ F_ISSET(dbenv, DB_ENV_DIRECT_DB) ? DB_OSO_DIRECT : 0,
+ dbmfp->fhp, NULL);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if (ret != 0) {
__db_err(dbenv,
"unable to create temporary backing file");
goto err;
}
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
}
/* Write the page. */
@@ -425,104 +401,44 @@ __memp_pgwrite(dbmp, dbmfp, bhp, restartp, wrotep)
db_io.pgno = bhp->pgno;
db_io.buf = bhp->buf;
if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
- fail = "write";
- goto syserr;
- }
- if (nw != mfp->stat.st_pagesize) {
- ret = EIO;
- fail = "write";
- goto syserr;
+ __db_err(dbenv, "%s: write failed for page %lu",
+ __memp_fn(dbmfp), (u_long)bhp->pgno);
+ goto err;
}
+ ++mfp->stat.st_page_out;
+err:
file_dead:
/*
* !!!
* Once we pass this point, dbmfp and mfp may be NULL, we may not have
* a valid file reference.
*
- * Unlock the buffer and reacquire the region lock.
+ * Unlock the buffer and reacquire the hash lock.
*/
MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
/*
- * Clean up the flags based on a successful write.
- *
* If we rewrote the page, it will need processing by the pgin
* routine before reuse.
*/
if (callpgin)
F_SET(bhp, BH_CALLPGIN);
- F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE | BH_LOCKED);
/*
- * If we write a buffer for which a checkpoint is waiting, update
- * the count of pending buffers (both in the mpool as a whole and
- * for this file). If the count for this file goes to zero, set a
- * flag so we flush the writes.
+ * Update the hash bucket statistics, reset the flags.
+ * If we were successful, the page is no longer dirty.
*/
- dosync = 0;
- if (F_ISSET(bhp, BH_SYNC)) {
- F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
-
- --mp->lsn_cnt;
- if (mfp != NULL)
- dosync = --mfp->lsn_cnt == 0 ? 1 : 0;
- }
-
- /* Update the page clean/dirty statistics. */
- c_mp = BH_TO_CACHE(dbmp, bhp);
- ++c_mp->stat.st_page_clean;
- DB_ASSERT(c_mp->stat.st_page_dirty != 0);
- --c_mp->stat.st_page_dirty;
-
- /* Update I/O statistics. */
- if (mfp != NULL)
- ++mfp->stat.st_page_out;
+ if (ret == 0) {
+ DB_ASSERT(hp->hash_page_dirty != 0);
+ --hp->hash_page_dirty;
- /*
- * Do the sync after everything else has been updated, so any incoming
- * checkpoint doesn't see inconsistent information.
- *
- * XXX:
- * Don't lock the region around the sync, fsync(2) has no atomicity
- * issues.
- *
- * XXX:
- * We ignore errors from the sync -- it makes no sense to return an
- * error to the calling process, so set a flag causing the checkpoint
- * to be retried later. There is a possibility, of course, that a
- * subsequent checkpoint was started and that we're going to force it
- * to fail. That should be unlikely, and fixing it would be difficult.
- */
- if (dosync) {
- R_UNLOCK(dbenv, dbmp->reginfo);
- syncfail = __os_fsync(dbenv, dbmfp->fhp) != 0;
- R_LOCK(dbenv, dbmp->reginfo);
- if (syncfail)
- F_SET(mp, MP_LSN_RETRY);
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
}
- if (wrotep != NULL)
- *wrotep = 1;
-
- return (0);
-
-syserr: __db_err(dbenv, "%s: %s failed for page %lu",
- __memp_fn(dbmfp), fail, (u_long)bhp->pgno);
-
-err: /* Unlock the buffer and reacquire the region lock. */
- MUTEX_UNLOCK(dbenv, &bhp->mutex);
- R_LOCK(dbenv, dbmp->reginfo);
-
- /*
- * Clean up the flags based on a failure.
- *
- * The page remains dirty but we remove our lock. If we rewrote the
- * page, it will need processing by the pgin routine before reuse.
- */
- if (callpgin)
- F_SET(bhp, BH_CALLPGIN);
+ /* Regardless, clear any sync wait-for count and remove our lock. */
+ bhp->ref_sync = 0;
F_CLR(bhp, BH_LOCKED);
return (ret);
@@ -541,15 +457,17 @@ __memp_pg(dbmfp, bhp, is_pgin)
int is_pgin;
{
DBT dbt, *dbtp;
+ DB_ENV *dbenv;
DB_MPOOL *dbmp;
DB_MPREG *mpreg;
MPOOLFILE *mfp;
int ftype, ret;
dbmp = dbmfp->dbmp;
+ dbenv = dbmp->dbenv;
mfp = dbmfp->mfp;
- MUTEX_THREAD_LOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
ftype = mfp->ftype;
for (mpreg = LIST_FIRST(&dbmp->dbregq);
@@ -563,28 +481,28 @@ __memp_pg(dbmfp, bhp, is_pgin)
dbt.data = R_ADDR(dbmp->reginfo, mfp->pgcookie_off);
dbtp = &dbt;
}
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
if (is_pgin) {
if (mpreg->pgin != NULL &&
- (ret = mpreg->pgin(dbmp->dbenv,
+ (ret = mpreg->pgin(dbenv,
bhp->pgno, bhp->buf, dbtp)) != 0)
goto err;
} else
if (mpreg->pgout != NULL &&
- (ret = mpreg->pgout(dbmp->dbenv,
+ (ret = mpreg->pgout(dbenv,
bhp->pgno, bhp->buf, dbtp)) != 0)
goto err;
break;
}
if (mpreg == NULL)
- MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (0);
-err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
- __db_err(dbmp->dbenv, "%s: %s failed for page %lu",
+err: MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ __db_err(dbenv, "%s: %s failed for page %lu",
__memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)bhp->pgno);
return (ret);
}
@@ -593,56 +511,78 @@ err: MUTEX_THREAD_UNLOCK(dbmp->dbenv, dbmp->mutexp);
* __memp_bhfree --
* Free a bucket header and its referenced data.
*
- * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, BH *, int));
+ * PUBLIC: void __memp_bhfree __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, int));
*/
void
-__memp_bhfree(dbmp, bhp, free_mem)
+__memp_bhfree(dbmp, hp, bhp, free_mem)
DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
BH *bhp;
int free_mem;
{
- DB_HASHTAB *dbht;
+ DB_ENV *dbenv;
MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
- int n_bucket, n_cache;
+ u_int32_t n_cache;
+ /*
+ * Assumes the hash bucket is locked and the MPOOL is not.
+ */
+ dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
- c_mp = BH_TO_CACHE(dbmp, bhp);
- n_cache = NCACHE(mp, bhp->pgno);
- n_bucket = NBUCKET(c_mp, bhp->mf_offset, bhp->pgno);
- dbht = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);
- /* Delete the buffer header from the hash bucket queue. */
- SH_TAILQ_REMOVE(&dbht[n_bucket], bhp, hq, __bh);
+ /*
+ * Delete the buffer header from the hash bucket queue and reset
+ * the hash bucket's priority, if necessary.
+ */
+ SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
+ if (bhp->priority == hp->hash_priority)
+ hp->hash_priority =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL ?
+ 0 : SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
- /* Delete the buffer header from the LRU queue. */
- SH_TAILQ_REMOVE(&c_mp->bhq, bhp, q, __bh);
+ /*
+ * Discard the hash bucket's mutex, it's no longer needed, and
+ * we don't want to be holding it when acquiring other locks.
+ */
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
- /* Clear the mutex this buffer recorded */
- __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
- (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
/*
* Find the underlying MPOOLFILE and decrement its reference count.
* If this is its last reference, remove it.
*/
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ MUTEX_LOCK(dbenv, &mfp->mutex);
if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
__memp_mf_discard(dbmp, mfp);
+ else
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
- DB_ASSERT(c_mp->stat.st_page_clean != 0);
- --c_mp->stat.st_page_clean;
+ R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+
+ /*
+ * Clear the mutex this buffer recorded; requires the region lock
+ * be held.
+ */
+ __db_shlocks_clear(&bhp->mutex, &dbmp->reginfo[n_cache],
+ (REGMAINT *)R_ADDR(&dbmp->reginfo[n_cache], mp->maint_off));
/*
- * If we're not reusing it immediately, free the buffer header
+ * If we're not reusing the buffer immediately, free the buffer header
* and data for real.
*/
- if (free_mem)
+ if (free_mem) {
__db_shalloc_free(dbmp->reginfo[n_cache].addr, bhp);
+ c_mp = dbmp->reginfo[n_cache].primary;
+ c_mp->stat.st_pages--;
+ }
+ R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
}
/*
* __memp_upgrade --
- * Upgrade a file descriptor from readonly to readwrite.
+ * Upgrade a file descriptor from read-only to read-write.
*/
static int
__memp_upgrade(dbmp, dbmfp, mfp)
@@ -650,41 +590,58 @@ __memp_upgrade(dbmp, dbmfp, mfp)
DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
{
- DB_FH fh;
+ DB_ENV *dbenv;
+ DB_FH *fhp, *tfhp;
int ret;
char *rpath;
- /*
- * !!!
- * We expect the handle to already be locked.
- */
-
- /* Check to see if we've already upgraded. */
- if (F_ISSET(dbmfp, MP_UPGRADE))
- return (0);
-
- /* Check to see if we've already failed. */
- if (F_ISSET(dbmfp, MP_UPGRADE_FAIL))
- return (1);
+ dbenv = dbmp->dbenv;
+ fhp = NULL;
+ rpath = NULL;
/*
* Calculate the real name for this file and try to open it read/write.
* We know we have a valid pathname for the file because it's the only
* way we could have gotten a file descriptor of any kind.
*/
- if ((ret = __db_appname(dbmp->dbenv, DB_APP_DATA,
- NULL, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
- return (ret);
- if (__os_open(dbmp->dbenv, rpath, 0, 0, &fh) != 0) {
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &fhp)) != 0)
+ goto err;
+
+ if ((ret = __db_appname(dbenv, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) != 0)
+ goto err;
+
+ if (__os_open(dbenv, rpath,
+ F_ISSET(mfp, MP_DIRECT) ? DB_OSO_DIRECT : 0, 0, fhp) != 0) {
F_SET(dbmfp, MP_UPGRADE_FAIL);
- ret = 1;
- } else {
- /* Swap the descriptors and set the upgrade flag. */
- (void)__os_closehandle(dbmfp->fhp);
- *dbmfp->fhp = fh;
- F_SET(dbmfp, MP_UPGRADE);
- ret = 0;
+ goto err;
}
- __os_freestr(dbmp->dbenv, rpath);
+
+ /*
+ * Swap the descriptors and set the upgrade flag.
+ *
+ * XXX
+ * There is a race here. If another process schedules a read using the
+ * existing file descriptor and is swapped out before making the system
+ * call, this code could theoretically close the file descriptor out
+ * from under it. While it's very unlikely, this code should still be
+ * rewritten.
+ */
+ tfhp = dbmfp->fhp;
+ dbmfp->fhp = fhp;
+ fhp = tfhp;
+
+ (void)__os_closehandle(dbenv, fhp);
+ F_SET(dbmfp, MP_UPGRADE);
+
+ ret = 0;
+ if (0) {
+err: ret = 1;
+ }
+ if (fhp != NULL)
+ __os_free(dbenv, fhp);
+ if (rpath != NULL)
+ __os_free(dbenv, rpath);
+
return (ret);
}
diff --git a/db/mp/mp_fopen.c b/db/mp/mp_fopen.c
index bb3937e10..7209bf066 100644
--- a/db/mp/mp_fopen.c
+++ b/db/mp/mp_fopen.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2001
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "Id: mp_fopen.c,v 11.60 2001/10/04 21:26:56 bostic Exp ";
+static const char revid[] = "Id: mp_fopen.c,v 11.88 2002/07/01 15:05:30 bostic Exp ";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -17,14 +17,13 @@ static const char revid[] = "Id: mp_fopen.c,v 11.60 2001/10/04 21:26:56 bostic E
#endif
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
static int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t));
static int __memp_fopen __P((DB_MPOOLFILE *,
const char *, u_int32_t, int, size_t));
-static int __memp_mf_open __P((DB_MPOOLFILE *,
- const char *, size_t, db_pgno_t, u_int32_t, MPOOLFILE **));
+static void __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *));
static void __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
static void __memp_refcnt __P((DB_MPOOLFILE *, db_pgno_t *));
static int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t));
@@ -32,21 +31,9 @@ static int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *));
static int __memp_set_ftype __P((DB_MPOOLFILE *, int));
static int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t));
static int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *));
+static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY));
static void __memp_set_unlink __P((DB_MPOOLFILE *, int));
-/*
- * MEMP_FREMOVE --
- * Discard an MPOOLFILE and any buffers it references: update the flags
- * so we never try to write buffers associated with the file, nor can we
- * find it when looking for files to join. In addition, clear the ftype
- * field, there's no reason to post-process pages, they can be discarded
- * by any thread.
- */
-#define MEMP_FREMOVE(mfp) { \
- mfp->ftype = 0; \
- F_SET(mfp, MP_DEADFILE); \
-}
-
/* Initialization methods cannot be called after open is called. */
#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \
if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \
@@ -81,32 +68,23 @@ __memp_fcreate(dbenv, retp, flags)
/* Allocate and initialize the per-process structure. */
if ((ret = __os_calloc(dbenv, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0)
return (ret);
- if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &dbmfp->fhp)) != 0) {
- __os_free(dbenv, dbmfp, sizeof(DB_MPOOLFILE));
- return (ret);
- }
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &dbmfp->fhp)) != 0)
+ goto err;
/* Allocate and initialize a mutex if necessary. */
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- if ((ret = __db_mutex_alloc(
- dbenv, dbmp->reginfo, 0, &dbmfp->mutexp)) != 0)
- return (ret);
-
- if ((ret = __db_shmutex_init(dbenv, dbmfp->mutexp, 0,
- MUTEX_THREAD, dbmp->reginfo,
- (REGMAINT *)R_ADDR(dbmp->reginfo,
- ((MPOOL *)dbmp->reginfo->primary)->maint_off))) != 0) {
- __db_mutex_free(dbenv, dbmp->reginfo, dbmfp->mutexp);
- return (ret);
- }
- }
+ if (F_ISSET(dbenv, DB_ENV_THREAD) &&
+ (ret = __db_mutex_setup(dbenv, dbmp->reginfo, &dbmfp->mutexp,
+ MUTEX_ALLOC | MUTEX_THREAD)) != 0)
+ goto err;
dbmfp->ref = 1;
dbmfp->lsn_offset = -1;
dbmfp->dbmp = dbmp;
+ dbmfp->mfp = INVALID_ROFF;
dbmfp->close = __memp_fclose;
dbmfp->get = __memp_fget;
+ dbmfp->get_fileid = __memp_get_fileid;
dbmfp->last_pgno = __memp_last_pgno;
dbmfp->open = __memp_fopen;
dbmfp->put = __memp_fput;
@@ -117,16 +95,19 @@ __memp_fcreate(dbenv, retp, flags)
dbmfp->set_ftype = __memp_set_ftype;
dbmfp->set_lsn_offset = __memp_set_lsn_offset;
dbmfp->set_pgcookie = __memp_set_pgcookie;
+ dbmfp->set_priority = __memp_set_priority;
dbmfp->set_unlink = __memp_set_unlink;
dbmfp->sync = __memp_fsync;
- /* Add the file to the environment's list of files. */
- MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
- TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
-
*retp = dbmfp;
return (0);
+
+err: if (dbmfp != NULL) {
+ if (dbmfp->fhp != NULL)
+ (void)__os_free(dbenv, dbmfp->fhp);
+ (void)__os_free(dbenv, dbmfp);
+ }
+ return (ret);
}
/*
@@ -168,6 +149,10 @@ __memp_set_ftype(dbmfp, ftype)
DB_MPOOLFILE *dbmfp;
int ftype;
{
+ DB_ENV *dbenv;
+
+ dbenv = dbmfp->dbmp->dbenv;
+
MPF_ILLEGAL_AFTER_OPEN(dbmfp, "set_ftype");
dbmfp->ftype = ftype;
@@ -205,6 +190,40 @@ __memp_set_pgcookie(dbmfp, pgcookie)
}
/*
+ * __memp_set_priority --
+ * Set the cache priority for pages from this file.
+ */
+static int
+__memp_set_priority(dbmfp, priority)
+ DB_MPOOLFILE *dbmfp;
+ DB_CACHE_PRIORITY priority;
+{
+ switch (priority) {
+ case DB_PRIORITY_VERY_LOW:
+ dbmfp->mfp->priority = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ dbmfp->mfp->priority = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ dbmfp->mfp->priority = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ dbmfp->mfp->priority = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ dbmfp->mfp->priority = MPOOL_PRI_VERY_HIGH;
+ break;
+ default:
+ __db_err(dbmfp->dbmp->dbenv,
+ "Unknown priority value: %d", priority);
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+/*
* __memp_fopen --
* Open a backing file for the memory pool.
*/
@@ -227,7 +246,7 @@ __memp_fopen(dbmfp, path, flags, mode, pagesize)
/* Validate arguments. */
if ((ret = __db_fchk(dbenv, "memp_fopen", flags,
- DB_CREATE | DB_EXTENT |
+ DB_CREATE | DB_DIRECT | DB_EXTENT |
DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0)
return (ret);
@@ -242,26 +261,18 @@ __memp_fopen(dbmfp, path, flags, mode, pagesize)
}
if (dbmfp->clear_len > pagesize) {
__db_err(dbenv,
- "memp_fopen: clear length larger than page size.");
+ "memp_fopen: clear length larger than page size");
return (EINVAL);
}
/* Read-only checks, and local flag. */
- if (LF_ISSET(DB_RDONLY)) {
- if (path == NULL) {
- __db_err(dbenv,
- "memp_fopen: temporary files can't be readonly");
- return (EINVAL);
- }
- F_SET(dbmfp, MP_READONLY);
+ if (LF_ISSET(DB_RDONLY) && path == NULL) {
+ __db_err(dbenv,
+ "memp_fopen: temporary files can't be readonly");
+ return (EINVAL);
}
- if ((ret = __memp_fopen_int(
- dbmfp, NULL, path, flags, mode, pagesize, 1)) != 0)
- return (ret);
-
- F_SET(dbmfp, MP_OPEN_CALLED);
- return (0);
+ return (__memp_fopen_int(dbmfp, NULL, path, flags, mode, pagesize));
}
/*
@@ -269,51 +280,199 @@ __memp_fopen(dbmfp, path, flags, mode, pagesize)
* Open a backing file for the memory pool; internal version.
*
* PUBLIC: int __memp_fopen_int __P((DB_MPOOLFILE *,
- * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t, int));
+ * PUBLIC: MPOOLFILE *, const char *, u_int32_t, int, size_t));
*/
int
-__memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize, needlock)
+__memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize)
DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
const char *path;
u_int32_t flags;
- int mode, needlock;
+ int mode;
size_t pagesize;
{
DB_ENV *dbenv;
DB_MPOOL *dbmp;
+ MPOOL *mp;
db_pgno_t last_pgno;
size_t maxmap;
u_int32_t mbytes, bytes, oflags;
- int ret;
+ int mfp_alloc, ret;
u_int8_t idbuf[DB_FILE_ID_LEN];
char *rpath;
+ void *p;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
- ret = 0;
+ mp = dbmp->reginfo[0].primary;
+ mfp_alloc = ret = 0;
rpath = NULL;
+ /*
+ * Set the page size so os_open can decide whether to turn buffering
+ * off if the DB_DIRECT_DB flag is set.
+ */
+ dbmfp->fhp->pagesize = (u_int32_t)pagesize;
+
+ /*
+ * If it's a temporary file, delay the open until we actually need
+ * to write the file, and we know we can't join any existing files.
+ */
if (path == NULL)
- last_pgno = 0;
- else {
- /* Get the real name for this file and open it. */
- if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, path, 0, NULL, &rpath)) != 0)
+ goto alloc;
+
+ /*
+ * Get the real name for this file and open it. If it's a Queue extent
+ * file, it may not exist, and that's OK.
+ */
+ oflags = 0;
+ if (LF_ISSET(DB_CREATE))
+ oflags |= DB_OSO_CREATE;
+ if (LF_ISSET(DB_DIRECT))
+ oflags |= DB_OSO_DIRECT;
+ if (LF_ISSET(DB_RDONLY)) {
+ F_SET(dbmfp, MP_READONLY);
+ oflags |= DB_OSO_RDONLY;
+ }
+ if ((ret =
+ __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) != 0)
+ goto err;
+ if ((ret = __os_open(dbenv, rpath, oflags, mode, dbmfp->fhp)) != 0) {
+ if (!LF_ISSET(DB_EXTENT))
+ __db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
+ goto err;
+ }
+
+ /*
+ * Get the file id if we weren't given one. Generated file id's
+ * don't use timestamps, otherwise there'd be no chance of any
+ * other process joining the party.
+ */
+ if (dbmfp->fileid == NULL) {
+ if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
goto err;
- oflags = 0;
- if (LF_ISSET(DB_CREATE))
- oflags |= DB_OSO_CREATE;
- if (LF_ISSET(DB_RDONLY))
- oflags |= DB_OSO_RDONLY;
- if ((ret =
- __os_open(dbenv, rpath, oflags, mode, dbmfp->fhp)) != 0) {
- if (!LF_ISSET(DB_EXTENT))
- __db_err(dbenv,
- "%s: %s", rpath, db_strerror(ret));
+ dbmfp->fileid = idbuf;
+ }
+
+ /*
+ * If our caller knows what mfp we're using, increment the ref count,
+ * no need to search.
+ *
+ * We don't need to acquire a lock other than the mfp itself, because
+ * we know there's another reference and it's not going away.
+ */
+ if (mfp != NULL) {
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ goto check_map;
+ }
+
+ /*
+ * If not creating a temporary file, walk the list of MPOOLFILE's,
+ * looking for a matching file. Files backed by temporary files
+ * or previously removed files can't match.
+ *
+ * DB_TRUNCATE support.
+ *
+ * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
+ * pair) plus a timestamp. If files are removed and created in less
+ * than a second, the fileID can be repeated. The problem with
+ * repetition happens when the file that previously had the fileID
+ * value still has pages in the pool, since we don't want to use them
+ * to satisfy requests for the new file.
+ *
+ * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
+ * opens with that flag set guarantee matching fileIDs when the
+ * machine can open a file and then re-open with truncate within a
+ * second. For this reason, we pass that flag down, and, if we find
+ * a matching entry, we ensure that it's never found again, and we
+ * create a new entry for the current request.
+ */
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ /* Skip dead files and temporary files. */
+ if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Skip non-matching files. */
+ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
+ mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /*
+ * If the file is being truncated, remove it from the system
+ * and create a new entry.
+ *
+ * !!!
+ * We should be able to set mfp to NULL and break out of the
+ * loop, but I like the idea of checking all the entries.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ MPOOLFILE_IGNORE(mfp);
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ continue;
+ }
+
+ /*
+ * Some things about a file cannot be changed: the clear length,
+ * page size, or LSN location.
+ *
+ * The file type can change if the application's pre- and post-
+ * processing needs change. For example, an application that
+ * created a hash subdatabase in a database that was previously
+ * all btree.
+ *
+ * XXX
+ * We do not check to see if the pgcookie information changed,
+ * or update it if it is, this might be a bug.
+ */
+ if (dbmfp->clear_len != mfp->clear_len ||
+ pagesize != mfp->stat.st_pagesize ||
+ dbmfp->lsn_offset != mfp->lsn_off) {
+ __db_err(dbenv,
+ "%s: clear length, page size or LSN location changed",
+ path);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ ret = EINVAL;
goto err;
}
+ if (dbmfp->ftype != 0)
+ mfp->ftype = dbmfp->ftype;
+
+ MUTEX_LOCK(dbenv, &mfp->mutex);
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
+ break;
+ }
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ if (mfp != NULL)
+ goto check_map;
+
+alloc: /* Allocate and initialize a new MPOOLFILE. */
+ if ((ret = __memp_alloc(
+ dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
+ goto err;
+ mfp_alloc = 1;
+ memset(mfp, 0, sizeof(MPOOLFILE));
+ mfp->mpf_cnt = 1;
+ mfp->ftype = dbmfp->ftype;
+ mfp->stat.st_pagesize = pagesize;
+ mfp->lsn_off = dbmfp->lsn_offset;
+ mfp->clear_len = dbmfp->clear_len;
+
+ if (LF_ISSET(DB_DIRECT))
+ F_SET(mfp, MP_DIRECT);
+ if (LF_ISSET(DB_EXTENT))
+ F_SET(mfp, MP_EXTENT);
+
+ if (path == NULL)
+ F_SET(mfp, MP_TEMP);
+ else {
/*
* Don't permit files that aren't a multiple of the pagesize,
* and find the number of the last page in the file, all the
@@ -327,79 +486,84 @@ __memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize, needlock)
* environments where an off_t is 32-bits, but still run where
* offsets are 64-bits, and they pay us a lot of money.
*/
- if ((ret = __os_ioinfo(dbenv, rpath,
- dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
+ if ((ret = __os_ioinfo(
+ dbenv, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) {
__db_err(dbenv, "%s: %s", rpath, db_strerror(ret));
goto err;
}
/*
- * If we're doing a verify, we might have to cope with
- * a truncated file; if the file size is not a multiple
- * of the page size, round down to a page -- we'll
- * take care of the partial page outside the memp system.
+ * During verify or recovery, we might have to cope with a
+ * truncated file; if the file size is not a multiple of the
+ * page size, round down to a page, we'll take care of the
+ * partial page outside the mpool system.
*/
if (bytes % pagesize != 0) {
if (LF_ISSET(DB_ODDFILESIZE))
- /*
- * During verify or recovery, we might have
- * to cope with a truncated file; round down,
- * we'll worry about the partial page outside
- * the memp system.
- */
- bytes -= (bytes % pagesize);
+ bytes -= (u_int32_t)(bytes % pagesize);
else {
__db_err(dbenv,
- "%s: file size not a multiple of the pagesize",
- rpath);
+ "%s: file size not a multiple of the pagesize", rpath);
ret = EINVAL;
goto err;
}
}
- last_pgno = mbytes * (MEGABYTE / pagesize);
- last_pgno += bytes / pagesize;
-
- /* Correction: page numbers are zero-based, not 1-based. */
+ /*
+ * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a
+ * page get, we have to increment the last page in the file.
+ * Figure it out and save it away.
+ *
+ * Note correction: page numbers are zero-based, not 1-based.
+ */
+ last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize));
+ last_pgno += (db_pgno_t)(bytes / pagesize);
if (last_pgno != 0)
--last_pgno;
+ mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
- /*
- * Get the file id if we weren't given one. Generated file id's
- * don't use timestamps, otherwise there'd be no chance of any
- * other process joining the party.
- */
- if (dbmfp->fileid == NULL) {
- if ((ret = __os_fileid(dbenv, rpath, 0, idbuf)) != 0)
- goto err;
- dbmfp->fileid = idbuf;
- }
+ /* Copy the file path into shared memory. */
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
+ goto err;
+ memcpy(p, path, strlen(path) + 1);
+
+ /* Copy the file identification string into shared memory. */
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
+ goto err;
+ memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
+ }
+
+ /* Copy the page cookie into shared memory. */
+ if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
+ mfp->pgcookie_len = 0;
+ mfp->pgcookie_off = 0;
+ } else {
+ if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
+ NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
+ goto err;
+ memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size);
+ mfp->pgcookie_len = dbmfp->pgcookie->size;
}
/*
- * If we weren't provided an underlying shared object to join with,
- * find/allocate the shared file objects. Also allocate space for
- * for the per-process thread lock.
+ * Prepend the MPOOLFILE to the list of MPOOLFILE's.
*/
- if (needlock)
- R_LOCK(dbenv, dbmp->reginfo);
- if (mfp == NULL)
- ret = __memp_mf_open(
- dbmfp, path, pagesize, last_pgno, flags, &mfp);
- else {
- ++mfp->mpf_cnt;
- ret = 0;
- }
- dbmfp->mfp = mfp;
- if (needlock)
- R_UNLOCK(dbenv, dbmp->reginfo);
+ R_LOCK(dbenv, dbmp->reginfo);
+ ret = __db_mutex_setup(dbenv, dbmp->reginfo, &mfp->mutex,
+ MUTEX_NO_RLOCK);
+ if (ret == 0)
+ SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile);
+ R_UNLOCK(dbenv, dbmp->reginfo);
if (ret != 0)
goto err;
+check_map:
/*
* If a file:
- * + is read-only
* + isn't temporary
+ * + is read-only
* + doesn't require any pgin/pgout support
* + the DB_NOMMAP flag wasn't set (in either the file open or
* the environment in which it was opened)
@@ -411,7 +575,6 @@ __memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize, needlock)
* NFS mounted partition, and we can fail in buffer I/O just as easily
* as here.
*
- * XXX
* We'd like to test to see if the file is too big to mmap. Since we
* don't know what size or type off_t's or size_t's are, or the largest
* unsigned integral type is, or what random insanity the local C
@@ -420,10 +583,10 @@ __memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize, needlock)
*/
#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 Mb. */
if (F_ISSET(mfp, MP_CAN_MMAP)) {
- if (!F_ISSET(dbmfp, MP_READONLY))
- F_CLR(mfp, MP_CAN_MMAP);
if (path == NULL)
F_CLR(mfp, MP_CAN_MMAP);
+ if (!F_ISSET(dbmfp, MP_READONLY))
+ F_CLR(mfp, MP_CAN_MMAP);
if (dbmfp->ftype != 0)
F_CLR(mfp, MP_CAN_MMAP);
if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP))
@@ -433,179 +596,72 @@ __memp_fopen_int(dbmfp, mfp, path, flags, mode, pagesize, needlock)
if (mbytes > maxmap / MEGABYTE ||
(mbytes == maxmap / MEGABYTE && bytes >= maxmap % MEGABYTE))
F_CLR(mfp, MP_CAN_MMAP);
- }
- dbmfp->addr = NULL;
- if (F_ISSET(mfp, MP_CAN_MMAP)) {
- dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
- if (__os_mapfile(dbenv, rpath,
- dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
- dbmfp->addr = NULL;
- F_CLR(mfp, MP_CAN_MMAP);
+
+ dbmfp->addr = NULL;
+ if (F_ISSET(mfp, MP_CAN_MMAP)) {
+ dbmfp->len = (size_t)mbytes * MEGABYTE + bytes;
+ if (__os_mapfile(dbenv, rpath,
+ dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) {
+ dbmfp->addr = NULL;
+ F_CLR(mfp, MP_CAN_MMAP);
+ }
}
}
- if (rpath != NULL)
- __os_freestr(dbenv, rpath);
- return (0);
+ dbmfp->mfp = mfp;
+
+ F_SET(dbmfp, MP_OPEN_CALLED);
-err: if (rpath != NULL)
- __os_freestr(dbenv, rpath);
- if (F_ISSET(dbmfp->fhp, DB_FH_VALID))
- (void)__os_closehandle(dbmfp->fhp);
+ /* Add the file to the process' list of DB_MPOOLFILEs. */
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+
+ if (0) {
+err: if (F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ (void)__os_closehandle(dbenv, dbmfp->fhp);
+
+ if (mfp_alloc) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (mfp->path_off != 0)
+ __db_shalloc_free(dbmp->reginfo[0].addr,
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ if (mfp->fileid_off != 0)
+ __db_shalloc_free(dbmp->reginfo[0].addr,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ __db_shalloc_free(dbmp->reginfo[0].addr, mfp);
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ }
+
+ }
+ if (rpath != NULL)
+ __os_free(dbenv, rpath);
return (ret);
}
/*
- * __memp_mf_open --
- * Open an MPOOLFILE.
+ * __memp_get_fileid --
+ * Return the file ID.
+ *
+ * XXX
+ * Undocumented interface: DB private.
*/
-static int
-__memp_mf_open(dbmfp, path, pagesize, last_pgno, flags, retp)
+static void
+__memp_get_fileid(dbmfp, fidp)
DB_MPOOLFILE *dbmfp;
- const char *path;
- size_t pagesize;
- db_pgno_t last_pgno;
- u_int32_t flags;
- MPOOLFILE **retp;
+ u_int8_t *fidp;
{
- DB_MPOOL *dbmp;
- MPOOL *mp;
- MPOOLFILE *mfp;
- int ret;
- void *p;
-
-#define ISTEMPORARY (path == NULL)
-
- dbmp = dbmfp->dbmp;
-
/*
- * If not creating a temporary file, walk the list of MPOOLFILE's,
- * looking for a matching file. Files backed by temporary files
- * or previously removed files can't match.
- *
- * DB_TRUNCATE support.
+ * No lock needed -- we're using the handle, it had better not
+ * be going away.
*
- * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
- * pair) plus a timestamp. If files are removed and created in less
- * than a second, the fileID can be repeated. The problem with
- * repetition happens when the file that previously had the fileID
- * value still has pages in the pool, since we don't want to use them
- * to satisfy requests for the new file.
- *
- * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
- * opens with that flag set guarantees matching fileIDs when the
- * machine can open a file and then re-open with truncate within a
- * second. For this reason, we pass that flag down, and, if we find
- * a matching entry, we ensure that it's never found again, and we
- * create a new entry for the current request.
+ * !!!
+ * Get the fileID out of the region, not out of the DB_MPOOLFILE
+ * structure because the DB_MPOOLFILE reference is possibly short
+ * lived, and isn't to be trusted.
*/
- if (!ISTEMPORARY) {
- mp = dbmp->reginfo[0].primary;
- for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
- mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
- if (F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
- continue;
- if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
- mfp->fileid_off), DB_FILE_ID_LEN) == 0) {
- if (LF_ISSET(DB_TRUNCATE)) {
- MEMP_FREMOVE(mfp);
- continue;
- }
- if (dbmfp->clear_len != mfp->clear_len ||
- pagesize != mfp->stat.st_pagesize) {
- __db_err(dbmp->dbenv,
- "%s: page size or clear length changed",
- path);
- return (EINVAL);
- }
-
- /*
- * It's possible that our needs for pre- and
- * post-processing are changing. For example,
- * an application created a hash subdatabase
- * in a database that was previously all btree.
- */
- if (dbmfp->ftype != 0)
- mfp->ftype = dbmfp->ftype;
-
- ++mfp->mpf_cnt;
-
- *retp = mfp;
- return (0);
- }
- }
- }
-
- /* Allocate a new MPOOLFILE. */
- if ((ret = __memp_alloc(
- dbmp, dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0)
- goto mem_err;
- *retp = mfp;
-
- /* Initialize the structure. */
- memset(mfp, 0, sizeof(MPOOLFILE));
- mfp->mpf_cnt = 1;
- mfp->ftype = dbmfp->ftype;
- mfp->lsn_off = dbmfp->lsn_offset;
- mfp->clear_len = dbmfp->clear_len;
-
- /*
- * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a memp_fget,
- * we have to know the last page in the file. Figure it out and save
- * it away.
- */
- mfp->stat.st_pagesize = pagesize;
- mfp->orig_last_pgno = mfp->last_pgno = last_pgno;
-
- if (ISTEMPORARY)
- F_SET(mfp, MP_TEMP);
- else {
- /* Copy the file path into shared memory. */
- if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
- NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0)
- goto err;
- memcpy(p, path, strlen(path) + 1);
-
- /* Copy the file identification string into shared memory. */
- if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
- NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0)
- goto err;
- memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN);
-
- F_SET(mfp, MP_CAN_MMAP);
- if (LF_ISSET(DB_EXTENT))
- F_SET(mfp, MP_EXTENT);
- }
-
- /* Copy the page cookie into shared memory. */
- if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) {
- mfp->pgcookie_len = 0;
- mfp->pgcookie_off = 0;
- } else {
- if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
- NULL, dbmfp->pgcookie->size, &mfp->pgcookie_off, &p)) != 0)
- goto err;
- memcpy(p, dbmfp->pgcookie->data, dbmfp->pgcookie->size);
- mfp->pgcookie_len = dbmfp->pgcookie->size;
- }
-
- /* Prepend the MPOOLFILE to the list of MPOOLFILE's. */
- mp = dbmp->reginfo[0].primary;
- SH_TAILQ_INSERT_HEAD(&mp->mpfq, mfp, q, __mpoolfile);
-
- if (0) {
-err: if (mfp->path_off != 0)
- __db_shalloc_free(dbmp->reginfo[0].addr,
- R_ADDR(dbmp->reginfo, mfp->path_off));
- if (mfp->fileid_off != 0)
- __db_shalloc_free(dbmp->reginfo[0].addr,
- R_ADDR(dbmp->reginfo, mfp->fileid_off));
- if (mfp != NULL)
- __db_shalloc_free(dbmp->reginfo[0].addr, mfp);
-mem_err: __db_err(dbmp->dbenv,
- "Unable to allocate memory for mpool file");
- }
- return (ret);
+ memcpy(fidp, R_ADDR(
+ dbmfp->dbmp->reginfo, dbmfp->mfp->fileid_off), DB_FILE_ID_LEN);
}
/*
@@ -644,14 +700,12 @@ __memp_refcnt(dbmfp, cntp)
db_pgno_t *cntp;
{
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
- dbmp = dbmfp->dbmp;
- dbenv = dbmp->dbenv;
+ dbenv = dbmfp->dbmp->dbenv;
- R_LOCK(dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &dbmfp->mfp->mutex);
*cntp = dbmfp->mfp->mpf_cnt;
- R_UNLOCK(dbenv, dbmp->reginfo);
+ MUTEX_UNLOCK(dbenv, &dbmfp->mfp->mutex);
}
/*
@@ -666,26 +720,16 @@ __memp_set_unlink(dbmpf, set)
DB_MPOOLFILE *dbmpf;
int set;
{
- DB_MPOOL *dbmp;
+ DB_ENV *dbenv;
- dbmp = dbmpf->dbmp;
+ dbenv = dbmpf->dbmp->dbenv;
- if (set) {
- R_LOCK(dbmp->dbenv, dbmp->reginfo);
+ MUTEX_LOCK(dbenv, &dbmpf->mfp->mutex);
+ if (set)
F_SET(dbmpf->mfp, MP_UNLINK);
- R_UNLOCK(dbmp->dbenv, dbmp->reginfo);
- } else {
- /*
- * This bit is protected in the queue code because the metapage
- * is locked, so we can avoid getting the region lock. If this
- * gets used from other than the queue code, we cannot.
- */
- if (F_ISSET(dbmpf->mfp, MP_UNLINK)) {
- R_LOCK(dbmp->dbenv, dbmp->reginfo);
- F_CLR(dbmpf->mfp, MP_UNLINK);
- R_UNLOCK(dbmp->dbenv, dbmp->reginfo);
- }
- }
+ else
+ F_CLR(dbmpf->mfp, MP_UNLINK);
+ MUTEX_UNLOCK(dbenv, &dbmpf->mfp->mutex);
}
/*
@@ -698,7 +742,7 @@ __memp_fclose(dbmfp, flags)
u_int32_t flags;
{
DB_ENV *dbenv;
- int ret;
+ int ret, t_ret;
dbenv = dbmfp->dbmp->dbenv;
@@ -708,79 +752,83 @@ __memp_fclose(dbmfp, flags)
* XXX
* DB_MPOOL_DISCARD: Undocumented flag: DB private.
*/
- if (flags != 0 && (ret = __db_fchk(dbenv,
- "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD)) != 0)
- return (ret);
+ ret = __db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD);
+
+ if ((t_ret = __memp_fclose_int(dbmfp, flags)) != 0 && ret == 0)
+ ret = t_ret;
- return (__memp_fclose_int(dbmfp, flags, 1));
+ return (ret);
}
/*
* __memp_fclose_int --
* Internal version of __memp_fclose.
*
- * PUBLIC: int __memp_fclose_int __P((DB_MPOOLFILE *, u_int32_t, int));
+ * PUBLIC: int __memp_fclose_int __P((DB_MPOOLFILE *, u_int32_t));
*/
int
-__memp_fclose_int(dbmfp, flags, needlock)
+__memp_fclose_int(dbmfp, flags)
DB_MPOOLFILE *dbmfp;
u_int32_t flags;
- int needlock;
{
DB_ENV *dbenv;
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
char *rpath;
- int ret, t_ret;
+ int deleted, ret, t_ret;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
ret = 0;
/*
- * Remove the DB_MPOOLFILE from the queue. This has to happen before
- * we perform any action that can fail, otherwise __memp_close may
- * loop infinitely when calling us to discard all of the DB_MPOOLFILEs.
+ * We have to reference count DB_MPOOLFILE structures as other threads
+ * in the process may be using them. Here's the problem:
+ *
+ * Thread A opens a database.
+ * Thread B uses thread A's DB_MPOOLFILE to write a buffer
+ * in order to free up memory in the mpool cache.
+ * Thread A closes the database while thread B is using the
+ * DB_MPOOLFILE structure.
+ *
+ * By opening all databases before creating any threads, and closing
+ * the databases after all the threads have exited, applications get
+ * better performance and avoid the problem path entirely.
+ *
+ * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer is a
+ * short-term lock, even in worst case, since we better be the only
+ * thread of control using the DB_MPOOLFILE structure to read pages
+ * *into* the cache. Wait until we're the only reference holder and
+ * remove the DB_MPOOLFILE structure from the list, so nobody else can
+ * find it. We do this, rather than have the last reference holder
+ * (whoever that might be) discard the DB_MPOOLFILE structure, because
+ * we'd rather write error messages to the application in the close
+ * routine, not in the checkpoint/sync routine.
+ *
+ * !!!
+ * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE
+ * file list, check the MP_OPEN_CALLED flag to be sure.
*/
- for (;;) {
+ for (deleted = 0;;) {
MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
-
- /*
- * We have to reference count DB_MPOOLFILE structures as other
- * threads may be using them. The problem only happens if the
- * application makes a bad design choice. Here's the path:
- *
- * Thread A opens a database.
- * Thread B uses thread A's DB_MPOOLFILE to write a buffer
- * in order to free up memory in the mpool cache.
- * Thread A closes the database while thread B is using the
- * DB_MPOOLFILE structure.
- *
- * By opening all databases before creating the threads, and
- * closing them after the threads have exited, applications
- * get better performance and avoid the problem path entirely.
- *
- * Regardless, holding the DB_MPOOLFILE to flush a dirty buffer
- * is a short-term lock, even in worst case, since we better be
- * the only thread of control using the DB_MPOOLFILE structure
- * to read pages *into* the cache. Wait until we're the only
- * reference holder and remove the DB_MPOOLFILE structure from
- * the list, so nobody else can even find it.
- */
if (dbmfp->ref == 1) {
- TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
- break;
+ if (F_ISSET(dbmfp, MP_OPEN_CALLED))
+ TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q);
+ deleted = 1;
}
MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
- (void)__os_sleep(dbenv, 1, 0);
+ if (deleted)
+ break;
+ __os_sleep(dbenv, 1, 0);
}
- MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
/* Complain if pinned blocks never returned. */
- if (dbmfp->pinref != 0)
+ if (dbmfp->pinref != 0) {
__db_err(dbenv, "%s: close: %lu blocks left pinned",
__memp_fn(dbmfp), (u_long)dbmfp->pinref);
+ ret = __db_panic(dbenv, DB_RUNRECOVERY);
+ }
/* Discard any mmap information. */
if (dbmfp->addr != NULL &&
@@ -789,7 +837,7 @@ __memp_fclose_int(dbmfp, flags, needlock)
/* Close the file; temporary files may not yet have been created. */
if (F_ISSET(dbmfp->fhp, DB_FH_VALID) &&
- (t_ret = __os_closehandle(dbmfp->fhp)) != 0) {
+ (t_ret = __os_closehandle(dbenv, dbmfp->fhp)) != 0) {
__db_err(dbenv, "%s: %s", __memp_fn(dbmfp), db_strerror(t_ret));
if (ret == 0)
ret = t_ret;
@@ -801,41 +849,51 @@ __memp_fclose_int(dbmfp, flags, needlock)
/*
* Discard our reference on the the underlying MPOOLFILE, and close
- * it if it's no longer useful to anyone.
- *
+ * it if it's no longer useful to anyone. It's possible the open of
+ * the file never happened or wasn't successful, in which case, mfp
+ * will be NULL.
+ */
+ if ((mfp = dbmfp->mfp) == NULL)
+ goto done;
+
+ /*
* If it's a temp file, all outstanding references belong to unflushed
* buffers. (A temp file can only be referenced by one DB_MPOOLFILE).
* We don't care about preserving any of those buffers, so mark the
* MPOOLFILE as dead so that even the dirty ones just get discarded
* when we try to flush them.
*/
- if ((mfp = dbmfp->mfp) == NULL)
- goto done;
- if (needlock)
- R_LOCK(dbenv, dbmp->reginfo);
+ deleted = 0;
+ MUTEX_LOCK(dbenv, &mfp->mutex);
if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) {
if (LF_ISSET(DB_MPOOL_DISCARD) ||
F_ISSET(mfp, MP_TEMP | MP_UNLINK))
- MEMP_FREMOVE(mfp);
+ MPOOLFILE_IGNORE(mfp);
if (F_ISSET(mfp, MP_UNLINK)) {
if ((t_ret = __db_appname(dbmp->dbenv,
- DB_APP_DATA, NULL, R_ADDR(dbmp->reginfo,
+ DB_APP_DATA, R_ADDR(dbmp->reginfo,
mfp->path_off), 0, NULL, &rpath)) != 0 && ret == 0)
ret = t_ret;
- if (t_ret == 0 && (t_ret =
- __os_unlink(dbmp->dbenv, rpath) != 0) && ret == 0)
+ if (t_ret == 0) {
+ if ((t_ret = __os_unlink(
+ dbmp->dbenv, rpath) != 0) && ret == 0)
+ ret = t_ret;
+ __os_free(dbenv, rpath);
+ }
+ }
+ if (mfp->block_cnt == 0) {
+ if ((t_ret =
+ __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0)
ret = t_ret;
- __os_free(dbenv, rpath, 0);
+ deleted = 1;
}
- if (mfp->block_cnt == 0)
- __memp_mf_discard(dbmp, mfp);
}
- if (needlock)
- R_UNLOCK(dbenv, dbmp->reginfo);
+ if (deleted == 0)
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
-done: /* Discard the DB_MPOOLFILE structure. */
- __os_free(dbenv, dbmfp->fhp, sizeof(DB_FH));
- __os_free(dbenv, dbmfp, sizeof(DB_MPOOLFILE));
+ /* Discard the DB_MPOOLFILE structure. */
+done: __os_free(dbenv, dbmfp->fhp);
+ __os_free(dbenv, dbmfp);
return (ret);
}
@@ -844,20 +902,69 @@ done: /* Discard the DB_MPOOLFILE structure. */
* __memp_mf_discard --
* Discard an MPOOLFILE.
*
- * PUBLIC: void __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
+ * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *));
*/
-void
+int
__memp_mf_discard(dbmp, mfp)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
{
+ DB_ENV *dbenv;
+ DB_FH fh;
+ DB_MPOOL_STAT *sp;
MPOOL *mp;
+ char *rpath;
+ int ret;
+ dbenv = dbmp->dbenv;
mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ /*
+ * Expects caller to be holding the MPOOLFILE mutex.
+ *
+ * When discarding a file, we have to flush writes from it to disk.
+ * The scenario is that dirty buffers from this file need to be
+ * flushed to satisfy a future checkpoint, but when the checkpoint
+ * calls mpool sync, the sync code won't know anything about them.
+ */
+ if (!F_ISSET(mfp, MP_DEADFILE) &&
+ (ret = __db_appname(dbenv, DB_APP_DATA,
+ R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
+ if ((ret = __os_open(dbenv, rpath, 0, 0, &fh)) == 0) {
+ ret = __os_fsync(dbenv, &fh);
+ (void)__os_closehandle(dbenv, &fh);
+ }
+ __os_free(dbenv, rpath);
+ }
+
+ /*
+ * We have to release the MPOOLFILE lock before acquiring the region
+ * lock so that we don't deadlock. Make sure nobody ever looks at
+ * this structure again.
+ */
+ MPOOLFILE_IGNORE(mfp);
+
+ /* Discard the mutex we're holding. */
+ MUTEX_UNLOCK(dbenv, &mfp->mutex);
/* Delete from the list of MPOOLFILEs. */
+ R_LOCK(dbenv, dbmp->reginfo);
SH_TAILQ_REMOVE(&mp->mpfq, mfp, q, __mpoolfile);
+ /* Copy the statistics into the region. */
+ sp = &mp->stat;
+ sp->st_cache_hit += mfp->stat.st_cache_hit;
+ sp->st_cache_miss += mfp->stat.st_cache_miss;
+ sp->st_map += mfp->stat.st_map;
+ sp->st_page_create += mfp->stat.st_page_create;
+ sp->st_page_in += mfp->stat.st_page_in;
+ sp->st_page_out += mfp->stat.st_page_out;
+
+ /* Clear the mutex this MPOOLFILE recorded. */
+ __db_shlocks_clear(&mfp->mutex, dbmp->reginfo,
+ (REGMAINT *)R_ADDR(dbmp->reginfo, mp->maint_off));
+
/* Free the space. */
if (mfp->path_off != 0)
__db_shalloc_free(dbmp->reginfo[0].addr,
@@ -869,6 +976,10 @@ __memp_mf_discard(dbmp, mfp)
__db_shalloc_free(dbmp->reginfo[0].addr,
R_ADDR(dbmp->reginfo, mfp->pgcookie_off));
__db_shalloc_free(dbmp->reginfo[0].addr, mfp);
+
+ R_UNLOCK(dbenv, dbmp->reginfo);
+
+ return (ret);
}
/*