Diffstat (limited to 'db')
-rw-r--r--  db/env/db_salloc.c    34
-rw-r--r--  db/mp/mp_sync.c      909
2 files changed, 445 insertions, 498 deletions
diff --git a/db/env/db_salloc.c b/db/env/db_salloc.c
index 4780107c5..881c63bbd 100644
--- a/db/env/db_salloc.c
+++ b/db/env/db_salloc.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_salloc.c,v 11.10 2000/12/06 19:55:44 ubell Exp $";
+static const char revid[] = "Id: db_salloc.c,v 11.15 2002/02/22 01:55:53 mjc Exp ";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -59,8 +59,8 @@ __db_shalloc_init(area, size)
}
/*
- * __db_shalloc --
- * Allocate some space from the shared region.
+ * __db_shalloc_size --
+ * Return the space needed for an allocation, including alignment.
*
* PUBLIC: int __db_shalloc_size __P((size_t, size_t));
*/
@@ -81,7 +81,7 @@ __db_shalloc_size(len, align)
if (align <= sizeof(db_align_t))
align = sizeof(db_align_t);
- return (ALIGN(len, align) + sizeof (struct __data));
+ return ((int)(ALIGN(len, align) + sizeof (struct __data)));
}
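The value returned above is just the rounded-up request plus the per-chunk header. A minimal sketch of that arithmetic, assuming ALIGN rounds its argument up to the next multiple of a power-of-two alignment (MY_ALIGN below is a hypothetical stand-in, not the region macro, and the 8-byte header size is only illustrative):

/*
 * MY_ALIGN(len, align): round len up to the next multiple of align,
 * where align is a power of two -- e.g. MY_ALIGN(13, 8) == 16.
 */
#define MY_ALIGN(len, align)	(((len) + (align) - 1) & ~((align) - 1))

/*
 * With len == 13, align == 8 and (for illustration) an 8-byte
 * struct __data header, __db_shalloc_size would report 16 + 8 == 24.
 */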
/*
@@ -284,28 +284,6 @@ __db_shalloc_free(regionp, ptr)
}
/*
- * __db_shalloc_count --
- * Return the amount of memory on the free list.
- *
- * PUBLIC: size_t __db_shalloc_count __P((void *));
- */
-size_t
-__db_shalloc_count(addr)
- void *addr;
-{
- struct __data *elp;
- size_t count;
-
- count = 0;
- for (elp = SH_LIST_FIRST((struct __head *)addr, __data);
- elp != NULL;
- elp = SH_LIST_NEXT(elp, links, __data))
- count += elp->len;
-
- return (count);
-}
-
-/*
* __db_shsizeof --
* Return the size of a shalloc'd piece of memory.
*
@@ -355,6 +333,6 @@ __db_shalloc_dump(addr, fp)
for (elp = SH_LIST_FIRST((struct __head *)addr, __data);
elp != NULL;
elp = SH_LIST_NEXT(elp, links, __data))
- fprintf(fp, "%#lx: %lu\t", (u_long)elp, (u_long)elp->len);
+ fprintf(fp, "%#lx: %lu\t", P_TO_ULONG(elp), (u_long)elp->len);
fprintf(fp, "\n");
}
diff --git a/db/mp/mp_sync.c b/db/mp/mp_sync.c
index 1b0751db7..8c176a385 100644
--- a/db/mp/mp_sync.c
+++ b/db/mp/mp_sync.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic Exp $";
+static const char revid[] = "Id: mp_sync.c,v 11.63 2002/08/13 18:30:07 bostic Exp ";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -16,339 +16,92 @@ static const char revid[] = "$Id: mp_sync.c,v 11.29 2001/01/11 18:19:53 bostic E
#include <stdlib.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_shash.h"
-#include "mp.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/mp.h"
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+typedef struct {
+ DB_MPOOL_HASH *track_hp; /* Hash bucket. */
+
+ roff_t track_off; /* Page file offset. */
+ db_pgno_t track_pgno; /* Page number. */
+} BH_TRACK;
static int __bhcmp __P((const void *, const void *));
-static int __memp_fsync __P((DB_MPOOLFILE *));
-static int __memp_sballoc __P((DB_ENV *, BH ***, u_int32_t *));
+static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *));
+static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *));
/*
- * memp_sync --
+ * __memp_sync --
* Mpool sync function.
+ *
+ * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *));
*/
int
-memp_sync(dbenv, lsnp)
+__memp_sync(dbenv, lsnp)
DB_ENV *dbenv;
DB_LSN *lsnp;
{
- BH *bhp, **bharray;
DB_MPOOL *dbmp;
- DB_LSN tlsn;
- MPOOL *c_mp, *mp;
- MPOOLFILE *mfp;
- u_int32_t ar_cnt, i, ndirty;
- int ret, retry_done, retry_need, wrote;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_sync(dbenv, lsnp));
-#endif
+ MPOOL *mp;
+ int ret;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->mp_handle, DB_INIT_MPOOL);
-
- dbmp = dbenv->mp_handle;
- mp = dbmp->reginfo[0].primary;
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "memp_sync", DB_INIT_MPOOL);
/*
- * If no LSN is provided, flush the entire cache.
- *
- * !!!
- * Our current behavior is to flush the entire cache, so there's
- * nothing special we have to do here other than deal with NULL
- * pointers.
+ * If no LSN is provided, flush the entire cache (reasonable usage
+ * even if there's no log subsystem configured).
*/
- if (lsnp == NULL) {
- ZERO_LSN(tlsn);
- lsnp = &tlsn;
- F_SET(mp, MP_LSN_RETRY);
- } else if (!LOGGING_ON(dbenv)) {
- __db_err(dbenv, "memp_sync: requires logging");
- return (EINVAL);
- }
+ if (lsnp != NULL)
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "memp_sync", DB_INIT_LOG);
- /*
- * Sync calls are single-threaded so that we don't have multiple
- * threads, with different checkpoint LSNs, walking the caches
- * and updating the checkpoint LSNs and how many buffers remain
- * to be written for the checkpoint. This shouldn't be a problem,
- * any application that has multiple checkpoint threads isn't what
- * I'd call trustworthy.
- */
- MUTEX_LOCK(dbenv, &mp->sync_mutex, dbenv->lockfhp);
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
- /*
- * If the application is asking about a previous call to memp_sync(),
- * and we haven't found any buffers that the application holding the
- * pin couldn't write, return yes or no based on the current count.
- * Note, if the application is asking about a LSN *smaller* than one
- * we've already handled or are currently handling, then we return a
- * result based on the count for the larger LSN.
- */
- R_LOCK(dbenv, dbmp->reginfo);
- if (!IS_ZERO_LSN(*lsnp) &&
- !F_ISSET(mp, MP_LSN_RETRY) && log_compare(lsnp, &mp->lsn) <= 0) {
- if (mp->lsn_cnt == 0) {
+ /* If we've flushed to the requested LSN, return that information. */
+ if (lsnp != NULL) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (log_compare(lsnp, &mp->lsn) <= 0) {
*lsnp = mp->lsn;
- ret = 0;
- } else
- ret = DB_INCOMPLETE;
+ R_UNLOCK(dbenv, dbmp->reginfo);
+ return (0);
+ }
R_UNLOCK(dbenv, dbmp->reginfo);
- MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
- return (ret);
}
- /*
- * Allocate room for a list of buffers, and decide how many buffers
- * we can pin down.
- *
- * !!!
- * Note: __memp_sballoc has released the region lock if we're not
- * continuing forward.
- */
- if ((ret =
- __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0) {
- MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
+ if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0)
return (ret);
- }
- retry_done = 0;
-retry: retry_need = 0;
- /*
- * Start a new checkpoint.
- *
- * Save the LSN. We know that it's a new LSN, a retry, or larger than
- * the one for which we were already doing a checkpoint. (BTW, I don't
- * expect to see multiple LSN's from the same or multiple processes,
- * but You Just Never Know. Responding as if they all called with the
- * largest of the LSNs specified makes everything work.)
- *
- * We don't currently use the LSN we save. We could potentially save
- * the last-written LSN in each buffer header and use it to determine
- * what buffers need to be written. The problem with this is that it's
- * sizeof(LSN) more bytes of buffer header. We currently write all the
- * dirty buffers instead, but with a sufficiently large cache that's
- * going to be a problem.
- */
- mp->lsn = *lsnp;
-
- /*
- * Clear the global count of buffers waiting to be written, walk the
- * list of files clearing the count of buffers waiting to be written.
- *
- * Clear the retry flag.
- */
- mp->lsn_cnt = 0;
- for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
- mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile))
- mfp->lsn_cnt = 0;
- F_CLR(mp, MP_LSN_RETRY);
-
- /*
- * Walk each cache's list of buffers and mark all dirty buffers to be
- * written and all pinned buffers to be potentially written (we can't
- * know if they'll need to be written until the holder returns them to
- * the cache). We do this in one pass while holding the region locked
- * so that processes can't make new buffers dirty, causing us to never
- * finish. Since the application may have restarted the sync using a
- * different LSN value, clear any BH_SYNC | BH_SYNC_LOGFLSH flags that
- * appear leftover from previous calls.
- *
- * Keep a count of the total number of buffers we need to write in
- * MPOOL->lsn_cnt, and for each file, in MPOOLFILE->lsn_count.
- */
- for (ar_cnt = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (F_ISSET(bhp, BH_DIRTY) || bhp->ref != 0) {
- F_SET(bhp, BH_SYNC);
-
- ++mp->lsn_cnt;
-
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
- ++mfp->lsn_cnt;
-
- /*
- * If the buffer isn't being used, we can write
- * it immediately, so increment its reference
- * count to lock it down, and save a reference
- * to it.
- *
- * If we've run out space to store buffer refs,
- * we're screwed. We don't want to realloc the
- * array while holding a region lock, so we set
- * a flag and deal with it later.
- */
- if (bhp->ref == 0) {
- ++bhp->ref;
- bharray[ar_cnt] = bhp;
-
- if (++ar_cnt >= ndirty) {
- retry_need = 1;
- break;
- }
- }
- } else
- if (F_ISSET(bhp, BH_SYNC))
- F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
- }
- if (ar_cnt >= ndirty)
- break;
- }
-
- /* If there no buffers we can write immediately, we're done. */
- if (ar_cnt == 0) {
- ret = mp->lsn_cnt ? DB_INCOMPLETE : 0;
- goto done;
- }
-
- R_UNLOCK(dbenv, dbmp->reginfo);
-
- /*
- * Sort the buffers we're going to write immediately.
- *
- * We try and write the buffers in file/page order: it should reduce
- * seeks by the underlying filesystem and possibly reduce the actual
- * number of writes.
- */
- if (ar_cnt > 1)
- qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
-
- /*
- * Flush the log. We have to ensure the log records reflecting the
- * changes on the database pages we're writing have already made it
- * to disk. We usually do that as we write each page, but if we
- * are going to write a large number of pages, repeatedly acquiring
- * the log region lock is going to be expensive. Flush the entire
- * log now, so that sync doesn't require any more log flushes.
- */
- if (LOGGING_ON(dbenv) && (ret = log_flush(dbenv, NULL)) != 0)
- goto done;
-
- R_LOCK(dbenv, dbmp->reginfo);
-
- /* Walk the array, writing buffers. */
- for (i = 0; i < ar_cnt; ++i) {
- /*
- * It's possible for a thread to have gotten the buffer since
- * we listed it for writing. If the reference count is still
- * 1, we're the only ones using the buffer, go ahead and write.
- * If it's >1, then skip the buffer and assume that it will be
- * written when it's returned to the cache.
- */
- if (bharray[i]->ref > 1) {
- --bharray[i]->ref;
- continue;
- }
-
- /* Write the buffer. */
- mfp = R_ADDR(dbmp->reginfo, bharray[i]->mf_offset);
- ret = __memp_bhwrite(dbmp, mfp, bharray[i], NULL, &wrote);
-
- /* Release the buffer. */
- --bharray[i]->ref;
-
- if (ret == 0 && wrote)
- continue;
-
- /*
- * Any process syncing the shared memory buffer pool had best
- * be able to write to any underlying file. Be understanding,
- * but firm, on this point.
- */
- if (ret == 0) {
- __db_err(dbenv, "%s: unable to flush page: %lu",
- __memp_fns(dbmp, mfp), (u_long)bharray[i]->pgno);
- ret = EPERM;
- }
-
- /*
- * On error, clear MPOOL->lsn and set MP_LSN_RETRY so that no
- * future checkpoint return can depend on this failure. Clear
- * the buffer's BH_SYNC flag, because it's used to determine
- * if lsn_cnt values are incremented/decremented. Don't bother
- * to reset/clear:
- *
- * MPOOL->lsn_cnt
- * MPOOLFILE->lsn_cnt
- *
- * they don't make any difference.
- */
- ZERO_LSN(mp->lsn);
- F_SET(mp, MP_LSN_RETRY);
-
- /* Release any buffers we're still pinning down. */
- while (++i < ar_cnt) {
- bhp = bharray[i];
- --bhp->ref;
- F_CLR(bhp, BH_SYNC | BH_SYNC_LOGFLSH);
- }
-
- goto done;
- }
-
- ret = mp->lsn_cnt != 0 ? DB_INCOMPLETE : 0;
-
- /*
- * If there were too many buffers and we're not returning an error, we
- * re-try the checkpoint once -- since we allocated 80% of the total
- * buffer count, once should be enough. If it still doesn't work, some
- * other thread of control is dirtying buffers as fast as we're writing
- * them, and we might as well give up for now. In the latter case, set
- * the global retry flag, we'll have to start from scratch on the next
- * checkpoint.
- */
- if (retry_need) {
- if (retry_done) {
- ret = DB_INCOMPLETE;
- F_SET(mp, MP_LSN_RETRY);
- } else {
- retry_done = 1;
- goto retry;
- }
+ if (lsnp != NULL) {
+ R_LOCK(dbenv, dbmp->reginfo);
+ if (log_compare(lsnp, &mp->lsn) > 0)
+ mp->lsn = *lsnp;
+ R_UNLOCK(dbenv, dbmp->reginfo);
}
-done: R_UNLOCK(dbenv, dbmp->reginfo);
- MUTEX_UNLOCK(dbenv, &mp->sync_mutex);
-
- __os_free(bharray, ndirty * sizeof(BH *));
-
- return (ret);
+ return (0);
}
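For context, applications reach this code through the environment handle; a minimal usage sketch, assuming an already opened DB_ENV and with error handling abbreviated:

/*
 * Flush the whole cache: per the change above, a NULL LSN is legal
 * even when no log subsystem is configured.
 */
int ret;

if ((ret = dbenv->memp_sync(dbenv, NULL)) != 0)
	dbenv->err(dbenv, ret, "DB_ENV->memp_sync");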
/*
- * memp_fsync --
+ * __memp_fsync --
* Mpool file sync function.
+ *
+ * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *));
*/
int
-memp_fsync(dbmfp)
+__memp_fsync(dbmfp)
DB_MPOOLFILE *dbmfp;
{
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- int is_tmp;
dbmp = dbmfp->dbmp;
dbenv = dbmp->dbenv;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_memp_fsync(dbmfp));
-#endif
-
PANIC_CHECK(dbenv);
/*
@@ -359,13 +112,10 @@ memp_fsync(dbmfp)
if (F_ISSET(dbmfp, MP_READONLY))
return (0);
- R_LOCK(dbenv, dbmp->reginfo);
- is_tmp = F_ISSET(dbmfp->mfp, MP_TEMP);
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (is_tmp)
+ if (F_ISSET(dbmfp->mfp, MP_TEMP))
return (0);
- return (__memp_fsync(dbmfp));
+ return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
}
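At the application level this path is normally reached through the memory-pool file handle; a minimal sketch, assuming an open DB_MPOOLFILE whose sync method resolves to the function above:

/* Flush every dirty page for this one file, then fsync it. */
int ret;

if ((ret = dbmfp->sync(dbmfp)) != 0)
	dbenv->err(dbenv, ret, "DB_MPOOLFILE->sync");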
/*
@@ -379,6 +129,7 @@ __mp_xxx_fh(dbmfp, fhp)
DB_MPOOLFILE *dbmfp;
DB_FH **fhp;
{
+ DB_ENV *dbenv;
/*
* This is a truly spectacular layering violation, intended ONLY to
* support compatibility for the DB 1.85 DB->fd call.
@@ -393,239 +144,457 @@ __mp_xxx_fh(dbmfp, fhp)
* because we want to write to the backing file regardless so that
* we get a file descriptor to return.
*/
- *fhp = &dbmfp->fh;
- return (F_ISSET(&dbmfp->fh, DB_FH_VALID) ? 0 : __memp_fsync(dbmfp));
+ *fhp = dbmfp->fhp;
+ if (F_ISSET(dbmfp->fhp, DB_FH_VALID))
+ return (0);
+ dbenv = dbmfp->dbmp->dbenv;
+
+ return (__memp_sync_int(dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
}
/*
- * __memp_fsync --
- * Mpool file internal sync function.
+ * __memp_sync_int --
+ * Mpool sync internal function.
+ *
+ * PUBLIC: int __memp_sync_int
+ * PUBLIC: __P((DB_ENV *, DB_MPOOLFILE *, int, db_sync_op, int *));
*/
-static int
-__memp_fsync(dbmfp)
+int
+__memp_sync_int(dbenv, dbmfp, ar_max, op, wrotep)
+ DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
+ int ar_max, *wrotep;
+ db_sync_op op;
{
- BH *bhp, **bharray;
- DB_ENV *dbenv;
+ BH *bhp;
+ BH_TRACK *bharray;
DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ DB_MUTEX *mutexp;
MPOOL *c_mp, *mp;
- size_t mf_offset;
- u_int32_t ar_cnt, i, ndirty;
- int incomplete, ret, retry_done, retry_need, wrote;
+ MPOOLFILE *mfp;
+ u_int32_t n_cache;
+ int ar_cnt, hb_lock, i, pass, remaining, ret, t_ret, wait_cnt, wrote;
- dbmp = dbmfp->dbmp;
- dbenv = dbmp->dbenv;
+ dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
-
- R_LOCK(dbenv, dbmp->reginfo);
+ pass = wrote = 0;
/*
- * Allocate room for a list of buffers, and decide how many buffers
- * we can pin down.
- *
- * !!!
- * Note: __memp_sballoc has released our region lock if we're not
- * continuing forward.
+ * If the caller does not specify how many pages, assume one
+ * per bucket.
*/
+ if (ar_max == 0)
+ ar_max = mp->nreg * mp->htab_buckets;
+
if ((ret =
- __memp_sballoc(dbenv, &bharray, &ndirty)) != 0 || ndirty == 0)
+ __os_malloc(dbenv, ar_max * sizeof(BH_TRACK), &bharray)) != 0)
return (ret);
- retry_done = 0;
-retry: retry_need = 0;
/*
* Walk each cache's list of buffers and mark all dirty buffers to be
- * written and all pinned buffers to be potentially written (we can't
- * know if they'll need to be written until the holder returns them to
- * the cache). We do this in one pass while holding the region locked
- * so that processes can't make new buffers dirty, causing us to never
- * finish.
+ * written and all pinned buffers to be potentially written, depending
+ * on our flags.
*/
- mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
- for (ar_cnt = 0, incomplete = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- for (bhp = SH_TAILQ_FIRST(&c_mp->bhq, __bh);
- bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, q, __bh)) {
- if (!F_ISSET(bhp, BH_DIRTY) ||
- bhp->mf_offset != mf_offset)
- continue;
- if (bhp->ref != 0 || F_ISSET(bhp, BH_LOCKED)) {
- incomplete = 1;
- continue;
- }
+ for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) {
+ c_mp = dbmp->reginfo[n_cache].primary;
+ hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
+ for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
/*
- * If the buffer isn't being used, we can write
- * it immediately, so increment its reference
- * count to lock it down, and save a reference
- * to it.
- *
- * If we've run out space to store buffer refs,
- * we're screwed. We don't want to realloc the
- * array while holding a region lock, so we set
- * a flag and deal with it later.
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero. We
+ * can ignore empty buckets because we only need to write
+ * buffers that were dirty before we started.
*/
- ++bhp->ref;
- bharray[ar_cnt] = bhp;
- if (++ar_cnt >= ndirty) {
- retry_need = 1;
- break;
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+
+ MUTEX_LOCK(dbenv, &hp->hash_mutex);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
+ /* Always ignore unreferenced, clean pages. */
+ if (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))
+ continue;
+
+ /*
+ * Checkpoints have to wait on all pinned pages,
+ * as pages may be marked dirty when returned to
+ * the cache.
+ *
+ * File syncs only wait on pages both pinned and
+ * dirty. (We don't care if pages are marked
+ * dirty when returned to the cache, that means
+ * there's another writing thread and flushing
+ * the cache for this handle is meaningless.)
+ */
+ if (op == DB_SYNC_FILE &&
+ !F_ISSET(bhp, BH_DIRTY))
+ continue;
+
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * Ignore temporary files -- this means you
+ * can't even flush temporary files by handle.
+ * (Checkpoint doesn't require temporary files
+ * be flushed and the underlying buffer
+ * write routine may not be able to write it
+ * anyway.)
+ */
+ if (F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * If we're flushing a specific file, see if
+ * this page is from that file.
+ */
+ if (dbmfp != NULL && mfp != dbmfp->mfp)
+ continue;
+
+ /*
+ * Ignore files that aren't involved in DB's
+ * transactional operations during checkpoints.
+ */
+ if (dbmfp == NULL && mfp->lsn_off == -1)
+ continue;
+
+ /* Track the buffer, we want it. */
+ bharray[ar_cnt].track_hp = hp;
+ bharray[ar_cnt].track_pgno = bhp->pgno;
+ bharray[ar_cnt].track_off = bhp->mf_offset;
+ ar_cnt++;
+
+ if (ar_cnt >= ar_max) {
+ if ((ret = __os_realloc(dbenv,
+ (ar_max * 2) * sizeof(BH_TRACK),
+ &bharray)) != 0)
+ break;
+ ar_max *= 2;
+ }
}
+ MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
+
+ if (ret != 0)
+ goto err;
}
- if (ar_cnt >= ndirty)
- break;
}
- /* If there no buffers we can write immediately, we're done. */
- if (ar_cnt == 0) {
- ret = 0;
+ /* If there are no buffers to write, we're done. */
+ if (ar_cnt == 0)
goto done;
- }
- R_UNLOCK(dbenv, dbmp->reginfo);
-
- /* Sort the buffers we're going to write. */
+ /*
+ * Write the buffers in file/page order, trying to reduce seeks by the
+ * filesystem and, when pages are smaller than filesystem block sizes,
+ * reduce the actual number of writes.
+ */
if (ar_cnt > 1)
- qsort(bharray, ar_cnt, sizeof(BH *), __bhcmp);
+ qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp);
- R_LOCK(dbenv, dbmp->reginfo);
+ /*
+ * If we're trickling buffers, only write enough to reach the correct
+ * percentage for this region. We may not write enough if the dirty
+ * buffers have an unbalanced distribution among the regions, but that
+ * seems unlikely.
+ */
+ if (op == DB_SYNC_TRICKLE && ar_cnt > ar_max / (int)mp->nreg)
+ ar_cnt = ar_max / (int)mp->nreg;
+
+ /*
+ * Flush the log. We have to ensure the log records reflecting the
+ * changes on the database pages we're writing have already made it
+ * to disk. We still have to check the log each time we write a page
+ * (because pages we are about to write may be modified after we have
+ * flushed the log), but in general this will at least avoid any I/O
+ * on the log's part.
+ */
+ if (LOGGING_ON(dbenv) && (ret = dbenv->log_flush(dbenv, NULL)) != 0)
+ goto err;
+
+ /*
+ * Walk the array, writing buffers. When we write a buffer, we NULL
+ * out its hash bucket pointer so we don't process a slot more than
+ * once.
+ */
+ for (remaining = ar_cnt, i = pass = 0; remaining > 0; ++i) {
+ if (i >= ar_cnt) {
+ i = 0;
+ ++pass;
+ __os_sleep(dbenv, 1, 0);
+ }
+ if ((hp = bharray[i].track_hp) == NULL)
+ continue;
+
+ /* Lock the hash bucket and find the buffer. */
+ mutexp = &hp->hash_mutex;
+ MUTEX_LOCK(dbenv, mutexp);
+ for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
+ bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh))
+ if (bhp->pgno == bharray[i].track_pgno &&
+ bhp->mf_offset == bharray[i].track_off)
+ break;
- /* Walk the array, writing buffers. */
- for (i = 0; i < ar_cnt;) {
/*
- * It's possible for a thread to have gotten the buffer since
- * we listed it for writing. If the reference count is still
- * 1, we're the only ones using the buffer, go ahead and write.
- * If it's >1, then skip the buffer and assume that it will be
- * written when it's returned to the cache.
+ * If we can't find the buffer we're done, somebody else had
+ * to have written it.
+ *
+ * If the buffer isn't pinned or dirty, we're done, there's
+ * no work needed.
*/
- if (bharray[i]->ref > 1) {
- incomplete = 1;
- --bharray[i++]->ref;
+ if (bhp == NULL || (bhp->ref == 0 && !F_ISSET(bhp, BH_DIRTY))) {
+ MUTEX_UNLOCK(dbenv, mutexp);
+ --remaining;
+ bharray[i].track_hp = NULL;
continue;
}
- /* Write the buffer. */
- ret = __memp_pgwrite(dbmp, dbmfp, bharray[i], NULL, &wrote);
+ /*
+ * If the buffer is locked by another thread, ignore it, we'll
+ * come back to it.
+ *
+ * If the buffer is pinned and it's only the first or second
+ * time we have looked at it, ignore it, we'll come back to
+ * it.
+ *
+ * In either case, skip the buffer if we're not required to
+ * write it.
+ */
+ if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
+ MUTEX_UNLOCK(dbenv, mutexp);
+ if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) {
+ --remaining;
+ bharray[i].track_hp = NULL;
+ }
+ continue;
+ }
+
+ /*
+ * The buffer is either pinned or dirty.
+ *
+ * Set the sync wait-for count, used to count down outstanding
+ * references to this buffer as they are returned to the cache.
+ */
+ bhp->ref_sync = bhp->ref;
- /* Release the buffer. */
- --bharray[i++]->ref;
+ /* Pin the buffer into memory and lock it. */
+ ++bhp->ref;
+ F_SET(bhp, BH_LOCKED);
+ MUTEX_LOCK(dbenv, &bhp->mutex);
- if (ret == 0) {
- if (!wrote)
- incomplete = 1;
- continue;
+ /*
+ * Unlock the hash bucket and wait for the wait-for count to
+ * go to 0. No new thread can acquire the buffer because we
+ * have it locked.
+ *
+ * If a thread attempts to re-pin a page, the wait-for count
+ * will never go to 0 (the thread spins on our buffer lock,
+ * while we spin on the thread's ref count). Give up if we
+ * don't get the buffer in 3 seconds, we can try again later.
+ *
+ * If, when the wait-for count goes to 0, the buffer is found
+ * to be dirty, write it.
+ */
+ MUTEX_UNLOCK(dbenv, mutexp);
+ for (wait_cnt = 1;
+ bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
+ __os_sleep(dbenv, 1, 0);
+ MUTEX_LOCK(dbenv, mutexp);
+ hb_lock = 1;
+
+ /*
+ * If the ref_sync count has gone to 0, we're going to be done
+ * with this buffer no matter what happens.
+ */
+ if (bhp->ref_sync == 0) {
+ --remaining;
+ bharray[i].track_hp = NULL;
}
/*
- * On error:
+ * If the ref_sync count has gone to 0 and the buffer is still
+ * dirty, we write it. We only try to write the buffer once.
+ * Any process checkpointing or trickle-flushing the pool
+ * must be able to write any underlying file -- if the write
+ * fails, error out. It would be very strange if file sync
+ * failed to write, but we don't care if it happens.
+ */
+ if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
+ hb_lock = 0;
+ MUTEX_UNLOCK(dbenv, mutexp);
+
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ if ((ret = __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
+ ++wrote;
+ else if (op == DB_SYNC_CACHE || op == DB_SYNC_TRICKLE)
+ __db_err(dbenv, "%s: unable to flush page: %lu",
+ __memp_fns(dbmp, mfp), (u_long)bhp->pgno);
+ else
+ ret = 0;
+ }
+
+ /*
+ * If ref_sync count never went to 0, the buffer was written
+ * by another thread, or the write failed, we still have the
+ * buffer locked.
+ *
+ * We may or may not currently hold the hash bucket mutex. If
+ * the __memp_bhwrite -> __memp_pgwrite call was successful,
+ * then __memp_pgwrite will have swapped the buffer lock for
+ * the hash lock. All other call paths will leave us without
+ * the hash bucket lock.
*
- * Release any buffers we're still pinning down.
+ * The order of mutexes above was to acquire the buffer lock
+ * while holding the hash bucket lock. Don't deadlock here,
+ * release the buffer lock and then acquire the hash bucket
+ * lock.
*/
- while (i < ar_cnt)
- --bharray[i++]->ref;
- break;
- }
+ if (F_ISSET(bhp, BH_LOCKED)) {
+ F_CLR(bhp, BH_LOCKED);
+ MUTEX_UNLOCK(dbenv, &bhp->mutex);
- /*
- * If there were too many buffers and we're not returning an error, we
- * re-try the flush once -- since we allocated 80% of the total
- * buffer count, once should be enough. If it still doesn't work, some
- * other thread of control is dirtying buffers as fast as we're writing
- * them, and we might as well give up.
- */
- if (retry_need) {
- if (retry_done)
- incomplete = 1;
- else {
- retry_done = 1;
- goto retry;
+ if (!hb_lock)
+ MUTEX_LOCK(dbenv, mutexp);
}
- }
-done: R_UNLOCK(dbenv, dbmp->reginfo);
+ /*
+ * Reset the ref_sync count regardless of our success, we're
+ * done with this buffer for now.
+ */
+ bhp->ref_sync = 0;
+
+ /* Discard our reference and unlock the bucket. */
+ --bhp->ref;
+ MUTEX_UNLOCK(dbenv, mutexp);
- __os_free(bharray, ndirty * sizeof(BH *));
+ if (ret != 0)
+ break;
+ }
+
+done: /* If we've opened files to flush pages, close them. */
+ if ((t_ret = __memp_close_flush_files(dbenv, dbmp)) != 0 && ret == 0)
+ ret = t_ret;
/*
- * Sync the underlying file as the last thing we do, so that the OS
- * has a maximal opportunity to flush buffers before we request it.
- *
- * !!!:
- * Don't lock the region around the sync, fsync(2) has no atomicity
- * issues.
+ * If doing a checkpoint or flushing a file for the application, we
+ * have to force the pages to disk. We don't do this as we go along
+ * because we want to give the OS as much time as possible to lazily
+ * flush, and because we have to flush files that might not even have
+ * had dirty buffers in the cache, so we have to walk the files list.
*/
- if (ret == 0)
- ret = incomplete ?
- DB_INCOMPLETE : __os_fsync(dbenv, &dbmfp->fh);
+ if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
+ if (dbmfp == NULL)
+ ret = __memp_sync_files(dbenv, dbmp);
+ else
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ }
+
+err: __os_free(dbenv, bharray);
+ if (wrotep != NULL)
+ *wrotep = wrote;
return (ret);
}
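A hypothetical internal caller, showing how the new arguments fit together: ar_max bounds the initial tracking array, op selects the wait and flush policy, and wrotep reports how many buffers were pushed out (the values here are invented):

int nwrote, ret;

/*
 * Trickle-style request: start with room to track 64 buffers and
 * report how many were written; DB_SYNC_TRICKLE skips the final
 * fsync pass.
 */
ret = __memp_sync_int(dbenv, NULL, 64, DB_SYNC_TRICKLE, &nwrote);

/*
 * Checkpoint-style request: write every dirty buffer in the cache and
 * force the underlying files to disk via __memp_sync_files().
 */
if (ret == 0)
	ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL);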
/*
- * __memp_sballoc --
- * Allocate room for a list of buffers.
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
*/
-static int
-__memp_sballoc(dbenv, bharrayp, ndirtyp)
+static int
+__memp_sync_files(dbenv, dbmp)
DB_ENV *dbenv;
- BH ***bharrayp;
- u_int32_t *ndirtyp;
-{
DB_MPOOL *dbmp;
- MPOOL *c_mp, *mp;
- u_int32_t i, nclean, ndirty, maxpin;
- int ret;
+{
+ DB_MPOOLFILE *dbmfp;
+ MPOOL *mp;
+ MPOOLFILE *mfp;
+ int ret, t_ret;
- dbmp = dbenv->mp_handle;
+ ret = 0;
mp = dbmp->reginfo[0].primary;
- /*
- * We don't want to hold the region lock while we write the buffers,
- * so only lock it while we create a list.
- *
- * Walk through the list of caches, figuring out how many buffers
- * we're going to need.
- *
- * Make a point of not holding the region lock across the library
- * allocation call.
- */
- for (nclean = ndirty = 0, i = 0; i < mp->nreg; ++i) {
- c_mp = dbmp->reginfo[i].primary;
- ndirty += c_mp->stat.st_page_dirty;
- nclean += c_mp->stat.st_page_clean;
+ R_LOCK(dbenv, dbmp->reginfo);
+ for (mfp = SH_TAILQ_FIRST(&mp->mpfq, __mpoolfile);
+ mfp != NULL; mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile)) {
+ if (mfp->stat.st_page_out == 0 ||
+ F_ISSET(mfp, MP_DEADFILE | MP_TEMP))
+ continue;
+
+ /* Look for an already open handle. */
+ ret = 0;
+ MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (dbmfp->mfp == mfp) {
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ break;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if (ret != 0)
+ goto err;
+
+ /* If we don't find one, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = dbenv->memp_fcreate(dbenv, &dbmfp, 0)) != 0)
+ goto err;
+ ret = __memp_fopen_int(
+ dbmfp, mfp, R_ADDR(dbmp->reginfo, mfp->path_off),
+ 0, 0, mfp->stat.st_pagesize);
+ if (ret == 0)
+ ret = __os_fsync(dbenv, dbmfp->fhp);
+ if ((t_ret =
+ __memp_fclose_int(dbmfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err;
+ }
}
- R_UNLOCK(dbenv, dbmp->reginfo);
- if (ndirty == 0) {
- *ndirtyp = 0;
- return (0);
+
+ if (0) {
+err: __db_err(dbenv, "%s: cannot sync: %s",
+ R_ADDR(dbmp->reginfo, mfp->path_off), db_strerror(ret));
}
+ R_UNLOCK(dbenv, dbmp->reginfo);
- /*
- * We don't want to pin down the entire buffer cache, otherwise we'll
- * starve threads needing new pages. Don't pin down more than 80% of
- * the cache, making sure that we don't screw up just because only a
- * few pages have been created.
- */
- maxpin = ((ndirty + nclean) * 8) / 10;
- if (maxpin < 10)
- maxpin = 10;
+ return (ret);
+}
+
+/*
+ * __memp_close_flush_files --
+ * Close files opened only to flush buffers.
+ */
+static int
+__memp_close_flush_files(dbenv, dbmp)
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+{
+ DB_MPOOLFILE *dbmfp;
+ int ret;
/*
- * Get a good-sized block of memory to hold buffer pointers, we don't
- * want to run out, but correct if we want to allocate more than we
- * would be allowed to store, regardless.
+ * The routine exists because we must close files opened by sync to
+ * flush buffers. There are two cases: first, extent files have to
+ * be closed so they may be removed when empty. Second, regular
+ * files have to be closed so we don't run out of descriptors (for
+ * example, an application partitioning its data into databases
+ * based on timestamps, so there's a continually increasing set of
+ * files).
+ *
+ * We mark files opened in the __memp_bhwrite() function with the
+ * MP_FLUSH flag. Here we walk through our file descriptor list,
+ * and, if a file was opened by __memp_bhwrite(), we close it.
*/
- ndirty += ndirty / 2 + 10;
- if (ndirty > maxpin)
- ndirty = maxpin;
- if ((ret =
- __os_malloc(dbenv, ndirty * sizeof(BH *), NULL, bharrayp)) != 0)
- return (ret);
-
- *ndirtyp = ndirty;
-
- R_LOCK(dbenv, dbmp->reginfo);
+retry: MUTEX_THREAD_LOCK(dbenv, dbmp->mutexp);
+ for (dbmfp = TAILQ_FIRST(&dbmp->dbmfq);
+ dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q))
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
+ if ((ret = __memp_fclose_int(dbmfp, 0)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_THREAD_UNLOCK(dbenv, dbmp->mutexp);
return (0);
}
@@ -634,15 +603,15 @@ static int
__bhcmp(p1, p2)
const void *p1, *p2;
{
- BH *bhp1, *bhp2;
+ BH_TRACK *bhp1, *bhp2;
- bhp1 = *(BH * const *)p1;
- bhp2 = *(BH * const *)p2;
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
/* Sort by file (shared memory pool offset). */
- if (bhp1->mf_offset < bhp2->mf_offset)
+ if (bhp1->track_off < bhp2->track_off)
return (-1);
- if (bhp1->mf_offset > bhp2->mf_offset)
+ if (bhp1->track_off > bhp2->track_off)
return (1);
/*
@@ -650,9 +619,9 @@ __bhcmp(p1, p2)
* Defend against badly written quicksort code calling the comparison
* function with two identical pointers (e.g., WATCOM C++ (Power++)).
*/
- if (bhp1->pgno < bhp2->pgno)
+ if (bhp1->track_pgno < bhp2->track_pgno)
return (-1);
- if (bhp1->pgno > bhp2->pgno)
+ if (bhp1->track_pgno > bhp2->track_pgno)
return (1);
return (0);
}
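To illustrate the ordering __bhcmp imposes on the tracking array, a small sketch using the BH_TRACK layout defined above (the offsets and page numbers are invented):

BH_TRACK a[] = {
	{ NULL, 2048, 7 },	/* file at region offset 2048, page 7 */
	{ NULL, 1024, 9 },	/* file at region offset 1024, page 9 */
	{ NULL, 1024, 3 },	/* file at region offset 1024, page 3 */
};

qsort(a, sizeof(a) / sizeof(a[0]), sizeof(BH_TRACK), __bhcmp);

/*
 * Result: (1024, 3), (1024, 9), (2048, 7) -- buffers are grouped by
 * file and written in ascending page order, front to back.
 */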