Diffstat (limited to 'db/mp')
-rw-r--r--  db/mp/Design        |  52
-rw-r--r--  db/mp/mp_alloc.c    | 118
-rw-r--r--  db/mp/mp_bh.c       |  47
-rw-r--r--  db/mp/mp_fget.c     |  84
-rw-r--r--  db/mp/mp_fmethod.c  |  22
-rw-r--r--  db/mp/mp_fopen.c    | 291
-rw-r--r--  db/mp/mp_fput.c     | 120
-rw-r--r--  db/mp/mp_fset.c     | 105
-rw-r--r--  db/mp/mp_method.c   | 243
-rw-r--r--  db/mp/mp_mvcc.c     |  63
-rw-r--r--  db/mp/mp_region.c   | 308
-rw-r--r--  db/mp/mp_register.c |   5
-rw-r--r--  db/mp/mp_resize.c   | 559
-rw-r--r--  db/mp/mp_stat.c     |  35
-rw-r--r--  db/mp/mp_sync.c     | 327
-rw-r--r--  db/mp/mp_trickle.c  |  19
16 files changed, 1592 insertions, 806 deletions
diff --git a/db/mp/Design b/db/mp/Design
deleted file mode 100644
index 1b26aae6c..000000000
--- a/db/mp/Design
+++ /dev/null
@@ -1,52 +0,0 @@
-$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
-
-There are three ways we do locking in the mpool code:
-
-Locking a handle mutex to provide concurrency for DB_THREAD operations.
-Locking the region mutex to provide mutual exclusion while reading and
- writing structures in the shared region.
-Locking buffer header mutexes during I/O.
-
-The first will not be further described here. We use the shared mpool
-region lock to provide mutual exclusion while reading/modifying all of
-the data structures, including the buffer headers. We use a per-buffer
-header lock to wait on buffer I/O. The order of locking is as follows:
-
-Searching for a buffer:
- Acquire the region lock.
- Find the buffer header.
- Increment the reference count (guarantee the buffer stays).
- While the BH_LOCKED flag is set (I/O is going on) {
- Release the region lock.
- Explicitly yield the processor if it's not the first pass
- through this loop, otherwise, we can simply spin because
- we'll be simply switching between the two locks.
- Request the buffer lock.
- The I/O will complete...
- Acquire the buffer lock.
- Release the buffer lock.
- Acquire the region lock.
- }
- Return the buffer.
-
-Reading/writing a buffer:
- Acquire the region lock.
- Find/create the buffer header.
- If reading, increment the reference count (guarantee the buffer stays).
- Set the BH_LOCKED flag.
- Acquire the buffer lock (guaranteed not to block).
- Release the region lock.
- Do the I/O and/or initialize the buffer contents.
- Release the buffer lock.
- At this point, the buffer lock is available, but the logical
- operation (flagged by BH_LOCKED) is not yet completed. For
- this reason, among others, threads checking the BH_LOCKED flag
- must loop around their test.
- Acquire the region lock.
- Clear the BH_LOCKED flag.
- Release the region lock.
- Return/discard the buffer.
-
-Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
-not reacquired when a region lock is reacquired because they couldn't
-have been closed/discarded and because they never move in memory.
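
Note: the Design file above is deleted by this change.  For readers of the
diff, its buffer-search protocol can be restated as a C-like sketch.  This
is a paraphrase of the deleted text only; find_header(), yield_processor()
and the LOCK_*/UNLOCK_* calls are hypothetical stand-ins for the real
region and buffer-header mutex primitives.

    /* Sketch of the deleted Design text's "Searching for a buffer". */
    BH *
    search_buffer(DB_MPOOL_HASH *hp, db_pgno_t pgno)
    {
            BH *bhp;
            int first = 1;

            LOCK_REGION();
            bhp = find_header(hp, pgno);    /* hypothetical lookup helper */
            ++bhp->ref;                     /* guarantee the buffer stays */
            while (F_ISSET(bhp, BH_LOCKED)) {       /* I/O in progress */
                    UNLOCK_REGION();
                    if (!first)                     /* avoid ping-ponging */
                            yield_processor();      /* between the locks */
                    first = 0;
                    LOCK_BUFFER(bhp);       /* blocks until I/O completes */
                    UNLOCK_BUFFER(bhp);
                    LOCK_REGION();          /* loop: recheck BH_LOCKED */
            }
            UNLOCK_REGION();
            return (bhp);
    }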
diff --git a/db/mp/mp_alloc.c b/db/mp/mp_alloc.c
index 0619d5ccf..c18e62dff 100644
--- a/db/mp/mp_alloc.c
+++ b/db/mp/mp_alloc.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_alloc.c,v 12.20 2006/09/07 15:11:26 mjc Exp $
+ * $Id: mp_alloc.c,v 12.33 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -38,7 +37,6 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
MPOOL *c_mp;
MPOOLFILE *bh_mfp;
size_t freed_space;
- db_mutex_t mutex;
u_int32_t buckets, buffers, high_priority, priority;
u_int32_t put_counter, total_buckets;
int aggressive, alloc_freeze, giveup, got_oldest, ret;
@@ -54,7 +52,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
aggressive = alloc_freeze = giveup = got_oldest = 0;
hp_tmp = NULL;
- c_mp->stat.st_alloc++;
+ STAT(c_mp->stat.st_alloc++);
/*
* If we're allocating a buffer, and the one we're discarding is the
@@ -86,7 +84,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
* we need in the hopes it will coalesce into a contiguous chunk of the
* right size. In the latter case we branch back here and try again.
*/
-alloc: if ((ret = __db_shalloc(infop, len, 0, &p)) == 0) {
+alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) {
if (mfp != NULL)
c_mp->stat.st_pages++;
MPOOL_REGION_UNLOCK(dbenv, infop);
@@ -106,6 +104,7 @@ found: if (offsetp != NULL)
* We're not holding the region locked here, these statistics
* can't be trusted.
*/
+#ifdef HAVE_STATISTICS
total_buckets += buckets;
if (total_buckets != 0) {
if (total_buckets > c_mp->stat.st_alloc_max_buckets)
@@ -117,6 +116,7 @@ found: if (offsetp != NULL)
c_mp->stat.st_alloc_max_pages = buffers;
c_mp->stat.st_alloc_pages += buffers;
}
+#endif
return (0);
} else if (giveup || c_mp->stat.st_pages == 0) {
MPOOL_REGION_UNLOCK(dbenv, infop);
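
Note: this and the following hunks wrap statistics updates in STAT() or
#ifdef HAVE_STATISTICS so they compile away in stripped builds.  The STAT
macro's definition is not part of this diff; the shape assumed here is:

    #ifdef HAVE_STATISTICS
    #define STAT(x) x       /* statistics enabled: perform the update */
    #else
    #define STAT(x)         /* statistics compiled out: expands to nothing */
    #endif

so STAT(c_mp->stat.st_alloc++); above costs nothing when HAVE_STATISTICS
is not defined.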
@@ -153,24 +153,14 @@ found: if (offsetp != NULL)
}
/*
- * Skip empty buckets.
- *
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero.
- */
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
- continue;
-
- /*
* The failure mode is when there are too many buffers we can't
- * write or there's not enough memory in the system. We don't
- * have a way to know that allocation has no way to succeed.
- * We fail if there were no pages returned to the cache after
- * we've been trying for a relatively long time.
+ * write or there's not enough memory in the system to support
+ * the number of pinned buffers.
*
- * Get aggressive if we've tried to flush the number of hash
- * buckets as are in the system and have not found any more
- * space. Aggressive means:
+ * Get aggressive if we've reviewed the entire cache without
+ * freeing 3 times the needed space. (The code resets the
+ * counter when we free 3 times the needed space.) Aggressive
+ * means:
*
* a: set a flag to attempt to flush high priority buffers as
* well as other buffers.
@@ -187,11 +177,15 @@ found: if (offsetp != NULL)
* Always try to allocate memory too, in case some other thread
* returns its memory to the region.
*
+ * We have no way to know whether an allocation can ever
+ * succeed.  Fail if no pages are returned to the cache after
+ * we've been trying for a relatively long time.
+ *
* !!!
* This test ignores pathological cases like no buffers in the
- * system -- that shouldn't be possible.
+ * system -- we check for that early on, so it isn't possible.
*/
- if ((++buckets % c_mp->htab_buckets) == 0) {
+ if (buckets++ == c_mp->htab_buckets) {
if (freed_space > 0)
goto alloc;
MPOOL_REGION_UNLOCK(dbenv, infop);
@@ -207,7 +201,7 @@ found: if (offsetp != NULL)
case 5:
case 6:
(void)__memp_sync_int(
- dbenv, NULL, 0, DB_SYNC_ALLOC, NULL);
+ dbenv, NULL, 0, DB_SYNC_ALLOC, NULL, NULL);
__os_sleep(dbenv, 1, 0);
break;
@@ -222,11 +216,35 @@ found: if (offsetp != NULL)
goto alloc;
}
+ /*
+ * Skip empty buckets.
+ *
+ * We can check for empty buckets before locking as we
+ * only care if the pointer is zero or non-zero.
+ */
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ continue;
+ /*
+ * Skip buckets that only have pinned pages.
+ *
+ * Again we are doing this without locking. If we misread
+ * the number we might improperly skip a bucket but this is
+ * not fatal.
+ */
+ if (hp->hash_priority == UINT32_MAX)
+ continue;
+
if (!aggressive) {
- /* Skip high priority buckets. */
- if (hp->hash_priority > high_priority)
+ /* Adjust if the bucket has not been reset. */
+ priority = hp->hash_priority;
+ if (c_mp->lru_reset != 0 &&
+ c_mp->lru_reset <= hp - dbht)
+ priority -= MPOOL_BASE_DECREMENT;
+ /*
+ * Skip high priority buckets.
+ */
+ if (priority > high_priority)
continue;
-
/*
* Find two buckets and select the one with the lowest
* priority. Performance testing shows that looking
@@ -237,18 +255,22 @@ found: if (offsetp != NULL)
hp_tmp = hp;
continue;
}
- if (hp->hash_priority > hp_tmp->hash_priority)
+ if (c_mp->lru_reset &&
+ c_mp->lru_reset <= hp_tmp - dbht) {
+ if (priority > hp_tmp->hash_priority -
+ MPOOL_BASE_DECREMENT)
+ hp = hp_tmp;
+ } else if (priority > hp_tmp->hash_priority)
hp = hp_tmp;
hp_tmp = NULL;
}
- /* Remember the priority of the buffer we're looking for. */
- priority = hp->hash_priority;
-
/* Unlock the region and lock the hash bucket. */
MPOOL_REGION_UNLOCK(dbenv, infop);
- mutex = hp->mtx_hash;
- MUTEX_LOCK(dbenv, mutex);
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+
+ /* Remember the priority of the buffer we're looking for. */
+ priority = hp->hash_priority;
#ifdef DIAGNOSTIC
__memp_check_order(dbenv, hp);
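
Note: the lru_reset tests in the two hunks above compensate for an LRU
reset walk that may still be in flight (see the mp_fput.c changes below).
lru_reset is assumed to hold the index of the next hash bucket the walk
will visit, so buckets at or beyond that index still carry pre-reset,
inflated priorities.  Schematically, the effective priority read by the
allocator is:

    /* Effective priority of bucket hp during an in-flight LRU reset. */
    priority = hp->hash_priority;
    if (c_mp->lru_reset != 0 && c_mp->lru_reset <= hp - dbht)
            priority -= MPOOL_BASE_DECREMENT;       /* not yet reset */

and the same discount is applied to hp_tmp before the two candidate
buckets are compared.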
@@ -311,10 +333,15 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
++bhp->ref;
ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0);
--bhp->ref;
+#ifdef HAVE_STATISTICS
if (ret == 0)
++c_mp->stat.st_rw_evict;
- } else
+#endif
+ }
+#ifdef HAVE_STATISTICS
+ else
++c_mp->stat.st_ro_evict;
+#endif
/*
* Freeze this buffer, if necessary. That is, if the buffer
@@ -373,13 +400,13 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
++bhp->ref;
if ((ret = __memp_bh_thaw(dbmp, infop, hp,
bhp, NULL)) != 0) {
- MUTEX_UNLOCK(dbenv, mutex);
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (ret);
}
alloc_freeze = 0;
goto this_hb;
} else if (alloc_freeze) {
- if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0)
+ if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
return (ret);
MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize,
PROT_READ | PROT_WRITE | PROT_EXEC);
@@ -399,13 +426,13 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
continue;
} else if (mfp != NULL &&
mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) {
- if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0)
+ if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0)
return (ret);
p = bhp;
goto found;
} else {
- freed_space += __db_shalloc_sizeof(bhp);
- if ((ret = __memp_bhfree(dbmp,
+ freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize;
+ if ((ret = __memp_bhfree(dbmp, infop,
hp, bhp, BH_FREE_FREEMEM)) != 0)
return (ret);
}
@@ -419,7 +446,7 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL)
* hash bucket lock has already been discarded.
*/
if (0) {
-next_hb: MUTEX_UNLOCK(dbenv, mutex);
+next_hb: MUTEX_UNLOCK(dbenv, hp->mtx_hash);
}
MPOOL_REGION_LOCK(dbenv, infop);
@@ -449,7 +476,7 @@ __memp_free(infop, mfp, buf)
{
MVCC_BHUNALIGN(mfp, buf);
COMPQUIET(mfp, NULL);
- __db_shalloc_free(infop, buf);
+ __env_alloc_free(infop, buf);
}
/*
@@ -516,7 +543,9 @@ __memp_check_order(dbenv, hp)
DB_MPOOL_HASH *hp;
{
BH *bhp, *first_bhp, *tbhp;
- u_int32_t priority, last_priority;
+ u_int32_t dirty, priority, last_priority;
+
+ dirty = 0;
/*
* Assumes the hash bucket is locked.
@@ -526,6 +555,8 @@ __memp_check_order(dbenv, hp)
bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc));
+ if (F_ISSET(bhp, BH_DIRTY))
+ dirty++;
priority = BH_PRIORITY(bhp);
DB_ASSERT(dbenv, (bhp == first_bhp) ?
priority == last_priority : priority >= last_priority);
@@ -547,5 +578,6 @@ __memp_check_order(dbenv, hp)
DB_ASSERT(dbenv, bhp->pgno != tbhp->pgno ||
bhp->mf_offset != tbhp->mf_offset);
}
+ DB_ASSERT(dbenv, dirty == hp->hash_page_dirty);
}
#endif
diff --git a/db/mp/mp_bh.c b/db/mp/mp_bh.c
index ef4d1d4be..85cc30cc7 100644
--- a/db/mp/mp_bh.c
+++ b/db/mp/mp_bh.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_bh.c,v 12.31 2006/09/07 19:11:46 bostic Exp $
+ * $Id: mp_bh.c,v 12.38 2007/05/17 15:15:45 bostic Exp $
*/
#include "db_config.h"
@@ -256,9 +255,13 @@ __memp_pgread(dbmfp, hp, bhp, can_create)
if (len < pagesize)
memset(bhp->buf + len, CLEAR_BYTE, pagesize - len);
#endif
+#ifdef HAVE_STATISTICS
++mfp->stat.st_page_create;
} else
++mfp->stat.st_page_in;
+#else
+ }
+#endif
/* Call any pgin function. */
ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1);
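
Note: the #ifdef placement in this hunk is easy to misread because it
splits an if/else across preprocessor branches.  Expanded both ways it
leaves valid code (schematic only; created_the_page stands in for the
function's real condition and the bodies are elided):

    /* With HAVE_STATISTICS defined: */
    if (created_the_page) {
            /* ... clear-byte handling ... */
            ++mfp->stat.st_page_create;
    } else
            ++mfp->stat.st_page_in;

    /* Without HAVE_STATISTICS, only the closing brace survives: */
    if (created_the_page) {
            /* ... clear-byte handling ... */
    }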
@@ -304,18 +307,16 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp)
mfp = dbmfp == NULL ? NULL : dbmfp->mfp;
callpgin = ret = 0;
- /*
- * We should never be called with a clean or trash buffer.
- * The sync code does call us with already locked buffers.
- */
+ /* We should never be called with a clean or trash buffer. */
DB_ASSERT(dbenv, F_ISSET(bhp, BH_DIRTY));
DB_ASSERT(dbenv, !F_ISSET(bhp, BH_TRASH));
- /* If not already done, lock the buffer and unlock the hash bucket. */
- if (!F_ISSET(bhp, BH_LOCKED)) {
- F_SET(bhp, BH_LOCKED);
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
- }
+ /*
+ * The sync code has already locked the buffer, but the allocation
+ * code has not. Lock the buffer and release the hash bucket mutex.
+ */
+ F_SET(bhp, BH_LOCKED);
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
/*
* It's possible that the underlying file doesn't exist, either
@@ -333,7 +334,7 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp)
* If the page is in a file for which we have LSN information, we have
* to ensure the appropriate log records are on disk.
*/
- if (LOGGING_ON(dbenv) && mfp->lsn_off != -1 &&
+ if (LOGGING_ON(dbenv) && mfp->lsn_off != DB_LSN_OFF_NOTSET &&
!IS_CLIENT_PGRECOVER(dbenv)) {
memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN));
if (!IS_NOT_LOGGED_LSN(lsn) &&
@@ -402,7 +403,7 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp)
__memp_fn(dbmfp), (u_long)bhp->pgno);
goto err;
}
- ++mfp->stat.st_page_out;
+ STAT(++mfp->stat.st_page_out);
if (bhp->pgno > mfp->last_flushed_pgno) {
MUTEX_LOCK(dbenv, mfp->mutex);
if (bhp->pgno > mfp->last_flushed_pgno)
@@ -517,20 +518,20 @@ err: __db_errx(dbenv, "%s: %s failed for page %lu",
* Free a bucket header and its referenced data.
*
* PUBLIC: int __memp_bhfree
- * PUBLIC: __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, u_int32_t));
+ * PUBLIC: __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, u_int32_t));
*/
int
-__memp_bhfree(dbmp, hp, bhp, flags)
+__memp_bhfree(dbmp, infop, hp, bhp, flags)
DB_MPOOL *dbmp;
+ REGINFO *infop;
DB_MPOOL_HASH *hp;
BH *bhp;
u_int32_t flags;
{
DB_ENV *dbenv;
- MPOOL *c_mp, *mp;
+ MPOOL *c_mp;
MPOOLFILE *mfp;
BH *next_bhp, *prev_bhp;
- u_int32_t n_cache;
int reorder, ret, t_ret;
#ifdef DIAG_MVCC
size_t pagesize;
@@ -542,8 +543,6 @@ __memp_bhfree(dbmp, hp, bhp, flags)
* Assumes the hash bucket is locked and the MPOOL is not.
*/
dbenv = dbmp->dbenv;
- mp = dbmp->reginfo[0].primary;
- n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno);
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
#ifdef DIAG_MVCC
pagesize = mfp->stat.st_pagesize;
@@ -623,13 +622,13 @@ __memp_bhfree(dbmp, hp, bhp, flags)
* real.
*/
if (LF_ISSET(BH_FREE_FREEMEM)) {
- MPOOL_REGION_LOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MPOOL_REGION_LOCK(dbenv, infop);
- __memp_free(&dbmp->reginfo[n_cache], mfp, bhp);
- c_mp = dbmp->reginfo[n_cache].primary;
+ __memp_free(infop, mfp, bhp);
+ c_mp = infop->primary;
c_mp->stat.st_pages--;
- MPOOL_REGION_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
+ MPOOL_REGION_UNLOCK(dbenv, infop);
}
/*
diff --git a/db/mp/mp_fget.c b/db/mp/mp_fget.c
index 5f7eb6802..bb73a0a08 100644
--- a/db/mp/mp_fget.c
+++ b/db/mp/mp_fget.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fget.c,v 12.33 2006/09/13 14:53:42 mjc Exp $
+ * $Id: mp_fget.c,v 12.43 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -108,36 +107,34 @@ __memp_fget(dbmfp, pgnoaddr, txn, flags, addrp)
enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
BH *alloc_bhp, *bhp, *current_bhp, *frozen_bhp, *oldest_bhp;
DB_ENV *dbenv;
+ DB_LSN *read_lsnp;
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
- MPOOL *c_mp, *mp;
+ MPOOL *c_mp;
MPOOLFILE *mfp;
- REGINFO *infop;
+ REGINFO *infop, *t_infop;
TXN_DETAIL *td;
- DB_LSN *read_lsnp;
roff_t mf_offset;
- u_int32_t n_cache, st_hsearch;
+ u_int32_t st_hsearch;
int b_incr, b_locked, dirty, edit, extending, first;
int makecopy, mvcc, need_free, reorder, ret;
*(void **)addrp = NULL;
+ COMPQUIET(c_mp, NULL);
+ COMPQUIET(infop, NULL);
COMPQUIET(oldest_bhp, NULL);
dbenv = dbmfp->dbenv;
dbmp = dbenv->mp_handle;
- c_mp = NULL;
- mp = dbmp->reginfo[0].primary;
mfp = dbmfp->mfp;
mvcc = mfp->multiversion;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
alloc_bhp = bhp = frozen_bhp = NULL;
read_lsnp = NULL;
+ td = NULL;
hp = NULL;
b_incr = b_locked = extending = makecopy = ret = 0;
- n_cache = 0;
- infop = NULL;
- td = NULL;
if (LF_ISSET(DB_MPOOL_DIRTY)) {
if (F_ISSET(dbmfp, MP_READONLY)) {
@@ -224,25 +221,22 @@ __memp_fget(dbmfp, pgnoaddr, txn, flags, addrp)
F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
*(void **)addrp = (u_int8_t *)dbmfp->addr +
(*pgnoaddr * mfp->stat.st_pagesize);
- ++mfp->stat.st_map;
+ STAT(++mfp->stat.st_map);
return (0);
}
-hb_search:
- /*
+retry: /*
* Determine the cache and hash bucket where this page lives and get
* local pointers to them. Reset on each pass through this code, the
* page number can change.
*/
- n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
- infop = &dbmp->reginfo[n_cache];
+ MP_GET_BUCKET(dbmfp, *pgnoaddr, &infop, hp, ret);
+ if (ret != 0)
+ return (ret);
c_mp = infop->primary;
- hp = R_ADDR(infop, c_mp->htab);
- hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
/* Search the hash chain for the page. */
-retry: st_hsearch = 0;
- MUTEX_LOCK(dbenv, hp->mtx_hash);
+ st_hsearch = 0;
b_locked = 1;
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
++st_hsearch;
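
Note: MP_GET_BUCKET replaces the open-coded NCACHE()/NBUCKET() sequence
visible in the removed lines.  Its definition is in a header outside this
diff; the shape below is a reconstruction from the code it replaces, not
the real macro.  Since callers no longer lock the hash mutex themselves,
the macro is assumed to return with hp->mtx_hash held on success:

    /* Assumed shape of MP_GET_BUCKET (reconstruction). */
    #define MP_GET_BUCKET(dbmfp, pgno, infopp, hp, ret) do {            \
            MPOOL *__c_mp;                                              \
            roff_t __mf_off;                                            \
            MP_GET_REGION(dbmfp, pgno, infopp, ret);   /* pick region */\
            if ((ret) == 0) {                                           \
                    __c_mp = (*(infopp))->primary;                      \
                    __mf_off = R_OFFSET(                                \
                        (dbmfp)->dbenv->mp_handle->reginfo,             \
                        (dbmfp)->mfp);                                  \
                    (hp) = R_ADDR(*(infopp), __c_mp->htab);             \
                    (hp) = &(hp)[NBUCKET(__c_mp, __mf_off, pgno)];      \
                    MUTEX_LOCK((dbmfp)->dbenv, (hp)->mtx_hash);         \
            }                                                           \
    } while (0)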
@@ -326,7 +320,7 @@ retry: st_hsearch = 0;
F_SET(hp, IO_WAITER);
MUTEX_LOCK(dbenv, hp->mtx_io);
}
- ++hp->hash_io_wait;
+ STAT(++hp->hash_io_wait);
/* Release the hash bucket lock. */
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
@@ -362,10 +356,13 @@ thawed: need_free = (--frozen_bhp->ref == 0);
goto retry;
}
+#ifdef HAVE_STATISTICS
++mfp->stat.st_cache_hit;
+#endif
break;
}
+#ifdef HAVE_STATISTICS
/*
* Update the hash bucket search statistics -- do now because our next
* search may be for a different bucket.
@@ -374,6 +371,7 @@ thawed: need_free = (--frozen_bhp->ref == 0);
if (st_hsearch > c_mp->stat.st_hash_longest)
c_mp->stat.st_hash_longest = st_hsearch;
c_mp->stat.st_hash_examined += st_hsearch;
+#endif
/*
* There are 4 possible paths to this location:
@@ -411,6 +409,10 @@ thawed: need_free = (--frozen_bhp->ref == 0);
*/
if (flags == DB_MPOOL_FREE) {
if (--bhp->ref == 0) {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ --hp->hash_page_dirty;
+ F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE);
+ }
/*
* In a multiversion database, this page could
* be requested again so we have to leave it in
@@ -424,17 +426,12 @@ thawed: need_free = (--frozen_bhp->ref == 0);
if (mvcc && (!SH_CHAIN_SINGLETON(bhp, vc) ||
bhp->td_off == INVALID_ROFF ||
!IS_MAX_LSN(*VISIBLE_LSN(dbenv, bhp)))) {
- if (F_ISSET(bhp, BH_DIRTY)) {
- --hp->hash_page_dirty;
- F_CLR(bhp,
- BH_DIRTY | BH_DIRTY_CREATE);
- }
F_SET(bhp, BH_FREED);
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (0);
}
return (__memp_bhfree(
- dbmp, hp, bhp, BH_FREE_FREEMEM));
+ dbmp, infop, hp, bhp, BH_FREE_FREEMEM));
}
__db_errx(dbenv,
"File %s: freeing pinned buffer for page %lu",
@@ -447,12 +444,10 @@ thawed: need_free = (--frozen_bhp->ref == 0);
if (flags == DB_MPOOL_CREATE &&
F_ISSET(bhp, BH_FREED)) {
extending = makecopy = 1;
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
MUTEX_LOCK(dbenv, mfp->mutex);
if (*pgnoaddr > mfp->last_pgno)
mfp->last_pgno = *pgnoaddr;
MUTEX_UNLOCK(dbenv, mfp->mutex);
- MUTEX_LOCK(dbenv, hp->mtx_hash);
}
/*
@@ -478,8 +473,9 @@ thawed: need_free = (--frozen_bhp->ref == 0);
((ret = __txn_oldest_reader(dbenv,
&hp->old_reader)) == 0 &&
BH_OBSOLETE(oldest_bhp, hp->old_reader)))) {
- if ((ret = __memp_bhfree(dbmp, hp,
- oldest_bhp, BH_FREE_REUSE)) != 0)
+ if ((ret = __memp_bhfree(dbmp,
+ infop, hp, oldest_bhp,
+ BH_FREE_REUSE)) != 0)
goto err;
alloc_bhp = oldest_bhp;
} else if (ret != 0)
@@ -547,17 +543,17 @@ alloc: /*
/*
* !!!
- * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
+ * In the DB_MPOOL_NEW code path, infop and c_mp have
* not yet been initialized.
*/
- mf_offset = R_OFFSET(dbmp->reginfo, mfp);
- n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
- infop = &dbmp->reginfo[n_cache];
+ MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret);
+ if (ret != 0)
+ goto err;
c_mp = infop->primary;
/* Allocate a new buffer header and data space. */
if ((ret =
- __memp_alloc(dbmp,infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0)
goto err;
#ifdef DIAGNOSTIC
if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
@@ -601,7 +597,10 @@ alloc: /*
*/
if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
*pgnoaddr = mfp->last_pgno + 1;
- if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
+ MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret);
+ if (ret != 0)
+ goto err;
+ if (t_infop != infop) {
/*
* flags == DB_MPOOL_NEW, so extending is set
* and we're holding the mfp locked.
@@ -641,7 +640,7 @@ alloc: /*
b_locked = 1;
break;
}
- goto hb_search;
+ goto retry;
case SECOND_FOUND:
/*
* We allocated buffer space for the requested page, but then
@@ -764,10 +763,10 @@ alloc: /*
if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
F_SET(bhp, BH_CALLPGIN);
- ++mfp->stat.st_page_create;
+ STAT(++mfp->stat.st_page_create);
} else {
F_SET(bhp, BH_TRASH);
- ++mfp->stat.st_cache_miss;
+ STAT(++mfp->stat.st_cache_miss);
}
/* Increment buffer count referenced by MPOOLFILE. */
@@ -961,7 +960,8 @@ err: /*
if (frozen_bhp != NULL)
--frozen_bhp->ref;
if (b_incr && --bhp->ref == 0) {
- (void)__memp_bhfree(dbmp, hp, bhp, BH_FREE_FREEMEM);
+ (void)__memp_bhfree(dbmp,
+ infop, hp, bhp, BH_FREE_FREEMEM);
b_locked = 0;
}
}
diff --git a/db/mp/mp_fmethod.c b/db/mp/mp_fmethod.c
index 76d160ee5..38cd11d34 100644
--- a/db/mp/mp_fmethod.c
+++ b/db/mp/mp_fmethod.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fmethod.c,v 12.13 2006/08/24 14:46:14 bostic Exp $
+ * $Id: mp_fmethod.c,v 12.19 2007/06/01 16:30:30 bostic Exp $
*/
#include "db_config.h"
@@ -67,7 +66,7 @@ __memp_fcreate(dbenv, retp)
return (ret);
dbmfp->ref = 1;
- dbmfp->lsn_offset = -1;
+ dbmfp->lsn_offset = DB_LSN_OFF_NOTSET;
dbmfp->dbenv = dbenv;
dbmfp->mfp = INVALID_ROFF;
@@ -77,13 +76,13 @@ __memp_fcreate(dbenv, retp)
dbmfp->get_fileid = __memp_get_fileid;
dbmfp->get_flags = __memp_get_flags;
dbmfp->get_ftype = __memp_get_ftype;
+ dbmfp->get_last_pgno = __memp_get_last_pgno;
dbmfp->get_lsn_offset = __memp_get_lsn_offset;
dbmfp->get_maxsize = __memp_get_maxsize;
dbmfp->get_pgcookie = __memp_get_pgcookie;
dbmfp->get_priority = __memp_get_priority;
dbmfp->open = __memp_fopen_pp;
dbmfp->put = __memp_fput_pp;
- dbmfp->set = __memp_fset_pp;
dbmfp->set_clear_len = __memp_set_clear_len;
dbmfp->set_fileid = __memp_set_fileid;
dbmfp->set_flags = __memp_set_flags;
@@ -489,16 +488,17 @@ __memp_set_priority(dbmfp, priority)
}
/*
- * __memp_last_pgno --
+ * __memp_get_last_pgno --
* Return the page number of the last page in the file.
*
* !!!
- * Undocumented interface: DB private.
+ * The method is undocumented, but the handle is exported and users
+ * occasionally ask for it.
*
- * PUBLIC: int __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
+ * PUBLIC: int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *));
*/
int
-__memp_last_pgno(dbmfp, pgnoaddr)
+__memp_get_last_pgno(dbmfp, pgnoaddr)
DB_MPOOLFILE *dbmfp;
db_pgno_t *pgnoaddr;
{
@@ -540,8 +540,8 @@ __memp_fns(dbmp, mfp)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
{
- if (mfp->path_off == 0)
- return ((char *)"temporary");
+ if (mfp == NULL || mfp->path_off == 0)
+ return ((char *)"unknown");
return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off));
}
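
Note: with get_last_pgno now exported on the handle, a minimal caller
sketch (mpf is assumed to be an open DB_MPOOLFILE; error handling
abbreviated):

    db_pgno_t last_pgno;
    int ret;

    if ((ret = mpf->get_last_pgno(mpf, &last_pgno)) != 0)
            return (ret);
    /* last_pgno is now the page number of the last page in the file. */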
diff --git a/db/mp/mp_fopen.c b/db/mp/mp_fopen.c
index f13876e75..b41565304 100644
--- a/db/mp/mp_fopen.c
+++ b/db/mp/mp_fopen.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fopen.c,v 12.34 2006/09/09 13:55:52 bostic Exp $
+ * $Id: mp_fopen.c,v 12.44 2007/05/17 17:18:01 bostic Exp $
*/
#include "db_config.h"
@@ -15,8 +14,10 @@
#include "dbinc/db_page.h"
#include "dbinc/hash.h"
-static int __memp_mfp_alloc __P((DB_MPOOL *,
+static int __memp_mpf_alloc __P((DB_MPOOL *,
DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **));
+static int __memp_mpf_find __P((DB_ENV *,
+ DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **));
/*
* __memp_fopen_pp --
@@ -140,14 +141,51 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
bucket = 0;
hp = R_ADDR(dbmp->reginfo, mp->ftab);
- if (path == NULL && mfp == NULL)
- goto alloc;
+ if (mfp == NULL) {
+ if (path == NULL)
+ goto alloc;
- /*
- * Our caller may be able to tell us which underlying MPOOLFILE we
- * need a handle for.
- */
- if (mfp != NULL) {
+ /*
+ * Hash to the proper file table entry and walk it.
+ *
+ * The fileID is a filesystem unique number (e.g., a
+ * UNIX dev/inode pair) plus a timestamp. If files are
+ * removed and created in less than a second, the fileID
+ * can be repeated. The problem with repetition happens
+ * when the file that previously had the fileID value still
+ * has pages in the pool, since we don't want to use them
+ * to satisfy requests for the new file. Because the
+ * DB_TRUNCATE flag reuses the dev/inode pair, repeated
+ * opens with that flag set guarantee matching fileIDs
+ * when the machine can open a file and then re-open
+ * with truncate within a second. For this reason, we
+ * pass that flag down, and, if we find a matching entry,
+ * we ensure that it's never found again, and we create
+ * a new entry for the current request.
+ */
+
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE))
+ bucket = FNBUCKET(path, strlen(path));
+ else
+ bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
+ hp += bucket;
+
+ /*
+ * If we are passed a FILEID find the MPOOLFILE and inc
+ * its ref count. That way it cannot go away while we
+ * open it.
+ */
+ if (F_ISSET(dbmfp, MP_FILEID_SET)) {
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+ ret =
+ __memp_mpf_find(dbenv, dbmfp, hp, path, flags, &mfp);
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ if (ret != 0)
+ goto err;
+ if (mfp != NULL)
+ refinc = 1;
+ }
+ } else {
/*
* Deadfile can only be set if mpf_cnt goes to zero (or if we
* failed creating the file DB_AM_DISCARD). Increment the ref
@@ -213,7 +251,7 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
}
if ((ret = __db_appname(dbenv,
DB_APP_DATA, path, 0, NULL, &rpath)) == 0)
- ret = __os_open_extend(dbenv, rpath,
+ ret = __os_open(dbenv, rpath,
(u_int32_t)pagesize, oflags, mode, &dbmfp->fhp);
if (mfp != NULL)
MPOOL_SYSTEM_UNLOCK(dbenv);
@@ -289,83 +327,21 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize)
goto have_mfp;
/*
- * Hash to the proper file table entry and walk it.
- *
- * The fileID is a filesystem unique number (e.g., a UNIX dev/inode
- * pair) plus a timestamp. If files are removed and created in less
- * than a second, the fileID can be repeated. The problem with
- * repetition happens when the file that previously had the fileID
- * value still has pages in the pool, since we don't want to use them
- * to satisfy requests for the new file.
- *
- * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated
- * opens with that flag set guarantees matching fileIDs when the
- * machine can open a file and then re-open with truncate within a
- * second. For this reason, we pass that flag down, and, if we find
- * a matching entry, we ensure that it's never found again, and we
- * create a new entry for the current request.
- */
- if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
- DB_ASSERT(dbenv, path != NULL);
- bucket = FNBUCKET(path, strlen(path));
- } else
- bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN);
- hp += bucket;
-
- /*
* We can race with another process opening the same file when
* we allocate the mpoolfile structure. We will come back
* here and check the hash table again to see if it has appeared.
* For most files this is not a problem, since the name is locked
* at a higher layer but QUEUE extent files are not locked.
*/
-
check: MUTEX_LOCK(dbenv, hp->mtx_hash);
- SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
- /* Skip dead files and temporary files. */
- if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
- continue;
-
- /*
- * Any remaining DB_MPOOL_NOFILE databases are in-memory
- * named databases and need only match other in-memory
- * databases with the same name.
- */
- if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
- if (!mfp->no_backing_file)
- continue;
-
- DB_ASSERT(dbenv, path != NULL);
- if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
- continue;
-
- /*
- * We matched an in-memory file; grab the fileid if
- * it is set in the region, but not in the dbmfp.
- */
- if (!F_ISSET(dbmfp, MP_FILEID_SET))
- (void)__memp_set_fileid(dbmfp,
- R_ADDR(dbmp->reginfo, mfp->fileid_off));
- } else
- if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
- mfp->fileid_off), DB_FILE_ID_LEN) != 0)
- continue;
-
- /*
- * If the file is being truncated, remove it from the system
- * and create a new entry.
- *
- * !!!
- * We should be able to set mfp to NULL and break out of the
- * loop, but I like the idea of checking all the entries.
- */
- if (LF_ISSET(DB_TRUNCATE)) {
- MUTEX_LOCK(dbenv, mfp->mutex);
- mfp->deadfile = 1;
- MUTEX_UNLOCK(dbenv, mfp->mutex);
- continue;
- }
+ if ((ret = __memp_mpf_find(dbenv, dbmfp, hp, path, flags, &mfp)) != 0)
+ goto err;
+ if (alloc_mfp != NULL && mfp == NULL) {
+ mfp = alloc_mfp;
+ alloc_mfp = NULL;
+ SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
+ } else if (mfp != NULL) {
/*
* Some things about a file cannot be changed: the clear length,
* page size, or LSN location. However, if this is an attempt
@@ -385,7 +361,7 @@ check: MUTEX_LOCK(dbenv, hp->mtx_hash);
mfp->clear_len != DB_CLEARLEN_NOTSET &&
dbmfp->clear_len != mfp->clear_len) ||
(pagesize != 0 && pagesize != mfp->stat.st_pagesize) ||
- (dbmfp->lsn_offset != -1 &&
+ (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET &&
mfp->lsn_off != DB_LSN_OFF_NOTSET &&
dbmfp->lsn_offset != mfp->lsn_off)) {
__db_errx(dbenv,
@@ -395,42 +371,6 @@ check: MUTEX_LOCK(dbenv, hp->mtx_hash);
ret = EINVAL;
goto err;
}
-
- /*
- * Check to see if this file has died while we waited.
- *
- * We normally don't lock the deadfile field when we read it as
- * we only care if the field is zero or non-zero. We do lock
- * on read when searching for a matching MPOOLFILE so that two
- * threads of control don't race between setting the deadfile
- * bit and incrementing the reference count, that is, a thread
- * of control decrementing the reference count and then setting
- * deadfile because the reference count is 0 blocks us finding
- * the file without knowing it's about to be marked dead.
- */
- MUTEX_LOCK(dbenv, mfp->mutex);
- if (mfp->deadfile) {
- MUTEX_UNLOCK(dbenv, mfp->mutex);
- continue;
- }
- ++mfp->mpf_cnt;
- refinc = 1;
- MUTEX_UNLOCK(dbenv, mfp->mutex);
-
- /* Initialize any fields that are not yet set. */
- if (dbmfp->ftype != 0)
- mfp->ftype = dbmfp->ftype;
- if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
- mfp->clear_len = dbmfp->clear_len;
- if (dbmfp->lsn_offset != -1)
- mfp->lsn_off = dbmfp->lsn_offset;
-
- break;
- }
- if (alloc_mfp != NULL && mfp == NULL) {
- mfp = alloc_mfp;
- alloc_mfp = NULL;
- SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile);
}
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
@@ -462,7 +402,7 @@ alloc: /*
__os_fileid(dbenv, rpath, 0, dbmfp->fileid)) != 0)
goto err;
- if ((ret = __memp_mfp_alloc(dbmp,
+ if ((ret = __memp_mpf_alloc(dbmp,
dbmfp, path, pagesize, flags, &alloc_mfp)) != 0)
goto err;
@@ -625,8 +565,105 @@ err: if (refinc) {
return (ret);
}
+/*
+ * __memp_mpf_find --
+ * Search a hash bucket for a MPOOLFILE.
+ */
+static int
+__memp_mpf_find(dbenv, dbmfp, hp, path, flags, mfpp)
+ DB_ENV *dbenv;
+ DB_MPOOLFILE *dbmfp;
+ DB_MPOOL_HASH *hp;
+ const char *path;
+ u_int32_t flags;
+ MPOOLFILE **mfpp;
+{
+ DB_MPOOL *dbmp;
+ MPOOLFILE *mfp;
+
+ dbmp = dbenv->mp_handle;
+
+ SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
+ /* Skip dead files and temporary files. */
+ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
+ continue;
+
+ /*
+ * Any remaining DB_MPOOL_NOFILE databases are in-memory
+ * named databases and need only match other in-memory
+ * databases with the same name.
+ */
+ if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) {
+ if (!mfp->no_backing_file)
+ continue;
+
+ if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off)))
+ continue;
+
+ /*
+ * We matched an in-memory file; grab the fileid if
+ * it is set in the region, but not in the dbmfp.
+ */
+ if (!F_ISSET(dbmfp, MP_FILEID_SET))
+ (void)__memp_set_fileid(dbmfp,
+ R_ADDR(dbmp->reginfo, mfp->fileid_off));
+ } else
+ if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo,
+ mfp->fileid_off), DB_FILE_ID_LEN) != 0)
+ continue;
+
+ /*
+ * If the file is being truncated, remove it from the system
+ * and create a new entry.
+ *
+ * !!!
+ * We should be able to set mfp to NULL and break out of the
+ * loop, but I like the idea of checking all the entries.
+ */
+ if (LF_ISSET(DB_TRUNCATE)) {
+ MUTEX_LOCK(dbenv, mfp->mutex);
+ mfp->deadfile = 1;
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+ continue;
+ }
+
+ /*
+ * Check to see if this file has died while we waited.
+ *
+ * We normally don't lock the deadfile field when we read it as
+ * we only care if the field is zero or non-zero. We do lock
+ * on read when searching for a matching MPOOLFILE so that two
+ * threads of control don't race between setting the deadfile
+ * bit and incrementing the reference count, that is, a thread
+ * of control decrementing the reference count and then setting
+ * deadfile because the reference count is 0 blocks us finding
+ * the file without knowing it's about to be marked dead.
+ */
+ MUTEX_LOCK(dbenv, mfp->mutex);
+ if (mfp->deadfile) {
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+ continue;
+ }
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+
+ /* Initialize any fields that are not yet set. */
+ if (dbmfp->ftype != 0)
+ mfp->ftype = dbmfp->ftype;
+ if (dbmfp->clear_len != DB_CLEARLEN_NOTSET)
+ mfp->clear_len = dbmfp->clear_len;
+ if (dbmfp->lsn_offset != -1)
+ mfp->lsn_off = dbmfp->lsn_offset;
+
+ break;
+ }
+
+ *mfpp = mfp;
+ return (0);
+}
+
static int
-__memp_mfp_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
+__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp)
DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
const char *path;
@@ -742,14 +779,12 @@ __memp_fclose_pp(dbmfp, flags)
/*
* Validate arguments, but as a handle destructor, we can't fail.
- *
- * !!!
- * DB_MPOOL_DISCARD: Undocumented flag: DB private.
*/
- (void)__db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD);
+ if (flags != 0)
+ (void)__db_ferr(dbenv, "DB_MPOOLFILE->close", 0);
ENV_ENTER(dbenv, ip);
- REPLICATION_WRAP(dbenv, (__memp_fclose(dbmfp, flags)), ret);
+ REPLICATION_WRAP(dbenv, (__memp_fclose(dbmfp, 0)), ret);
ENV_LEAVE(dbenv, ip);
return (ret);
}
@@ -906,7 +941,9 @@ __memp_mf_discard(dbmp, mfp)
{
DB_ENV *dbenv;
DB_MPOOL_HASH *hp;
+#ifdef HAVE_STATISTICS
DB_MPOOL_STAT *sp;
+#endif
MPOOL *mp;
int need_sync, ret, t_ret;
@@ -948,9 +985,10 @@ __memp_mf_discard(dbmp, mfp)
/* Lock the region and collect stats and free the space. */
MPOOL_SYSTEM_LOCK(dbenv);
if (need_sync &&
- (t_ret = __memp_mf_sync(dbmp, mfp, 1)) != 0 && ret == 0)
+ (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0)
ret = t_ret;
+#ifdef HAVE_STATISTICS
/* Copy the statistics into the region. */
sp = &mp->stat;
sp->st_cache_hit += mfp->stat.st_cache_hit;
@@ -959,6 +997,7 @@ __memp_mf_discard(dbmp, mfp)
sp->st_page_create += mfp->stat.st_page_create;
sp->st_page_in += mfp->stat.st_page_in;
sp->st_page_out += mfp->stat.st_page_out;
+#endif
/* Free the space. */
if (mfp->path_off != 0)
diff --git a/db/mp/mp_fput.c b/db/mp/mp_fput.c
index 124d2e1da..53afe8a82 100644
--- a/db/mp/mp_fput.c
+++ b/db/mp/mp_fput.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fput.c,v 12.22 2006/09/07 20:05:33 bostic Exp $
+ * $Id: mp_fput.c,v 12.36 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -19,12 +18,14 @@ static int __memp_reset_lru __P((DB_ENV *, REGINFO *));
* __memp_fput_pp --
* DB_MPOOLFILE->put pre/post processing.
*
- * PUBLIC: int __memp_fput_pp __P((DB_MPOOLFILE *, void *, u_int32_t));
+ * PUBLIC: int __memp_fput_pp
+ * PUBLIC: __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t));
*/
int
-__memp_fput_pp(dbmfp, pgaddr, flags)
+__memp_fput_pp(dbmfp, pgaddr, priority, flags)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
+ DB_CACHE_PRIORITY priority;
u_int32_t flags;
{
DB_ENV *dbenv;
@@ -33,10 +34,14 @@ __memp_fput_pp(dbmfp, pgaddr, flags)
dbenv = dbmfp->dbenv;
PANIC_CHECK(dbenv);
+ if (flags != 0)
+ return (__db_ferr(dbenv, "DB_MPOOLFILE->put", 0));
+
+ MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");
ENV_ENTER(dbenv, ip);
- ret = __memp_fput(dbmfp, pgaddr, flags);
+ ret = __memp_fput(dbmfp, pgaddr, priority);
if (IS_ENV_REPLICATED(dbenv) &&
(t_ret = __op_rep_exit(dbenv)) != 0 && ret == 0)
ret = t_ret;
@@ -49,47 +54,30 @@ __memp_fput_pp(dbmfp, pgaddr, flags)
* __memp_fput --
* DB_MPOOLFILE->put.
*
- * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t));
+ * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY));
*/
int
-__memp_fput(dbmfp, pgaddr, flags)
+__memp_fput(dbmfp, pgaddr, priority)
DB_MPOOLFILE *dbmfp;
void *pgaddr;
- u_int32_t flags;
+ DB_CACHE_PRIORITY priority;
{
+ BH *bhp;
DB_ENV *dbenv;
DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
MPOOLFILE *mfp;
- BH *bhp;
- u_int32_t n_cache;
- int adjust, ret, t_ret;
+ REGINFO *infop;
+ int adjust, pfactor, ret, t_ret;
dbenv = dbmfp->dbenv;
- MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put");
dbmp = dbenv->mp_handle;
mfp = dbmfp->mfp;
bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
ret = 0;
/*
- * Check arguments, but don't fail because we want to unpin the page
- * regardless. The problem is when running with replication. There
- * is a reference count we incremented when __memp_fget was called,
- * and we need to unpin the page and decrement that reference count.
- * If we see flag problems, mark the page dirty.
- */
- if (flags) {
- if (__db_fchk(dbenv, "memp_fput", flags,
- DB_MPOOL_DISCARD) != 0) {
- flags = 0;
- ret = EINVAL;
- DB_ASSERT(dbenv, 0);
- }
- }
-
- /*
* If we're mapping the file, there's nothing to do. Because we can
* stop mapping the file at any time, we have to check on each buffer
* to see if the address we gave the application was part of the map
@@ -116,15 +104,10 @@ __memp_fput(dbmfp, pgaddr, flags)
#endif
/* Convert a page address to a buffer header and hash bucket. */
- n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
- c_mp = dbmp->reginfo[n_cache].primary;
- hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
- hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
-
- MUTEX_LOCK(dbenv, hp->mtx_hash);
-
- if (LF_ISSET(DB_MPOOL_DISCARD))
- F_SET(bhp, BH_DISCARD);
+ MP_GET_BUCKET(dbmfp, bhp->pgno, &infop, hp, ret);
+ if (ret != 0)
+ return (ret);
+ c_mp = infop->primary;
/*
* Check for a reference count going to zero. This can happen if the
@@ -163,7 +146,8 @@ __memp_fput(dbmfp, pgaddr, flags)
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0);
/* Update priority values. */
- if (F_ISSET(bhp, BH_DISCARD) || mfp->priority == MPOOL_PRI_VERY_LOW)
+ if (priority == DB_PRIORITY_VERY_LOW ||
+ mfp->priority == MPOOL_PRI_VERY_LOW)
bhp->priority = 0;
else {
/*
@@ -173,9 +157,31 @@ __memp_fput(dbmfp, pgaddr, flags)
*/
bhp->priority = c_mp->lru_count;
+ switch (priority) {
+ default:
+ case DB_PRIORITY_UNCHANGED:
+ pfactor = mfp->priority;
+ break;
+ case DB_PRIORITY_VERY_LOW:
+ pfactor = MPOOL_PRI_VERY_LOW;
+ break;
+ case DB_PRIORITY_LOW:
+ pfactor = MPOOL_PRI_LOW;
+ break;
+ case DB_PRIORITY_DEFAULT:
+ pfactor = MPOOL_PRI_DEFAULT;
+ break;
+ case DB_PRIORITY_HIGH:
+ pfactor = MPOOL_PRI_HIGH;
+ break;
+ case DB_PRIORITY_VERY_HIGH:
+ pfactor = MPOOL_PRI_VERY_HIGH;
+ break;
+ }
+
adjust = 0;
- if (mfp->priority != 0)
- adjust = (int)c_mp->stat.st_pages / mfp->priority;
+ if (pfactor != 0)
+ adjust = (int)c_mp->stat.st_pages / pfactor;
if (F_ISSET(bhp, BH_DIRTY))
adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY;
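
Note: a worked example of the new priority computation, with made-up
numbers: st_pages = 1000, lru_count = 50000, the page returned with
DB_PRIORITY_LOW, and the buffer clean.  Assuming MPOOL_PRI_LOW is a
negative divisor (low-priority files age faster in this scheme):

    /* adjust = (int)st_pages / pfactor = 1000 / -2 = -500        */
    /* bhp->priority = lru_count + adjust = 50000 - 500 = 49500   */

i.e. the buffer is treated as if it were last used 500 LRU ticks ago and
becomes an earlier eviction candidate.  DB_PRIORITY_UNCHANGED falls back
to the per-file priority, preserving the old behavior for callers that
don't care.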
@@ -234,10 +240,9 @@ __memp_reset_lru(dbenv, infop)
BH *bhp, *tbhp;
DB_MPOOL_HASH *hp;
MPOOL *c_mp;
- u_int32_t bucket;
+ u_int32_t bucket, priority;
c_mp = infop->primary;
-
/*
* Update the counter so all future allocations will start at the
* bottom.
@@ -253,19 +258,42 @@ __memp_reset_lru(dbenv, infop)
* We can check for empty buckets before locking as we
* only care if the pointer is zero or non-zero.
*/
- if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+ if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) {
+ c_mp->lru_reset++;
continue;
+ }
MUTEX_LOCK(dbenv, hp->mtx_hash);
- SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh)
+ c_mp->lru_reset++;
+ /*
+ * We need to take a little care that the bucket does
+ * not become unsorted. This is highly unlikely but
+ * possible.
+ */
+ priority = 0;
+ SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
for (tbhp = bhp; tbhp != NULL;
tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) {
if (tbhp->priority != UINT32_MAX &&
- tbhp->priority > MPOOL_BASE_DECREMENT)
+ tbhp->priority > MPOOL_BASE_DECREMENT) {
tbhp->priority -= MPOOL_BASE_DECREMENT;
+ if (tbhp->priority < priority)
+ tbhp->priority = priority;
+ }
}
+ priority = bhp->priority;
+ }
+ /*
+ * Reset the hash bucket's priority. The chain is never empty
+ * in this case, so tbhp will never be NULL.
+ */
+ if ((tbhp =
+ SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ hp->hash_priority = tbhp->priority;
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
}
+ c_mp->lru_reset = 0;
+ COMPQUIET(dbenv, NULL);
return (0);
}
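
Note: the put() signature change is caller-visible: the DB_MPOOL_DISCARD
flag is gone and a DB_CACHE_PRIORITY argument takes its place (passing
DB_PRIORITY_VERY_LOW now has the effect DB_MPOOL_DISCARD used to, per the
priority code above).  A sketch of an updated caller, assuming pgaddr came
from a successful get():

    /* Old style:  mpf->put(mpf, pgaddr, DB_MPOOL_DISCARD);  */
    /* New style:  pass a priority; flags must now be 0.     */
    if ((ret = mpf->put(mpf, pgaddr, DB_PRIORITY_VERY_LOW, 0)) != 0)
            goto err;

    /* Most callers keep the priority unchanged: */
    if ((ret = mpf->put(mpf, pgaddr, DB_PRIORITY_UNCHANGED, 0)) != 0)
            goto err;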
diff --git a/db/mp/mp_fset.c b/db/mp/mp_fset.c
index e3fd2f4df..46950f4e1 100644
--- a/db/mp/mp_fset.c
+++ b/db/mp/mp_fset.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_fset.c,v 12.16 2006/09/13 14:53:42 mjc Exp $
+ * $Id: mp_fset.c,v 12.23 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -15,108 +14,33 @@
#include "dbinc/txn.h"
/*
- * __memp_fset_pp --
- * DB_MPOOLFILE->set pre/post processing.
- *
- * PUBLIC: int __memp_fset_pp __P((DB_MPOOLFILE *, void *, u_int32_t));
- */
-int
-__memp_fset_pp(dbmfp, pgaddr, flags)
- DB_MPOOLFILE *dbmfp;
- void *pgaddr;
- u_int32_t flags;
-{
- DB_ENV *dbenv;
- DB_THREAD_INFO *ip;
- int ret;
-
- dbenv = dbmfp->dbenv;
-
- PANIC_CHECK(dbenv);
- MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->set");
-
- /* Validate arguments. */
- if (flags == 0)
- return (__db_ferr(dbenv, "memp_fset", 1));
-
- if ((ret = __db_fchk(dbenv, "memp_fset", flags, DB_MPOOL_DISCARD)) != 0)
- return (ret);
-
- ENV_ENTER(dbenv, ip);
- REPLICATION_WRAP(dbenv, (__memp_fset(dbmfp, pgaddr, flags)), ret);
- ENV_LEAVE(dbenv, ip);
- return (ret);
-}
-
-/*
- * __memp_fset --
- * DB_MPOOLFILE->set.
- *
- * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t));
- */
-int
-__memp_fset(dbmfp, pgaddr, flags)
- DB_MPOOLFILE *dbmfp;
- void *pgaddr;
- u_int32_t flags;
-{
- BH *bhp;
- DB_ENV *dbenv;
- DB_MPOOL *dbmp;
- DB_MPOOL_HASH *hp;
- MPOOL *c_mp;
- u_int32_t n_cache;
-
- dbenv = dbmfp->dbenv;
- dbmp = dbenv->mp_handle;
-
- DB_ASSERT(dbenv, !LF_ISSET(DB_MPOOL_DIRTY));
-
- /* Convert the page address to a buffer header and hash bucket. */
- bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf));
- n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno);
- c_mp = dbmp->reginfo[n_cache].primary;
- hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
- hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
-
- MUTEX_LOCK(dbenv, hp->mtx_hash);
-
- if (LF_ISSET(DB_MPOOL_DISCARD))
- F_SET(bhp, BH_DISCARD);
-
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
- return (0);
-}
-
-/*
* __memp_dirty --
* Upgrade a page from a read-only to a writeable pointer.
*
- * PUBLIC: int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_TXN *, u_int32_t));
+ * PUBLIC: int __memp_dirty __P((
+ * PUBLIC: DB_MPOOLFILE *, void *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t));
*/
int
-__memp_dirty(dbmfp, addrp, txn, flags)
+__memp_dirty(dbmfp, addrp, txn, priority, flags)
DB_MPOOLFILE *dbmfp;
void *addrp;
DB_TXN *txn;
+ DB_CACHE_PRIORITY priority;
u_int32_t flags;
{
BH *bhp;
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
DB_TXN *ancestor;
#ifdef DIAG_MVCC
MPOOLFILE *mfp;
#endif
- MPOOL *c_mp;
- u_int32_t n_cache;
+ REGINFO *infop;
int ret;
db_pgno_t pgno;
void *pgaddr;
dbenv = dbmfp->dbenv;
- dbmp = dbenv->mp_handle;
pgaddr = *(void **)addrp;
/* Convert the page address to a buffer header. */
@@ -154,11 +78,11 @@ __memp_dirty(dbmfp, addrp, txn, flags)
(flags == DB_MPOOL_EDIT && *(void **)addrp == pgaddr) ||
(flags != DB_MPOOL_EDIT && *(void **)addrp != pgaddr));
- if ((ret = __memp_fput(dbmfp, pgaddr, 0)) != 0) {
+ if ((ret = __memp_fput(dbmfp, pgaddr, priority)) != 0) {
__db_errx(dbenv,
"%s: error releasing a read-only page",
__memp_fn(dbmfp));
- (void)__memp_fput(dbmfp, *(void **)addrp, 0);
+ (void)__memp_fput(dbmfp, *(void **)addrp, priority);
*(void **)addrp = NULL;
return (ret);
}
@@ -168,13 +92,10 @@ __memp_dirty(dbmfp, addrp, txn, flags)
return (0);
}
- n_cache = NCACHE(dbmp->reginfo[0].primary,
- bhp->mf_offset, bhp->pgno);
- c_mp = dbmp->reginfo[n_cache].primary;
- hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
- hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)];
+ MP_GET_BUCKET(dbmfp, pgno, &infop, hp, ret);
+ if (ret != 0)
+ return (ret);
- MUTEX_LOCK(dbenv, hp->mtx_hash);
/* Set/clear the page bits. */
if (!F_ISSET(bhp, BH_DIRTY)) {
++hp->hash_page_dirty;
@@ -183,7 +104,7 @@ __memp_dirty(dbmfp, addrp, txn, flags)
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
#ifdef DIAG_MVCC
- mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ mfp = R_ADDR(dbenv->mp_handle->reginfo, bhp->mf_offset);
MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | PROT_WRITE);
#endif
return (0);
diff --git a/db/mp/mp_method.c b/db/mp/mp_method.c
index 14c144974..e9096827c 100644
--- a/db/mp/mp_method.c
+++ b/db/mp/mp_method.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_method.c,v 12.36 2006/09/15 18:54:13 margo Exp $
+ * $Id: mp_method.c,v 12.50 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -15,13 +14,13 @@
#include "dbinc/hash.h"
/*
- * __memp_dbenv_create --
+ * __memp_env_create --
* Mpool specific creation of the DB_ENV structure.
*
- * PUBLIC: int __memp_dbenv_create __P((DB_ENV *));
+ * PUBLIC: int __memp_env_create __P((DB_ENV *));
*/
int
-__memp_dbenv_create(dbenv)
+__memp_env_create(dbenv)
DB_ENV *dbenv;
{
/*
@@ -37,7 +36,7 @@ __memp_dbenv_create(dbenv)
* Solaris needs 24 and 52 bytes for the same structures. The minimum
* number of hash buckets is 37. These contain a mutex also.
*/
- dbenv->mp_bytes =
+ dbenv->mp_bytes = dbenv->mp_max_bytes =
32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH);
dbenv->mp_ncache = 1;
@@ -45,13 +44,13 @@ __memp_dbenv_create(dbenv)
}
/*
- * __memp_dbenv_destroy --
+ * __memp_env_destroy --
* Mpool specific destruction of the DB_ENV structure.
*
- * PUBLIC: void __memp_dbenv_destroy __P((DB_ENV *));
+ * PUBLIC: void __memp_env_destroy __P((DB_ENV *));
*/
void
-__memp_dbenv_destroy(dbenv)
+__memp_env_destroy(dbenv)
DB_ENV *dbenv;
{
COMPQUIET(dbenv, NULL);
@@ -109,8 +108,6 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
{
u_int ncache;
- ENV_ILLEGAL_AFTER_OPEN(dbenv, "DB_ENV->set_cachesize");
-
/* Normalize the cache count. */
ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache;
@@ -133,18 +130,18 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
* wrapping in the calculation of the number of hash buckets. See
* __memp_open for details.
*/
- if (sizeof(roff_t) <= 4) {
- if (gbytes / ncache >= 4) {
+ if (!F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+ if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) {
__db_errx(dbenv,
"individual cache size too large: maximum is 4GB");
return (EINVAL);
}
- } else
if (gbytes / ncache > 10000) {
__db_errx(dbenv,
"individual cache size too large: maximum is 10TB");
return (EINVAL);
}
+ }
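
Note: the reworked guard only applies before the environment is opened;
afterwards the request is routed to __memp_resize() (next hunk).  A worked
check of the 32-bit limit, with hypothetical numbers: if sizeof(roff_t)
== 4, gbytes = 8 and ncache = 1, then 8 / 1 >= 4 and the call fails with
EINVAL, because a single cache region of 4GB or more cannot be addressed
through a 32-bit region offset; with ncache = 4 each region is 2GB and
the same request is accepted.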
/*
* If the application requested less than 500Mb, increase the cachesize
@@ -164,6 +161,9 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
bytes = ncache * DB_CACHESIZE_MIN;
}
+ if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED))
+ return (__memp_resize(dbenv->mp_handle, gbytes, bytes));
+
dbenv->mp_gbytes = gbytes;
dbenv->mp_bytes = bytes;
dbenv->mp_ncache = ncache;
@@ -172,6 +172,76 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache)
}
/*
+ * __memp_set_config --
+ * Set the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int));
+ */
+int
+__memp_set_config(dbenv, which, on)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int on;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+
+ ENV_NOT_CONFIGURED(dbenv,
+ dbenv->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(dbenv)) {
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ if (on)
+ FLD_SET(mp->config_flags, which);
+ else
+ FLD_CLR(mp->config_flags, which);
+ }
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
+ * __memp_get_config --
+ * Return the cache subsystem configuration.
+ *
+ * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *));
+ */
+int
+__memp_get_config(dbenv, which, onp)
+ DB_ENV *dbenv;
+ u_int32_t which;
+ int *onp;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL);
+
+ switch (which) {
+ case DB_MEMP_SUPPRESS_WRITE:
+ case DB_MEMP_SYNC_INTERRUPT:
+ if (MPOOL_ON(dbenv)) {
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ *onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0;
+ } else
+ *onp = 0;
+ break;
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+/*
* PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *));
*/
int
@@ -224,12 +294,13 @@ __memp_set_mp_max_openfd(dbenv, maxopenfd)
}
/*
- * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, int *));
+ * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *));
*/
int
__memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
DB_ENV *dbenv;
- int *maxwritep, *maxwrite_sleepp;
+ int *maxwritep;
+ db_timeout_t *maxwrite_sleepp;
{
DB_MPOOL *dbmp;
MPOOL *mp;
@@ -255,12 +326,13 @@ __memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp)
* __memp_set_mp_max_write --
* Set the maximum continuous I/O count.
*
- * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, int));
+ * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t));
*/
int
__memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep)
DB_ENV *dbenv;
- int maxwrite, maxwrite_sleep;
+ int maxwrite;
+ db_timeout_t maxwrite_sleep;
{
DB_MPOOL *dbmp;
MPOOL *mp;
@@ -366,9 +438,13 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
#define op_is_remove (newname == NULL)
COMPQUIET(bucket, 0);
+ COMPQUIET(hp, NULL);
+ COMPQUIET(newname_off, 0);
+ COMPQUIET(nlen, 0);
dbmp = NULL;
mfp = NULL;
+ nhp = NULL;
p = NULL;
locked = ret = 0;
@@ -378,63 +454,61 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
hp = R_ADDR(dbmp->reginfo, mp->ftab);
- nhp = NULL;
- /*
- * Remove or rename a file that the mpool might know about. We assume
- * that the fop layer has the file locked for exclusive access, so we
- * don't worry about locking except for the mpool mutexes. Checkpoint
- * can happen at any time, independent of file locking, so we have to
- * do the actual unlink or rename system call to avoid any race.
- *
- * If this is a rename, allocate first, because we can't recursively
- * grab the region lock. If this is a memory file
- * then on a rename, we need to make sure that the new name does
- * not exist.
- */
- hp = R_ADDR(dbmp->reginfo, mp->ftab);
- if (op_is_remove) {
- COMPQUIET(newname_off, INVALID_ROFF);
- } else {
+ if (!op_is_remove) {
nlen = strlen(newname);
if ((ret = __memp_alloc(dbmp, dbmp->reginfo,
NULL, nlen + 1, &newname_off, &p)) != 0)
return (ret);
memcpy(p, newname, nlen + 1);
- MPOOL_SYSTEM_LOCK(dbenv);
- locked = 1;
- if (inmem) {
- bucket = FNBUCKET(newname, nlen);
- nhp = hp + bucket;
- MUTEX_LOCK(dbenv, nhp->mtx_hash);
- SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
- if (!mfp->deadfile &&
- mfp->no_backing_file && strcmp(newname,
- R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
- break;
- MUTEX_UNLOCK(dbenv, nhp->mtx_hash);
- if (mfp != NULL) {
- ret = EEXIST;
- goto err;
- }
- }
}
- if (locked == 0)
- MPOOL_SYSTEM_LOCK(dbenv);
- locked = 1;
-
+ /*
+ * Remove or rename a file that the mpool might know about. We assume
+ * that the fop layer has the file locked for exclusive access, so we
+ * don't worry about locking except for the mpool mutexes. Checkpoint
+ * can happen at any time, independent of file locking, so we have to
+ * do the actual unlink or rename system call while holding
+ * all affected buckets locked.
+ *
+ * If this is a rename and this is a memory file then we need
+ * to make sure that the new name does not exist. Since we
+ * are locking two buckets lock them in ascending order.
+ */
if (inmem) {
DB_ASSERT(dbenv, fullold != NULL);
hp += FNBUCKET(fullold, strlen(fullold));
+ if (!op_is_remove) {
+ bucket = FNBUCKET(newname, nlen);
+ nhp = R_ADDR(dbmp->reginfo, mp->ftab);
+ nhp += bucket;
+ }
} else
hp += FNBUCKET(fileid, DB_FILE_ID_LEN);
+ if (nhp != NULL && nhp < hp)
+ MUTEX_LOCK(dbenv, nhp->mtx_hash);
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+ if (nhp != NULL && nhp > hp)
+ MUTEX_LOCK(dbenv, nhp->mtx_hash);
+ locked = 1;
+
+ if (!op_is_remove && inmem) {
+ SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile)
+ if (!mfp->deadfile &&
+ mfp->no_backing_file && strcmp(newname,
+ R_ADDR(dbmp->reginfo, mfp->path_off)) == 0)
+ break;
+ if (mfp != NULL) {
+ ret = EEXIST;
+ goto err;
+ }
+ }
+
/*
* Find the file -- if mpool doesn't know about this file, that may
- * not be an error -- if the file is not a memory-only file and it
+ * not be an error.
*/
- MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) {
/* Ignore non-active files. */
if (mfp->deadfile || F_ISSET(mfp, MP_TEMP))
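
Note: the ascending-address lock acquisition above is standard deadlock
avoidance: two threads renaming between the same pair of buckets in
opposite directions would otherwise each hold one bucket mutex while
waiting for the other.  Isolated as a pattern (unlock order need not
match):

    /* Acquire two hash-bucket mutexes in address order. */
    if (nhp != NULL && nhp < hp)
            MUTEX_LOCK(dbenv, nhp->mtx_hash);
    MUTEX_LOCK(dbenv, hp->mtx_hash);
    if (nhp != NULL && nhp > hp)
            MUTEX_LOCK(dbenv, nhp->mtx_hash);

    /* ... rename/remove work ... */

    MUTEX_UNLOCK(dbenv, hp->mtx_hash);
    if (nhp != NULL && nhp != hp)
            MUTEX_UNLOCK(dbenv, nhp->mtx_hash);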
@@ -447,17 +521,21 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
break;
}
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
- if (mfp == NULL)
+
+ if (mfp == NULL) {
+ if (inmem) {
+ ret = ENOENT;
+ goto err;
+ }
goto fsop;
+ }
if (op_is_remove) {
MUTEX_LOCK(dbenv, mfp->mutex);
/*
- * In-memory dbs have an artificially incremented
- * ref count so that they do not ever get reclaimed
- * as long as they exist. Since we are now deleting
- * the database, we need to dec that count.
+ * In-memory dbs have an artificially incremented ref count so
+ * they do not get reclaimed as long as they exist. Since we
+ * are now deleting the database, we need to dec that count.
*/
if (mfp->no_backing_file)
mfp->mpf_cnt--;
@@ -465,31 +543,22 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem)
MUTEX_UNLOCK(dbenv, mfp->mutex);
} else {
/*
- * Else, it's a rename. We've allocated memory
- * for the new name. Swap it with the old one.
+ * Else, it's a rename. We've allocated memory for the new
+ * name. Swap it with the old one. If it's in memory we
+ * need to move it to the right bucket.
*/
p = R_ADDR(dbmp->reginfo, mfp->path_off);
mfp->path_off = newname_off;
- /* If its in memory we need to move it the right bucket. */
- if (inmem) {
+ if (inmem && hp != nhp) {
DB_ASSERT(dbenv, nhp != NULL);
- MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile);
- MUTEX_UNLOCK(dbenv, hp->mtx_hash);
mfp->bucket = bucket;
- MUTEX_LOCK(dbenv, nhp->mtx_hash);
SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q);
- MUTEX_UNLOCK(dbenv, nhp->mtx_hash);
}
}
-fsop: if (mfp == NULL && inmem) {
- ret = ENOENT;
- goto err;
- }
-
- /*
+fsop: /*
* If this is a real file, then mfp could be NULL, because
* mpool isn't turned on, and we still need to do the file ops.
*/
@@ -504,12 +573,14 @@ fsop: if (mfp == NULL && inmem) {
ret = 0;
} else {
/*
- * Defensive only, fullname should never be
+ * Defensive only, fullnew should never be
* NULL.
*/
DB_ASSERT(dbenv, fullnew != NULL);
- if (fullnew == NULL)
- return (EINVAL);
+ if (fullnew == NULL) {
+ ret = EINVAL;
+ goto err;
+ }
ret = __os_rename(dbenv, fullold, fullnew, 1);
}
}
@@ -518,8 +589,12 @@ fsop: if (mfp == NULL && inmem) {
err: if (p != NULL)
__memp_free(&dbmp->reginfo[0], NULL, p);
- if (locked == 1)
- MPOOL_SYSTEM_UNLOCK(dbenv);
+ /* If we have buckets locked, unlock them when done moving files. */
+ if (locked == 1) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ if (nhp != NULL && nhp != hp)
+ MUTEX_UNLOCK(dbenv, nhp->mtx_hash);
+ }
return (ret);
}
diff --git a/db/mp/mp_mvcc.c b/db/mp/mp_mvcc.c
index 4a763e1de..e797df904 100644
--- a/db/mp/mp_mvcc.c
+++ b/db/mp/mp_mvcc.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 2006,2007 Oracle. All rights reserved.
*
- * $Id: mp_mvcc.c,v 12.24 2006/09/18 13:11:50 mjc Exp $
+ * $Id: mp_mvcc.c,v 12.34 2007/06/05 11:55:28 mjc Exp $
*/
#include "db_config.h"
@@ -92,9 +91,12 @@ __memp_bucket_reorder(dbenv, hp, bhp)
next, bhp, hq, __bh);
}
-done: /* Reset the hash bucket's priority. */
- hp->hash_priority =
- BH_PRIORITY(SH_TAILQ_FIRST(&hp->hash_bucket, __bh));
+done: /*
+ * Reset the hash bucket's priority -- the chain is never empty in
+ * this case, so bhp will never be NULL.
+ */
+ if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL)
+ hp->hash_priority = BH_PRIORITY(bhp);
}
/*
@@ -103,7 +105,8 @@ done: /* Reset the hash bucket's priority. */
*
* PUBLIC: int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *mfp, BH *, void *));
*/
-int __memp_bh_settxn(dbmp, mfp, bhp, vtd)
+int
+__memp_bh_settxn(dbmp, mfp, bhp, vtd)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
BH *bhp;
@@ -149,16 +152,13 @@ __memp_skip_curadj(dbc, pgno)
DB_MPOOL_HASH *hp;
DB_MPOOLFILE *dbmfp;
DB_TXN *txn;
- MPOOL *c_mp, *mp;
MPOOLFILE *mfp;
REGINFO *infop;
roff_t mf_offset;
- u_int32_t n_cache;
- int skip;
+ int ret, skip;
dbenv = dbc->dbp->dbenv;
dbmp = dbenv->mp_handle;
- mp = dbmp->reginfo[0].primary;
dbmfp = dbc->dbp->mpf;
mfp = dbmfp->mfp;
mf_offset = R_OFFSET(dbmp->reginfo, mfp);
@@ -172,13 +172,13 @@ __memp_skip_curadj(dbc, pgno)
* local pointers to them. Reset on each pass through this code, the
* page number can change.
*/
- n_cache = NCACHE(mp, mf_offset, pgno);
- infop = &dbmp->reginfo[n_cache];
- c_mp = infop->primary;
- hp = R_ADDR(infop, c_mp->htab);
- hp = &hp[NBUCKET(c_mp, mf_offset, pgno)];
+ MP_GET_BUCKET(dbmfp, pgno, &infop, hp, ret);
+ if (ret != 0) {
+ /* Panic: there is no way to return the error. */
+ (void)__db_panic(dbenv, ret);
+ return (0);
+ }
- MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
if (bhp->pgno != pgno || bhp->mf_offset != mf_offset)
continue;
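
__memp_skip_curadj returns only a yes/no answer, so an error from MP_GET_BUCKET has nowhere to go except a panic of the whole environment. A minimal sketch of that last-resort pattern, with hypothetical helper names:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical helpers, for illustration only. */
    static int
    lookup(int key, int *valp)
    {
        (void)key;
        (void)valp;
        return (-1);            /* may legitimately fail */
    }

    static void
    env_panic(const char *msg)
    {
        fprintf(stderr, "panic: %s\n", msg);
        abort();
    }

    /* The interface can only answer yes or no, as above. */
    static int
    should_skip(int key)
    {
        int val = 0;

        if (lookup(key, &val) != 0)
            env_panic("no way to return the error");
        return (val != 0);
    }
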
@@ -251,12 +251,12 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
*need_frozenp = 1;
/* There might be a small amount of unallocated space. */
- if (__db_shalloc(infop,
- sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), 0,
+ if (__env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE),
&frozen_alloc) == 0) {
frozen_bhp = (BH *)(frozen_alloc + 1);
- SH_TAILQ_INSERT_HEAD(&c_mp->alloc_frozen, frozen_alloc,
- links, __bh_frozen_a);
+ SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen,
+ frozen_alloc, links);
}
}
MPOOL_REGION_UNLOCK(dbenv, infop);
@@ -285,7 +285,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
if ((ret = __db_appname(dbenv, DB_APP_NONE, filename,
0, NULL, &real_name)) != 0)
goto err;
- if ((ret = __os_open_extend(dbenv, real_name, pagesize,
+ if ((ret = __os_open(dbenv, real_name, pagesize,
DB_OSO_CREATE | DB_OSO_EXCL, dbenv->db_mode, &fhp)) == 0) {
/* We're creating the file -- initialize the metadata page. */
magic = DB_FREEZER_MAGIC;
@@ -299,8 +299,8 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
(ret = __os_seek(dbenv, fhp, 0, 0, 0)) != 0)
goto err;
} else if (ret == EEXIST)
- ret = __os_open_extend(dbenv, real_name, pagesize, 0,
- dbenv->db_mode, &fhp);
+ ret = __os_open(
+ dbenv, real_name, pagesize, 0, dbenv->db_mode, &fhp);
if (ret != 0)
goto err;
if ((ret = __os_read(dbenv, fhp, &magic, sizeof(u_int32_t),
@@ -372,8 +372,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp)
* Increment the file's block count -- freeing the original buffer will
* decrement it.
*/
+ MUTEX_LOCK(dbenv, bh_mfp->mutex);
++bh_mfp->block_cnt;
- ++hp->hash_frozen;
+ MUTEX_UNLOCK(dbenv, bh_mfp->mutex);
+
+ STAT(++hp->hash_frozen);
if (0) {
err: if (ret == 0)
@@ -492,8 +495,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
&real_name)) != 0)
goto err;
- if ((ret = __os_open_extend(dbenv, real_name, pagesize, 0,
- dbenv->db_mode, &fhp)) != 0)
+ if ((ret = __os_open(
+ dbenv, real_name, pagesize, 0, dbenv->db_mode, &fhp)) != 0)
goto err;
/*
@@ -625,8 +628,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
if (reorder) {
if (next_bhp != NULL)
__memp_bucket_reorder(dbenv, hp, next_bhp);
- else
- hp->hash_priority = BH_PRIORITY(SH_TAILQ_FIRST(
+ else if (!SH_TAILQ_EMPTY(&hp->hash_bucket))
+ hp->hash_priority = BH_PRIORITY(SH_TAILQ_FIRSTP(
&hp->hash_bucket, __bh));
}
@@ -651,10 +654,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp)
F_CLR(frozen_bhp, BH_FROZEN | BH_LOCKED);
}
+#ifdef HAVE_STATISTICS
if (alloc_bhp != NULL)
++hp->hash_thawed;
else
++hp->hash_frozen_freed;
+#endif
if (0) {
err: if (ret == 0)
diff --git a/db/mp/mp_region.c b/db/mp/mp_region.c
index a02683f21..34a1ced15 100644
--- a/db/mp/mp_region.c
+++ b/db/mp/mp_region.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_region.c,v 12.21 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_region.c,v 12.33 2007/05/17 17:18:01 bostic Exp $
*/
#include "db_config.h"
@@ -12,7 +11,6 @@
#include "db_int.h"
#include "dbinc/mp.h"
-static int __memp_init __P((DB_ENV *, DB_MPOOL *, u_int, u_int32_t));
static int __memp_init_config __P((DB_ENV *, MPOOL *));
static void __memp_region_size __P((DB_ENV *, roff_t *, u_int32_t *));
@@ -20,17 +18,18 @@ static void __memp_region_size __P((DB_ENV *, roff_t *, u_int32_t *));
* __memp_open --
* Internal version of memp_open: only called from DB_ENV->open.
*
- * PUBLIC: int __memp_open __P((DB_ENV *));
+ * PUBLIC: int __memp_open __P((DB_ENV *, int));
*/
int
-__memp_open(dbenv)
+__memp_open(dbenv, create_ok)
DB_ENV *dbenv;
+ int create_ok;
{
DB_MPOOL *dbmp;
MPOOL *mp;
REGINFO reginfo;
roff_t reg_size;
- u_int i;
+ u_int i, max_nreg;
u_int32_t htab_buckets, *regids;
int ret;
@@ -50,9 +49,9 @@ __memp_open(dbenv)
reginfo.type = REGION_TYPE_MPOOL;
reginfo.id = INVALID_REGION_ID;
reginfo.flags = REGION_JOIN_OK;
- if (F_ISSET(dbenv, DB_ENV_CREATE))
+ if (create_ok)
F_SET(&reginfo, REGION_CREATE_OK);
- if ((ret = __db_r_attach(dbenv, &reginfo, reg_size)) != 0)
+ if ((ret = __env_region_attach(dbenv, &reginfo, reg_size)) != 0)
goto err;
/*
@@ -65,17 +64,18 @@ __memp_open(dbenv)
* the REGINFO structures and create them. Make sure we don't
* clear the wrong entries on error.
*/
- dbmp->nreg = dbenv->mp_ncache;
+ max_nreg = __memp_max_regions(dbenv);
if ((ret = __os_calloc(dbenv,
- dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
goto err;
/* Make sure we don't clear the wrong entries on error. */
- for (i = 0; i < dbmp->nreg; ++i)
- dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[0] = reginfo;
+ for (i = 1; i < max_nreg; ++i)
+ dbmp->reginfo[i].id = INVALID_REGION_ID;
/* Initialize the first region. */
- if ((ret = __memp_init(dbenv, dbmp, 0, htab_buckets)) != 0)
+ if ((ret = __memp_init(dbenv, dbmp,
+ 0, htab_buckets, max_nreg)) != 0)
goto err;
/*
@@ -84,16 +84,17 @@ __memp_open(dbenv)
*/
mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary);
regids = R_ADDR(dbmp->reginfo, mp->regids);
- for (i = 1; i < dbmp->nreg; ++i) {
+ regids[0] = dbmp->reginfo[0].id;
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
dbmp->reginfo[i].dbenv = dbenv;
dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[i].flags = REGION_CREATE_OK;
- if ((ret = __db_r_attach(
+ if ((ret = __env_region_attach(
dbenv, &dbmp->reginfo[i], reg_size)) != 0)
goto err;
- if ((ret =
- __memp_init(dbenv, dbmp, i, htab_buckets)) != 0)
+ if ((ret = __memp_init(dbenv, dbmp,
+ i, htab_buckets, max_nreg)) != 0)
goto err;
regids[i] = dbmp->reginfo[i].id;
@@ -105,30 +106,30 @@ __memp_open(dbenv)
* information.
*/
mp = R_ADDR(&reginfo, reginfo.rp->primary);
- dbmp->nreg = mp->nreg;
+ dbenv->mp_ncache = mp->nreg;
if ((ret = __os_calloc(dbenv,
- dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
+ mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0)
goto err;
/* Make sure we don't clear the wrong entries on error. */
- for (i = 0; i < dbmp->nreg; ++i)
+ for (i = 0; i < dbenv->mp_ncache; ++i)
dbmp->reginfo[i].id = INVALID_REGION_ID;
dbmp->reginfo[0] = reginfo;
/* Join remaining regions. */
regids = R_ADDR(dbmp->reginfo, mp->regids);
- for (i = 1; i < dbmp->nreg; ++i) {
+ for (i = 1; i < dbenv->mp_ncache; ++i) {
dbmp->reginfo[i].dbenv = dbenv;
dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
dbmp->reginfo[i].id = regids[i];
dbmp->reginfo[i].flags = REGION_JOIN_OK;
- if ((ret = __db_r_attach(
+ if ((ret = __env_region_attach(
dbenv, &dbmp->reginfo[i], 0)) != 0)
goto err;
}
}
/* Set the local addresses for the regions. */
- for (i = 0; i < dbmp->nreg; ++i)
+ for (i = 0; i < dbenv->mp_ncache; ++i)
dbmp->reginfo[i].primary =
R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary);
@@ -147,9 +148,9 @@ __memp_open(dbenv)
err: dbenv->mp_handle = NULL;
if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) {
- for (i = 0; i < dbmp->nreg; ++i)
+ for (i = 0; i < dbenv->mp_ncache; ++i)
if (dbmp->reginfo[i].id != INVALID_REGION_ID)
- (void)__db_r_detach(
+ (void)__env_region_detach(
dbenv, &dbmp->reginfo[i], 0);
__os_free(dbenv, dbmp->reginfo);
}
@@ -162,27 +163,32 @@ err: dbenv->mp_handle = NULL;
/*
* __memp_init --
* Initialize a MPOOL structure in shared memory.
+ *
+ * PUBLIC: int __memp_init
+ * PUBLIC: __P((DB_ENV *, DB_MPOOL *, u_int, u_int32_t, u_int));
*/
-static int
-__memp_init(dbenv, dbmp, reginfo_off, htab_buckets)
+int
+__memp_init(dbenv, dbmp, reginfo_off, htab_buckets, max_nreg)
DB_ENV *dbenv;
DB_MPOOL *dbmp;
- u_int reginfo_off;
+ u_int reginfo_off, max_nreg;
u_int32_t htab_buckets;
{
+ BH_FROZEN_ALLOC *frozen;
+ BH *frozen_bhp;
DB_MPOOL_HASH *htab, *hp;
- MPOOL *mp;
- REGINFO *reginfo;
+ MPOOL *mp, *main_mp;
+ REGINFO *infop;
+ db_mutex_t mtx_base, mtx_discard, mtx_prev;
u_int32_t i;
int ret;
void *p;
- reginfo = &dbmp->reginfo[reginfo_off];
- if ((ret = __db_shalloc(
- reginfo, sizeof(MPOOL), 0, &reginfo->primary)) != 0)
+ infop = &dbmp->reginfo[reginfo_off];
+ if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0)
goto mem_err;
- reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary);
- mp = reginfo->primary;
+ infop->rp->primary = R_OFFSET(infop, infop->primary);
+ mp = infop->primary;
memset(mp, 0, sizeof(*mp));
if ((ret =
@@ -192,17 +198,19 @@ __memp_init(dbenv, dbmp, reginfo_off, htab_buckets)
if (reginfo_off == 0) {
ZERO_LSN(mp->lsn);
- mp->nreg = dbmp->nreg;
- if ((ret = __db_shalloc(&dbmp->reginfo[0],
- dbmp->nreg * sizeof(u_int32_t), 0, &p)) != 0)
+ mp->nreg = dbenv->mp_ncache;
+ mp->max_nreg = max_nreg;
+ if ((ret = __env_alloc(&dbmp->reginfo[0],
+ max_nreg * sizeof(u_int32_t), &p)) != 0)
goto mem_err;
mp->regids = R_OFFSET(dbmp->reginfo, p);
+ mp->nbuckets = dbenv->mp_ncache * htab_buckets;
/* Allocate file table space and initialize it. */
- if ((ret = __db_shalloc(reginfo,
- MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
+ if ((ret = __env_alloc(infop,
+ MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0)
goto mem_err;
- mp->ftab = R_OFFSET(reginfo, htab);
+ mp->ftab = R_OFFSET(infop, htab);
for (i = 0; i < MPOOL_FILE_BUCKETS; i++) {
if ((ret = __mutex_alloc(dbenv,
MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0)
@@ -211,32 +219,80 @@ __memp_init(dbenv, dbmp, reginfo_off, htab_buckets)
htab[i].hash_page_dirty = htab[i].hash_priority = 0;
}
+ /*
+ * Allocate all of the hash bucket mutexes up front. We do
+ * this so that we don't need to free and reallocate mutexes as
+ * the cache is resized.
+ */
+ mtx_base = mtx_prev = MUTEX_INVALID;
+ for (i = 0; i < mp->max_nreg * htab_buckets; i++) {
+ if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_HASH_BUCKET,
+ 0, &mtx_discard)) != 0)
+ return (ret);
+ if (i == 0) {
+ mtx_base = mtx_discard;
+ mtx_prev = mtx_discard - 1;
+ }
+ DB_ASSERT(dbenv, mtx_discard == mtx_prev + 1 ||
+ mtx_base == MUTEX_INVALID);
+ mtx_prev = mtx_discard;
+ if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_IO,
+ DB_MUTEX_SELF_BLOCK, &mtx_discard)) != 0)
+ return (ret);
+ DB_ASSERT(dbenv, mtx_discard == mtx_prev + 1 ||
+ mtx_base == MUTEX_INVALID);
+ mtx_prev = mtx_discard;
+ }
+ } else {
+ main_mp = dbmp->reginfo[0].primary;
+ htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab);
+ mtx_base = htab[0].mtx_hash;
}
+ if (mtx_base != MUTEX_INVALID)
+ mtx_base += reginfo_off * htab_buckets;
+
/* Allocate hash table space and initialize it. */
- if ((ret = __db_shalloc(reginfo,
- htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0)
+ if ((ret = __env_alloc(infop,
+ htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0)
goto mem_err;
- mp->htab = R_OFFSET(reginfo, htab);
+ mp->htab = R_OFFSET(infop, htab);
for (i = 0; i < htab_buckets; i++) {
hp = &htab[i];
- if ((ret = __mutex_alloc(dbenv,
- MTX_MPOOL_HASH_BUCKET, 0, &hp->mtx_hash)) != 0)
- return (ret);
- if ((ret = __mutex_alloc(dbenv,
- MTX_MPOOL_IO, DB_MUTEX_SELF_BLOCK, &hp->mtx_io)) != 0)
- return (ret);
+ hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID :
+ mtx_base + i * 2;
+ hp->mtx_io = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID :
+ mtx_base + i * 2 + 1;
SH_TAILQ_INIT(&hp->hash_bucket);
- hp->hash_page_dirty = hp->hash_priority = hp->hash_io_wait = 0;
+ hp->hash_page_dirty = hp->hash_priority = 0;
+#ifdef HAVE_STATISTICS
+ hp->hash_io_wait = 0;
+ hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0;
+#endif
hp->flags = 0;
ZERO_LSN(hp->old_reader);
}
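
Preallocating every hash-bucket mutex in one contiguous block is what allows a bucket's two mutexes to be computed from a base ID rather than stored, freed, and reallocated as the cache is resized. A simplified sketch of the idea, using an idealized layout with POSIX mutexes rather than the exact ID offsets used above:

    #include <pthread.h>
    #include <stddef.h>

    /*
     * Sketch: one contiguous array holds two mutexes per bucket for
     * all max_nreg possible regions; (region, bucket) maps to a pair
     * index, so no per-bucket mutex bookkeeping is needed when
     * regions come and go.
     */
    struct bucket_mutexes {
        pthread_mutex_t *pairs;   /* 2 * max_nreg * htab_buckets */
        unsigned htab_buckets;
    };

    static pthread_mutex_t *
    hash_mutex(struct bucket_mutexes *bm, unsigned region, unsigned bucket)
    {
        size_t pair = (size_t)region * bm->htab_buckets + bucket;

        return (&bm->pairs[2 * pair]);  /* I/O mutex: 2 * pair + 1 */
    }
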
- mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets;
+ mp->htab_buckets = htab_buckets;
+#ifdef HAVE_STATISTICS
+ mp->stat.st_hash_buckets = htab_buckets;
+#endif
SH_TAILQ_INIT(&mp->free_frozen);
SH_TAILQ_INIT(&mp->alloc_frozen);
/*
+ * Pre-allocate one frozen buffer header. This avoids situations where
+ * the cache becomes full of pages and we don't even have the 28 bytes
+ * (or so) available to allocate a frozen buffer header.
+ */
+ if ((ret = __env_alloc(infop,
+ sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0)
+ goto mem_err;
+ frozen_bhp = (BH *)(frozen + 1);
+ SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links);
+ SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq);
+
+ /*
* Only the environment creator knows the total cache size, fill in
* those statistics now.
*/
@@ -249,6 +305,25 @@ mem_err:__db_errx(dbenv, "Unable to allocate memory for mpool region");
}
/*
+ * PUBLIC: u_int32_t __memp_max_regions __P((DB_ENV *));
+ */
+u_int32_t
+__memp_max_regions(dbenv)
+ DB_ENV *dbenv;
+{
+ roff_t reg_size, max_size;
+ u_int32_t max_nreg;
+
+ __memp_region_size(dbenv, &reg_size, NULL);
+ max_size = (roff_t)dbenv->mp_max_gbytes * GIGABYTE +
+ dbenv->mp_max_bytes;
+ max_nreg = (max_size + reg_size / 2) / reg_size;
+ if (max_nreg <= dbenv->mp_ncache)
+ max_nreg = dbenv->mp_ncache;
+ return (max_nreg);
+}
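
The expression (max_size + reg_size / 2) / reg_size rounds to the nearest whole region instead of truncating. A worked example with assumed sizes:

    #include <stdint.h>
    #include <stdio.h>

    #define GIGABYTE ((uint64_t)1024 * 1024 * 1024)

    /* Assumed configuration: 256MB regions, 1.1GB maximum cache. */
    int
    main(void)
    {
        uint64_t reg_size = 256 * 1024 * 1024;
        uint64_t max_size = GIGABYTE + 100 * 1024 * 1024;

        /* 1124MB / 256MB = 4.39 regions; nearest whole region is 4. */
        printf("max_nreg = %u\n",
            (unsigned)((max_size + reg_size / 2) / reg_size));
        return (0);
    }
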
+
+/*
* __memp_region_size --
* Size the region and figure out how many hash buckets we'll have.
*/
@@ -258,15 +333,16 @@ __memp_region_size(dbenv, reg_sizep, htab_bucketsp)
roff_t *reg_sizep;
u_int32_t *htab_bucketsp;
{
- roff_t reg_size;
+ roff_t reg_size, cache_size;
/*
* Figure out how big each cache region is. Cast an operand to roff_t
* so we do 64-bit arithmetic as appropriate.
*/
- reg_size = ((roff_t)GIGABYTE / dbenv->mp_ncache) * dbenv->mp_gbytes;
- reg_size += dbenv->mp_bytes / dbenv->mp_ncache;
- *reg_sizep = reg_size;
+ cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes;
+ reg_size = cache_size / dbenv->mp_ncache;
+ if (reg_sizep != NULL)
+ *reg_sizep = reg_size;
/*
* Figure out how many hash buckets each region will have. Assume we
@@ -281,7 +357,9 @@ __memp_region_size(dbenv, reg_sizep, htab_bucketsp)
* something we need to worry about right now, but is checked when the
* cache size is set.
*/
- *htab_bucketsp = __db_tablesize((u_int32_t)(reg_size / (10 * 1024)));
+ if (htab_bucketsp != NULL)
+ *htab_bucketsp =
+ __db_tablesize((u_int32_t)(reg_size / (10 * 1024)));
}
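
A worked example of the sizing above, under an assumed configuration of a 2GB cache split into 4 regions:

    #include <stdint.h>
    #include <stdio.h>

    #define GIGABYTE ((uint64_t)1024 * 1024 * 1024)

    int
    main(void)
    {
        uint64_t cache_size = 2 * GIGABYTE;   /* assumed: 2GB, 4 caches */
        uint32_t ncache = 4;
        uint64_t reg_size = cache_size / ncache;

        /* 512MB regions; ~52,428 buckets before __db_tablesize
         * rounds the estimate to a suitable table size. */
        printf("reg_size = %lluMB, buckets ~= %llu\n",
            (unsigned long long)(reg_size >> 20),
            (unsigned long long)(reg_size / (10 * 1024)));
        return (0);
    }
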
/*
@@ -294,10 +372,9 @@ u_int32_t
__memp_region_mutex_count(dbenv)
DB_ENV *dbenv;
{
- roff_t reg_size;
u_int32_t htab_buckets;
- __memp_region_size(dbenv, &reg_size, &htab_buckets);
+ __memp_region_size(dbenv, NULL, &htab_buckets);
/*
* We need a couple of mutexes for the region itself, one for each
@@ -334,13 +411,13 @@ __memp_init_config(dbenv, mp)
}
/*
- * __memp_dbenv_refresh --
+ * __memp_env_refresh --
* Clean up after the mpool system on a close or failed open.
*
- * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *));
+ * PUBLIC: int __memp_env_refresh __P((DB_ENV *));
*/
int
-__memp_dbenv_refresh(dbenv)
+__memp_env_refresh(dbenv)
DB_ENV *dbenv;
{
BH *bhp;
@@ -349,53 +426,72 @@ __memp_dbenv_refresh(dbenv)
DB_MPOOLFILE *dbmfp;
DB_MPOOL_HASH *hp;
DB_MPREG *mpreg;
- MPOOL *mp;
- REGINFO *reginfo;
- u_int32_t bucket, i;
+ MPOOL *mp, *c_mp;
+ REGINFO *infop;
+ db_mutex_t mtx_base, mtx;
+ u_int32_t bucket, htab_buckets, i, max_nreg, nreg;
int ret, t_ret;
ret = 0;
dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ htab_buckets = mp->htab_buckets;
+ nreg = mp->nreg;
+ max_nreg = mp->max_nreg;
+ hp = R_ADDR(&dbmp->reginfo[0], mp->htab);
+ mtx_base = hp->mtx_hash;
/*
* If a private region, return the memory to the heap. Not needed for
* filesystem-backed or system shared memory regions, that memory isn't
* owned by any particular process.
- *
- * Discard buffers.
*/
- if (F_ISSET(dbenv, DB_ENV_PRIVATE))
- for (i = 0; i < dbmp->nreg; ++i) {
- reginfo = &dbmp->reginfo[i];
- mp = reginfo->primary;
- for (hp = R_ADDR(reginfo, mp->htab), bucket = 0;
- bucket < mp->htab_buckets; ++hp, ++bucket) {
- while ((bhp = SH_TAILQ_FIRST(
- &hp->hash_bucket, __bh)) != NULL)
- if (F_ISSET(bhp, BH_FROZEN))
- SH_TAILQ_REMOVE(
- &hp->hash_bucket, bhp,
- hq, __bh);
- else if ((t_ret = __memp_bhfree(
- dbmp, hp, bhp,
+ if (!F_ISSET(dbenv, DB_ENV_PRIVATE))
+ goto not_priv;
+
+ /* Discard buffers. */
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ for (hp = R_ADDR(infop, c_mp->htab), bucket = 0;
+ bucket < c_mp->htab_buckets; ++hp, ++bucket) {
+ while ((bhp = SH_TAILQ_FIRST(
+ &hp->hash_bucket, __bh)) != NULL)
+ if (F_ISSET(bhp, BH_FROZEN))
+ SH_TAILQ_REMOVE(
+ &hp->hash_bucket, bhp,
+ hq, __bh);
+ else {
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ --hp->hash_page_dirty;
+ F_CLR(bhp,
+ BH_DIRTY | BH_DIRTY_CREATE);
+ }
+ if ((t_ret = __memp_bhfree(
+ dbmp, infop, hp, bhp,
BH_FREE_FREEMEM |
BH_FREE_UNLOCKED)) != 0 && ret == 0)
ret = t_ret;
- if ((t_ret = __mutex_free(
- dbenv, &hp->mtx_hash)) != 0 && ret == 0)
- ret = t_ret;
- if ((t_ret = __mutex_free(
- dbenv, &hp->mtx_io)) != 0 && ret == 0)
- ret = t_ret;
- }
- while ((frozen_alloc = SH_TAILQ_FIRST(
- &mp->alloc_frozen, __bh_frozen_a)) != NULL) {
- SH_TAILQ_REMOVE(&mp->alloc_frozen, frozen_alloc,
- links, __bh_frozen_a);
- __db_shalloc_free(reginfo, frozen_alloc);
- }
+ }
+ }
+ while ((frozen_alloc = SH_TAILQ_FIRST(
+ &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) {
+ SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc,
+ links, __bh_frozen_a);
+ __env_alloc_free(infop, frozen_alloc);
+ }
+ }
+
+ /* Discard hash bucket mutexes. */
+ if (mtx_base != MUTEX_INVALID)
+ for (i = 0; i < 2 * max_nreg * htab_buckets; ++i) {
+ mtx = mtx_base + i;
+ if ((t_ret = __mutex_free(dbenv, &mtx)) != 0 &&
+ ret == 0)
+ ret = t_ret;
}
+not_priv:
/* Discard DB_MPOOLFILEs. */
while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL)
if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
@@ -415,25 +511,25 @@ __memp_dbenv_refresh(dbenv)
if (F_ISSET(dbenv, DB_ENV_PRIVATE)) {
/* Discard REGION IDs. */
- reginfo = &dbmp->reginfo[0];
- mp = dbmp->reginfo[0].primary;
- __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->regids));
+ infop = &dbmp->reginfo[0];
+ __memp_free(infop, NULL, R_ADDR(infop, mp->regids));
/* Discard the File table. */
- __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->ftab));
+ __memp_free(infop, NULL, R_ADDR(infop, mp->ftab));
/* Discard Hash tables. */
- for (i = 0; i < dbmp->nreg; ++i) {
- reginfo = &dbmp->reginfo[i];
- mp = reginfo->primary;
- __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->htab));
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ c_mp = infop->primary;
+ __memp_free(infop, NULL, R_ADDR(infop, c_mp->htab));
}
}
/* Detach from the region. */
- for (i = 0; i < dbmp->nreg; ++i) {
- reginfo = &dbmp->reginfo[i];
- if ((t_ret = __db_r_detach(dbenv, reginfo, 0)) != 0 && ret == 0)
+ for (i = 0; i < nreg; ++i) {
+ infop = &dbmp->reginfo[i];
+ if ((t_ret =
+ __env_region_detach(dbenv, infop, 0)) != 0 && ret == 0)
ret = t_ret;
}
diff --git a/db/mp/mp_register.c b/db/mp/mp_register.c
index 1ca5f8311..ef5269d42 100644
--- a/db/mp/mp_register.c
+++ b/db/mp/mp_register.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_register.c,v 12.11 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_register.c,v 12.13 2007/05/17 15:15:45 bostic Exp $
*/
#include "db_config.h"
diff --git a/db/mp/mp_resize.c b/db/mp/mp_resize.c
new file mode 100644
index 000000000..241f37e4b
--- /dev/null
+++ b/db/mp/mp_resize.c
@@ -0,0 +1,559 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 2006,2007 Oracle. All rights reserved.
+ *
+ * $Id: mp_resize.c,v 12.5 2007/06/05 11:55:28 mjc Exp $
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+
+static int __memp_add_bucket __P((DB_MPOOL *));
+static int __memp_add_region __P((DB_MPOOL *));
+static int __memp_map_regions __P((DB_MPOOL *));
+static int __memp_merge_buckets
+ __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t));
+static int __memp_remove_bucket __P((DB_MPOOL *));
+static int __memp_remove_region __P((DB_MPOOL *));
+
+/*
+ * PUBLIC: int __memp_get_bucket
+ * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **));
+ */
+int
+__memp_get_bucket(dbmfp, pgno, infopp, hpp)
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ REGINFO **infopp;
+ DB_MPOOL_HASH **hpp;
+{
+ DB_ENV *dbenv;
+ DB_MPOOL *dbmp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *c_mp, *mp;
+ REGINFO *infop;
+ roff_t mf_offset;
+ u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region;
+ u_int32_t *regids;
+ int ret;
+
+ dbenv = dbmfp->dbenv;
+ dbmp = dbenv->mp_handle;
+ mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp);
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ for (;;) {
+ nbuckets = mp->nbuckets;
+ MP_BUCKET(mf_offset, pgno, nbuckets, bucket);
+
+ /*
+ * Once we work out which region we are looking in, we have to
+ * check that we have that region mapped, and that the version
+ * we have matches the ID in the main mpool region. Otherwise
+ * we have to go and map in any regions that don't match and
+ * retry.
+ */
+ region = NREGION(mp, bucket);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+
+ for (;;) {
+ infop = *infopp = &dbmp->reginfo[region];
+ c_mp = infop->primary;
+
+ /* If we have the correct region mapped, we're done. */
+ if (c_mp != NULL && regids[region] == infop->id)
+ break;
+ if ((ret = __memp_map_regions(dbmp)) != 0)
+ return (ret);
+ }
+
+ /* If our caller wants the hash bucket, lock it here. */
+ if (hpp != NULL) {
+ hp = R_ADDR(infop, c_mp->htab);
+ hp = &hp[bucket - region * mp->htab_buckets];
+
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+
+ /*
+ * Check that we still have the correct region mapped.
+ */
+ if (regids[region] != infop->id) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ continue;
+ }
+
+ /*
+ * Now that the bucket is locked, we need to check that
+ * the cache has not been resized while we waited.
+ */
+ new_nbuckets = mp->nbuckets;
+ if (nbuckets != new_nbuckets) {
+ MP_BUCKET(mf_offset, pgno, new_nbuckets,
+ new_bucket);
+
+ if (new_bucket != bucket) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
+ continue;
+ }
+ }
+
+ *hpp = hp;
+ }
+
+ break;
+ }
+
+ return (ret);
+}
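
Because a resize can change mp->nbuckets while a thread is blocked on a bucket mutex, the function locks first and revalidates afterwards. The same lock-and-revalidate loop in miniature, with a hypothetical table type and POSIX mutexes:

    #include <pthread.h>
    #include <stdint.h>

    struct table {
        pthread_mutex_t *locks;   /* one mutex per bucket */
        uint32_t nbuckets;        /* changed only by a resizer */
    };

    /*
     * Sketch: compute the bucket from the shared count, lock it,
     * then re-check -- if a resize moved the page's bucket while we
     * waited, drop the lock and retry.  Returns with the bucket
     * mutex held.
     */
    static uint32_t
    get_bucket(struct table *t, uint32_t hash)
    {
        uint32_t bucket;

        for (;;) {
            bucket = hash % t->nbuckets;
            pthread_mutex_lock(&t->locks[bucket]);
            if (hash % t->nbuckets == bucket)
                return (bucket);
            pthread_mutex_unlock(&t->locks[bucket]);
        }
    }
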
+
+static int
+__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket)
+ DB_MPOOL *dbmp;
+ u_int32_t new_nbuckets, old_bucket, new_bucket;
+{
+ BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp;
+ DB_ENV *dbenv;
+ DB_MPOOL_HASH *new_hp, *old_hp;
+ MPOOL *mp, *new_mp, *old_mp;
+ MPOOLFILE *mfp;
+ REGINFO *new_infop, *old_infop;
+ u_int32_t bucket, high_mask, new_region, old_region;
+ int ret;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ new_bhp = NULL;
+ ret = 0;
+
+ MP_MASK(new_nbuckets, high_mask);
+
+ old_region = NREGION(mp, old_bucket);
+ old_infop = &dbmp->reginfo[old_region];
+ old_mp = old_infop->primary;
+ old_hp = R_ADDR(old_infop, old_mp->htab);
+ old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets];
+
+ new_region = NREGION(mp, new_bucket);
+ new_infop = &dbmp->reginfo[new_region];
+ new_mp = new_infop->primary;
+ new_hp = R_ADDR(new_infop, new_mp->htab);
+ new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets];
+
+ /*
+ * Before merging, we need to check that there are no old buffers left
+ * in the target hash bucket after a previous split.
+ */
+free_old:
+ MUTEX_LOCK(dbenv, new_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) {
+ MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket);
+
+ if (bucket != new_bucket) {
+ /*
+ * There is no way that an old buffer can be locked
+ * after a split, since everyone will look for it in
+ * the new hash bucket.
+ */
+ DB_ASSERT(dbenv, !F_ISSET(bhp, BH_LOCKED | BH_DIRTY) &&
+ bhp->ref == 0);
+ if ((ret = __memp_bhfree(dbmp,
+ new_infop, new_hp, bhp, BH_FREE_FREEMEM)) != 0) {
+ MUTEX_UNLOCK(dbenv, new_hp->mtx_hash);
+ return (ret);
+ }
+
+ /*
+ * The free has modified the list of buffers and
+ * dropped the mutex. We need to start again.
+ */
+ goto free_old;
+ }
+ }
+ MUTEX_UNLOCK(dbenv, new_hp->mtx_hash);
+
+ /*
+ * Before we begin, make sure that all of the buffers we care about are
+ * not in use and not frozen. We do this because we can't drop the old
+ * hash bucket mutex once we start moving buffers around.
+ */
+retry: MUTEX_LOCK(dbenv, old_hp->mtx_hash);
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+
+ if (bucket == new_bucket &&
+ (F_ISSET(bhp, BH_LOCKED) || bhp->ref != 0)) {
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ __os_yield(dbenv);
+ goto retry;
+ } else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) {
+ if (BH_OBSOLETE(bhp, old_hp->old_reader))
+ alloc_bhp = NULL;
+ else {
+ ++bhp->ref;
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ if ((ret = __memp_alloc(dbmp,
+ old_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ return (ret);
+ MUTEX_LOCK(dbenv, old_hp->mtx_hash);
+ }
+ if ((ret = __memp_bh_thaw(dbmp,
+ old_infop, old_hp, bhp, alloc_bhp)) != 0) {
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ return (ret);
+ }
+
+ /*
+ * We've dropped the mutex in order to thaw, so we need
+ * to go back to the beginning and check that all of
+ * the buffers we care about are still unlocked and
+ * unreferenced.
+ */
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+ goto retry;
+ }
+ }
+
+ /*
+ * We now know that all of the buffers we care about are unlocked and
+ * unreferenced. Go ahead and copy them.
+ */
+ SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) {
+ MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno),
+ new_nbuckets, high_mask, bucket);
+ mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
+
+ /*
+ * We ignore buffers that don't hash to the new bucket. We
+ * could also ignore clean buffers which are not part of a
+ * multiversion chain as long as they have a backing file.
+ */
+ if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) &&
+ SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file))
+ continue;
+
+ for (current_bhp = bhp, next_bhp = NULL;
+ current_bhp != NULL;
+ current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh),
+ next_bhp = alloc_bhp) {
+ if ((ret = __memp_alloc(dbmp,
+ new_infop, mfp, 0, NULL, &alloc_bhp)) != 0)
+ break;
+
+ alloc_bhp->ref = current_bhp->ref;
+ alloc_bhp->ref_sync = current_bhp->ref_sync;
+ alloc_bhp->priority = current_bhp->priority;
+ alloc_bhp->pgno = current_bhp->pgno;
+ alloc_bhp->mf_offset = current_bhp->mf_offset;
+ alloc_bhp->flags = current_bhp->flags;
+ alloc_bhp->td_off = current_bhp->td_off;
+
+ /*
+ * We've duplicated the buffer, so now we need to
+ * update reference counts, including the counts in the
+ * per-MPOOLFILE and the transaction detail (for MVCC
+ * buffers).
+ */
+ MUTEX_LOCK(dbenv, mfp->mutex);
+ ++mfp->block_cnt;
+ MUTEX_UNLOCK(dbenv, mfp->mutex);
+
+ if (alloc_bhp->td_off != INVALID_ROFF &&
+ (ret = __txn_add_buffer(dbenv,
+ R_ADDR(&dbenv->tx_handle->reginfo,
+ alloc_bhp->td_off))) != 0)
+ break;
+
+ memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize);
+
+ /*
+ * We build up the MVCC chain first, then insert the
+ * head (stored in new_bhp) once.
+ */
+ if (next_bhp == NULL) {
+ SH_CHAIN_INIT(alloc_bhp, vc);
+ new_bhp = alloc_bhp;
+ } else
+ SH_CHAIN_INSERT_BEFORE(
+ next_bhp, alloc_bhp, vc, __bh);
+ }
+
+ MUTEX_LOCK(dbenv, new_hp->mtx_hash);
+ SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq);
+ if (F_ISSET(new_bhp, BH_DIRTY))
+ ++new_hp->hash_page_dirty;
+
+ /*
+ * We're doing an insertion sort, so it is O(N**2), but since
+ * buckets should be small, that should not matter. When
+ * splitting a bucket, we traverse in priority order and append
+ * to the new bucket, and __memp_bucket_reorder is O(1) in that
+ * case.
+ */
+ __memp_bucket_reorder(dbenv, new_hp, new_bhp);
+ MUTEX_UNLOCK(dbenv, new_hp->mtx_hash);
+
+ if (F_ISSET(bhp, BH_DIRTY)) {
+ F_CLR(bhp, BH_DIRTY);
+ --old_hp->hash_page_dirty;
+ }
+ }
+
+ if (ret == 0)
+ mp->nbuckets = new_nbuckets;
+ MUTEX_UNLOCK(dbenv, old_hp->mtx_hash);
+
+ return (ret);
+}
+
+static int
+__memp_add_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+
+ new_bucket = mp->nbuckets;
+ /* We should always be adding buckets to the last region. */
+ DB_ASSERT(dbenv, NREGION(mp, new_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets, high_mask);
+ old_bucket = new_bucket & (high_mask >> 1);
+
+ /*
+ * With fixed-sized regions, the new region is always smaller than the
+ * existing total cache size, so buffers always need to be copied. If
+ * we implement variable region sizes, it's possible that we will be
+ * splitting a hash bucket in the new region. Catch that here.
+ */
+ DB_ASSERT(dbenv, NREGION(mp, old_bucket) != NREGION(mp, new_bucket));
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets + 1,
+ old_bucket, new_bucket));
+}
+
+static int
+__memp_add_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ roff_t reg_size;
+ u_int i;
+ u_int32_t *regids;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ /* All cache regions are the same size. */
+ reg_size = dbmp->reginfo[0].rp->size;
+ ret = 0;
+
+ infop = &dbmp->reginfo[mp->nreg];
+ infop->dbenv = dbenv;
+ infop->type = REGION_TYPE_MPOOL;
+ infop->id = INVALID_REGION_ID;
+ infop->flags = REGION_CREATE_OK;
+ if ((ret = __env_region_attach(dbenv, infop, reg_size)) != 0)
+ return (ret);
+ if ((ret = __memp_init(dbenv,
+ dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0)
+ return (ret);
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ regids[mp->nreg++] = infop->id;
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_add_bucket(dbmp)) != 0)
+ break;
+
+ return (ret);
+}
+
+static int
+__memp_remove_bucket(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ u_int32_t high_mask, new_bucket, old_bucket;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+
+ old_bucket = mp->nbuckets - 1;
+
+ /* We should always be removing buckets from the last region. */
+ DB_ASSERT(dbenv, NREGION(mp, old_bucket) == mp->nreg - 1);
+ MP_MASK(mp->nbuckets - 1, high_mask);
+ new_bucket = old_bucket & (high_mask >> 1);
+
+ return (__memp_merge_buckets(dbmp, mp->nbuckets - 1,
+ old_bucket, new_bucket));
+}
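
Adding and removing buckets are the two halves of a linear-hashing scheme: growing from N to N+1 buckets splits bucket N & (high_mask >> 1) into itself and bucket N, and shrinking merges the same pair back. A small demonstration, assuming MP_MASK yields the smallest all-ones mask covering the bucket count:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed MP_MASK behavior: smallest all-ones mask covering n. */
    static uint32_t
    mask_for(uint32_t n)
    {
        uint32_t m;

        for (m = 1; m < n; m = (m << 1) | 1)
            ;
        return (m);
    }

    int
    main(void)
    {
        uint32_t nbuckets = 8;
        uint32_t new_bucket = nbuckets;   /* bucket 8 */
        uint32_t old_bucket = new_bucket & (mask_for(nbuckets) >> 1);

        /* Growing 8 -> 9 buckets splits bucket 0 into 0 and 8;
         * shrinking 9 -> 8 merges bucket 8 back into bucket 0. */
        printf("split %u -> %u\n", old_bucket, new_bucket);
        return (0);
    }
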
+
+static int
+__memp_remove_region(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ REGINFO *infop;
+ int ret;
+ u_int i;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ ret = 0;
+
+ if (mp->nreg == 1) {
+ __db_errx(dbenv, "cannot remove the last cache");
+ return (EINVAL);
+ }
+
+ for (i = 0; i < mp->htab_buckets; i++)
+ if ((ret = __memp_remove_bucket(dbmp)) != 0)
+ return (ret);
+
+ /* Detach from the region then destroy it. */
+ infop = &dbmp->reginfo[--mp->nreg];
+ return (__env_region_detach(dbenv, infop, 1));
+}
+
+static int
+__memp_map_regions(dbmp)
+ DB_MPOOL *dbmp;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ int ret;
+ u_int i;
+ u_int32_t *regids;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ regids = R_ADDR(dbmp->reginfo, mp->regids);
+ ret = 0;
+
+ for (i = 1; i < mp->nreg; ++i) {
+ if (dbmp->reginfo[i].primary != NULL &&
+ dbmp->reginfo[i].id == regids[i])
+ continue;
+
+ if (dbmp->reginfo[i].primary != NULL)
+ ret = __env_region_detach(dbenv, &dbmp->reginfo[i], 0);
+
+ dbmp->reginfo[i].dbenv = dbenv;
+ dbmp->reginfo[i].type = REGION_TYPE_MPOOL;
+ dbmp->reginfo[i].id = regids[i];
+ dbmp->reginfo[i].flags = REGION_JOIN_OK;
+ if ((ret =
+ __env_region_attach(dbenv, &dbmp->reginfo[i], 0)) != 0)
+ return (ret);
+ dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i],
+ dbmp->reginfo[i].rp->primary);
+ }
+
+ for (; i < mp->max_nreg; i++)
+ if (dbmp->reginfo[i].primary != NULL &&
+ (ret = __env_region_detach(dbenv,
+ &dbmp->reginfo[i], 0)) != 0)
+ break;
+
+ return (ret);
+}
+
+/*
+ * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t));
+ */
+int
+__memp_resize(dbmp, gbytes, bytes)
+ DB_MPOOL *dbmp;
+ u_int32_t gbytes, bytes;
+{
+ DB_ENV *dbenv;
+ MPOOL *mp;
+ int ret;
+ u_int32_t ncache;
+ roff_t reg_size, total_size;
+
+ dbenv = dbmp->dbenv;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ total_size = (roff_t)gbytes * GIGABYTE + bytes;
+ ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size);
+
+ if (ncache < 1)
+ ncache = 1;
+ else if (ncache > mp->max_nreg) {
+ __db_errx(dbenv,
+ "cannot resize to %lu cache regions: maximum is %lu",
+ (u_long)ncache, (u_long)mp->max_nreg);
+ return (EINVAL);
+ }
+
+ ret = 0;
+ MUTEX_LOCK(dbenv, mp->mtx_resize);
+ while (mp->nreg != ncache)
+ if ((ret = (mp->nreg < ncache ?
+ __memp_add_region(dbmp) :
+ __memp_remove_region(dbmp))) != 0)
+ break;
+ MUTEX_UNLOCK(dbenv, mp->mtx_resize);
+
+ return (ret);
+}
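
A hedged sketch of how an application might exercise this resizing, assuming the public DB_ENV->set_cache_max and DB_ENV->set_cachesize methods route to __memp_set_cache_max and (once the environment is open) to __memp_resize; error handling is omitted:

    #include <db.h>

    void
    grow_cache(DB_ENV *dbenv)
    {
        /* Before DB_ENV->open: 32MB initial cache, growable to 1GB. */
        (void)dbenv->set_cachesize(dbenv, 0, 32 * 1024 * 1024, 1);
        (void)dbenv->set_cache_max(dbenv, 1, 0);

        /* ... after DB_ENV->open: resize the live cache; mpool adds
         * or removes one region at a time under mtx_resize. */
        (void)dbenv->set_cachesize(dbenv, 0, 512 * 1024 * 1024, 0);
    }
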
+
+/*
+ * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *));
+ */
+int
+__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp)
+ DB_ENV *dbenv;
+ u_int32_t *max_gbytesp, *max_bytesp;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *mp;
+ roff_t reg_size, max_size;
+
+ ENV_NOT_CONFIGURED(dbenv,
+ dbenv->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL);
+
+ if (MPOOL_ON(dbenv)) {
+ dbmp = dbenv->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+ reg_size = dbmp->reginfo[0].rp->size;
+ max_size = mp->max_nreg * reg_size;
+ *max_gbytesp = (u_int32_t)(max_size / GIGABYTE);
+ *max_bytesp = (u_int32_t)(max_size % GIGABYTE);
+ } else {
+ *max_gbytesp = dbenv->mp_max_gbytes;
+ *max_bytesp = dbenv->mp_max_bytes;
+ }
+
+ return (0);
+}
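
The division and modulus split one 64-bit byte total back into the (gbytes, bytes) pair the API uses. For example, under assumed sizes:

    #include <stdint.h>
    #include <stdio.h>

    #define GIGABYTE ((uint64_t)1024 * 1024 * 1024)

    /* Assumed: 5 regions of 256MB = 1,342,177,280 bytes, reported
     * as 1GB plus 268,435,456 bytes. */
    int
    main(void)
    {
        uint64_t max_size = (uint64_t)5 * 256 * 1024 * 1024;

        printf("gbytes=%u bytes=%u\n",
            (unsigned)(max_size / GIGABYTE),
            (unsigned)(max_size % GIGABYTE));
        return (0);
    }
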
+
+/*
+ * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t));
+ */
+int
+__memp_set_cache_max(dbenv, max_gbytes, max_bytes)
+ DB_ENV *dbenv;
+ u_int32_t max_gbytes, max_bytes;
+{
+ ENV_ILLEGAL_AFTER_OPEN(dbenv, "DB_ENV->set_cache_max");
+ dbenv->mp_max_gbytes = max_gbytes;
+ dbenv->mp_max_bytes = max_bytes;
+
+ return (0);
+}
diff --git a/db/mp/mp_stat.c b/db/mp/mp_stat.c
index b4d4544b5..0e7b6c237 100644
--- a/db/mp/mp_stat.c
+++ b/db/mp/mp_stat.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_stat.c,v 12.28 2006/09/11 14:53:42 bostic Exp $
+ * $Id: mp_stat.c,v 12.36 2007/06/22 17:41:29 bostic Exp $
*/
#include "db_config.h"
@@ -104,10 +103,10 @@ __memp_stat(dbenv, gspp, fspp, flags)
* a per-cache basis. Note that configuration information
* may be modified at any time, and so we have to lock.
*/
- c_mp = dbmp->reginfo[0].primary;
- sp->st_gbytes = c_mp->stat.st_gbytes;
- sp->st_bytes = c_mp->stat.st_bytes;
- sp->st_ncache = dbmp->nreg;
+ sp->st_gbytes = mp->stat.st_gbytes;
+ sp->st_bytes = mp->stat.st_bytes;
+ sp->st_ncache = mp->nreg;
+ sp->st_max_ncache = mp->max_nreg;
sp->st_regsize = dbmp->reginfo[0].rp->size;
MPOOL_SYSTEM_LOCK(dbenv);
@@ -165,7 +164,8 @@ __memp_stat(dbenv, gspp, fspp, flags)
c_mp->stat.st_alloc_max_pages;
if (LF_ISSET(DB_STAT_CLEAR)) {
- __mutex_clear(dbenv, c_mp->mtx_region);
+ if (!LF_ISSET(DB_STAT_SUBSYSTEM))
+ __mutex_clear(dbenv, c_mp->mtx_region);
MPOOL_SYSTEM_LOCK(dbenv);
st_bytes = c_mp->stat.st_bytes;
@@ -388,9 +388,10 @@ __memp_stat_print(dbenv, flags)
int ret;
orig_flags = flags;
- LF_CLR(DB_STAT_CLEAR);
+ LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM);
if (flags == 0 || LF_ISSET(DB_STAT_ALL)) {
- ret = __memp_print_stats(dbenv, orig_flags);
+ ret = __memp_print_stats(dbenv,
+ LF_ISSET(DB_STAT_ALL) ? flags : orig_flags);
if (flags == 0 || ret != 0)
return (ret);
}
@@ -423,6 +424,7 @@ __memp_print_stats(dbenv, flags)
__db_dlbytes(dbenv, "Total cache size",
(u_long)gsp->st_gbytes, (u_long)0, (u_long)gsp->st_bytes);
__db_dl(dbenv, "Number of caches", (u_long)gsp->st_ncache);
+ __db_dl(dbenv, "Maximum number of caches", (u_long)gsp->st_max_ncache);
__db_dlbytes(dbenv, "Pool individual cache size",
(u_long)0, (u_long)0, (u_long)gsp->st_regsize);
__db_dlbytes(dbenv, "Maximum memory-mapped file size",
@@ -551,7 +553,7 @@ __memp_print_all(dbenv, flags)
MPOOL_SYSTEM_LOCK(dbenv);
- __db_print_reginfo(dbenv, dbmp->reginfo, "Mpool");
+ __db_print_reginfo(dbenv, dbmp->reginfo, "Mpool", flags);
__db_msg(dbenv, "%s", DB_GLOBAL(db_line));
__db_msg(dbenv, "MPOOL structure:");
@@ -567,7 +569,7 @@ __memp_print_all(dbenv, flags)
__db_msg(dbenv, "DB_MPOOL handle information:");
__mutex_print_debug_single(
dbenv, "DB_MPOOL handle mutex", dbmp->mutex, flags);
- STAT_ULONG("Underlying cache regions", dbmp->nreg);
+ STAT_ULONG("Underlying cache regions", mp->nreg);
__db_msg(dbenv, "%s", DB_GLOBAL(db_line));
__db_msg(dbenv, "DB_MPOOLFILE structures:");
@@ -709,9 +711,11 @@ __memp_print_hash(dbenv, dbmp, reginfo, fmap, flags)
bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) {
MUTEX_LOCK(dbenv, hp->mtx_hash);
if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) {
- __db_msgadd(dbenv, &mb, "bucket %lu: %lu, %lu ",
+ __db_msgadd(dbenv, &mb,
+ "bucket %lu: %lu, %lu (%lu dirty)",
(u_long)bucket, (u_long)hp->hash_io_wait,
- (u_long)hp->hash_priority);
+ (u_long)hp->hash_priority,
+ (u_long)hp->hash_page_dirty);
if (hp->hash_frozen != 0)
__db_msgadd(dbenv, &mb, "(MVCC %lu/%lu/%lu) ",
(u_long)hp->hash_frozen,
@@ -822,7 +826,8 @@ __memp_stat_wait(dbenv, reginfo, mp, mstat, flags)
mstat->st_hash_max_wait = tmp_wait;
mstat->st_hash_max_nowait = tmp_nowait;
}
- if (LF_ISSET(DB_STAT_CLEAR))
+ if (LF_ISSET(DB_STAT_CLEAR |
+ DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR)
__mutex_clear(dbenv, hp->mtx_hash);
mstat->st_io_wait += hp->hash_io_wait;
diff --git a/db/mp/mp_sync.c b/db/mp/mp_sync.c
index 898ae5b6d..5db83fc7b 100644
--- a/db/mp/mp_sync.c
+++ b/db/mp/mp_sync.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_sync.c,v 12.24 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_sync.c,v 12.52 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -12,6 +11,8 @@
#include "db_int.h"
#include "dbinc/log.h"
#include "dbinc/mp.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
typedef struct {
DB_MPOOL_HASH *track_hp; /* Hash bucket. */
@@ -21,8 +22,8 @@ typedef struct {
} BH_TRACK;
static int __bhcmp __P((const void *, const void *));
-static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *, int));
-static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *));
+static int __memp_close_flush_files __P((DB_ENV *, int));
+static int __memp_sync_files __P((DB_ENV *));
static int __memp_sync_file __P((DB_ENV *,
MPOOLFILE *, void *, u_int32_t *, u_int32_t));
@@ -93,7 +94,7 @@ __memp_sync_pp(dbenv, lsnp)
dbenv->lg_handle, "memp_sync", DB_INIT_LOG);
ENV_ENTER(dbenv, ip);
- REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, lsnp)), ret);
+ REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, DB_SYNC_CACHE, lsnp)), ret);
ENV_LEAVE(dbenv, ip);
return (ret);
}
@@ -102,16 +103,17 @@ __memp_sync_pp(dbenv, lsnp)
* __memp_sync --
* DB_ENV->memp_sync.
*
- * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *));
+ * PUBLIC: int __memp_sync __P((DB_ENV *, u_int32_t, DB_LSN *));
*/
int
-__memp_sync(dbenv, lsnp)
+__memp_sync(dbenv, flags, lsnp)
DB_ENV *dbenv;
+ u_int32_t flags;
DB_LSN *lsnp;
{
DB_MPOOL *dbmp;
MPOOL *mp;
- int ret;
+ int interrupted, ret;
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
@@ -128,10 +130,11 @@ __memp_sync(dbenv, lsnp)
MPOOL_SYSTEM_UNLOCK(dbenv);
}
- if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0)
+ if ((ret =
+ __memp_sync_int(dbenv, NULL, 0, flags, NULL, &interrupted)) != 0)
return (ret);
- if (lsnp != NULL) {
+ if (!interrupted && lsnp != NULL) {
MPOOL_SYSTEM_LOCK(dbenv);
if (LOG_COMPARE(lsnp, &mp->lsn) > 0)
mp->lsn = *lsnp;
@@ -195,7 +198,8 @@ __memp_fsync(dbmfp)
if (mfp->file_written == 0)
return (0);
- return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
+ return (__memp_sync_int(
+ dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL, NULL));
}
/*
@@ -209,6 +213,8 @@ __mp_xxx_fh(dbmfp, fhp)
DB_MPOOLFILE *dbmfp;
DB_FH **fhp;
{
+ int ret;
+
/*
* This is a truly spectacular layering violation, intended ONLY to
* support compatibility for the DB 1.85 DB->fd call.
@@ -226,7 +232,10 @@ __mp_xxx_fh(dbmfp, fhp)
if ((*fhp = dbmfp->fhp) != NULL)
return (0);
- return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL));
+ if ((ret = __memp_sync_int(
+ dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0)
+ *fhp = dbmfp->fhp;
+ return (ret);
}
/*
@@ -234,14 +243,14 @@ __mp_xxx_fh(dbmfp, fhp)
* Mpool sync internal function.
*
* PUBLIC: int __memp_sync_int __P((DB_ENV *,
- * PUBLIC: DB_MPOOLFILE *, u_int32_t, db_sync_op, u_int32_t *));
+ * PUBLIC: DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *));
*/
int
-__memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
+__memp_sync_int(dbenv, dbmfp, trickle_max, flags, wrote_totalp, interruptedp)
DB_ENV *dbenv;
DB_MPOOLFILE *dbmfp;
- u_int32_t trickle_max, *wrotep;
- db_sync_op op;
+ u_int32_t trickle_max, flags, *wrote_totalp;
+ int *interruptedp;
{
BH *bhp;
BH_TRACK *bharray;
@@ -251,20 +260,32 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
MPOOLFILE *mfp;
db_mutex_t mutex;
roff_t last_mf_offset;
- u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote;
- int filecnt, hb_lock, maxopenfd, maxwrite, maxwrite_sleep;
- int pass, ret, t_ret, wait_cnt, write_cnt;
+ u_int32_t ar_cnt, ar_max, dirty, i, n_cache, remaining, wrote_total;
+ int filecnt, maxopenfd, pass, required_write, ret, t_ret;
+ int wait_cnt, wrote_cnt;
dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
last_mf_offset = INVALID_ROFF;
- filecnt = pass = wrote = 0;
+ filecnt = pass = wrote_total = 0;
+
+ if (wrote_totalp != NULL)
+ *wrote_totalp = 0;
+ if (interruptedp != NULL)
+ *interruptedp = 0;
+
+ /*
+	 * If we're flushing the cache, doing a checkpoint or flushing a
+ * specific file, we really have to write the blocks and we have to
+ * confirm they made it to disk. Otherwise, we can skip a block if
+ * it's hard to get.
+ */
+ required_write = LF_ISSET(DB_SYNC_CACHE |
+ DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT);
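
The four modes named in the comment all promise that pages reach disk, so they collapse into a single required_write test. A sketch with made-up flag values (the real DB_SYNC_* constants are internal to DB):

    #include <stdint.h>

    #define DB_SYNC_CACHE        0x0001   /* hypothetical values */
    #define DB_SYNC_CHECKPOINT   0x0002
    #define DB_SYNC_FILE         0x0004
    #define DB_SYNC_QUEUE_EXTENT 0x0008
    #define DB_SYNC_TRICKLE      0x0010

    /* Writes must reach disk for every mode except trickle. */
    static int
    is_required_write(uint32_t flags)
    {
        return ((flags & (DB_SYNC_CACHE | DB_SYNC_CHECKPOINT |
            DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT)) != 0);
    }
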
/* Get shared configuration information. */
MPOOL_SYSTEM_LOCK(dbenv);
maxopenfd = mp->mp_maxopenfd;
- maxwrite = mp->mp_maxwrite;
- maxwrite_sleep = mp->mp_maxwrite_sleep;
MPOOL_SYSTEM_UNLOCK(dbenv);
/* Assume one dirty page per bucket. */
@@ -284,43 +305,60 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
for (i = 0; i < c_mp->htab_buckets; i++, hp++) {
/*
- * We can check for empty buckets before locking as we
- * only care if the pointer is zero or non-zero. We
- * can ignore empty buckets because we only need write
- * buffers that were dirty before we started.
+ * We can check for empty buckets before locking as
+ * we only care if the pointer is zero or non-zero.
+ * We can ignore empty or clean buckets because we
+ * only need write buffers that were dirty before
+ * we started.
*/
+#ifdef DIAGNOSTIC
if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL)
+#else
+ if (hp->hash_page_dirty == 0)
+#endif
continue;
+ dirty = 0;
MUTEX_LOCK(dbenv, hp->mtx_hash);
SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) {
/* Always ignore clean pages. */
if (!F_ISSET(bhp, BH_DIRTY))
continue;
+ dirty++;
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
/*
- * Ignore in-memory files, even if they are
- * temp files to whom a backing file has been
- * allocated.
+ * Ignore in-memory files, unless the file is
+ * specifically being flushed.
*/
- if (mfp->no_backing_file ||
+ if (mfp->no_backing_file)
+ continue;
+ if (!LF_ISSET(DB_SYNC_FILE) &&
F_ISSET(mfp, MP_TEMP))
continue;
/*
- * If we're flushing a specific file, see if
- * this page is from that file.
+ * Ignore files that aren't involved in DB's
+ * transactional operations during checkpoints.
*/
- if (dbmfp != NULL && mfp != dbmfp->mfp)
+ if (LF_ISSET(DB_SYNC_CHECKPOINT) &&
+ mfp->lsn_off == DB_LSN_OFF_NOTSET)
continue;
/*
- * Ignore files that aren't involved in DB's
- * transactional operations during checkpoints.
+ * Ignore files that aren't Queue extent files
+ * if we're flushing a Queue file with extents.
*/
- if (dbmfp == NULL && mfp->lsn_off == -1)
+ if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) &&
+ !F_ISSET(mfp, MP_EXTENT))
+ continue;
+
+ /*
+ * If we're flushing a specific file, see if
+ * this page is from that file.
+ */
+ if (dbmfp != NULL && mfp != dbmfp->mfp)
continue;
/* Track the buffer, we want it. */
@@ -343,10 +381,25 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
ar_max *= 2;
}
}
+ DB_ASSERT(dbenv, dirty == hp->hash_page_dirty);
+ if (dirty != hp->hash_page_dirty) {
+ __db_errx(dbenv,
+ "memp_sync: correcting dirty count %lu %lu",
+ (u_long)hp->hash_page_dirty, (u_long)dirty);
+ hp->hash_page_dirty = dirty;
+ }
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
if (ret != 0)
goto err;
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET(
+ mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
}
}
@@ -366,7 +419,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* If we're trickling buffers, only write enough to reach the correct
* percentage.
*/
- if (op == DB_SYNC_TRICKLE && ar_cnt > trickle_max)
+ if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max)
ar_cnt = trickle_max;
/*
@@ -385,7 +438,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* out its hash bucket pointer so we don't process a slot more than
* once.
*/
- for (i = pass = write_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
+ for (i = pass = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) {
if (i >= ar_cnt) {
i = 0;
++pass;
@@ -429,44 +482,40 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
*/
if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) {
MUTEX_UNLOCK(dbenv, mutex);
- if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) {
+ if (!required_write) {
--remaining;
bharray[i].track_hp = NULL;
}
continue;
}
- /*
- * The buffer is dirty and may also be pinned.
- *
- * Set the sync wait-for count, used to count down outstanding
- * references to this buffer as they are returned to the cache.
- */
- bhp->ref_sync = bhp->ref;
-
/* Pin the buffer into memory and lock it. */
++bhp->ref;
F_SET(bhp, BH_LOCKED);
/*
- * Unlock the hash bucket and wait for the wait-for count to
- * go to 0. No new thread can acquire the buffer because we
- * have it locked.
+ * If the buffer is referenced by another thread, set the sync
+ * wait-for count (used to count down outstanding references to
+ * this buffer as they are returned to the cache), then unlock
+ * the hash bucket and wait for the count to go to 0. No other
+ * thread can acquire the buffer because we have it locked.
*
* If a thread attempts to re-pin a page, the wait-for count
- * will never go to 0 (the thread spins on our buffer lock,
+ * will never go to 0 (that thread spins on our buffer lock,
* while we spin on the thread's ref count). Give up if we
- * don't get the buffer in 3 seconds, we can try again later.
+ * don't get the buffer in 3 seconds, we'll try again later.
*
* If, when the wait-for count goes to 0, the buffer is found
* to be dirty, write it.
*/
- MUTEX_UNLOCK(dbenv, mutex);
- for (wait_cnt = 1;
- bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
- __os_sleep(dbenv, 1, 0);
- MUTEX_LOCK(dbenv, mutex);
- hb_lock = 1;
+ bhp->ref_sync = bhp->ref - 1;
+ if (bhp->ref_sync != 0) {
+ MUTEX_UNLOCK(dbenv, mutex);
+ for (wait_cnt = 1;
+ bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
+ __os_sleep(dbenv, 1, 0);
+ MUTEX_LOCK(dbenv, mutex);
+ }
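
The rewritten wait only drops the bucket mutex when another thread actually holds a reference, and it gives up after roughly three seconds rather than blocking the sync forever. The bounded wait in isolation, with a hypothetical buffer type:

    #include <unistd.h>

    struct buf {
        volatile int ref_sync;    /* outstanding references to drain */
    };

    /* Returns nonzero if the references drained within ~3 seconds. */
    static int
    wait_for_refs(struct buf *bhp)
    {
        int wait_cnt;

        for (wait_cnt = 1; bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt)
            sleep(1);
        return (bhp->ref_sync == 0);
    }
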
/*
* If we've switched files, check to see if we're configured
@@ -476,7 +525,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
if (++filecnt >= maxopenfd) {
filecnt = 0;
if ((t_ret = __memp_close_flush_files(
- dbenv, dbmp, 1)) != 0 && ret == 0)
+ dbenv, 1)) != 0 && ret == 0)
ret = t_ret;
}
last_mf_offset = bhp->mf_offset;
@@ -496,28 +545,18 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* dirty, we write it. We only try to write the buffer once.
*/
if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) {
- MUTEX_UNLOCK(dbenv, mutex);
- hb_lock = 0;
-
mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset);
if ((t_ret =
- __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0)
- ++wrote;
- else {
+ __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) {
+ ++wrote_cnt;
+ ++wrote_total;
+ } else {
if (ret == 0)
ret = t_ret;
__db_errx
(dbenv, "%s: unable to flush page: %lu",
__memp_fns(dbmp, mfp), (u_long)bhp->pgno);
- }
- /*
- * Avoid saturating the disk, sleep once we've done
- * some number of writes.
- */
- if (maxwrite != 0 && ++write_cnt >= maxwrite) {
- write_cnt = 0;
- __os_sleep(dbenv, 0, (u_long)maxwrite_sleep);
}
}
@@ -525,18 +564,9 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
* If ref_sync count never went to 0, the buffer was written
* by another thread, or the write failed, we still have the
* buffer locked.
- *
- * We may or may not currently hold the hash bucket mutex. If
- * the __memp_bhwrite -> __memp_pgwrite call was successful,
- * __memp_pgwrite will have acquired the hash bucket lock; all
- * other call paths will leave us without the hash bucket lock.
*/
- if (F_ISSET(bhp, BH_LOCKED)) {
- if (!hb_lock)
- MUTEX_LOCK(dbenv, mutex);
-
+ if (F_ISSET(bhp, BH_LOCKED))
F_CLR(bhp, BH_LOCKED);
- }
/*
* Reset the ref_sync count regardless of our success, we're
@@ -548,7 +578,8 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
--bhp->ref;
/*
- * If a thread of control is waiting on this buffer, wake it up.
+ * If a thread of control is waiting in this hash bucket, wake
+ * it up.
*/
if (F_ISSET(hp, IO_WAITER)) {
F_CLR(hp, IO_WAITER);
@@ -557,29 +588,51 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep)
/* Release the hash bucket mutex. */
MUTEX_UNLOCK(dbenv, mutex);
+
+ /* Check if the call has been interrupted. */
+ if (LF_ISSET(DB_SYNC_INTERRUPT_OK) &&
+ FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) {
+ if (interruptedp != NULL)
+ *interruptedp = 1;
+ goto err;
+ }
+
+ /*
+ * Sleep after some number of writes to avoid disk saturation.
+		 * Don't cache the max writes value; an application shutting
+ * down might reset the value in order to do a fast flush or
+ * checkpoint.
+ */
+ if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) &&
+ !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) &&
+ mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) {
+ wrote_cnt = 0;
+ __os_sleep(
+ dbenv, 0, (u_long)mp->mp_maxwrite_sleep);
+ }
}
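
Re-reading mp->mp_maxwrite on every pass, instead of caching it as the deleted code did, lets an application zero the throttle mid-sync for a fast flush. The throttle itself reduces to a counter and a sleep; a sketch with hypothetical parameters:

    #include <unistd.h>

    /* Sleep after every max_writes successful writes; 0 disables. */
    static void
    throttle_writes(int *wrote_cntp, int max_writes, unsigned sleep_usec)
    {
        if (max_writes != 0 && *wrote_cntp >= max_writes) {
            *wrote_cntp = 0;
            usleep(sleep_usec);
        }
    }
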
done: /*
- * If doing a checkpoint or flushing a file for the application, we
- * have to force the pages to disk. We don't do this as we go along
- * because we want to give the OS as much time as possible to lazily
- * flush, and because we have to flush files that might not even have
- * had dirty buffers in the cache, so we have to walk the files list.
+ * If a write is required, we have to force the pages to disk. We
+ * don't do this as we go along because we want to give the OS as
+ * much time as possible to lazily flush, and because we have to flush
+ * files that might not even have had dirty buffers in the cache, so
+ * we have to walk the files list.
*/
- if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) {
+ if (ret == 0 && required_write) {
if (dbmfp == NULL)
- ret = __memp_sync_files(dbenv, dbmp);
+ ret = __memp_sync_files(dbenv);
else
ret = __os_fsync(dbenv, dbmfp->fhp);
}
/* If we've opened files to flush pages, close them. */
- if ((t_ret = __memp_close_flush_files(dbenv, dbmp, 0)) != 0 && ret == 0)
+ if ((t_ret = __memp_close_flush_files(dbenv, 0)) != 0 && ret == 0)
ret = t_ret;
err: __os_free(dbenv, bharray);
- if (wrotep != NULL)
- *wrotep = wrote;
+ if (wrote_totalp != NULL)
+ *wrote_totalp = wrote_total;
return (ret);
}
@@ -651,28 +704,23 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags)
/* If we don't find a handle we can use, open one. */
if (dbmfp == NULL) {
- if ((ret = __memp_mf_sync(dbmp, mfp, 0)) != 0) {
+ if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
__db_err(dbenv, ret,
"%s: unable to flush", (char *)
R_ADDR(dbmp->reginfo, mfp->path_off));
}
- } else {
+ } else
ret = __os_fsync(dbenv, dbmfp->fhp);
- if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
- ret = t_ret;
- }
-
/*
* Re-acquire the MPOOLFILE mutex, we need it to modify the
* reference count.
*/
MUTEX_LOCK(dbenv, mfp->mutex);
- --mfp->mpf_cnt;
/*
- * If we wrote the file and there are no open handles (or there
- * is a single open handle, and it's the one we opened to write
+ * If we wrote the file and there are no other references (or there
+ * is a single reference, and it's the one we opened to write
* buffers during checkpoint), clear the file_written flag. We
* do this so that applications opening thousands of files don't
* loop here opening and flushing those files during checkpoint.
@@ -684,7 +732,7 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags)
* the region lock, no possibility of another thread of control
* racing with us to open a MPOOLFILE.
*/
- if (mfp->mpf_cnt == 0 || (mfp->mpf_cnt == 1 &&
+ if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
mfp->file_written = 0;
@@ -696,31 +744,44 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags)
* I mean, what are the chances that there aren't any
* buffers in the pool? Regardless, it might happen.)
*/
- if (mfp->mpf_cnt == 0 && mfp->block_cnt == 0)
+ if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
*(int *)argp = 1;
}
- /* Unlock the MPOOLFILE, and move to the next entry. */
+ /*
+ * If we found the file we must close it in case we are the last
+ * reference to the dbmfp. NOTE: since we have incremented
+ * mfp->mpf_cnt this cannot be the last reference to the mfp.
+ * This is important since we are called with the hash bucket
+ * locked. The mfp will get freed via the cleanup pass.
+ */
+ if (dbmfp != NULL && (t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0)
+ ret = t_ret;
+
+ --mfp->mpf_cnt;
+
+ /* Unlock the MPOOLFILE. */
MUTEX_UNLOCK(dbenv, mfp->mutex);
- return (0);
+ return (ret);
}
/*
* __memp_sync_files --
* Sync all the files in the environment, open or not.
*/
-static
-int __memp_sync_files(dbenv, dbmp)
+static int
+__memp_sync_files(dbenv)
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
{
+ DB_MPOOL *dbmp;
DB_MPOOL_HASH *hp;
MPOOL *mp;
MPOOLFILE *mfp, *next_mfp;
int i, need_discard_pass, ret;
- need_discard_pass = ret = 0;
+ dbmp = dbenv->mp_handle;
mp = dbmp->reginfo[0].primary;
+ need_discard_pass = ret = 0;
ret = __memp_walk_files(dbenv,
mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_NOERROR);
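
This hunk also reorders the teardown: __memp_fclose() now runs while the walker still holds its MPOOLFILE reference, so closing the last per-process handle cannot free the shared structure while the hash bucket is locked; the later cleanup pass discards it instead. A sketch of that pin-then-close ordering, with hypothetical types:

    struct shared { unsigned refcnt; /* ... */ };
    struct handle;                  /* per-process view, opaque here */

    extern int close_handle(struct handle *);  /* may drop one refcnt */

    static int
    release(struct shared *sp, struct handle *hp)
    {
        int ret = 0;

        /*
         * Our own reference, taken before the walk, is still counted,
         * so even if close_handle() releases the handle's reference,
         * sp cannot reach zero and be freed here.
         */
        if (hp != NULL)
            ret = close_handle(hp);

        --sp->refcnt;       /* drop our pin last, under sp's mutex */
        return (ret);
    }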
@@ -734,7 +795,7 @@ int __memp_sync_files(dbenv, dbmp)
hp = R_ADDR(dbmp->reginfo, mp->ftab);
for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) {
- MUTEX_LOCK(dbenv, hp->mtx_hash);
+retry: MUTEX_LOCK(dbenv, hp->mtx_hash);
for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket,
__mpoolfile); mfp != NULL; mfp = next_mfp) {
next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile);
@@ -743,13 +804,17 @@ int __memp_sync_files(dbenv, dbmp)
* without a mutex on the MPOOLFILE. If likely to
* succeed, lock the MPOOLFILE down and look for real.
*/
- if (mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
+ if (mfp->deadfile ||
+ mfp->block_cnt != 0 || mfp->mpf_cnt != 0)
continue;
MUTEX_LOCK(dbenv, mfp->mutex);
- if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0)
+ if (!mfp->deadfile &&
+ mfp->block_cnt == 0 && mfp->mpf_cnt == 0) {
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
(void)__memp_mf_discard(dbmp, mfp);
- else
+ goto retry;
+ } else
MUTEX_UNLOCK(dbenv, mfp->mutex);
}
MUTEX_UNLOCK(dbenv, hp->mtx_hash);
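
The discard pass above is the classic check, lock, re-check pattern: the unlocked test is only a hint, so it is repeated under the MPOOLFILE mutex, and because __memp_mf_discard() must run without the hash-bucket lock, the loop drops that lock and restarts the bucket scan, since releasing it invalidates the list iterator. A generic sketch, hypothetical names throughout:

    struct entry { struct entry *next; int dead; unsigned blocks, refs; };
    struct bucket { struct entry *head; };

    extern void lock_bucket(struct bucket *), unlock_bucket(struct bucket *);
    extern void lock_entry(struct entry *), unlock_entry(struct entry *);
    extern void discard(struct entry *);    /* frees e, drops its mutex */

    static void
    discard_idle(struct bucket *bp)
    {
        struct entry *e, *next;

    retry:  lock_bucket(bp);
        for (e = bp->head; e != NULL; e = next) {
            next = e->next;
            if (e->dead || e->blocks != 0 || e->refs != 0)
                continue;               /* unlocked test: a hint */
            lock_entry(e);
            if (!e->dead && e->blocks == 0 && e->refs == 0) {
                unlock_bucket(bp);      /* discard needs it unlocked */
                discard(e);
                goto retry;             /* iterator no longer valid */
            }
            unlock_entry(e);
        }
        unlock_bucket(bp);
    }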
@@ -764,28 +829,36 @@ int __memp_sync_files(dbenv, dbmp)
* PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int));
*/
int
-__memp_mf_sync(dbmp, mfp, region_locked)
+__memp_mf_sync(dbmp, mfp, locked)
DB_MPOOL *dbmp;
MPOOLFILE *mfp;
- int region_locked;
+ int locked;
{
DB_ENV *dbenv;
DB_FH *fhp;
+ DB_MPOOL_HASH *hp;
+ MPOOL *mp;
int ret, t_ret;
char *rpath;
+ COMPQUIET(hp, NULL);
dbenv = dbmp->dbenv;
/*
- * We need to be holding the region lock: we're using the path name
+ * We need to be holding the hash lock: we're using the path name
	 * and __memp_nameop might try to rename the file.
*/
- if (!region_locked)
- MPOOL_SYSTEM_LOCK(dbenv);
+ if (!locked) {
+ mp = dbmp->reginfo[0].primary;
+ hp = R_ADDR(dbmp->reginfo, mp->ftab);
+ hp += FNBUCKET(
+ R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN);
+ MUTEX_LOCK(dbenv, hp->mtx_hash);
+ }
if ((ret = __db_appname(dbenv, DB_APP_DATA,
R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) {
- if ((ret = __os_open(dbenv, rpath, 0, 0, &fhp)) == 0) {
+ if ((ret = __os_open(dbenv, rpath, 0, 0, 0, &fhp)) == 0) {
ret = __os_fsync(dbenv, fhp);
if ((t_ret =
__os_closehandle(dbenv, fhp)) != 0 && ret == 0)
@@ -794,8 +867,8 @@ __memp_mf_sync(dbmp, mfp, region_locked)
__os_free(dbenv, rpath);
}
- if (!region_locked)
- MPOOL_SYSTEM_UNLOCK(dbenv);
+ if (!locked)
+ MUTEX_UNLOCK(dbenv, hp->mtx_hash);
return (ret);
}
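
With this change __memp_mf_sync() serializes on the single file-name hash bucket (located with FNBUCKET from the file id) instead of the region-wide lock, since only the path lookup needs protection against a concurrent __memp_nameop() rename. A sketch of that narrowing, with hypothetical helpers:

    #include <stddef.h>

    #define NBUCKETS 17     /* stand-in for MPOOL_FILE_BUCKETS */

    extern unsigned hash_bytes(const void *, size_t);
    extern void lock_bucket(unsigned), unlock_bucket(unsigned);
    extern void fsync_by_path(const char *);

    static void
    sync_by_name(const void *fileid, size_t idlen, const char *path,
        int locked)
    {
        unsigned b = 0;

        /* Hash the file id to the one bucket guarding its name. */
        if (!locked) {
            b = hash_bytes(fileid, idlen) % NBUCKETS;
            lock_bucket(b);
        }
        fsync_by_path(path);    /* path can't be renamed while held */
        if (!locked)
            unlock_bucket(b);
    }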
@@ -805,15 +878,17 @@ __memp_mf_sync(dbmp, mfp, region_locked)
* Close files opened only to flush buffers.
*/
static int
-__memp_close_flush_files(dbenv, dbmp, dosync)
+__memp_close_flush_files(dbenv, dosync)
DB_ENV *dbenv;
- DB_MPOOL *dbmp;
int dosync;
{
+ DB_MPOOL *dbmp;
DB_MPOOLFILE *dbmfp;
MPOOLFILE *mfp;
int ret;
+ dbmp = dbenv->mp_handle;
+
/*
* The routine exists because we must close files opened by sync to
* flush buffers. There are two cases: first, extent files have to
diff --git a/db/mp/mp_trickle.c b/db/mp/mp_trickle.c
index d1d3853aa..cbe7af4f2 100644
--- a/db/mp/mp_trickle.c
+++ b/db/mp/mp_trickle.c
@@ -1,10 +1,9 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996-2006
- * Oracle Corporation. All rights reserved.
+ * Copyright (c) 1996,2007 Oracle. All rights reserved.
*
- * $Id: mp_trickle.c,v 12.9 2006/08/24 14:46:15 bostic Exp $
+ * $Id: mp_trickle.c,v 12.16 2007/06/01 18:32:44 bostic Exp $
*/
#include "db_config.h"
@@ -89,15 +88,21 @@ __memp_trickle(dbenv, pct, nwrotep)
if (total == 0 || dirty == 0)
return (0);
- clean = total - dirty;
+ /*
+ * The total number of pages is an exact number, but the dirty page
+ * count can change while we're walking the hash buckets, and it's
+ * even possible the dirty page count ends up larger than the total
+ * number of pages.
+ */
+ clean = total > dirty ? total - dirty : 0;
need_clean = (total * (u_int)pct) / 100;
if (clean >= need_clean)
return (0);
need_clean -= clean;
- ret = __memp_sync_int(
- dbenv, NULL, need_clean, DB_SYNC_TRICKLE, &wrote);
- mp->stat.st_page_trickle += wrote;
+ ret = __memp_sync_int(dbenv, NULL,
+ need_clean, DB_SYNC_TRICKLE | DB_SYNC_INTERRUPT_OK, &wrote, NULL);
+ STAT((mp->stat.st_page_trickle += wrote));
if (nwrotep != NULL)
*nwrotep = (int)wrote;
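
To make the clamped arithmetic concrete: with total = 1000, pct = 20 and a racily-read dirty = 950, clean is 50 against a target of 200, so the trickle writes 150 pages; if the race leaves dirty above total, clean clamps to 0 instead of wrapping. A self-contained check of that computation (a hypothetical wrapper, not the BDB function):

    #include <assert.h>

    static unsigned
    pages_to_write(unsigned total, unsigned dirty, unsigned pct)
    {
        unsigned clean, need_clean;

        /* dirty is read without a lock and may exceed total. */
        clean = total > dirty ? total - dirty : 0;
        need_clean = (total * pct) / 100;
        return (clean >= need_clean ? 0 : need_clean - clean);
    }

    int
    main(void)
    {
        assert(pages_to_write(1000, 950, 20) == 150);
        assert(pages_to_write(1000, 1100, 20) == 200);  /* clamped */
        return (0);
    }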