Diffstat (limited to 'db/mp')
-rw-r--r-- | db/mp/Design        |  52
-rw-r--r-- | db/mp/mp_alloc.c    | 118
-rw-r--r-- | db/mp/mp_bh.c       |  47
-rw-r--r-- | db/mp/mp_fget.c     |  84
-rw-r--r-- | db/mp/mp_fmethod.c  |  22
-rw-r--r-- | db/mp/mp_fopen.c    | 291
-rw-r--r-- | db/mp/mp_fput.c     | 120
-rw-r--r-- | db/mp/mp_fset.c     | 105
-rw-r--r-- | db/mp/mp_method.c   | 243
-rw-r--r-- | db/mp/mp_mvcc.c     |  63
-rw-r--r-- | db/mp/mp_region.c   | 308
-rw-r--r-- | db/mp/mp_register.c |   5
-rw-r--r-- | db/mp/mp_resize.c   | 559
-rw-r--r-- | db/mp/mp_stat.c     |  35
-rw-r--r-- | db/mp/mp_sync.c     | 327
-rw-r--r-- | db/mp/mp_trickle.c  |  19
16 files changed, 1592 insertions, 806 deletions
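Before the per-file diffs, it may help to restate the buffer-search protocol documented in db/mp/Design, which this change removes. The following is a minimal C sketch of that protocol under the locking order the Design text describes; every identifier in it (region_lock, hash_lookup, and so on) is a hypothetical stand-in for illustration, not Berkeley DB's actual API.

#include <stddef.h>

/* Hypothetical stand-ins for the region and buffer mutex primitives. */
struct region;
struct buffer {
	int ref;		/* reference count: pins the buffer */
	int bh_locked;		/* BH_LOCKED: an I/O is in progress */
};

void region_lock(struct region *);
void region_unlock(struct region *);
void buffer_lock(struct buffer *);	/* blocks while the I/O holds it */
void buffer_unlock(struct buffer *);
void yield_processor(void);
struct buffer *hash_lookup(struct region *, unsigned int);

struct buffer *
find_buffer(struct region *rp, unsigned int pgno)
{
	struct buffer *bp;
	int first = 1;

	region_lock(rp);		/* region mutex guards all headers */
	if ((bp = hash_lookup(rp, pgno)) == NULL) {
		region_unlock(rp);
		return (NULL);
	}
	++bp->ref;			/* guarantee the buffer stays */

	while (bp->bh_locked) {		/* I/O is going on */
		region_unlock(rp);
		/*
		 * Yield after the first pass; otherwise we would just
		 * spin, bouncing between the two locks.
		 */
		if (!first)
			yield_processor();
		first = 0;
		buffer_lock(bp);	/* the I/O will complete... */
		buffer_unlock(bp);
		region_lock(rp);	/* re-test BH_LOCKED under the lock */
	}
	region_unlock(rp);
	return (bp);			/* caller later drops the ref */
}

The loop around the BH_LOCKED test is the point the Design text insists on: the buffer lock becoming available does not mean the logical operation is finished, so waiters must reacquire the region lock and re-check the flag rather than trust a single wakeup.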
diff --git a/db/mp/Design b/db/mp/Design
deleted file mode 100644
index 1b26aae6c..000000000
--- a/db/mp/Design
+++ /dev/null
@@ -1,52 +0,0 @@
-$Id: Design,v 11.2 1999/11/21 23:08:27 bostic Exp $
-
-There are three ways we do locking in the mpool code:
-
-Locking a handle mutex to provide concurrency for DB_THREAD operations.
-Locking the region mutex to provide mutual exclusion while reading and
-	writing structures in the shared region.
-Locking buffer header mutexes during I/O.
-
-The first will not be further described here.  We use the shared mpool
-region lock to provide mutual exclusion while reading/modifying all of
-the data structures, including the buffer headers.  We use a per-buffer
-header lock to wait on buffer I/O.  The order of locking is as follows:
-
-Searching for a buffer:
-	Acquire the region lock.
-	Find the buffer header.
-	Increment the reference count (guarantee the buffer stays).
-	While the BH_LOCKED flag is set (I/O is going on) {
-		Release the region lock.
-		Explicitly yield the processor if it's not the first pass
-		through this loop, otherwise, we can simply spin because
-		we'll be simply switching between the two locks.
-		Request the buffer lock.
-		The I/O will complete...
-		Acquire the buffer lock.
-		Release the buffer lock.
-		Acquire the region lock.
-	}
-	Return the buffer.
-
-Reading/writing a buffer:
-	Acquire the region lock.
-	Find/create the buffer header.
-	If reading, increment the reference count (guarantee the buffer stays).
-	Set the BH_LOCKED flag.
-	Acquire the buffer lock (guaranteed not to block).
-	Release the region lock.
-	Do the I/O and/or initialize the buffer contents.
-	Release the buffer lock.
-		At this point, the buffer lock is available, but the logical
-		operation (flagged by BH_LOCKED) is not yet completed.  For
-		this reason, among others, threads checking the BH_LOCKED flag
-		must loop around their test.
-	Acquire the region lock.
-	Clear the BH_LOCKED flag.
-	Release the region lock.
-	Return/discard the buffer.
-
-Pointers to DB_MPOOL, MPOOL, DB_MPOOLFILE and MPOOLFILE structures are
-not reacquired when a region lock is reacquired because they couldn't
-have been closed/discarded and because they never move in memory.
diff --git a/db/mp/mp_alloc.c b/db/mp/mp_alloc.c
index 0619d5ccf..c18e62dff 100644
--- a/db/mp/mp_alloc.c
+++ b/db/mp/mp_alloc.c
@@ -1,10 +1,9 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1996-2006
- *	Oracle Corporation.  All rights reserved.
+ * Copyright (c) 1996,2007 Oracle.  All rights reserved.
  *
- * $Id: mp_alloc.c,v 12.20 2006/09/07 15:11:26 mjc Exp $
+ * $Id: mp_alloc.c,v 12.33 2007/06/01 18:32:44 bostic Exp $
  */

 #include "db_config.h"
@@ -38,7 +37,6 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
 	MPOOL *c_mp;
 	MPOOLFILE *bh_mfp;
 	size_t freed_space;
-	db_mutex_t mutex;
 	u_int32_t buckets, buffers, high_priority, priority;
 	u_int32_t put_counter, total_buckets;
 	int aggressive, alloc_freeze, giveup, got_oldest, ret;
@@ -54,7 +52,7 @@ __memp_alloc(dbmp, infop, mfp, len, offsetp, retp)
 	aggressive = alloc_freeze = giveup = got_oldest = 0;
 	hp_tmp = NULL;

-	c_mp->stat.st_alloc++;
+	STAT(c_mp->stat.st_alloc++);

 	/*
 	 * If we're allocating a buffer, and the one we're discarding is the
@@ -86,7 +84,7 @@
 	 * we need in the hopes it will coalesce into a contiguous chunk of the
 	 * right size.  In the latter case we branch back here and try again.
*/ -alloc: if ((ret = __db_shalloc(infop, len, 0, &p)) == 0) { +alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) { if (mfp != NULL) c_mp->stat.st_pages++; MPOOL_REGION_UNLOCK(dbenv, infop); @@ -106,6 +104,7 @@ found: if (offsetp != NULL) * We're not holding the region locked here, these statistics * can't be trusted. */ +#ifdef HAVE_STATISTICS total_buckets += buckets; if (total_buckets != 0) { if (total_buckets > c_mp->stat.st_alloc_max_buckets) @@ -117,6 +116,7 @@ found: if (offsetp != NULL) c_mp->stat.st_alloc_max_pages = buffers; c_mp->stat.st_alloc_pages += buffers; } +#endif return (0); } else if (giveup || c_mp->stat.st_pages == 0) { MPOOL_REGION_UNLOCK(dbenv, infop); @@ -153,24 +153,14 @@ found: if (offsetp != NULL) } /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - /* * The failure mode is when there are too many buffers we can't - * write or there's not enough memory in the system. We don't - * have a way to know that allocation has no way to succeed. - * We fail if there were no pages returned to the cache after - * we've been trying for a relatively long time. + * write or there's not enough memory in the system to support + * the number of pinned buffers. * - * Get aggressive if we've tried to flush the number of hash - * buckets as are in the system and have not found any more - * space. Aggressive means: + * Get aggressive if we've reviewed the entire cache without + * freeing 3 times the needed space. (The code resets the + * counter when we free 3 times the needed space.) Aggressive + * means: * * a: set a flag to attempt to flush high priority buffers as * well as other buffers. @@ -187,11 +177,15 @@ found: if (offsetp != NULL) * Always try to allocate memory too, in case some other thread * returns its memory to the region. * + * We don't have any way to know an allocation has no way to + * succeed. Fail if no pages are returned to the cache after + * we've been trying for a relatively long time. + * * !!! * This test ignores pathological cases like no buffers in the - * system -- that shouldn't be possible. + * system -- we check for that early on, so it isn't possible. */ - if ((++buckets % c_mp->htab_buckets) == 0) { + if (buckets++ == c_mp->htab_buckets) { if (freed_space > 0) goto alloc; MPOOL_REGION_UNLOCK(dbenv, infop); @@ -207,7 +201,7 @@ found: if (offsetp != NULL) case 5: case 6: (void)__memp_sync_int( - dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); + dbenv, NULL, 0, DB_SYNC_ALLOC, NULL, NULL); __os_sleep(dbenv, 1, 0); break; @@ -222,11 +216,35 @@ found: if (offsetp != NULL) goto alloc; } + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + /* + * Skip buckets that only have pinned pages. + * + * Again we are doing this without locking. If we misread + * the number we might improperly skip a bucket but this is + * not fatal. + */ + if (hp->hash_priority == UINT32_MAX) + continue; + if (!aggressive) { - /* Skip high priority buckets. */ - if (hp->hash_priority > high_priority) + /* Adjust if the bucket has not been reset. */ + priority = hp->hash_priority; + if (c_mp->lru_reset != 0 && + c_mp->lru_reset <= hp - dbht) + priority -= MPOOL_BASE_DECREMENT; + /* + * Skip high priority buckets. 
+ */ + if (priority > high_priority) continue; - /* * Find two buckets and select the one with the lowest * priority. Performance testing shows that looking @@ -237,18 +255,22 @@ found: if (offsetp != NULL) hp_tmp = hp; continue; } - if (hp->hash_priority > hp_tmp->hash_priority) + if (c_mp->lru_reset && + c_mp->lru_reset <= hp_tmp - dbht) { + if (priority > hp_tmp->hash_priority - + MPOOL_BASE_DECREMENT) + hp = hp_tmp; + } else if (priority > hp_tmp->hash_priority) hp = hp_tmp; hp_tmp = NULL; } - /* Remember the priority of the buffer we're looking for. */ - priority = hp->hash_priority; - /* Unlock the region and lock the hash bucket. */ MPOOL_REGION_UNLOCK(dbenv, infop); - mutex = hp->mtx_hash; - MUTEX_LOCK(dbenv, mutex); + MUTEX_LOCK(dbenv, hp->mtx_hash); + + /* Remember the priority of the buffer we're looking for. */ + priority = hp->hash_priority; #ifdef DIAGNOSTIC __memp_check_order(dbenv, hp); @@ -311,10 +333,15 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) ++bhp->ref; ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0); --bhp->ref; +#ifdef HAVE_STATISTICS if (ret == 0) ++c_mp->stat.st_rw_evict; - } else +#endif + } +#ifdef HAVE_STATISTICS + else ++c_mp->stat.st_ro_evict; +#endif /* * Freeze this buffer, if necessary. That is, if the buffer @@ -373,13 +400,13 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) ++bhp->ref; if ((ret = __memp_bh_thaw(dbmp, infop, hp, bhp, NULL)) != 0) { - MUTEX_UNLOCK(dbenv, mutex); + MUTEX_UNLOCK(dbenv, hp->mtx_hash); return (ret); } alloc_freeze = 0; goto this_hb; } else if (alloc_freeze) { - if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0) + if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0) return (ret); MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize, PROT_READ | PROT_WRITE | PROT_EXEC); @@ -399,13 +426,13 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) continue; } else if (mfp != NULL && mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { - if ((ret = __memp_bhfree(dbmp, hp, bhp, 0)) != 0) + if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, 0)) != 0) return (ret); p = bhp; goto found; } else { - freed_space += __db_shalloc_sizeof(bhp); - if ((ret = __memp_bhfree(dbmp, + freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize; + if ((ret = __memp_bhfree(dbmp, infop, hp, bhp, BH_FREE_FREEMEM)) != 0) return (ret); } @@ -419,7 +446,7 @@ this_hb: if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) == NULL) * hash bucket lock has already been discarded. */ if (0) { -next_hb: MUTEX_UNLOCK(dbenv, mutex); +next_hb: MUTEX_UNLOCK(dbenv, hp->mtx_hash); } MPOOL_REGION_LOCK(dbenv, infop); @@ -449,7 +476,7 @@ __memp_free(infop, mfp, buf) { MVCC_BHUNALIGN(mfp, buf); COMPQUIET(mfp, NULL); - __db_shalloc_free(infop, buf); + __env_alloc_free(infop, buf); } /* @@ -516,7 +543,9 @@ __memp_check_order(dbenv, hp) DB_MPOOL_HASH *hp; { BH *bhp, *first_bhp, *tbhp; - u_int32_t priority, last_priority; + u_int32_t dirty, priority, last_priority; + + dirty = 0; /* * Assumes the hash bucket is locked. @@ -526,6 +555,8 @@ __memp_check_order(dbenv, hp) bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { DB_ASSERT(dbenv, !SH_CHAIN_HASNEXT(bhp, vc)); + if (F_ISSET(bhp, BH_DIRTY)) + dirty++; priority = BH_PRIORITY(bhp); DB_ASSERT(dbenv, (bhp == first_bhp) ? 
priority == last_priority : priority >= last_priority); @@ -547,5 +578,6 @@ __memp_check_order(dbenv, hp) DB_ASSERT(dbenv, bhp->pgno != tbhp->pgno || bhp->mf_offset != tbhp->mf_offset); } + DB_ASSERT(dbenv, dirty == hp->hash_page_dirty); } #endif diff --git a/db/mp/mp_bh.c b/db/mp/mp_bh.c index ef4d1d4be..85cc30cc7 100644 --- a/db/mp/mp_bh.c +++ b/db/mp/mp_bh.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_bh.c,v 12.31 2006/09/07 19:11:46 bostic Exp $ + * $Id: mp_bh.c,v 12.38 2007/05/17 15:15:45 bostic Exp $ */ #include "db_config.h" @@ -256,9 +255,13 @@ __memp_pgread(dbmfp, hp, bhp, can_create) if (len < pagesize) memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); #endif +#ifdef HAVE_STATISTICS ++mfp->stat.st_page_create; } else ++mfp->stat.st_page_in; +#else + } +#endif /* Call any pgin function. */ ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp, 1); @@ -304,18 +307,16 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp) mfp = dbmfp == NULL ? NULL : dbmfp->mfp; callpgin = ret = 0; - /* - * We should never be called with a clean or trash buffer. - * The sync code does call us with already locked buffers. - */ + /* We should never be called with a clean or trash buffer. */ DB_ASSERT(dbenv, F_ISSET(bhp, BH_DIRTY)); DB_ASSERT(dbenv, !F_ISSET(bhp, BH_TRASH)); - /* If not already done, lock the buffer and unlock the hash bucket. */ - if (!F_ISSET(bhp, BH_LOCKED)) { - F_SET(bhp, BH_LOCKED); - MUTEX_UNLOCK(dbenv, hp->mtx_hash); - } + /* + * The sync code has already locked the buffer, but the allocation + * code has not. Lock the buffer and release the hash bucket mutex. + */ + F_SET(bhp, BH_LOCKED); + MUTEX_UNLOCK(dbenv, hp->mtx_hash); /* * It's possible that the underlying file doesn't exist, either @@ -333,7 +334,7 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp) * If the page is in a file for which we have LSN information, we have * to ensure the appropriate log records are on disk. */ - if (LOGGING_ON(dbenv) && mfp->lsn_off != -1 && + if (LOGGING_ON(dbenv) && mfp->lsn_off != DB_LSN_OFF_NOTSET && !IS_CLIENT_PGRECOVER(dbenv)) { memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); if (!IS_NOT_LOGGED_LSN(lsn) && @@ -402,7 +403,7 @@ __memp_pgwrite(dbenv, dbmfp, hp, bhp) __memp_fn(dbmfp), (u_long)bhp->pgno); goto err; } - ++mfp->stat.st_page_out; + STAT(++mfp->stat.st_page_out); if (bhp->pgno > mfp->last_flushed_pgno) { MUTEX_LOCK(dbenv, mfp->mutex); if (bhp->pgno > mfp->last_flushed_pgno) @@ -517,20 +518,20 @@ err: __db_errx(dbenv, "%s: %s failed for page %lu", * Free a bucket header and its referenced data. * * PUBLIC: int __memp_bhfree - * PUBLIC: __P((DB_MPOOL *, DB_MPOOL_HASH *, BH *, u_int32_t)); + * PUBLIC: __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, BH *, u_int32_t)); */ int -__memp_bhfree(dbmp, hp, bhp, flags) +__memp_bhfree(dbmp, infop, hp, bhp, flags) DB_MPOOL *dbmp; + REGINFO *infop; DB_MPOOL_HASH *hp; BH *bhp; u_int32_t flags; { DB_ENV *dbenv; - MPOOL *c_mp, *mp; + MPOOL *c_mp; MPOOLFILE *mfp; BH *next_bhp, *prev_bhp; - u_int32_t n_cache; int reorder, ret, t_ret; #ifdef DIAG_MVCC size_t pagesize; @@ -542,8 +543,6 @@ __memp_bhfree(dbmp, hp, bhp, flags) * Assumes the hash bucket is locked and the MPOOL is not. 
*/ dbenv = dbmp->dbenv; - mp = dbmp->reginfo[0].primary; - n_cache = NCACHE(mp, bhp->mf_offset, bhp->pgno); mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); #ifdef DIAG_MVCC pagesize = mfp->stat.st_pagesize; @@ -623,13 +622,13 @@ __memp_bhfree(dbmp, hp, bhp, flags) * real. */ if (LF_ISSET(BH_FREE_FREEMEM)) { - MPOOL_REGION_LOCK(dbenv, &dbmp->reginfo[n_cache]); + MPOOL_REGION_LOCK(dbenv, infop); - __memp_free(&dbmp->reginfo[n_cache], mfp, bhp); - c_mp = dbmp->reginfo[n_cache].primary; + __memp_free(infop, mfp, bhp); + c_mp = infop->primary; c_mp->stat.st_pages--; - MPOOL_REGION_UNLOCK(dbenv, &dbmp->reginfo[n_cache]); + MPOOL_REGION_UNLOCK(dbenv, infop); } /* diff --git a/db/mp/mp_fget.c b/db/mp/mp_fget.c index 5f7eb6802..bb73a0a08 100644 --- a/db/mp/mp_fget.c +++ b/db/mp/mp_fget.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_fget.c,v 12.33 2006/09/13 14:53:42 mjc Exp $ + * $Id: mp_fget.c,v 12.43 2007/06/05 11:55:28 mjc Exp $ */ #include "db_config.h" @@ -108,36 +107,34 @@ __memp_fget(dbmfp, pgnoaddr, txn, flags, addrp) enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; BH *alloc_bhp, *bhp, *current_bhp, *frozen_bhp, *oldest_bhp; DB_ENV *dbenv; + DB_LSN *read_lsnp; DB_MPOOL *dbmp; DB_MPOOL_HASH *hp; - MPOOL *c_mp, *mp; + MPOOL *c_mp; MPOOLFILE *mfp; - REGINFO *infop; + REGINFO *infop, *t_infop; TXN_DETAIL *td; - DB_LSN *read_lsnp; roff_t mf_offset; - u_int32_t n_cache, st_hsearch; + u_int32_t st_hsearch; int b_incr, b_locked, dirty, edit, extending, first; int makecopy, mvcc, need_free, reorder, ret; *(void **)addrp = NULL; + COMPQUIET(c_mp, NULL); + COMPQUIET(infop, NULL); COMPQUIET(oldest_bhp, NULL); dbenv = dbmfp->dbenv; dbmp = dbenv->mp_handle; - c_mp = NULL; - mp = dbmp->reginfo[0].primary; mfp = dbmfp->mfp; mvcc = mfp->multiversion; mf_offset = R_OFFSET(dbmp->reginfo, mfp); alloc_bhp = bhp = frozen_bhp = NULL; read_lsnp = NULL; + td = NULL; hp = NULL; b_incr = b_locked = extending = makecopy = ret = 0; - n_cache = 0; - infop = NULL; - td = NULL; if (LF_ISSET(DB_MPOOL_DIRTY)) { if (F_ISSET(dbmfp, MP_READONLY)) { @@ -224,25 +221,22 @@ __memp_fget(dbmfp, pgnoaddr, txn, flags, addrp) F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { *(void **)addrp = (u_int8_t *)dbmfp->addr + (*pgnoaddr * mfp->stat.st_pagesize); - ++mfp->stat.st_map; + STAT(++mfp->stat.st_map); return (0); } -hb_search: - /* +retry: /* * Determine the cache and hash bucket where this page lives and get * local pointers to them. Reset on each pass through this code, the * page number can change. */ - n_cache = NCACHE(mp, mf_offset, *pgnoaddr); - infop = &dbmp->reginfo[n_cache]; + MP_GET_BUCKET(dbmfp, *pgnoaddr, &infop, hp, ret); + if (ret != 0) + return (ret); c_mp = infop->primary; - hp = R_ADDR(infop, c_mp->htab); - hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)]; /* Search the hash chain for the page. */ -retry: st_hsearch = 0; - MUTEX_LOCK(dbenv, hp->mtx_hash); + st_hsearch = 0; b_locked = 1; SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { ++st_hsearch; @@ -326,7 +320,7 @@ retry: st_hsearch = 0; F_SET(hp, IO_WAITER); MUTEX_LOCK(dbenv, hp->mtx_io); } - ++hp->hash_io_wait; + STAT(++hp->hash_io_wait); /* Release the hash bucket lock. 
*/ MUTEX_UNLOCK(dbenv, hp->mtx_hash); @@ -362,10 +356,13 @@ thawed: need_free = (--frozen_bhp->ref == 0); goto retry; } +#ifdef HAVE_STATISTICS ++mfp->stat.st_cache_hit; +#endif break; } +#ifdef HAVE_STATISTICS /* * Update the hash bucket search statistics -- do now because our next * search may be for a different bucket. @@ -374,6 +371,7 @@ thawed: need_free = (--frozen_bhp->ref == 0); if (st_hsearch > c_mp->stat.st_hash_longest) c_mp->stat.st_hash_longest = st_hsearch; c_mp->stat.st_hash_examined += st_hsearch; +#endif /* * There are 4 possible paths to this location: @@ -411,6 +409,10 @@ thawed: need_free = (--frozen_bhp->ref == 0); */ if (flags == DB_MPOOL_FREE) { if (--bhp->ref == 0) { + if (F_ISSET(bhp, BH_DIRTY)) { + --hp->hash_page_dirty; + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } /* * In a multiversion database, this page could * be requested again so we have to leave it in @@ -424,17 +426,12 @@ thawed: need_free = (--frozen_bhp->ref == 0); if (mvcc && (!SH_CHAIN_SINGLETON(bhp, vc) || bhp->td_off == INVALID_ROFF || !IS_MAX_LSN(*VISIBLE_LSN(dbenv, bhp)))) { - if (F_ISSET(bhp, BH_DIRTY)) { - --hp->hash_page_dirty; - F_CLR(bhp, - BH_DIRTY | BH_DIRTY_CREATE); - } F_SET(bhp, BH_FREED); MUTEX_UNLOCK(dbenv, hp->mtx_hash); return (0); } return (__memp_bhfree( - dbmp, hp, bhp, BH_FREE_FREEMEM)); + dbmp, infop, hp, bhp, BH_FREE_FREEMEM)); } __db_errx(dbenv, "File %s: freeing pinned buffer for page %lu", @@ -447,12 +444,10 @@ thawed: need_free = (--frozen_bhp->ref == 0); if (flags == DB_MPOOL_CREATE && F_ISSET(bhp, BH_FREED)) { extending = makecopy = 1; - MUTEX_UNLOCK(dbenv, hp->mtx_hash); MUTEX_LOCK(dbenv, mfp->mutex); if (*pgnoaddr > mfp->last_pgno) mfp->last_pgno = *pgnoaddr; MUTEX_UNLOCK(dbenv, mfp->mutex); - MUTEX_LOCK(dbenv, hp->mtx_hash); } /* @@ -478,8 +473,9 @@ thawed: need_free = (--frozen_bhp->ref == 0); ((ret = __txn_oldest_reader(dbenv, &hp->old_reader)) == 0 && BH_OBSOLETE(oldest_bhp, hp->old_reader)))) { - if ((ret = __memp_bhfree(dbmp, hp, - oldest_bhp, BH_FREE_REUSE)) != 0) + if ((ret = __memp_bhfree(dbmp, + infop, hp, oldest_bhp, + BH_FREE_REUSE)) != 0) goto err; alloc_bhp = oldest_bhp; } else if (ret != 0) @@ -547,17 +543,17 @@ alloc: /* /* * !!! - * In the DB_MPOOL_NEW code path, mf_offset and n_cache have + * In the DB_MPOOL_NEW code path, infop and c_mp have * not yet been initialized. */ - mf_offset = R_OFFSET(dbmp->reginfo, mfp); - n_cache = NCACHE(mp, mf_offset, *pgnoaddr); - infop = &dbmp->reginfo[n_cache]; + MP_GET_REGION(dbmfp, *pgnoaddr, &infop, ret); + if (ret != 0) + goto err; c_mp = infop->primary; /* Allocate a new buffer header and data space. */ if ((ret = - __memp_alloc(dbmp,infop, mfp, 0, NULL, &alloc_bhp)) != 0) + __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0) goto err; #ifdef DIAGNOSTIC if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { @@ -601,7 +597,10 @@ alloc: /* */ if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { *pgnoaddr = mfp->last_pgno + 1; - if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) { + MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop,ret); + if (ret != 0) + goto err; + if (t_infop != infop) { /* * flags == DB_MPOOL_NEW, so extending is set * and we're holding the mfp locked. 
@@ -641,7 +640,7 @@ alloc: /* b_locked = 1; break; } - goto hb_search; + goto retry; case SECOND_FOUND: /* * We allocated buffer space for the requested page, but then @@ -764,10 +763,10 @@ alloc: /* if (flags == DB_MPOOL_CREATE && mfp->ftype != 0) F_SET(bhp, BH_CALLPGIN); - ++mfp->stat.st_page_create; + STAT(++mfp->stat.st_page_create); } else { F_SET(bhp, BH_TRASH); - ++mfp->stat.st_cache_miss; + STAT(++mfp->stat.st_cache_miss); } /* Increment buffer count referenced by MPOOLFILE. */ @@ -961,7 +960,8 @@ err: /* if (frozen_bhp != NULL) --frozen_bhp; if (b_incr && --bhp->ref == 0) { - (void)__memp_bhfree(dbmp, hp, bhp, BH_FREE_FREEMEM); + (void)__memp_bhfree(dbmp, + infop, hp, bhp, BH_FREE_FREEMEM); b_locked = 0; } } diff --git a/db/mp/mp_fmethod.c b/db/mp/mp_fmethod.c index 76d160ee5..38cd11d34 100644 --- a/db/mp/mp_fmethod.c +++ b/db/mp/mp_fmethod.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_fmethod.c,v 12.13 2006/08/24 14:46:14 bostic Exp $ + * $Id: mp_fmethod.c,v 12.19 2007/06/01 16:30:30 bostic Exp $ */ #include "db_config.h" @@ -67,7 +66,7 @@ __memp_fcreate(dbenv, retp) return (ret); dbmfp->ref = 1; - dbmfp->lsn_offset = -1; + dbmfp->lsn_offset = DB_LSN_OFF_NOTSET; dbmfp->dbenv = dbenv; dbmfp->mfp = INVALID_ROFF; @@ -77,13 +76,13 @@ __memp_fcreate(dbenv, retp) dbmfp->get_fileid = __memp_get_fileid; dbmfp->get_flags = __memp_get_flags; dbmfp->get_ftype = __memp_get_ftype; + dbmfp->get_last_pgno = __memp_get_last_pgno; dbmfp->get_lsn_offset = __memp_get_lsn_offset; dbmfp->get_maxsize = __memp_get_maxsize; dbmfp->get_pgcookie = __memp_get_pgcookie; dbmfp->get_priority = __memp_get_priority; dbmfp->open = __memp_fopen_pp; dbmfp->put = __memp_fput_pp; - dbmfp->set = __memp_fset_pp; dbmfp->set_clear_len = __memp_set_clear_len; dbmfp->set_fileid = __memp_set_fileid; dbmfp->set_flags = __memp_set_flags; @@ -489,16 +488,17 @@ __memp_set_priority(dbmfp, priority) } /* - * __memp_last_pgno -- + * __memp_get_last_pgno -- * Return the page number of the last page in the file. * * !!! - * Undocumented interface: DB private. + * The method is undocumented, but the handle is exported, users occasionally + * ask for it. * - * PUBLIC: int __memp_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); + * PUBLIC: int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); */ int -__memp_last_pgno(dbmfp, pgnoaddr) +__memp_get_last_pgno(dbmfp, pgnoaddr) DB_MPOOLFILE *dbmfp; db_pgno_t *pgnoaddr; { @@ -540,8 +540,8 @@ __memp_fns(dbmp, mfp) DB_MPOOL *dbmp; MPOOLFILE *mfp; { - if (mfp->path_off == 0) - return ((char *)"temporary"); + if (mfp == NULL || mfp->path_off == 0) + return ((char *)"unknown"); return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off)); } diff --git a/db/mp/mp_fopen.c b/db/mp/mp_fopen.c index f13876e75..b41565304 100644 --- a/db/mp/mp_fopen.c +++ b/db/mp/mp_fopen.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. 
* - * $Id: mp_fopen.c,v 12.34 2006/09/09 13:55:52 bostic Exp $ + * $Id: mp_fopen.c,v 12.44 2007/05/17 17:18:01 bostic Exp $ */ #include "db_config.h" @@ -15,8 +14,10 @@ #include "dbinc/db_page.h" #include "dbinc/hash.h" -static int __memp_mfp_alloc __P((DB_MPOOL *, +static int __memp_mpf_alloc __P((DB_MPOOL *, DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **)); +static int __memp_mpf_find __P((DB_ENV *, + DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **)); /* * __memp_fopen_pp -- @@ -140,14 +141,51 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize) bucket = 0; hp = R_ADDR(dbmp->reginfo, mp->ftab); - if (path == NULL && mfp == NULL) - goto alloc; + if (mfp == NULL) { + if (path == NULL) + goto alloc; - /* - * Our caller may be able to tell us which underlying MPOOLFILE we - * need a handle for. - */ - if (mfp != NULL) { + /* + * Hash to the proper file table entry and walk it. + * + * The fileID is a filesystem unique number (e.g., a + * UNIX dev/inode pair) plus a timestamp. If files are + * removed and created in less than a second, the fileID + * can be repeated. The problem with repetition happens + * when the file that previously had the fileID value still + * has pages in the pool, since we don't want to use them + * to satisfy requests for the new file. Because the + * DB_TRUNCATE flag reuses the dev/inode pair, repeated + * opens with that flag set guarantees matching fileIDs + * when the machine can open a file and then re-open + * with truncate within a second. For this reason, we + * pass that flag down, and, if we find a matching entry, + * we ensure that it's never found again, and we create + * a new entry for the current request. + */ + + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) + bucket = FNBUCKET(path, strlen(path)); + else + bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN); + hp += bucket; + + /* + * If we are passed a FILEID find the MPOOLFILE and inc + * its ref count. That way it cannot go away while we + * open it. + */ + if (F_ISSET(dbmfp, MP_FILEID_SET)) { + MUTEX_LOCK(dbenv, hp->mtx_hash); + ret = + __memp_mpf_find(dbenv, dbmfp, hp, path, flags,&mfp); + MUTEX_UNLOCK(dbenv, hp->mtx_hash); + if (ret != 0) + goto err; + if (mfp != NULL) + refinc = 1; + } + } else { /* * Deadfile can only be set if mpf_cnt goes to zero (or if we * failed creating the file DB_AM_DISCARD). Increment the ref @@ -213,7 +251,7 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize) } if ((ret = __db_appname(dbenv, DB_APP_DATA, path, 0, NULL, &rpath)) == 0) - ret = __os_open_extend(dbenv, rpath, + ret = __os_open(dbenv, rpath, (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp); if (mfp != NULL) MPOOL_SYSTEM_UNLOCK(dbenv); @@ -289,83 +327,21 @@ __memp_fopen(dbmfp, mfp, path, flags, mode, pgsize) goto have_mfp; /* - * Hash to the proper file table entry and walk it. - * - * The fileID is a filesystem unique number (e.g., a UNIX dev/inode - * pair) plus a timestamp. If files are removed and created in less - * than a second, the fileID can be repeated. The problem with - * repetition happens when the file that previously had the fileID - * value still has pages in the pool, since we don't want to use them - * to satisfy requests for the new file. - * - * Because the DB_TRUNCATE flag reuses the dev/inode pair, repeated - * opens with that flag set guarantees matching fileIDs when the - * machine can open a file and then re-open with truncate within a - * second. 
For this reason, we pass that flag down, and, if we find - * a matching entry, we ensure that it's never found again, and we - * create a new entry for the current request. - */ - if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { - DB_ASSERT(dbenv, path != NULL); - bucket = FNBUCKET(path, strlen(path)); - } else - bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN); - hp += bucket; - - /* * We can race with another process opening the same file when * we allocate the mpoolfile structure. We will come back * here and check the hash table again to see if it has appeared. * For most files this is not a problem, since the name is locked * at a higher layer but QUEUE extent files are not locked. */ - check: MUTEX_LOCK(dbenv, hp->mtx_hash); - SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { - /* Skip dead files and temporary files. */ - if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) - continue; - - /* - * Any remaining DB_MPOOL_NOFILE databases are in-memory - * named databases and need only match other in-memory - * databases with the same name. - */ - if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { - if (!mfp->no_backing_file) - continue; - - DB_ASSERT(dbenv, path != NULL); - if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off))) - continue; - - /* - * We matched an in-memory file; grab the fileid if - * it is set in the region, but not in the dbmfp. - */ - if (!F_ISSET(dbmfp, MP_FILEID_SET)) - (void)__memp_set_fileid(dbmfp, - R_ADDR(dbmp->reginfo, mfp->fileid_off)); - } else - if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, - mfp->fileid_off), DB_FILE_ID_LEN) != 0) - continue; - - /* - * If the file is being truncated, remove it from the system - * and create a new entry. - * - * !!! - * We should be able to set mfp to NULL and break out of the - * loop, but I like the idea of checking all the entries. - */ - if (LF_ISSET(DB_TRUNCATE)) { - MUTEX_LOCK(dbenv, mfp->mutex); - mfp->deadfile = 1; - MUTEX_UNLOCK(dbenv, mfp->mutex); - continue; - } + if ((ret = __memp_mpf_find(dbenv, dbmfp, hp, path, flags, &mfp) != 0)) + goto err; + if (alloc_mfp != NULL && mfp == NULL) { + mfp = alloc_mfp; + alloc_mfp = NULL; + SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile); + } else if (mfp != NULL) { /* * Some things about a file cannot be changed: the clear length, * page size, or LSN location. However, if this is an attempt @@ -385,7 +361,7 @@ check: MUTEX_LOCK(dbenv, hp->mtx_hash); mfp->clear_len != DB_CLEARLEN_NOTSET && dbmfp->clear_len != mfp->clear_len) || (pagesize != 0 && pagesize != mfp->stat.st_pagesize) || - (dbmfp->lsn_offset != -1 && + (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET && mfp->lsn_off != DB_LSN_OFF_NOTSET && dbmfp->lsn_offset != mfp->lsn_off)) { __db_errx(dbenv, @@ -395,42 +371,6 @@ check: MUTEX_LOCK(dbenv, hp->mtx_hash); ret = EINVAL; goto err; } - - /* - * Check to see if this file has died while we waited. - * - * We normally don't lock the deadfile field when we read it as - * we only care if the field is zero or non-zero. We do lock - * on read when searching for a matching MPOOLFILE so that two - * threads of control don't race between setting the deadfile - * bit and incrementing the reference count, that is, a thread - * of control decrementing the reference count and then setting - * deadfile because the reference count is 0 blocks us finding - * the file without knowing it's about to be marked dead. 
- */ - MUTEX_LOCK(dbenv, mfp->mutex); - if (mfp->deadfile) { - MUTEX_UNLOCK(dbenv, mfp->mutex); - continue; - } - ++mfp->mpf_cnt; - refinc = 1; - MUTEX_UNLOCK(dbenv, mfp->mutex); - - /* Initialize any fields that are not yet set. */ - if (dbmfp->ftype != 0) - mfp->ftype = dbmfp->ftype; - if (dbmfp->clear_len != DB_CLEARLEN_NOTSET) - mfp->clear_len = dbmfp->clear_len; - if (dbmfp->lsn_offset != -1) - mfp->lsn_off = dbmfp->lsn_offset; - - break; - } - if (alloc_mfp != NULL && mfp == NULL) { - mfp = alloc_mfp; - alloc_mfp = NULL; - SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile); } MUTEX_UNLOCK(dbenv, hp->mtx_hash); @@ -462,7 +402,7 @@ alloc: /* __os_fileid(dbenv, rpath, 0, dbmfp->fileid)) != 0) goto err; - if ((ret = __memp_mfp_alloc(dbmp, + if ((ret = __memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, &alloc_mfp)) != 0) goto err; @@ -625,8 +565,105 @@ err: if (refinc) { return (ret); } +/* + * __memp_mpf_find -- + * Search a hash bucket for a MPOOLFILE. + */ +static int +__memp_mpf_find(dbenv, dbmfp, hp, path, flags, mfpp) + DB_ENV *dbenv; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; + const char *path; + u_int32_t flags; + MPOOLFILE **mfpp; +{ + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + + dbmp = dbenv->mp_handle; + + SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { + /* Skip dead files and temporary files. */ + if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) + continue; + + /* + * Any remaining DB_MPOOL_NOFILE databases are in-memory + * named databases and need only match other in-memory + * databases with the same name. + */ + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { + if (!mfp->no_backing_file) + continue; + + if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off))) + continue; + + /* + * We matched an in-memory file; grab the fileid if + * it is set in the region, but not in the dbmfp. + */ + if (!F_ISSET(dbmfp, MP_FILEID_SET)) + (void)__memp_set_fileid(dbmfp, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + } else + if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, + mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + /* + * If the file is being truncated, remove it from the system + * and create a new entry. + * + * !!! + * We should be able to set mfp to NULL and break out of the + * loop, but I like the idea of checking all the entries. + */ + if (LF_ISSET(DB_TRUNCATE)) { + MUTEX_LOCK(dbenv, mfp->mutex); + mfp->deadfile = 1; + MUTEX_UNLOCK(dbenv, mfp->mutex); + continue; + } + + /* + * Check to see if this file has died while we waited. + * + * We normally don't lock the deadfile field when we read it as + * we only care if the field is zero or non-zero. We do lock + * on read when searching for a matching MPOOLFILE so that two + * threads of control don't race between setting the deadfile + * bit and incrementing the reference count, that is, a thread + * of control decrementing the reference count and then setting + * deadfile because the reference count is 0 blocks us finding + * the file without knowing it's about to be marked dead. + */ + MUTEX_LOCK(dbenv, mfp->mutex); + if (mfp->deadfile) { + MUTEX_UNLOCK(dbenv, mfp->mutex); + continue; + } + ++mfp->mpf_cnt; + MUTEX_UNLOCK(dbenv, mfp->mutex); + + /* Initialize any fields that are not yet set. 
*/ + if (dbmfp->ftype != 0) + mfp->ftype = dbmfp->ftype; + if (dbmfp->clear_len != DB_CLEARLEN_NOTSET) + mfp->clear_len = dbmfp->clear_len; + if (dbmfp->lsn_offset != -1) + mfp->lsn_off = dbmfp->lsn_offset; + + break; + } + + *mfpp = mfp; + return (0); +} + static int -__memp_mfp_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) +__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; const char *path; @@ -742,14 +779,12 @@ __memp_fclose_pp(dbmfp, flags) /* * Validate arguments, but as a handle destructor, we can't fail. - * - * !!! - * DB_MPOOL_DISCARD: Undocumented flag: DB private. */ - (void)__db_fchk(dbenv, "DB_MPOOLFILE->close", flags, DB_MPOOL_DISCARD); + if (flags != 0) + (void)__db_ferr(dbenv, "DB_MPOOLFILE->close", 0); ENV_ENTER(dbenv, ip); - REPLICATION_WRAP(dbenv, (__memp_fclose(dbmfp, flags)), ret); + REPLICATION_WRAP(dbenv, (__memp_fclose(dbmfp, 0)), ret); ENV_LEAVE(dbenv, ip); return (ret); } @@ -906,7 +941,9 @@ __memp_mf_discard(dbmp, mfp) { DB_ENV *dbenv; DB_MPOOL_HASH *hp; +#ifdef HAVE_STATISTICS DB_MPOOL_STAT *sp; +#endif MPOOL *mp; int need_sync, ret, t_ret; @@ -948,9 +985,10 @@ __memp_mf_discard(dbmp, mfp) /* Lock the region and collect stats and free the space. */ MPOOL_SYSTEM_LOCK(dbenv); if (need_sync && - (t_ret = __memp_mf_sync(dbmp, mfp, 1)) != 0 && ret == 0) + (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0) ret = t_ret; +#ifdef HAVE_STATISTICS /* Copy the statistics into the region. */ sp = &mp->stat; sp->st_cache_hit += mfp->stat.st_cache_hit; @@ -959,6 +997,7 @@ __memp_mf_discard(dbmp, mfp) sp->st_page_create += mfp->stat.st_page_create; sp->st_page_in += mfp->stat.st_page_in; sp->st_page_out += mfp->stat.st_page_out; +#endif /* Free the space. */ if (mfp->path_off != 0) diff --git a/db/mp/mp_fput.c b/db/mp/mp_fput.c index 124d2e1da..53afe8a82 100644 --- a/db/mp/mp_fput.c +++ b/db/mp/mp_fput.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_fput.c,v 12.22 2006/09/07 20:05:33 bostic Exp $ + * $Id: mp_fput.c,v 12.36 2007/06/05 11:55:28 mjc Exp $ */ #include "db_config.h" @@ -19,12 +18,14 @@ static int __memp_reset_lru __P((DB_ENV *, REGINFO *)); * __memp_fput_pp -- * DB_MPOOLFILE->put pre/post processing. * - * PUBLIC: int __memp_fput_pp __P((DB_MPOOLFILE *, void *, u_int32_t)); + * PUBLIC: int __memp_fput_pp + * PUBLIC: __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t)); */ int -__memp_fput_pp(dbmfp, pgaddr, flags) +__memp_fput_pp(dbmfp, pgaddr, priority, flags) DB_MPOOLFILE *dbmfp; void *pgaddr; + DB_CACHE_PRIORITY priority; u_int32_t flags; { DB_ENV *dbenv; @@ -33,10 +34,14 @@ __memp_fput_pp(dbmfp, pgaddr, flags) dbenv = dbmfp->dbenv; PANIC_CHECK(dbenv); + if (flags != 0) + return (__db_ferr(dbenv, "DB_MPOOLFILE->put", 0)); + + MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put"); ENV_ENTER(dbenv, ip); - ret = __memp_fput(dbmfp, pgaddr, flags); + ret = __memp_fput(dbmfp, pgaddr, priority); if (IS_ENV_REPLICATED(dbenv) && (t_ret = __op_rep_exit(dbenv)) != 0 && ret == 0) ret = t_ret; @@ -49,47 +54,30 @@ __memp_fput_pp(dbmfp, pgaddr, flags) * __memp_fput -- * DB_MPOOLFILE->put. 
* - * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, u_int32_t)); + * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY)); */ int -__memp_fput(dbmfp, pgaddr, flags) +__memp_fput(dbmfp, pgaddr, priority) DB_MPOOLFILE *dbmfp; void *pgaddr; - u_int32_t flags; + DB_CACHE_PRIORITY priority; { + BH *bhp; DB_ENV *dbenv; DB_MPOOL *dbmp; DB_MPOOL_HASH *hp; MPOOL *c_mp; MPOOLFILE *mfp; - BH *bhp; - u_int32_t n_cache; - int adjust, ret, t_ret; + REGINFO *infop; + int adjust, pfactor, ret, t_ret; dbenv = dbmfp->dbenv; - MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put"); dbmp = dbenv->mp_handle; mfp = dbmfp->mfp; bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); ret = 0; /* - * Check arguments, but don't fail because we want to unpin the page - * regardless. The problem is when running with replication. There - * is a reference count we incremented when __memp_fget was called, - * and we need to unpin the page and decrement that reference count. - * If we see flag problems, mark the page dirty. - */ - if (flags) { - if (__db_fchk(dbenv, "memp_fput", flags, - DB_MPOOL_DISCARD) != 0) { - flags = 0; - ret = EINVAL; - DB_ASSERT(dbenv, 0); - } - } - - /* * If we're mapping the file, there's nothing to do. Because we can * stop mapping the file at any time, we have to check on each buffer * to see if the address we gave the application was part of the map @@ -116,15 +104,10 @@ __memp_fput(dbmfp, pgaddr, flags) #endif /* Convert a page address to a buffer header and hash bucket. */ - n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; - - MUTEX_LOCK(dbenv, hp->mtx_hash); - - if (LF_ISSET(DB_MPOOL_DISCARD)) - F_SET(bhp, BH_DISCARD); + MP_GET_BUCKET(dbmfp, bhp->pgno, &infop, hp, ret); + if (ret != 0) + return (ret); + c_mp = infop->primary; /* * Check for a reference count going to zero. This can happen if the @@ -163,7 +146,8 @@ __memp_fput(dbmfp, pgaddr, flags) MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0); /* Update priority values. */ - if (F_ISSET(bhp, BH_DISCARD) || mfp->priority == MPOOL_PRI_VERY_LOW) + if (priority == DB_PRIORITY_VERY_LOW || + mfp->priority == MPOOL_PRI_VERY_LOW) bhp->priority = 0; else { /* @@ -173,9 +157,31 @@ __memp_fput(dbmfp, pgaddr, flags) */ bhp->priority = c_mp->lru_count; + switch (priority) { + default: + case DB_PRIORITY_UNCHANGED: + pfactor = mfp->priority; + break; + case DB_PRIORITY_VERY_LOW: + pfactor = MPOOL_PRI_VERY_LOW; + break; + case DB_PRIORITY_LOW: + pfactor = MPOOL_PRI_LOW; + break; + case DB_PRIORITY_DEFAULT: + pfactor = MPOOL_PRI_DEFAULT; + break; + case DB_PRIORITY_HIGH: + pfactor = MPOOL_PRI_HIGH; + break; + case DB_PRIORITY_VERY_HIGH: + pfactor = MPOOL_PRI_VERY_HIGH; + break; + } + adjust = 0; - if (mfp->priority != 0) - adjust = (int)c_mp->stat.st_pages / mfp->priority; + if (pfactor != 0) + adjust = (int)c_mp->stat.st_pages / pfactor; if (F_ISSET(bhp, BH_DIRTY)) adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY; @@ -234,10 +240,9 @@ __memp_reset_lru(dbenv, infop) BH *bhp, *tbhp; DB_MPOOL_HASH *hp; MPOOL *c_mp; - u_int32_t bucket; + u_int32_t bucket, priority; c_mp = infop->primary; - /* * Update the counter so all future allocations will start at the * bottom. @@ -253,19 +258,42 @@ __memp_reset_lru(dbenv, infop) * We can check for empty buckets before locking as we * only care if the pointer is zero or non-zero. 
*/ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) { + c_mp->lru_reset++; continue; + } MUTEX_LOCK(dbenv, hp->mtx_hash); - SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) + c_mp->lru_reset++; + /* + * We need to take a little care that the bucket does + * not become unsorted. This is highly unlikely but + * possible. + */ + priority = 0; + SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { for (tbhp = bhp; tbhp != NULL; tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) { if (tbhp->priority != UINT32_MAX && - tbhp->priority > MPOOL_BASE_DECREMENT) + tbhp->priority > MPOOL_BASE_DECREMENT) { tbhp->priority -= MPOOL_BASE_DECREMENT; + if (tbhp->priority < priority) + tbhp->priority = priority; + } } + priority = bhp->priority; + } + /* + * Reset the hash bucket's priority. The chain is never empty + * in this case, so tbhp will never be NULL. + */ + if ((tbhp = + SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) + hp->hash_priority = tbhp->priority; MUTEX_UNLOCK(dbenv, hp->mtx_hash); } + c_mp->lru_reset = 0; + COMPQUIET(dbenv, NULL); return (0); } diff --git a/db/mp/mp_fset.c b/db/mp/mp_fset.c index e3fd2f4df..46950f4e1 100644 --- a/db/mp/mp_fset.c +++ b/db/mp/mp_fset.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_fset.c,v 12.16 2006/09/13 14:53:42 mjc Exp $ + * $Id: mp_fset.c,v 12.23 2007/06/05 11:55:28 mjc Exp $ */ #include "db_config.h" @@ -15,108 +14,33 @@ #include "dbinc/txn.h" /* - * __memp_fset_pp -- - * DB_MPOOLFILE->set pre/post processing. - * - * PUBLIC: int __memp_fset_pp __P((DB_MPOOLFILE *, void *, u_int32_t)); - */ -int -__memp_fset_pp(dbmfp, pgaddr, flags) - DB_MPOOLFILE *dbmfp; - void *pgaddr; - u_int32_t flags; -{ - DB_ENV *dbenv; - DB_THREAD_INFO *ip; - int ret; - - dbenv = dbmfp->dbenv; - - PANIC_CHECK(dbenv); - MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->set"); - - /* Validate arguments. */ - if (flags == 0) - return (__db_ferr(dbenv, "memp_fset", 1)); - - if ((ret = __db_fchk(dbenv, "memp_fset", flags, DB_MPOOL_DISCARD)) != 0) - return (ret); - - ENV_ENTER(dbenv, ip); - REPLICATION_WRAP(dbenv, (__memp_fset(dbmfp, pgaddr, flags)), ret); - ENV_LEAVE(dbenv, ip); - return (ret); -} - -/* - * __memp_fset -- - * DB_MPOOLFILE->set. - * - * PUBLIC: int __memp_fset __P((DB_MPOOLFILE *, void *, u_int32_t)); - */ -int -__memp_fset(dbmfp, pgaddr, flags) - DB_MPOOLFILE *dbmfp; - void *pgaddr; - u_int32_t flags; -{ - BH *bhp; - DB_ENV *dbenv; - DB_MPOOL *dbmp; - DB_MPOOL_HASH *hp; - MPOOL *c_mp; - u_int32_t n_cache; - - dbenv = dbmfp->dbenv; - dbmp = dbenv->mp_handle; - - DB_ASSERT(dbenv, !LF_ISSET(DB_MPOOL_DIRTY)); - - /* Convert the page address to a buffer header and hash bucket. */ - bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); - n_cache = NCACHE(dbmp->reginfo[0].primary, bhp->mf_offset, bhp->pgno); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; - - MUTEX_LOCK(dbenv, hp->mtx_hash); - - if (LF_ISSET(DB_MPOOL_DISCARD)) - F_SET(bhp, BH_DISCARD); - - MUTEX_UNLOCK(dbenv, hp->mtx_hash); - return (0); -} - -/* * __memp_dirty -- * Upgrade a page from a read-only to a writeable pointer. 
* - * PUBLIC: int __memp_dirty __P((DB_MPOOLFILE *, void *, DB_TXN *, u_int32_t)); + * PUBLIC: int __memp_dirty __P(( + * PUBLIC: DB_MPOOLFILE *, void *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t)); */ int -__memp_dirty(dbmfp, addrp, txn, flags) +__memp_dirty(dbmfp, addrp, txn, priority, flags) DB_MPOOLFILE *dbmfp; void *addrp; DB_TXN *txn; + DB_CACHE_PRIORITY priority; u_int32_t flags; { BH *bhp; DB_ENV *dbenv; - DB_MPOOL *dbmp; DB_MPOOL_HASH *hp; DB_TXN *ancestor; #ifdef DIAG_MVCC MPOOLFILE *mfp; #endif - MPOOL *c_mp; - u_int32_t n_cache; + REGINFO *infop; int ret; db_pgno_t pgno; void *pgaddr; dbenv = dbmfp->dbenv; - dbmp = dbenv->mp_handle; pgaddr = *(void **)addrp; /* Convert the page address to a buffer header. */ @@ -154,11 +78,11 @@ __memp_dirty(dbmfp, addrp, txn, flags) (flags == DB_MPOOL_EDIT && *(void **)addrp == pgaddr) || (flags != DB_MPOOL_EDIT && *(void **)addrp != pgaddr)); - if ((ret = __memp_fput(dbmfp, pgaddr, 0)) != 0) { + if ((ret = __memp_fput(dbmfp, pgaddr, priority)) != 0) { __db_errx(dbenv, "%s: error releasing a read-only page", __memp_fn(dbmfp)); - (void)__memp_fput(dbmfp, *(void **)addrp, 0); + (void)__memp_fput(dbmfp, *(void **)addrp, priority); *(void **)addrp = NULL; return (ret); } @@ -168,13 +92,10 @@ __memp_dirty(dbmfp, addrp, txn, flags) return (0); } - n_cache = NCACHE(dbmp->reginfo[0].primary, - bhp->mf_offset, bhp->pgno); - c_mp = dbmp->reginfo[n_cache].primary; - hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); - hp = &hp[NBUCKET(c_mp, bhp->mf_offset, bhp->pgno)]; + MP_GET_BUCKET(dbmfp, pgno, &infop, hp, ret); + if (ret != 0) + return (ret); - MUTEX_LOCK(dbenv, hp->mtx_hash); /* Set/clear the page bits. */ if (!F_ISSET(bhp, BH_DIRTY)) { ++hp->hash_page_dirty; @@ -183,7 +104,7 @@ __memp_dirty(dbmfp, addrp, txn, flags) MUTEX_UNLOCK(dbenv, hp->mtx_hash); #ifdef DIAG_MVCC - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + mfp = R_ADDR(dbenv->mp_handle->reginfo, bhp->mf_offset); MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | PROT_WRITE); #endif return (0); diff --git a/db/mp/mp_method.c b/db/mp/mp_method.c index 14c144974..e9096827c 100644 --- a/db/mp/mp_method.c +++ b/db/mp/mp_method.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_method.c,v 12.36 2006/09/15 18:54:13 margo Exp $ + * $Id: mp_method.c,v 12.50 2007/06/01 18:32:44 bostic Exp $ */ #include "db_config.h" @@ -15,13 +14,13 @@ #include "dbinc/hash.h" /* - * __memp_dbenv_create -- + * __memp_env_create -- * Mpool specific creation of the DB_ENV structure. * - * PUBLIC: int __memp_dbenv_create __P((DB_ENV *)); + * PUBLIC: int __memp_env_create __P((DB_ENV *)); */ int -__memp_dbenv_create(dbenv) +__memp_env_create(dbenv) DB_ENV *dbenv; { /* @@ -37,7 +36,7 @@ __memp_dbenv_create(dbenv) * Solaris needs 24 and 52 bytes for the same structures. The minimum * number of hash buckets is 37. These contain a mutex also. */ - dbenv->mp_bytes = + dbenv->mp_bytes = dbenv->mp_max_bytes = 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH); dbenv->mp_ncache = 1; @@ -45,13 +44,13 @@ __memp_dbenv_create(dbenv) } /* - * __memp_dbenv_destroy -- + * __memp_env_destroy -- * Mpool specific destruction of the DB_ENV structure. 
* - * PUBLIC: void __memp_dbenv_destroy __P((DB_ENV *)); + * PUBLIC: void __memp_env_destroy __P((DB_ENV *)); */ void -__memp_dbenv_destroy(dbenv) +__memp_env_destroy(dbenv) DB_ENV *dbenv; { COMPQUIET(dbenv, NULL); @@ -109,8 +108,6 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache) { u_int ncache; - ENV_ILLEGAL_AFTER_OPEN(dbenv, "DB_ENV->set_cachesize"); - /* Normalize the cache count. */ ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache; @@ -133,18 +130,18 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache) * wrapping in the calculation of the number of hash buckets. See * __memp_open for details. */ - if (sizeof(roff_t) <= 4) { - if (gbytes / ncache >= 4) { + if (!F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) { __db_errx(dbenv, "individual cache size too large: maximum is 4GB"); return (EINVAL); } - } else if (gbytes / ncache > 10000) { __db_errx(dbenv, "individual cache size too large: maximum is 10TB"); return (EINVAL); } + } /* * If the application requested less than 500Mb, increase the cachesize @@ -164,6 +161,9 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache) bytes = ncache * DB_CACHESIZE_MIN; } + if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) + return (__memp_resize(dbenv->mp_handle, gbytes, bytes)); + dbenv->mp_gbytes = gbytes; dbenv->mp_bytes = bytes; dbenv->mp_ncache = ncache; @@ -172,6 +172,76 @@ __memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache) } /* + * __memp_set_config -- + * Set the cache subsystem configuration. + * + * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int)); + */ +int +__memp_set_config(dbenv, which, on) + DB_ENV *dbenv; + u_int32_t which; + int on; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + + ENV_NOT_CONFIGURED(dbenv, + dbenv->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL); + + switch (which) { + case DB_MEMP_SUPPRESS_WRITE: + case DB_MEMP_SYNC_INTERRUPT: + if (MPOOL_ON(dbenv)) { + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + if (on) + FLD_SET(mp->config_flags, which); + else + FLD_CLR(mp->config_flags, which); + } + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * __memp_get_config -- + * Return the cache subsystem configuration. + * + * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *)); + */ +int +__memp_get_config(dbenv, which, onp) + DB_ENV *dbenv; + u_int32_t which; + int *onp; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + + ENV_REQUIRES_CONFIG(dbenv, + dbenv->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL); + + switch (which) { + case DB_MEMP_SUPPRESS_WRITE: + case DB_MEMP_SYNC_INTERRUPT: + if (MPOOL_ON(dbenv)) { + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + *onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0; + } else + *onp = 0; + break; + default: + return (EINVAL); + } + return (0); +} + +/* * PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *)); */ int @@ -224,12 +294,13 @@ __memp_set_mp_max_openfd(dbenv, maxopenfd) } /* - * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, int *)); + * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *)); */ int __memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp) DB_ENV *dbenv; - int *maxwritep, *maxwrite_sleepp; + int *maxwritep; + db_timeout_t *maxwrite_sleepp; { DB_MPOOL *dbmp; MPOOL *mp; @@ -255,12 +326,13 @@ __memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp) * __memp_set_mp_max_write -- * Set the maximum continuous I/O count. 
* - * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, int)); + * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t)); */ int __memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep) DB_ENV *dbenv; - int maxwrite, maxwrite_sleep; + int maxwrite; + db_timeout_t maxwrite_sleep; { DB_MPOOL *dbmp; MPOOL *mp; @@ -366,9 +438,13 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem) #define op_is_remove (newname == NULL) COMPQUIET(bucket, 0); + COMPQUIET(hp, NULL); + COMPQUIET(newname_off, 0); + COMPQUIET(nlen, 0); dbmp = NULL; mfp = NULL; + nhp = NULL; p = NULL; locked = ret = 0; @@ -378,63 +454,61 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem) dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; hp = R_ADDR(dbmp->reginfo, mp->ftab); - nhp = NULL; - /* - * Remove or rename a file that the mpool might know about. We assume - * that the fop layer has the file locked for exclusive access, so we - * don't worry about locking except for the mpool mutexes. Checkpoint - * can happen at any time, independent of file locking, so we have to - * do the actual unlink or rename system call to avoid any race. - * - * If this is a rename, allocate first, because we can't recursively - * grab the region lock. If this is a memory file - * then on a rename, we need to make sure that the new name does - * not exist. - */ - hp = R_ADDR(dbmp->reginfo, mp->ftab); - if (op_is_remove) { - COMPQUIET(newname_off, INVALID_ROFF); - } else { + if (!op_is_remove) { nlen = strlen(newname); if ((ret = __memp_alloc(dbmp, dbmp->reginfo, NULL, nlen + 1, &newname_off, &p)) != 0) return (ret); memcpy(p, newname, nlen + 1); - MPOOL_SYSTEM_LOCK(dbenv); - locked = 1; - if (inmem) { - bucket = FNBUCKET(newname, nlen); - nhp = hp + bucket; - MUTEX_LOCK(dbenv, nhp->mtx_hash); - SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile) - if (!mfp->deadfile && - mfp->no_backing_file && strcmp(newname, - R_ADDR(dbmp->reginfo, mfp->path_off)) == 0) - break; - MUTEX_UNLOCK(dbenv, nhp->mtx_hash); - if (mfp != NULL) { - ret = EEXIST; - goto err; - } - } } - if (locked == 0) - MPOOL_SYSTEM_LOCK(dbenv); - locked = 1; - + /* + * Remove or rename a file that the mpool might know about. We assume + * that the fop layer has the file locked for exclusive access, so we + * don't worry about locking except for the mpool mutexes. Checkpoint + * can happen at any time, independent of file locking, so we have to + * do the actual unlink or rename system call while holding + * all affected buckets locked. + * + * If this is a rename and this is a memory file then we need + * to make sure that the new name does not exist. Since we + * are locking two buckets lock them in ascending order. 
+ */ if (inmem) { DB_ASSERT(dbenv, fullold != NULL); hp += FNBUCKET(fullold, strlen(fullold)); + if (!op_is_remove) { + bucket = FNBUCKET(newname, nlen); + nhp = R_ADDR(dbmp->reginfo, mp->ftab); + nhp += bucket; + } } else hp += FNBUCKET(fileid, DB_FILE_ID_LEN); + if (nhp != NULL && nhp < hp) + MUTEX_LOCK(dbenv, nhp->mtx_hash); + MUTEX_LOCK(dbenv, hp->mtx_hash); + if (nhp != NULL && nhp > hp) + MUTEX_LOCK(dbenv, nhp->mtx_hash); + locked = 1; + + if (!op_is_remove && inmem) { + SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile) + if (!mfp->deadfile && + mfp->no_backing_file && strcmp(newname, + R_ADDR(dbmp->reginfo, mfp->path_off)) == 0) + break; + if (mfp != NULL) { + ret = EEXIST; + goto err; + } + } + /* * Find the file -- if mpool doesn't know about this file, that may - * not be an error -- if the file is not a memory-only file and it + * not be an error. */ - MUTEX_LOCK(dbenv, hp->mtx_hash); SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { /* Ignore non-active files. */ if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) @@ -447,17 +521,21 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem) break; } - MUTEX_UNLOCK(dbenv, hp->mtx_hash); - if (mfp == NULL) + + if (mfp == NULL) { + if (inmem) { + ret = ENOENT; + goto err; + } goto fsop; + } if (op_is_remove) { MUTEX_LOCK(dbenv, mfp->mutex); /* - * In-memory dbs have an artificially incremented - * ref count so that they do not ever get reclaimed - * as long as they exist. Since we are now deleting - * the database, we need to dec that count. + * In-memory dbs have an artificially incremented ref count so + * they do not get reclaimed as long as they exist. Since we + * are now deleting the database, we need to dec that count. */ if (mfp->no_backing_file) mfp->mpf_cnt--; @@ -465,31 +543,22 @@ __memp_nameop(dbenv, fileid, newname, fullold, fullnew, inmem) MUTEX_UNLOCK(dbenv, mfp->mutex); } else { /* - * Else, it's a rename. We've allocated memory - * for the new name. Swap it with the old one. + * Else, it's a rename. We've allocated memory for the new + * name. Swap it with the old one. If it's in memory we + * need to move it the right bucket. */ p = R_ADDR(dbmp->reginfo, mfp->path_off); mfp->path_off = newname_off; - /* If its in memory we need to move it the right bucket. */ - if (inmem) { + if (inmem && hp != nhp) { DB_ASSERT(dbenv, nhp != NULL); - MUTEX_LOCK(dbenv, hp->mtx_hash); SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile); - MUTEX_UNLOCK(dbenv, hp->mtx_hash); mfp->bucket = bucket; - MUTEX_LOCK(dbenv, nhp->mtx_hash); SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q); - MUTEX_UNLOCK(dbenv, nhp->mtx_hash); } } -fsop: if (mfp == NULL && inmem) { - ret = ENOENT; - goto err; - } - - /* +fsop: /* * If this is a real file, then mfp could be NULL, because * mpool isn't turned on, and we still need to do the file ops. */ @@ -504,12 +573,14 @@ fsop: if (mfp == NULL && inmem) { ret = 0; } else { /* - * Defensive only, fullname should never be + * Defensive only, fullnew should never be * NULL. */ DB_ASSERT(dbenv, fullnew != NULL); - if (fullnew == NULL) - return (EINVAL); + if (fullnew == NULL) { + ret = EINVAL; + goto err; + } ret = __os_rename(dbenv, fullold, fullnew, 1); } } @@ -518,8 +589,12 @@ fsop: if (mfp == NULL && inmem) { err: if (p != NULL) __memp_free(&dbmp->reginfo[0], NULL, p); - if (locked == 1) - MPOOL_SYSTEM_UNLOCK(dbenv); + /* If we have buckets locked, unlock them when done moving files. 
*/ + if (locked == 1) { + MUTEX_UNLOCK(dbenv, hp->mtx_hash); + if (nhp != NULL && nhp != hp) + MUTEX_UNLOCK(dbenv, nhp->mtx_hash); + } return (ret); } diff --git a/db/mp/mp_mvcc.c b/db/mp/mp_mvcc.c index 4a763e1de..e797df904 100644 --- a/db/mp/mp_mvcc.c +++ b/db/mp/mp_mvcc.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 2006,2007 Oracle. All rights reserved. * - * $Id: mp_mvcc.c,v 12.24 2006/09/18 13:11:50 mjc Exp $ + * $Id: mp_mvcc.c,v 12.34 2007/06/05 11:55:28 mjc Exp $ */ #include "db_config.h" @@ -92,9 +91,12 @@ __memp_bucket_reorder(dbenv, hp, bhp) next, bhp, hq, __bh); } -done: /* Reset the hash bucket's priority. */ - hp->hash_priority = - BH_PRIORITY(SH_TAILQ_FIRST(&hp->hash_bucket, __bh)); +done: /* + * Reset the hash bucket's priority -- the chain is never empty in + * this case, so bhp will never be NULL. + */ + if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) + hp->hash_priority = BH_PRIORITY(bhp); } /* @@ -103,7 +105,8 @@ done: /* Reset the hash bucket's priority. */ * * PUBLIC: int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *mfp, BH *, void *)); */ -int __memp_bh_settxn(dbmp, mfp, bhp, vtd) +int +__memp_bh_settxn(dbmp, mfp, bhp, vtd) DB_MPOOL *dbmp; MPOOLFILE *mfp; BH *bhp; @@ -149,16 +152,13 @@ __memp_skip_curadj(dbc, pgno) DB_MPOOL_HASH *hp; DB_MPOOLFILE *dbmfp; DB_TXN *txn; - MPOOL *c_mp, *mp; MPOOLFILE *mfp; REGINFO *infop; roff_t mf_offset; - u_int32_t n_cache; - int skip; + int ret, skip; dbenv = dbc->dbp->dbenv; dbmp = dbenv->mp_handle; - mp = dbmp->reginfo[0].primary; dbmfp = dbc->dbp->mpf; mfp = dbmfp->mfp; mf_offset = R_OFFSET(dbmp->reginfo, mfp); @@ -172,13 +172,13 @@ __memp_skip_curadj(dbc, pgno) * local pointers to them. Reset on each pass through this code, the * page number can change. */ - n_cache = NCACHE(mp, mf_offset, pgno); - infop = &dbmp->reginfo[n_cache]; - c_mp = infop->primary; - hp = R_ADDR(infop, c_mp->htab); - hp = &hp[NBUCKET(c_mp, mf_offset, pgno)]; + MP_GET_BUCKET(dbmfp, pgno, &infop, hp, ret); + if (ret != 0) { + /* Panic: there is no way to return the error. */ + (void)__db_panic(dbenv, ret); + return (0); + } - MUTEX_LOCK(dbenv, hp->mtx_hash); SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { if (bhp->pgno != pgno || bhp->mf_offset != mf_offset) continue; @@ -251,12 +251,12 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) *need_frozenp = 1; /* There might be a small amount of unallocated space. */ - if (__db_shalloc(infop, - sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), 0, + if (__env_alloc(infop, + sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen_alloc) == 0) { frozen_bhp = (BH *)(frozen_alloc + 1); - SH_TAILQ_INSERT_HEAD(&c_mp->alloc_frozen, frozen_alloc, - links, __bh_frozen_a); + SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen, + frozen_alloc, links); } } MPOOL_REGION_UNLOCK(dbenv, infop); @@ -285,7 +285,7 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) if ((ret = __db_appname(dbenv, DB_APP_NONE, filename, 0, NULL, &real_name)) != 0) goto err; - if ((ret = __os_open_extend(dbenv, real_name, pagesize, + if ((ret = __os_open(dbenv, real_name, pagesize, DB_OSO_CREATE | DB_OSO_EXCL, dbenv->db_mode, &fhp)) == 0) { /* We're creating the file -- initialize the metadata page. 
*/ magic = DB_FREEZER_MAGIC; @@ -299,8 +299,8 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) (ret = __os_seek(dbenv, fhp, 0, 0, 0)) != 0) goto err; } else if (ret == EEXIST) - ret = __os_open_extend(dbenv, real_name, pagesize, 0, - dbenv->db_mode, &fhp); + ret = __os_open( + dbenv, real_name, pagesize, 0, dbenv->db_mode, &fhp); if (ret != 0) goto err; if ((ret = __os_read(dbenv, fhp, &magic, sizeof(u_int32_t), @@ -372,8 +372,11 @@ __memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) * Increment the file's block count -- freeing the original buffer will * decrement it. */ + MUTEX_LOCK(dbenv, bh_mfp->mutex); ++bh_mfp->block_cnt; - ++hp->hash_frozen; + MUTEX_UNLOCK(dbenv, bh_mfp->mutex); + + STAT(++hp->hash_frozen); if (0) { err: if (ret == 0) @@ -492,8 +495,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) &real_name)) != 0) goto err; - if ((ret = __os_open_extend(dbenv, real_name, pagesize, 0, - dbenv->db_mode, &fhp)) != 0) + if ((ret = __os_open( + dbenv, real_name, pagesize, 0, dbenv->db_mode, &fhp)) != 0) goto err; /* @@ -625,8 +628,8 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) if (reorder) { if (next_bhp != NULL) __memp_bucket_reorder(dbenv, hp, next_bhp); - else - hp->hash_priority = BH_PRIORITY(SH_TAILQ_FIRST( + else if (!SH_TAILQ_EMPTY(&hp->hash_bucket)) + hp->hash_priority = BH_PRIORITY(SH_TAILQ_FIRSTP( &hp->hash_bucket, __bh)); } @@ -651,10 +654,12 @@ __memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) F_CLR(frozen_bhp, BH_FROZEN | BH_LOCKED); } +#ifdef HAVE_STATISTICS if (alloc_bhp != NULL) ++hp->hash_thawed; else ++hp->hash_frozen_freed; +#endif if (0) { err: if (ret == 0) diff --git a/db/mp/mp_region.c b/db/mp/mp_region.c index a02683f21..34a1ced15 100644 --- a/db/mp/mp_region.c +++ b/db/mp/mp_region.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_region.c,v 12.21 2006/08/24 14:46:15 bostic Exp $ + * $Id: mp_region.c,v 12.33 2007/05/17 17:18:01 bostic Exp $ */ #include "db_config.h" @@ -12,7 +11,6 @@ #include "db_int.h" #include "dbinc/mp.h" -static int __memp_init __P((DB_ENV *, DB_MPOOL *, u_int, u_int32_t)); static int __memp_init_config __P((DB_ENV *, MPOOL *)); static void __memp_region_size __P((DB_ENV *, roff_t *, u_int32_t *)); @@ -20,17 +18,18 @@ static void __memp_region_size __P((DB_ENV *, roff_t *, u_int32_t *)); * __memp_open -- * Internal version of memp_open: only called from DB_ENV->open. * - * PUBLIC: int __memp_open __P((DB_ENV *)); + * PUBLIC: int __memp_open __P((DB_ENV *, int)); */ int -__memp_open(dbenv) +__memp_open(dbenv, create_ok) DB_ENV *dbenv; + int create_ok; { DB_MPOOL *dbmp; MPOOL *mp; REGINFO reginfo; roff_t reg_size; - u_int i; + u_int i, max_nreg; u_int32_t htab_buckets, *regids; int ret; @@ -50,9 +49,9 @@ __memp_open(dbenv) reginfo.type = REGION_TYPE_MPOOL; reginfo.id = INVALID_REGION_ID; reginfo.flags = REGION_JOIN_OK; - if (F_ISSET(dbenv, DB_ENV_CREATE)) + if (create_ok) F_SET(®info, REGION_CREATE_OK); - if ((ret = __db_r_attach(dbenv, ®info, reg_size)) != 0) + if ((ret = __env_region_attach(dbenv, ®info, reg_size)) != 0) goto err; /* @@ -65,17 +64,18 @@ __memp_open(dbenv) * the REGINFO structures and create them. Make sure we don't * clear the wrong entries on error. 
*/ - dbmp->nreg = dbenv->mp_ncache; + max_nreg = __memp_max_regions(dbenv); if ((ret = __os_calloc(dbenv, - dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) + max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) goto err; /* Make sure we don't clear the wrong entries on error. */ - for (i = 0; i < dbmp->nreg; ++i) - dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[0] = reginfo; + for (i = 1; i < max_nreg; ++i) + dbmp->reginfo[i].id = INVALID_REGION_ID; /* Initialize the first region. */ - if ((ret = __memp_init(dbenv, dbmp, 0, htab_buckets)) != 0) + if ((ret = __memp_init(dbenv, dbmp, + 0, htab_buckets, max_nreg)) != 0) goto err; /* @@ -84,16 +84,17 @@ __memp_open(dbenv) */ mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary); regids = R_ADDR(dbmp->reginfo, mp->regids); - for (i = 1; i < dbmp->nreg; ++i) { + regids[0] = dbmp->reginfo[0].id; + for (i = 1; i < dbenv->mp_ncache; ++i) { dbmp->reginfo[i].dbenv = dbenv; dbmp->reginfo[i].type = REGION_TYPE_MPOOL; dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[i].flags = REGION_CREATE_OK; - if ((ret = __db_r_attach( + if ((ret = __env_region_attach( dbenv, &dbmp->reginfo[i], reg_size)) != 0) goto err; - if ((ret = - __memp_init(dbenv, dbmp, i, htab_buckets)) != 0) + if ((ret = __memp_init(dbenv, dbmp, + i, htab_buckets, max_nreg)) != 0) goto err; regids[i] = dbmp->reginfo[i].id; @@ -105,30 +106,30 @@ __memp_open(dbenv) * information. */ mp = R_ADDR(®info, reginfo.rp->primary); - dbmp->nreg = mp->nreg; + dbenv->mp_ncache = mp->nreg; if ((ret = __os_calloc(dbenv, - dbmp->nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) + mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) goto err; /* Make sure we don't clear the wrong entries on error. */ - for (i = 0; i < dbmp->nreg; ++i) + for (i = 0; i < dbenv->mp_ncache; ++i) dbmp->reginfo[i].id = INVALID_REGION_ID; dbmp->reginfo[0] = reginfo; /* Join remaining regions. */ regids = R_ADDR(dbmp->reginfo, mp->regids); - for (i = 1; i < dbmp->nreg; ++i) { + for (i = 1; i < dbenv->mp_ncache; ++i) { dbmp->reginfo[i].dbenv = dbenv; dbmp->reginfo[i].type = REGION_TYPE_MPOOL; dbmp->reginfo[i].id = regids[i]; dbmp->reginfo[i].flags = REGION_JOIN_OK; - if ((ret = __db_r_attach( + if ((ret = __env_region_attach( dbenv, &dbmp->reginfo[i], 0)) != 0) goto err; } } /* Set the local addresses for the regions. */ - for (i = 0; i < dbmp->nreg; ++i) + for (i = 0; i < dbenv->mp_ncache; ++i) dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); @@ -147,9 +148,9 @@ __memp_open(dbenv) err: dbenv->mp_handle = NULL; if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { - for (i = 0; i < dbmp->nreg; ++i) + for (i = 0; i < dbenv->mp_ncache; ++i) if (dbmp->reginfo[i].id != INVALID_REGION_ID) - (void)__db_r_detach( + (void)__env_region_detach( dbenv, &dbmp->reginfo[i], 0); __os_free(dbenv, dbmp->reginfo); } @@ -162,27 +163,32 @@ err: dbenv->mp_handle = NULL; /* * __memp_init -- * Initialize a MPOOL structure in shared memory. 
+ * + * PUBLIC: int __memp_init + * PUBLIC: __P((DB_ENV *, DB_MPOOL *, u_int, u_int32_t, u_int)); */ -static int -__memp_init(dbenv, dbmp, reginfo_off, htab_buckets) +int +__memp_init(dbenv, dbmp, reginfo_off, htab_buckets, max_nreg) DB_ENV *dbenv; DB_MPOOL *dbmp; - u_int reginfo_off; + u_int reginfo_off, max_nreg; u_int32_t htab_buckets; { + BH_FROZEN_ALLOC *frozen; + BH *frozen_bhp; DB_MPOOL_HASH *htab, *hp; - MPOOL *mp; - REGINFO *reginfo; + MPOOL *mp, *main_mp; + REGINFO *infop; + db_mutex_t mtx_base, mtx_discard, mtx_prev; u_int32_t i; int ret; void *p; - reginfo = &dbmp->reginfo[reginfo_off]; - if ((ret = __db_shalloc( - reginfo, sizeof(MPOOL), 0, ®info->primary)) != 0) + infop = &dbmp->reginfo[reginfo_off]; + if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0) goto mem_err; - reginfo->rp->primary = R_OFFSET(reginfo, reginfo->primary); - mp = reginfo->primary; + infop->rp->primary = R_OFFSET(infop, infop->primary); + mp = infop->primary; memset(mp, 0, sizeof(*mp)); if ((ret = @@ -192,17 +198,19 @@ __memp_init(dbenv, dbmp, reginfo_off, htab_buckets) if (reginfo_off == 0) { ZERO_LSN(mp->lsn); - mp->nreg = dbmp->nreg; - if ((ret = __db_shalloc(&dbmp->reginfo[0], - dbmp->nreg * sizeof(u_int32_t), 0, &p)) != 0) + mp->nreg = dbenv->mp_ncache; + mp->max_nreg = max_nreg; + if ((ret = __env_alloc(&dbmp->reginfo[0], + max_nreg * sizeof(u_int32_t), &p)) != 0) goto mem_err; mp->regids = R_OFFSET(dbmp->reginfo, p); + mp->nbuckets = dbenv->mp_ncache * htab_buckets; /* Allocate file table space and initialize it. */ - if ((ret = __db_shalloc(reginfo, - MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0) + if ((ret = __env_alloc(infop, + MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0) goto mem_err; - mp->ftab = R_OFFSET(reginfo, htab); + mp->ftab = R_OFFSET(infop, htab); for (i = 0; i < MPOOL_FILE_BUCKETS; i++) { if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0) @@ -211,32 +219,80 @@ __memp_init(dbenv, dbmp, reginfo_off, htab_buckets) htab[i].hash_page_dirty = htab[i].hash_priority = 0; } + /* + * Allocate all of the hash bucket mutexes up front. We do + * this so that we don't need to free and reallocate mutexes as + * the cache is resized. + */ + mtx_base = mtx_prev = MUTEX_INVALID; + for (i = 0; i < mp->max_nreg * htab_buckets; i++) { + if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_HASH_BUCKET, + 0, &mtx_discard)) != 0) + return (ret); + if (i == 0) { + mtx_base = mtx_discard; + mtx_prev = mtx_discard - 1; + } + DB_ASSERT(dbenv, mtx_discard == mtx_prev + 1 || + mtx_base == MUTEX_INVALID); + mtx_prev = mtx_discard; + if ((ret = __mutex_alloc(dbenv, MTX_MPOOL_IO, + DB_MUTEX_SELF_BLOCK, &mtx_discard)) != 0) + return (ret); + DB_ASSERT(dbenv, mtx_discard == mtx_prev + 1 || + mtx_base == MUTEX_INVALID); + mtx_prev = mtx_discard; + } + } else { + main_mp = dbmp->reginfo[0].primary; + htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab); + mtx_base = htab[0].mtx_hash; } + if (mtx_base != MUTEX_INVALID) + mtx_base += reginfo_off * htab_buckets; + /* Allocate hash table space and initialize it. 
*/ - if ((ret = __db_shalloc(reginfo, - htab_buckets * sizeof(DB_MPOOL_HASH), 0, &htab)) != 0) + if ((ret = __env_alloc(infop, + htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0) goto mem_err; - mp->htab = R_OFFSET(reginfo, htab); + mp->htab = R_OFFSET(infop, htab); for (i = 0; i < htab_buckets; i++) { hp = &htab[i]; - if ((ret = __mutex_alloc(dbenv, - MTX_MPOOL_HASH_BUCKET, 0, &hp->mtx_hash)) != 0) - return (ret); - if ((ret = __mutex_alloc(dbenv, - MTX_MPOOL_IO, DB_MUTEX_SELF_BLOCK, &hp->mtx_io)) != 0) - return (ret); + hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : + mtx_base + i * 2; + hp->mtx_io = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : + mtx_base + i * 2 + 1; SH_TAILQ_INIT(&hp->hash_bucket); - hp->hash_page_dirty = hp->hash_priority = hp->hash_io_wait = 0; + hp->hash_page_dirty = hp->hash_priority = 0; +#ifdef HAVE_STATISTICS + hp->hash_io_wait = 0; + hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0; +#endif hp->flags = 0; ZERO_LSN(hp->old_reader); } - mp->htab_buckets = mp->stat.st_hash_buckets = htab_buckets; + mp->htab_buckets = htab_buckets; +#ifdef HAVE_STATISTICS + mp->stat.st_hash_buckets = htab_buckets; +#endif SH_TAILQ_INIT(&mp->free_frozen); SH_TAILQ_INIT(&mp->alloc_frozen); /* + * Pre-allocate one frozen buffer header. This avoids situations where + * the cache becomes full of pages and we don't even have the 28 bytes + * (or so) available to allocate a frozen buffer header. + */ + if ((ret = __env_alloc(infop, + sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0) + goto mem_err; + frozen_bhp = (BH *)(frozen + 1); + SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links); + SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq); + + /* * Only the environment creator knows the total cache size, fill in * those statistics now. */ @@ -249,6 +305,25 @@ mem_err:__db_errx(dbenv, "Unable to allocate memory for mpool region"); } /* + * PUBLIC: u_int32_t __memp_max_regions __P((DB_ENV *)); + */ +u_int32_t +__memp_max_regions(dbenv) + DB_ENV *dbenv; +{ + roff_t reg_size, max_size; + u_int32_t max_nreg; + + __memp_region_size(dbenv, ®_size, NULL); + max_size = (roff_t)dbenv->mp_max_gbytes * GIGABYTE + + dbenv->mp_max_bytes; + max_nreg = (max_size + reg_size / 2) / reg_size; + if (max_nreg <= dbenv->mp_ncache) + max_nreg = dbenv->mp_ncache; + return (max_nreg); +} + +/* * __memp_region_size -- * Size the region and figure out how many hash buckets we'll have. */ @@ -258,15 +333,16 @@ __memp_region_size(dbenv, reg_sizep, htab_bucketsp) roff_t *reg_sizep; u_int32_t *htab_bucketsp; { - roff_t reg_size; + roff_t reg_size, cache_size; /* * Figure out how big each cache region is. Cast an operand to roff_t * so we do 64-bit arithmetic as appropriate. */ - reg_size = ((roff_t)GIGABYTE / dbenv->mp_ncache) * dbenv->mp_gbytes; - reg_size += dbenv->mp_bytes / dbenv->mp_ncache; - *reg_sizep = reg_size; + cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes; + reg_size = cache_size / dbenv->mp_ncache; + if (reg_sizep != NULL) + *reg_sizep = reg_size; /* * Figure out how many hash buckets each region will have. Assume we @@ -281,7 +357,9 @@ __memp_region_size(dbenv, reg_sizep, htab_bucketsp) * something we need to worry about right now, but is checked when the * cache size is set. 
*/ - *htab_bucketsp = __db_tablesize((u_int32_t)(reg_size / (10 * 1024))); + if (htab_bucketsp != NULL) + *htab_bucketsp = + __db_tablesize((u_int32_t)(reg_size / (10 * 1024))); } /* @@ -294,10 +372,9 @@ u_int32_t __memp_region_mutex_count(dbenv) DB_ENV *dbenv; { - roff_t reg_size; u_int32_t htab_buckets; - __memp_region_size(dbenv, ®_size, &htab_buckets); + __memp_region_size(dbenv, NULL, &htab_buckets); /* * We need a couple of mutexes for the region itself, one for each @@ -334,13 +411,13 @@ __memp_init_config(dbenv, mp) } /* - * __memp_dbenv_refresh -- + * __memp_env_refresh -- * Clean up after the mpool system on a close or failed open. * - * PUBLIC: int __memp_dbenv_refresh __P((DB_ENV *)); + * PUBLIC: int __memp_env_refresh __P((DB_ENV *)); */ int -__memp_dbenv_refresh(dbenv) +__memp_env_refresh(dbenv) DB_ENV *dbenv; { BH *bhp; @@ -349,53 +426,72 @@ __memp_dbenv_refresh(dbenv) DB_MPOOLFILE *dbmfp; DB_MPOOL_HASH *hp; DB_MPREG *mpreg; - MPOOL *mp; - REGINFO *reginfo; - u_int32_t bucket, i; + MPOOL *mp, *c_mp; + REGINFO *infop; + db_mutex_t mtx_base, mtx; + u_int32_t bucket, htab_buckets, i, max_nreg, nreg; int ret, t_ret; ret = 0; dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + htab_buckets = mp->htab_buckets; + nreg = mp->nreg; + max_nreg = mp->max_nreg; + hp = R_ADDR(&dbmp->reginfo[0], mp->htab); + mtx_base = hp->mtx_hash; /* * If a private region, return the memory to the heap. Not needed for * filesystem-backed or system shared memory regions, that memory isn't * owned by any particular process. - * - * Discard buffers. */ - if (F_ISSET(dbenv, DB_ENV_PRIVATE)) - for (i = 0; i < dbmp->nreg; ++i) { - reginfo = &dbmp->reginfo[i]; - mp = reginfo->primary; - for (hp = R_ADDR(reginfo, mp->htab), bucket = 0; - bucket < mp->htab_buckets; ++hp, ++bucket) { - while ((bhp = SH_TAILQ_FIRST( - &hp->hash_bucket, __bh)) != NULL) - if (F_ISSET(bhp, BH_FROZEN)) - SH_TAILQ_REMOVE( - &hp->hash_bucket, bhp, - hq, __bh); - else if ((t_ret = __memp_bhfree( - dbmp, hp, bhp, + if (!F_ISSET(dbenv, DB_ENV_PRIVATE)) + goto not_priv; + + /* Discard buffers. */ + for (i = 0; i < nreg; ++i) { + infop = &dbmp->reginfo[i]; + c_mp = infop->primary; + for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; + bucket < c_mp->htab_buckets; ++hp, ++bucket) { + while ((bhp = SH_TAILQ_FIRST( + &hp->hash_bucket, __bh)) != NULL) + if (F_ISSET(bhp, BH_FROZEN)) + SH_TAILQ_REMOVE( + &hp->hash_bucket, bhp, + hq, __bh); + else { + if (F_ISSET(bhp, BH_DIRTY)) { + --hp->hash_page_dirty; + F_CLR(bhp, + BH_DIRTY | BH_DIRTY_CREATE); + } + if ((t_ret = __memp_bhfree( + dbmp, infop, hp, bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0 && ret == 0) ret = t_ret; - if ((t_ret = __mutex_free( - dbenv, &hp->mtx_hash)) != 0 && ret == 0) - ret = t_ret; - if ((t_ret = __mutex_free( - dbenv, &hp->mtx_io)) != 0 && ret == 0) - ret = t_ret; - } - while ((frozen_alloc = SH_TAILQ_FIRST( - &mp->alloc_frozen, __bh_frozen_a)) != NULL) { - SH_TAILQ_REMOVE(&mp->alloc_frozen, frozen_alloc, - links, __bh_frozen_a); - __db_shalloc_free(reginfo, frozen_alloc); - } + } + } + while ((frozen_alloc = SH_TAILQ_FIRST( + &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { + SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc, + links, __bh_frozen_a); + __env_alloc_free(infop, frozen_alloc); + } + } + + /* Discard hash bucket mutexes. */ + if (mtx_base != MUTEX_INVALID) + for (i = 0; i < 2 * max_nreg * htab_buckets; ++i) { + mtx = mtx_base + i; + if ((t_ret = __mutex_free(dbenv, &mtx)) != 0 && + ret == 0) + ret = t_ret; } +not_priv: /* Discard DB_MPOOLFILEs. 
*/ while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0) @@ -415,25 +511,25 @@ __memp_dbenv_refresh(dbenv) if (F_ISSET(dbenv, DB_ENV_PRIVATE)) { /* Discard REGION IDs. */ - reginfo = &dbmp->reginfo[0]; - mp = dbmp->reginfo[0].primary; - __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->regids)); + infop = &dbmp->reginfo[0]; + __memp_free(infop, NULL, R_ADDR(infop, mp->regids)); /* Discard the File table. */ - __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->ftab)); + __memp_free(infop, NULL, R_ADDR(infop, mp->ftab)); /* Discard Hash tables. */ - for (i = 0; i < dbmp->nreg; ++i) { - reginfo = &dbmp->reginfo[i]; - mp = reginfo->primary; - __memp_free(reginfo, NULL, R_ADDR(reginfo, mp->htab)); + for (i = 0; i < nreg; ++i) { + infop = &dbmp->reginfo[i]; + c_mp = infop->primary; + __memp_free(infop, NULL, R_ADDR(infop, c_mp->htab)); } } /* Detach from the region. */ - for (i = 0; i < dbmp->nreg; ++i) { - reginfo = &dbmp->reginfo[i]; - if ((t_ret = __db_r_detach(dbenv, reginfo, 0)) != 0 && ret == 0) + for (i = 0; i < nreg; ++i) { + infop = &dbmp->reginfo[i]; + if ((t_ret = + __env_region_detach(dbenv, infop, 0)) != 0 && ret == 0) ret = t_ret; } diff --git a/db/mp/mp_register.c b/db/mp/mp_register.c index 1ca5f8311..ef5269d42 100644 --- a/db/mp/mp_register.c +++ b/db/mp/mp_register.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_register.c,v 12.11 2006/08/24 14:46:15 bostic Exp $ + * $Id: mp_register.c,v 12.13 2007/05/17 15:15:45 bostic Exp $ */ #include "db_config.h" diff --git a/db/mp/mp_resize.c b/db/mp/mp_resize.c new file mode 100644 index 000000000..241f37e4b --- /dev/null +++ b/db/mp/mp_resize.c @@ -0,0 +1,559 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2006,2007 Oracle. All rights reserved. + * + * $Id: mp_resize.c,v 12.5 2007/06/05 11:55:28 mjc Exp $ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __memp_add_bucket __P((DB_MPOOL *)); +static int __memp_add_region __P((DB_MPOOL *)); +static int __memp_map_regions __P((DB_MPOOL *)); +static int __memp_merge_buckets + __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t)); +static int __memp_remove_bucket __P((DB_MPOOL *)); +static int __memp_remove_region __P((DB_MPOOL *)); + +/* + * PUBLIC: int __memp_get_bucket + * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t, REGINFO **, DB_MPOOL_HASH **)); + */ +int +__memp_get_bucket(dbmfp, pgno, infopp, hpp) + DB_MPOOLFILE *dbmfp; + db_pgno_t pgno; + REGINFO **infopp; + DB_MPOOL_HASH **hpp; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp, *mp; + REGINFO *infop; + roff_t mf_offset; + u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region; + u_int32_t *regids; + int ret; + + dbenv = dbmfp->dbenv; + dbmp = dbenv->mp_handle; + mf_offset = R_OFFSET(dbmp->reginfo, dbmfp->mfp); + mp = dbmp->reginfo[0].primary; + ret = 0; + + for (;;) { + nbuckets = mp->nbuckets; + MP_BUCKET(mf_offset, pgno, nbuckets, bucket); + + /* + * Once we work out which region we are looking in, we have to + * check that we have that region mapped, and that the version + * we have matches the ID in the main mpool region. Otherwise + * we have to go and map in any regions that don't match and + * retry. 
+ */ + region = NREGION(mp, bucket); + regids = R_ADDR(dbmp->reginfo, mp->regids); + + for (;;) { + infop = *infopp = &dbmp->reginfo[region]; + c_mp = infop->primary; + + /* If we have the correct region mapped, we're done. */ + if (c_mp != NULL && regids[region] == infop->id) + break; + if ((ret = __memp_map_regions(dbmp)) != 0) + return (ret); + } + + /* If our caller wants the hash bucket, lock it here. */ + if (hpp != NULL) { + hp = R_ADDR(infop, c_mp->htab); + hp = &hp[bucket - region * mp->htab_buckets]; + + MUTEX_LOCK(dbenv, hp->mtx_hash); + + /* + * Check that we still have the correct region mapped. + */ + if (regids[region] != infop->id) { + MUTEX_UNLOCK(dbenv, hp->mtx_hash); + continue; + } + + /* + * Now that the bucket is locked, we need to check that + * the cache has not been resized while we waited. + */ + new_nbuckets = mp->nbuckets; + if (nbuckets != new_nbuckets) { + MP_BUCKET(mf_offset, pgno, new_nbuckets, + new_bucket); + + if (new_bucket != bucket) { + MUTEX_UNLOCK(dbenv, hp->mtx_hash); + continue; + } + } + + *hpp = hp; + } + + break; + } + + return (ret); +} + +static int +__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket) + DB_MPOOL *dbmp; + u_int32_t new_nbuckets, old_bucket, new_bucket; +{ + BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp; + DB_ENV *dbenv; + DB_MPOOL_HASH *new_hp, *old_hp; + MPOOL *mp, *new_mp, *old_mp; + MPOOLFILE *mfp; + REGINFO *new_infop, *old_infop; + u_int32_t bucket, high_mask, new_region, old_region; + int ret; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + new_bhp = NULL; + ret = 0; + + MP_MASK(new_nbuckets, high_mask); + + old_region = NREGION(mp, old_bucket); + old_infop = &dbmp->reginfo[old_region]; + old_mp = old_infop->primary; + old_hp = R_ADDR(old_infop, old_mp->htab); + old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets]; + + new_region = NREGION(mp, new_bucket); + new_infop = &dbmp->reginfo[new_region]; + new_mp = new_infop->primary; + new_hp = R_ADDR(new_infop, new_mp->htab); + new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets]; + + /* + * Before merging, we need to check that there are no old buffers left + * in the target hash bucket after a previous split. + */ +free_old: + MUTEX_LOCK(dbenv, new_hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) { + MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket); + + if (bucket != new_bucket) { + /* + * There is no way that an old buffer can be locked + * after a split, since everyone will look for it in + * the new hash bucket. + */ + DB_ASSERT(dbenv, !F_ISSET(bhp, BH_LOCKED | BH_DIRTY) && + bhp->ref == 0); + if ((ret = __memp_bhfree(dbmp, + new_infop, new_hp, bhp, BH_FREE_FREEMEM)) != 0) { + MUTEX_UNLOCK(dbenv, new_hp->mtx_hash); + return (ret); + } + + /* + * The free has modified the list of buffers and + * dropped the mutex. We need to start again. + */ + goto free_old; + } + } + MUTEX_UNLOCK(dbenv, new_hp->mtx_hash); + + /* + * Before we begin, make sure that all of the buffers we care about are + * not in use and not frozen. We do this because we can't drop the old + * hash bucket mutex once we start moving buffers around. 
+ */ +retry: MUTEX_LOCK(dbenv, old_hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) { + MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno), + new_nbuckets, high_mask, bucket); + + if (bucket == new_bucket && + (F_ISSET(bhp, BH_LOCKED) || bhp->ref != 0)) { + MUTEX_UNLOCK(dbenv, old_hp->mtx_hash); + __os_yield(dbenv); + goto retry; + } else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) { + if (BH_OBSOLETE(bhp, old_hp->old_reader)) + alloc_bhp = NULL; + else { + ++bhp->ref; + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + MUTEX_UNLOCK(dbenv, old_hp->mtx_hash); + if ((ret = __memp_alloc(dbmp, + old_infop, mfp, 0, NULL, &alloc_bhp)) != 0) + return (ret); + MUTEX_LOCK(dbenv, old_hp->mtx_hash); + } + if ((ret = __memp_bh_thaw(dbmp, + old_infop, old_hp, bhp, alloc_bhp)) != 0) { + MUTEX_UNLOCK(dbenv, old_hp->mtx_hash); + return (ret); + } + + /* + * We've dropped the mutex in order to thaw, so we need + * to go back to the beginning and check that all of + * the buffers we care about are still unlocked and + * unreferenced. + */ + MUTEX_UNLOCK(dbenv, old_hp->mtx_hash); + goto retry; + } + } + + /* + * We now know that all of the buffers we care about are unlocked and + * unreferenced. Go ahead and copy them. + */ + SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) { + MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno), + new_nbuckets, high_mask, bucket); + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* + * We ignore buffers that don't hash to the new bucket. We + * could also ignore clean buffers which are not part of a + * multiversion chain as long as they have a backing file. + */ + if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) && + SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file)) + continue; + + for (current_bhp = bhp, next_bhp = NULL; + current_bhp != NULL; + current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh), + next_bhp = alloc_bhp) { + if ((ret = __memp_alloc(dbmp, + new_infop, mfp, 0, NULL, &alloc_bhp)) != 0) + break; + + alloc_bhp->ref = current_bhp->ref; + alloc_bhp->ref_sync = current_bhp->ref_sync; + alloc_bhp->priority = current_bhp->priority; + alloc_bhp->pgno = current_bhp->pgno; + alloc_bhp->mf_offset = current_bhp->mf_offset; + alloc_bhp->flags = current_bhp->flags; + alloc_bhp->td_off = current_bhp->td_off; + + /* + * We've duplicated the buffer, so now we need to + * update reference counts, including the counts in the + * per-MPOOLFILE and the transaction detail (for MVCC + * buffers). + */ + MUTEX_LOCK(dbenv, mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(dbenv, mfp->mutex); + + if (alloc_bhp->td_off != INVALID_ROFF && + (ret = __txn_add_buffer(dbenv, + R_ADDR(&dbenv->tx_handle->reginfo, + alloc_bhp->td_off))) != 0) + break; + + memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize); + + /* + * We build up the MVCC chain first, then insert the + * head (stored in new_bhp) once. + */ + if (next_bhp == NULL) { + SH_CHAIN_INIT(alloc_bhp, vc); + new_bhp = alloc_bhp; + } else + SH_CHAIN_INSERT_BEFORE( + next_bhp, alloc_bhp, vc, __bh); + } + + MUTEX_LOCK(dbenv, new_hp->mtx_hash); + SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq); + if (F_ISSET(new_bhp, BH_DIRTY)) + ++new_hp->hash_page_dirty; + + /* + * We're doing an insertion sort, so it is O(N**2), but since + * buckets should be small, that should not matter. When + * splitting a bucket, we traverse in priority order and append + * to the new bucket, and __memp_bucket_reorder is O(1) in that + * case. 
+ */ + __memp_bucket_reorder(dbenv, new_hp, new_bhp); + MUTEX_UNLOCK(dbenv, new_hp->mtx_hash); + + if (F_ISSET(bhp, BH_DIRTY)) { + F_CLR(bhp, BH_DIRTY); + --old_hp->hash_page_dirty; + } + } + + if (ret == 0) + mp->nbuckets = new_nbuckets; + MUTEX_UNLOCK(dbenv, old_hp->mtx_hash); + + return (ret); +} + +static int +__memp_add_bucket(dbmp) + DB_MPOOL *dbmp; +{ + DB_ENV *dbenv; + MPOOL *mp; + u_int32_t high_mask, new_bucket, old_bucket; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + + new_bucket = mp->nbuckets; + /* We should always be adding buckets to the last region. */ + DB_ASSERT(dbenv, NREGION(mp, new_bucket) == mp->nreg - 1); + MP_MASK(mp->nbuckets, high_mask); + old_bucket = new_bucket & (high_mask >> 1); + + /* + * With fixed-sized regions, the new region is always smaller than the + * existing total cache size, so buffers always need to be copied. If + * we implement variable region sizes, it's possible that we will be + * splitting a hash bucket in the new region. Catch that here. + */ + DB_ASSERT(dbenv, NREGION(mp, old_bucket) != NREGION(mp, new_bucket)); + + return (__memp_merge_buckets(dbmp, mp->nbuckets + 1, + old_bucket, new_bucket)); +} + +static int +__memp_add_region(dbmp) + DB_MPOOL *dbmp; +{ + DB_ENV *dbenv; + MPOOL *mp; + REGINFO *infop; + int ret; + roff_t reg_size; + u_int i; + u_int32_t *regids; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + /* All cache regions are the same size. */ + reg_size = dbmp->reginfo[0].rp->size; + ret = 0; + + infop = &dbmp->reginfo[mp->nreg]; + infop->dbenv = dbenv; + infop->type = REGION_TYPE_MPOOL; + infop->id = INVALID_REGION_ID; + infop->flags = REGION_CREATE_OK; + if ((ret = __env_region_attach(dbenv, infop, reg_size)) != 0) + return (ret); + if ((ret = __memp_init(dbenv, + dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0) + return (ret); + regids = R_ADDR(dbmp->reginfo, mp->regids); + regids[mp->nreg++] = infop->id; + + for (i = 0; i < mp->htab_buckets; i++) + if ((ret = __memp_add_bucket(dbmp)) != 0) + break; + + return (ret); +} + +static int +__memp_remove_bucket(dbmp) + DB_MPOOL *dbmp; +{ + DB_ENV *dbenv; + MPOOL *mp; + u_int32_t high_mask, new_bucket, old_bucket; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + + old_bucket = mp->nbuckets - 1; + + /* We should always be removing buckets from the last region. */ + DB_ASSERT(dbenv, NREGION(mp, old_bucket) == mp->nreg - 1); + MP_MASK(mp->nbuckets - 1, high_mask); + new_bucket = old_bucket & (high_mask >> 1); + + return (__memp_merge_buckets(dbmp, mp->nbuckets - 1, + old_bucket, new_bucket)); +} + +static int +__memp_remove_region(dbmp) + DB_MPOOL *dbmp; +{ + DB_ENV *dbenv; + MPOOL *mp; + REGINFO *infop; + int ret; + u_int i; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + ret = 0; + + if (mp->nreg == 1) { + __db_errx(dbenv, "cannot remove the last cache"); + return (EINVAL); + } + + for (i = 0; i < mp->htab_buckets; i++) + if ((ret = __memp_remove_bucket(dbmp)) != 0) + return (ret); + + /* Detach from the region then destroy it. 
*/ + infop = &dbmp->reginfo[--mp->nreg]; + return (__env_region_detach(dbenv, infop, 1)); +} + +static int +__memp_map_regions(dbmp) + DB_MPOOL *dbmp; +{ + DB_ENV *dbenv; + MPOOL *mp; + int ret; + u_int i; + u_int32_t *regids; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + regids = R_ADDR(dbmp->reginfo, mp->regids); + ret = 0; + + for (i = 1; i < mp->nreg; ++i) { + if (dbmp->reginfo[i].primary != NULL && + dbmp->reginfo[i].id == regids[i]) + continue; + + if (dbmp->reginfo[i].primary != NULL) + ret = __env_region_detach(dbenv, &dbmp->reginfo[i], 0); + + dbmp->reginfo[i].dbenv = dbenv; + dbmp->reginfo[i].type = REGION_TYPE_MPOOL; + dbmp->reginfo[i].id = regids[i]; + dbmp->reginfo[i].flags = REGION_JOIN_OK; + if ((ret = + __env_region_attach(dbenv, &dbmp->reginfo[i], 0)) != 0) + return (ret); + dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i], + dbmp->reginfo[i].rp->primary); + } + + for (; i < mp->max_nreg; i++) + if (dbmp->reginfo[i].primary != NULL && + (ret = __env_region_detach(dbenv, + &dbmp->reginfo[i], 0)) != 0) + break; + + return (ret); +} + +/* + * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t)); + */ +int +__memp_resize(dbmp, gbytes, bytes) + DB_MPOOL *dbmp; + u_int32_t gbytes, bytes; +{ + DB_ENV *dbenv; + MPOOL *mp; + int ret; + u_int32_t ncache; + roff_t reg_size, total_size; + + dbenv = dbmp->dbenv; + mp = dbmp->reginfo[0].primary; + reg_size = dbmp->reginfo[0].rp->size; + total_size = (roff_t)gbytes * GIGABYTE + bytes; + ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size); + + if (ncache < 1) + ncache = 1; + else if (ncache > mp->max_nreg) { + __db_errx(dbenv, + "cannot resize to %lu cache regions: maximum is %lu", + (u_long)ncache, (u_long)mp->max_nreg); + return (EINVAL); + } + + ret = 0; + MUTEX_LOCK(dbenv, mp->mtx_resize); + while (mp->nreg != ncache) + if ((ret = (mp->nreg < ncache ? + __memp_add_region(dbmp) : + __memp_remove_region(dbmp))) != 0) + break; + MUTEX_UNLOCK(dbenv, mp->mtx_resize); + + return (ret); +} + +/* + * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); + */ +int +__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp) + DB_ENV *dbenv; + u_int32_t *max_gbytesp, *max_bytesp; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + roff_t reg_size, max_size; + + ENV_NOT_CONFIGURED(dbenv, + dbenv->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL); + + if (MPOOL_ON(dbenv)) { + dbmp = dbenv->mp_handle; + mp = dbmp->reginfo[0].primary; + reg_size = dbmp->reginfo[0].rp->size; + max_size = mp->max_nreg * reg_size; + *max_gbytesp = (u_int32_t)(max_size / GIGABYTE); + *max_bytesp = (u_int32_t)(max_size % GIGABYTE); + } else { + *max_gbytesp = dbenv->mp_max_gbytes; + *max_bytesp = dbenv->mp_max_bytes; + } + + return (0); +} + +/* + * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t)); + */ +int +__memp_set_cache_max(dbenv, max_gbytes, max_bytes) + DB_ENV *dbenv; + u_int32_t max_gbytes, max_bytes; +{ + ENV_ILLEGAL_AFTER_OPEN(dbenv, "DB_ENV->set_cache_max"); + dbenv->mp_max_gbytes = max_gbytes; + dbenv->mp_max_bytes = max_bytes; + + return (0); +} diff --git a/db/mp/mp_stat.c b/db/mp/mp_stat.c index b4d4544b5..0e7b6c237 100644 --- a/db/mp/mp_stat.c +++ b/db/mp/mp_stat.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. 
* - * $Id: mp_stat.c,v 12.28 2006/09/11 14:53:42 bostic Exp $ + * $Id: mp_stat.c,v 12.36 2007/06/22 17:41:29 bostic Exp $ */ #include "db_config.h" @@ -104,10 +103,10 @@ __memp_stat(dbenv, gspp, fspp, flags) * a per-cache basis. Note that configuration information * may be modified at any time, and so we have to lock. */ - c_mp = dbmp->reginfo[0].primary; - sp->st_gbytes = c_mp->stat.st_gbytes; - sp->st_bytes = c_mp->stat.st_bytes; - sp->st_ncache = dbmp->nreg; + sp->st_gbytes = mp->stat.st_gbytes; + sp->st_bytes = mp->stat.st_bytes; + sp->st_ncache = mp->nreg; + sp->st_max_ncache = mp->max_nreg; sp->st_regsize = dbmp->reginfo[0].rp->size; MPOOL_SYSTEM_LOCK(dbenv); @@ -165,7 +164,8 @@ __memp_stat(dbenv, gspp, fspp, flags) c_mp->stat.st_alloc_max_pages; if (LF_ISSET(DB_STAT_CLEAR)) { - __mutex_clear(dbenv, c_mp->mtx_region); + if (!LF_ISSET(DB_STAT_SUBSYSTEM)) + __mutex_clear(dbenv, c_mp->mtx_region); MPOOL_SYSTEM_LOCK(dbenv); st_bytes = c_mp->stat.st_bytes; @@ -388,9 +388,10 @@ __memp_stat_print(dbenv, flags) int ret; orig_flags = flags; - LF_CLR(DB_STAT_CLEAR); + LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM); if (flags == 0 || LF_ISSET(DB_STAT_ALL)) { - ret = __memp_print_stats(dbenv, orig_flags); + ret = __memp_print_stats(dbenv, + LF_ISSET(DB_STAT_ALL) ? flags : orig_flags); if (flags == 0 || ret != 0) return (ret); } @@ -423,6 +424,7 @@ __memp_print_stats(dbenv, flags) __db_dlbytes(dbenv, "Total cache size", (u_long)gsp->st_gbytes, (u_long)0, (u_long)gsp->st_bytes); __db_dl(dbenv, "Number of caches", (u_long)gsp->st_ncache); + __db_dl(dbenv, "Maximum number of caches", (u_long)gsp->st_max_ncache); __db_dlbytes(dbenv, "Pool individual cache size", (u_long)0, (u_long)0, (u_long)gsp->st_regsize); __db_dlbytes(dbenv, "Maximum memory-mapped file size", @@ -551,7 +553,7 @@ __memp_print_all(dbenv, flags) MPOOL_SYSTEM_LOCK(dbenv); - __db_print_reginfo(dbenv, dbmp->reginfo, "Mpool"); + __db_print_reginfo(dbenv, dbmp->reginfo, "Mpool", flags); __db_msg(dbenv, "%s", DB_GLOBAL(db_line)); __db_msg(dbenv, "MPOOL structure:"); @@ -567,7 +569,7 @@ __memp_print_all(dbenv, flags) __db_msg(dbenv, "DB_MPOOL handle information:"); __mutex_print_debug_single( dbenv, "DB_MPOOL handle mutex", dbmp->mutex, flags); - STAT_ULONG("Underlying cache regions", dbmp->nreg); + STAT_ULONG("Underlying cache regions", mp->nreg); __db_msg(dbenv, "%s", DB_GLOBAL(db_line)); __db_msg(dbenv, "DB_MPOOLFILE structures:"); @@ -709,9 +711,11 @@ __memp_print_hash(dbenv, dbmp, reginfo, fmap, flags) bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { MUTEX_LOCK(dbenv, hp->mtx_hash); if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) { - __db_msgadd(dbenv, &mb, "bucket %lu: %lu, %lu ", + __db_msgadd(dbenv, &mb, + "bucket %lu: %lu, %lu (%lu dirty)", (u_long)bucket, (u_long)hp->hash_io_wait, - (u_long)hp->hash_priority); + (u_long)hp->hash_priority, + (u_long)hp->hash_page_dirty); if (hp->hash_frozen != 0) __db_msgadd(dbenv, &mb, "(MVCC %lu/%lu/%lu) ", (u_long)hp->hash_frozen, @@ -822,7 +826,8 @@ __memp_stat_wait(dbenv, reginfo, mp, mstat, flags) mstat->st_hash_max_wait = tmp_wait; mstat->st_hash_max_nowait = tmp_nowait; } - if (LF_ISSET(DB_STAT_CLEAR)) + if (LF_ISSET(DB_STAT_CLEAR | + DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR) __mutex_clear(dbenv, hp->mtx_hash); mstat->st_io_wait += hp->hash_io_wait; diff --git a/db/mp/mp_sync.c b/db/mp/mp_sync.c index 898ae5b6d..5db83fc7b 100644 --- a/db/mp/mp_sync.c +++ b/db/mp/mp_sync.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996-2006 - * Oracle Corporation. All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_sync.c,v 12.24 2006/08/24 14:46:15 bostic Exp $ + * $Id: mp_sync.c,v 12.52 2007/06/01 18:32:44 bostic Exp $ */ #include "db_config.h" @@ -12,6 +11,8 @@ #include "db_int.h" #include "dbinc/log.h" #include "dbinc/mp.h" +#include "dbinc/db_page.h" +#include "dbinc/hash.h" typedef struct { DB_MPOOL_HASH *track_hp; /* Hash bucket. */ @@ -21,8 +22,8 @@ typedef struct { } BH_TRACK; static int __bhcmp __P((const void *, const void *)); -static int __memp_close_flush_files __P((DB_ENV *, DB_MPOOL *, int)); -static int __memp_sync_files __P((DB_ENV *, DB_MPOOL *)); +static int __memp_close_flush_files __P((DB_ENV *, int)); +static int __memp_sync_files __P((DB_ENV *)); static int __memp_sync_file __P((DB_ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t)); @@ -93,7 +94,7 @@ __memp_sync_pp(dbenv, lsnp) dbenv->lg_handle, "memp_sync", DB_INIT_LOG); ENV_ENTER(dbenv, ip); - REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, lsnp)), ret); + REPLICATION_WRAP(dbenv, (__memp_sync(dbenv, DB_SYNC_CACHE, lsnp)), ret); ENV_LEAVE(dbenv, ip); return (ret); } @@ -102,16 +103,17 @@ __memp_sync_pp(dbenv, lsnp) * __memp_sync -- * DB_ENV->memp_sync. * - * PUBLIC: int __memp_sync __P((DB_ENV *, DB_LSN *)); + * PUBLIC: int __memp_sync __P((DB_ENV *, u_int32_t, DB_LSN *)); */ int -__memp_sync(dbenv, lsnp) +__memp_sync(dbenv, flags, lsnp) DB_ENV *dbenv; + u_int32_t flags; DB_LSN *lsnp; { DB_MPOOL *dbmp; MPOOL *mp; - int ret; + int interrupted, ret; dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; @@ -128,10 +130,11 @@ __memp_sync(dbenv, lsnp) MPOOL_SYSTEM_UNLOCK(dbenv); } - if ((ret = __memp_sync_int(dbenv, NULL, 0, DB_SYNC_CACHE, NULL)) != 0) + if ((ret = + __memp_sync_int(dbenv, NULL, 0, flags, NULL, &interrupted)) != 0) return (ret); - if (lsnp != NULL) { + if (!interrupted && lsnp != NULL) { MPOOL_SYSTEM_LOCK(dbenv); if (LOG_COMPARE(lsnp, &mp->lsn) > 0) mp->lsn = *lsnp; @@ -195,7 +198,8 @@ __memp_fsync(dbmfp) if (mfp->file_written == 0) return (0); - return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); + return (__memp_sync_int( + dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)); } /* @@ -209,6 +213,8 @@ __mp_xxx_fh(dbmfp, fhp) DB_MPOOLFILE *dbmfp; DB_FH **fhp; { + int ret; + /* * This is a truly spectacular layering violation, intended ONLY to * support compatibility for the DB 1.85 DB->fd call. @@ -226,7 +232,10 @@ __mp_xxx_fh(dbmfp, fhp) if ((*fhp = dbmfp->fhp) != NULL) return (0); - return (__memp_sync_int(dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL)); + if ((ret = __memp_sync_int( + dbmfp->dbenv, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0) + *fhp = dbmfp->fhp; + return (ret); } /* @@ -234,14 +243,14 @@ __mp_xxx_fh(dbmfp, fhp) * Mpool sync internal function. 
* * PUBLIC: int __memp_sync_int __P((DB_ENV *, - * PUBLIC: DB_MPOOLFILE *, u_int32_t, db_sync_op, u_int32_t *)); + * PUBLIC: DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *)); */ int -__memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) +__memp_sync_int(dbenv, dbmfp, trickle_max, flags, wrote_totalp, interruptedp) DB_ENV *dbenv; DB_MPOOLFILE *dbmfp; - u_int32_t trickle_max, *wrotep; - db_sync_op op; + u_int32_t trickle_max, flags, *wrote_totalp; + int *interruptedp; { BH *bhp; BH_TRACK *bharray; @@ -251,20 +260,32 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) MPOOLFILE *mfp; db_mutex_t mutex; roff_t last_mf_offset; - u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote; - int filecnt, hb_lock, maxopenfd, maxwrite, maxwrite_sleep; - int pass, ret, t_ret, wait_cnt, write_cnt; + u_int32_t ar_cnt, ar_max, dirty, i, n_cache, remaining, wrote_total; + int filecnt, maxopenfd, pass, required_write, ret, t_ret; + int wait_cnt, wrote_cnt; dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; last_mf_offset = INVALID_ROFF; - filecnt = pass = wrote = 0; + filecnt = pass = wrote_total = 0; + + if (wrote_totalp != NULL) + *wrote_totalp = 0; + if (interruptedp != NULL) + *interruptedp = 0; + + /* + * If we're flushing the cache, it's a checkpoint or we're flushing a + * specific file, we really have to write the blocks and we have to + * confirm they made it to disk. Otherwise, we can skip a block if + * it's hard to get. + */ + required_write = LF_ISSET(DB_SYNC_CACHE | + DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT); /* Get shared configuration information. */ MPOOL_SYSTEM_LOCK(dbenv); maxopenfd = mp->mp_maxopenfd; - maxwrite = mp->mp_maxwrite; - maxwrite_sleep = mp->mp_maxwrite_sleep; MPOOL_SYSTEM_UNLOCK(dbenv); /* Assume one dirty page per bucket. */ @@ -284,43 +305,60 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); for (i = 0; i < c_mp->htab_buckets; i++, hp++) { /* - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. We - * can ignore empty buckets because we only need write - * buffers that were dirty before we started. + * We can check for empty buckets before locking as + * we only care if the pointer is zero or non-zero. + * We can ignore empty or clean buckets because we + * only need write buffers that were dirty before + * we started. */ +#ifdef DIAGNOSTIC if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) +#else + if (hp->hash_page_dirty == 0) +#endif continue; + dirty = 0; MUTEX_LOCK(dbenv, hp->mtx_hash); SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { /* Always ignore clean pages. */ if (!F_ISSET(bhp, BH_DIRTY)) continue; + dirty++; mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); /* - * Ignore in-memory files, even if they are - * temp files to whom a backing file has been - * allocated. + * Ignore in-memory files, unless the file is + * specifically being flushed. */ - if (mfp->no_backing_file || + if (mfp->no_backing_file) + continue; + if (!LF_ISSET(DB_SYNC_FILE) && F_ISSET(mfp, MP_TEMP)) continue; /* - * If we're flushing a specific file, see if - * this page is from that file. + * Ignore files that aren't involved in DB's + * transactional operations during checkpoints. */ - if (dbmfp != NULL && mfp != dbmfp->mfp) + if (LF_ISSET(DB_SYNC_CHECKPOINT) && + mfp->lsn_off == DB_LSN_OFF_NOTSET) continue; /* - * Ignore files that aren't involved in DB's - * transactional operations during checkpoints. 
+ * Ignore files that aren't Queue extent files + * if we're flushing a Queue file with extents. */ - if (dbmfp == NULL && mfp->lsn_off == -1) + if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) && + !F_ISSET(mfp, MP_EXTENT)) + continue; + + /* + * If we're flushing a specific file, see if + * this page is from that file. + */ + if (dbmfp != NULL && mfp != dbmfp->mfp) continue; /* Track the buffer, we want it. */ @@ -343,10 +381,25 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) ar_max *= 2; } } + DB_ASSERT(dbenv, dirty == hp->hash_page_dirty); + if (dirty != hp->hash_page_dirty) { + __db_errx(dbenv, + "memp_sync: correcting dirty count %lu %lu", + (u_long)hp->hash_page_dirty, (u_long)dirty); + hp->hash_page_dirty = dirty; + } MUTEX_UNLOCK(dbenv, hp->mtx_hash); if (ret != 0) goto err; + + /* Check if the call has been interrupted. */ + if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET( + mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) { + if (interruptedp != NULL) + *interruptedp = 1; + goto err; + } } } @@ -366,7 +419,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) * If we're trickling buffers, only write enough to reach the correct * percentage. */ - if (op == DB_SYNC_TRICKLE && ar_cnt > trickle_max) + if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max) ar_cnt = trickle_max; /* @@ -385,7 +438,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) * out its hash bucket pointer so we don't process a slot more than * once. */ - for (i = pass = write_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) { + for (i = pass = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) { if (i >= ar_cnt) { i = 0; ++pass; @@ -429,44 +482,40 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) */ if (F_ISSET(bhp, BH_LOCKED) || (bhp->ref != 0 && pass < 2)) { MUTEX_UNLOCK(dbenv, mutex); - if (op != DB_SYNC_CACHE && op != DB_SYNC_FILE) { + if (!required_write) { --remaining; bharray[i].track_hp = NULL; } continue; } - /* - * The buffer is dirty and may also be pinned. - * - * Set the sync wait-for count, used to count down outstanding - * references to this buffer as they are returned to the cache. - */ - bhp->ref_sync = bhp->ref; - /* Pin the buffer into memory and lock it. */ ++bhp->ref; F_SET(bhp, BH_LOCKED); /* - * Unlock the hash bucket and wait for the wait-for count to - * go to 0. No new thread can acquire the buffer because we - * have it locked. + * If the buffer is referenced by another thread, set the sync + * wait-for count (used to count down outstanding references to + * this buffer as they are returned to the cache), then unlock + * the hash bucket and wait for the count to go to 0. No other + * thread can acquire the buffer because we have it locked. * * If a thread attempts to re-pin a page, the wait-for count - * will never go to 0 (the thread spins on our buffer lock, + * will never go to 0 (that thread spins on our buffer lock, * while we spin on the thread's ref count). Give up if we - * don't get the buffer in 3 seconds, we can try again later. + * don't get the buffer in 3 seconds, we'll try again later. * * If, when the wait-for count goes to 0, the buffer is found * to be dirty, write it. 
*/ - MUTEX_UNLOCK(dbenv, mutex); - for (wait_cnt = 1; - bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt) - __os_sleep(dbenv, 1, 0); - MUTEX_LOCK(dbenv, mutex); - hb_lock = 1; + bhp->ref_sync = bhp->ref - 1; + if (bhp->ref_sync != 0) { + MUTEX_UNLOCK(dbenv, mutex); + for (wait_cnt = 1; + bhp->ref_sync != 0 && wait_cnt < 4; ++wait_cnt) + __os_sleep(dbenv, 1, 0); + MUTEX_LOCK(dbenv, mutex); + } /* * If we've switched files, check to see if we're configured @@ -476,7 +525,7 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) if (++filecnt >= maxopenfd) { filecnt = 0; if ((t_ret = __memp_close_flush_files( - dbenv, dbmp, 1)) != 0 && ret == 0) + dbenv, 1)) != 0 && ret == 0) ret = t_ret; } last_mf_offset = bhp->mf_offset; @@ -496,28 +545,18 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) * dirty, we write it. We only try to write the buffer once. */ if (bhp->ref_sync == 0 && F_ISSET(bhp, BH_DIRTY)) { - MUTEX_UNLOCK(dbenv, mutex); - hb_lock = 0; - mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); if ((t_ret = - __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) - ++wrote; - else { + __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) { + ++wrote_cnt; + ++wrote_total; + } else { if (ret == 0) ret = t_ret; __db_errx (dbenv, "%s: unable to flush page: %lu", __memp_fns(dbmp, mfp), (u_long)bhp->pgno); - } - /* - * Avoid saturating the disk, sleep once we've done - * some number of writes. - */ - if (maxwrite != 0 && ++write_cnt >= maxwrite) { - write_cnt = 0; - __os_sleep(dbenv, 0, (u_long)maxwrite_sleep); } } @@ -525,18 +564,9 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) * If ref_sync count never went to 0, the buffer was written * by another thread, or the write failed, we still have the * buffer locked. - * - * We may or may not currently hold the hash bucket mutex. If - * the __memp_bhwrite -> __memp_pgwrite call was successful, - * __memp_pgwrite will have acquired the hash bucket lock; all - * other call paths will leave us without the hash bucket lock. */ - if (F_ISSET(bhp, BH_LOCKED)) { - if (!hb_lock) - MUTEX_LOCK(dbenv, mutex); - + if (F_ISSET(bhp, BH_LOCKED)) F_CLR(bhp, BH_LOCKED); - } /* * Reset the ref_sync count regardless of our success, we're @@ -548,7 +578,8 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) --bhp->ref; /* - * If a thread of control is waiting on this buffer, wake it up. + * If a thread of control is waiting in this hash bucket, wake + * it up. */ if (F_ISSET(hp, IO_WAITER)) { F_CLR(hp, IO_WAITER); @@ -557,29 +588,51 @@ __memp_sync_int(dbenv, dbmfp, trickle_max, op, wrotep) /* Release the hash bucket mutex. */ MUTEX_UNLOCK(dbenv, mutex); + + /* Check if the call has been interrupted. */ + if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && + FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) { + if (interruptedp != NULL) + *interruptedp = 1; + goto err; + } + + /* + * Sleep after some number of writes to avoid disk saturation. + * Don't cache the max writes value, an application shutting + * down might reset the value in order to do a fast flush or + * checkpoint. + */ + if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) && + !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) && + mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) { + wrote_cnt = 0; + __os_sleep( + dbenv, 0, (u_long)mp->mp_maxwrite_sleep); + } } done: /* - * If doing a checkpoint or flushing a file for the application, we - * have to force the pages to disk. 
We don't do this as we go along - because we want to give the OS as much time as possible to lazily - flush, and because we have to flush files that might not even have - had dirty buffers in the cache, so we have to walk the files list. + * If a write is required, we have to force the pages to disk. We + * don't do this as we go along because we want to give the OS as + * much time as possible to lazily flush, and because we have to flush + * files that might not even have had dirty buffers in the cache, so + * we have to walk the files list. */ - if (ret == 0 && (op == DB_SYNC_CACHE || op == DB_SYNC_FILE)) { + if (ret == 0 && required_write) { if (dbmfp == NULL) - ret = __memp_sync_files(dbenv, dbmp); + ret = __memp_sync_files(dbenv); else ret = __os_fsync(dbenv, dbmfp->fhp); } /* If we've opened files to flush pages, close them. */ - if ((t_ret = __memp_close_flush_files(dbenv, dbmp, 0)) != 0 && ret == 0) + if ((t_ret = __memp_close_flush_files(dbenv, 0)) != 0 && ret == 0) ret = t_ret; err: __os_free(dbenv, bharray); - if (wrotep != NULL) - *wrotep = wrote; + if (wrote_totalp != NULL) + *wrote_totalp = wrote_total; return (ret); } @@ -651,28 +704,23 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags) /* If we don't find a handle we can use, open one. */ if (dbmfp == NULL) { - if ((ret = __memp_mf_sync(dbmp, mfp, 0)) != 0) { + if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) { __db_err(dbenv, ret, "%s: unable to flush", (char *) R_ADDR(dbmp->reginfo, mfp->path_off)); } - } else { + } else ret = __os_fsync(dbenv, dbmfp->fhp); - if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0) - ret = t_ret; - } - /* * Re-acquire the MPOOLFILE mutex, we need it to modify the * reference count. */ MUTEX_LOCK(dbenv, mfp->mutex); - --mfp->mpf_cnt; /* - * If we wrote the file and there are no open handles (or there - * is a single open handle, and it's the one we opened to write + * If we wrote the file and there are no other references (or there + * is a single reference, and it's the one we opened to write * buffers during checkpoint), clear the file_written flag. We * do this so that applications opening thousands of files don't * loop here opening and flushing those files during checkpoint. @@ -684,7 +732,7 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags) * the region lock, no possibility of another thread of control * racing with us to open a MPOOLFILE. */ - if (mfp->mpf_cnt == 0 || (mfp->mpf_cnt == 1 && + if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 && dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) { mfp->file_written = 0; @@ -696,31 +744,44 @@ __memp_sync_file(dbenv, mfp, argp, countp, flags) * I mean, what are the chances that there aren't any * buffers in the pool? Regardless, it might happen.) */ - if (mfp->mpf_cnt == 0 && mfp->block_cnt == 0) + if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0) *(int *)argp = 1; } - /* Unlock the MPOOLFILE, and move to the next entry. */ + /* + * If we found the file, we must close it in case we are the last + * reference to the dbmfp. NOTE: since we have incremented + * mfp->mpf_cnt, this cannot be the last reference to the mfp. + * This is important since we are called with the hash bucket + * locked. The mfp will get freed via the cleanup pass. + */ + if (dbmfp != NULL && (t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0) + ret = t_ret; + + --mfp->mpf_cnt; + + /* Unlock the MPOOLFILE. */ MUTEX_UNLOCK(dbenv, mfp->mutex); - return (0); + return (ret); } /* * __memp_sync_files -- * Sync all the files in the environment, open or not. 
*/ -static -int __memp_sync_files(dbenv, dbmp) +static int +__memp_sync_files(dbenv) DB_ENV *dbenv; - DB_MPOOL *dbmp; { + DB_MPOOL *dbmp; DB_MPOOL_HASH *hp; MPOOL *mp; MPOOLFILE *mfp, *next_mfp; int i, need_discard_pass, ret; - need_discard_pass = ret = 0; + dbmp = dbenv->mp_handle; mp = dbmp->reginfo[0].primary; + need_discard_pass = ret = 0; ret = __memp_walk_files(dbenv, mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_NOERROR); @@ -734,7 +795,7 @@ int __memp_sync_files(dbenv, dbmp) hp = R_ADDR(dbmp->reginfo, mp->ftab); for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) { - MUTEX_LOCK(dbenv, hp->mtx_hash); +retry: MUTEX_LOCK(dbenv, hp->mtx_hash); for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket, __mpoolfile); mfp != NULL; mfp = next_mfp) { next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile); @@ -743,13 +804,17 @@ int __memp_sync_files(dbenv, dbmp) * without a mutex on the MPOOLFILE. If likely to * succeed, lock the MPOOLFILE down and look for real. */ - if (mfp->block_cnt != 0 || mfp->mpf_cnt != 0) + if (mfp->deadfile || + mfp->block_cnt != 0 || mfp->mpf_cnt != 0) continue; MUTEX_LOCK(dbenv, mfp->mutex); - if (mfp->block_cnt == 0 && mfp->mpf_cnt == 0) + if (!mfp->deadfile && + mfp->block_cnt == 0 && mfp->mpf_cnt == 0) { + MUTEX_UNLOCK(dbenv, hp->mtx_hash); (void)__memp_mf_discard(dbmp, mfp); - else + goto retry; + } else MUTEX_UNLOCK(dbenv, mfp->mutex); } MUTEX_UNLOCK(dbenv, hp->mtx_hash); @@ -764,28 +829,36 @@ int __memp_sync_files(dbenv, dbmp) * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int)); */ int -__memp_mf_sync(dbmp, mfp, region_locked) +__memp_mf_sync(dbmp, mfp, locked) DB_MPOOL *dbmp; MPOOLFILE *mfp; - int region_locked; + int locked; { DB_ENV *dbenv; DB_FH *fhp; + DB_MPOOL_HASH *hp; + MPOOL *mp; int ret, t_ret; char *rpath; + COMPQUIET(hp, NULL); dbenv = dbmp->dbenv; /* - * We need to be holding the region lock: we're using the path name + * We need to be holding the hash lock: we're using the path name * and __memp_nameop might try and rename the file. */ - if (!region_locked) - MPOOL_SYSTEM_LOCK(dbenv); + if (!locked) { + mp = dbmp->reginfo[0].primary; + hp = R_ADDR(dbmp->reginfo, mp->ftab); + hp += FNBUCKET( + R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN); + MUTEX_LOCK(dbenv, hp->mtx_hash); + } if ((ret = __db_appname(dbenv, DB_APP_DATA, R_ADDR(dbmp->reginfo, mfp->path_off), 0, NULL, &rpath)) == 0) { - if ((ret = __os_open(dbenv, rpath, 0, 0, &fhp)) == 0) { + if ((ret = __os_open(dbenv, rpath, 0, 0, 0, &fhp)) == 0) { ret = __os_fsync(dbenv, fhp); if ((t_ret = __os_closehandle(dbenv, fhp)) != 0 && ret == 0) @@ -794,8 +867,8 @@ __memp_mf_sync(dbmp, mfp, region_locked) __os_free(dbenv, rpath); } - if (!region_locked) - MPOOL_SYSTEM_UNLOCK(dbenv); + if (!locked) + MUTEX_UNLOCK(dbenv, hp->mtx_hash); return (ret); } @@ -805,15 +878,17 @@ __memp_mf_sync(dbmp, mfp, region_locked) * Close files opened only to flush buffers. */ static int -__memp_close_flush_files(dbenv, dbmp, dosync) +__memp_close_flush_files(dbenv, dosync) DB_ENV *dbenv; - DB_MPOOL *dbmp; int dosync; { + DB_MPOOL *dbmp; DB_MPOOLFILE *dbmfp; MPOOLFILE *mfp; int ret; + dbmp = dbenv->mp_handle; + /* * The routine exists because we must close files opened by sync to * flush buffers. There are two cases: first, extent files have to diff --git a/db/mp/mp_trickle.c b/db/mp/mp_trickle.c index d1d3853aa..cbe7af4f2 100644 --- a/db/mp/mp_trickle.c +++ b/db/mp/mp_trickle.c @@ -1,10 +1,9 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996-2006 - * Oracle Corporation. 
All rights reserved. + * Copyright (c) 1996,2007 Oracle. All rights reserved. * - * $Id: mp_trickle.c,v 12.9 2006/08/24 14:46:15 bostic Exp $ + * $Id: mp_trickle.c,v 12.16 2007/06/01 18:32:44 bostic Exp $ */ #include "db_config.h" @@ -89,15 +88,21 @@ __memp_trickle(dbenv, pct, nwrotep) if (total == 0 || dirty == 0) return (0); - clean = total - dirty; + /* + * The total number of pages is an exact number, but the dirty page + * count can change while we're walking the hash buckets, and it's + * even possible the dirty page count ends up larger than the total + * number of pages. + */ + clean = total > dirty ? total - dirty : 0; need_clean = (total * (u_int)pct) / 100; if (clean >= need_clean) return (0); need_clean -= clean; - ret = __memp_sync_int( - dbenv, NULL, need_clean, DB_SYNC_TRICKLE, &wrote); - mp->stat.st_page_trickle += wrote; + ret = __memp_sync_int(dbenv, NULL, + need_clean, DB_SYNC_TRICKLE | DB_SYNC_INTERRUPT_OK, &wrote, NULL); + STAT((mp->stat.st_page_trickle += wrote)); if (nwrotep != NULL) *nwrotep = (int)wrote;
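
As a usage footnote to the interfaces this commit introduces (__memp_set_cache_max, resizing through __memp_resize, the db_timeout_t argument to __memp_set_mp_max_write, and trickle writes through the reworked __memp_sync_int), the sketch below shows how an application reaches them through the public DB_ENV methods. It is illustrative only and not part of the patch: it assumes the 4.6-era public API (db_env_create, DB_ENV->set_cachesize, DB_ENV->set_cache_max, DB_ENV->set_mp_max_write, DB_ENV->memp_trickle), and the environment home "/tmp/dbhome" is a placeholder.

#include <stdio.h>
#include <stdlib.h>
#include <db.h>

int
main()
{
	DB_ENV *dbenv;
	int nwrote, ret;

	if ((ret = db_env_create(&dbenv, 0)) != 0) {
		fprintf(stderr, "db_env_create: %s\n", db_strerror(ret));
		return (EXIT_FAILURE);
	}

	/* Start with a single 32MB cache region. */
	if ((ret = dbenv->set_cachesize(dbenv, 0, 32 * 1024 * 1024, 1)) != 0)
		goto err;

	/*
	 * Reserve room to grow the cache to 1GB; this is what bounds
	 * max_nreg in __memp_max_regions above.
	 */
	if ((ret = dbenv->set_cache_max(dbenv, 1, 0)) != 0)
		goto err;

	/* Throttle sync writes: sleep 10ms (in usec) per 64-page batch. */
	if ((ret = dbenv->set_mp_max_write(dbenv, 64, 10000)) != 0)
		goto err;

	if ((ret = dbenv->open(dbenv, "/tmp/dbhome",
	    DB_CREATE | DB_INIT_MPOOL, 0)) != 0)
		goto err;

	/*
	 * Grow the cache at runtime; mpool adds region-sized chunks one
	 * at a time (the ncache argument is not used when resizing).
	 */
	if ((ret = dbenv->set_cachesize(dbenv, 0, 64 * 1024 * 1024, 0)) != 0)
		goto err;

	/* Trickle-write until at least 20% of the cache is clean. */
	if ((ret = dbenv->memp_trickle(dbenv, 20, &nwrote)) != 0)
		goto err;
	printf("trickle wrote %d pages\n", nwrote);

	return (dbenv->close(dbenv, 0) == 0 ? EXIT_SUCCESS : EXIT_FAILURE);

err:	fprintf(stderr, "DB_ENV: %s\n", db_strerror(ret));
	(void)dbenv->close(dbenv, 0);
	return (EXIT_FAILURE);
}

Whichever direction a later set_cachesize call moves the cache, the __memp_resize loop above walks mp->nreg toward the requested region count one region at a time, holding mtx_resize for the duration, which is why the per-bucket mutexes are preallocated for max_nreg regions in __memp_init.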