author | Zhang Qiang <qiang.z.zhang@intel.com> | 2012-05-29 12:22:00 +0800
committer | Zhang Qiang <qiang.z.zhang@intel.com> | 2012-05-29 12:22:00 +0800
commit | 02f0634ac29e19c68279e5544cac963e7f1203b8 (patch)
tree | b983472f94ef063cedf866d8ecfb55939171779d /mp
parent | e776056ea09ba0b6d9505ced6913c9190a12d632 (diff)
download | db4-2.0alpha.tar.gz db4-2.0alpha.tar.bz2 db4-2.0alpha.zip
Diffstat (limited to 'mp')
-rw-r--r-- | mp/mp_alloc.c | 622
-rw-r--r-- | mp/mp_bh.c | 639
-rw-r--r-- | mp/mp_fget.c | 1161
-rw-r--r-- | mp/mp_fmethod.c | 555
-rw-r--r-- | mp/mp_fopen.c | 1100
-rw-r--r-- | mp/mp_fput.c | 367
-rw-r--r-- | mp/mp_fset.c | 165
-rw-r--r-- | mp/mp_method.c | 992
-rw-r--r-- | mp/mp_mvcc.c | 634
-rw-r--r-- | mp/mp_region.c | 588
-rw-r--r-- | mp/mp_register.c | 115
-rw-r--r-- | mp/mp_resize.c | 579
-rw-r--r-- | mp/mp_stat.c | 904
-rw-r--r-- | mp/mp_sync.c | 919
-rw-r--r-- | mp/mp_trickle.c | 112
15 files changed, 9452 insertions, 0 deletions
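The files below add Berkeley DB's shared buffer cache ("mpool") module. For orientation, here is a minimal usage sketch, not part of this commit, showing how an application drives this code through the public DB_MPOOLFILE interface that mp_fget.c and mp_fput.c implement. It assumes the DB 4.8-era API matching these sources; the environment directory ENV_HOME and the file name cached.db are hypothetical, and error handling is collapsed into one macro.

```c
#include <stdio.h>
#include <stdlib.h>
#include <db.h>

#define CHK(call) do {                                                  \
        int _ret = (call);                                              \
        if (_ret != 0) {                                                \
                fprintf(stderr, "%s: %s\n", #call, db_strerror(_ret)); \
                exit(1);                                                \
        }                                                               \
} while (0)

int
main(void)
{
        DB_ENV *dbenv;
        DB_MPOOLFILE *mpf;
        db_pgno_t pgno;
        void *page;

        /* Open an environment with a 1MB cache; ENV_HOME must exist. */
        CHK(db_env_create(&dbenv, 0));
        CHK(dbenv->set_cachesize(dbenv, 0, 1024 * 1024, 1));
        CHK(dbenv->open(dbenv, "ENV_HOME", DB_CREATE | DB_INIT_MPOOL, 0));

        /* Open a plain file in the pool; this path runs __memp_fopen(). */
        CHK(dbenv->memp_fcreate(dbenv, &mpf, 0));
        CHK(mpf->open(mpf, "cached.db", DB_CREATE, 0, 4096));

        /*
         * Pin page 0, creating it if it does not exist.  A cache miss
         * goes through __memp_fget(); if the cache is full,
         * __memp_alloc() evicts a low-priority buffer to make room.
         */
        pgno = 0;
        CHK(mpf->get(mpf, &pgno, NULL, DB_MPOOL_CREATE, &page));

        /* Unpin the page; __memp_fput() makes it evictable again. */
        CHK(mpf->put(mpf, page, DB_PRIORITY_UNCHANGED, 0));

        CHK(mpf->close(mpf, 0));
        CHK(dbenv->close(dbenv, 0));
        return (0);
}
```

In this sketch, DB_MPOOL_CREATE exercises the FIRST_MISS/SECOND_MISS paths of __memp_fget() in mp_fget.c, and releasing the page via put drops the buffer's reference count so the eviction scan in mp_alloc.c may reclaim it.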
diff --git a/mp/mp_alloc.c b/mp/mp_alloc.c new file mode 100644 index 0000000..ff02143 --- /dev/null +++ b/mp/mp_alloc.c @@ -0,0 +1,622 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +/* + * __memp_alloc -- + * Allocate some space from a cache region. + * + * PUBLIC: int __memp_alloc __P((DB_MPOOL *, + * PUBLIC: REGINFO *, MPOOLFILE *, size_t, roff_t *, void *)); + */ +int +__memp_alloc(dbmp, infop, mfp, len, offsetp, retp) + DB_MPOOL *dbmp; + REGINFO *infop; + MPOOLFILE *mfp; + size_t len; + roff_t *offsetp; + void *retp; +{ + BH *bhp, *current_bhp, *mvcc_bhp, *oldest_bhp; + BH_FROZEN_PAGE *frozen_bhp; + DB_LSN vlsn; + DB_MPOOL_HASH *dbht, *hp, *hp_end, *hp_saved, *hp_tmp; + ENV *env; + MPOOL *c_mp; + MPOOLFILE *bh_mfp; + size_t freed_space; + u_int32_t buckets, buffers, high_priority, priority, priority_saved; + u_int32_t put_counter, total_buckets; + int aggressive, alloc_freeze, b_lock, giveup, got_oldest; + int h_locked, need_free, need_freeze, obsolete, ret; + u_int8_t *endp; + void *p; + + env = dbmp->env; + c_mp = infop->primary; + dbht = R_ADDR(infop, c_mp->htab); + hp_end = &dbht[c_mp->htab_buckets]; + hp_saved = NULL; + priority_saved = 0; + + buckets = buffers = put_counter = total_buckets = 0; + aggressive = alloc_freeze = giveup = got_oldest = h_locked = 0; + + STAT(c_mp->stat.st_alloc++); + + /* + * If we're allocating a buffer, and the one we're discarding is the + * same size, we don't want to waste the time to re-integrate it into + * the shared memory free list. If the DB_MPOOLFILE argument isn't + * NULL, we'll compare the underlying page sizes of the two buffers + * before free-ing and re-allocating buffers. + */ + if (mfp != NULL) { + len = SSZA(BH, buf) + mfp->stat.st_pagesize; + /* Add space for alignment padding for MVCC diagnostics. */ + MVCC_BHSIZE(mfp, len); + } + + MPOOL_REGION_LOCK(env, infop); + + /* + * Anything newer than 1/10th of the buffer pool is ignored during + * allocation (unless allocation starts failing). + */ + high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; + + /* + * First we try to allocate from free memory. If that fails, scan the + * buffer pool to find buffers with low priorities. We consider small + * sets of hash buckets each time to limit the amount of work needing + * to be done. This approximates LRU, but not very well. We either + * find a buffer of the same size to use, or we will free 3 times what + * we need in the hopes it will coalesce into a contiguous chunk of the + * right size. In the latter case we branch back here and try again. + */ +alloc: if ((ret = __env_alloc(infop, len, &p)) == 0) { + if (mfp != NULL) { + /* + * For MVCC diagnostics, align the pointer so that the + * buffer starts on a page boundary. + */ + MVCC_BHALIGN(p); + bhp = (BH *)p; + + if ((ret = __mutex_alloc(env, MTX_MPOOL_BH, + DB_MUTEX_SHARED, &bhp->mtx_buf)) != 0) { + MVCC_BHUNALIGN(bhp); + __env_alloc_free(infop, bhp); + goto search; + } + c_mp->stat.st_pages++; + } + MPOOL_REGION_UNLOCK(env, infop); +found: if (offsetp != NULL) + *offsetp = R_OFFSET(infop, p); + *(void **)retp = p; + + /* + * Update the search statistics. + * + * We're not holding the region locked here, these statistics + * can't be trusted. 
+ */ +#ifdef HAVE_STATISTICS + total_buckets += buckets; + if (total_buckets != 0) { + if (total_buckets > c_mp->stat.st_alloc_max_buckets) + c_mp->stat.st_alloc_max_buckets = total_buckets; + c_mp->stat.st_alloc_buckets += total_buckets; + } + if (buffers != 0) { + if (buffers > c_mp->stat.st_alloc_max_pages) + c_mp->stat.st_alloc_max_pages = buffers; + c_mp->stat.st_alloc_pages += buffers; + } +#endif + return (0); + } else if (giveup || c_mp->stat.st_pages == 0) { + MPOOL_REGION_UNLOCK(env, infop); + + __db_errx(env, + "unable to allocate space from the buffer cache"); + return (ret); + } +search: ret = 0; + + /* + * We re-attempt the allocation every time we've freed 3 times what + * we need. Reset our free-space counter. + */ + freed_space = 0; + total_buckets += buckets; + buckets = 0; + + /* + * Walk the hash buckets and find the next two with potentially useful + * buffers. Free the buffer with the lowest priority from the buckets' + * chains. + */ + for (;;) { + /* All pages have been freed, make one last try */ + if (c_mp->stat.st_pages == 0) + goto alloc; + + /* Check for wrap around. */ + hp = &dbht[c_mp->last_checked++]; + if (hp >= hp_end) { + c_mp->last_checked = 0; + hp = &dbht[c_mp->last_checked++]; + } + + /* + * The failure mode is when there are too many buffers we can't + * write or there's not enough memory in the system to support + * the number of pinned buffers. + * + * Get aggressive if we've reviewed the entire cache without + * freeing the needed space. (The code resets "aggressive" + * when we free any space.) Aggressive means: + * + * a: set a flag to attempt to flush high priority buffers as + * well as other buffers. + * b: sync the mpool to force out queue extent pages. While we + * might not have enough space for what we want and flushing + * is expensive, why not? + * c: look at a buffer in every hash bucket rather than choose + * the more preferable of two. + * d: start to think about giving up. + * + * If we get here twice, sleep for a second, hopefully someone + * else will run and free up some memory. + * + * Always try to allocate memory too, in case some other thread + * returns its memory to the region. + * + * We don't have any way to know an allocation has no way to + * succeed. Fail if no pages are returned to the cache after + * we've been trying for a relatively long time. + * + * !!! + * This test ignores pathological cases like no buffers in the + * system -- we check for that early on, so it isn't possible. + */ + if (buckets++ == c_mp->htab_buckets) { + if (freed_space > 0) + goto alloc; + MPOOL_REGION_UNLOCK(env, infop); + + switch (++aggressive) { + case 1: + break; + case 2: + put_counter = c_mp->put_counter; + /* FALLTHROUGH */ + case 3: + case 4: + case 5: + case 6: + (void)__memp_sync_int( + env, NULL, 0, DB_SYNC_ALLOC, NULL, NULL); + + __os_yield(env, 1, 0); + break; + default: + aggressive = 1; + if (put_counter == c_mp->put_counter) + giveup = 1; + break; + } + + MPOOL_REGION_LOCK(env, infop); + goto alloc; + } + + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking the hash + * bucket as we only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + /* Unlock the region and lock the hash bucket. */ + MPOOL_REGION_UNLOCK(env, infop); + MUTEX_READLOCK(env, hp->mtx_hash); + h_locked = 1; + b_lock = 0; + + /* + * Find a buffer we can use. + * + * We don't want to free a buffer out of the middle of an MVCC + * chain (that requires I/O). 
So, walk the buffers, looking + * for, in order of preference: + * + * an obsolete buffer at the end of an MVCC chain, + * the lowest priority buffer, if it is not frozen, + * the lowest priority frozen buffer. + * + * We use an obsolete buffer at the end of a chain as soon as + * we find one. We use the lowest-LRU singleton buffer if we + * find one and it's better than the result of another hash + * bucket we've reviewed. We do not use a buffer which + * has a priority greater than high_priority unless we are + * being aggressive. + * + * We prefer ordinary buffers over frozen buffers in the middle + * of an MVCC chain, regardless of priority: if the oldest + * buffer in a chain is frozen, we thaw or free it before + * recycling any buffers within the version chain. + * + * Ignore referenced buffers, we can't get rid of them. + */ +retry_search: bhp = NULL; + obsolete = 0; + SH_TAILQ_FOREACH(current_bhp, &hp->hash_bucket, hq, __bh) { + if (SH_CHAIN_SINGLETON(current_bhp, vc)) { + if (BH_REFCOUNT(current_bhp) == 0 && + (aggressive || + current_bhp->priority < high_priority) && + (bhp == NULL || + bhp->priority > current_bhp->priority)) { + if (bhp != NULL) + atomic_dec(env, &bhp->ref); + bhp = current_bhp; + atomic_inc(env, &bhp->ref); + } + continue; + } + + for (mvcc_bhp = oldest_bhp = current_bhp; + mvcc_bhp != NULL; + oldest_bhp = mvcc_bhp, + mvcc_bhp = SH_CHAIN_PREV(mvcc_bhp, vc, __bh)) { + DB_ASSERT(env, mvcc_bhp != + SH_CHAIN_PREV(mvcc_bhp, vc, __bh)); + if (aggressive > 2 && + BH_REFCOUNT(mvcc_bhp) == 0 && + !F_ISSET(mvcc_bhp, BH_FROZEN) && + (bhp == NULL || + bhp->priority > mvcc_bhp->priority)) { + if (bhp != NULL) + atomic_dec(env, &bhp->ref); + bhp = mvcc_bhp; + atomic_inc(env, &bhp->ref); + } + } + + /* + * oldest_bhp is the last buffer on the MVCC chain, and + * an obsolete buffer at the end of the MVCC chain + * gets used without further search. + * + * If the buffer isn't obsolete with respect to the + * cached old reader LSN, recalculate the oldest + * reader LSN and check again. + */ +retry_obsolete: if (BH_OBSOLETE(oldest_bhp, hp->old_reader, vlsn)) { + obsolete = 1; + if (bhp != NULL) + atomic_dec(env, &bhp->ref); + bhp = oldest_bhp; + atomic_inc(env, &bhp->ref); + goto this_buffer; + } + if (!got_oldest) { + if ((ret = __txn_oldest_reader( + env, &hp->old_reader)) != 0) + return (ret); + got_oldest = 1; + goto retry_obsolete; + } + } + + /* + * bhp is either NULL or the best candidate buffer. + * We'll use the chosen buffer only if we have compared its + * priority against one chosen from another hash bucket. + */ + if (bhp == NULL) + goto next_hb; + + /* Adjust the priority if the bucket has not been reset. */ + priority = bhp->priority; + if (c_mp->lru_reset != 0 && c_mp->lru_reset <= hp - dbht) + priority -= MPOOL_BASE_DECREMENT; + + /* + * Compare two hash buckets and select the one with the lowest + * priority. Performance testing shows looking at two improves + * the LRU-ness and looking at more only does a little better. + */ + if (hp_saved == NULL) { + hp_saved = hp; + priority_saved = priority; + goto next_hb; + } + + /* + * If the buffer we just found is a better choice than our + * previous choice, use it. + * + * If the previous choice was better, pretend we're moving + * from this hash bucket to the previous one and re-do the + * search. + * + * We don't worry about simply swapping between two buckets + * because that could only happen if a buffer was removed + * from the chain, or its priority updated. 
If a buffer + * is removed from the chain, some other thread has managed + * to discard a buffer, so we're moving forward. Updating + * a buffer's priority will make it a high-priority buffer, + * so we'll ignore it when we search again, and so we will + * eventually zero in on a buffer to use, or we'll decide + * there are no buffers we can use. + * + * If there's only a single hash bucket with buffers, we'll + * search the bucket once, choose a buffer, walk the entire + * list of buckets and search it again. In the case of a + * system that's busy, it's possible to imagine a case where + * we'd loop for a long while. For that reason, and because + * the test is easy, we special case and test for it. + */ + if (priority > priority_saved && hp != hp_saved) { + MUTEX_UNLOCK(env, hp->mtx_hash); + hp_tmp = hp_saved; + hp_saved = hp; + hp = hp_tmp; + priority_saved = priority; + MUTEX_READLOCK(env, hp->mtx_hash); + h_locked = 1; + DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); + atomic_dec(env, &bhp->ref); + goto retry_search; + } + +this_buffer: buffers++; + + /* + * Discard any previously remembered hash bucket, we've got + * a winner. + */ + hp_saved = NULL; + + /* Drop the hash mutex and lock the buffer exclusively. */ + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + /* We cannot block as the caller is probably holding locks. */ + if (BH_REFCOUNT(bhp) > 1 || + (ret = MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) { + if (ret != 0 && ret != DB_LOCK_NOTGRANTED) + return (ret); + ret = 0; + goto next_hb; + } + F_SET(bhp, BH_EXCLUSIVE); + b_lock = 1; + + /* Someone may have grabbed it while we got the lock. */ + if (BH_REFCOUNT(bhp) != 1) + goto next_hb; + + /* Find the associated MPOOLFILE. */ + bh_mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* If the page is dirty, write it. */ + ret = 0; + if (F_ISSET(bhp, BH_DIRTY)) { + DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0); + ret = __memp_bhwrite(dbmp, hp, bh_mfp, bhp, 0); + DB_ASSERT(env, atomic_read(&bhp->ref) > 0); +#ifdef HAVE_STATISTICS + if (ret == 0) + ++c_mp->stat.st_rw_evict; +#endif + } +#ifdef HAVE_STATISTICS + else + ++c_mp->stat.st_ro_evict; +#endif + + /* + * Freeze this buffer, if necessary. That is, if the buffer + * could be read by the oldest reader in the system. + */ + need_freeze = (SH_CHAIN_HASPREV(bhp, vc) || + (SH_CHAIN_HASNEXT(bhp, vc) && !obsolete)); + if (ret == 0 && need_freeze) { + if (!aggressive || + F_ISSET(bhp, BH_DIRTY | BH_FROZEN)) + goto next_hb; + if ((ret = __memp_bh_freeze(dbmp, + infop, hp, bhp, &alloc_freeze)) == 0) + need_freeze = 0; + else if (ret == EBUSY || ret == EIO || + ret == ENOMEM || ret == ENOSPC) + ret = 0; + else { + DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); + atomic_dec(env, &bhp->ref); + DB_ASSERT(env, b_lock); + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + DB_ASSERT(env, !h_locked); + return (ret); + } + } + + /* + * If a write fails for any reason, we can't proceed. + * + * Also, we released the hash bucket lock while doing I/O, so + * another thread may have acquired this buffer and incremented + * the ref count or dirtied the buffer after we wrote it, in + * which case we can't have it. + * + * If there's a write error and we're having problems finding + * something to allocate, avoid selecting this buffer again + * by raising its priority. 
+ */ + MUTEX_LOCK(env, hp->mtx_hash); + h_locked = 1; + if (ret != 0 && (aggressive || bhp->priority < c_mp->lru_count)) + bhp->priority = c_mp->lru_count + + c_mp->stat.st_pages / MPOOL_PRI_DIRTY; + + if (ret != 0 || BH_REFCOUNT(bhp) != 1 || + F_ISSET(bhp, BH_DIRTY) || need_freeze) + goto next_hb; + + /* + * If the buffer is frozen, thaw it and look for another one + * we can use. (Calling __memp_bh_freeze above will not + * mark bhp BH_FROZEN.) + */ + if (F_ISSET(bhp, BH_FROZEN)) { + DB_ASSERT(env, obsolete || SH_CHAIN_SINGLETON(bhp, vc)); + DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); + if (!F_ISSET(bhp, BH_THAWED)) { + /* + * This call releases the hash bucket mutex. + * We're going to retry the search, so we need + * to re-lock it. + */ + if ((ret = __memp_bh_thaw(dbmp, + infop, hp, bhp, NULL)) != 0) + return (ret); + MUTEX_READLOCK(env, hp->mtx_hash); + } else { + need_free = (atomic_dec(env, &bhp->ref) == 0); + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + if (need_free) { + MPOOL_REGION_LOCK(env, infop); + SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, + bhp, hq); + MPOOL_REGION_UNLOCK(env, infop); + } + } + bhp = NULL; + b_lock = alloc_freeze = 0; + goto retry_search; + } + + /* + * If we need some empty buffer headers for freezing, turn the + * buffer we've found into frozen headers and put them on the + * free list. Only reset alloc_freeze if we've actually + * allocated some frozen buffer headers. + */ + if (alloc_freeze) { + if ((ret = __memp_bhfree(dbmp, + infop, bh_mfp, hp, bhp, 0)) != 0) + return (ret); + b_lock = 0; + h_locked = 0; + + MVCC_MPROTECT(bhp->buf, bh_mfp->stat.st_pagesize, + PROT_READ | PROT_WRITE | PROT_EXEC); + + MPOOL_REGION_LOCK(env, infop); + SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen, + (BH_FROZEN_ALLOC *)bhp, links); + frozen_bhp = (BH_FROZEN_PAGE *) + ((BH_FROZEN_ALLOC *)bhp + 1); + endp = (u_int8_t *)bhp->buf + bh_mfp->stat.st_pagesize; + while ((u_int8_t *)(frozen_bhp + 1) < endp) { + frozen_bhp->header.mtx_buf = MUTEX_INVALID; + SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, + (BH *)frozen_bhp, hq); + frozen_bhp++; + } + MPOOL_REGION_UNLOCK(env, infop); + + alloc_freeze = 0; + MUTEX_READLOCK(env, hp->mtx_hash); + h_locked = 1; + goto retry_search; + } + + /* + * Check to see if the buffer is the size we're looking for. + * If so, we can simply reuse it. Otherwise, free the buffer + * and its space and keep looking. + */ + if (mfp != NULL && + mfp->stat.st_pagesize == bh_mfp->stat.st_pagesize) { + if ((ret = __memp_bhfree(dbmp, + infop, bh_mfp, hp, bhp, 0)) != 0) + return (ret); + p = bhp; + goto found; + } + + freed_space += sizeof(*bhp) + bh_mfp->stat.st_pagesize; + if ((ret = + __memp_bhfree(dbmp, infop, + bh_mfp, hp, bhp, BH_FREE_FREEMEM)) != 0) + return (ret); + + /* Reset "aggressive" if we free any space. */ + if (aggressive > 1) + aggressive = 1; + + /* + * Unlock this buffer and re-acquire the region lock. If + * we're reaching here as a result of calling memp_bhfree, the + * buffer lock has already been discarded. + */ + if (0) { +next_hb: if (bhp != NULL) { + DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); + atomic_dec(env, &bhp->ref); + if (b_lock) { + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + } + } + if (h_locked) + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + } + MPOOL_REGION_LOCK(env, infop); + + /* + * Retry the allocation as soon as we've freed up sufficient + * space. We're likely to have to coalesce of memory to + * satisfy the request, don't try until it's likely (possible?) + * we'll succeed. 
+ */ + if (freed_space >= 3 * len) + goto alloc; + } + /* NOTREACHED */ +} + +/* + * __memp_free -- + * Free some space from a cache region. + * + * PUBLIC: void __memp_free __P((REGINFO *, void *)); + */ +void +__memp_free(infop, buf) + REGINFO *infop; + void *buf; +{ + __env_alloc_free(infop, buf); +} diff --git a/mp/mp_bh.c b/mp/mp_bh.c new file mode 100644 index 0000000..bedb430 --- /dev/null +++ b/mp/mp_bh.c @@ -0,0 +1,639 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" /* Required for diagnostic code. */ +#include "dbinc/mp.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +static int __memp_pgwrite + __P((ENV *, DB_MPOOLFILE *, DB_MPOOL_HASH *, BH *)); + +/* + * __memp_bhwrite -- + * Write the page associated with a given buffer header. + * + * PUBLIC: int __memp_bhwrite __P((DB_MPOOL *, + * PUBLIC: DB_MPOOL_HASH *, MPOOLFILE *, BH *, int)); + */ +int +__memp_bhwrite(dbmp, hp, mfp, bhp, open_extents) + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOLFILE *mfp; + BH *bhp; + int open_extents; +{ + DB_MPOOLFILE *dbmfp; + DB_MPREG *mpreg; + ENV *env; + int ret; + + env = dbmp->env; + + /* + * If the file has been removed or is a closed temporary file, we're + * done -- the page-write function knows how to handle the fact that + * we don't have (or need!) any real file descriptor information. + */ + if (mfp->deadfile) + return (__memp_pgwrite(env, NULL, hp, bhp)); + + /* + * Walk the process' DB_MPOOLFILE list and find a file descriptor for + * the file. We also check that the descriptor is open for writing. + */ + MUTEX_LOCK(env, dbmp->mutex); + TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) + if (dbmfp->mfp == mfp && !F_ISSET(dbmfp, MP_READONLY)) { + ++dbmfp->ref; + break; + } + MUTEX_UNLOCK(env, dbmp->mutex); + + if (dbmfp != NULL) { + /* + * Temporary files may not have been created. We only handle + * temporary files in this path, because only the process that + * created a temporary file will ever flush buffers to it. + */ + if (dbmfp->fhp == NULL) { + /* We may not be allowed to create backing files. */ + if (mfp->no_backing_file) { + --dbmfp->ref; + return (EPERM); + } + + MUTEX_LOCK(env, dbmp->mutex); + if (dbmfp->fhp == NULL) { + ret = __db_tmp_open(env, + F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? + DB_OSO_DIRECT : 0, &dbmfp->fhp); + } else + ret = 0; + MUTEX_UNLOCK(env, dbmp->mutex); + if (ret != 0) { + __db_errx(env, + "unable to create temporary backing file"); + --dbmfp->ref; + return (ret); + } + } + + goto pgwrite; + } + + /* + * There's no file handle for this file in our process. + * + * !!! + * It's the caller's choice if we're going to open extent files. + */ + if (!open_extents && F_ISSET(mfp, MP_EXTENT)) + return (EPERM); + + /* + * !!! + * Don't try to attach to temporary files. There are two problems in + * trying to do that. First, if we have different privileges than the + * process that "owns" the temporary file, we might create the backing + * disk file such that the owning process couldn't read/write its own + * buffers, e.g., memp_trickle running as root creating a file owned + * as root, mode 600. Second, if the temporary file has already been + * created, we don't have any way of finding out what its real name is, + * and, even if we did, it was already unlinked (so that it won't be + * left if the process dies horribly). 
This decision causes a problem, + * however: if the temporary file consumes the entire buffer cache, + * and the owner doesn't flush the buffers to disk, we could end up + * with resource starvation, and the memp_trickle thread couldn't do + * anything about it. That's a pretty unlikely scenario, though. + * + * Note we should never get here when the temporary file in question + * has already been closed in another process, in which case it should + * be marked dead. + */ + if (F_ISSET(mfp, MP_TEMP) || mfp->no_backing_file) + return (EPERM); + + /* + * It's not a page from a file we've opened. If the file requires + * application-specific input/output processing, see if this process + * has ever registered information as to how to write this type of + * file. If not, there's nothing we can do. + */ + if (mfp->ftype != 0 && mfp->ftype != DB_FTYPE_SET) { + MUTEX_LOCK(env, dbmp->mutex); + LIST_FOREACH(mpreg, &dbmp->dbregq, q) + if (mpreg->ftype == mfp->ftype) + break; + MUTEX_UNLOCK(env, dbmp->mutex); + if (mpreg == NULL) + return (EPERM); + } + + /* + * Try and open the file, specifying the known underlying shared area. + * + * !!! + * There's no negative cache, so we may repeatedly try and open files + * that we have previously tried (and failed) to open. + */ + if ((ret = __memp_fcreate(env, &dbmfp)) != 0) + return (ret); + if ((ret = __memp_fopen(dbmfp, mfp, + NULL, NULL, DB_DURABLE_UNKNOWN, 0, mfp->stat.st_pagesize)) != 0) { + (void)__memp_fclose(dbmfp, 0); + + /* + * Ignore any error if the file is marked dead, assume the file + * was removed from under us. + */ + if (!mfp->deadfile) + return (ret); + + dbmfp = NULL; + } + +pgwrite: + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, + PROT_READ | PROT_WRITE | PROT_EXEC); + ret = __memp_pgwrite(env, dbmfp, hp, bhp); + if (dbmfp == NULL) + return (ret); + + /* + * Discard our reference, and, if we're the last reference, make sure + * the file eventually gets closed. + */ + MUTEX_LOCK(env, dbmp->mutex); + if (dbmfp->ref == 1) + F_SET(dbmfp, MP_FLUSH); + else + --dbmfp->ref; + MUTEX_UNLOCK(env, dbmp->mutex); + + return (ret); +} + +/* + * __memp_pgread -- + * Read a page from a file. + * + * PUBLIC: int __memp_pgread __P((DB_MPOOLFILE *, BH *, int)); + */ +int +__memp_pgread(dbmfp, bhp, can_create) + DB_MPOOLFILE *dbmfp; + BH *bhp; + int can_create; +{ + ENV *env; + MPOOLFILE *mfp; + size_t len, nr; + u_int32_t pagesize; + int ret; + + env = dbmfp->env; + mfp = dbmfp->mfp; + pagesize = mfp->stat.st_pagesize; + + /* We should never be called with a dirty or unlocked buffer. */ + DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY_CREATE | BH_FROZEN)); + DB_ASSERT(env, can_create || !F_ISSET(bhp, BH_DIRTY)); + DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE)); + + /* Mark the buffer as in transistion. */ + F_SET(bhp, BH_TRASH); + + /* + * Temporary files may not yet have been created. We don't create + * them now, we create them when the pages have to be flushed. + */ + nr = 0; + if (dbmfp->fhp != NULL) + if ((ret = __os_io(env, DB_IO_READ, dbmfp->fhp, + bhp->pgno, pagesize, 0, pagesize, bhp->buf, &nr)) != 0) + goto err; + + /* + * The page may not exist; if it doesn't, nr may well be 0, but we + * expect the underlying OS calls not to return an error code in + * this case. + */ + if (nr < pagesize) { + /* + * Don't output error messages for short reads. In particular, + * DB recovery processing may request pages never written to + * disk or for which only some part have been written to disk, + * in which case we won't find the page. 
The caller must know + * how to handle the error. + */ + if (!can_create) { + ret = DB_PAGE_NOTFOUND; + goto err; + } + + /* Clear any bytes that need to be cleared. */ + len = mfp->clear_len == DB_CLEARLEN_NOTSET ? + pagesize : mfp->clear_len; + memset(bhp->buf, 0, len); + +#if defined(DIAGNOSTIC) || defined(UMRW) + /* + * If we're running in diagnostic mode, corrupt any bytes on + * the page that are unknown quantities for the caller. + */ + if (len < pagesize) + memset(bhp->buf + len, CLEAR_BYTE, pagesize - len); +#endif +#ifdef HAVE_STATISTICS + ++mfp->stat.st_page_create; + } else + ++mfp->stat.st_page_in; +#else + } +#endif + + /* Call any pgin function. */ + ret = mfp->ftype == 0 ? 0 : __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1); + + /* + * If no errors occurred, the data is now valid, clear the BH_TRASH + * flag. + */ + if (ret == 0) + F_CLR(bhp, BH_TRASH); +err: return (ret); +} + +/* + * __memp_pgwrite -- + * Write a page to a file. + */ +static int +__memp_pgwrite(env, dbmfp, hp, bhp) + ENV *env; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; + BH *bhp; +{ + DB_LSN lsn; + MPOOLFILE *mfp; + size_t nw; + int ret; + void * buf; + + /* + * Since writing does not require exclusive access, another thread + * could have already written this buffer. + */ + if (!F_ISSET(bhp, BH_DIRTY)) + return (0); + + mfp = dbmfp == NULL ? NULL : dbmfp->mfp; + ret = 0; + buf = NULL; + + /* We should never be called with a frozen or trashed buffer. */ + DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN | BH_TRASH)); + + /* + * It's possible that the underlying file doesn't exist, either + * because of an outright removal or because it was a temporary + * file that's been closed. + * + * !!! + * Once we pass this point, we know that dbmfp and mfp aren't NULL, + * and that we have a valid file reference. + */ + if (mfp == NULL || mfp->deadfile) + goto file_dead; + + /* + * If the page is in a file for which we have LSN information, we have + * to ensure the appropriate log records are on disk. + */ + if (LOGGING_ON(env) && mfp->lsn_off != DB_LSN_OFF_NOTSET && + !IS_CLIENT_PGRECOVER(env)) { + memcpy(&lsn, bhp->buf + mfp->lsn_off, sizeof(DB_LSN)); + if (!IS_NOT_LOGGED_LSN(lsn) && + (ret = __log_flush(env, &lsn)) != 0) + goto err; + } + +#ifdef DIAGNOSTIC + /* + * Verify write-ahead logging semantics. + * + * !!! + * Two special cases. There is a single field on the meta-data page, + * the last-page-number-in-the-file field, for which we do not log + * changes. If the page was originally created in a database that + * didn't have logging turned on, we can see a page marked dirty but + * for which no corresponding log record has been written. However, + * the only way that a page can be created for which there isn't a + * previous log record and valid LSN is when the page was created + * without logging turned on, and so we check for that special-case + * LSN value. + * + * Second, when a client is reading database pages from a master + * during an internal backup, we may get pages modified after + * the current end-of-log. + */ + if (LOGGING_ON(env) && !IS_NOT_LOGGED_LSN(LSN(bhp->buf)) && + !IS_CLIENT_PGRECOVER(env)) { + /* + * There is a potential race here. If we are in the midst of + * switching log files, it's possible we could test against the + * old file and the new offset in the log region's LSN. If we + * fail the first test, acquire the log mutex and check again. 
+ */ + DB_LOG *dblp; + LOG *lp; + + dblp = env->lg_handle; + lp = dblp->reginfo.primary; + if (!lp->db_log_inmemory && + LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) <= 0) { + MUTEX_LOCK(env, lp->mtx_flush); + DB_ASSERT(env, + LOG_COMPARE(&lp->s_lsn, &LSN(bhp->buf)) > 0); + MUTEX_UNLOCK(env, lp->mtx_flush); + } + } +#endif + + /* + * Call any pgout function. If we have the page exclusive then + * we are going to reuse it otherwise make a copy of the page so + * that others can continue looking at the page while we write it. + */ + buf = bhp->buf; + if (mfp->ftype != 0) { + if (F_ISSET(bhp, BH_EXCLUSIVE)) + F_SET(bhp, BH_TRASH); + else { + if ((ret = + __os_malloc(env, mfp->stat.st_pagesize, &buf)) != 0) + goto err; + memcpy(buf, bhp->buf, mfp->stat.st_pagesize); + } + if ((ret = __memp_pg(dbmfp, bhp->pgno, buf, 0)) != 0) + goto err; + } + + /* Write the page. */ + if ((ret = __os_io( + env, DB_IO_WRITE, dbmfp->fhp, bhp->pgno, mfp->stat.st_pagesize, + 0, mfp->stat.st_pagesize, buf, &nw)) != 0) { + __db_errx(env, "%s: write failed for page %lu", + __memp_fn(dbmfp), (u_long)bhp->pgno); + goto err; + } + STAT(++mfp->stat.st_page_out); + if (bhp->pgno > mfp->last_flushed_pgno) { + MUTEX_LOCK(env, mfp->mutex); + if (bhp->pgno > mfp->last_flushed_pgno) + mfp->last_flushed_pgno = bhp->pgno; + MUTEX_UNLOCK(env, mfp->mutex); + } + +err: +file_dead: + if (buf != NULL && buf != bhp->buf) + __os_free(env, buf); + /* + * !!! + * Once we pass this point, dbmfp and mfp may be NULL, we may not have + * a valid file reference. + */ + + /* + * Update the hash bucket statistics, reset the flags. If we were + * successful, the page is no longer dirty. Someone else may have + * also written the page so we need to latch the hash bucket here + * to get the accounting correct. Since we have the buffer + * shared it cannot be marked dirty again till we release it. + * This is the only place we update the flags field only holding + * a shared latch. + */ + if (F_ISSET(bhp, BH_DIRTY | BH_TRASH)) { + MUTEX_LOCK(env, hp->mtx_hash); + DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); + if (ret == 0 && F_ISSET(bhp, BH_DIRTY)) { + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); + DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0); + atomic_dec(env, &hp->hash_page_dirty); + } + + /* put the page back if necessary. */ + if ((ret != 0 || BH_REFCOUNT(bhp) > 1) && + F_ISSET(bhp, BH_TRASH)) { + ret = __memp_pg(dbmfp, bhp->pgno, bhp->buf, 1); + F_CLR(bhp, BH_TRASH); + } + MUTEX_UNLOCK(env, hp->mtx_hash); + } + + return (ret); +} + +/* + * __memp_pg -- + * Call the pgin/pgout routine. 
+ * + * PUBLIC: int __memp_pg __P((DB_MPOOLFILE *, db_pgno_t, void *, int)); + */ +int +__memp_pg(dbmfp, pgno, buf, is_pgin) + DB_MPOOLFILE *dbmfp; + db_pgno_t pgno; + void *buf; + int is_pgin; +{ + DBT dbt, *dbtp; + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + ENV *env; + MPOOLFILE *mfp; + int ftype, ret; + + env = dbmfp->env; + dbmp = env->mp_handle; + mfp = dbmfp->mfp; + + if ((ftype = mfp->ftype) == DB_FTYPE_SET) + mpreg = dbmp->pg_inout; + else { + MUTEX_LOCK(env, dbmp->mutex); + LIST_FOREACH(mpreg, &dbmp->dbregq, q) + if (ftype == mpreg->ftype) + break; + MUTEX_UNLOCK(env, dbmp->mutex); + } + if (mpreg == NULL) + return (0); + + if (mfp->pgcookie_len == 0) + dbtp = NULL; + else { + DB_SET_DBT(dbt, R_ADDR( + dbmp->reginfo, mfp->pgcookie_off), mfp->pgcookie_len); + dbtp = &dbt; + } + + if (is_pgin) { + if (mpreg->pgin != NULL && (ret = + mpreg->pgin(env->dbenv, pgno, buf, dbtp)) != 0) + goto err; + } else + if (mpreg->pgout != NULL && (ret = + mpreg->pgout(env->dbenv, pgno, buf, dbtp)) != 0) + goto err; + + return (0); + +err: __db_errx(env, "%s: %s failed for page %lu", + __memp_fn(dbmfp), is_pgin ? "pgin" : "pgout", (u_long)pgno); + return (ret); +} + +/* + * __memp_bhfree -- + * Free a bucket header and its referenced data. + * + * PUBLIC: int __memp_bhfree __P((DB_MPOOL *, + * PUBLIC: REGINFO *, MPOOLFILE *, DB_MPOOL_HASH *, BH *, u_int32_t)); + */ +int +__memp_bhfree(dbmp, infop, mfp, hp, bhp, flags) + DB_MPOOL *dbmp; + REGINFO *infop; + MPOOLFILE *mfp; + DB_MPOOL_HASH *hp; + BH *bhp; + u_int32_t flags; +{ + ENV *env; +#ifdef DIAGNOSTIC + DB_LSN vlsn; +#endif + BH *prev_bhp; + MPOOL *c_mp; + int ret, t_ret; +#ifdef DIAG_MVCC + size_t pagesize; +#endif + + ret = 0; + + /* + * Assumes the hash bucket is locked and the MPOOL is not. + */ + env = dbmp->env; +#ifdef DIAG_MVCC + if (mfp != NULL) + pagesize = mfp->stat.st_pagesize; +#endif + + DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) || + (hp != NULL && MUTEX_IS_OWNED(env, hp->mtx_hash))); + DB_ASSERT(env, BH_REFCOUNT(bhp) == 1 && + !F_ISSET(bhp, BH_DIRTY | BH_FROZEN)); + DB_ASSERT(env, LF_ISSET(BH_FREE_UNLOCKED) || + SH_CHAIN_SINGLETON(bhp, vc) || (SH_CHAIN_HASNEXT(bhp, vc) && + (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off || + bhp->td_off == INVALID_ROFF || + IS_MAX_LSN(*VISIBLE_LSN(env, bhp)) || + BH_OBSOLETE(bhp, hp->old_reader, vlsn)))); + + /* + * Delete the buffer header from the hash bucket queue or the + * version chain. + */ + if (hp == NULL) + goto no_hp; + prev_bhp = SH_CHAIN_PREV(bhp, vc, __bh); + if (!SH_CHAIN_HASNEXT(bhp, vc)) { + if (prev_bhp != NULL) + SH_TAILQ_INSERT_AFTER(&hp->hash_bucket, + bhp, prev_bhp, hq, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + } + SH_CHAIN_REMOVE(bhp, vc, __bh); + + /* + * Remove the reference to this buffer from the transaction that + * created it, if any. When the BH_FREE_UNLOCKED flag is set, we're + * discarding the environment, so the transaction region is already + * gone. + */ + if (bhp->td_off != INVALID_ROFF && !LF_ISSET(BH_FREE_UNLOCKED)) { + ret = __txn_remove_buffer( + env, BH_OWNER(env, bhp), hp->mtx_hash); + bhp->td_off = INVALID_ROFF; + } + + /* + * We're going to use the memory for something else -- it had better be + * accessible. + */ +no_hp: MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC); + + /* + * Discard the hash bucket's mutex, it's no longer needed, and + * we don't want to be holding it when acquiring other locks. 
+ */ + if (!LF_ISSET(BH_FREE_UNLOCKED)) + MUTEX_UNLOCK(env, hp->mtx_hash); + + /* + * If we're only removing this header from the chain for reuse, we're + * done. + */ + if (LF_ISSET(BH_FREE_REUSE)) + return (ret); + + /* + * If we're not reusing the buffer immediately, free the buffer for + * real. + */ + if (!LF_ISSET(BH_FREE_UNLOCKED)) + MUTEX_UNLOCK(env, bhp->mtx_buf); + if (LF_ISSET(BH_FREE_FREEMEM)) { + if ((ret = __mutex_free(env, &bhp->mtx_buf)) != 0) + return (ret); + MPOOL_REGION_LOCK(env, infop); + + MVCC_BHUNALIGN(bhp); + __memp_free(infop, bhp); + c_mp = infop->primary; + c_mp->stat.st_pages--; + + MPOOL_REGION_UNLOCK(env, infop); + } + + if (mfp == NULL) + return (ret); + + /* + * Decrement the reference count of the underlying MPOOLFILE. + * If this is its last reference, remove it. + */ + MUTEX_LOCK(env, mfp->mutex); + if (--mfp->block_cnt == 0 && mfp->mpf_cnt == 0) { + if ((t_ret = __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) + ret = t_ret; + } else + MUTEX_UNLOCK(env, mfp->mutex); + + return (ret); +} diff --git a/mp/mp_fget.c b/mp/mp_fget.c new file mode 100644 index 0000000..5fdee5a --- /dev/null +++ b/mp/mp_fget.c @@ -0,0 +1,1161 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +/* + * __memp_fget_pp -- + * DB_MPOOLFILE->get pre/post processing. + * + * PUBLIC: int __memp_fget_pp + * PUBLIC: __P((DB_MPOOLFILE *, db_pgno_t *, DB_TXN *, u_int32_t, void *)); + */ +int +__memp_fget_pp(dbmfp, pgnoaddr, txnp, flags, addrp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; + DB_TXN *txnp; + u_int32_t flags; + void *addrp; +{ + DB_THREAD_INFO *ip; + ENV *env; + int rep_blocked, ret; + + env = dbmfp->env; + + MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->get"); + + /* + * Validate arguments. + * + * !!! + * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly + * files here, and create non-existent pages in readonly files if the + * flags are set, later. The reason is that the hash access method + * wants to get empty pages that don't really exist in readonly files. + * The only alternative is for hash to write the last "bucket" all the + * time, which we don't want to do because one of our big goals in life + * is to keep database files small. It's sleazy as hell, but we catch + * any attempt to actually write the file in memp_fput(). + */ +#define OKFLAGS (DB_MPOOL_CREATE | DB_MPOOL_DIRTY | \ + DB_MPOOL_EDIT | DB_MPOOL_LAST | DB_MPOOL_NEW) + if (flags != 0) { + if ((ret = __db_fchk(env, "memp_fget", flags, OKFLAGS)) != 0) + return (ret); + + switch (flags) { + case DB_MPOOL_DIRTY: + case DB_MPOOL_CREATE: + case DB_MPOOL_EDIT: + case DB_MPOOL_LAST: + case DB_MPOOL_NEW: + break; + default: + return (__db_ferr(env, "memp_fget", 1)); + } + } + + ENV_ENTER(env, ip); + + rep_blocked = 0; + if (txnp == NULL && IS_ENV_REPLICATED(env)) { + if ((ret = __op_rep_enter(env)) != 0) + goto err; + rep_blocked = 1; + } + ret = __memp_fget(dbmfp, pgnoaddr, ip, txnp, flags, addrp); + /* + * We only decrement the count in op_rep_exit if the operation fails. + * Otherwise the count will be decremented when the page is no longer + * pinned in memp_fput. + */ + if (ret != 0 && rep_blocked) + (void)__op_rep_exit(env); + + /* Similarly if an app has a page pinned it is ACTIVE. 
*/ +err: if (ret != 0) + ENV_LEAVE(env, ip); + + return (ret); +} + +/* + * __memp_fget -- + * Get a page from the file. + * + * PUBLIC: int __memp_fget __P((DB_MPOOLFILE *, + * PUBLIC: db_pgno_t *, DB_THREAD_INFO *, DB_TXN *, u_int32_t, void *)); + */ +int +__memp_fget(dbmfp, pgnoaddr, ip, txn, flags, addrp) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; + DB_THREAD_INFO *ip; + DB_TXN *txn; + u_int32_t flags; + void *addrp; +{ + enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state; + BH *alloc_bhp, *bhp, *oldest_bhp; + ENV *env; + DB_LSN *read_lsnp, vlsn; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + MPOOLFILE *mfp; + PIN_LIST *list, *lp; + REGENV *renv; + REGINFO *infop, *t_infop, *reginfo; + TXN_DETAIL *td; + roff_t list_off, mf_offset; + u_int32_t bucket, pinmax, st_hsearch; + int b_incr, b_lock, h_locked, dirty, extending; + int makecopy, mvcc, need_free, ret; + + *(void **)addrp = NULL; + COMPQUIET(c_mp, NULL); + COMPQUIET(infop, NULL); + + env = dbmfp->env; + dbmp = env->mp_handle; + + mfp = dbmfp->mfp; + mvcc = mfp->multiversion && (txn != NULL); + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + alloc_bhp = bhp = oldest_bhp = NULL; + read_lsnp = NULL; + td = NULL; + hp = NULL; + b_incr = b_lock = h_locked = extending = makecopy = ret = 0; + + if (LF_ISSET(DB_MPOOL_DIRTY)) { + if (F_ISSET(dbmfp, MP_READONLY)) { + __db_errx(env, + "%s: dirty flag set for readonly file page", + __memp_fn(dbmfp)); + return (EINVAL); + } + if ((ret = __db_fcchk(env, "DB_MPOOLFILE->get", + flags, DB_MPOOL_DIRTY, DB_MPOOL_EDIT)) != 0) + return (ret); + } + + dirty = LF_ISSET(DB_MPOOL_DIRTY | DB_MPOOL_EDIT | DB_MPOOL_FREE); + LF_CLR(DB_MPOOL_DIRTY | DB_MPOOL_EDIT); + + /* + * If the transaction is being used to update a multiversion database + * for the first time, set the read LSN. In addition, if this is an + * update, allocate a mutex. If no transaction has been supplied, that + * will be caught later, when we know whether one is required. + */ + if (mvcc && txn != NULL && txn->td != NULL) { + /* We're only interested in the ultimate parent transaction. */ + while (txn->parent != NULL) + txn = txn->parent; + td = (TXN_DETAIL *)txn->td; + if (F_ISSET(txn, TXN_SNAPSHOT)) { + read_lsnp = &td->read_lsn; + if (IS_MAX_LSN(*read_lsnp) && + (ret = __log_current_lsn(env, read_lsnp, + NULL, NULL)) != 0) + return (ret); + } + if ((dirty || LF_ISSET(DB_MPOOL_CREATE | DB_MPOOL_NEW)) && + td->mvcc_mtx == MUTEX_INVALID && (ret = + __mutex_alloc(env, MTX_TXN_MVCC, 0, &td->mvcc_mtx)) != 0) + return (ret); + } + + switch (flags) { + case DB_MPOOL_LAST: + /* Get the last page number in the file. */ + MUTEX_LOCK(env, mfp->mutex); + *pgnoaddr = mfp->last_pgno; + MUTEX_UNLOCK(env, mfp->mutex); + break; + case DB_MPOOL_NEW: + /* + * If always creating a page, skip the first search + * of the hash bucket. + */ + goto newpg; + case DB_MPOOL_CREATE: + default: + break; + } + + /* + * If mmap'ing the file and the page is not past the end of the file, + * just return a pointer. We can't use R_ADDR here: this is an offset + * into an mmap'd file, not a shared region, and doesn't change for + * private environments. + * + * The page may be past the end of the file, so check the page number + * argument against the original length of the file. If we previously + * returned pages past the original end of the file, last_pgno will + * have been updated to match the "new" end of the file, and checking + * against it would return pointers past the end of the mmap'd region. 
+ * + * If another process has opened the file for writing since we mmap'd + * it, we will start playing the game by their rules, i.e. everything + * goes through the cache. All pages previously returned will be safe, + * as long as the correct locking protocol was observed. + * + * We don't discard the map because we don't know when all of the + * pages will have been discarded from the process' address space. + * It would be possible to do so by reference counting the open + * pages from the mmap, but it's unclear to me that it's worth it. + */ + if (dbmfp->addr != NULL && + F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) { + *(void **)addrp = (u_int8_t *)dbmfp->addr + + (*pgnoaddr * mfp->stat.st_pagesize); + STAT(++mfp->stat.st_map); + return (0); + } + + /* + * Determine the cache and hash bucket where this page lives and get + * local pointers to them. Reset on each pass through this code, the + * page number can change. + */ + MP_GET_BUCKET(env, mfp, *pgnoaddr, &infop, hp, bucket, ret); + if (ret != 0) + return (ret); + c_mp = infop->primary; + + if (0) { + /* if we search again, get an exclusive lock. */ +retry: MUTEX_LOCK(env, hp->mtx_hash); + } + + /* Search the hash chain for the page. */ + st_hsearch = 0; + h_locked = 1; + SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { + ++st_hsearch; + if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset) + continue; + + /* Snapshot reads -- get the version visible at read_lsn. */ + if (read_lsnp != NULL) { + while (bhp != NULL && + !BH_OWNED_BY(env, bhp, txn) && + !BH_VISIBLE(env, bhp, read_lsnp, vlsn)) + bhp = SH_CHAIN_PREV(bhp, vc, __bh); + + /* + * We can get a null bhp if we are looking for a + * page that was created after the transaction was + * started so its not visible (i.e. page added to + * the BTREE in a subsequent txn). + */ + if (bhp == NULL) { + ret = DB_PAGE_NOTFOUND; + goto err; + } + } + + makecopy = mvcc && dirty && !BH_OWNED_BY(env, bhp, txn); + + /* + * Increment the reference count. This signals that the + * buffer may not be discarded. We must drop the hash + * mutex before we lock the buffer mutex. + */ + if (BH_REFCOUNT(bhp) == UINT16_MAX) { + __db_errx(env, + "%s: page %lu: reference count overflow", + __memp_fn(dbmfp), (u_long)bhp->pgno); + ret = __env_panic(env, EINVAL); + goto err; + } + atomic_inc(env, &bhp->ref); + b_incr = 1; + + /* + * Lock the buffer. If the page is being read in or modified it + * will be exclusively locked and we will block. + */ + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + if (dirty || extending || makecopy || F_ISSET(bhp, BH_FROZEN)) { +xlatch: if (LF_ISSET(DB_MPOOL_TRY)) { + if ((ret = + MUTEX_TRYLOCK(env, bhp->mtx_buf)) != 0) + goto err; + } else + MUTEX_LOCK(env, bhp->mtx_buf); + F_SET(bhp, BH_EXCLUSIVE); + } else if (LF_ISSET(DB_MPOOL_TRY)) { + if ((ret = MUTEX_TRY_READLOCK(env, bhp->mtx_buf)) != 0) + goto err; + } else + MUTEX_READLOCK(env, bhp->mtx_buf); + +#ifdef HAVE_SHARED_LATCHES + /* + * If buffer is still in transit once we have a shared latch, + * upgrade to an exclusive latch. + */ + if (F_ISSET(bhp, BH_FREED | BH_TRASH) && + !F_ISSET(bhp, BH_EXCLUSIVE)) { + MUTEX_UNLOCK(env, bhp->mtx_buf); + goto xlatch; + } +#else + F_SET(bhp, BH_EXCLUSIVE); +#endif + b_lock = 1; + + /* + * If the buffer was frozen before we waited for any I/O to + * complete and is still frozen, we will need to thaw it. + * Otherwise, it was thawed while we waited, and we need to + * search again. 
+ */ + if (F_ISSET(bhp, BH_THAWED)) { +thawed: need_free = (atomic_dec(env, &bhp->ref) == 0); + b_incr = 0; + MUTEX_UNLOCK(env, bhp->mtx_buf); + b_lock = 0; + if (need_free) { + MPOOL_REGION_LOCK(env, infop); + SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, + bhp, hq); + MPOOL_REGION_UNLOCK(env, infop); + } + bhp = NULL; + goto retry; + } + + /* + * If the buffer we wanted was frozen or thawed while we + * waited, we need to start again. That is indicated by + * a new buffer header in the version chain owned by the same + * transaction as the one we pinned. + * + * Also, if we're doing an unversioned read on a multiversion + * file, another thread may have dirtied this buffer while we + * swapped from the hash bucket lock to the buffer lock. + */ + if (SH_CHAIN_HASNEXT(bhp, vc) && + (SH_CHAIN_NEXTP(bhp, vc, __bh)->td_off == bhp->td_off || + (!dirty && read_lsnp == NULL))) { + DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0); + atomic_dec(env, &bhp->ref); + b_incr = 0; + MUTEX_UNLOCK(env, bhp->mtx_buf); + b_lock = 0; + bhp = NULL; + goto retry; + } else if (dirty && SH_CHAIN_HASNEXT(bhp, vc)) { + ret = DB_LOCK_DEADLOCK; + goto err; + } else if (F_ISSET(bhp, BH_FREED) && flags != DB_MPOOL_CREATE && + flags != DB_MPOOL_NEW && flags != DB_MPOOL_FREE) { + ret = DB_PAGE_NOTFOUND; + goto err; + } + + STAT(++mfp->stat.st_cache_hit); + break; + } + +#ifdef HAVE_STATISTICS + /* + * Update the hash bucket search statistics -- do now because our next + * search may be for a different bucket. + */ + ++c_mp->stat.st_hash_searches; + if (st_hsearch > c_mp->stat.st_hash_longest) + c_mp->stat.st_hash_longest = st_hsearch; + c_mp->stat.st_hash_examined += st_hsearch; +#endif + + /* + * There are 4 possible paths to this location: + * + * FIRST_MISS: + * Didn't find the page in the hash bucket on our first pass: + * bhp == NULL, alloc_bhp == NULL + * + * FIRST_FOUND: + * Found the page in the hash bucket on our first pass: + * bhp != NULL, alloc_bhp == NULL + * + * SECOND_FOUND: + * Didn't find the page in the hash bucket on the first pass, + * allocated space, and found the page in the hash bucket on + * our second pass: + * bhp != NULL, alloc_bhp != NULL + * + * SECOND_MISS: + * Didn't find the page in the hash bucket on the first pass, + * allocated space, and didn't find the page in the hash bucket + * on our second pass: + * bhp == NULL, alloc_bhp != NULL + */ + state = bhp == NULL ? + (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) : + (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND); + + switch (state) { + case FIRST_FOUND: + /* + * If we are to free the buffer, then this had better be the + * only reference. If so, just free the buffer. If not, + * complain and get out. + */ + if (flags == DB_MPOOL_FREE) { +freebuf: MUTEX_LOCK(env, hp->mtx_hash); + h_locked = 1; + if (F_ISSET(bhp, BH_DIRTY)) { + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); + DB_ASSERT(env, + atomic_read(&hp->hash_page_dirty) > 0); + atomic_dec(env, &hp->hash_page_dirty); + } + + /* + * If the buffer we found is already freed, we're done. + * If the ref count is not 1 then someone may be + * peeking at the buffer. We cannot free it until they + * determine that it is not what they want. Clear the + * buffer so that waiting threads get an empty page. + */ + if (F_ISSET(bhp, BH_FREED)) + goto done; + else if (F_ISSET(bhp, BH_FROZEN)) + makecopy = 1; + + if (makecopy) + break; + else if (BH_REFCOUNT(bhp) != 1 || + !SH_CHAIN_SINGLETON(bhp, vc)) { + /* + * Create an empty page in the chain for + * subsequent gets. 
Otherwise, a thread that + * re-creates this page while it is still in + * cache will see stale data. + */ + F_SET(bhp, BH_FREED); + F_CLR(bhp, BH_TRASH); + } else { + ret = __memp_bhfree(dbmp, infop, mfp, + hp, bhp, BH_FREE_FREEMEM); + bhp = NULL; + b_incr = b_lock = h_locked = 0; + } + goto done; + } else if (F_ISSET(bhp, BH_FREED)) { +revive: DB_ASSERT(env, + flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW); + makecopy = makecopy || + (mvcc && !BH_OWNED_BY(env, bhp, txn)) || + F_ISSET(bhp, BH_FROZEN); + if (flags == DB_MPOOL_CREATE) { + MUTEX_LOCK(env, mfp->mutex); + if (*pgnoaddr > mfp->last_pgno) + mfp->last_pgno = *pgnoaddr; + MUTEX_UNLOCK(env, mfp->mutex); + } + } + if (mvcc) { + /* + * With multiversion databases, we might need to + * allocate a new buffer into which we can copy the one + * that we found. In that case, check the last buffer + * in the chain to see whether we can reuse an obsolete + * buffer. + * + * To provide snapshot isolation, we need to make sure + * that we've seen a buffer older than the oldest + * snapshot read LSN. + */ +reuse: if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && + !h_locked) { + MUTEX_LOCK(env, hp->mtx_hash); + h_locked = 1; + } + if ((makecopy || F_ISSET(bhp, BH_FROZEN)) && + SH_CHAIN_HASPREV(bhp, vc)) { + oldest_bhp = SH_CHAIN_PREVP(bhp, vc, __bh); + while (SH_CHAIN_HASPREV(oldest_bhp, vc)) + oldest_bhp = SH_CHAIN_PREVP( + oldest_bhp, vc, __bh); + + if (BH_REFCOUNT(oldest_bhp) == 0 && + !BH_OBSOLETE( + oldest_bhp, hp->old_reader, vlsn) && + (ret = __txn_oldest_reader(env, + &hp->old_reader)) != 0) + goto err; + + if (BH_OBSOLETE( + oldest_bhp, hp->old_reader, vlsn) && + BH_REFCOUNT(oldest_bhp) == 0) { + DB_ASSERT(env, + !F_ISSET(oldest_bhp, BH_DIRTY)); + atomic_inc(env, &oldest_bhp->ref); + if (F_ISSET(oldest_bhp, BH_FROZEN)) { + /* + * This call will release the + * hash bucket mutex. + */ + ret = __memp_bh_thaw(dbmp, + infop, hp, oldest_bhp, + NULL); + h_locked = 0; + if (ret != 0) + goto err; + goto reuse; + } + if ((ret = __memp_bhfree(dbmp, + infop, mfp, hp, oldest_bhp, + BH_FREE_REUSE)) != 0) + goto err; + alloc_bhp = oldest_bhp; + h_locked = 0; + } + + DB_ASSERT(env, alloc_bhp == NULL || + !F_ISSET(alloc_bhp, BH_FROZEN)); + } + } + + /* We found the buffer or we're ready to copy -- we're done. */ + if (!(makecopy || F_ISSET(bhp, BH_FROZEN)) || alloc_bhp != NULL) + break; + + /* FALLTHROUGH */ + case FIRST_MISS: + /* + * We didn't find the buffer in our first check. Figure out + * if the page exists, and allocate structures so we can add + * the page to the buffer pool. + */ + if (h_locked) + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + + /* + * The buffer is not in the pool, so we don't need to free it. + */ + if (LF_ISSET(DB_MPOOL_FREE) && + (bhp == NULL || F_ISSET(bhp, BH_FREED) || !makecopy)) + goto done; + + if (bhp != NULL) + goto alloc; + +newpg: /* + * If DB_MPOOL_NEW is set, we have to allocate a page number. + * If neither DB_MPOOL_CREATE or DB_MPOOL_NEW is set, then + * it's an error to try and get a page past the end of file. 
+ */ + DB_ASSERT(env, !h_locked); + MUTEX_LOCK(env, mfp->mutex); + switch (flags) { + case DB_MPOOL_NEW: + extending = 1; + if (mfp->maxpgno != 0 && + mfp->last_pgno >= mfp->maxpgno) { + __db_errx(env, "%s: file limited to %lu pages", + __memp_fn(dbmfp), (u_long)mfp->maxpgno); + ret = ENOSPC; + } else + *pgnoaddr = mfp->last_pgno + 1; + break; + case DB_MPOOL_CREATE: + if (mfp->maxpgno != 0 && *pgnoaddr > mfp->maxpgno) { + __db_errx(env, "%s: file limited to %lu pages", + __memp_fn(dbmfp), (u_long)mfp->maxpgno); + ret = ENOSPC; + } else if (!extending) + extending = *pgnoaddr > mfp->last_pgno; + break; + default: + ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0; + break; + } + MUTEX_UNLOCK(env, mfp->mutex); + if (ret != 0) + goto err; + + /* + * !!! + * In the DB_MPOOL_NEW code path, hp, infop and c_mp have + * not yet been initialized. + */ + if (hp == NULL) { + MP_GET_BUCKET(env, + mfp, *pgnoaddr, &infop, hp, bucket, ret); + if (ret != 0) + goto err; + MUTEX_UNLOCK(env, hp->mtx_hash); + c_mp = infop->primary; + } + +alloc: /* Allocate a new buffer header and data space. */ + if (alloc_bhp == NULL && (ret = + __memp_alloc(dbmp, infop, mfp, 0, NULL, &alloc_bhp)) != 0) + goto err; + + /* Initialize enough so we can call __memp_bhfree. */ + alloc_bhp->flags = 0; + atomic_init(&alloc_bhp->ref, 1); +#ifdef DIAGNOSTIC + if ((uintptr_t)alloc_bhp->buf & (sizeof(size_t) - 1)) { + __db_errx(env, + "DB_MPOOLFILE->get: buffer data is NOT size_t aligned"); + ret = __env_panic(env, EINVAL); + goto err; + } +#endif + + /* + * If we're doing copy-on-write, we will already have the + * buffer header. In that case, we don't need to search again. + */ + if (bhp != NULL) + break; + + /* + * If we are extending the file, we'll need the mfp lock + * again. + */ + if (extending) + MUTEX_LOCK(env, mfp->mutex); + + /* + * DB_MPOOL_NEW does not guarantee you a page unreferenced by + * any other thread of control. (That guarantee is interesting + * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller + * did not specify the page number, and so, may reasonably not + * have any way to lock the page outside of mpool.) Regardless, + * if we allocate the page, and some other thread of control + * requests the page by number, we will not detect that and the + * thread of control that allocated using DB_MPOOL_NEW may not + * have a chance to initialize the page. (Note: we *could* + * detect this case if we set a flag in the buffer header which + * guaranteed that no gets of the page would succeed until the + * reference count went to 0, that is, until the creating page + * put the page.) What we do guarantee is that if two threads + * of control are both doing DB_MPOOL_NEW calls, they won't + * collide, that is, they won't both get the same page. + * + * There's a possibility that another thread allocated the page + * we were planning to allocate while we were off doing buffer + * allocation. We can do that by making sure the page number + * we were going to use is still available. If it's not, then + * we check to see if the next available page number hashes to + * the same mpool region as the old one -- if it does, we can + * continue, otherwise, we have to start over. + */ + if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) { + *pgnoaddr = mfp->last_pgno + 1; + MP_GET_REGION(dbmfp, *pgnoaddr, &t_infop, ret); + if (ret != 0) + goto err; + if (t_infop != infop) { + /* + * flags == DB_MPOOL_NEW, so extending is set + * and we're holding the mfp locked. 
+ */ + MUTEX_UNLOCK(env, mfp->mutex); + goto newpg; + } + } + + /* + * We released the mfp lock, so another thread might have + * extended the file. Update the last_pgno and initialize + * the file, as necessary, if we extended the file. + */ + if (extending) { + if (*pgnoaddr > mfp->last_pgno) + mfp->last_pgno = *pgnoaddr; + MUTEX_UNLOCK(env, mfp->mutex); + if (ret != 0) + goto err; + } + goto retry; + case SECOND_FOUND: + /* + * We allocated buffer space for the requested page, but then + * found the page in the buffer cache on our second check. + * That's OK -- we can use the page we found in the pool, + * unless DB_MPOOL_NEW is set. If we're about to copy-on-write, + * this is exactly the situation we want. + * + * For multiversion files, we may have left some pages in cache + * beyond the end of a file after truncating. In that case, we + * would get to here with extending set. If so, we need to + * insert the new page in the version chain similar to when + * we copy on write. + */ + if (F_ISSET(bhp, BH_FREED) && + (flags == DB_MPOOL_NEW || flags == DB_MPOOL_CREATE)) + goto revive; + else if (flags == DB_MPOOL_FREE) + goto freebuf; + else if (makecopy || F_ISSET(bhp, BH_FROZEN)) + break; + + /* + * We can't use the page we found in the pool if DB_MPOOL_NEW + * was set. (For details, see the above comment beginning + * "DB_MPOOL_NEW does not guarantee you a page unreferenced by + * any other thread of control".) If DB_MPOOL_NEW is set, we + * release our pin on this particular buffer, and try to get + * another one. + */ + if (flags == DB_MPOOL_NEW) { + DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) != 0); + atomic_dec(env, &bhp->ref); + b_incr = 0; + if (F_ISSET(bhp, BH_EXCLUSIVE)) + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + b_lock = 0; + bhp = NULL; + goto newpg; + } + + break; + case SECOND_MISS: + /* + * We allocated buffer space for the requested page, and found + * the page still missing on our second pass through the buffer + * cache. Instantiate the page. + */ + DB_ASSERT(env, alloc_bhp != NULL); + bhp = alloc_bhp; + alloc_bhp = NULL; + + /* + * Initialize all the BH and hash bucket fields so we can call + * __memp_bhfree if an error occurs. + * + * Append the buffer to the tail of the bucket list. + */ + bhp->priority = UINT32_MAX; + bhp->pgno = *pgnoaddr; + bhp->mf_offset = mf_offset; + bhp->bucket = bucket; + bhp->region = (int)(infop - dbmp->reginfo); + bhp->td_off = INVALID_ROFF; + SH_CHAIN_INIT(bhp, vc); + bhp->flags = 0; + + /* + * Reference the buffer and lock exclusive. We either + * need to read the buffer or create it from scratch + * and don't want anyone looking at it till we do. + */ + MUTEX_LOCK(env, bhp->mtx_buf); + b_lock = 1; + F_SET(bhp, BH_EXCLUSIVE); + b_incr = 1; + + /* We created a new page, it starts dirty. */ + if (extending) { + atomic_inc(env, &hp->hash_page_dirty); + F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE); + } + + MUTEX_REQUIRED(env, hp->mtx_hash); + SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, bhp, hq, __bh); + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + + /* + * If we created the page, zero it out. If we didn't create + * the page, read from the backing file. + * + * !!! + * DB_MPOOL_NEW doesn't call the pgin function. + * + * If DB_MPOOL_CREATE is used, then the application's pgin + * function has to be able to handle pages of 0's -- if it + * uses DB_MPOOL_NEW, it can detect all of its page creates, + * and not bother. 
+ * + * If we're running in diagnostic mode, smash any bytes on the + * page that are unknown quantities for the caller. + * + * Otherwise, read the page into memory, optionally creating it + * if DB_MPOOL_CREATE is set. + */ + if (extending) { + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, + PROT_READ | PROT_WRITE); + memset(bhp->buf, 0, + (mfp->clear_len == DB_CLEARLEN_NOTSET) ? + mfp->stat.st_pagesize : mfp->clear_len); +#if defined(DIAGNOSTIC) || defined(UMRW) + if (mfp->clear_len != DB_CLEARLEN_NOTSET) + memset(bhp->buf + mfp->clear_len, CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + + if (flags == DB_MPOOL_CREATE && mfp->ftype != 0 && + (ret = __memp_pg(dbmfp, + bhp->pgno, bhp->buf, 1)) != 0) + goto err; + + STAT(++mfp->stat.st_page_create); + } else { + F_SET(bhp, BH_TRASH); + STAT(++mfp->stat.st_cache_miss); + } + + makecopy = mvcc && dirty && !extending; + + /* Increment buffer count referenced by MPOOLFILE. */ + MUTEX_LOCK(env, mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(env, mfp->mutex); + } + + DB_ASSERT(env, bhp != NULL && BH_REFCOUNT(bhp) != 0 && b_lock); + DB_ASSERT(env, !F_ISSET(bhp, BH_FROZEN) || !F_ISSET(bhp, BH_FREED) || + makecopy); + + /* We've got a buffer header we're re-instantiating. */ + if (F_ISSET(bhp, BH_FROZEN) && !F_ISSET(bhp, BH_FREED)) { + if (alloc_bhp == NULL) + goto reuse; + + /* + * To thaw the buffer, we must hold the hash bucket mutex, + * and the call to __memp_bh_thaw will release it. + */ + if (h_locked == 0) + MUTEX_LOCK(env, hp->mtx_hash); + h_locked = 1; + + /* + * If the empty buffer has been filled in the meantime, don't + * overwrite it. + */ + if (F_ISSET(bhp, BH_THAWED)) { + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + goto thawed; + } + + ret = __memp_bh_thaw(dbmp, infop, hp, bhp, alloc_bhp); + bhp = NULL; + b_lock = h_locked = 0; + if (ret != 0) + goto err; + bhp = alloc_bhp; + alloc_bhp = NULL; + MUTEX_REQUIRED(env, bhp->mtx_buf); + b_incr = b_lock = 1; + } + + /* + * BH_TRASH -- + * The buffer we found may need to be filled from the disk. + * + * It's possible for the read function to fail, which means we fail + * as well. Discard the buffer on failure unless another thread + * is waiting on our I/O to complete. It's OK to leave the buffer + * around, as the waiting thread will see the BH_TRASH flag set, + * and will also attempt to discard it. If there's a waiter, + * we need to decrement our reference count. + */ + if (F_ISSET(bhp, BH_TRASH) && + flags != DB_MPOOL_FREE && !F_ISSET(bhp, BH_FREED)) { + if ((ret = __memp_pgread(dbmfp, + bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0) + goto err; + DB_ASSERT(env, read_lsnp != NULL || !SH_CHAIN_HASNEXT(bhp, vc)); + } + + /* Copy-on-write. */ + if (makecopy) { + /* + * If we read a page from disk that we want to modify, we now + * need to make copy, so we now need to allocate another buffer + * to hold the new copy. 
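+	 *
+	 * The net effect, sketched only for illustration, is a short
+	 * version chain for the page: the new copy becomes the version
+	 * this transaction writes, while the old buffer stays visible to
+	 * older readers, roughly
+	 *
+	 *	before:	hash bucket -> bhp
+	 *	after:	hash bucket -> alloc_bhp -> bhp
+	 *		               (new copy)    (older readers)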
+ */ + if (alloc_bhp == NULL) + goto reuse; + + DB_ASSERT(env, bhp != NULL && alloc_bhp != bhp); + DB_ASSERT(env, txn != NULL || + (F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED))); + DB_ASSERT(env, (extending || flags == DB_MPOOL_FREE || + F_ISSET(bhp, BH_FREED)) || + !F_ISSET(bhp, BH_FROZEN | BH_TRASH)); + MUTEX_REQUIRED(env, bhp->mtx_buf); + + if (BH_REFCOUNT(bhp) == 1) + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, + PROT_READ); + + atomic_init(&alloc_bhp->ref, 1); + MUTEX_LOCK(env, alloc_bhp->mtx_buf); + alloc_bhp->priority = bhp->priority; + alloc_bhp->pgno = bhp->pgno; + alloc_bhp->bucket = bhp->bucket; + alloc_bhp->region = bhp->region; + alloc_bhp->mf_offset = bhp->mf_offset; + alloc_bhp->td_off = INVALID_ROFF; + if (txn == NULL) { + DB_ASSERT(env, + F_ISSET(bhp, BH_FROZEN) && F_ISSET(bhp, BH_FREED)); + if (bhp->td_off != INVALID_ROFF && (ret = + __memp_bh_settxn(dbmp, mfp, alloc_bhp, + BH_OWNER(env, bhp))) != 0) + goto err; + } else if ((ret = + __memp_bh_settxn(dbmp, mfp, alloc_bhp, td)) != 0) + goto err; + MVCC_MPROTECT(alloc_bhp->buf, mfp->stat.st_pagesize, + PROT_READ | PROT_WRITE); + if (extending || + F_ISSET(bhp, BH_FREED) || flags == DB_MPOOL_FREE) { + memset(alloc_bhp->buf, 0, + (mfp->clear_len == DB_CLEARLEN_NOTSET) ? + mfp->stat.st_pagesize : mfp->clear_len); +#if defined(DIAGNOSTIC) || defined(UMRW) + if (mfp->clear_len != DB_CLEARLEN_NOTSET) + memset(alloc_bhp->buf + mfp->clear_len, + CLEAR_BYTE, + mfp->stat.st_pagesize - mfp->clear_len); +#endif + } else + memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize); + MVCC_MPROTECT(alloc_bhp->buf, mfp->stat.st_pagesize, 0); + + if (h_locked == 0) + MUTEX_LOCK(env, hp->mtx_hash); + MUTEX_REQUIRED(env, hp->mtx_hash); + h_locked = 1; + + alloc_bhp->flags = BH_EXCLUSIVE | + ((flags == DB_MPOOL_FREE) ? BH_FREED : + F_ISSET(bhp, BH_DIRTY | BH_DIRTY_CREATE)); + DB_ASSERT(env, flags != DB_MPOOL_FREE || + !F_ISSET(bhp, BH_DIRTY)); + F_CLR(bhp, BH_DIRTY | BH_DIRTY_CREATE); + DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); + SH_CHAIN_INSERT_AFTER(bhp, alloc_bhp, vc, __bh); + SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, + bhp, alloc_bhp, hq, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + DB_ASSERT(env, b_incr && BH_REFCOUNT(bhp) > 0); + if (atomic_dec(env, &bhp->ref) == 0) { + bhp->priority = c_mp->lru_count; + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0); + } + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + + bhp = alloc_bhp; + DB_ASSERT(env, BH_REFCOUNT(bhp) > 0); + b_incr = 1; + MUTEX_REQUIRED(env, bhp->mtx_buf); + b_lock = 1; + + if (alloc_bhp != oldest_bhp) { + MUTEX_LOCK(env, mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(env, mfp->mutex); + } + + alloc_bhp = NULL; + } else if (mvcc && extending && + (ret = __memp_bh_settxn(dbmp, mfp, bhp, td)) != 0) + goto err; + + if (flags == DB_MPOOL_FREE) { + DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); + /* If we have created an empty buffer, it is not returned. */ + if (!F_ISSET(bhp, BH_FREED)) + goto freebuf; + goto done; + } + + /* + * Free the allocated memory, we no longer need it. + */ + if (alloc_bhp != NULL) { + if ((ret = __memp_bhfree(dbmp, infop, NULL, + NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED)) != 0) + goto err; + alloc_bhp = NULL; + } + + if (dirty || extending || + (F_ISSET(bhp, BH_FREED) && + (flags == DB_MPOOL_CREATE || flags == DB_MPOOL_NEW))) { + MUTEX_REQUIRED(env, bhp->mtx_buf); + if (F_ISSET(bhp, BH_FREED)) { + memset(bhp->buf, 0, + (mfp->clear_len == DB_CLEARLEN_NOTSET) ? 
+ mfp->stat.st_pagesize : mfp->clear_len); + F_CLR(bhp, BH_FREED); + } + if (!F_ISSET(bhp, BH_DIRTY)) { +#ifdef DIAGNOSTIC + MUTEX_LOCK(env, hp->mtx_hash); +#endif + DB_ASSERT(env, !SH_CHAIN_HASNEXT(bhp, vc)); + atomic_inc(env, &hp->hash_page_dirty); + F_SET(bhp, BH_DIRTY); +#ifdef DIAGNOSTIC + MUTEX_UNLOCK(env, hp->mtx_hash); +#endif + } + } else if (F_ISSET(bhp, BH_EXCLUSIVE)) { + F_CLR(bhp, BH_EXCLUSIVE); +#ifdef HAVE_SHARED_LATCHES + MUTEX_UNLOCK(env, bhp->mtx_buf); + MUTEX_READLOCK(env, bhp->mtx_buf); + /* + * If another thread has dirtied the page while we + * switched locks, we have to go through it all again. + */ + if (SH_CHAIN_HASNEXT(bhp, vc) && read_lsnp == NULL) { + atomic_dec(env, &bhp->ref); + b_incr = 0; + MUTEX_UNLOCK(env, bhp->mtx_buf); + b_lock = 0; + bhp = NULL; + goto retry; + } +#endif + } + + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | + (dirty || extending || F_ISSET(bhp, BH_DIRTY) ? + PROT_WRITE : 0)); + +#ifdef DIAGNOSTIC + MUTEX_LOCK(env, hp->mtx_hash); + { + BH *next_bhp = SH_CHAIN_NEXT(bhp, vc, __bh); + + DB_ASSERT(env, !mfp->multiversion || read_lsnp != NULL || + next_bhp == NULL); + DB_ASSERT(env, !mvcc || read_lsnp == NULL || + bhp->td_off == INVALID_ROFF || BH_OWNED_BY(env, bhp, txn) || + (BH_VISIBLE(env, bhp, read_lsnp, vlsn) && + (next_bhp == NULL || F_ISSET(next_bhp, BH_FROZEN) || + (next_bhp->td_off != INVALID_ROFF && + (BH_OWNER(env, next_bhp)->status != TXN_COMMITTED || + IS_ZERO_LSN(BH_OWNER(env, next_bhp)->last_lsn) || + !BH_VISIBLE(env, next_bhp, read_lsnp, vlsn)))))); + } + MUTEX_UNLOCK(env, hp->mtx_hash); +#endif + + /* + * Record this pin for this thread. Holding the page pinned + * without recording the pin is ok since we do not recover from + * a death from within the library itself. + */ + if (ip != NULL) { + reginfo = env->reginfo; + if (ip->dbth_pincount == ip->dbth_pinmax) { + pinmax = ip->dbth_pinmax; + renv = reginfo->primary; + MUTEX_LOCK(env, renv->mtx_regenv); + if ((ret = __env_alloc(reginfo, + 2 * pinmax * sizeof(PIN_LIST), &list)) != 0) { + MUTEX_UNLOCK(env, renv->mtx_regenv); + goto err; + } + + memcpy(list, R_ADDR(reginfo, ip->dbth_pinlist), + pinmax * sizeof(PIN_LIST)); + memset(&list[pinmax], 0, pinmax * sizeof(PIN_LIST)); + list_off = R_OFFSET(reginfo, list); + list = R_ADDR(reginfo, ip->dbth_pinlist); + ip->dbth_pinmax = 2 * pinmax; + ip->dbth_pinlist = list_off; + if (list != ip->dbth_pinarray) + __env_alloc_free(reginfo, list); + MUTEX_UNLOCK(env, renv->mtx_regenv); + } + list = R_ADDR(reginfo, ip->dbth_pinlist); + for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) + if (lp->b_ref == INVALID_ROFF) + break; + + ip->dbth_pincount++; + lp->b_ref = R_OFFSET(infop, bhp); + lp->region = (int)(infop - dbmp->reginfo); + } + +#ifdef DIAGNOSTIC + /* Update the file's pinned reference count. */ + MPOOL_SYSTEM_LOCK(env); + ++dbmfp->pinref; + MPOOL_SYSTEM_UNLOCK(env); + + /* + * We want to switch threads as often as possible, and at awkward + * times. Yield every time we get a new page to ensure contention. + */ + if (F_ISSET(env->dbenv, DB_ENV_YIELDCPU)) + __os_yield(env, 0, 0); +#endif + + DB_ASSERT(env, alloc_bhp == NULL); + DB_ASSERT(env, !(dirty || extending) || + atomic_read(&hp->hash_page_dirty) > 0); + DB_ASSERT(env, BH_REFCOUNT(bhp) > 0 && + !F_ISSET(bhp, BH_FREED | BH_FROZEN | BH_TRASH)); + + *(void **)addrp = bhp->buf; + return (0); + +done: +err: /* + * We should only get to here with ret == 0 if freeing a buffer. + * In that case, check that it has in fact been freed. 
+ */ + DB_ASSERT(env, ret != 0 || flags != DB_MPOOL_FREE || bhp == NULL || + (F_ISSET(bhp, BH_FREED) && !SH_CHAIN_HASNEXT(bhp, vc))); + + if (bhp != NULL) { + if (b_incr) + atomic_dec(env, &bhp->ref); + if (b_lock) { + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + } + } + + if (h_locked) + MUTEX_UNLOCK(env, hp->mtx_hash); + + /* If alloc_bhp is set, free the memory. */ + if (alloc_bhp != NULL) + (void)__memp_bhfree(dbmp, infop, NULL, + NULL, alloc_bhp, BH_FREE_FREEMEM | BH_FREE_UNLOCKED); + + return (ret); +} diff --git a/mp/mp_fmethod.c b/mp/mp_fmethod.c new file mode 100644 index 0000000..2acc282 --- /dev/null +++ b/mp/mp_fmethod.c @@ -0,0 +1,555 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" + +static int __memp_get_clear_len __P((DB_MPOOLFILE *, u_int32_t *)); +static int __memp_get_lsn_offset __P((DB_MPOOLFILE *, int32_t *)); +static int __memp_get_maxsize __P((DB_MPOOLFILE *, u_int32_t *, u_int32_t *)); +static int __memp_set_maxsize __P((DB_MPOOLFILE *, u_int32_t, u_int32_t)); +static int __memp_get_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY *)); +static int __memp_set_priority __P((DB_MPOOLFILE *, DB_CACHE_PRIORITY)); + +/* + * __memp_fcreate_pp -- + * ENV->memp_fcreate pre/post processing. + * + * PUBLIC: int __memp_fcreate_pp __P((DB_ENV *, DB_MPOOLFILE **, u_int32_t)); + */ +int +__memp_fcreate_pp(dbenv, retp, flags) + DB_ENV *dbenv; + DB_MPOOLFILE **retp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + /* Validate arguments. */ + if ((ret = __db_fchk(env, "DB_ENV->memp_fcreate", flags, 0)) != 0) + return (ret); + + if (REP_ON(env)) { + __db_errx(env, + "DB_ENV->memp_fcreate: method not permitted when replication is configured"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + ret = __memp_fcreate(env, retp); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_fcreate -- + * ENV->memp_fcreate. + * + * PUBLIC: int __memp_fcreate __P((ENV *, DB_MPOOLFILE **)); + */ +int +__memp_fcreate(env, retp) + ENV *env; + DB_MPOOLFILE **retp; +{ + DB_MPOOLFILE *dbmfp; + int ret; + + /* Allocate and initialize the per-process structure. */ + if ((ret = __os_calloc(env, 1, sizeof(DB_MPOOLFILE), &dbmfp)) != 0) + return (ret); + + dbmfp->ref = 1; + dbmfp->lsn_offset = DB_LSN_OFF_NOTSET; + dbmfp->env = env; + dbmfp->mfp = INVALID_ROFF; + + dbmfp->close = __memp_fclose_pp; + dbmfp->get = __memp_fget_pp; + dbmfp->get_clear_len = __memp_get_clear_len; + dbmfp->get_fileid = __memp_get_fileid; + dbmfp->get_flags = __memp_get_flags; + dbmfp->get_ftype = __memp_get_ftype; + dbmfp->get_last_pgno = __memp_get_last_pgno; + dbmfp->get_lsn_offset = __memp_get_lsn_offset; + dbmfp->get_maxsize = __memp_get_maxsize; + dbmfp->get_pgcookie = __memp_get_pgcookie; + dbmfp->get_priority = __memp_get_priority; + dbmfp->open = __memp_fopen_pp; + dbmfp->put = __memp_fput_pp; + dbmfp->set_clear_len = __memp_set_clear_len; + dbmfp->set_fileid = __memp_set_fileid; + dbmfp->set_flags = __memp_set_flags; + dbmfp->set_ftype = __memp_set_ftype; + dbmfp->set_lsn_offset = __memp_set_lsn_offset; + dbmfp->set_maxsize = __memp_set_maxsize; + dbmfp->set_pgcookie = __memp_set_pgcookie; + dbmfp->set_priority = __memp_set_priority; + dbmfp->sync = __memp_fsync_pp; + + *retp = dbmfp; + return (0); +} + +/* + * __memp_get_clear_len -- + * Get the clear length. 
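+ *
+ * A minimal usage sketch (error handling omitted): an application that
+ * only needs the first 32 bytes of each new page cleared could do
+ *
+ *	u_int32_t len;
+ *
+ *	(void)mpf->set_clear_len(mpf, 32);	-- before mpf->open
+ *	(void)mpf->get_clear_len(mpf, &len);	-- len is now 32
+ *
+ * where mpf is a DB_MPOOLFILE created by DB_ENV->memp_fcreate and 32 is
+ * only an example value.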
+ */ +static int +__memp_get_clear_len(dbmfp, clear_lenp) + DB_MPOOLFILE *dbmfp; + u_int32_t *clear_lenp; +{ + *clear_lenp = dbmfp->clear_len; + return (0); +} + +/* + * __memp_set_clear_len -- + * DB_MPOOLFILE->set_clear_len. + * + * PUBLIC: int __memp_set_clear_len __P((DB_MPOOLFILE *, u_int32_t)); + */ +int +__memp_set_clear_len(dbmfp, clear_len) + DB_MPOOLFILE *dbmfp; + u_int32_t clear_len; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_clear_len"); + + dbmfp->clear_len = clear_len; + return (0); +} + +/* + * __memp_get_fileid -- + * DB_MPOOLFILE->get_fileid. + * + * PUBLIC: int __memp_get_fileid __P((DB_MPOOLFILE *, u_int8_t *)); + */ +int +__memp_get_fileid(dbmfp, fileid) + DB_MPOOLFILE *dbmfp; + u_int8_t *fileid; +{ + if (!F_ISSET(dbmfp, MP_FILEID_SET)) { + __db_errx(dbmfp->env, "get_fileid: file ID not set"); + return (EINVAL); + } + + memcpy(fileid, dbmfp->fileid, DB_FILE_ID_LEN); + return (0); +} + +/* + * __memp_set_fileid -- + * DB_MPOOLFILE->set_fileid. + * + * PUBLIC: int __memp_set_fileid __P((DB_MPOOLFILE *, u_int8_t *)); + */ +int +__memp_set_fileid(dbmfp, fileid) + DB_MPOOLFILE *dbmfp; + u_int8_t *fileid; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_fileid"); + + memcpy(dbmfp->fileid, fileid, DB_FILE_ID_LEN); + F_SET(dbmfp, MP_FILEID_SET); + + return (0); +} + +/* + * __memp_get_flags -- + * Get the DB_MPOOLFILE flags; + * + * PUBLIC: int __memp_get_flags __P((DB_MPOOLFILE *, u_int32_t *)); + */ +int +__memp_get_flags(dbmfp, flagsp) + DB_MPOOLFILE *dbmfp; + u_int32_t *flagsp; +{ + MPOOLFILE *mfp; + + mfp = dbmfp->mfp; + + *flagsp = 0; + + if (mfp == NULL) + *flagsp = FLD_ISSET(dbmfp->config_flags, + DB_MPOOL_NOFILE | DB_MPOOL_UNLINK); + else { + if (mfp->no_backing_file) + FLD_SET(*flagsp, DB_MPOOL_NOFILE); + if (mfp->unlink_on_close) + FLD_SET(*flagsp, DB_MPOOL_UNLINK); + } + return (0); +} + +/* + * __memp_set_flags -- + * Set the DB_MPOOLFILE flags; + * + * PUBLIC: int __memp_set_flags __P((DB_MPOOLFILE *, u_int32_t, int)); + */ +int +__memp_set_flags(dbmfp, flags, onoff) + DB_MPOOLFILE *dbmfp; + u_int32_t flags; + int onoff; +{ + ENV *env; + MPOOLFILE *mfp; + int ret; + + env = dbmfp->env; + mfp = dbmfp->mfp; + + switch (flags) { + case DB_MPOOL_NOFILE: + if (mfp == NULL) + if (onoff) + FLD_SET(dbmfp->config_flags, DB_MPOOL_NOFILE); + else + FLD_CLR(dbmfp->config_flags, DB_MPOOL_NOFILE); + else + mfp->no_backing_file = onoff; + break; + case DB_MPOOL_UNLINK: + if (mfp == NULL) + if (onoff) + FLD_SET(dbmfp->config_flags, DB_MPOOL_UNLINK); + else + FLD_CLR(dbmfp->config_flags, DB_MPOOL_UNLINK); + else + mfp->unlink_on_close = onoff; + break; + default: + if ((ret = __db_fchk(env, "DB_MPOOLFILE->set_flags", + flags, DB_MPOOL_NOFILE | DB_MPOOL_UNLINK)) != 0) + return (ret); + break; + } + return (0); +} + +/* + * __memp_get_ftype -- + * Get the file type (as registered). + * + * PUBLIC: int __memp_get_ftype __P((DB_MPOOLFILE *, int *)); + */ +int +__memp_get_ftype(dbmfp, ftypep) + DB_MPOOLFILE *dbmfp; + int *ftypep; +{ + *ftypep = dbmfp->ftype; + return (0); +} + +/* + * __memp_set_ftype -- + * DB_MPOOLFILE->set_ftype. + * + * PUBLIC: int __memp_set_ftype __P((DB_MPOOLFILE *, int)); + */ +int +__memp_set_ftype(dbmfp, ftype) + DB_MPOOLFILE *dbmfp; + int ftype; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_ftype"); + + dbmfp->ftype = ftype; + return (0); +} + +/* + * __memp_get_lsn_offset -- + * Get the page's LSN offset. 
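+ *
+ * The offset tells the cache where each page stores its log sequence
+ * number, so that a dirty page is never written before the log records
+ * describing it (write-ahead logging).  A sketch, assuming a page
+ * layout that keeps the LSN at the start of the page:
+ *
+ *	(void)mpf->set_lsn_offset(mpf, 0);	-- before mpf->open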
+ */ +static int +__memp_get_lsn_offset(dbmfp, lsn_offsetp) + DB_MPOOLFILE *dbmfp; + int32_t *lsn_offsetp; +{ + *lsn_offsetp = dbmfp->lsn_offset; + return (0); +} + +/* + * __memp_set_lsn_offset -- + * Set the page's LSN offset. + * + * PUBLIC: int __memp_set_lsn_offset __P((DB_MPOOLFILE *, int32_t)); + */ +int +__memp_set_lsn_offset(dbmfp, lsn_offset) + DB_MPOOLFILE *dbmfp; + int32_t lsn_offset; +{ + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_lsn_offset"); + + dbmfp->lsn_offset = lsn_offset; + return (0); +} + +/* + * __memp_get_maxsize -- + * Get the file's maximum size. + */ +static int +__memp_get_maxsize(dbmfp, gbytesp, bytesp) + DB_MPOOLFILE *dbmfp; + u_int32_t *gbytesp, *bytesp; +{ + ENV *env; + MPOOLFILE *mfp; + + if ((mfp = dbmfp->mfp) == NULL) { + *gbytesp = dbmfp->gbytes; + *bytesp = dbmfp->bytes; + } else { + env = dbmfp->env; + + MUTEX_LOCK(env, mfp->mutex); + *gbytesp = (u_int32_t) + (mfp->maxpgno / (GIGABYTE / mfp->stat.st_pagesize)); + *bytesp = (u_int32_t) + ((mfp->maxpgno % (GIGABYTE / mfp->stat.st_pagesize)) * + mfp->stat.st_pagesize); + MUTEX_UNLOCK(env, mfp->mutex); + } + + return (0); +} + +/* + * __memp_set_maxsize -- + * Set the file's maximum size. + */ +static int +__memp_set_maxsize(dbmfp, gbytes, bytes) + DB_MPOOLFILE *dbmfp; + u_int32_t gbytes, bytes; +{ + ENV *env; + MPOOLFILE *mfp; + + if ((mfp = dbmfp->mfp) == NULL) { + dbmfp->gbytes = gbytes; + dbmfp->bytes = bytes; + } else { + env = dbmfp->env; + + MUTEX_LOCK(env, mfp->mutex); + mfp->maxpgno = (db_pgno_t) + (gbytes * (GIGABYTE / mfp->stat.st_pagesize)); + mfp->maxpgno += (db_pgno_t) + ((bytes + mfp->stat.st_pagesize - 1) / + mfp->stat.st_pagesize); + MUTEX_UNLOCK(env, mfp->mutex); + } + + return (0); +} + +/* + * __memp_get_pgcookie -- + * Get the pgin/pgout cookie. + * + * PUBLIC: int __memp_get_pgcookie __P((DB_MPOOLFILE *, DBT *)); + */ +int +__memp_get_pgcookie(dbmfp, pgcookie) + DB_MPOOLFILE *dbmfp; + DBT *pgcookie; +{ + if (dbmfp->pgcookie == NULL) { + pgcookie->size = 0; + pgcookie->data = ""; + } else + memcpy(pgcookie, dbmfp->pgcookie, sizeof(DBT)); + return (0); +} + +/* + * __memp_set_pgcookie -- + * Set the pgin/pgout cookie. + * + * PUBLIC: int __memp_set_pgcookie __P((DB_MPOOLFILE *, DBT *)); + */ +int +__memp_set_pgcookie(dbmfp, pgcookie) + DB_MPOOLFILE *dbmfp; + DBT *pgcookie; +{ + DBT *cookie; + ENV *env; + int ret; + + MPF_ILLEGAL_AFTER_OPEN(dbmfp, "DB_MPOOLFILE->set_pgcookie"); + env = dbmfp->env; + + if ((ret = __os_calloc(env, 1, sizeof(*cookie), &cookie)) != 0) + return (ret); + if ((ret = __os_malloc(env, pgcookie->size, &cookie->data)) != 0) { + __os_free(env, cookie); + return (ret); + } + + memcpy(cookie->data, pgcookie->data, pgcookie->size); + cookie->size = pgcookie->size; + + dbmfp->pgcookie = cookie; + return (0); +} + +/* + * __memp_get_priority -- + * Set the cache priority for pages from this file. 
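+ *
+ * (This is the "get" half of the pair; __memp_set_priority below sets
+ * the value.)  A usage sketch, biasing the cache toward keeping this
+ * file's pages over other files' pages:
+ *
+ *	DB_CACHE_PRIORITY pri;
+ *
+ *	(void)mpf->set_priority(mpf, DB_PRIORITY_HIGH);
+ *	(void)mpf->get_priority(mpf, &pri);	-- pri == DB_PRIORITY_HIGH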
+ */ +static int +__memp_get_priority(dbmfp, priorityp) + DB_MPOOLFILE *dbmfp; + DB_CACHE_PRIORITY *priorityp; +{ + switch (dbmfp->priority) { + case MPOOL_PRI_VERY_LOW: + *priorityp = DB_PRIORITY_VERY_LOW; + break; + case MPOOL_PRI_LOW: + *priorityp = DB_PRIORITY_LOW; + break; + case MPOOL_PRI_DEFAULT: + *priorityp = DB_PRIORITY_DEFAULT; + break; + case MPOOL_PRI_HIGH: + *priorityp = DB_PRIORITY_HIGH; + break; + case MPOOL_PRI_VERY_HIGH: + *priorityp = DB_PRIORITY_VERY_HIGH; + break; + default: + __db_errx(dbmfp->env, + "DB_MPOOLFILE->get_priority: unknown priority value: %d", + dbmfp->priority); + return (EINVAL); + } + + return (0); +} + +/* + * __memp_set_priority -- + * Set the cache priority for pages from this file. + */ +static int +__memp_set_priority(dbmfp, priority) + DB_MPOOLFILE *dbmfp; + DB_CACHE_PRIORITY priority; +{ + switch (priority) { + case DB_PRIORITY_VERY_LOW: + dbmfp->priority = MPOOL_PRI_VERY_LOW; + break; + case DB_PRIORITY_LOW: + dbmfp->priority = MPOOL_PRI_LOW; + break; + case DB_PRIORITY_DEFAULT: + dbmfp->priority = MPOOL_PRI_DEFAULT; + break; + case DB_PRIORITY_HIGH: + dbmfp->priority = MPOOL_PRI_HIGH; + break; + case DB_PRIORITY_VERY_HIGH: + dbmfp->priority = MPOOL_PRI_VERY_HIGH; + break; + default: + __db_errx(dbmfp->env, + "DB_MPOOLFILE->set_priority: unknown priority value: %d", + priority); + return (EINVAL); + } + + /* Update the underlying file if we've already opened it. */ + if (dbmfp->mfp != NULL) + dbmfp->mfp->priority = dbmfp->priority; + + return (0); +} + +/* + * __memp_get_last_pgno -- + * Return the page number of the last page in the file. + * + * !!! + * The method is undocumented, but the handle is exported, users occasionally + * ask for it. + * + * PUBLIC: int __memp_get_last_pgno __P((DB_MPOOLFILE *, db_pgno_t *)); + */ +int +__memp_get_last_pgno(dbmfp, pgnoaddr) + DB_MPOOLFILE *dbmfp; + db_pgno_t *pgnoaddr; +{ + ENV *env; + MPOOLFILE *mfp; + + env = dbmfp->env; + mfp = dbmfp->mfp; + + MUTEX_LOCK(env, mfp->mutex); + *pgnoaddr = mfp->last_pgno; + MUTEX_UNLOCK(env, mfp->mutex); + + return (0); +} + +/* + * __memp_fn -- + * On errors we print whatever is available as the file name. + * + * PUBLIC: char * __memp_fn __P((DB_MPOOLFILE *)); + */ +char * +__memp_fn(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + return (__memp_fns(dbmfp->env->mp_handle, dbmfp->mfp)); +} + +/* + * __memp_fns -- + * On errors we print whatever is available as the file name. + * + * PUBLIC: char * __memp_fns __P((DB_MPOOL *, MPOOLFILE *)); + * + */ +char * +__memp_fns(dbmp, mfp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; +{ + if (mfp == NULL || mfp->path_off == 0) + return ((char *)"unknown"); + + return ((char *)R_ADDR(dbmp->reginfo, mfp->path_off)); +} diff --git a/mp/mp_fopen.c b/mp/mp_fopen.c new file mode 100644 index 0000000..48a7d5b --- /dev/null +++ b/mp/mp_fopen.c @@ -0,0 +1,1100 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/db_page.h" +#include "dbinc/hash.h" + +static int __memp_mpf_alloc __P((DB_MPOOL *, + DB_MPOOLFILE *, const char *, u_int32_t, u_int32_t, MPOOLFILE **)); +static int __memp_mpf_find __P((ENV *, + DB_MPOOLFILE *, DB_MPOOL_HASH *, const char *, u_int32_t, MPOOLFILE **)); + +/* + * __memp_fopen_pp -- + * DB_MPOOLFILE->open pre/post processing. 
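+ *
+ * A sketch of a typical create/open/get/put cycle from the application
+ * side (error handling omitted; the file name and the 4KB page size are
+ * only examples):
+ *
+ *	DB_MPOOLFILE *mpf;
+ *	db_pgno_t pgno = 0;
+ *	void *page;
+ *
+ *	(void)dbenv->memp_fcreate(dbenv, &mpf, 0);
+ *	(void)mpf->open(mpf, "myfile", DB_CREATE, 0, 4096);
+ *	(void)mpf->get(mpf, &pgno, NULL, DB_MPOOL_CREATE, &page);
+ *	-- ... read or modify the page ...
+ *	(void)mpf->put(mpf, page, DB_PRIORITY_UNCHANGED, 0);
+ *	(void)mpf->close(mpf, 0);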
+ * + * PUBLIC: int __memp_fopen_pp + * PUBLIC: __P((DB_MPOOLFILE *, const char *, u_int32_t, int, size_t)); + */ +int +__memp_fopen_pp(dbmfp, path, flags, mode, pagesize) + DB_MPOOLFILE *dbmfp; + const char *path; + u_int32_t flags; + int mode; + size_t pagesize; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbmfp->env; + + /* Validate arguments. */ + if ((ret = __db_fchk(env, "DB_MPOOLFILE->open", flags, + DB_CREATE | DB_DIRECT | DB_EXTENT | DB_MULTIVERSION | + DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE)) != 0) + return (ret); + + /* + * Require a non-zero, power-of-two pagesize, smaller than the + * clear length. + */ + if (pagesize == 0 || !POWER_OF_TWO(pagesize)) { + __db_errx(env, + "DB_MPOOLFILE->open: page sizes must be a power-of-2"); + return (EINVAL); + } + if (dbmfp->clear_len > pagesize) { + __db_errx(env, + "DB_MPOOLFILE->open: clear length larger than page size"); + return (EINVAL); + } + + /* Read-only checks, and local flag. */ + if (LF_ISSET(DB_RDONLY) && path == NULL) { + __db_errx(env, + "DB_MPOOLFILE->open: temporary files can't be readonly"); + return (EINVAL); + } + + if (LF_ISSET(DB_MULTIVERSION) && !TXN_ON(env)) { + __db_errx(env, + "DB_MPOOLFILE->open: DB_MULTIVERSION requires transactions"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, + (__memp_fopen(dbmfp, NULL, + path, NULL, flags, mode, pagesize)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_fopen -- + * DB_MPOOLFILE->open. + * + * PUBLIC: int __memp_fopen __P((DB_MPOOLFILE *, MPOOLFILE *, + * PUBLIC: const char *, const char **, u_int32_t, int, size_t)); + */ +int +__memp_fopen(dbmfp, mfp, path, dirp, flags, mode, pgsize) + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; + const char *path; + const char **dirp; + u_int32_t flags; + int mode; + size_t pgsize; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + DB_MPOOLFILE *tmp_dbmfp; + DB_MPOOL_HASH *hp; + ENV *env; + MPOOL *mp; + MPOOLFILE *alloc_mfp; + size_t maxmap; + db_pgno_t last_pgno; + u_int32_t bucket, mbytes, bytes, oflags, pagesize; + int refinc, ret; + char *rpath; + + /* If this handle is already open, return. */ + if (F_ISSET(dbmfp, MP_OPEN_CALLED)) + return (0); + + env = dbmfp->env; + dbmp = env->mp_handle; + dbenv = env->dbenv; + mp = dbmp->reginfo[0].primary; + alloc_mfp = NULL; + mbytes = bytes = 0; + refinc = ret = 0; + rpath = NULL; + + /* + * We're keeping the page size as a size_t in the public API, but + * it's a u_int32_t everywhere internally. + */ + pagesize = (u_int32_t)pgsize; + + /* + * We're called internally with a specified mfp, in which case the + * path is NULL, but we'll get the path from the underlying region + * information. Otherwise, if the path is NULL, it's a temporary + * file -- we know we can't join any existing files, and we'll delay + * the open until we actually need to write the file. All temporary + * files will go into the first hash bucket. + */ + DB_ASSERT(env, mfp == NULL || path == NULL); + + bucket = 0; + hp = R_ADDR(dbmp->reginfo, mp->ftab); + if (mfp == NULL) { + if (path == NULL) + goto alloc; + + /* + * Hash to the proper file table entry and walk it. + * + * The fileID is a filesystem unique number (e.g., a + * UNIX dev/inode pair) plus a timestamp. If files are + * removed and created in less than a second, the fileID + * can be repeated. The problem with repetition happens + * when the file that previously had the fileID value still + * has pages in the pool, since we don't want to use them + * to satisfy requests for the new file. 
Because the + * DB_TRUNCATE flag reuses the dev/inode pair, repeated + * opens with that flag set guarantees matching fileIDs + * when the machine can open a file and then re-open + * with truncate within a second. For this reason, we + * pass that flag down, and, if we find a matching entry, + * we ensure that it's never found again, and we create + * a new entry for the current request. + */ + + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) + bucket = FNBUCKET(path, strlen(path)); + else + bucket = FNBUCKET(dbmfp->fileid, DB_FILE_ID_LEN); + hp += bucket; + + /* + * If we are passed a FILEID find the MPOOLFILE and inc + * its ref count. That way it cannot go away while we + * open it. + */ + if (F_ISSET(dbmfp, MP_FILEID_SET)) { + MUTEX_LOCK(env, hp->mtx_hash); + ret = + __memp_mpf_find(env, dbmfp, hp, path, flags,&mfp); + MUTEX_UNLOCK(env, hp->mtx_hash); + if (ret != 0) + goto err; + if (mfp != NULL) + refinc = 1; + } + } else { + /* + * Deadfile can only be set if mpf_cnt goes to zero (or if we + * failed creating the file DB_AM_DISCARD). Increment the ref + * count so the file cannot become dead and be unlinked. + */ + MUTEX_LOCK(env, mfp->mutex); + if (!mfp->deadfile) { + ++mfp->mpf_cnt; + refinc = 1; + } + MUTEX_UNLOCK(env, mfp->mutex); + + /* + * Test one last time to see if the file is dead -- it may have + * been removed. This happens when a checkpoint trying to open + * the file to flush a buffer races with the Db::remove method. + * The error will be ignored, so don't output an error message. + */ + if (mfp->deadfile) + return (EINVAL); + } + + /* + * If there's no backing file, we can join existing files in the cache, + * but there's nothing to read from disk. + */ + if (!FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { + /* Convert MP open flags to DB OS-layer open flags. */ + oflags = 0; + if (LF_ISSET(DB_CREATE)) + oflags |= DB_OSO_CREATE; + if (LF_ISSET(DB_DIRECT)) + oflags |= DB_OSO_DIRECT; + if (LF_ISSET(DB_RDONLY)) { + F_SET(dbmfp, MP_READONLY); + oflags |= DB_OSO_RDONLY; + } + + /* + * XXX + * A grievous layering violation, the DB_DSYNC_DB flag + * was left in the ENV structure and not driven through + * the cache API. This needs to be fixed when the general + * API configuration is fixed. + */ + if (F_ISSET(env->dbenv, DB_ENV_DSYNC_DB)) + oflags |= DB_OSO_DSYNC; + + /* + * Get the real name for this file and open it. + * + * Supply a page size so os_open can decide whether to + * turn buffering off if the DB_DIRECT_DB flag is set. + * + * Acquire the region lock if we're using a path from + * an underlying MPOOLFILE -- there's a race in accessing + * the path name stored in the region, __memp_nameop may + * be simultaneously renaming the file. + */ + if (mfp != NULL) { + MPOOL_SYSTEM_LOCK(env); + path = R_ADDR(dbmp->reginfo, mfp->path_off); + } + if ((ret = __db_appname(env, + DB_APP_DATA, path, dirp, &rpath)) == 0) + ret = __os_open(env, rpath, + (u_int32_t)pagesize, oflags, mode, &dbmfp->fhp); + if (mfp != NULL) + MPOOL_SYSTEM_UNLOCK(env); + if (ret != 0) + goto err; + + /* + * Cache file handles are shared, and have mutexes to + * protect the underlying file handle across seek and + * read/write calls. + */ + dbmfp->fhp->ref = 1; + if ((ret = __mutex_alloc(env, MTX_MPOOL_FH, + DB_MUTEX_PROCESS_ONLY, &dbmfp->fhp->mtx_fh)) != 0) + goto err; + + /* + * Figure out the file's size. + * + * !!! + * We can't use off_t's here, or in any code in the mainline + * library for that matter. 
(We have to use them in the + * os stubs, of course, as there are system calls that + * take them as arguments.) The reason is some customers + * build in environments where an off_t is 32-bits, but + * still run where offsets are 64-bits, and they pay us + * a lot of money. + */ + if ((ret = __os_ioinfo( + env, rpath, dbmfp->fhp, &mbytes, &bytes, NULL)) != 0) { + __db_err(env, ret, "%s", rpath); + goto err; + } + + /* + * Don't permit files that aren't a multiple of the pagesize, + * and find the number of the last page in the file, all the + * time being careful not to overflow 32 bits. + * + * During verify or recovery, we might have to cope with a + * truncated file; if the file size is not a multiple of the + * page size, round down to a page, we'll take care of the + * partial page outside the mpool system. + */ + DB_ASSERT(env, pagesize != 0); + if (bytes % pagesize != 0) { + if (LF_ISSET(DB_ODDFILESIZE)) + bytes -= (u_int32_t)(bytes % pagesize); + else { + __db_errx(env, + "%s: file size not a multiple of the pagesize", rpath); + ret = EINVAL; + goto err; + } + } + + /* + * Get the file id if we weren't given one. Generated file id's + * don't use timestamps, otherwise there'd be no chance of any + * other process joining the party. Don't bother looking for + * this id in the hash table, its new. + */ + if (mfp == NULL && !F_ISSET(dbmfp, MP_FILEID_SET)) { + if ((ret = + __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0) + goto err; + F_SET(dbmfp, MP_FILEID_SET); + goto alloc; + } + } + + if (mfp != NULL) + goto have_mfp; + + /* + * We can race with another process opening the same file when + * we allocate the mpoolfile structure. We will come back + * here and check the hash table again to see if it has appeared. + * For most files this is not a problem, since the name is locked + * at a higher layer but QUEUE extent files are not locked. + */ +check: MUTEX_LOCK(env, hp->mtx_hash); + if ((ret = __memp_mpf_find(env, dbmfp, hp, path, flags, &mfp) != 0)) + goto err; + + if (alloc_mfp != NULL && mfp == NULL) { + mfp = alloc_mfp; + alloc_mfp = NULL; + SH_TAILQ_INSERT_HEAD(&hp->hash_bucket, mfp, q, __mpoolfile); + } else if (mfp != NULL) { + /* + * Some things about a file cannot be changed: the clear length, + * page size, or LSN location. However, if this is an attempt + * to open a named in-memory file, we may not yet have that + * information. so accept uninitialized entries. + * + * The file type can change if the application's pre- and post- + * processing needs change. For example, an application that + * created a hash subdatabase in a database that was previously + * all btree. + * + * !!! + * We do not check to see if the pgcookie information changed, + * or update it if it is. + */ + if ((dbmfp->clear_len != DB_CLEARLEN_NOTSET && + mfp->clear_len != DB_CLEARLEN_NOTSET && + dbmfp->clear_len != mfp->clear_len) || + (pagesize != 0 && pagesize != mfp->stat.st_pagesize) || + (dbmfp->lsn_offset != DB_LSN_OFF_NOTSET && + mfp->lsn_off != DB_LSN_OFF_NOTSET && + dbmfp->lsn_offset != mfp->lsn_off)) { + __db_errx(env, + "%s: clear length, page size or LSN location changed", + path); + MUTEX_UNLOCK(env, hp->mtx_hash); + ret = EINVAL; + goto err; + } + } + + MUTEX_UNLOCK(env, hp->mtx_hash); + if (alloc_mfp != NULL) { + MUTEX_LOCK(env, alloc_mfp->mutex); + if ((ret = __memp_mf_discard(dbmp, alloc_mfp)) != 0) + goto err; + } + + if (mfp == NULL) { + /* + * If we didn't find the file and this is an in-memory file, + * then the create flag should be set. 
+ */ + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) && + !LF_ISSET(DB_CREATE)) { + ret = ENOENT; + goto err; + } + +alloc: /* + * Get the file ID if we weren't given one. Generated file + * ID's don't use timestamps, otherwise there'd be no + * chance of any other process joining the party. + */ + if (path != NULL && + !FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE) && + !F_ISSET(dbmfp, MP_FILEID_SET) && (ret = + __os_fileid(env, rpath, 0, dbmfp->fileid)) != 0) + goto err; + + if ((ret = __memp_mpf_alloc(dbmp, + dbmfp, path, pagesize, flags, &alloc_mfp)) != 0) + goto err; + + /* + * If the user specifies DB_MPOOL_LAST or DB_MPOOL_NEW on a + * page get, we have to increment the last page in the file. + * Figure it out and save it away. + * + * Note correction: page numbers are zero-based, not 1-based. + */ + DB_ASSERT(env, pagesize != 0); + last_pgno = (db_pgno_t)(mbytes * (MEGABYTE / pagesize)); + last_pgno += (db_pgno_t)(bytes / pagesize); + if (last_pgno != 0) + --last_pgno; + + alloc_mfp->last_flushed_pgno = alloc_mfp->orig_last_pgno = + alloc_mfp->last_pgno = last_pgno; + + alloc_mfp->bucket = bucket; + + /* Go back and see if someone else has opened the file. */ + if (path != NULL) + goto check; + + mfp = alloc_mfp; + /* This is a temp, noone else can see it, put it at the end. */ + MUTEX_LOCK(env, hp->mtx_hash); + SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, mfp, q); + MUTEX_UNLOCK(env, hp->mtx_hash); + } +have_mfp: + /* + * We need to verify that all handles open a file either durable or not + * durable. This needs to be cross process and cross sub-databases, so + * mpool is the place to do it. + */ + if (!LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) { + if (F_ISSET(mfp, MP_DURABLE_UNKNOWN)) { + if (LF_ISSET(DB_TXN_NOT_DURABLE)) + F_SET(mfp, MP_NOT_DURABLE); + F_CLR(mfp, MP_DURABLE_UNKNOWN); + } else if (!LF_ISSET(DB_TXN_NOT_DURABLE) != + !F_ISSET(mfp, MP_NOT_DURABLE)) { + __db_errx(env, + "Cannot open DURABLE and NOT DURABLE handles in the same file"); + ret = EINVAL; + goto err; + } + } + + if (LF_ISSET(DB_MULTIVERSION)) { + ++mfp->multiversion; + F_SET(dbmfp, MP_MULTIVERSION); + } + + /* + * All paths to here have initialized the mfp variable to reference + * the selected (or allocated) MPOOLFILE. + */ + dbmfp->mfp = mfp; + + /* + * Check to see if we can mmap the file. If a file: + * + isn't temporary + * + is read-only + * + doesn't require any pgin/pgout support + * + the DB_NOMMAP flag wasn't set (in either the file open or + * the environment in which it was opened) + * + and is less than mp_mmapsize bytes in size + * + * we can mmap it instead of reading/writing buffers. Don't do error + * checking based on the mmap call failure. We want to do normal I/O + * on the file if the reason we failed was because the file was on an + * NFS mounted partition, and we can fail in buffer I/O just as easily + * as here. + * + * We'd like to test to see if the file is too big to mmap. Since we + * don't know what size or type off_t's or size_t's are, or the largest + * unsigned integral type is, or what random insanity the local C + * compiler will perpetrate, doing the comparison in a portable way is + * flatly impossible. Hope that mmap fails if the file is too large. + */ +#define DB_MAXMMAPSIZE (10 * 1024 * 1024) /* 10 MB. */ + if (F_ISSET(mfp, MP_CAN_MMAP)) { + maxmap = dbenv->mp_mmapsize == 0 ? 
+ DB_MAXMMAPSIZE : dbenv->mp_mmapsize; + if (path == NULL || + FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) + F_CLR(mfp, MP_CAN_MMAP); + else if (!F_ISSET(dbmfp, MP_READONLY)) + F_CLR(mfp, MP_CAN_MMAP); + else if (dbmfp->ftype != 0) + F_CLR(mfp, MP_CAN_MMAP); + else if (LF_ISSET(DB_NOMMAP) || F_ISSET(dbenv, DB_ENV_NOMMAP)) + F_CLR(mfp, MP_CAN_MMAP); + else { + MPOOL_SYSTEM_LOCK(env); + maxmap = mp->mp_mmapsize == 0 ? + DB_MAXMMAPSIZE : mp->mp_mmapsize; + MPOOL_SYSTEM_UNLOCK(env); + if (mbytes > maxmap / MEGABYTE || + (mbytes == maxmap / MEGABYTE && + bytes >= maxmap % MEGABYTE)) + F_CLR(mfp, MP_CAN_MMAP); + } + + dbmfp->addr = NULL; + if (F_ISSET(mfp, MP_CAN_MMAP)) { + dbmfp->len = (size_t)mbytes * MEGABYTE + bytes; + if (__os_mapfile(env, rpath, + dbmfp->fhp, dbmfp->len, 1, &dbmfp->addr) != 0) { + dbmfp->addr = NULL; + F_CLR(mfp, MP_CAN_MMAP); + } + } + } + + F_SET(dbmfp, MP_OPEN_CALLED); + + /* + * Share the underlying file descriptor if that's possible. + * + * Add the file to the process' list of DB_MPOOLFILEs. + */ + MUTEX_LOCK(env, dbmp->mutex); + + if (dbmfp->fhp != NULL) + TAILQ_FOREACH(tmp_dbmfp, &dbmp->dbmfq, q) + if (dbmfp->mfp == tmp_dbmfp->mfp && + (F_ISSET(dbmfp, MP_READONLY) || + !F_ISSET(tmp_dbmfp, MP_READONLY))) { + (void)__mutex_free(env, &dbmfp->fhp->mtx_fh); + (void)__os_closehandle(env, dbmfp->fhp); + ++tmp_dbmfp->fhp->ref; + dbmfp->fhp = tmp_dbmfp->fhp; + break; + } + + TAILQ_INSERT_TAIL(&dbmp->dbmfq, dbmfp, q); + + MUTEX_UNLOCK(env, dbmp->mutex); + + if (0) { +err: if (refinc) { + /* + * If mpf_cnt goes to zero here and unlink_on_close is + * set, then we missed the last close, but there was an + * error trying to open the file, so we probably cannot + * unlink it anyway. + */ + MUTEX_LOCK(env, mfp->mutex); + --mfp->mpf_cnt; + MUTEX_UNLOCK(env, mfp->mutex); + } + + } + if (rpath != NULL) + __os_free(env, rpath); + return (ret); +} + +/* + * __memp_mpf_find -- + * Search a hash bucket for a MPOOLFILE. + */ +static int +__memp_mpf_find(env, dbmfp, hp, path, flags, mfpp) + ENV *env; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; + const char *path; + u_int32_t flags; + MPOOLFILE **mfpp; +{ + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + + dbmp = env->mp_handle; + + SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { + /* Skip dead files and temporary files. */ + if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) + continue; + + /* + * Any remaining DB_MPOOL_NOFILE databases are in-memory + * named databases and need only match other in-memory + * databases with the same name. + */ + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) { + if (!mfp->no_backing_file) + continue; + + if (strcmp(path, R_ADDR(dbmp->reginfo, mfp->path_off))) + continue; + + /* + * We matched an in-memory file; grab the fileid if + * it is set in the region, but not in the dbmfp. + */ + if (!F_ISSET(dbmfp, MP_FILEID_SET)) + (void)__memp_set_fileid(dbmfp, + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + } else + if (memcmp(dbmfp->fileid, R_ADDR(dbmp->reginfo, + mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + /* + * If the file is being truncated, remove it from the system + * and create a new entry. + * + * !!! + * We should be able to set mfp to NULL and break out of the + * loop, but I like the idea of checking all the entries. + */ + if (LF_ISSET(DB_TRUNCATE)) { + MUTEX_LOCK(env, mfp->mutex); + mfp->deadfile = 1; + MUTEX_UNLOCK(env, mfp->mutex); + continue; + } + + /* + * Check to see if this file has died while we waited. 
+ * + * We normally don't lock the deadfile field when we read it as + * we only care if the field is zero or non-zero. We do lock + * on read when searching for a matching MPOOLFILE so that two + * threads of control don't race between setting the deadfile + * bit and incrementing the reference count, that is, a thread + * of control decrementing the reference count and then setting + * deadfile because the reference count is 0 blocks us finding + * the file without knowing it's about to be marked dead. + */ + MUTEX_LOCK(env, mfp->mutex); + if (mfp->deadfile) { + MUTEX_UNLOCK(env, mfp->mutex); + continue; + } + ++mfp->mpf_cnt; + MUTEX_UNLOCK(env, mfp->mutex); + + /* Initialize any fields that are not yet set. */ + if (dbmfp->ftype != 0) + mfp->ftype = dbmfp->ftype; + if (dbmfp->clear_len != DB_CLEARLEN_NOTSET) + mfp->clear_len = dbmfp->clear_len; + if (dbmfp->lsn_offset != -1) + mfp->lsn_off = dbmfp->lsn_offset; + + break; + } + + *mfpp = mfp; + return (0); +} + +static int +__memp_mpf_alloc(dbmp, dbmfp, path, pagesize, flags, retmfp) + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + const char *path; + u_int32_t pagesize; + u_int32_t flags; + MPOOLFILE **retmfp; +{ + ENV *env; + MPOOLFILE *mfp; + int ret; + void *p; + + env = dbmp->env; + ret = 0; + /* Allocate and initialize a new MPOOLFILE. */ + if ((ret = __memp_alloc(dbmp, + dbmp->reginfo, NULL, sizeof(MPOOLFILE), NULL, &mfp)) != 0) + goto err; + memset(mfp, 0, sizeof(MPOOLFILE)); + mfp->mpf_cnt = 1; + mfp->ftype = dbmfp->ftype; + mfp->stat.st_pagesize = pagesize; + mfp->lsn_off = dbmfp->lsn_offset; + mfp->clear_len = dbmfp->clear_len; + mfp->priority = dbmfp->priority; + if (dbmfp->gbytes != 0 || dbmfp->bytes != 0) { + mfp->maxpgno = (db_pgno_t) + (dbmfp->gbytes * (GIGABYTE / mfp->stat.st_pagesize)); + mfp->maxpgno += (db_pgno_t) + ((dbmfp->bytes + mfp->stat.st_pagesize - 1) / + mfp->stat.st_pagesize); + } + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) + mfp->no_backing_file = 1; + if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_UNLINK)) + mfp->unlink_on_close = 1; + + if (LF_ISSET(DB_DURABLE_UNKNOWN | DB_RDONLY)) + F_SET(mfp, MP_DURABLE_UNKNOWN); + if (LF_ISSET(DB_DIRECT)) + F_SET(mfp, MP_DIRECT); + if (LF_ISSET(DB_EXTENT)) + F_SET(mfp, MP_EXTENT); + if (LF_ISSET(DB_TXN_NOT_DURABLE)) + F_SET(mfp, MP_NOT_DURABLE); + F_SET(mfp, MP_CAN_MMAP); + + /* + * An in-memory database with no name is a temp file. Named + * in-memory databases get an artificially bumped reference + * count so they don't disappear on close; they need a remove + * to make them disappear. + */ + if (path == NULL) + F_SET(mfp, MP_TEMP); + else if (FLD_ISSET(dbmfp->config_flags, DB_MPOOL_NOFILE)) + mfp->mpf_cnt++; + + /* Copy the file identification string into shared memory. */ + if (F_ISSET(dbmfp, MP_FILEID_SET)) { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, DB_FILE_ID_LEN, &mfp->fileid_off, &p)) != 0) + goto err; + memcpy(p, dbmfp->fileid, DB_FILE_ID_LEN); + } + + /* Copy the file path into shared memory. */ + if (path != NULL) { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, strlen(path) + 1, &mfp->path_off, &p)) != 0) + goto err; + memcpy(p, path, strlen(path) + 1); + } + + /* Copy the page cookie into shared memory. 
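+	 * The cookie is the opaque DBT the application handed to
+	 * DB_MPOOLFILE->set_pgcookie; the bytes are simply copied into the
+	 * region so the registered pgin/pgout callbacks can see them later.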
*/ + if (dbmfp->pgcookie == NULL || dbmfp->pgcookie->size == 0) { + mfp->pgcookie_len = 0; + mfp->pgcookie_off = 0; + } else { + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, dbmfp->pgcookie->size, + &mfp->pgcookie_off, &p)) != 0) + goto err; + memcpy(p, + dbmfp->pgcookie->data, dbmfp->pgcookie->size); + mfp->pgcookie_len = dbmfp->pgcookie->size; + } + + if ((ret = __mutex_alloc(env, + MTX_MPOOLFILE_HANDLE, 0, &mfp->mutex)) != 0) + goto err; + *retmfp = mfp; + +err: return (ret); +} + +/* + * memp_fclose_pp -- + * DB_MPOOLFILE->close pre/post processing. + * + * PUBLIC: int __memp_fclose_pp __P((DB_MPOOLFILE *, u_int32_t)); + */ +int +__memp_fclose_pp(dbmfp, flags) + DB_MPOOLFILE *dbmfp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbmfp->env; + + /* + * Validate arguments, but as a handle destructor, we can't fail. + */ + if (flags != 0) + (void)__db_ferr(env, "DB_MPOOLFILE->close", 0); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__memp_fclose(dbmfp, 0)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_fclose -- + * DB_MPOOLFILE->close. + * + * PUBLIC: int __memp_fclose __P((DB_MPOOLFILE *, u_int32_t)); + */ +int +__memp_fclose(dbmfp, flags) + DB_MPOOLFILE *dbmfp; + u_int32_t flags; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOLFILE *mfp; + char *rpath; + u_int32_t ref; + int deleted, ret, t_ret; + + env = dbmfp->env; + dbmp = env->mp_handle; + ret = 0; + + /* + * Remove the DB_MPOOLFILE from the process' list. + * + * It's possible the underlying mpool cache may never have been created. + * In that case, all we have is a structure, discard it. + * + * It's possible the DB_MPOOLFILE was never added to the DB_MPOOLFILE + * file list, check the MP_OPEN_CALLED flag to be sure. + */ + if (dbmp == NULL) + goto done; + + MUTEX_LOCK(env, dbmp->mutex); + + DB_ASSERT(env, dbmfp->ref >= 1); + if ((ref = --dbmfp->ref) == 0 && F_ISSET(dbmfp, MP_OPEN_CALLED)) + TAILQ_REMOVE(&dbmp->dbmfq, dbmfp, q); + + /* + * Decrement the file descriptor's ref count -- if we're the last ref, + * we'll discard the file descriptor. + */ + if (ref == 0 && dbmfp->fhp != NULL && --dbmfp->fhp->ref > 0) + dbmfp->fhp = NULL; + MUTEX_UNLOCK(env, dbmp->mutex); + if (ref != 0) + return (0); + + /* Complain if pinned blocks never returned. */ + if (dbmfp->pinref != 0) { + __db_errx(env, "%s: close: %lu blocks left pinned", + __memp_fn(dbmfp), (u_long)dbmfp->pinref); + ret = __env_panic(env, DB_RUNRECOVERY); + } + + /* Discard any mmap information. */ + if (dbmfp->addr != NULL && + (ret = __os_unmapfile(env, dbmfp->addr, dbmfp->len)) != 0) + __db_err(env, ret, "%s", __memp_fn(dbmfp)); + + /* + * Close the file and discard the descriptor structure; temporary + * files may not yet have been created. + */ + if (dbmfp->fhp != NULL) { + if ((t_ret = + __mutex_free(env, &dbmfp->fhp->mtx_fh)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __os_closehandle(env, dbmfp->fhp)) != 0) { + __db_err(env, t_ret, "%s", __memp_fn(dbmfp)); + if (ret == 0) + ret = t_ret; + } + dbmfp->fhp = NULL; + } + + /* + * Discard our reference on the underlying MPOOLFILE, and close it + * if it's no longer useful to anyone. It possible the open of the + * file never happened or wasn't successful, in which case, mpf will + * be NULL and MP_OPEN_CALLED will not be set. 
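+	 *
+	 * A worked example of the counting below: if two handles share one
+	 * MPOOLFILE, the first close simply drops mpf_cnt from 2 to 1; the
+	 * second close sees mpf_cnt reach 0 and, once no buffers remain
+	 * (block_cnt == 0), discards the shared MPOOLFILE via
+	 * __memp_mf_discard.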
+ */ + mfp = dbmfp->mfp; + DB_ASSERT(env, + (F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp != NULL) || + (!F_ISSET(dbmfp, MP_OPEN_CALLED) && mfp == NULL)); + if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) + goto done; + + /* + * If it's a temp file, all outstanding references belong to unflushed + * buffers. (A temp file can only be referenced by one DB_MPOOLFILE). + * We don't care about preserving any of those buffers, so mark the + * MPOOLFILE as dead so that even the dirty ones just get discarded + * when we try to flush them. + */ + deleted = 0; + if (!LF_ISSET(DB_MPOOL_NOLOCK)) + MUTEX_LOCK(env, mfp->mutex); + if (F_ISSET(dbmfp, MP_MULTIVERSION)) + --mfp->multiversion; + if (--mfp->mpf_cnt == 0 || LF_ISSET(DB_MPOOL_DISCARD)) { + if (LF_ISSET(DB_MPOOL_DISCARD) || + F_ISSET(mfp, MP_TEMP) || mfp->unlink_on_close) { + mfp->deadfile = 1; + } + if (mfp->unlink_on_close) { + if ((t_ret = __db_appname(dbmp->env, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), NULL, + &rpath)) != 0 && ret == 0) + ret = t_ret; + if (t_ret == 0) { + if ((t_ret = __os_unlink( + dbmp->env, rpath, 0)) != 0 && ret == 0) + ret = t_ret; + __os_free(env, rpath); + } + } + if (mfp->mpf_cnt == 0) { + F_CLR(mfp, MP_NOT_DURABLE); + F_SET(mfp, MP_DURABLE_UNKNOWN); + } + if (mfp->block_cnt == 0) { + /* + * We should never discard this mp file if our caller + * is holding the lock on it. See comment in + * __memp_sync_file. + */ + DB_ASSERT(env, !LF_ISSET(DB_MPOOL_NOLOCK)); + if ((t_ret = + __memp_mf_discard(dbmp, mfp)) != 0 && ret == 0) + ret = t_ret; + deleted = 1; + } + } + if (!deleted && !LF_ISSET(DB_MPOOL_NOLOCK)) + MUTEX_UNLOCK(env, mfp->mutex); + +done: /* Discard the DB_MPOOLFILE structure. */ + if (dbmfp->pgcookie != NULL) { + __os_free(env, dbmfp->pgcookie->data); + __os_free(env, dbmfp->pgcookie); + } + __os_free(env, dbmfp); + + return (ret); +} + +/* + * __memp_mf_discard -- + * Discard an MPOOLFILE. + * + * PUBLIC: int __memp_mf_discard __P((DB_MPOOL *, MPOOLFILE *)); + */ +int +__memp_mf_discard(dbmp, mfp) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; +{ + DB_MPOOL_HASH *hp; + ENV *env; +#ifdef HAVE_STATISTICS + DB_MPOOL_STAT *sp; +#endif + MPOOL *mp; + int need_sync, ret, t_ret; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + hp = R_ADDR(dbmp->reginfo, mp->ftab); + hp += mfp->bucket; + ret = 0; + + /* + * Expects caller to be holding the MPOOLFILE mutex. + * + * When discarding a file, we have to flush writes from it to disk. + * The scenario is that dirty buffers from this file need to be + * flushed to satisfy a future checkpoint, but when the checkpoint + * calls mpool sync, the sync code won't know anything about them. + * Ignore files not written, discarded, or only temporary. + */ + need_sync = + mfp->file_written && !mfp->deadfile && !F_ISSET(mfp, MP_TEMP); + + /* + * We have to release the MPOOLFILE mutex before acquiring the region + * mutex so we don't deadlock. Make sure nobody ever looks at this + * structure again. + */ + mfp->deadfile = 1; + + /* Discard the mutex we're holding and return it too the pool. */ + MUTEX_UNLOCK(env, mfp->mutex); + if ((t_ret = __mutex_free(env, &mfp->mutex)) != 0 && ret == 0) + ret = t_ret; + + /* Lock the bucket and delete from the list of MPOOLFILEs. */ + MUTEX_LOCK(env, hp->mtx_hash); + SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile); + MUTEX_UNLOCK(env, hp->mtx_hash); + + /* Lock the region and collect stats and free the space. 
*/ + MPOOL_SYSTEM_LOCK(env); + if (need_sync && + (t_ret = __memp_mf_sync(dbmp, mfp, 0)) != 0 && ret == 0) + ret = t_ret; + +#ifdef HAVE_STATISTICS + /* Copy the statistics into the region. */ + sp = &mp->stat; + sp->st_cache_hit += mfp->stat.st_cache_hit; + sp->st_cache_miss += mfp->stat.st_cache_miss; + sp->st_map += mfp->stat.st_map; + sp->st_page_create += mfp->stat.st_page_create; + sp->st_page_in += mfp->stat.st_page_in; + sp->st_page_out += mfp->stat.st_page_out; +#endif + + /* Free the space. */ + if (mfp->path_off != 0) + __memp_free(&dbmp->reginfo[0], + R_ADDR(dbmp->reginfo, mfp->path_off)); + if (mfp->fileid_off != 0) + __memp_free(&dbmp->reginfo[0], + R_ADDR(dbmp->reginfo, mfp->fileid_off)); + if (mfp->pgcookie_off != 0) + __memp_free(&dbmp->reginfo[0], + R_ADDR(dbmp->reginfo, mfp->pgcookie_off)); + __memp_free(&dbmp->reginfo[0], mfp); + + MPOOL_SYSTEM_UNLOCK(env); + + return (ret); +} + +/* + * __memp_inmemlist -- + * Return a list of the named in-memory databases. + * + * PUBLIC: int __memp_inmemlist __P((ENV *, char ***, int *)); + */ +int +__memp_inmemlist(env, namesp, cntp) + ENV *env; + char ***namesp; + int *cntp; +{ + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOL *mp; + MPOOLFILE *mfp; + int arraysz, cnt, i, ret; + char **names; + + names = NULL; + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + hp = R_ADDR(dbmp->reginfo, mp->ftab); + + arraysz = cnt = 0; + for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) { + MUTEX_LOCK(env, hp->mtx_hash); + SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { + /* Skip dead files and temporary files. */ + if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) + continue; + + /* Skip entries that allow files. */ + if (!mfp->no_backing_file) + continue; + + /* We found one. */ + if (cnt >= arraysz) { + arraysz += 100; + if ((ret = __os_realloc(env, + (u_int)arraysz * sizeof(names[0]), + &names)) != 0) + goto nomem; + } + if ((ret = __os_strdup(env, + R_ADDR(dbmp->reginfo, mfp->path_off), + &names[cnt])) != 0) + goto nomem; + + cnt++; + } + MUTEX_UNLOCK(env, hp->mtx_hash); + } + *namesp = names; + *cntp = cnt; + return (0); + +nomem: MUTEX_UNLOCK(env, hp->mtx_hash); + if (names != NULL) { + while (--cnt >= 0) + __os_free(env, names[cnt]); + __os_free(env, names); + } + + /* Make sure we don't return any garbage. */ + *cntp = 0; + *namesp = NULL; + return (ret); +} diff --git a/mp/mp_fput.c b/mp/mp_fput.c new file mode 100644 index 0000000..0f8598c --- /dev/null +++ b/mp/mp_fput.c @@ -0,0 +1,367 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" + +static int __memp_reset_lru __P((ENV *, REGINFO *)); + +/* + * __memp_fput_pp -- + * DB_MPOOLFILE->put pre/post processing. 
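+ *
+ * A sketch of the call from the application side, returning a page and
+ * hinting that it may be evicted early:
+ *
+ *	(void)mpf->put(mpf, page, DB_PRIORITY_VERY_LOW, 0);
+ *
+ * Passing DB_PRIORITY_UNCHANGED instead leaves the file's configured
+ * priority in force.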
+ * + * PUBLIC: int __memp_fput_pp + * PUBLIC: __P((DB_MPOOLFILE *, void *, DB_CACHE_PRIORITY, u_int32_t)); + */ +int +__memp_fput_pp(dbmfp, pgaddr, priority, flags) + DB_MPOOLFILE *dbmfp; + void *pgaddr; + DB_CACHE_PRIORITY priority; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret, t_ret; + + env = dbmfp->env; + + if (flags != 0) + return (__db_ferr(env, "DB_MPOOLFILE->put", 0)); + + MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->put"); + + ENV_ENTER(env, ip); + + ret = __memp_fput(dbmfp, ip, pgaddr, priority); + if (IS_ENV_REPLICATED(env) && + (t_ret = __op_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_fput -- + * DB_MPOOLFILE->put. + * + * PUBLIC: int __memp_fput __P((DB_MPOOLFILE *, + * PUBLIC: DB_THREAD_INFO *, void *, DB_CACHE_PRIORITY)); + */ +int +__memp_fput(dbmfp, ip, pgaddr, priority) + DB_MPOOLFILE *dbmfp; + DB_THREAD_INFO *ip; + void *pgaddr; + DB_CACHE_PRIORITY priority; +{ + BH *bhp; + DB_ENV *dbenv; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + ENV *env; + MPOOL *c_mp; + MPOOLFILE *mfp; + PIN_LIST *list, *lp; + REGINFO *infop, *reginfo; + roff_t b_ref; + int region; + int adjust, pfactor, ret, t_ret; + char buf[DB_THREADID_STRLEN]; + + env = dbmfp->env; + dbenv = env->dbenv; + dbmp = env->mp_handle; + mfp = dbmfp->mfp; + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + ret = 0; + + /* + * If this is marked dummy, we are using it to unpin a buffer for + * another thread. + */ + if (F_ISSET(dbmfp, MP_DUMMY)) + goto unpin; + + /* + * If we're mapping the file, there's nothing to do. Because we can + * stop mapping the file at any time, we have to check on each buffer + * to see if the address we gave the application was part of the map + * region. + */ + if (dbmfp->addr != NULL && pgaddr >= dbmfp->addr && + (u_int8_t *)pgaddr <= (u_int8_t *)dbmfp->addr + dbmfp->len) + return (0); + +#ifdef DIAGNOSTIC + /* + * Decrement the per-file pinned buffer count (mapped pages aren't + * counted). + */ + MPOOL_SYSTEM_LOCK(env); + if (dbmfp->pinref == 0) { + MPOOL_SYSTEM_UNLOCK(env); + __db_errx(env, + "%s: more pages returned than retrieved", __memp_fn(dbmfp)); + return (__env_panic(env, EACCES)); + } + --dbmfp->pinref; + MPOOL_SYSTEM_UNLOCK(env); +#endif + +unpin: + infop = &dbmp->reginfo[bhp->region]; + c_mp = infop->primary; + hp = R_ADDR(infop, c_mp->htab); + hp = &hp[bhp->bucket]; + + /* + * Check for a reference count going to zero. This can happen if the + * application returns a page twice. + */ + if (atomic_read(&bhp->ref) == 0) { + __db_errx(env, "%s: page %lu: unpinned page returned", + __memp_fn(dbmfp), (u_long)bhp->pgno); + DB_ASSERT(env, atomic_read(&bhp->ref) != 0); + return (__env_panic(env, EACCES)); + } + + /* Note the activity so allocation won't decide to quit. */ + ++c_mp->put_counter; + + if (ip != NULL) { + reginfo = env->reginfo; + list = R_ADDR(reginfo, ip->dbth_pinlist); + region = (int)(infop - dbmp->reginfo); + b_ref = R_OFFSET(infop, bhp); + for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) + if (lp->b_ref == b_ref && lp->region == region) + break; + + if (lp == &list[ip->dbth_pinmax]) { + __db_errx(env, + "__memp_fput: pinned buffer not found for thread %s", + dbenv->thread_id_string(dbenv, + ip->dbth_pid, ip->dbth_tid, buf)); + return (__env_panic(env, EINVAL)); + } + + lp->b_ref = INVALID_ROFF; + ip->dbth_pincount--; + } + + /* + * Mark the file dirty. 
+ */ + if (F_ISSET(bhp, BH_EXCLUSIVE) && F_ISSET(bhp, BH_DIRTY)) { + DB_ASSERT(env, atomic_read(&hp->hash_page_dirty) > 0); + mfp->file_written = 1; + } + + /* + * If more than one reference to the page we're done. Ignore the + * discard flags (for now) and leave the buffer's priority alone. + * We are doing this a little early as the remaining ref may or + * may not be a write behind. If it is we set the priority + * here, if not it will get set again later. We might race + * and miss setting the priority which would leave it wrong + * for a while. + */ + DB_ASSERT(env, atomic_read(&bhp->ref) != 0); + if (atomic_dec(env, &bhp->ref) > 1 || (atomic_read(&bhp->ref) == 1 && + !F_ISSET(bhp, BH_DIRTY))) { + /* + * __memp_pgwrite only has a shared lock while it clears + * the BH_DIRTY bit. If we only have a shared latch then + * we can't touch the flags bits. + */ + if (F_ISSET(bhp, BH_EXCLUSIVE)) + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + return (0); + } + + /* The buffer should not be accessed again. */ + if (BH_REFCOUNT(bhp) == 0) + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, 0); + + /* Update priority values. */ + if (priority == DB_PRIORITY_VERY_LOW || + mfp->priority == MPOOL_PRI_VERY_LOW) + bhp->priority = 0; + else { + /* + * We don't lock the LRU counter or the stat.st_pages field, if + * we get garbage (which won't happen on a 32-bit machine), it + * only means a buffer has the wrong priority. + */ + bhp->priority = c_mp->lru_count; + + switch (priority) { + default: + case DB_PRIORITY_UNCHANGED: + pfactor = mfp->priority; + break; + case DB_PRIORITY_VERY_LOW: + pfactor = MPOOL_PRI_VERY_LOW; + break; + case DB_PRIORITY_LOW: + pfactor = MPOOL_PRI_LOW; + break; + case DB_PRIORITY_DEFAULT: + pfactor = MPOOL_PRI_DEFAULT; + break; + case DB_PRIORITY_HIGH: + pfactor = MPOOL_PRI_HIGH; + break; + case DB_PRIORITY_VERY_HIGH: + pfactor = MPOOL_PRI_VERY_HIGH; + break; + } + + adjust = 0; + if (pfactor != 0) + adjust = (int)c_mp->stat.st_pages / pfactor; + + if (F_ISSET(bhp, BH_DIRTY)) + adjust += (int)c_mp->stat.st_pages / MPOOL_PRI_DIRTY; + + if (adjust > 0) { + if (UINT32_MAX - bhp->priority >= (u_int32_t)adjust) + bhp->priority += adjust; + } else if (adjust < 0) + if (bhp->priority > (u_int32_t)-adjust) + bhp->priority += adjust; + } + + /* + * __memp_pgwrite only has a shared lock while it clears the + * BH_DIRTY bit. If we only have a shared latch then we can't + * touch the flags bits. + */ + if (F_ISSET(bhp, BH_EXCLUSIVE)) + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + + /* + * On every buffer put we update the buffer generation number and check + * for wraparound. + */ + if (++c_mp->lru_count == UINT32_MAX) + if ((t_ret = + __memp_reset_lru(env, dbmp->reginfo)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __memp_reset_lru -- + * Reset the cache LRU counter. + */ +static int +__memp_reset_lru(env, infop) + ENV *env; + REGINFO *infop; +{ + BH *bhp, *tbhp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + u_int32_t bucket, priority; + + c_mp = infop->primary; + /* + * Update the counter so all future allocations will start at the + * bottom. + */ + c_mp->lru_count -= MPOOL_BASE_DECREMENT; + + /* Adjust the priority of every buffer in the system. */ + for (hp = R_ADDR(infop, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. 
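+	 *
+	 * The adjustment applied below is a uniform shift: every buffer
+	 * priority above MPOOL_BASE_DECREMENT drops by exactly that
+	 * amount, so buffers keep their relative order while the counter
+	 * regains headroom.  Buffers marked UINT32_MAX, and those already
+	 * at or below MPOOL_BASE_DECREMENT, are left alone.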
+ */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) { + c_mp->lru_reset++; + continue; + } + + MUTEX_LOCK(env, hp->mtx_hash); + c_mp->lru_reset++; + /* + * We need to take a little care that the bucket does + * not become unsorted. This is highly unlikely but + * possible. + */ + priority = 0; + SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { + for (tbhp = bhp; tbhp != NULL; + tbhp = SH_CHAIN_PREV(tbhp, vc, __bh)) { + if (tbhp->priority != UINT32_MAX && + tbhp->priority > MPOOL_BASE_DECREMENT) { + tbhp->priority -= MPOOL_BASE_DECREMENT; + if (tbhp->priority < priority) + tbhp->priority = priority; + } + } + priority = bhp->priority; + } + MUTEX_UNLOCK(env, hp->mtx_hash); + } + c_mp->lru_reset = 0; + + COMPQUIET(env, NULL); + return (0); +} + +/* + * __memp_unpin_buffers -- + * Unpin buffers pinned by a thread. + * + * PUBLIC: int __memp_unpin_buffers __P((ENV *, DB_THREAD_INFO *)); + */ +int +__memp_unpin_buffers(env, ip) + ENV *env; + DB_THREAD_INFO *ip; +{ + BH *bhp; + DB_MPOOL *dbmp; + DB_MPOOLFILE dbmf; + PIN_LIST *list, *lp; + REGINFO *rinfop, *reginfo; + int ret; + + memset(&dbmf, 0, sizeof(dbmf)); + dbmf.env = env; + dbmf.flags = MP_DUMMY; + dbmp = env->mp_handle; + reginfo = env->reginfo; + + list = R_ADDR(reginfo, ip->dbth_pinlist); + for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) { + if (lp->b_ref == INVALID_ROFF) + continue; + rinfop = &dbmp->reginfo[lp->region]; + bhp = R_ADDR(rinfop, lp->b_ref); + dbmf.mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((ret = __memp_fput(&dbmf, ip, + (u_int8_t *)bhp + SSZA(BH, buf), + DB_PRIORITY_UNCHANGED)) != 0) + return (ret); + } + return (0); +} diff --git a/mp/mp_fset.c b/mp/mp_fset.c new file mode 100644 index 0000000..2869fc8 --- /dev/null +++ b/mp/mp_fset.c @@ -0,0 +1,165 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +/* + * __memp_dirty -- + * Upgrade a page from a read-only to a writeable pointer. + * + * PUBLIC: int __memp_dirty __P((DB_MPOOLFILE *, void *, + * PUBLIC: DB_THREAD_INFO *, DB_TXN *, DB_CACHE_PRIORITY, u_int32_t)); + */ +int +__memp_dirty(dbmfp, addrp, ip, txn, priority, flags) + DB_MPOOLFILE *dbmfp; + void *addrp; + DB_THREAD_INFO *ip; + DB_TXN *txn; + DB_CACHE_PRIORITY priority; + u_int32_t flags; +{ + BH *bhp; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + DB_TXN *ancestor; + ENV *env; + MPOOL *c_mp; +#ifdef DIAG_MVCC + MPOOLFILE *mfp; +#endif + REGINFO *infop; + int mvcc, ret; + db_pgno_t pgno; + void *pgaddr; + + env = dbmfp->env; + dbmp = env->mp_handle; + mvcc = dbmfp->mfp->multiversion; + + /* Convert the page address to a buffer header. */ + pgaddr = *(void **)addrp; + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + pgno = bhp->pgno; + + /* If we have it exclusively then its already dirty. 
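+	 * Otherwise, for a multiversion file we may first have to trade this
+	 * buffer for a writeable copy via __memp_fget, and in every case the
+	 * shared latch is upgraded to exclusive before the dirty bit is set.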
*/ + if (F_ISSET(bhp, BH_EXCLUSIVE)) { + DB_ASSERT(env, F_ISSET(bhp, BH_DIRTY)); + return (0); + } + + if (flags == 0) + flags = DB_MPOOL_DIRTY; + DB_ASSERT(env, flags == DB_MPOOL_DIRTY || flags == DB_MPOOL_EDIT); + + if (F_ISSET(dbmfp, MP_READONLY)) { + __db_errx(env, "%s: dirty flag set for readonly file page", + __memp_fn(dbmfp)); + return (EACCES); + } + + for (ancestor = txn; + ancestor != NULL && ancestor->parent != NULL; + ancestor = ancestor->parent) + ; + + if (mvcc && txn != NULL && flags == DB_MPOOL_DIRTY && + (!BH_OWNED_BY(env, bhp, ancestor) || SH_CHAIN_HASNEXT(bhp, vc))) { + atomic_inc(env, &bhp->ref); + *(void **)addrp = NULL; + if ((ret = __memp_fput(dbmfp, ip, pgaddr, priority)) != 0) { + __db_errx(env, + "%s: error releasing a read-only page", + __memp_fn(dbmfp)); + atomic_dec(env, &bhp->ref); + return (ret); + } + if ((ret = __memp_fget(dbmfp, + &pgno, ip, txn, flags, addrp)) != 0) { + if (ret != DB_LOCK_DEADLOCK) + __db_errx(env, + "%s: error getting a page for writing", + __memp_fn(dbmfp)); + atomic_dec(env, &bhp->ref); + return (ret); + } + atomic_dec(env, &bhp->ref); + + DB_ASSERT(env, + flags == DB_MPOOL_DIRTY && *(void **)addrp != pgaddr); + + pgaddr = *(void **)addrp; + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + DB_ASSERT(env, pgno == bhp->pgno); + return (0); + } + + infop = &dbmp->reginfo[bhp->region]; + c_mp = infop->primary; + hp = R_ADDR(infop, c_mp->htab); + hp = &hp[bhp->bucket]; + + /* Drop the shared latch and get an exclusive. We have the buf ref'ed.*/ + MUTEX_UNLOCK(env, bhp->mtx_buf); + MUTEX_LOCK(env, bhp->mtx_buf); + DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE)); + F_SET(bhp, BH_EXCLUSIVE); + + /* Set/clear the page bits. */ + if (!F_ISSET(bhp, BH_DIRTY)) { +#ifdef DIAGNOSTIC + MUTEX_LOCK(env, hp->mtx_hash); +#endif + atomic_inc(env, &hp->hash_page_dirty); + F_SET(bhp, BH_DIRTY); +#ifdef DIAGNOSTIC + MUTEX_UNLOCK(env, hp->mtx_hash); +#endif + } + +#ifdef DIAG_MVCC + mfp = R_ADDR(env->mp_handle->reginfo, bhp->mf_offset); + MVCC_MPROTECT(bhp->buf, mfp->stat.st_pagesize, PROT_READ | PROT_WRITE); +#endif + DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) || + atomic_read(&hp->hash_page_dirty) != 0); + return (0); +} + +/* + * __memp_shared -- + * Downgrade a page from exlusively held to shared. + * + * PUBLIC: int __memp_shared __P((DB_MPOOLFILE *, void *)); + */ +int +__memp_shared(dbmfp, pgaddr) + DB_MPOOLFILE *dbmfp; + void *pgaddr; +{ + BH *bhp; + ENV *env; + + env = dbmfp->env; + /* Convert the page address to a buffer header. */ + bhp = (BH *)((u_int8_t *)pgaddr - SSZA(BH, buf)); + + if (F_ISSET(bhp, BH_DIRTY)) + dbmfp->mfp->file_written = 1; + DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE)); + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + MUTEX_READLOCK(env, bhp->mtx_buf); + + return (0); +} diff --git a/mp/mp_method.c b/mp/mp_method.c new file mode 100644 index 0000000..c51a404 --- /dev/null +++ b/mp/mp_method.c @@ -0,0 +1,992 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/mp.h" +#include "dbinc/db_page.h" +#include "dbinc/hash.h" + +/* + * __memp_env_create -- + * Mpool specific creation of the DB_ENV structure. + * + * PUBLIC: int __memp_env_create __P((DB_ENV *)); + */ +int +__memp_env_create(dbenv) + DB_ENV *dbenv; +{ + /* + * !!! 
+ * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. + * + * We default to 32 8K pages. We don't default to a flat 256K, because + * some systems require significantly more memory to hold 32 pages than + * others. For example, HP-UX with POSIX pthreads needs 88 bytes for + * a POSIX pthread mutex and almost 200 bytes per buffer header, while + * Solaris needs 24 and 52 bytes for the same structures. The minimum + * number of hash buckets is 37. These contain a mutex also. + */ + dbenv->mp_bytes = dbenv->mp_max_bytes = + 32 * ((8 * 1024) + sizeof(BH)) + 37 * sizeof(DB_MPOOL_HASH); + dbenv->mp_ncache = 1; + + return (0); +} + +/* + * __memp_env_destroy -- + * Mpool specific destruction of the DB_ENV structure. + * + * PUBLIC: void __memp_env_destroy __P((DB_ENV *)); + */ +void +__memp_env_destroy(dbenv) + DB_ENV *dbenv; +{ + COMPQUIET(dbenv, NULL); +} + +/* + * __memp_get_cachesize -- + * {DB_ENV,DB}->get_cachesize. + * + * PUBLIC: int __memp_get_cachesize + * PUBLIC: __P((DB_ENV *, u_int32_t *, u_int32_t *, int *)); + */ +int +__memp_get_cachesize(dbenv, gbytesp, bytesp, ncachep) + DB_ENV *dbenv; + u_int32_t *gbytesp, *bytesp; + int *ncachep; +{ + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_cachesize", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + /* Cannot be set after open, no lock required to read. */ + mp = env->mp_handle->reginfo[0].primary; + if (gbytesp != NULL) + *gbytesp = mp->stat.st_gbytes; + if (bytesp != NULL) + *bytesp = mp->stat.st_bytes; + if (ncachep != NULL) + *ncachep = (int)mp->nreg; + } else { + if (gbytesp != NULL) + *gbytesp = dbenv->mp_gbytes; + if (bytesp != NULL) + *bytesp = dbenv->mp_bytes; + if (ncachep != NULL) + *ncachep = (int)dbenv->mp_ncache; + } + return (0); +} + +/* + * __memp_set_cachesize -- + * {DB_ENV,DB}->set_cachesize. + * + * PUBLIC: int __memp_set_cachesize __P((DB_ENV *, u_int32_t, u_int32_t, int)); + */ +int +__memp_set_cachesize(dbenv, gbytes, bytes, arg_ncache) + DB_ENV *dbenv; + u_int32_t gbytes, bytes; + int arg_ncache; +{ + ENV *env; + u_int ncache; + + env = dbenv->env; + + /* Normalize the cache count. */ + ncache = arg_ncache <= 0 ? 1 : (u_int)arg_ncache; + + /* + * You can only store 4GB-1 in an unsigned 32-bit value, so correct for + * applications that specify 4GB cache sizes -- we know what they meant. + */ + if (sizeof(roff_t) == 4 && gbytes / ncache == 4 && bytes == 0) { + --gbytes; + bytes = GIGABYTE - 1; + } else { + gbytes += bytes / GIGABYTE; + bytes %= GIGABYTE; + } + + /* + * !!! + * With 32-bit region offsets, individual cache regions must be smaller + * than 4GB. Also, cache sizes larger than 10TB would cause 32-bit + * wrapping in the calculation of the number of hash buckets. See + * __memp_open for details. + */ + if (!F_ISSET(env, ENV_OPEN_CALLED)) { + if (sizeof(roff_t) <= 4 && gbytes / ncache >= 4) { + __db_errx(env, + "individual cache size too large: maximum is 4GB"); + return (EINVAL); + } + if (gbytes / ncache > 10000) { + __db_errx(env, + "individual cache size too large: maximum is 10TB"); + return (EINVAL); + } + } + + /* + * If the application requested less than 500Mb, increase the cachesize + * by 25% and factor in the size of the hash buckets to account for our + * overhead. 
(I'm guessing caches over 500Mb are specifically sized, + * that is, it's a large server and the application actually knows how + * much memory is available. We only document the 25% overhead number, + * not the hash buckets, but I don't see a reason to confuse the issue, + * it shouldn't matter to an application.) + * + * There is a minimum cache size, regardless. + */ + if (gbytes == 0) { + if (bytes < 500 * MEGABYTE) + bytes += (bytes / 4) + 37 * sizeof(DB_MPOOL_HASH); + if (bytes / ncache < DB_CACHESIZE_MIN) + bytes = ncache * DB_CACHESIZE_MIN; + } + + if (F_ISSET(env, ENV_OPEN_CALLED)) + return (__memp_resize(env->mp_handle, gbytes, bytes)); + + dbenv->mp_gbytes = gbytes; + dbenv->mp_bytes = bytes; + dbenv->mp_ncache = ncache; + + return (0); +} + +/* + * __memp_set_config -- + * Set the cache subsystem configuration. + * + * PUBLIC: int __memp_set_config __P((DB_ENV *, u_int32_t, int)); + */ +int +__memp_set_config(dbenv, which, on) + DB_ENV *dbenv; + u_int32_t which; + int on; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->memp_set_config", DB_INIT_MPOOL); + + switch (which) { + case DB_MEMP_SUPPRESS_WRITE: + case DB_MEMP_SYNC_INTERRUPT: + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + if (on) + FLD_SET(mp->config_flags, which); + else + FLD_CLR(mp->config_flags, which); + } + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * __memp_get_config -- + * Return the cache subsystem configuration. + * + * PUBLIC: int __memp_get_config __P((DB_ENV *, u_int32_t, int *)); + */ +int +__memp_get_config(dbenv, which, onp) + DB_ENV *dbenv; + u_int32_t which; + int *onp; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->mp_handle, "DB_ENV->memp_get_config", DB_INIT_MPOOL); + + switch (which) { + case DB_MEMP_SUPPRESS_WRITE: + case DB_MEMP_SYNC_INTERRUPT: + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + *onp = FLD_ISSET(mp->config_flags, which) ? 1 : 0; + } else + *onp = 0; + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * PUBLIC: int __memp_get_mp_max_openfd __P((DB_ENV *, int *)); + */ +int +__memp_get_mp_max_openfd(dbenv, maxopenfdp) + DB_ENV *dbenv; + int *maxopenfdp; +{ + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_openfd", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MPOOL_SYSTEM_LOCK(env); + *maxopenfdp = mp->mp_maxopenfd; + MPOOL_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else + *maxopenfdp = dbenv->mp_maxopenfd; + return (0); +} + +/* + * __memp_set_mp_max_openfd -- + * Set the maximum number of open fd's when flushing the cache. 
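+ *
+ *	The limit only applies to file descriptors mpool opens internally
+ *	while flushing the cache.  Because the value is stored in the shared
+ *	region, it takes effect immediately for a running environment.
+ *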
+ * PUBLIC: int __memp_set_mp_max_openfd __P((DB_ENV *, int)); + */ +int +__memp_set_mp_max_openfd(dbenv, maxopenfd) + DB_ENV *dbenv; + int maxopenfd; +{ + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->set_mp_max_openfd", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MPOOL_SYSTEM_LOCK(env); + mp->mp_maxopenfd = maxopenfd; + MPOOL_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else + dbenv->mp_maxopenfd = maxopenfd; + return (0); +} + +/* + * PUBLIC: int __memp_get_mp_max_write __P((DB_ENV *, int *, db_timeout_t *)); + */ +int +__memp_get_mp_max_write(dbenv, maxwritep, maxwrite_sleepp) + DB_ENV *dbenv; + int *maxwritep; + db_timeout_t *maxwrite_sleepp; +{ + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MPOOL_SYSTEM_LOCK(env); + *maxwritep = mp->mp_maxwrite; + *maxwrite_sleepp = mp->mp_maxwrite_sleep; + MPOOL_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + *maxwritep = dbenv->mp_maxwrite; + *maxwrite_sleepp = dbenv->mp_maxwrite_sleep; + } + return (0); +} + +/* + * __memp_set_mp_max_write -- + * Set the maximum continuous I/O count. + * + * PUBLIC: int __memp_set_mp_max_write __P((DB_ENV *, int, db_timeout_t)); + */ +int +__memp_set_mp_max_write(dbenv, maxwrite, maxwrite_sleep) + DB_ENV *dbenv; + int maxwrite; + db_timeout_t maxwrite_sleep; +{ + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_write", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MPOOL_SYSTEM_LOCK(env); + mp->mp_maxwrite = maxwrite; + mp->mp_maxwrite_sleep = maxwrite_sleep; + MPOOL_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else { + dbenv->mp_maxwrite = maxwrite; + dbenv->mp_maxwrite_sleep = maxwrite_sleep; + } + return (0); +} + +/* + * PUBLIC: int __memp_get_mp_mmapsize __P((DB_ENV *, size_t *)); + */ +int +__memp_get_mp_mmapsize(dbenv, mp_mmapsizep) + DB_ENV *dbenv; + size_t *mp_mmapsizep; +{ + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MPOOL_SYSTEM_LOCK(env); + *mp_mmapsizep = mp->mp_mmapsize; + MPOOL_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else + *mp_mmapsizep = dbenv->mp_mmapsize; + return (0); +} + +/* + * __memp_set_mp_mmapsize -- + * DB_ENV->set_mp_mmapsize. 
+ * + * PUBLIC: int __memp_set_mp_mmapsize __P((DB_ENV *, size_t)); + */ +int +__memp_set_mp_mmapsize(dbenv, mp_mmapsize) + DB_ENV *dbenv; + size_t mp_mmapsize; +{ + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + ENV *env; + MPOOL *mp; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->set_mp_max_mmapsize", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ENV_ENTER(env, ip); + MPOOL_SYSTEM_LOCK(env); + mp->mp_mmapsize = mp_mmapsize; + MPOOL_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } else + dbenv->mp_mmapsize = mp_mmapsize; + return (0); +} + +/* + * PUBLIC: int __memp_get_mp_pagesize __P((DB_ENV *, u_int32_t *)); + */ +int +__memp_get_mp_pagesize(dbenv, mp_pagesizep) + DB_ENV *dbenv; + u_int32_t *mp_pagesizep; +{ + ENV *env; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_pagesize", DB_INIT_MPOOL); + + *mp_pagesizep = dbenv->mp_pagesize; + return (0); +} + +/* + * __memp_set_mp_pagesize -- + * DB_ENV->set_mp_pagesize. + * + * PUBLIC: int __memp_set_mp_pagesize __P((DB_ENV *, u_int32_t)); + */ +int +__memp_set_mp_pagesize(dbenv, mp_pagesize) + DB_ENV *dbenv; + u_int32_t mp_pagesize; +{ + ENV *env; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_pagesize"); + + dbenv->mp_pagesize = mp_pagesize; + return (0); +} + +/* + * PUBLIC: int __memp_get_mp_tablesize __P((DB_ENV *, u_int32_t *)); + */ +int +__memp_get_mp_tablesize(dbenv, mp_tablesizep) + DB_ENV *dbenv; + u_int32_t *mp_tablesizep; +{ + ENV *env; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_tablesize", DB_INIT_MPOOL); + + *mp_tablesizep = dbenv->mp_tablesize; + return (0); +} + +/* + * __memp_set_mp_tablesize -- + * DB_ENV->set_mp_tablesize. + * + * PUBLIC: int __memp_set_mp_tablesize __P((DB_ENV *, u_int32_t)); + */ +int +__memp_set_mp_tablesize(dbenv, mp_tablesize) + DB_ENV *dbenv; + u_int32_t mp_tablesize; +{ + ENV *env; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_mmapsize", DB_INIT_MPOOL); + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_mp_tablesize"); + + dbenv->mp_tablesize = mp_tablesize; + return (0); +} + +/* + * __memp_nameop + * Remove or rename a file in the pool. + * + * PUBLIC: int __memp_nameop __P((ENV *, + * PUBLIC: u_int8_t *, const char *, const char *, const char *, int)); + * + * XXX + * Undocumented interface: DB private. + */ +int +__memp_nameop(env, fileid, newname, fullold, fullnew, inmem) + ENV *env; + u_int8_t *fileid; + const char *newname, *fullold, *fullnew; + int inmem; +{ + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp, *nhp; + MPOOL *mp; + MPOOLFILE *mfp; + roff_t newname_off; + u_int32_t bucket; + int locked, ret; + size_t nlen; + void *p; + +#undef op_is_remove +#define op_is_remove (newname == NULL) + + COMPQUIET(bucket, 0); + COMPQUIET(hp, NULL); + COMPQUIET(newname_off, 0); + COMPQUIET(nlen, 0); + + dbmp = NULL; + mfp = NULL; + nhp = NULL; + p = NULL; + locked = ret = 0; + + if (!MPOOL_ON(env)) + goto fsop; + + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + hp = R_ADDR(dbmp->reginfo, mp->ftab); + + if (!op_is_remove) { + nlen = strlen(newname); + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, nlen + 1, &newname_off, &p)) != 0) + return (ret); + memcpy(p, newname, nlen + 1); + } + + /* + * Remove or rename a file that the mpool might know about. 
We assume + * that the fop layer has the file locked for exclusive access, so we + * don't worry about locking except for the mpool mutexes. Checkpoint + * can happen at any time, independent of file locking, so we have to + * do the actual unlink or rename system call while holding + * all affected buckets locked. + * + * If this is a rename and this is a memory file then we need + * to make sure that the new name does not exist. Since we + * are locking two buckets lock them in ascending order. + */ + if (inmem) { + DB_ASSERT(env, fullold != NULL); + hp += FNBUCKET(fullold, strlen(fullold)); + if (!op_is_remove) { + bucket = FNBUCKET(newname, nlen); + nhp = R_ADDR(dbmp->reginfo, mp->ftab); + nhp += bucket; + } + } else + hp += FNBUCKET(fileid, DB_FILE_ID_LEN); + + if (nhp != NULL && nhp < hp) + MUTEX_LOCK(env, nhp->mtx_hash); + MUTEX_LOCK(env, hp->mtx_hash); + if (nhp != NULL && nhp > hp) + MUTEX_LOCK(env, nhp->mtx_hash); + locked = 1; + + if (!op_is_remove && inmem) { + SH_TAILQ_FOREACH(mfp, &nhp->hash_bucket, q, __mpoolfile) + if (!mfp->deadfile && + mfp->no_backing_file && strcmp(newname, + R_ADDR(dbmp->reginfo, mfp->path_off)) == 0) + break; + if (mfp != NULL) { + ret = EEXIST; + goto err; + } + } + + /* + * Find the file -- if mpool doesn't know about this file, that may + * not be an error. + */ + SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { + /* Ignore non-active files. */ + if (mfp->deadfile || F_ISSET(mfp, MP_TEMP)) + continue; + + /* Try to match on fileid. */ + if (memcmp(fileid, R_ADDR( + dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN) != 0) + continue; + + break; + } + + if (mfp == NULL) { + if (inmem) { + ret = ENOENT; + goto err; + } + goto fsop; + } + + if (op_is_remove) { + MUTEX_LOCK(env, mfp->mutex); + /* + * In-memory dbs have an artificially incremented ref count so + * they do not get reclaimed as long as they exist. Since we + * are now deleting the database, we need to dec that count. + */ + if (mfp->no_backing_file) + mfp->mpf_cnt--; + mfp->deadfile = 1; + MUTEX_UNLOCK(env, mfp->mutex); + } else { + /* + * Else, it's a rename. We've allocated memory for the new + * name. Swap it with the old one. If it's in memory we + * need to move it the right bucket. + */ + p = R_ADDR(dbmp->reginfo, mfp->path_off); + mfp->path_off = newname_off; + + if (inmem && hp != nhp) { + DB_ASSERT(env, nhp != NULL); + SH_TAILQ_REMOVE(&hp->hash_bucket, mfp, q, __mpoolfile); + mfp->bucket = bucket; + SH_TAILQ_INSERT_TAIL(&nhp->hash_bucket, mfp, q); + } + } + +fsop: /* + * If this is a real file, then mfp could be NULL, because + * mpool isn't turned on, and we still need to do the file ops. + */ + if (mfp == NULL || !mfp->no_backing_file) { + if (op_is_remove) { + /* + * !!! + * Replication may ask us to unlink a file that's been + * renamed. Don't complain if it doesn't exist. + */ + if ((ret = __os_unlink(env, fullold, 0)) == ENOENT) + ret = 0; + } else { + /* + * Defensive only, fullnew should never be + * NULL. + */ + DB_ASSERT(env, fullnew != NULL); + if (fullnew == NULL) { + ret = EINVAL; + goto err; + } + ret = __os_rename(env, fullold, fullnew, 1); + } + } + + /* Delete the memory we no longer need. */ +err: if (p != NULL) { + MPOOL_REGION_LOCK(env, &dbmp->reginfo[0]); + __memp_free(&dbmp->reginfo[0], p); + MPOOL_REGION_UNLOCK(env, &dbmp->reginfo[0]); + } + + /* If we have buckets locked, unlock them when done moving files. 
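+	 * hp is always locked at this point; nhp is locked as well only for
+	 * an in-memory rename whose new name hashes to a different bucket.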
*/ + if (locked == 1) { + MUTEX_UNLOCK(env, hp->mtx_hash); + if (nhp != NULL && nhp != hp) + MUTEX_UNLOCK(env, nhp->mtx_hash); + } + return (ret); +} + +/* + * __memp_ftruncate __ + * Truncate the file. + * + * PUBLIC: int __memp_ftruncate __P((DB_MPOOLFILE *, DB_TXN *, + * PUBLIC: DB_THREAD_INFO *, db_pgno_t, u_int32_t)); + */ +int +__memp_ftruncate(dbmfp, txn, ip, pgno, flags) + DB_MPOOLFILE *dbmfp; + DB_TXN *txn; + DB_THREAD_INFO *ip; + db_pgno_t pgno; + u_int32_t flags; +{ + ENV *env; + MPOOLFILE *mfp; + void *pagep; + db_pgno_t last_pgno, pg; + int ret; + + env = dbmfp->env; + mfp = dbmfp->mfp; + ret = 0; + + MUTEX_LOCK(env, mfp->mutex); + last_pgno = mfp->last_pgno; + MUTEX_UNLOCK(env, mfp->mutex); + + if (pgno > last_pgno) { + if (LF_ISSET(MP_TRUNC_RECOVER)) + return (0); + __db_errx(env, "Truncate beyond the end of file"); + return (EINVAL); + } + + pg = pgno; + do { + if (mfp->block_cnt == 0) + break; + if ((ret = __memp_fget(dbmfp, &pg, + ip, txn, DB_MPOOL_FREE, &pagep)) != 0) + return (ret); + } while (pg++ < last_pgno); + + /* + * If we are aborting an extend of a file, the call to __os_truncate + * could extend the file if the new page(s) had not yet been + * written to disk. We do not want to extend the file to pages + * whose log records are not yet flushed [#14031]. In addition if + * we are out of disk space we can generate an error [#12743]. + */ + MUTEX_LOCK(env, mfp->mutex); + if (!F_ISSET(mfp, MP_TEMP) && + !mfp->no_backing_file && pgno <= mfp->last_flushed_pgno) +#ifdef HAVE_FTRUNCATE + ret = __os_truncate(env, + dbmfp->fhp, pgno, mfp->stat.st_pagesize); +#else + ret = __db_zero_extend(env, + dbmfp->fhp, pgno, mfp->last_pgno, mfp->stat.st_pagesize); +#endif + + /* + * This set could race with another thread of control that extending + * the file. It's not a problem because we should have the page + * locked at a higher level of the system. + */ + if (ret == 0) { + mfp->last_pgno = pgno - 1; + if (mfp->last_flushed_pgno > mfp->last_pgno) + mfp->last_flushed_pgno = mfp->last_pgno; + } + MUTEX_UNLOCK(env, mfp->mutex); + + return (ret); +} + +#ifdef HAVE_FTRUNCATE +/* + * Support routines for maintaining a sorted freelist while we try to rearrange + * and truncate the file. + */ + +/* + * __memp_alloc_freelist -- + * Allocate mpool space for the freelist. + * + * PUBLIC: int __memp_alloc_freelist __P((DB_MPOOLFILE *, + * PUBLIC: u_int32_t, db_pgno_t **)); + */ +int +__memp_alloc_freelist(dbmfp, nelems, listp) + DB_MPOOLFILE *dbmfp; + u_int32_t nelems; + db_pgno_t **listp; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOLFILE *mfp; + void *retp; + int ret; + + env = dbmfp->env; + dbmp = env->mp_handle; + mfp = dbmfp->mfp; + + *listp = NULL; + + /* + * These fields are protected because the database layer + * has the metapage locked while manipulating them. + */ + mfp->free_ref++; + if (mfp->free_size != 0) + return (EBUSY); + + /* Allocate at least a few slots. */ + mfp->free_cnt = nelems; + if (nelems == 0) + nelems = 50; + + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, nelems * sizeof(db_pgno_t), &mfp->free_list, &retp)) != 0) + return (ret); + + mfp->free_size = nelems * sizeof(db_pgno_t); + *listp = retp; + return (0); +} + +/* + * __memp_free_freelist -- + * Free the list. 
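+ *	The list is reference counted; the region memory is returned only
+ *	when the last reference is released.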
+ * + * PUBLIC: int __memp_free_freelist __P((DB_MPOOLFILE *)); + */ +int +__memp_free_freelist(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOLFILE *mfp; + + env = dbmfp->env; + dbmp = env->mp_handle; + mfp = dbmfp->mfp; + + DB_ASSERT(env, mfp->free_ref > 0); + if (--mfp->free_ref > 0) + return (0); + + DB_ASSERT(env, mfp->free_size != 0); + + MPOOL_SYSTEM_LOCK(env); + __memp_free(dbmp->reginfo, R_ADDR(dbmp->reginfo, mfp->free_list)); + MPOOL_SYSTEM_UNLOCK(env); + + mfp->free_cnt = 0; + mfp->free_list = 0; + mfp->free_size = 0; + return (0); +} + +/* + * __memp_get_freelst -- + * Return current list. + * + * PUBLIC: int __memp_get_freelist __P(( + * PUBLIC: DB_MPOOLFILE *, u_int32_t *, db_pgno_t **)); + */ +int +__memp_get_freelist(dbmfp, nelemp, listp) + DB_MPOOLFILE *dbmfp; + u_int32_t *nelemp; + db_pgno_t **listp; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOLFILE *mfp; + + env = dbmfp->env; + dbmp = env->mp_handle; + mfp = dbmfp->mfp; + + if (mfp->free_size == 0) { + *nelemp = 0; + *listp = NULL; + } else { + *nelemp = mfp->free_cnt; + *listp = R_ADDR(dbmp->reginfo, mfp->free_list); + } + + return (0); +} + +/* + * __memp_extend_freelist -- + * Extend the list. + * + * PUBLIC: int __memp_extend_freelist __P(( + * PUBLIC: DB_MPOOLFILE *, u_int32_t , db_pgno_t **)); + */ +int +__memp_extend_freelist(dbmfp, count, listp) + DB_MPOOLFILE *dbmfp; + u_int32_t count; + db_pgno_t **listp; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOLFILE *mfp; + int ret; + void *retp; + + env = dbmfp->env; + dbmp = env->mp_handle; + mfp = dbmfp->mfp; + + if (mfp->free_size == 0) + return (EINVAL); + + if (count * sizeof(db_pgno_t) > mfp->free_size) { + mfp->free_size = + (size_t)DB_ALIGN(count * sizeof(db_pgno_t), 512); + *listp = R_ADDR(dbmp->reginfo, mfp->free_list); + if ((ret = __memp_alloc(dbmp, dbmp->reginfo, + NULL, mfp->free_size, &mfp->free_list, &retp)) != 0) + return (ret); + + memcpy(retp, *listp, mfp->free_cnt * sizeof(db_pgno_t)); + + MPOOL_SYSTEM_LOCK(env); + __memp_free(dbmp->reginfo, *listp); + MPOOL_SYSTEM_UNLOCK(env); + } + + mfp->free_cnt = count; + *listp = R_ADDR(dbmp->reginfo, mfp->free_list); + + return (0); +} +#endif + +/* + * __memp_set_last_pgno -- set the last page of the file + * + * PUBLIC: void __memp_set_last_pgno __P((DB_MPOOLFILE *, db_pgno_t)); + */ +void +__memp_set_last_pgno(dbmfp, pgno) + DB_MPOOLFILE *dbmfp; + db_pgno_t pgno; +{ + dbmfp->mfp->last_pgno = pgno; +} diff --git a/mp/mp_mvcc.c b/mp/mp_mvcc.c new file mode 100644 index 0000000..34467d2 --- /dev/null +++ b/mp/mp_mvcc.c @@ -0,0 +1,634 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2006-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __pgno_cmp __P((const void *, const void *)); + +/* + * __memp_bh_settxn -- + * Set the transaction that owns the given buffer. 
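+ *	Recording the owning transaction is what later lets MVCC readers
+ *	decide whether this version of the page is visible to them.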
+ * + * PUBLIC: int __memp_bh_settxn __P((DB_MPOOL *, MPOOLFILE *mfp, BH *, void *)); + */ +int +__memp_bh_settxn(dbmp, mfp, bhp, vtd) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + BH *bhp; + void *vtd; +{ + ENV *env; + TXN_DETAIL *td; + + env = dbmp->env; + td = (TXN_DETAIL *)vtd; + + if (td == NULL) { + __db_errx(env, + "%s: non-transactional update to a multiversion file", + __memp_fns(dbmp, mfp)); + return (EINVAL); + } + + if (bhp->td_off != INVALID_ROFF) { + DB_ASSERT(env, BH_OWNER(env, bhp) == td); + return (0); + } + + bhp->td_off = R_OFFSET(&env->tx_handle->reginfo, td); + return (__txn_add_buffer(env, td)); +} + +/* + * __memp_skip_curadj -- + * Indicate whether a cursor adjustment can be skipped for a snapshot + * cursor. + * + * PUBLIC: int __memp_skip_curadj __P((DBC *, db_pgno_t)); + */ +int +__memp_skip_curadj(dbc, pgno) + DBC * dbc; + db_pgno_t pgno; +{ + BH *bhp; + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; + DB_TXN *txn; + ENV *env; + MPOOLFILE *mfp; + REGINFO *infop; + roff_t mf_offset; + int ret, skip; + u_int32_t bucket; + + env = dbc->env; + dbmp = env->mp_handle; + dbmfp = dbc->dbp->mpf; + mfp = dbmfp->mfp; + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + skip = 0; + + for (txn = dbc->txn; txn->parent != NULL; txn = txn->parent) + ; + + /* + * Determine the cache and hash bucket where this page lives and get + * local pointers to them. Reset on each pass through this code, the + * page number can change. + */ + MP_GET_BUCKET(env, mfp, pgno, &infop, hp, bucket, ret); + if (ret != 0) { + /* Panic: there is no way to return the error. */ + (void)__env_panic(env, ret); + return (0); + } + + SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { + if (bhp->pgno != pgno || bhp->mf_offset != mf_offset) + continue; + + if (!BH_OWNED_BY(env, bhp, txn)) + skip = 1; + break; + } + MUTEX_UNLOCK(env, hp->mtx_hash); + + return (skip); +} + +#define DB_FREEZER_MAGIC 0x06102002 + +/* + * __memp_bh_freeze -- + * Save a buffer to temporary storage in case it is needed later by + * a snapshot transaction. This function should be called with the buffer + * locked and will exit with it locked. A BH_FROZEN buffer header is + * allocated to represent the frozen data in mpool. + * + * PUBLIC: int __memp_bh_freeze __P((DB_MPOOL *, REGINFO *, DB_MPOOL_HASH *, + * PUBLIC: BH *, int *)); + */ +int +__memp_bh_freeze(dbmp, infop, hp, bhp, need_frozenp) + DB_MPOOL *dbmp; + REGINFO *infop; + DB_MPOOL_HASH *hp; + BH *bhp; + int *need_frozenp; +{ + BH *frozen_bhp; + BH_FROZEN_ALLOC *frozen_alloc; + DB_FH *fhp; + ENV *env; + MPOOL *c_mp; + MPOOLFILE *mfp; + db_mutex_t mutex; + db_pgno_t maxpgno, newpgno, nextfree; + size_t nio; + int created, h_locked, ret, t_ret; + u_int32_t magic, nbucket, ncache, pagesize; + char filename[100], *real_name; + + env = dbmp->env; + c_mp = infop->primary; + created = h_locked = ret = 0; + /* Find the associated MPOOLFILE. */ + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + pagesize = mfp->stat.st_pagesize; + real_name = NULL; + fhp = NULL; + + MVCC_MPROTECT(bhp->buf, pagesize, PROT_READ | PROT_WRITE); + + MPOOL_REGION_LOCK(env, infop); + frozen_bhp = SH_TAILQ_FIRST(&c_mp->free_frozen, __bh); + if (frozen_bhp != NULL) { + SH_TAILQ_REMOVE(&c_mp->free_frozen, frozen_bhp, hq, __bh); + *need_frozenp = SH_TAILQ_EMPTY(&c_mp->free_frozen); + } else { + *need_frozenp = 1; + + /* There might be a small amount of unallocated space. 
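+		 * Try to carve a frozen buffer header out of that space here;
+		 * if even that fails, give up below rather than recursing into
+		 * the allocator.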
*/ + if (__env_alloc(infop, + sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), + &frozen_alloc) == 0) { + frozen_bhp = (BH *)(frozen_alloc + 1); + frozen_bhp->mtx_buf = MUTEX_INVALID; + SH_TAILQ_INSERT_TAIL(&c_mp->alloc_frozen, + frozen_alloc, links); + } + } + MPOOL_REGION_UNLOCK(env, infop); + + /* + * If we can't get a frozen buffer header, return ENOMEM immediately: + * we don't want to call __memp_alloc recursively. __memp_alloc will + * turn the next free page it finds into frozen buffer headers. + */ + if (frozen_bhp == NULL) { + ret = ENOMEM; + goto err; + } + + /* + * For now, keep things simple and have one file per page size per + * hash bucket. This improves concurrency but can mean lots of files + * if there is lots of freezing. + */ + ncache = (u_int32_t)(infop - dbmp->reginfo); + nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab)); + snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK", + (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024); + + if ((ret = __db_appname(env, + DB_APP_NONE, filename, NULL, &real_name)) != 0) + goto err; + + MUTEX_LOCK(env, hp->mtx_hash); + h_locked = 1; + DB_ASSERT(env, F_ISSET(bhp, BH_EXCLUSIVE) && !F_ISSET(bhp, BH_FROZEN)); + + if (BH_REFCOUNT(bhp) > 1 || F_ISSET(bhp, BH_DIRTY)) { + ret = EBUSY; + goto err; + } + + if ((ret = __os_open(env, real_name, pagesize, + DB_OSO_CREATE | DB_OSO_EXCL, env->db_mode, &fhp)) == 0) { + /* We're creating the file -- initialize the metadata page. */ + created = 1; + magic = DB_FREEZER_MAGIC; + maxpgno = newpgno = 0; + if ((ret = __os_write(env, fhp, + &magic, sizeof(u_int32_t), &nio)) != 0 || + (ret = __os_write(env, fhp, + &newpgno, sizeof(db_pgno_t), &nio)) != 0 || + (ret = __os_write(env, fhp, + &maxpgno, sizeof(db_pgno_t), &nio)) != 0 || + (ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + goto err; + } else if (ret == EEXIST) + ret = __os_open(env, + real_name, pagesize, 0, env->db_mode, &fhp); + if (ret != 0) + goto err; + if ((ret = __os_read(env, fhp, + &magic, sizeof(u_int32_t), &nio)) != 0 || + (ret = __os_read(env, fhp, + &newpgno, sizeof(db_pgno_t), &nio)) != 0 || + (ret = __os_read(env, fhp, + &maxpgno, sizeof(db_pgno_t), &nio)) != 0) + goto err; + if (magic != DB_FREEZER_MAGIC) { + ret = EINVAL; + goto err; + } + if (newpgno == 0) { + newpgno = ++maxpgno; + if ((ret = __os_seek(env, + fhp, 0, 0, sizeof(u_int32_t) + sizeof(db_pgno_t))) != 0 || + (ret = __os_write(env, fhp, &maxpgno, sizeof(db_pgno_t), + &nio)) != 0) + goto err; + } else { + if ((ret = __os_seek(env, fhp, newpgno, pagesize, 0)) != 0 || + (ret = __os_read(env, fhp, &nextfree, sizeof(db_pgno_t), + &nio)) != 0) + goto err; + if ((ret = + __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 || + (ret = __os_write(env, fhp, &nextfree, sizeof(db_pgno_t), + &nio)) != 0) + goto err; + } + + /* Write the buffer to the allocated page. */ + if ((ret = __os_io(env, DB_IO_WRITE, fhp, newpgno, pagesize, 0, + pagesize, bhp->buf, &nio)) != 0) + goto err; + + ret = __os_closehandle(env, fhp); + fhp = NULL; + if (ret != 0) + goto err; + + /* + * Set up the frozen_bhp with the freezer page number. The original + * buffer header is about to be freed, so transfer resources to the + * frozen header here. 
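+	 * In particular, keep the frozen header's preallocated mutex if it
+	 * has one; otherwise allocate a fresh shared latch for it after the
+	 * copy.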
+ */ + mutex = frozen_bhp->mtx_buf; +#ifdef DIAG_MVCC + memcpy(frozen_bhp, bhp, SSZ(BH, align_off)); +#else + memcpy(frozen_bhp, bhp, SSZA(BH, buf)); +#endif + atomic_init(&frozen_bhp->ref, 0); + if (mutex != MUTEX_INVALID) + frozen_bhp->mtx_buf = mutex; + else if ((ret = __mutex_alloc(env, MTX_MPOOL_BH, + DB_MUTEX_SHARED, &frozen_bhp->mtx_buf)) != 0) + goto err; + F_SET(frozen_bhp, BH_FROZEN); + F_CLR(frozen_bhp, BH_EXCLUSIVE); + ((BH_FROZEN_PAGE *)frozen_bhp)->spgno = newpgno; + + /* + * We're about to add the frozen buffer header to the version chain, so + * we have temporarily created another buffer for the owning + * transaction. + */ + if (frozen_bhp->td_off != INVALID_ROFF && + (ret = __txn_add_buffer(env, BH_OWNER(env, frozen_bhp))) != 0) { + (void)__env_panic(env, ret); + goto err; + } + + /* + * Add the frozen buffer to the version chain and update the hash + * bucket if this is the head revision. The original buffer will be + * freed by __memp_alloc calling __memp_bhfree (assuming no other + * thread has blocked waiting for it while we were freezing). + */ + SH_CHAIN_INSERT_AFTER(bhp, frozen_bhp, vc, __bh); + if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) { + SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, + bhp, frozen_bhp, hq, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh); + } + MUTEX_UNLOCK(env, hp->mtx_hash); + h_locked = 0; + + /* + * Increment the file's block count -- freeing the original buffer will + * decrement it. + */ + MUTEX_LOCK(env, mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(env, mfp->mutex); + + STAT(++hp->hash_frozen); + + if (0) { +err: if (fhp != NULL && + (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + if (created) { + DB_ASSERT(env, h_locked); + if ((t_ret = __os_unlink(env, real_name, 0)) != 0 && + ret == 0) + ret = t_ret; + } + if (h_locked) + MUTEX_UNLOCK(env, hp->mtx_hash); + if (ret == 0) + ret = EIO; + if (frozen_bhp != NULL) { + MPOOL_REGION_LOCK(env, infop); + SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, + frozen_bhp, hq); + MPOOL_REGION_UNLOCK(env, infop); + } + } + if (real_name != NULL) + __os_free(env, real_name); + if (ret != 0 && ret != EBUSY && ret != ENOMEM) + __db_err(env, ret, "__memp_bh_freeze"); + + return (ret); +} + +static int +__pgno_cmp(a, b) + const void *a, *b; +{ + db_pgno_t *ap, *bp; + + ap = (db_pgno_t *)a; + bp = (db_pgno_t *)b; + + return (int)(*ap - *bp); +} + +/* + * __memp_bh_thaw -- + * Free a buffer header in temporary storage. Optionally restore the + * buffer (if alloc_bhp != NULL). This function should be + * called with the hash bucket locked and will return with it unlocked. 
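+ *	The freezer file layout matches what __memp_bh_freeze writes: a
+ *	header holding a magic number, the first free page number and the
+ *	maximum page number, followed by page-sized slots, where each free
+ *	slot stores the page number of the next free slot.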
+ * + * PUBLIC: int __memp_bh_thaw __P((DB_MPOOL *, REGINFO *, + * PUBLIC: DB_MPOOL_HASH *, BH *, BH *)); + */ +int +__memp_bh_thaw(dbmp, infop, hp, frozen_bhp, alloc_bhp) + DB_MPOOL *dbmp; + REGINFO *infop; + DB_MPOOL_HASH *hp; + BH *frozen_bhp, *alloc_bhp; +{ + DB_FH *fhp; + ENV *env; +#ifdef DIAGNOSTIC + DB_LSN vlsn; +#endif + MPOOL *c_mp; + MPOOLFILE *mfp; + db_mutex_t mutex; + db_pgno_t *freelist, *ppgno, freepgno, maxpgno, spgno; + size_t nio; + u_int32_t listsize, magic, nbucket, ncache, ntrunc, nfree, pagesize; +#ifdef HAVE_FTRUNCATE + int i; +#endif + int h_locked, needfree, ret, t_ret; + char filename[100], *real_name; + + env = dbmp->env; + fhp = NULL; + c_mp = infop->primary; + mfp = R_ADDR(dbmp->reginfo, frozen_bhp->mf_offset); + freelist = NULL; + pagesize = mfp->stat.st_pagesize; + ret = 0; + real_name = NULL; + + MUTEX_REQUIRED(env, hp->mtx_hash); + DB_ASSERT(env, F_ISSET(frozen_bhp, BH_EXCLUSIVE) || alloc_bhp == NULL); + h_locked = 1; + + DB_ASSERT(env, F_ISSET(frozen_bhp, BH_FROZEN) && + !F_ISSET(frozen_bhp, BH_THAWED)); + DB_ASSERT(env, alloc_bhp != NULL || + SH_CHAIN_SINGLETON(frozen_bhp, vc) || + (SH_CHAIN_HASNEXT(frozen_bhp, vc) && + BH_OBSOLETE(frozen_bhp, hp->old_reader, vlsn))); + DB_ASSERT(env, alloc_bhp == NULL || !F_ISSET(alloc_bhp, BH_FROZEN)); + + spgno = ((BH_FROZEN_PAGE *)frozen_bhp)->spgno; + + if (alloc_bhp != NULL) { + mutex = alloc_bhp->mtx_buf; +#ifdef DIAG_MVCC + memcpy(alloc_bhp, frozen_bhp, SSZ(BH, align_off)); +#else + memcpy(alloc_bhp, frozen_bhp, SSZA(BH, buf)); +#endif + alloc_bhp->mtx_buf = mutex; + MUTEX_LOCK(env, alloc_bhp->mtx_buf); + atomic_init(&alloc_bhp->ref, 1); + F_CLR(alloc_bhp, BH_FROZEN); + } + + /* + * For now, keep things simple and have one file per page size per + * hash bucket. This improves concurrency but can mean lots of files + * if there is lots of freezing. + */ + ncache = (u_int32_t)(infop - dbmp->reginfo); + nbucket = (u_int32_t)(hp - (DB_MPOOL_HASH *)R_ADDR(infop, c_mp->htab)); + snprintf(filename, sizeof(filename), "__db.freezer.%lu.%lu.%luK", + (u_long)ncache, (u_long)nbucket, (u_long)pagesize / 1024); + + if ((ret = __db_appname(env, + DB_APP_NONE, filename, NULL, &real_name)) != 0) + goto err; + if ((ret = __os_open(env, + real_name, pagesize, 0, env->db_mode, &fhp)) != 0) + goto err; + + /* + * Read the first free page number -- we're about to free the page + * after we we read it. + */ + if ((ret = __os_read(env, fhp, &magic, sizeof(u_int32_t), &nio)) != 0 || + (ret = + __os_read(env, fhp, &freepgno, sizeof(db_pgno_t), &nio)) != 0 || + (ret = __os_read(env, fhp, &maxpgno, sizeof(db_pgno_t), &nio)) != 0) + goto err; + + if (magic != DB_FREEZER_MAGIC) { + ret = EINVAL; + goto err; + } + + /* Read the buffer from the frozen page. */ + if (alloc_bhp != NULL) { + DB_ASSERT(env, !F_ISSET(frozen_bhp, BH_FREED)); + if ((ret = __os_io(env, DB_IO_READ, fhp, + spgno, pagesize, 0, pagesize, alloc_bhp->buf, &nio)) != 0) + goto err; + } + + /* + * Free the page from the file. If it's the last page, truncate. + * Otherwise, update free page linked list. 
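+	 * Truncation collects the whole free list, sorts it, and cuts off
+	 * the run of consecutive free pages ending at maxpgno; if that run
+	 * covers the entire file, the freezer file is simply removed.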
+ */ + needfree = 1; + if (spgno == maxpgno) { + listsize = 100; + if ((ret = __os_malloc(env, + listsize * sizeof(db_pgno_t), &freelist)) != 0) + goto err; + nfree = 0; + while (freepgno != 0) { + if (nfree == listsize - 1) { + listsize *= 2; + if ((ret = __os_realloc(env, + listsize * sizeof(db_pgno_t), + &freelist)) != 0) + goto err; + } + freelist[nfree++] = freepgno; + if ((ret = __os_seek(env, fhp, + freepgno, pagesize, 0)) != 0 || + (ret = __os_read(env, fhp, &freepgno, + sizeof(db_pgno_t), &nio)) != 0) + goto err; + } + freelist[nfree++] = spgno; + qsort(freelist, nfree, sizeof(db_pgno_t), __pgno_cmp); + for (ppgno = &freelist[nfree - 1]; ppgno > freelist; ppgno--) + if (*(ppgno - 1) != *ppgno - 1) + break; + ntrunc = (u_int32_t)(&freelist[nfree] - ppgno); + if (ntrunc == (u_int32_t)maxpgno) { + needfree = 0; + ret = __os_closehandle(env, fhp); + fhp = NULL; + if (ret != 0 || + (ret = __os_unlink(env, real_name, 0)) != 0) + goto err; + } +#ifdef HAVE_FTRUNCATE + else { + maxpgno -= (db_pgno_t)ntrunc; + if ((ret = __os_truncate(env, fhp, + maxpgno + 1, pagesize)) != 0) + goto err; + + /* Fix up the linked list */ + freelist[nfree - ntrunc] = 0; + if ((ret = __os_seek(env, fhp, + 0, 0, sizeof(u_int32_t))) != 0 || + (ret = __os_write(env, fhp, &freelist[0], + sizeof(db_pgno_t), &nio)) != 0 || + (ret = __os_write(env, fhp, &maxpgno, + sizeof(db_pgno_t), &nio)) != 0) + goto err; + + for (i = 0; i < (int)(nfree - ntrunc); i++) + if ((ret = __os_seek(env, + fhp, freelist[i], pagesize, 0)) != 0 || + (ret = __os_write(env, fhp, + &freelist[i + 1], sizeof(db_pgno_t), + &nio)) != 0) + goto err; + needfree = 0; + } +#endif + } + if (needfree) { + if ((ret = __os_seek(env, fhp, spgno, pagesize, 0)) != 0 || + (ret = __os_write(env, fhp, + &freepgno, sizeof(db_pgno_t), &nio)) != 0 || + (ret = __os_seek(env, fhp, 0, 0, sizeof(u_int32_t))) != 0 || + (ret = __os_write(env, fhp, + &spgno, sizeof(db_pgno_t), &nio)) != 0) + goto err; + + ret = __os_closehandle(env, fhp); + fhp = NULL; + if (ret != 0) + goto err; + } + + /* + * Add the thawed buffer (if any) to the version chain. We can't + * do this any earlier, because we can't guarantee that another thread + * won't be waiting for it, which means we can't clean up if there are + * errors reading from the freezer. We can't do it any later, because + * we're about to free frozen_bhp, and without it we would need to do + * another cache lookup to find out where the new page should live. + */ + MUTEX_REQUIRED(env, hp->mtx_hash); + if (alloc_bhp != NULL) { + alloc_bhp->priority = c_mp->lru_count; + + SH_CHAIN_INSERT_AFTER(frozen_bhp, alloc_bhp, vc, __bh); + if (!SH_CHAIN_HASNEXT(alloc_bhp, vc)) { + SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp, + alloc_bhp, hq, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh); + } + } else if (!SH_CHAIN_HASNEXT(frozen_bhp, vc)) { + if (SH_CHAIN_HASPREV(frozen_bhp, vc)) + SH_TAILQ_INSERT_BEFORE(&hp->hash_bucket, frozen_bhp, + SH_CHAIN_PREV(frozen_bhp, vc, __bh), hq, __bh); + SH_TAILQ_REMOVE(&hp->hash_bucket, frozen_bhp, hq, __bh); + } + SH_CHAIN_REMOVE(frozen_bhp, vc, __bh); + + if (alloc_bhp == NULL && frozen_bhp->td_off != INVALID_ROFF && + (ret = __txn_remove_buffer(env, + BH_OWNER(env, frozen_bhp), MUTEX_INVALID)) != 0) { + (void)__env_panic(env, ret); + goto err; + } + frozen_bhp->td_off = INVALID_ROFF; + + /* + * If other threads are waiting for this buffer as well, they will have + * incremented the reference count and will be waiting on the mutex. 
+ * For that reason, we can't unconditionally free the memory here. + */ + needfree = (atomic_dec(env, &frozen_bhp->ref) == 0); + if (!needfree) + F_SET(frozen_bhp, BH_THAWED); + MUTEX_UNLOCK(env, hp->mtx_hash); + if (F_ISSET(frozen_bhp, BH_EXCLUSIVE)) + MUTEX_UNLOCK(env, frozen_bhp->mtx_buf); + h_locked = 0; + if (needfree) { + MPOOL_REGION_LOCK(env, infop); + SH_TAILQ_INSERT_TAIL(&c_mp->free_frozen, frozen_bhp, hq); + MPOOL_REGION_UNLOCK(env, infop); + } + +#ifdef HAVE_STATISTICS + if (alloc_bhp != NULL) + ++hp->hash_thawed; + else + ++hp->hash_frozen_freed; +#endif + + if (0) { +err: if (h_locked) + MUTEX_UNLOCK(env, hp->mtx_hash); + if (ret == 0) + ret = EIO; + } + if (real_name != NULL) + __os_free(env, real_name); + if (freelist != NULL) + __os_free(env, freelist); + if (fhp != NULL && + (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + __db_err(env, ret, "__memp_bh_thaw"); + + return (ret); +} diff --git a/mp/mp_region.c b/mp/mp_region.c new file mode 100644 index 0000000..e6cece9 --- /dev/null +++ b/mp/mp_region.c @@ -0,0 +1,588 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/mp.h" + +static int __memp_init_config __P((ENV *, MPOOL *)); +static void __memp_region_size __P((ENV *, roff_t *, u_int32_t *)); + +#define MPOOL_DEFAULT_PAGESIZE (4 * 1024) + +/* + * __memp_open -- + * Internal version of memp_open: only called from ENV->open. + * + * PUBLIC: int __memp_open __P((ENV *, int)); + */ +int +__memp_open(env, create_ok) + ENV *env; + int create_ok; +{ + DB_ENV *dbenv; + DB_MPOOL *dbmp; + MPOOL *mp; + REGINFO reginfo; + roff_t reg_size; + u_int i, max_nreg; + u_int32_t htab_buckets, *regids; + int ret; + + dbenv = env->dbenv; + + /* Calculate the region size and hash bucket count. */ + __memp_region_size(env, ®_size, &htab_buckets); + + /* Create and initialize the DB_MPOOL structure. */ + if ((ret = __os_calloc(env, 1, sizeof(*dbmp), &dbmp)) != 0) + return (ret); + LIST_INIT(&dbmp->dbregq); + TAILQ_INIT(&dbmp->dbmfq); + dbmp->env = env; + + /* Join/create the first mpool region. */ + memset(®info, 0, sizeof(REGINFO)); + reginfo.env = env; + reginfo.type = REGION_TYPE_MPOOL; + reginfo.id = INVALID_REGION_ID; + reginfo.flags = REGION_JOIN_OK; + if (create_ok) + F_SET(®info, REGION_CREATE_OK); + if ((ret = __env_region_attach(env, ®info, reg_size)) != 0) + goto err; + + /* + * If we created the region, initialize it. Create or join any + * additional regions. + */ + if (F_ISSET(®info, REGION_CREATE)) { + /* + * We define how many regions there are going to be, allocate + * the REGINFO structures and create them. Make sure we don't + * clear the wrong entries on error. + */ + max_nreg = __memp_max_regions(env); + if ((ret = __os_calloc(env, + max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) + goto err; + /* Make sure we don't clear the wrong entries on error. */ + dbmp->reginfo[0] = reginfo; + for (i = 1; i < max_nreg; ++i) + dbmp->reginfo[i].id = INVALID_REGION_ID; + + /* Initialize the first region. */ + if ((ret = __memp_init(env, dbmp, + 0, htab_buckets, max_nreg)) != 0) + goto err; + + /* + * Create/initialize remaining regions and copy their IDs into + * the first region. 
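+		 * The saved region IDs are what processes joining later use to
+		 * attach to exactly the same set of cache regions.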
+ */ + mp = R_ADDR(dbmp->reginfo, dbmp->reginfo[0].rp->primary); + regids = R_ADDR(dbmp->reginfo, mp->regids); + regids[0] = dbmp->reginfo[0].id; + for (i = 1; i < dbenv->mp_ncache; ++i) { + dbmp->reginfo[i].env = env; + dbmp->reginfo[i].type = REGION_TYPE_MPOOL; + dbmp->reginfo[i].id = INVALID_REGION_ID; + dbmp->reginfo[i].flags = REGION_CREATE_OK; + if ((ret = __env_region_attach( + env, &dbmp->reginfo[i], reg_size)) != 0) + goto err; + if ((ret = __memp_init(env, dbmp, + i, htab_buckets, max_nreg)) != 0) + goto err; + + regids[i] = dbmp->reginfo[i].id; + } + } else { + /* + * Determine how many regions there are going to be, allocate + * the REGINFO structures and fill in local copies of that + * information. + */ + mp = R_ADDR(®info, reginfo.rp->primary); + dbenv->mp_ncache = mp->nreg; + if ((ret = __os_calloc(env, + mp->max_nreg, sizeof(REGINFO), &dbmp->reginfo)) != 0) + goto err; + /* Make sure we don't clear the wrong entries on error. */ + for (i = 0; i < dbenv->mp_ncache; ++i) + dbmp->reginfo[i].id = INVALID_REGION_ID; + dbmp->reginfo[0] = reginfo; + + /* Join remaining regions. */ + regids = R_ADDR(dbmp->reginfo, mp->regids); + for (i = 1; i < dbenv->mp_ncache; ++i) { + dbmp->reginfo[i].env = env; + dbmp->reginfo[i].type = REGION_TYPE_MPOOL; + dbmp->reginfo[i].id = regids[i]; + dbmp->reginfo[i].flags = REGION_JOIN_OK; + if ((ret = __env_region_attach( + env, &dbmp->reginfo[i], 0)) != 0) + goto err; + } + } + + /* Set the local addresses for the regions. */ + for (i = 0; i < dbenv->mp_ncache; ++i) + dbmp->reginfo[i].primary = + R_ADDR(&dbmp->reginfo[i], dbmp->reginfo[i].rp->primary); + + /* If the region is threaded, allocate a mutex to lock the handles. */ + if ((ret = __mutex_alloc(env, + MTX_MPOOL_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbmp->mutex)) != 0) + goto err; + + env->mp_handle = dbmp; + + /* A process joining the region may reset the mpool configuration. */ + if ((ret = __memp_init_config(env, mp)) != 0) + return (ret); + + return (0); + +err: env->mp_handle = NULL; + if (dbmp->reginfo != NULL && dbmp->reginfo[0].addr != NULL) { + for (i = 0; i < dbenv->mp_ncache; ++i) + if (dbmp->reginfo[i].id != INVALID_REGION_ID) + (void)__env_region_detach( + env, &dbmp->reginfo[i], 0); + __os_free(env, dbmp->reginfo); + } + + (void)__mutex_free(env, &dbmp->mutex); + __os_free(env, dbmp); + return (ret); +} + +/* + * __memp_init -- + * Initialize a MPOOL structure in shared memory. 
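+ *	The first region additionally owns the file-name hash table, the
+ *	array of region IDs and the block of hash-bucket mutexes from which
+ *	every cache region takes its slice.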
+ * + * PUBLIC: int __memp_init + * PUBLIC: __P((ENV *, DB_MPOOL *, u_int, u_int32_t, u_int)); + */ +int +__memp_init(env, dbmp, reginfo_off, htab_buckets, max_nreg) + ENV *env; + DB_MPOOL *dbmp; + u_int reginfo_off, max_nreg; + u_int32_t htab_buckets; +{ + BH *frozen_bhp; + BH_FROZEN_ALLOC *frozen; + DB_ENV *dbenv; + DB_MPOOL_HASH *htab, *hp; + MPOOL *mp, *main_mp; + REGINFO *infop; + db_mutex_t mtx_base, mtx_discard, mtx_prev; + u_int32_t i; + int ret; + void *p; + + dbenv = env->dbenv; + + infop = &dbmp->reginfo[reginfo_off]; + if ((ret = __env_alloc(infop, sizeof(MPOOL), &infop->primary)) != 0) + goto mem_err; + infop->rp->primary = R_OFFSET(infop, infop->primary); + mp = infop->primary; + memset(mp, 0, sizeof(*mp)); + + if ((ret = + __mutex_alloc(env, MTX_MPOOL_REGION, 0, &mp->mtx_region)) != 0) + return (ret); + + if (reginfo_off == 0) { + ZERO_LSN(mp->lsn); + + mp->nreg = dbenv->mp_ncache; + mp->max_nreg = max_nreg; + if ((ret = __env_alloc(&dbmp->reginfo[0], + max_nreg * sizeof(u_int32_t), &p)) != 0) + goto mem_err; + mp->regids = R_OFFSET(dbmp->reginfo, p); + mp->nbuckets = dbenv->mp_ncache * htab_buckets; + + /* Allocate file table space and initialize it. */ + if ((ret = __env_alloc(infop, + MPOOL_FILE_BUCKETS * sizeof(DB_MPOOL_HASH), &htab)) != 0) + goto mem_err; + mp->ftab = R_OFFSET(infop, htab); + for (i = 0; i < MPOOL_FILE_BUCKETS; i++) { + if ((ret = __mutex_alloc(env, + MTX_MPOOL_FILE_BUCKET, 0, &htab[i].mtx_hash)) != 0) + return (ret); + SH_TAILQ_INIT(&htab[i].hash_bucket); + atomic_init(&htab[i].hash_page_dirty, 0); + } + + /* + * Allocate all of the hash bucket mutexes up front. We do + * this so that we don't need to free and reallocate mutexes as + * the cache is resized. + */ + mtx_base = mtx_prev = MUTEX_INVALID; + for (i = 0; i < mp->max_nreg * htab_buckets; i++) { + if ((ret = __mutex_alloc(env, MTX_MPOOL_HASH_BUCKET, + DB_MUTEX_SHARED, &mtx_discard)) != 0) + return (ret); + if (i == 0) { + mtx_base = mtx_discard; + mtx_prev = mtx_discard - 1; + } + DB_ASSERT(env, mtx_discard == mtx_prev + 1 || + mtx_base == MUTEX_INVALID); + mtx_prev = mtx_discard; + } + } else { + main_mp = dbmp->reginfo[0].primary; + htab = R_ADDR(&dbmp->reginfo[0], main_mp->htab); + mtx_base = htab[0].mtx_hash; + } + + /* + * We preallocated all of the mutexes in a block, so for regions after + * the first, we skip mutexes in use in earlier regions. Each region + * has the same number of buckets + */ + if (mtx_base != MUTEX_INVALID) + mtx_base += reginfo_off * htab_buckets; + + /* Allocate hash table space and initialize it. */ + if ((ret = __env_alloc(infop, + htab_buckets * sizeof(DB_MPOOL_HASH), &htab)) != 0) + goto mem_err; + mp->htab = R_OFFSET(infop, htab); + for (i = 0; i < htab_buckets; i++) { + hp = &htab[i]; + hp->mtx_hash = (mtx_base == MUTEX_INVALID) ? MUTEX_INVALID : + mtx_base + i; + SH_TAILQ_INIT(&hp->hash_bucket); + atomic_init(&hp->hash_page_dirty, 0); +#ifdef HAVE_STATISTICS + hp->hash_io_wait = 0; + hp->hash_frozen = hp->hash_thawed = hp->hash_frozen_freed = 0; +#endif + hp->flags = 0; + ZERO_LSN(hp->old_reader); + } + mp->htab_buckets = htab_buckets; +#ifdef HAVE_STATISTICS + mp->stat.st_hash_buckets = htab_buckets; + mp->stat.st_pagesize = dbenv->mp_pagesize == 0 ? + MPOOL_DEFAULT_PAGESIZE : dbenv->mp_pagesize; +#endif + + SH_TAILQ_INIT(&mp->free_frozen); + SH_TAILQ_INIT(&mp->alloc_frozen); + + /* + * Pre-allocate one frozen buffer header. 
This avoids situations where + * the cache becomes full of pages and we don't even have the 28 bytes + * (or so) available to allocate a frozen buffer header. + */ + if ((ret = __env_alloc(infop, + sizeof(BH_FROZEN_ALLOC) + sizeof(BH_FROZEN_PAGE), &frozen)) != 0) + goto mem_err; + SH_TAILQ_INSERT_TAIL(&mp->alloc_frozen, frozen, links); + frozen_bhp = (BH *)(frozen + 1); + frozen_bhp->mtx_buf = MUTEX_INVALID; + SH_TAILQ_INSERT_TAIL(&mp->free_frozen, frozen_bhp, hq); + + /* + * Only the environment creator knows the total cache size, fill in + * those statistics now. + */ + mp->stat.st_gbytes = dbenv->mp_gbytes; + mp->stat.st_bytes = dbenv->mp_bytes; + infop->mtx_alloc = mp->mtx_region; + return (0); + +mem_err:__db_errx(env, "Unable to allocate memory for mpool region"); + return (ret); +} + +/* + * PUBLIC: u_int32_t __memp_max_regions __P((ENV *)); + */ +u_int32_t +__memp_max_regions(env) + ENV *env; +{ + DB_ENV *dbenv; + roff_t reg_size, max_size; + size_t max_nreg; + + dbenv = env->dbenv; + + __memp_region_size(env, ®_size, NULL); + max_size = + (roff_t)dbenv->mp_max_gbytes * GIGABYTE + dbenv->mp_max_bytes; + max_nreg = (max_size + reg_size / 2) / reg_size; + + /* Sanity check that the number of regions fits in 32 bits. */ + DB_ASSERT(env, max_nreg == (u_int32_t)max_nreg); + + if (max_nreg <= dbenv->mp_ncache) + max_nreg = dbenv->mp_ncache; + return ((u_int32_t)max_nreg); +} + +/* + * __memp_region_size -- + * Size the region and figure out how many hash buckets we'll have. + */ +static void +__memp_region_size(env, reg_sizep, htab_bucketsp) + ENV *env; + roff_t *reg_sizep; + u_int32_t *htab_bucketsp; +{ + DB_ENV *dbenv; + roff_t reg_size, cache_size; + u_int32_t pgsize; + + dbenv = env->dbenv; + + /* + * Figure out how big each cache region is. Cast an operand to roff_t + * so we do 64-bit arithmetic as appropriate. + */ + cache_size = (roff_t)dbenv->mp_gbytes * GIGABYTE + dbenv->mp_bytes; + reg_size = cache_size / dbenv->mp_ncache; + if (reg_sizep != NULL) + *reg_sizep = reg_size; + + /* + * Figure out how many hash buckets each region will have. Assume we + * want to keep the hash chains with under 3 pages on each chain. We + * don't know the pagesize in advance, and it may differ for different + * files. Use a pagesize of 4K for the calculation -- we walk these + * chains a lot, they must be kept short. We use 2.5 as this maintains + * compatibility with previous releases. + * + * XXX + * Cache sizes larger than 10TB would cause 32-bit wrapping in the + * calculation of the number of hash buckets. This probably isn't + * something we need to worry about right now, but is checked when the + * cache size is set. + */ + if (htab_bucketsp != NULL) { + if (dbenv->mp_tablesize != 0) + *htab_bucketsp = __db_tablesize(dbenv->mp_tablesize); + else { + if ((pgsize = dbenv->mp_pagesize) == 0) + pgsize = MPOOL_DEFAULT_PAGESIZE; + *htab_bucketsp = __db_tablesize( + (u_int32_t)(reg_size / (2.5 * pgsize))); + } + } +} + +/* + * __memp_region_mutex_count -- + * Return the number of mutexes the mpool region will need. 
+ * + * PUBLIC: u_int32_t __memp_region_mutex_count __P((ENV *)); + */ +u_int32_t +__memp_region_mutex_count(env) + ENV *env; +{ + DB_ENV *dbenv; + u_int32_t htab_buckets; + roff_t reg_size; + u_int32_t num_per_cache, pgsize; + + dbenv = env->dbenv; + + __memp_region_size(env, ®_size, &htab_buckets); + if ((pgsize = dbenv->mp_pagesize) == 0) + pgsize = MPOOL_DEFAULT_PAGESIZE; + + /* + * We need a couple of mutexes for the region itself, one for each + * file handle (MPOOLFILE) the application allocates, one for each + * of the MPOOL_FILE_BUCKETS, and each cache has one mutex per + * hash bucket. We then need one mutex per page in the cache, + * the worst case is really big if the pages are 512 bytes. + */ + num_per_cache = htab_buckets + (u_int32_t)(reg_size / pgsize); + return ((dbenv->mp_ncache * num_per_cache) + 50 + MPOOL_FILE_BUCKETS); +} + +/* + * __memp_init_config -- + * Initialize shared configuration information. + */ +static int +__memp_init_config(env, mp) + ENV *env; + MPOOL *mp; +{ + DB_ENV *dbenv; + + dbenv = env->dbenv; + + MPOOL_SYSTEM_LOCK(env); + if (dbenv->mp_mmapsize != 0) + mp->mp_mmapsize = dbenv->mp_mmapsize; + if (dbenv->mp_maxopenfd != 0) + mp->mp_maxopenfd = dbenv->mp_maxopenfd; + if (dbenv->mp_maxwrite != 0) + mp->mp_maxwrite = dbenv->mp_maxwrite; + if (dbenv->mp_maxwrite_sleep != 0) + mp->mp_maxwrite_sleep = dbenv->mp_maxwrite_sleep; + MPOOL_SYSTEM_UNLOCK(env); + + return (0); +} + +/* + * __memp_env_refresh -- + * Clean up after the mpool system on a close or failed open. + * + * PUBLIC: int __memp_env_refresh __P((ENV *)); + */ +int +__memp_env_refresh(env) + ENV *env; +{ + BH *bhp; + BH_FROZEN_ALLOC *frozen_alloc; + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + DB_MPOOL_HASH *hp; + DB_MPREG *mpreg; + MPOOL *mp, *c_mp; + REGINFO *infop; + db_mutex_t mtx_base, mtx; + u_int32_t bucket, htab_buckets, i, max_nreg, nreg; + int ret, t_ret; + + ret = 0; + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + htab_buckets = mp->htab_buckets; + nreg = mp->nreg; + max_nreg = mp->max_nreg; + hp = R_ADDR(&dbmp->reginfo[0], mp->htab); + mtx_base = hp->mtx_hash; + + /* + * If a private region, return the memory to the heap. Not needed for + * filesystem-backed or system shared memory regions, that memory isn't + * owned by any particular process. + */ + if (!F_ISSET(env, ENV_PRIVATE)) + goto not_priv; + + /* Discard buffers. */ + for (i = 0; i < nreg; ++i) { + infop = &dbmp->reginfo[i]; + c_mp = infop->primary; + for (hp = R_ADDR(infop, c_mp->htab), bucket = 0; + bucket < c_mp->htab_buckets; ++hp, ++bucket) { + while ((bhp = SH_TAILQ_FIRST( + &hp->hash_bucket, __bh)) != NULL) + if (F_ISSET(bhp, BH_FROZEN)) + SH_TAILQ_REMOVE( + &hp->hash_bucket, bhp, + hq, __bh); + else { + if (F_ISSET(bhp, BH_DIRTY)) { + atomic_dec(env, + &hp->hash_page_dirty); + F_CLR(bhp, + BH_DIRTY | BH_DIRTY_CREATE); + } + atomic_inc(env, &bhp->ref); + if ((t_ret = __memp_bhfree(dbmp, infop, + R_ADDR(dbmp->reginfo, + bhp->mf_offset), hp, bhp, + BH_FREE_FREEMEM | + BH_FREE_UNLOCKED)) != 0 && ret == 0) + ret = t_ret; + } + } + MPOOL_REGION_LOCK(env, infop); + while ((frozen_alloc = SH_TAILQ_FIRST( + &c_mp->alloc_frozen, __bh_frozen_a)) != NULL) { + SH_TAILQ_REMOVE(&c_mp->alloc_frozen, frozen_alloc, + links, __bh_frozen_a); + __env_alloc_free(infop, frozen_alloc); + } + MPOOL_REGION_UNLOCK(env, infop); + } + + /* Discard hash bucket mutexes. 
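+	 * They were allocated as one contiguous block when the first region
+	 * was created, so free the slots for every possible region here, not
+	 * just the regions currently in use.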
*/ + if (mtx_base != MUTEX_INVALID) + for (i = 0; i < max_nreg * htab_buckets; ++i) { + mtx = mtx_base + i; + if ((t_ret = __mutex_free(env, &mtx)) != 0 && + ret == 0) + ret = t_ret; + } + +not_priv: + /* Discard DB_MPOOLFILEs. */ + while ((dbmfp = TAILQ_FIRST(&dbmp->dbmfq)) != NULL) + if ((t_ret = __memp_fclose(dbmfp, 0)) != 0 && ret == 0) + ret = t_ret; + + /* Discard DB_MPREGs. */ + if (dbmp->pg_inout != NULL) + __os_free(env, dbmp->pg_inout); + while ((mpreg = LIST_FIRST(&dbmp->dbregq)) != NULL) { + LIST_REMOVE(mpreg, q); + __os_free(env, mpreg); + } + + /* Discard the DB_MPOOL thread mutex. */ + if ((t_ret = __mutex_free(env, &dbmp->mutex)) != 0 && ret == 0) + ret = t_ret; + + if (F_ISSET(env, ENV_PRIVATE)) { + /* Discard REGION IDs. */ + infop = &dbmp->reginfo[0]; + infop->mtx_alloc = MUTEX_INVALID; + __memp_free(infop, R_ADDR(infop, mp->regids)); + + /* Discard the File table. */ + __memp_free(infop, R_ADDR(infop, mp->ftab)); + + /* Discard Hash tables. */ + for (i = 0; i < nreg; ++i) { + infop = &dbmp->reginfo[i]; + c_mp = infop->primary; + infop->mtx_alloc = MUTEX_INVALID; + __memp_free(infop, R_ADDR(infop, c_mp->htab)); + } + } + + /* Detach from the region. */ + for (i = 0; i < nreg; ++i) { + infop = &dbmp->reginfo[i]; + if ((t_ret = + __env_region_detach(env, infop, 0)) != 0 && ret == 0) + ret = t_ret; + } + + /* Discard DB_MPOOL. */ + __os_free(env, dbmp->reginfo); + __os_free(env, dbmp); + + env->mp_handle = NULL; + return (ret); +} diff --git a/mp/mp_register.c b/mp/mp_register.c new file mode 100644 index 0000000..1ed907b --- /dev/null +++ b/mp/mp_register.c @@ -0,0 +1,115 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" + +/* + * memp_register_pp -- + * ENV->memp_register pre/post processing. + * + * PUBLIC: int __memp_register_pp __P((DB_ENV *, int, + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *), + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); + */ +int +__memp_register_pp(dbenv, ftype, pgin, pgout) + DB_ENV *dbenv; + int ftype; + int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); + int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *)); +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->mp_handle, "DB_ENV->memp_register", DB_INIT_MPOOL); + + if (REP_ON(env)) { + __db_errx(env, "%s%s", "DB_ENV->memp_register: ", + "method not permitted when replication is configured"); + return (EINVAL); + } + + ENV_ENTER(env, ip); + ret = __memp_register(env, ftype, pgin, pgout); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * memp_register -- + * ENV->memp_register. + * + * PUBLIC: int __memp_register __P((ENV *, int, + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *), + * PUBLIC: int (*)(DB_ENV *, db_pgno_t, void *, DBT *))); + */ +int +__memp_register(env, ftype, pgin, pgout) + ENV *env; + int ftype; + int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *)); + int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *)); +{ + DB_MPOOL *dbmp; + DB_MPREG *mpreg; + int ret; + + dbmp = env->mp_handle; + + /* + * We keep the DB pgin/pgout functions outside of the linked list + * to avoid locking/unlocking the linked list on every page I/O. + * + * The Berkeley DB I/O conversion functions are registered when the + * environment is first created, so there's no need for locking here. 
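+	 *
+	 * Illustrative application-side usage only (a sketch, not part of
+	 * this change; my_pgin, my_pgout and the ftype value 5 are
+	 * hypothetical, using the public DB_ENV->memp_register signature):
+	 *
+	 *	int my_pgin(DB_ENV *, db_pgno_t, void *, DBT *);
+	 *	int my_pgout(DB_ENV *, db_pgno_t, void *, DBT *);
+	 *
+	 *	ret = dbenv->memp_register(dbenv, 5, my_pgin, my_pgout);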
+ */ + if (ftype == DB_FTYPE_SET) { + if (dbmp->pg_inout != NULL) + return (0); + if ((ret = + __os_malloc(env, sizeof(DB_MPREG), &dbmp->pg_inout)) != 0) + return (ret); + dbmp->pg_inout->ftype = ftype; + dbmp->pg_inout->pgin = pgin; + dbmp->pg_inout->pgout = pgout; + return (0); + } + + /* + * The item may already have been registered. If already registered, + * just update the entry, although it's probably unchanged. + */ + MUTEX_LOCK(env, dbmp->mutex); + LIST_FOREACH(mpreg, &dbmp->dbregq, q) + if (mpreg->ftype == ftype) { + mpreg->pgin = pgin; + mpreg->pgout = pgout; + break; + } + + if (mpreg == NULL) { /* New entry. */ + if ((ret = __os_malloc(env, sizeof(DB_MPREG), &mpreg)) != 0) + return (ret); + mpreg->ftype = ftype; + mpreg->pgin = pgin; + mpreg->pgout = pgout; + + LIST_INSERT_HEAD(&dbmp->dbregq, mpreg, q); + } + MUTEX_UNLOCK(env, dbmp->mutex); + + return (0); +} diff --git a/mp/mp_resize.c b/mp/mp_resize.c new file mode 100644 index 0000000..a234ad7 --- /dev/null +++ b/mp/mp_resize.c @@ -0,0 +1,579 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2006-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __memp_add_bucket __P((DB_MPOOL *)); +static int __memp_add_region __P((DB_MPOOL *)); +static int __memp_map_regions __P((DB_MPOOL *)); +static int __memp_merge_buckets + __P((DB_MPOOL *, u_int32_t, u_int32_t, u_int32_t)); +static int __memp_remove_bucket __P((DB_MPOOL *)); +static int __memp_remove_region __P((DB_MPOOL *)); + +/* + * PUBLIC: int __memp_get_bucket __P((ENV *, MPOOLFILE *, + * PUBLIC: db_pgno_t, REGINFO **, DB_MPOOL_HASH **, u_int32_t *)); + */ +int +__memp_get_bucket(env, mfp, pgno, infopp, hpp, bucketp) + ENV *env; + MPOOLFILE *mfp; + db_pgno_t pgno; + REGINFO **infopp; + DB_MPOOL_HASH **hpp; + u_int32_t *bucketp; +{ + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp, *mp; + REGINFO *infop; + roff_t mf_offset; + u_int32_t bucket, nbuckets, new_bucket, new_nbuckets, region; + u_int32_t *regids; + int ret; + + dbmp = env->mp_handle; + mf_offset = R_OFFSET(dbmp->reginfo, mfp); + mp = dbmp->reginfo[0].primary; + ret = 0; + + for (;;) { + nbuckets = mp->nbuckets; + MP_BUCKET(mf_offset, pgno, nbuckets, bucket); + + /* + * Once we work out which region we are looking in, we have to + * check that we have that region mapped, and that the version + * we have matches the ID in the main mpool region. Otherwise + * we have to go and map in any regions that don't match and + * retry. + */ + region = NREGION(mp, bucket); + regids = R_ADDR(dbmp->reginfo, mp->regids); + + for (;;) { + infop = *infopp = &dbmp->reginfo[region]; + c_mp = infop->primary; + + /* If we have the correct region mapped, we're done. */ + if (c_mp != NULL && regids[region] == infop->id) + break; + if ((ret = __memp_map_regions(dbmp)) != 0) + return (ret); + } + + /* If our caller wants the hash bucket, lock it here. */ + if (hpp != NULL) { + hp = R_ADDR(infop, c_mp->htab); + hp = &hp[bucket - region * mp->htab_buckets]; + + MUTEX_READLOCK(env, hp->mtx_hash); + + /* + * Check that we still have the correct region mapped. + */ + if (regids[region] != infop->id) { + MUTEX_UNLOCK(env, hp->mtx_hash); + continue; + } + + /* + * Now that the bucket is locked, we need to check that + * the cache has not been resized while we waited. 
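+			 * If nbuckets changed but this page still hashes to
+			 * the bucket we have locked, the lock we hold is the
+			 * right one and we can proceed; only when the page now
+			 * maps to a different bucket do we drop the mutex and
+			 * retry from the top.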
+ */ + new_nbuckets = mp->nbuckets; + if (nbuckets != new_nbuckets) { + MP_BUCKET(mf_offset, pgno, new_nbuckets, + new_bucket); + + if (new_bucket != bucket) { + MUTEX_UNLOCK(env, hp->mtx_hash); + continue; + } + } + + *hpp = hp; + } + + break; + } + + if (bucketp != NULL) + *bucketp = bucket - region * mp->htab_buckets; + return (ret); +} + +static int +__memp_merge_buckets(dbmp, new_nbuckets, old_bucket, new_bucket) + DB_MPOOL *dbmp; + u_int32_t new_nbuckets, old_bucket, new_bucket; +{ + BH *alloc_bhp, *bhp, *current_bhp, *new_bhp, *next_bhp; + DB_LSN vlsn; + DB_MPOOL_HASH *new_hp, *old_hp; + ENV *env; + MPOOL *mp, *new_mp, *old_mp; + MPOOLFILE *mfp; + REGINFO *new_infop, *old_infop; + u_int32_t bucket, high_mask, new_region, old_region; + int ret; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + new_bhp = NULL; + ret = 0; + + MP_MASK(new_nbuckets, high_mask); + + old_region = NREGION(mp, old_bucket); + old_infop = &dbmp->reginfo[old_region]; + old_mp = old_infop->primary; + old_hp = R_ADDR(old_infop, old_mp->htab); + old_hp = &old_hp[old_bucket - old_region * mp->htab_buckets]; + + new_region = NREGION(mp, new_bucket); + new_infop = &dbmp->reginfo[new_region]; + new_mp = new_infop->primary; + new_hp = R_ADDR(new_infop, new_mp->htab); + new_hp = &new_hp[new_bucket - new_region * mp->htab_buckets]; + + /* + * Before merging, we need to check that there are no old buffers left + * in the target hash bucket after a previous split. + */ +free_old: + MUTEX_LOCK(env, new_hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &new_hp->hash_bucket, hq, __bh) { + MP_BUCKET(bhp->mf_offset, bhp->pgno, mp->nbuckets, bucket); + + if (bucket != new_bucket) { + /* + * There is no way that an old buffer can be locked + * after a split, since everyone will look for it in + * the new hash bucket. + */ + DB_ASSERT(env, !F_ISSET(bhp, BH_DIRTY) && + atomic_read(&bhp->ref) == 0); + atomic_inc(env, &bhp->ref); + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((ret = __memp_bhfree(dbmp, new_infop, + mfp, new_hp, bhp, BH_FREE_FREEMEM)) != 0) { + MUTEX_UNLOCK(env, new_hp->mtx_hash); + return (ret); + } + + /* + * The free has modified the list of buffers and + * dropped the mutex. We need to start again. + */ + goto free_old; + } + } + MUTEX_UNLOCK(env, new_hp->mtx_hash); + + /* + * Before we begin, make sure that all of the buffers we care about are + * not in use and not frozen. We do this because we can't drop the old + * hash bucket mutex once we start moving buffers around. + */ +retry: MUTEX_LOCK(env, old_hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) { + MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno), + new_nbuckets, high_mask, bucket); + + if (bucket == new_bucket && atomic_read(&bhp->ref) != 0) { + MUTEX_UNLOCK(env, old_hp->mtx_hash); + __os_yield(env, 0, 0); + goto retry; + } else if (bucket == new_bucket && F_ISSET(bhp, BH_FROZEN)) { + atomic_inc(env, &bhp->ref); + /* + * We need to drop the hash bucket mutex to avoid + * self-blocking when we allocate a new buffer. + */ + MUTEX_UNLOCK(env, old_hp->mtx_hash); + MUTEX_LOCK(env, bhp->mtx_buf); + F_SET(bhp, BH_EXCLUSIVE); + if (BH_OBSOLETE(bhp, old_hp->old_reader, vlsn)) + alloc_bhp = NULL; + else { + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((ret = __memp_alloc(dbmp, + old_infop, mfp, 0, NULL, &alloc_bhp)) != 0) + goto err; + } + /* + * But we need to lock the hash bucket again before + * thawing the buffer. The call to __memp_bh_thaw + * will unlock the hash bucket mutex. 
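+			 * If another thread thawed the buffer while we had the
+			 * bucket unlocked, BH_THAWED will be set below and the
+			 * buffer we just allocated is freed rather than used
+			 * for the thaw.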
+ */ + MUTEX_LOCK(env, old_hp->mtx_hash); + if (F_ISSET(bhp, BH_THAWED)) { + ret = __memp_bhfree(dbmp, old_infop, NULL, NULL, + alloc_bhp, + BH_FREE_FREEMEM | BH_FREE_UNLOCKED); + } else + ret = __memp_bh_thaw(dbmp, + old_infop, old_hp, bhp, alloc_bhp); + + /* + * We've dropped the mutex in order to thaw, so we need + * to go back to the beginning and check that all of + * the buffers we care about are still unlocked and + * unreferenced. + */ +err: atomic_dec(env, &bhp->ref); + F_CLR(bhp, BH_EXCLUSIVE); + MUTEX_UNLOCK(env, bhp->mtx_buf); + if (ret != 0) + return (ret); + goto retry; + } + } + + /* + * We now know that all of the buffers we care about are unlocked and + * unreferenced. Go ahead and copy them. + */ + SH_TAILQ_FOREACH(bhp, &old_hp->hash_bucket, hq, __bh) { + MP_HASH_BUCKET(MP_HASH(bhp->mf_offset, bhp->pgno), + new_nbuckets, high_mask, bucket); + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* + * We ignore buffers that don't hash to the new bucket. We + * could also ignore clean buffers which are not part of a + * multiversion chain as long as they have a backing file. + */ + if (bucket != new_bucket || (!F_ISSET(bhp, BH_DIRTY) && + SH_CHAIN_SINGLETON(bhp, vc) && !mfp->no_backing_file)) + continue; + + for (current_bhp = bhp, next_bhp = NULL; + current_bhp != NULL; + current_bhp = SH_CHAIN_PREV(current_bhp, vc, __bh), + next_bhp = alloc_bhp) { + /* Allocate in the new region. */ + if ((ret = __memp_alloc(dbmp, + new_infop, mfp, 0, NULL, &alloc_bhp)) != 0) + break; + + alloc_bhp->ref = current_bhp->ref; + alloc_bhp->priority = current_bhp->priority; + alloc_bhp->pgno = current_bhp->pgno; + alloc_bhp->mf_offset = current_bhp->mf_offset; + alloc_bhp->flags = current_bhp->flags; + alloc_bhp->td_off = current_bhp->td_off; + + /* + * We've duplicated the buffer, so now we need to + * update reference counts, including the counts in the + * per-MPOOLFILE and the transaction detail (for MVCC + * buffers). + */ + MUTEX_LOCK(env, mfp->mutex); + ++mfp->block_cnt; + MUTEX_UNLOCK(env, mfp->mutex); + + if (alloc_bhp->td_off != INVALID_ROFF && + (ret = __txn_add_buffer(env, + R_ADDR(&env->tx_handle->reginfo, + alloc_bhp->td_off))) != 0) + break; + + memcpy(alloc_bhp->buf, bhp->buf, mfp->stat.st_pagesize); + + /* + * We build up the MVCC chain first, then insert the + * head (stored in new_bhp) once. + */ + if (next_bhp == NULL) { + SH_CHAIN_INIT(alloc_bhp, vc); + new_bhp = alloc_bhp; + } else + SH_CHAIN_INSERT_BEFORE( + next_bhp, alloc_bhp, vc, __bh); + } + + MUTEX_LOCK(env, new_hp->mtx_hash); + SH_TAILQ_INSERT_TAIL(&new_hp->hash_bucket, new_bhp, hq); + if (F_ISSET(new_bhp, BH_DIRTY)) + atomic_inc(env, &new_hp->hash_page_dirty); + + if (F_ISSET(bhp, BH_DIRTY)) { + F_CLR(bhp, BH_DIRTY); + atomic_dec(env, &old_hp->hash_page_dirty); + } + MUTEX_UNLOCK(env, new_hp->mtx_hash); + } + + if (ret == 0) + mp->nbuckets = new_nbuckets; + MUTEX_UNLOCK(env, old_hp->mtx_hash); + + return (ret); +} + +static int +__memp_add_bucket(dbmp) + DB_MPOOL *dbmp; +{ + ENV *env; + MPOOL *mp; + u_int32_t high_mask, new_bucket, old_bucket; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + + new_bucket = mp->nbuckets; + /* We should always be adding buckets to the last region. */ + DB_ASSERT(env, NREGION(mp, new_bucket) == mp->nreg - 1); + MP_MASK(mp->nbuckets, high_mask); + old_bucket = new_bucket & (high_mask >> 1); + + /* + * With fixed-sized regions, the new region is always smaller than the + * existing total cache size, so buffers always need to be copied. 
If + * we implement variable region sizes, it's possible that we will be + * splitting a hash bucket in the new region. Catch that here. + */ + DB_ASSERT(env, NREGION(mp, old_bucket) != NREGION(mp, new_bucket)); + + return (__memp_merge_buckets(dbmp, mp->nbuckets + 1, + old_bucket, new_bucket)); +} + +static int +__memp_add_region(dbmp) + DB_MPOOL *dbmp; +{ + ENV *env; + MPOOL *mp; + REGINFO *infop; + int ret; + roff_t reg_size; + u_int i; + u_int32_t *regids; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + /* All cache regions are the same size. */ + reg_size = dbmp->reginfo[0].rp->size; + ret = 0; + + infop = &dbmp->reginfo[mp->nreg]; + infop->env = env; + infop->type = REGION_TYPE_MPOOL; + infop->id = INVALID_REGION_ID; + infop->flags = REGION_CREATE_OK; + if ((ret = __env_region_attach(env, infop, reg_size)) != 0) + return (ret); + if ((ret = __memp_init(env, + dbmp, mp->nreg, mp->htab_buckets, mp->max_nreg)) != 0) + return (ret); + regids = R_ADDR(dbmp->reginfo, mp->regids); + regids[mp->nreg++] = infop->id; + + for (i = 0; i < mp->htab_buckets; i++) + if ((ret = __memp_add_bucket(dbmp)) != 0) + break; + + return (ret); +} + +static int +__memp_remove_bucket(dbmp) + DB_MPOOL *dbmp; +{ + ENV *env; + MPOOL *mp; + u_int32_t high_mask, new_bucket, old_bucket; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + + old_bucket = mp->nbuckets - 1; + + /* We should always be removing buckets from the last region. */ + DB_ASSERT(env, NREGION(mp, old_bucket) == mp->nreg - 1); + MP_MASK(mp->nbuckets - 1, high_mask); + new_bucket = old_bucket & (high_mask >> 1); + + return (__memp_merge_buckets(dbmp, mp->nbuckets - 1, + old_bucket, new_bucket)); +} + +static int +__memp_remove_region(dbmp) + DB_MPOOL *dbmp; +{ + ENV *env; + MPOOL *mp; + REGINFO *infop; + int ret; + u_int i; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + ret = 0; + + if (mp->nreg == 1) { + __db_errx(env, "cannot remove the last cache"); + return (EINVAL); + } + + for (i = 0; i < mp->htab_buckets; i++) + if ((ret = __memp_remove_bucket(dbmp)) != 0) + return (ret); + + /* Detach from the region then destroy it. 
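+	 * The nonzero final argument to __env_region_detach() is what
+	 * distinguishes this call from the plain detaches elsewhere in this
+	 * file: shrinking the cache requires the region to be destroyed,
+	 * not just unmapped.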
*/ + infop = &dbmp->reginfo[--mp->nreg]; + return (__env_region_detach(env, infop, 1)); +} + +static int +__memp_map_regions(dbmp) + DB_MPOOL *dbmp; +{ + ENV *env; + MPOOL *mp; + int ret; + u_int i; + u_int32_t *regids; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + regids = R_ADDR(dbmp->reginfo, mp->regids); + ret = 0; + + for (i = 1; i < mp->nreg; ++i) { + if (dbmp->reginfo[i].primary != NULL && + dbmp->reginfo[i].id == regids[i]) + continue; + + if (dbmp->reginfo[i].primary != NULL) + ret = __env_region_detach(env, &dbmp->reginfo[i], 0); + + dbmp->reginfo[i].env = env; + dbmp->reginfo[i].type = REGION_TYPE_MPOOL; + dbmp->reginfo[i].id = regids[i]; + dbmp->reginfo[i].flags = REGION_JOIN_OK; + if ((ret = + __env_region_attach(env, &dbmp->reginfo[i], 0)) != 0) + return (ret); + dbmp->reginfo[i].primary = R_ADDR(&dbmp->reginfo[i], + dbmp->reginfo[i].rp->primary); + } + + for (; i < mp->max_nreg; i++) + if (dbmp->reginfo[i].primary != NULL && + (ret = __env_region_detach(env, + &dbmp->reginfo[i], 0)) != 0) + break; + + return (ret); +} + +/* + * PUBLIC: int __memp_resize __P((DB_MPOOL *, u_int32_t, u_int32_t)); + */ +int +__memp_resize(dbmp, gbytes, bytes) + DB_MPOOL *dbmp; + u_int32_t gbytes, bytes; +{ + ENV *env; + MPOOL *mp; + int ret; + u_int32_t ncache; + roff_t reg_size, total_size; + + env = dbmp->env; + mp = dbmp->reginfo[0].primary; + reg_size = dbmp->reginfo[0].rp->size; + total_size = (roff_t)gbytes * GIGABYTE + bytes; + ncache = (u_int32_t)((total_size + reg_size / 2) / reg_size); + + if (ncache < 1) + ncache = 1; + else if (ncache > mp->max_nreg) { + __db_errx(env, + "cannot resize to %lu cache regions: maximum is %lu", + (u_long)ncache, (u_long)mp->max_nreg); + return (EINVAL); + } + + ret = 0; + MUTEX_LOCK(env, mp->mtx_resize); + while (mp->nreg != ncache) + if ((ret = (mp->nreg < ncache ? + __memp_add_region(dbmp) : + __memp_remove_region(dbmp))) != 0) + break; + MUTEX_UNLOCK(env, mp->mtx_resize); + + return (ret); +} + +/* + * PUBLIC: int __memp_get_cache_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); + */ +int +__memp_get_cache_max(dbenv, max_gbytesp, max_bytesp) + DB_ENV *dbenv; + u_int32_t *max_gbytesp, *max_bytesp; +{ + DB_MPOOL *dbmp; + ENV *env; + MPOOL *mp; + roff_t reg_size, max_size; + + env = dbenv->env; + + ENV_NOT_CONFIGURED(env, + env->mp_handle, "DB_ENV->get_mp_max_ncache", DB_INIT_MPOOL); + + if (MPOOL_ON(env)) { + /* Cannot be set after open, no lock required to read. */ + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + reg_size = dbmp->reginfo[0].rp->size; + max_size = mp->max_nreg * reg_size; + *max_gbytesp = (u_int32_t)(max_size / GIGABYTE); + *max_bytesp = (u_int32_t)(max_size % GIGABYTE); + } else { + *max_gbytesp = dbenv->mp_max_gbytes; + *max_bytesp = dbenv->mp_max_bytes; + } + + return (0); +} + +/* + * PUBLIC: int __memp_set_cache_max __P((DB_ENV *, u_int32_t, u_int32_t)); + */ +int +__memp_set_cache_max(dbenv, max_gbytes, max_bytes) + DB_ENV *dbenv; + u_int32_t max_gbytes, max_bytes; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_cache_max"); + dbenv->mp_max_gbytes = max_gbytes; + dbenv->mp_max_bytes = max_bytes; + + return (0); +} diff --git a/mp/mp_stat.c b/mp/mp_stat.c new file mode 100644 index 0000000..32f8080 --- /dev/null +++ b/mp/mp_stat.c @@ -0,0 +1,904 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +#ifdef HAVE_STATISTICS +static void __memp_print_bh __P((ENV *, + DB_MPOOL *, const char *, BH *, roff_t *)); +static int __memp_print_all __P((ENV *, u_int32_t)); +static int __memp_print_stats __P((ENV *, u_int32_t)); +static int __memp_print_hash __P((ENV *, + DB_MPOOL *, REGINFO *, roff_t *, u_int32_t)); +static int __memp_stat __P((ENV *, + DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t)); +static void __memp_stat_wait + __P((ENV *, REGINFO *, MPOOL *, DB_MPOOL_STAT *, u_int32_t)); +static int __memp_file_stats __P((ENV *, + MPOOLFILE *, void *, u_int32_t *, u_int32_t)); +static int __memp_count_files __P((ENV *, + MPOOLFILE *, void *, u_int32_t *, u_int32_t)); +static int __memp_get_files __P((ENV *, + MPOOLFILE *, void *, u_int32_t *, u_int32_t)); +static int __memp_print_files __P((ENV *, + MPOOLFILE *, void *, u_int32_t *, u_int32_t)); + +/* + * __memp_stat_pp -- + * DB_ENV->memp_stat pre/post processing. + * + * PUBLIC: int __memp_stat_pp + * PUBLIC: __P((DB_ENV *, DB_MPOOL_STAT **, DB_MPOOL_FSTAT ***, u_int32_t)); + */ +int +__memp_stat_pp(dbenv, gspp, fspp, flags) + DB_ENV *dbenv; + DB_MPOOL_STAT **gspp; + DB_MPOOL_FSTAT ***fspp; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->mp_handle, "DB_ENV->memp_stat", DB_INIT_MPOOL); + + if ((ret = __db_fchk(env, + "DB_ENV->memp_stat", flags, DB_STAT_CLEAR)) != 0) + return (ret); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__memp_stat(env, gspp, fspp, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_stat -- + * ENV->memp_stat + */ +static int +__memp_stat(env, gspp, fspp, flags) + ENV *env; + DB_MPOOL_STAT **gspp; + DB_MPOOL_FSTAT ***fspp; + u_int32_t flags; +{ + DB_MPOOL *dbmp; + DB_MPOOL_FSTAT **tfsp; + DB_MPOOL_STAT *sp; + MPOOL *c_mp, *mp; + size_t len; + int ret; + u_int32_t i, st_bytes, st_gbytes, st_hash_buckets, st_pages; + uintmax_t tmp_wait, tmp_nowait; + + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* Global statistics. */ + if (gspp != NULL) { + *gspp = NULL; + + if ((ret = __os_umalloc(env, sizeof(**gspp), gspp)) != 0) + return (ret); + memset(*gspp, 0, sizeof(**gspp)); + sp = *gspp; + + /* + * Initialization and information that is not maintained on + * a per-cache basis. Note that configuration information + * may be modified at any time, and so we have to lock. + */ + sp->st_gbytes = mp->stat.st_gbytes; + sp->st_bytes = mp->stat.st_bytes; + sp->st_pagesize = mp->stat.st_pagesize; + sp->st_ncache = mp->nreg; + sp->st_max_ncache = mp->max_nreg; + sp->st_regsize = dbmp->reginfo[0].rp->size; + sp->st_sync_interrupted = mp->stat.st_sync_interrupted; + + MPOOL_SYSTEM_LOCK(env); + sp->st_mmapsize = mp->mp_mmapsize; + sp->st_maxopenfd = mp->mp_maxopenfd; + sp->st_maxwrite = mp->mp_maxwrite; + sp->st_maxwrite_sleep = mp->mp_maxwrite_sleep; + MPOOL_SYSTEM_UNLOCK(env); + + /* Walk the cache list and accumulate the global information. 
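+		 * The per-cache counters are simply summed across all
+		 * mp->nreg regions; st_page_dirty comes from
+		 * __memp_stat_hash() and the clean count is derived as
+		 * st_pages - st_page_dirty rather than tracked separately.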
*/ + for (i = 0; i < mp->nreg; ++i) { + c_mp = dbmp->reginfo[i].primary; + + sp->st_map += c_mp->stat.st_map; + sp->st_cache_hit += c_mp->stat.st_cache_hit; + sp->st_cache_miss += c_mp->stat.st_cache_miss; + sp->st_page_create += c_mp->stat.st_page_create; + sp->st_page_in += c_mp->stat.st_page_in; + sp->st_page_out += c_mp->stat.st_page_out; + sp->st_ro_evict += c_mp->stat.st_ro_evict; + sp->st_rw_evict += c_mp->stat.st_rw_evict; + sp->st_page_trickle += c_mp->stat.st_page_trickle; + sp->st_pages += c_mp->stat.st_pages; + /* + * st_page_dirty calculated by __memp_stat_hash + * st_page_clean calculated here + */ + __memp_stat_hash( + &dbmp->reginfo[i], c_mp, &sp->st_page_dirty); + sp->st_page_clean = sp->st_pages - sp->st_page_dirty; + sp->st_hash_buckets += c_mp->stat.st_hash_buckets; + sp->st_hash_searches += c_mp->stat.st_hash_searches; + sp->st_hash_longest += c_mp->stat.st_hash_longest; + sp->st_hash_examined += c_mp->stat.st_hash_examined; + /* + * st_hash_nowait calculated by __memp_stat_wait + * st_hash_wait + */ + __memp_stat_wait( + env, &dbmp->reginfo[i], c_mp, sp, flags); + __mutex_set_wait_info(env, + c_mp->mtx_region, &tmp_wait, &tmp_nowait); + sp->st_region_nowait += tmp_nowait; + sp->st_region_wait += tmp_wait; + sp->st_alloc += c_mp->stat.st_alloc; + sp->st_alloc_buckets += c_mp->stat.st_alloc_buckets; + if (sp->st_alloc_max_buckets < + c_mp->stat.st_alloc_max_buckets) + sp->st_alloc_max_buckets = + c_mp->stat.st_alloc_max_buckets; + sp->st_alloc_pages += c_mp->stat.st_alloc_pages; + if (sp->st_alloc_max_pages < + c_mp->stat.st_alloc_max_pages) + sp->st_alloc_max_pages = + c_mp->stat.st_alloc_max_pages; + + if (LF_ISSET(DB_STAT_CLEAR)) { + if (!LF_ISSET(DB_STAT_SUBSYSTEM)) + __mutex_clear(env, c_mp->mtx_region); + + MPOOL_SYSTEM_LOCK(env); + st_bytes = c_mp->stat.st_bytes; + st_gbytes = c_mp->stat.st_gbytes; + st_hash_buckets = c_mp->stat.st_hash_buckets; + st_pages = c_mp->stat.st_pages; + memset(&c_mp->stat, 0, sizeof(c_mp->stat)); + c_mp->stat.st_bytes = st_bytes; + c_mp->stat.st_gbytes = st_gbytes; + c_mp->stat.st_hash_buckets = st_hash_buckets; + c_mp->stat.st_pages = st_pages; + MPOOL_SYSTEM_UNLOCK(env); + } + } + + /* + * We have duplicate statistics fields in per-file structures + * and the cache. The counters are only incremented in the + * per-file structures, except if a file is flushed from the + * mpool, at which time we copy its information into the cache + * statistics. We added the cache information above, now we + * add the per-file information. + */ + if ((ret = __memp_walk_files(env, mp, __memp_file_stats, + sp, NULL, fspp == NULL ? LF_ISSET(DB_STAT_CLEAR) : 0)) != 0) + return (ret); + } + + /* Per-file statistics. */ + if (fspp != NULL) { + *fspp = NULL; + + /* Count the MPOOLFILE structures. */ + i = 0; + len = 0; + if ((ret = __memp_walk_files(env, + mp, __memp_count_files, &len, &i, flags)) != 0) + return (ret); + + if (i == 0) + return (0); + len += sizeof(DB_MPOOL_FSTAT *); /* Trailing NULL */ + + /* Allocate space */ + if ((ret = __os_umalloc(env, len, fspp)) != 0) + return (ret); + + tfsp = *fspp; + *tfsp = NULL; + + /* + * Files may have been opened since we counted, don't walk + * off the end of the allocated space. 
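+		 *
+		 * As an illustration, the single allocation that
+		 * __memp_get_files() fills in is laid out roughly as:
+		 *
+		 *	[ptr 0] ... [ptr n-1] [NULL]
+		 *	[DB_MPOOL_FSTAT 0] ... [DB_MPOOL_FSTAT n-1]
+		 *	[file name 0\0] ... [file name n-1\0]
+		 *
+		 * matching the per-file space counted by __memp_count_files()
+		 * above.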
+ */ + if ((ret = __memp_walk_files(env, + mp, __memp_get_files, &tfsp, &i, flags)) != 0) + return (ret); + + *++tfsp = NULL; + } + + return (0); +} + +static int +__memp_file_stats(env, mfp, argp, countp, flags) + ENV *env; + MPOOLFILE *mfp; + void *argp; + u_int32_t *countp; + u_int32_t flags; +{ + DB_MPOOL_STAT *sp; + u_int32_t pagesize; + + COMPQUIET(env, NULL); + COMPQUIET(countp, NULL); + + sp = argp; + + sp->st_map += mfp->stat.st_map; + sp->st_cache_hit += mfp->stat.st_cache_hit; + sp->st_cache_miss += mfp->stat.st_cache_miss; + sp->st_page_create += mfp->stat.st_page_create; + sp->st_page_in += mfp->stat.st_page_in; + sp->st_page_out += mfp->stat.st_page_out; + if (LF_ISSET(DB_STAT_CLEAR)) { + pagesize = mfp->stat.st_pagesize; + memset(&mfp->stat, 0, sizeof(mfp->stat)); + mfp->stat.st_pagesize = pagesize; + } + return (0); +} + +static int +__memp_count_files(env, mfp, argp, countp, flags) + ENV *env; + MPOOLFILE *mfp; + void *argp; + u_int32_t *countp; + u_int32_t flags; +{ + DB_MPOOL *dbmp; + size_t len; + + COMPQUIET(flags, 0); + dbmp = env->mp_handle; + len = *(size_t *)argp; + + (*countp)++; + len += sizeof(DB_MPOOL_FSTAT *) + + sizeof(DB_MPOOL_FSTAT) + strlen(__memp_fns(dbmp, mfp)) + 1; + + *(size_t *)argp = len; + return (0); +} + +/* + * __memp_get_files -- + * get file specific statistics + * + * Build each individual entry. We assume that an array of pointers are + * aligned correctly to be followed by an array of structures, which should + * be safe (in this particular case, the first element of the structure + * is a pointer, so we're doubly safe). The array is followed by space + * for the text file names. + */ +static int +__memp_get_files(env, mfp, argp, countp, flags) + ENV *env; + MPOOLFILE *mfp; + void *argp; + u_int32_t *countp; + u_int32_t flags; +{ + DB_MPOOL *dbmp; + DB_MPOOL_FSTAT **tfsp, *tstruct; + char *name, *tname; + size_t nlen; + u_int32_t pagesize; + + if (*countp == 0) + return (0); + + dbmp = env->mp_handle; + tfsp = *(DB_MPOOL_FSTAT ***)argp; + + if (*tfsp == NULL) { + /* Add 1 to count because we need to skip over the NULL. */ + tstruct = (DB_MPOOL_FSTAT *)(tfsp + *countp + 1); + tname = (char *)(tstruct + *countp); + *tfsp = tstruct; + } else { + tstruct = *tfsp + 1; + tname = (*tfsp)->file_name + strlen((*tfsp)->file_name) + 1; + *++tfsp = tstruct; + } + + name = __memp_fns(dbmp, mfp); + nlen = strlen(name) + 1; + memcpy(tname, name, nlen); + *tstruct = mfp->stat; + tstruct->file_name = tname; + + *(DB_MPOOL_FSTAT ***)argp = tfsp; + (*countp)--; + + if (LF_ISSET(DB_STAT_CLEAR)) { + pagesize = mfp->stat.st_pagesize; + memset(&mfp->stat, 0, sizeof(mfp->stat)); + mfp->stat.st_pagesize = pagesize; + } + return (0); +} + +/* + * __memp_stat_print_pp -- + * ENV->memp_stat_print pre/post processing. + * + * PUBLIC: int __memp_stat_print_pp __P((DB_ENV *, u_int32_t)); + */ +int +__memp_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->mp_handle, "DB_ENV->memp_stat_print", DB_INIT_MPOOL); + +#define DB_STAT_MEMP_FLAGS \ + (DB_STAT_ALL | DB_STAT_CLEAR | DB_STAT_MEMP_HASH) + if ((ret = __db_fchk(env, + "DB_ENV->memp_stat_print", flags, DB_STAT_MEMP_FLAGS)) != 0) + return (ret); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__memp_stat_print(env, flags)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +#define FMAP_ENTRIES 200 /* Files we map. */ + +/* + * __memp_stat_print -- + * ENV->memp_stat_print method. 
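+ *
+ *	With no flags (or DB_STAT_ALL) the default region statistics are
+ *	printed; DB_STAT_MEMP_HASH (or DB_STAT_ALL) additionally dumps the
+ *	debugging output of __memp_print_all(), including every hash
+ *	bucket.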
+ * + * PUBLIC: int __memp_stat_print __P((ENV *, u_int32_t)); + */ +int +__memp_stat_print(env, flags) + ENV *env; + u_int32_t flags; +{ + u_int32_t orig_flags; + int ret; + + orig_flags = flags; + LF_CLR(DB_STAT_CLEAR | DB_STAT_SUBSYSTEM); + if (flags == 0 || LF_ISSET(DB_STAT_ALL)) { + ret = __memp_print_stats(env, + LF_ISSET(DB_STAT_ALL) ? flags : orig_flags); + if (flags == 0 || ret != 0) + return (ret); + } + + if (LF_ISSET(DB_STAT_ALL | DB_STAT_MEMP_HASH) && + (ret = __memp_print_all(env, orig_flags)) != 0) + return (ret); + + return (0); +} + +/* + * __memp_print_stats -- + * Display default mpool region statistics. + */ +static int +__memp_print_stats(env, flags) + ENV *env; + u_int32_t flags; +{ + DB_MPOOL_FSTAT **fsp, **tfsp; + DB_MPOOL_STAT *gsp; + int ret; + + if ((ret = __memp_stat(env, &gsp, &fsp, flags)) != 0) + return (ret); + + if (LF_ISSET(DB_STAT_ALL)) + __db_msg(env, "Default cache region information:"); + __db_dlbytes(env, "Total cache size", + (u_long)gsp->st_gbytes, (u_long)0, (u_long)gsp->st_bytes); + __db_dl(env, "Number of caches", (u_long)gsp->st_ncache); + __db_dl(env, "Maximum number of caches", (u_long)gsp->st_max_ncache); + __db_dlbytes(env, "Pool individual cache size", + (u_long)0, (u_long)0, (u_long)gsp->st_regsize); + __db_dlbytes(env, "Maximum memory-mapped file size", + (u_long)0, (u_long)0, (u_long)gsp->st_mmapsize); + STAT_LONG("Maximum open file descriptors", gsp->st_maxopenfd); + STAT_LONG("Maximum sequential buffer writes", gsp->st_maxwrite); + STAT_LONG("Sleep after writing maximum sequential buffers", + gsp->st_maxwrite_sleep); + __db_dl(env, + "Requested pages mapped into the process' address space", + (u_long)gsp->st_map); + __db_dl_pct(env, "Requested pages found in the cache", + (u_long)gsp->st_cache_hit, DB_PCT( + gsp->st_cache_hit, gsp->st_cache_hit + gsp->st_cache_miss), NULL); + __db_dl(env, "Requested pages not found in the cache", + (u_long)gsp->st_cache_miss); + __db_dl(env, + "Pages created in the cache", (u_long)gsp->st_page_create); + __db_dl(env, "Pages read into the cache", (u_long)gsp->st_page_in); + __db_dl(env, "Pages written from the cache to the backing file", + (u_long)gsp->st_page_out); + __db_dl(env, "Clean pages forced from the cache", + (u_long)gsp->st_ro_evict); + __db_dl(env, "Dirty pages forced from the cache", + (u_long)gsp->st_rw_evict); + __db_dl(env, "Dirty pages written by trickle-sync thread", + (u_long)gsp->st_page_trickle); + __db_dl(env, "Current total page count", + (u_long)gsp->st_pages); + __db_dl(env, "Current clean page count", + (u_long)gsp->st_page_clean); + __db_dl(env, "Current dirty page count", + (u_long)gsp->st_page_dirty); + __db_dl(env, "Number of hash buckets used for page location", + (u_long)gsp->st_hash_buckets); + __db_dl(env, "Assumed page size used", + (u_long)gsp->st_pagesize); + __db_dl(env, + "Total number of times hash chains searched for a page", + (u_long)gsp->st_hash_searches); + __db_dl(env, "The longest hash chain searched for a page", + (u_long)gsp->st_hash_longest); + __db_dl(env, + "Total number of hash chain entries checked for page", + (u_long)gsp->st_hash_examined); + __db_dl_pct(env, + "The number of hash bucket locks that required waiting", + (u_long)gsp->st_hash_wait, DB_PCT( + gsp->st_hash_wait, gsp->st_hash_wait + gsp->st_hash_nowait), NULL); + __db_dl_pct(env, + "The maximum number of times any hash bucket lock was waited for", + (u_long)gsp->st_hash_max_wait, DB_PCT(gsp->st_hash_max_wait, + gsp->st_hash_max_wait + gsp->st_hash_max_nowait), NULL); + __db_dl_pct(env, + 
"The number of region locks that required waiting", + (u_long)gsp->st_region_wait, DB_PCT(gsp->st_region_wait, + gsp->st_region_wait + gsp->st_region_nowait), NULL); + __db_dl(env, "The number of buffers frozen", + (u_long)gsp->st_mvcc_frozen); + __db_dl(env, "The number of buffers thawed", + (u_long)gsp->st_mvcc_thawed); + __db_dl(env, "The number of frozen buffers freed", + (u_long)gsp->st_mvcc_freed); + __db_dl(env, "The number of page allocations", (u_long)gsp->st_alloc); + __db_dl(env, + "The number of hash buckets examined during allocations", + (u_long)gsp->st_alloc_buckets); + __db_dl(env, + "The maximum number of hash buckets examined for an allocation", + (u_long)gsp->st_alloc_max_buckets); + __db_dl(env, "The number of pages examined during allocations", + (u_long)gsp->st_alloc_pages); + __db_dl(env, "The max number of pages examined for an allocation", + (u_long)gsp->st_alloc_max_pages); + __db_dl(env, "Threads waited on page I/O", (u_long)gsp->st_io_wait); + __db_dl(env, "The number of times a sync is interrupted", + (u_long)gsp->st_sync_interrupted); + + for (tfsp = fsp; fsp != NULL && *tfsp != NULL; ++tfsp) { + if (LF_ISSET(DB_STAT_ALL)) + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "Pool File: %s", (*tfsp)->file_name); + __db_dl(env, "Page size", (u_long)(*tfsp)->st_pagesize); + __db_dl(env, + "Requested pages mapped into the process' address space", + (u_long)(*tfsp)->st_map); + __db_dl_pct(env, "Requested pages found in the cache", + (u_long)(*tfsp)->st_cache_hit, DB_PCT((*tfsp)->st_cache_hit, + (*tfsp)->st_cache_hit + (*tfsp)->st_cache_miss), NULL); + __db_dl(env, "Requested pages not found in the cache", + (u_long)(*tfsp)->st_cache_miss); + __db_dl(env, "Pages created in the cache", + (u_long)(*tfsp)->st_page_create); + __db_dl(env, "Pages read into the cache", + (u_long)(*tfsp)->st_page_in); + __db_dl(env, + "Pages written from the cache to the backing file", + (u_long)(*tfsp)->st_page_out); + } + + __os_ufree(env, fsp); + __os_ufree(env, gsp); + return (0); +} + +/* + * __memp_print_all -- + * Display debugging mpool region statistics. 
+ */ +static int +__memp_print_all(env, flags) + ENV *env; + u_int32_t flags; +{ + static const FN cfn[] = { + { DB_MPOOL_NOFILE, "DB_MPOOL_NOFILE" }, + { DB_MPOOL_UNLINK, "DB_MPOOL_UNLINK" }, + { 0, NULL } + }; + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + MPOOL *mp; + roff_t fmap[FMAP_ENTRIES + 1]; + u_int32_t i, cnt; + int ret; + + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + ret = 0; + + MPOOL_SYSTEM_LOCK(env); + + __db_print_reginfo(env, dbmp->reginfo, "Mpool", flags); + __db_msg(env, "%s", DB_GLOBAL(db_line)); + + __db_msg(env, "MPOOL structure:"); + __mutex_print_debug_single( + env, "MPOOL region mutex", mp->mtx_region, flags); + STAT_LSN("Maximum checkpoint LSN", &mp->lsn); + STAT_ULONG("Hash table entries", mp->htab_buckets); + STAT_ULONG("Hash table last-checked", mp->last_checked); + STAT_ULONG("Hash table LRU count", mp->lru_count); + STAT_ULONG("Put counter", mp->put_counter); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB_MPOOL handle information:"); + __mutex_print_debug_single( + env, "DB_MPOOL handle mutex", dbmp->mutex, flags); + STAT_ULONG("Underlying cache regions", mp->nreg); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "DB_MPOOLFILE structures:"); + for (cnt = 0, dbmfp = TAILQ_FIRST(&dbmp->dbmfq); + dbmfp != NULL; dbmfp = TAILQ_NEXT(dbmfp, q), ++cnt) { + __db_msg(env, "File #%lu: %s: per-process, %s", + (u_long)cnt + 1, __memp_fn(dbmfp), + F_ISSET(dbmfp, MP_READONLY) ? "readonly" : "read/write"); + STAT_ULONG("Reference count", dbmfp->ref); + STAT_ULONG("Pinned block reference count", dbmfp->ref); + STAT_ULONG("Clear length", dbmfp->clear_len); + __db_print_fileid(env, dbmfp->fileid, "\tID"); + STAT_ULONG("File type", dbmfp->ftype); + STAT_ULONG("LSN offset", dbmfp->lsn_offset); + STAT_ULONG("Max gbytes", dbmfp->gbytes); + STAT_ULONG("Max bytes", dbmfp->bytes); + STAT_ULONG("Cache priority", dbmfp->priority); + STAT_POINTER("mmap address", dbmfp->addr); + STAT_ULONG("mmap length", dbmfp->len); + __db_prflags(env, NULL, dbmfp->flags, cfn, NULL, "\tFlags"); + __db_print_fh(env, "File handle", dbmfp->fhp, flags); + } + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "MPOOLFILE structures:"); + cnt = 0; + ret = __memp_walk_files(env, mp, __memp_print_files, fmap, &cnt, flags); + MPOOL_SYSTEM_UNLOCK(env); + if (ret != 0) + return (ret); + + if (cnt < FMAP_ENTRIES) + fmap[cnt] = INVALID_ROFF; + else + fmap[FMAP_ENTRIES] = INVALID_ROFF; + + /* Dump the individual caches. 
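+	 * The first cache lives in the primary region reported above, so
+	 * the extra allocator detail from __env_alloc_print() is only added
+	 * for the additional regions (i > 0).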
*/ + for (i = 0; i < mp->nreg; ++i) { + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "Cache #%d:", i + 1); + if (i > 0) + __env_alloc_print(&dbmp->reginfo[i], flags); + if ((ret = __memp_print_hash( + env, dbmp, &dbmp->reginfo[i], fmap, flags)) != 0) + break; + } + + return (ret); +} + +static int +__memp_print_files(env, mfp, argp, countp, flags) + ENV *env; + MPOOLFILE *mfp; + void *argp; + u_int32_t *countp; + u_int32_t flags; +{ + roff_t *fmap; + DB_MPOOL *dbmp; + u_int32_t mfp_flags; + static const FN fn[] = { + { MP_CAN_MMAP, "MP_CAN_MMAP" }, + { MP_DIRECT, "MP_DIRECT" }, + { MP_EXTENT, "MP_EXTENT" }, + { MP_FAKE_DEADFILE, "deadfile" }, + { MP_FAKE_FILEWRITTEN, "file written" }, + { MP_FAKE_NB, "no backing file" }, + { MP_FAKE_UOC, "unlink on close" }, + { MP_NOT_DURABLE, "not durable" }, + { MP_TEMP, "MP_TEMP" }, + { 0, NULL } + }; + + dbmp = env->mp_handle; + fmap = argp; + + __db_msg(env, "File #%d: %s", *countp + 1, __memp_fns(dbmp, mfp)); + __mutex_print_debug_single(env, "Mutex", mfp->mutex, flags); + + MUTEX_LOCK(env, mfp->mutex); + STAT_ULONG("Reference count", mfp->mpf_cnt); + STAT_ULONG("Block count", mfp->block_cnt); + STAT_ULONG("Last page number", mfp->last_pgno); + STAT_ULONG("Original last page number", mfp->orig_last_pgno); + STAT_ULONG("Maximum page number", mfp->maxpgno); + STAT_LONG("Type", mfp->ftype); + STAT_LONG("Priority", mfp->priority); + STAT_LONG("Page's LSN offset", mfp->lsn_off); + STAT_LONG("Page's clear length", mfp->clear_len); + + __db_print_fileid(env, + R_ADDR(dbmp->reginfo, mfp->fileid_off), "\tID"); + + mfp_flags = 0; + if (mfp->deadfile) + FLD_SET(mfp_flags, MP_FAKE_DEADFILE); + if (mfp->file_written) + FLD_SET(mfp_flags, MP_FAKE_FILEWRITTEN); + if (mfp->no_backing_file) + FLD_SET(mfp_flags, MP_FAKE_NB); + if (mfp->unlink_on_close) + FLD_SET(mfp_flags, MP_FAKE_UOC); + __db_prflags(env, NULL, mfp_flags, fn, NULL, "\tFlags"); + + if (*countp < FMAP_ENTRIES) + fmap[*countp] = R_OFFSET(dbmp->reginfo, mfp); + (*countp)++; + MUTEX_UNLOCK(env, mfp->mutex); + return (0); +} + +/* + * __memp_print_hash -- + * Display hash bucket statistics for a cache. + */ +static int +__memp_print_hash(env, dbmp, reginfo, fmap, flags) + ENV *env; + DB_MPOOL *dbmp; + REGINFO *reginfo; + roff_t *fmap; + u_int32_t flags; +{ + BH *bhp, *vbhp; + DB_MPOOL_HASH *hp; + DB_MSGBUF mb; + MPOOL *c_mp; + u_int32_t bucket; + + c_mp = reginfo->primary; + DB_MSGBUF_INIT(&mb); + + /* Display the hash table list of BH's. */ + __db_msg(env, + "BH hash table (%lu hash slots)", (u_long)c_mp->htab_buckets); + __db_msg(env, "bucket #: priority, I/O wait, [mutex]"); + __db_msg(env, "\tpageno, file, ref, LSN, address, priority, flags"); + + for (hp = R_ADDR(reginfo, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + MUTEX_READLOCK(env, hp->mtx_hash); + if ((bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)) != NULL) { + __db_msgadd(env, &mb, + "bucket %lu: %lu (%lu dirty)", + (u_long)bucket, (u_long)hp->hash_io_wait, + (u_long)atomic_read(&hp->hash_page_dirty)); + if (hp->hash_frozen != 0) + __db_msgadd(env, &mb, "(MVCC %lu/%lu/%lu) ", + (u_long)hp->hash_frozen, + (u_long)hp->hash_thawed, + (u_long)hp->hash_frozen_freed); + __mutex_print_debug_stats( + env, &mb, hp->mtx_hash, flags); + DB_MSGBUF_FLUSH(env, &mb); + } + for (; bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) { + __memp_print_bh(env, dbmp, NULL, bhp, fmap); + + /* Print the version chain, if it exists. 
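+			 * MVCC keeps older copies of a page chained behind the
+			 * current buffer through the vc links, so walking
+			 * SH_CHAIN_PREV() from the bucket entry shows every
+			 * version still in the cache.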
*/ + for (vbhp = SH_CHAIN_PREV(bhp, vc, __bh); + vbhp != NULL; + vbhp = SH_CHAIN_PREV(vbhp, vc, __bh)) { + __memp_print_bh(env, dbmp, + " next:\t", vbhp, fmap); + } + } + MUTEX_UNLOCK(env, hp->mtx_hash); + } + + return (0); +} + +/* + * __memp_print_bh -- + * Display a BH structure. + */ +static void +__memp_print_bh(env, dbmp, prefix, bhp, fmap) + ENV *env; + DB_MPOOL *dbmp; + const char *prefix; + BH *bhp; + roff_t *fmap; +{ + static const FN fn[] = { + { BH_CALLPGIN, "callpgin" }, + { BH_DIRTY, "dirty" }, + { BH_DIRTY_CREATE, "created" }, + { BH_DISCARD, "discard" }, + { BH_EXCLUSIVE, "exclusive" }, + { BH_FREED, "freed" }, + { BH_FROZEN, "frozen" }, + { BH_TRASH, "trash" }, + { BH_THAWED, "thawed" }, + { 0, NULL } + }; + DB_MSGBUF mb; + int i; + + DB_MSGBUF_INIT(&mb); + + if (prefix != NULL) + __db_msgadd(env, &mb, "%s", prefix); + else + __db_msgadd(env, &mb, "\t"); + + for (i = 0; i < FMAP_ENTRIES; ++i) + if (fmap[i] == INVALID_ROFF || fmap[i] == bhp->mf_offset) + break; + + if (fmap[i] == INVALID_ROFF) + __db_msgadd(env, &mb, "%5lu, %lu, ", + (u_long)bhp->pgno, (u_long)bhp->mf_offset); + else + __db_msgadd( + env, &mb, "%5lu, #%d, ", (u_long)bhp->pgno, i + 1); + + __db_msgadd(env, &mb, "%2lu, %lu/%lu", (u_long)atomic_read(&bhp->ref), + F_ISSET(bhp, BH_FROZEN) ? 0 : (u_long)LSN(bhp->buf).file, + F_ISSET(bhp, BH_FROZEN) ? 0 : (u_long)LSN(bhp->buf).offset); + if (bhp->td_off != INVALID_ROFF) + __db_msgadd(env, &mb, " (@%lu/%lu)", + (u_long)VISIBLE_LSN(env, bhp)->file, + (u_long)VISIBLE_LSN(env, bhp)->offset); + __db_msgadd(env, &mb, ", %#08lx, %lu", + (u_long)R_OFFSET(dbmp->reginfo, bhp), (u_long)bhp->priority); + __db_prflags(env, &mb, bhp->flags, fn, " (", ")"); + DB_MSGBUF_FLUSH(env, &mb); +} + +/* + * __memp_stat_wait -- + * Total hash bucket wait stats into the region. + */ +static void +__memp_stat_wait(env, reginfo, mp, mstat, flags) + ENV *env; + REGINFO *reginfo; + MPOOL *mp; + DB_MPOOL_STAT *mstat; + u_int32_t flags; +{ + DB_MPOOL_HASH *hp; + u_int32_t i; + uintmax_t tmp_nowait, tmp_wait; + + mstat->st_hash_max_wait = 0; + hp = R_ADDR(reginfo, mp->htab); + for (i = 0; i < mp->htab_buckets; i++, hp++) { + __mutex_set_wait_info( + env, hp->mtx_hash, &tmp_wait, &tmp_nowait); + mstat->st_hash_nowait += tmp_nowait; + mstat->st_hash_wait += tmp_wait; + if (tmp_wait > mstat->st_hash_max_wait) { + mstat->st_hash_max_wait = tmp_wait; + mstat->st_hash_max_nowait = tmp_nowait; + } + if (LF_ISSET(DB_STAT_CLEAR | + DB_STAT_SUBSYSTEM) == DB_STAT_CLEAR) + __mutex_clear(env, hp->mtx_hash); + + mstat->st_io_wait += hp->hash_io_wait; + mstat->st_mvcc_frozen += hp->hash_frozen; + mstat->st_mvcc_thawed += hp->hash_thawed; + mstat->st_mvcc_freed += hp->hash_frozen_freed; + if (LF_ISSET(DB_STAT_CLEAR)) { + hp->hash_io_wait = 0; + hp->hash_frozen = 0; + hp->hash_thawed = 0; + hp->hash_frozen_freed = 0; + } + } +} + +#else /* !HAVE_STATISTICS */ + +int +__memp_stat_pp(dbenv, gspp, fspp, flags) + DB_ENV *dbenv; + DB_MPOOL_STAT **gspp; + DB_MPOOL_FSTAT ***fspp; + u_int32_t flags; +{ + COMPQUIET(gspp, NULL); + COMPQUIET(fspp, NULL); + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} + +int +__memp_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} +#endif + +/* + * __memp_stat_hash -- + * Total hash bucket stats (other than mutex wait) into the region. 
+ * + * PUBLIC: void __memp_stat_hash __P((REGINFO *, MPOOL *, u_int32_t *)); + */ +void +__memp_stat_hash(reginfo, mp, dirtyp) + REGINFO *reginfo; + MPOOL *mp; + u_int32_t *dirtyp; +{ + DB_MPOOL_HASH *hp; + u_int32_t dirty, i; + + hp = R_ADDR(reginfo, mp->htab); + for (i = 0, dirty = 0; i < mp->htab_buckets; i++, hp++) + dirty += (u_int32_t)atomic_read(&hp->hash_page_dirty); + *dirtyp = dirty; +} diff --git a/mp/mp_sync.c b/mp/mp_sync.c new file mode 100644 index 0000000..7490445 --- /dev/null +++ b/mp/mp_sync.c @@ -0,0 +1,919 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2009 Oracle. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/db_page.h" +#include "dbinc/hash.h" + +typedef struct { + DB_MPOOL_HASH *track_hp; /* Hash bucket. */ + + roff_t track_off; /* Page file offset. */ + db_pgno_t track_pgno; /* Page number. */ +} BH_TRACK; + +static int __bhcmp __P((const void *, const void *)); +static int __memp_close_flush_files __P((ENV *, int)); +static int __memp_sync_files __P((ENV *)); +static int __memp_sync_file __P((ENV *, + MPOOLFILE *, void *, u_int32_t *, u_int32_t)); + +/* + * __memp_walk_files -- + * PUBLIC: int __memp_walk_files __P((ENV *, MPOOL *, + * PUBLIC: int (*) __P((ENV *, MPOOLFILE *, void *, + * PUBLIC: u_int32_t *, u_int32_t)), void *, u_int32_t *, u_int32_t)); + */ +int +__memp_walk_files(env, mp, func, arg, countp, flags) + ENV *env; + MPOOL *mp; + int (*func)__P((ENV *, MPOOLFILE *, void *, u_int32_t *, u_int32_t)); + void *arg; + u_int32_t *countp; + u_int32_t flags; +{ + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOLFILE *mfp; + int i, ret, t_ret; + + dbmp = env->mp_handle; + ret = 0; + + hp = R_ADDR(dbmp->reginfo, mp->ftab); + for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) { + MUTEX_LOCK(env, hp->mtx_hash); + SH_TAILQ_FOREACH(mfp, &hp->hash_bucket, q, __mpoolfile) { + if ((t_ret = func(env, + mfp, arg, countp, flags)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR)) + break; + } + MUTEX_UNLOCK(env, hp->mtx_hash); + if (ret != 0 && !LF_ISSET(DB_STAT_MEMP_NOERROR)) + break; + } + return (ret); +} + +/* + * __memp_sync_pp -- + * ENV->memp_sync pre/post processing. + * + * PUBLIC: int __memp_sync_pp __P((DB_ENV *, DB_LSN *)); + */ +int +__memp_sync_pp(dbenv, lsnp) + DB_ENV *dbenv; + DB_LSN *lsnp; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_REQUIRES_CONFIG(env, + env->mp_handle, "memp_sync", DB_INIT_MPOOL); + + /* + * If no LSN is provided, flush the entire cache (reasonable usage + * even if there's no log subsystem configured). + */ + if (lsnp != NULL) + ENV_REQUIRES_CONFIG(env, + env->lg_handle, "memp_sync", DB_INIT_LOG); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__memp_sync(env, DB_SYNC_CACHE, lsnp)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_sync -- + * ENV->memp_sync. + * + * PUBLIC: int __memp_sync __P((ENV *, u_int32_t, DB_LSN *)); + */ +int +__memp_sync(env, flags, lsnp) + ENV *env; + u_int32_t flags; + DB_LSN *lsnp; +{ + DB_MPOOL *dbmp; + MPOOL *mp; + int interrupted, ret; + + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + + /* If we've flushed to the requested LSN, return that information. 
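+	 * That is, if mp->lsn already covers the caller's LSN, everything
+	 * the caller cares about was flushed by some earlier sync and we
+	 * can return the cached LSN without touching any buffers.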
*/ + if (lsnp != NULL) { + MPOOL_SYSTEM_LOCK(env); + if (LOG_COMPARE(lsnp, &mp->lsn) <= 0) { + *lsnp = mp->lsn; + + MPOOL_SYSTEM_UNLOCK(env); + return (0); + } + MPOOL_SYSTEM_UNLOCK(env); + } + + if ((ret = + __memp_sync_int(env, NULL, 0, flags, NULL, &interrupted)) != 0) + return (ret); + + if (!interrupted && lsnp != NULL) { + MPOOL_SYSTEM_LOCK(env); + if (LOG_COMPARE(lsnp, &mp->lsn) > 0) + mp->lsn = *lsnp; + MPOOL_SYSTEM_UNLOCK(env); + } + + return (0); +} + +/* + * __memp_fsync_pp -- + * DB_MPOOLFILE->sync pre/post processing. + * + * PUBLIC: int __memp_fsync_pp __P((DB_MPOOLFILE *)); + */ +int +__memp_fsync_pp(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbmfp->env; + + MPF_ILLEGAL_BEFORE_OPEN(dbmfp, "DB_MPOOLFILE->sync"); + + ENV_ENTER(env, ip); + REPLICATION_WRAP(env, (__memp_fsync(dbmfp)), 0, ret); + ENV_LEAVE(env, ip); + return (ret); +} + +/* + * __memp_fsync -- + * DB_MPOOLFILE->sync. + * + * PUBLIC: int __memp_fsync __P((DB_MPOOLFILE *)); + */ +int +__memp_fsync(dbmfp) + DB_MPOOLFILE *dbmfp; +{ + MPOOLFILE *mfp; + + mfp = dbmfp->mfp; + + /* + * If this handle doesn't have a file descriptor that's open for + * writing, or if the file is a temporary, or if the file hasn't + * been written since it was flushed, there's no reason to proceed + * further. + */ + if (F_ISSET(dbmfp, MP_READONLY)) + return (0); + + if (F_ISSET(dbmfp->mfp, MP_TEMP) || dbmfp->mfp->no_backing_file) + return (0); + + if (mfp->file_written == 0) + return (0); + + return (__memp_sync_int( + dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)); +} + +/* + * __mp_xxx_fh -- + * Return a file descriptor for DB 1.85 compatibility locking. + * + * PUBLIC: int __mp_xxx_fh __P((DB_MPOOLFILE *, DB_FH **)); + */ +int +__mp_xxx_fh(dbmfp, fhp) + DB_MPOOLFILE *dbmfp; + DB_FH **fhp; +{ + int ret; + + /* + * This is a truly spectacular layering violation, intended ONLY to + * support compatibility for the DB 1.85 DB->fd call. + * + * Sync the database file to disk, creating the file as necessary. + * + * We skip the MP_READONLY and MP_TEMP tests done by memp_fsync(3). + * The MP_READONLY test isn't interesting because we will either + * already have a file descriptor (we opened the database file for + * reading) or we aren't readonly (we created the database which + * requires write privileges). The MP_TEMP test isn't interesting + * because we want to write to the backing file regardless so that + * we get a file descriptor to return. + */ + if ((*fhp = dbmfp->fhp) != NULL) + return (0); + + if ((ret = __memp_sync_int( + dbmfp->env, dbmfp, 0, DB_SYNC_FILE, NULL, NULL)) == 0) + *fhp = dbmfp->fhp; + return (ret); +} + +/* + * __memp_sync_int -- + * Mpool sync internal function. 
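+ *
+ *	In outline, the code below collects the dirty buffers into a
+ *	BH_TRACK array, sorts it by file and page number to keep writes
+ *	roughly sequential, writes each buffer that is still dirty when
+ *	revisited, and finally fsyncs the underlying files when the caller
+ *	requires the writes to be durable.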
+ * + * PUBLIC: int __memp_sync_int __P((ENV *, + * PUBLIC: DB_MPOOLFILE *, u_int32_t, u_int32_t, u_int32_t *, int *)); + */ +int +__memp_sync_int(env, dbmfp, trickle_max, flags, wrote_totalp, interruptedp) + ENV *env; + DB_MPOOLFILE *dbmfp; + u_int32_t trickle_max, flags, *wrote_totalp; + int *interruptedp; +{ + BH *bhp; + BH_TRACK *bharray; + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp, *mp; + MPOOLFILE *mfp; + db_mutex_t mutex; + roff_t last_mf_offset; + u_int32_t ar_cnt, ar_max, i, n_cache, remaining, wrote_total; + int dirty, filecnt, maxopenfd, required_write, ret, t_ret; + int wrote_cnt; + + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + last_mf_offset = INVALID_ROFF; + filecnt = wrote_total = 0; + + if (wrote_totalp != NULL) + *wrote_totalp = 0; + if (interruptedp != NULL) + *interruptedp = 0; + + /* + * If we're flushing the cache, it's a checkpoint or we're flushing a + * specific file, we really have to write the blocks and we have to + * confirm they made it to disk. Otherwise, we can skip a block if + * it's hard to get. + */ + required_write = LF_ISSET(DB_SYNC_CACHE | + DB_SYNC_CHECKPOINT | DB_SYNC_FILE | DB_SYNC_QUEUE_EXTENT); + + /* Get shared configuration information. */ + MPOOL_SYSTEM_LOCK(env); + maxopenfd = mp->mp_maxopenfd; + MPOOL_SYSTEM_UNLOCK(env); + + /* Assume one dirty page per bucket. */ + ar_max = mp->nreg * mp->htab_buckets; + if ((ret = + __os_malloc(env, ar_max * sizeof(BH_TRACK), &bharray)) != 0) + return (ret); + + /* + * Walk each cache's list of buffers and mark all dirty buffers to be + * written and all dirty buffers to be potentially written, depending + * on our flags. + */ + for (ar_cnt = 0, n_cache = 0; n_cache < mp->nreg; ++n_cache) { + c_mp = dbmp->reginfo[n_cache].primary; + + hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab); + for (i = 0; i < c_mp->htab_buckets; i++, hp++) { + /* + * We can check for empty buckets before locking as + * we only care if the pointer is zero or non-zero. + * We can ignore empty or clean buckets because we + * only need write buffers that were dirty before + * we started. + */ +#ifdef DIAGNOSTIC + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) +#else + if (atomic_read(&hp->hash_page_dirty) == 0) +#endif + continue; + + dirty = 0; + MUTEX_LOCK(env, hp->mtx_hash); + SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) { + /* Always ignore clean pages. */ + if (!F_ISSET(bhp, BH_DIRTY)) + continue; + + dirty++; + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + + /* + * Ignore in-memory files, unless the file is + * specifically being flushed. + */ + if (mfp->no_backing_file) + continue; + if (!LF_ISSET(DB_SYNC_FILE) && + F_ISSET(mfp, MP_TEMP)) + continue; + + /* + * Ignore files that aren't involved in DB's + * transactional operations during checkpoints. + */ + if (LF_ISSET(DB_SYNC_CHECKPOINT) && + mfp->lsn_off == DB_LSN_OFF_NOTSET) + continue; + + /* + * Ignore files that aren't Queue extent files + * if we're flushing a Queue file with extents. + */ + if (LF_ISSET(DB_SYNC_QUEUE_EXTENT) && + !F_ISSET(mfp, MP_EXTENT)) + continue; + + /* + * If we're flushing a specific file, see if + * this page is from that file. + */ + if (dbmfp != NULL && mfp != dbmfp->mfp) + continue; + + /* Track the buffer, we want it. */ + bharray[ar_cnt].track_hp = hp; + bharray[ar_cnt].track_pgno = bhp->pgno; + bharray[ar_cnt].track_off = bhp->mf_offset; + ar_cnt++; + + /* + * If we run out of space, double and continue. 
+ * Don't stop at trickle_max, we want to sort + * as large a sample set as possible in order + * to minimize disk seeks. + */ + if (ar_cnt >= ar_max) { + if ((ret = __os_realloc(env, + (ar_max * 2) * sizeof(BH_TRACK), + &bharray)) != 0) + break; + ar_max *= 2; + } + } + + if (ret != 0) + goto err; + /* + * We are only checking this in diagnostic mode + * since it requires extra latching to keep the count + * in sync with the number of bits counted. + */ + DB_ASSERT(env, + dirty == (int)atomic_read(&hp->hash_page_dirty)); + MUTEX_UNLOCK(env, hp->mtx_hash); + + /* Check if the call has been interrupted. */ + if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && FLD_ISSET( + mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) { + STAT(++mp->stat.st_sync_interrupted); + if (interruptedp != NULL) + *interruptedp = 1; + goto err; + } + } + } + + /* If there no buffers to write, we're done. */ + if (ar_cnt == 0) + goto done; + + /* + * Write the buffers in file/page order, trying to reduce seeks by the + * filesystem and, when pages are smaller than filesystem block sizes, + * reduce the actual number of writes. + */ + if (ar_cnt > 1) + qsort(bharray, ar_cnt, sizeof(BH_TRACK), __bhcmp); + + /* + * If we're trickling buffers, only write enough to reach the correct + * percentage. + */ + if (LF_ISSET(DB_SYNC_TRICKLE) && ar_cnt > trickle_max) + ar_cnt = trickle_max; + + /* + * Flush the log. We have to ensure the log records reflecting the + * changes on the database pages we're writing have already made it + * to disk. We still have to check the log each time we write a page + * (because pages we are about to write may be modified after we have + * flushed the log), but in general this will at least avoid any I/O + * on the log's part. + */ + if (LOGGING_ON(env) && (ret = __log_flush(env, NULL)) != 0) + goto err; + + /* + * Walk the array, writing buffers. When we write a buffer, we NULL + * out its hash bucket pointer so we don't process a slot more than + * once. + */ + for (i = wrote_cnt = 0, remaining = ar_cnt; remaining > 0; ++i) { + if (i >= ar_cnt) { + i = 0; + __os_yield(env, 1, 0); + } + if ((hp = bharray[i].track_hp) == NULL) + continue; + + /* Lock the hash bucket and find the buffer. */ + mutex = hp->mtx_hash; + MUTEX_READLOCK(env, mutex); + SH_TAILQ_FOREACH(bhp, &hp->hash_bucket, hq, __bh) + if (bhp->pgno == bharray[i].track_pgno && + bhp->mf_offset == bharray[i].track_off) + break; + + /* + * If we can't find the buffer we're done, somebody else had + * to have written it. + * + * If the buffer isn't dirty, we're done, there's no work + * needed. + */ + if (bhp == NULL || !F_ISSET(bhp, BH_DIRTY)) { + MUTEX_UNLOCK(env, mutex); + --remaining; + bharray[i].track_hp = NULL; + continue; + } + + /* + * If the buffer is locked by another thread, ignore it, we'll + * come back to it. + */ + if (F_ISSET(bhp, BH_EXCLUSIVE)) { + MUTEX_UNLOCK(env, mutex); + if (!required_write) { + --remaining; + bharray[i].track_hp = NULL; + } + continue; + } + + /* Pin the buffer into memory. */ + atomic_inc(env, &bhp->ref); + MUTEX_UNLOCK(env, mutex); + MUTEX_READLOCK(env, bhp->mtx_buf); + DB_ASSERT(env, !F_ISSET(bhp, BH_EXCLUSIVE)); + + /* + * When swapping the hash bucket mutex for the buffer mutex, + * we may have raced with an MVCC update. In that case, we + * no longer have the most recent version, and need to retry + * (the buffer header we have pinned will no longer be marked + * dirty, so we can't just write it). 
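+		 * SH_CHAIN_HASNEXT() detects that a newer version now exists;
+		 * we drop our pin but leave the tracking slot intact, so the
+		 * outer loop will come back to this bucket on a later pass.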
+ */ + if (SH_CHAIN_HASNEXT(bhp, vc)) { + atomic_dec(env, &bhp->ref); + MUTEX_UNLOCK(env, bhp->mtx_buf); + continue; + } + + /* we will dispose of this buffer. */ + --remaining; + bharray[i].track_hp = NULL; + + /* + * If we've switched files, check to see if we're configured + * to close file descriptors. + */ + if (maxopenfd != 0 && bhp->mf_offset != last_mf_offset) { + if (++filecnt >= maxopenfd) { + filecnt = 0; + if ((t_ret = __memp_close_flush_files( + env, 1)) != 0 && ret == 0) + ret = t_ret; + } + last_mf_offset = bhp->mf_offset; + } + + /* + * If the buffer is dirty, we write it. We only try to + * write the buffer once. + */ + if (F_ISSET(bhp, BH_DIRTY)) { + mfp = R_ADDR(dbmp->reginfo, bhp->mf_offset); + if ((t_ret = + __memp_bhwrite(dbmp, hp, mfp, bhp, 1)) == 0) { + ++wrote_cnt; + ++wrote_total; + } else { + if (ret == 0) + ret = t_ret; + __db_errx + (env, "%s: unable to flush page: %lu", + __memp_fns(dbmp, mfp), (u_long)bhp->pgno); + + } + } + + /* Discard our buffer reference. */ + DB_ASSERT(env, atomic_read(&bhp->ref) > 0); + atomic_dec(env, &bhp->ref); + MUTEX_UNLOCK(env, bhp->mtx_buf); + + /* Check if the call has been interrupted. */ + if (LF_ISSET(DB_SYNC_INTERRUPT_OK) && + FLD_ISSET(mp->config_flags, DB_MEMP_SYNC_INTERRUPT)) { + STAT(++mp->stat.st_sync_interrupted); + if (interruptedp != NULL) + *interruptedp = 1; + goto err; + } + + /* + * Sleep after some number of writes to avoid disk saturation. + * Don't cache the max writes value, an application shutting + * down might reset the value in order to do a fast flush or + * checkpoint. + */ + if (!LF_ISSET(DB_SYNC_SUPPRESS_WRITE) && + !FLD_ISSET(mp->config_flags, DB_MEMP_SUPPRESS_WRITE) && + mp->mp_maxwrite != 0 && wrote_cnt >= mp->mp_maxwrite) { + wrote_cnt = 0; + __os_yield(env, 0, (u_long)mp->mp_maxwrite_sleep); + } + } + +done: /* + * If a write is required, we have to force the pages to disk. We + * don't do this as we go along because we want to give the OS as + * much time as possible to lazily flush, and because we have to flush + * files that might not even have had dirty buffers in the cache, so + * we have to walk the files list. + */ + if (ret == 0 && required_write) { + if (dbmfp == NULL) + ret = __memp_sync_files(env); + else + ret = __os_fsync(env, dbmfp->fhp); + } + + /* If we've opened files to flush pages, close them. */ + if ((t_ret = __memp_close_flush_files(env, 0)) != 0 && ret == 0) + ret = t_ret; + +err: __os_free(env, bharray); + if (wrote_totalp != NULL) + *wrote_totalp = wrote_total; + + return (ret); +} + +static int +__memp_sync_file(env, mfp, argp, countp, flags) + ENV *env; + MPOOLFILE *mfp; + void *argp; + u_int32_t *countp; + u_int32_t flags; +{ + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + int ret, t_ret; + + COMPQUIET(countp, NULL); + COMPQUIET(flags, 0); + + if (!mfp->file_written || mfp->no_backing_file || + mfp->deadfile || F_ISSET(mfp, MP_TEMP)) + return (0); + /* + * Pin the MPOOLFILE structure into memory, and release the + * region mutex allowing us to walk the linked list. We'll + * re-acquire that mutex to move to the next entry in the list. + * + * This works because we only need to flush current entries, + * we don't care about new entries being added, and the linked + * list is never re-ordered, a single pass is sufficient. It + * requires MPOOLFILE structures removed before we get to them + * be flushed to disk, but that's nothing new, they could have + * been removed while checkpoint was running, too. 
+ *
+ * Once we have the MPOOLFILE lock, re-check the MPOOLFILE is
+ * not being discarded. (A thread removing the MPOOLFILE
+ * will: hold the MPOOLFILE mutex, set deadfile, drop the
+ * MPOOLFILE mutex and then acquire the region MUTEX to walk
+ * the linked list and remove the MPOOLFILE structure.) Make
+ * sure the MPOOLFILE wasn't marked dead while we waited for
+ * the mutex.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+ if (!mfp->file_written || mfp->deadfile) {
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (0);
+ }
+ ++mfp->mpf_cnt;
+ MUTEX_UNLOCK(env, mfp->mutex);
+
+ /*
+ * Look for an already open, writeable handle (fsync doesn't
+ * work on read-only Windows handles).
+ */
+ dbmp = env->mp_handle;
+ MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q) {
+ if (dbmfp->mfp != mfp || F_ISSET(dbmfp, MP_READONLY))
+ continue;
+ /*
+ * We don't want to hold the mutex while calling sync.
+ * Increment the DB_MPOOLFILE handle ref count to pin
+ * it into memory.
+ */
+ ++dbmfp->ref;
+ break;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ /* If we don't find a handle we can use, open one. */
+ if (dbmfp == NULL) {
+ if ((ret = __memp_mf_sync(dbmp, mfp, 1)) != 0) {
+ __db_err(env, ret,
+ "%s: unable to flush", (char *)
+ R_ADDR(dbmp->reginfo, mfp->path_off));
+ }
+ } else
+ ret = __os_fsync(env, dbmfp->fhp);
+
+ /*
+ * Re-acquire the MPOOLFILE mutex, we need it to modify the
+ * reference count.
+ */
+ MUTEX_LOCK(env, mfp->mutex);
+
+ /*
+ * If we wrote the file and there are no other references (or there
+ * is a single reference, and it's the one we opened to write
+ * buffers during checkpoint), clear the file_written flag. We
+ * do this so that applications opening thousands of files don't
+ * loop here opening and flushing those files during checkpoint.
+ *
+ * The danger here is if a buffer were to be written as part of
+ * a checkpoint, and then not be flushed to disk. This cannot
+ * happen because we only clear file_written when there are no
+ * other users of the MPOOLFILE in the system, and, as we hold
+ * the region lock, no possibility of another thread of control
+ * racing with us to open a MPOOLFILE.
+ */
+ if (mfp->mpf_cnt == 1 || (mfp->mpf_cnt == 2 &&
+ dbmfp != NULL && F_ISSET(dbmfp, MP_FLUSH))) {
+ mfp->file_written = 0;
+
+ /*
+ * We may be the last reference for a MPOOLFILE, as we
+ * weren't holding the MPOOLFILE mutex when flushing
+ * its buffers to disk. If we can discard it, set
+ * a flag to schedule a clean-out pass. (Not likely,
+ * I mean, what are the chances that there aren't any
+ * buffers in the pool? Regardless, it might happen.)
+ */
+ if (mfp->mpf_cnt == 1 && mfp->block_cnt == 0)
+ *(int *)argp = 1;
+ }
+
+ /*
+ * If we found the file we must close it in case we are the last
+ * reference to the dbmfp. NOTE: since we have incremented
+ * mfp->mpf_cnt this cannot be the last reference to the mfp.
+ * This is important since we are called with the hash bucket
+ * locked. The mfp will get freed via the cleanup pass.
+ */
+ if (dbmfp != NULL &&
+ (t_ret = __memp_fclose(dbmfp, DB_MPOOL_NOLOCK)) != 0 && ret == 0)
+ ret = t_ret;
+
+ --mfp->mpf_cnt;
+
+ /* Unlock the MPOOLFILE. */
+ MUTEX_UNLOCK(env, mfp->mutex);
+ return (ret);
+}
+
+/*
+ * __memp_sync_files --
+ * Sync all the files in the environment, open or not.
+ */ +static int +__memp_sync_files(env) + ENV *env; +{ + DB_MPOOL *dbmp; + DB_MPOOL_HASH *hp; + MPOOL *mp; + MPOOLFILE *mfp, *next_mfp; + int i, need_discard_pass, ret; + + dbmp = env->mp_handle; + mp = dbmp->reginfo[0].primary; + need_discard_pass = ret = 0; + + ret = __memp_walk_files(env, + mp, __memp_sync_file, &need_discard_pass, 0, DB_STAT_MEMP_NOERROR); + + /* + * We may need to do a last pass through the MPOOLFILE list -- if we + * were the last reference to an MPOOLFILE, we need to clean it out. + */ + if (!need_discard_pass) + return (ret); + + hp = R_ADDR(dbmp->reginfo, mp->ftab); + for (i = 0; i < MPOOL_FILE_BUCKETS; i++, hp++) { +retry: MUTEX_LOCK(env, hp->mtx_hash); + for (mfp = SH_TAILQ_FIRST(&hp->hash_bucket, + __mpoolfile); mfp != NULL; mfp = next_mfp) { + next_mfp = SH_TAILQ_NEXT(mfp, q, __mpoolfile); + /* + * Do a fast check -- we can check for zero/non-zero + * without a mutex on the MPOOLFILE. If likely to + * succeed, lock the MPOOLFILE down and look for real. + */ + if (mfp->deadfile || + mfp->block_cnt != 0 || mfp->mpf_cnt != 0) + continue; + + MUTEX_LOCK(env, mfp->mutex); + if (!mfp->deadfile && + mfp->block_cnt == 0 && mfp->mpf_cnt == 0) { + MUTEX_UNLOCK(env, hp->mtx_hash); + (void)__memp_mf_discard(dbmp, mfp); + goto retry; + } else + MUTEX_UNLOCK(env, mfp->mutex); + } + MUTEX_UNLOCK(env, hp->mtx_hash); + } + return (ret); +} + +/* + * __memp_mf_sync -- + * Flush an MPOOLFILE, when no currently open handle is available. + * + * PUBLIC: int __memp_mf_sync __P((DB_MPOOL *, MPOOLFILE *, int)); + */ +int +__memp_mf_sync(dbmp, mfp, locked) + DB_MPOOL *dbmp; + MPOOLFILE *mfp; + int locked; +{ + DB_FH *fhp; + DB_MPOOL_HASH *hp; + ENV *env; + MPOOL *mp; + int ret, t_ret; + char *rpath; + + COMPQUIET(hp, NULL); + env = dbmp->env; + + /* + * We need to be holding the hash lock: we're using the path name + * and __memp_nameop might try and rename the file. + */ + if (!locked) { + mp = dbmp->reginfo[0].primary; + hp = R_ADDR(dbmp->reginfo, mp->ftab); + hp += FNBUCKET( + R_ADDR(dbmp->reginfo, mfp->fileid_off), DB_FILE_ID_LEN); + MUTEX_LOCK(env, hp->mtx_hash); + } + + if ((ret = __db_appname(env, DB_APP_DATA, + R_ADDR(dbmp->reginfo, mfp->path_off), NULL, &rpath)) == 0) { + if ((ret = __os_open(env, rpath, 0, 0, 0, &fhp)) == 0) { + ret = __os_fsync(env, fhp); + if ((t_ret = + __os_closehandle(env, fhp)) != 0 && ret == 0) + ret = t_ret; + } + __os_free(env, rpath); + } + + if (!locked) + MUTEX_UNLOCK(env, hp->mtx_hash); + + return (ret); +} + +/* + * __memp_close_flush_files -- + * Close files opened only to flush buffers. + */ +static int +__memp_close_flush_files(env, dosync) + ENV *env; + int dosync; +{ + DB_MPOOL *dbmp; + DB_MPOOLFILE *dbmfp; + MPOOLFILE *mfp; + int ret; + + dbmp = env->mp_handle; + + /* + * The routine exists because we must close files opened by sync to + * flush buffers. There are two cases: first, extent files have to + * be closed so they may be removed when empty. Second, regular + * files have to be closed so we don't run out of descriptors (for + * example, an application partitioning its data into databases + * based on timestamps, so there's a continually increasing set of + * files). + * + * We mark files opened in the __memp_bhwrite() function with the + * MP_FLUSH flag. Here we walk through our file descriptor list, + * and, if a file was opened by __memp_bhwrite(), we close it. 
+ */
+retry: MUTEX_LOCK(env, dbmp->mutex);
+ TAILQ_FOREACH(dbmfp, &dbmp->dbmfq, q)
+ if (F_ISSET(dbmfp, MP_FLUSH)) {
+ F_CLR(dbmfp, MP_FLUSH);
+ MUTEX_UNLOCK(env, dbmp->mutex);
+ if (dosync) {
+ /*
+ * If we have the only open handle on the file,
+ * clear the dirty flag so we don't re-open and
+ * sync it again when discarding the MPOOLFILE
+ * structure. Clear the flag before the sync
+ * so we can't race with a thread writing the file.
+ */
+ mfp = dbmfp->mfp;
+ if (mfp->mpf_cnt == 1) {
+ MUTEX_LOCK(env, mfp->mutex);
+ if (mfp->mpf_cnt == 1)
+ mfp->file_written = 0;
+ MUTEX_UNLOCK(env, mfp->mutex);
+ }
+ if ((ret = __os_fsync(env, dbmfp->fhp)) != 0)
+ return (ret);
+ }
+ if ((ret = __memp_fclose(dbmfp, 0)) != 0)
+ return (ret);
+ goto retry;
+ }
+ MUTEX_UNLOCK(env, dbmp->mutex);
+
+ return (0);
+}
+
+static int
+__bhcmp(p1, p2)
+ const void *p1, *p2;
+{
+ BH_TRACK *bhp1, *bhp2;
+
+ bhp1 = (BH_TRACK *)p1;
+ bhp2 = (BH_TRACK *)p2;
+
+ /* Sort by file (shared memory pool offset). */
+ if (bhp1->track_off < bhp2->track_off)
+ return (-1);
+ if (bhp1->track_off > bhp2->track_off)
+ return (1);
+
+ /*
+ * !!!
+ * Defend against badly written quicksort code calling the comparison
+ * function with two identical pointers (e.g., WATCOM C++ (Power++)).
+ */
+ if (bhp1->track_pgno < bhp2->track_pgno)
+ return (-1);
+ if (bhp1->track_pgno > bhp2->track_pgno)
+ return (1);
+ return (0);
+}
diff --git a/mp/mp_trickle.c b/mp/mp_trickle.c
new file mode 100644
index 0000000..e92e788
--- /dev/null
+++ b/mp/mp_trickle.c
@@ -0,0 +1,112 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2009 Oracle. All rights reserved.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+
+static int __memp_trickle __P((ENV *, int, int *));
+
+/*
+ * __memp_trickle_pp --
+ * ENV->memp_trickle pre/post processing.
+ *
+ * PUBLIC: int __memp_trickle_pp __P((DB_ENV *, int, int *));
+ */
+int
+__memp_trickle_pp(dbenv, pct, nwrotep)
+ DB_ENV *dbenv;
+ int pct, *nwrotep;
+{
+ DB_THREAD_INFO *ip;
+ ENV *env;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_REQUIRES_CONFIG(env,
+ env->mp_handle, "memp_trickle", DB_INIT_MPOOL);
+
+ ENV_ENTER(env, ip);
+ REPLICATION_WRAP(env, (__memp_trickle(env, pct, nwrotep)), 0, ret);
+ ENV_LEAVE(env, ip);
+ return (ret);
+}
+
+/*
+ * __memp_trickle --
+ * ENV->memp_trickle.
+ */
+static int
+__memp_trickle(env, pct, nwrotep)
+ ENV *env;
+ int pct, *nwrotep;
+{
+ DB_MPOOL *dbmp;
+ MPOOL *c_mp, *mp;
+ u_int32_t clean, dirty, i, need_clean, total, dtmp, wrote;
+ int ret;
+
+ dbmp = env->mp_handle;
+ mp = dbmp->reginfo[0].primary;
+
+ if (nwrotep != NULL)
+ *nwrotep = 0;
+
+ if (pct < 1 || pct > 100) {
+ __db_errx(env,
+ "DB_ENV->memp_trickle: %d: percent must be between 1 and 100",
+ pct);
+ return (EINVAL);
+ }
+
+ /*
+ * Loop through the caches counting total/dirty buffers.
+ *
+ * XXX
+ * Using hash_page_dirty is our only choice at the moment, but it's not
+ * as correct as we might like in the presence of pools having more
+ * than one page size, as a free 512B buffer may not be equivalent to
+ * having a free 8KB buffer.
+ */
+ for (ret = 0, i = dirty = total = 0; i < mp->nreg; ++i) {
+ c_mp = dbmp->reginfo[i].primary;
+ total += c_mp->stat.st_pages;
+ __memp_stat_hash(&dbmp->reginfo[i], c_mp, &dtmp);
+ dirty += dtmp;
+ }
+
+ /*
+ * If there are sufficient clean buffers, no buffers or no dirty
+ * buffers, we're done.
+ */ + if (total == 0 || dirty == 0) + return (0); + + /* + * The total number of pages is an exact number, but the dirty page + * count can change while we're walking the hash buckets, and it's + * even possible the dirty page count ends up larger than the total + * number of pages. + */ + clean = total > dirty ? total - dirty : 0; + need_clean = (total * (u_int)pct) / 100; + if (clean >= need_clean) + return (0); + + need_clean -= clean; + ret = __memp_sync_int(env, NULL, + need_clean, DB_SYNC_TRICKLE | DB_SYNC_INTERRUPT_OK, &wrote, NULL); + STAT((mp->stat.st_page_trickle += wrote)); + if (nwrotep != NULL) + *nwrotep = (int)wrote; + + return (ret); +} |
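
The percentage arithmetic in __memp_trickle is easiest to follow with concrete numbers: with 10,000 pages in the pool and 9,200 of them dirty, clean is 800; a pct of 20 makes need_clean 2,000, so __memp_sync_int is asked to trickle-write the 1,200-buffer shortfall. Applications normally reach this code through the public DB_ENV->memp_trickle method, typically from a dedicated thread that runs alongside the normal workers. The sketch below is illustrative only: it assumes an already-opened DB_ENV handle, and the helper name trickle_once, the 20 percent target, and the printf reporting are choices made for the example, not part of this commit.

#include <stdio.h>
#include <db.h>

/*
 * trickle_once --
 *	Ask mpool to bring the clean-buffer count up to roughly 20% of
 *	the cache so page allocation rarely has to write a dirty buffer
 *	synchronously.  "dbenv" must be an open DB_ENV handle.
 */
int
trickle_once(DB_ENV *dbenv)
{
	int nwrote, ret;

	if ((ret = dbenv->memp_trickle(dbenv, 20, &nwrote)) != 0) {
		dbenv->err(dbenv, ret, "DB_ENV->memp_trickle");
		return (ret);
	}
	if (nwrote > 0)
		printf("trickle: wrote %d dirty pages\n", nwrote);
	return (0);
}

A caller would invoke trickle_once from a loop with a sleep between iterations; how often to run it is a tuning decision driven by the update rate and cache size, not something this change dictates.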
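The required_write cases in __memp_sync_int (DB_SYNC_CACHE, DB_SYNC_CHECKPOINT, DB_SYNC_FILE) are usually reached through the public flush and checkpoint methods rather than being called directly. A minimal sketch, assuming an open transactional environment; the helper name flush_and_checkpoint is hypothetical and error handling is reduced to returning the first failure.

#include <db.h>

/*
 * flush_and_checkpoint --
 *	Write every dirty page in the cache, force the backing files to
 *	disk, then record a checkpoint.
 */
int
flush_and_checkpoint(DB_ENV *dbenv)
{
	int ret;

	/* A NULL LSN asks for all dirty buffers, not just those older than an LSN. */
	if ((ret = dbenv->memp_sync(dbenv, NULL)) != 0)
		return (ret);

	/* DB_FORCE writes a checkpoint record even if little has changed. */
	return (dbenv->txn_checkpoint(dbenv, 0, 0, DB_FORCE));
}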