1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
|
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 1996-2004
* Sleepycat Software. All rights reserved.
*
* $Id: mp.h,v 11.61 2004/09/17 22:00:27 mjc Exp $
*/
#ifndef _DB_MP_H_
#define _DB_MP_H_
struct __bh; typedef struct __bh BH;
struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH;
struct __db_mpreg; typedef struct __db_mpreg DB_MPREG;
struct __mpool; typedef struct __mpool MPOOL;
/* We require at least 20KB of cache. */
#define DB_CACHESIZE_MIN (20 * 1024)
/*
* DB_MPOOLFILE initialization methods cannot be called after open is called,
* other methods cannot be called before open is called
*/
#define MPF_ILLEGAL_AFTER_OPEN(dbmfp, name) \
if (F_ISSET(dbmfp, MP_OPEN_CALLED)) \
return (__db_mi_open((dbmfp)->dbenv, name, 1));
#define MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name) \
if (!F_ISSET(dbmfp, MP_OPEN_CALLED)) \
return (__db_mi_open((dbmfp)->dbenv, name, 0));
typedef enum {
DB_SYNC_ALLOC, /* Flush for allocation. */
DB_SYNC_CACHE, /* Checkpoint or flush entire cache. */
DB_SYNC_FILE, /* Flush file. */
DB_SYNC_TRICKLE /* Trickle sync. */
} db_sync_op;
/*
* DB_MPOOL --
* Per-process memory pool structure.
*/
struct __db_mpool {
/* These fields need to be protected for multi-threaded support. */
DB_MUTEX *mutexp; /* Structure thread lock. */
/* List of pgin/pgout routines. */
LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;
/* List of DB_MPOOLFILE's. */
TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;
/*
* The dbenv, nreg and reginfo fields are not thread protected,
* as they are initialized during mpool creation, and not modified
* again.
*/
DB_ENV *dbenv; /* Enclosing environment. */
u_int32_t nreg; /* N underlying cache regions. */
REGINFO *reginfo; /* Underlying cache regions. */
};
/*
* DB_MPREG --
* DB_MPOOL registry of pgin/pgout functions.
*/
struct __db_mpreg {
LIST_ENTRY(__db_mpreg) q; /* Linked list. */
int32_t ftype; /* File type. */
/* Pgin, pgout routines. */
int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
};
/*
* NCACHE --
* Select a cache based on the file and the page number. Assumes accesses
* are uniform across pages, which is probably OK. What we really want to
* avoid is anything that puts all pages from any single file in the same
* cache, as we expect that file access will be bursty, and to avoid
* putting all page number N pages in the same cache as we expect access
* to the metapages (page 0) and the root of a btree (page 1) to be much
* more frequent than a random data page.
*/
#define NCACHE(mp, mf_offset, pgno) \
(((pgno) ^ ((u_int32_t)(mf_offset) >> 3)) % ((MPOOL *)mp)->nreg)
/*
* NBUCKET --
* We make the assumption that early pages of the file are more likely
* to be retrieved than the later pages, which means the top bits will
* be more interesting for hashing as they're less likely to collide.
* That said, as 512 8K pages represents a 4MB file, so only reasonably
* large files will have page numbers with any other than the bottom 9
* bits set. We XOR in the MPOOL offset of the MPOOLFILE that backs the
* page, since that should also be unique for the page. We don't want
* to do anything very fancy -- speed is more important to us than using
* good hashing.
*/
#define NBUCKET(mc, mf_offset, pgno) \
(((pgno) ^ ((mf_offset) << 9)) % (mc)->htab_buckets)
/*
* MPOOL --
* Shared memory pool region.
*/
struct __mpool {
/*
* The memory pool can be broken up into individual pieces/files.
* Not what we would have liked, but on Solaris you can allocate
* only a little more than 2GB of memory in a contiguous chunk,
* and I expect to see more systems with similar issues.
*
* While this structure is duplicated in each piece of the cache,
* the first of these pieces/files describes the entire pool, the
* second only describe a piece of the cache.
*/
/*
* The lsn field and list of underlying MPOOLFILEs are thread protected
* by the region lock.
*/
DB_LSN lsn; /* Maximum checkpoint LSN. */
SH_TAILQ_HEAD(__mpfq) mpfq; /* List of MPOOLFILEs. */
/* Configuration information: protected by the region lock. */
size_t mp_mmapsize; /* Maximum file size for mmap. */
int mp_maxopenfd; /* Maximum open file descriptors. */
int mp_maxwrite; /* Maximum buffers to write. */
int mp_maxwrite_sleep; /* Sleep after writing max buffers. */
/*
* The nreg, regids and maint_off fields are not thread protected,
* as they are initialized during mpool creation, and not modified
* again.
*/
u_int32_t nreg; /* Number of underlying REGIONS. */
roff_t regids; /* Array of underlying REGION Ids. */
#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
roff_t maint_off; /* Maintenance information offset */
#endif
/*
* The following structure fields only describe the per-cache portion
* of the region.
*
* The htab and htab_buckets fields are not thread protected as they
* are initialized during mpool creation, and not modified again.
*
* The last_checked and lru_count fields are thread protected by
* the region lock.
*/
u_int32_t htab_buckets; /* Number of hash table entries. */
roff_t htab; /* Hash table offset. */
u_int32_t last_checked; /* Last bucket checked for free. */
u_int32_t lru_count; /* Counter for buffer LRU */
/*
* The stat fields are generally not thread protected, and cannot be
* trusted. Note that st_pages is an exception, and is always updated
* inside a region lock (although it is sometimes read outside of the
* region lock).
*/
DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */
/*
* We track page puts so that we can decide when allocation is never
* going to succeed. We don't lock the field, all we care about is
* if it changes.
*/
u_int32_t put_counter; /* Count of page put calls. */
};
struct __db_mpool_hash {
DB_MUTEX hash_mutex; /* Per-bucket mutex. */
DB_HASHTAB hash_bucket; /* Head of bucket. */
u_int32_t hash_page_dirty;/* Count of dirty pages. */
u_int32_t hash_priority; /* Minimum priority of bucket buffer. */
#ifdef HPUX_MUTEX_PAD
/*
* !!!
* We allocate the mpool hash buckets as an array, which means that
* they are not individually aligned. This fails on one platform:
* HPUX 10.20, where mutexes require 16 byte alignment. This is a
* grievous hack for that single platform.
*/
u_int8_t pad[HPUX_MUTEX_PAD];
#endif
};
/*
* The base mpool priority is 1/4th of the name space, or just under 2^30.
* When the LRU counter wraps, we shift everybody down to a base-relative
* value.
*/
#define MPOOL_BASE_DECREMENT (UINT32_MAX - (UINT32_MAX / 4))
/*
* Mpool priorities from low to high. Defined in terms of fractions of the
* buffers in the pool.
*/
#define MPOOL_PRI_VERY_LOW -1 /* Dead duck. Check and set to 0. */
#define MPOOL_PRI_LOW -2 /* Low. */
#define MPOOL_PRI_DEFAULT 0 /* No adjustment -- special case.*/
#define MPOOL_PRI_HIGH 10 /* With the dirty buffers. */
#define MPOOL_PRI_DIRTY 10 /* Dirty gets a 10% boost. */
#define MPOOL_PRI_VERY_HIGH 1 /* Add number of buffers in pool. */
/*
* MPOOLFILE --
* Shared DB_MPOOLFILE information.
*/
struct __mpoolfile {
DB_MUTEX mutex;
/* Protected by MPOOLFILE mutex. */
u_int32_t mpf_cnt; /* Ref count: DB_MPOOLFILEs. */
u_int32_t block_cnt; /* Ref count: blocks in cache. */
roff_t path_off; /* File name location. */
/*
* We normally don't lock the deadfile field when we read it since we
* only care if the field is zero or non-zero. We do lock on read when
* searching for a matching MPOOLFILE -- see that code for more detail.
*/
int32_t deadfile; /* Dirty pages can be discarded. */
/* Protected by mpool cache 0 region lock. */
SH_TAILQ_ENTRY q; /* List of MPOOLFILEs */
db_pgno_t last_pgno; /* Last page in the file. */
db_pgno_t orig_last_pgno; /* Original last page in the file. */
db_pgno_t maxpgno; /* Maximum page number. */
/*
* None of the following fields are thread protected.
*
* There are potential races with the ftype field because it's read
* without holding a lock. However, it has to be set before adding
* any buffers to the cache that depend on it being set, so there
* would need to be incorrect operation ordering to have a problem.
*/
int32_t ftype; /* File type. */
/*
* There are potential races with the priority field because it's read
* without holding a lock. However, a collision is unlikely and if it
* happens is of little consequence.
*/
int32_t priority; /* Priority when unpinning buffer. */
/*
* There are potential races with the file_written field (many threads
* may be writing blocks at the same time), and with no_backing_file
* and unlink_on_close fields, as they may be set while other threads
* are reading them. However, we only care if the field value is zero
* or non-zero, so don't lock the memory.
*
* !!!
* Theoretically, a 64-bit architecture could put two of these fields
* in a single memory operation and we could race. I have never seen
* an architecture where that's a problem, and I believe Java requires
* that to never be the case.
*
* File_written is set whenever a buffer is marked dirty in the cache.
* It can be cleared in some cases, after all dirty buffers have been
* written AND the file has been flushed to disk.
*/
int32_t file_written; /* File was written. */
int32_t no_backing_file; /* Never open a backing file. */
int32_t unlink_on_close; /* Unlink file on last close. */
/*
* We do not protect the statistics in "stat" because of the cost of
* the mutex in the get/put routines. There is a chance that a count
* will get lost.
*/
DB_MPOOL_FSTAT stat; /* Per-file mpool statistics. */
/*
* The remaining fields are initialized at open and never subsequently
* modified.
*/
int32_t lsn_off; /* Page's LSN offset. */
u_int32_t clear_len; /* Bytes to clear on page create. */
roff_t fileid_off; /* File ID string location. */
roff_t pgcookie_len; /* Pgin/pgout cookie length. */
roff_t pgcookie_off; /* Pgin/pgout cookie location. */
/*
* The flags are initialized at open and never subsequently modified.
*/
#define MP_CAN_MMAP 0x001 /* If the file can be mmap'd. */
#define MP_DIRECT 0x002 /* No OS buffering. */
#define MP_DURABLE_UNKNOWN 0x004 /* We don't care about durability. */
#define MP_EXTENT 0x008 /* Extent file. */
#define MP_FAKE_DEADFILE 0x010 /* Deadfile field: fake flag. */
#define MP_FAKE_FILEWRITTEN 0x020 /* File_written field: fake flag. */
#define MP_FAKE_NB 0x040 /* No_backing_file field: fake flag. */
#define MP_FAKE_UOC 0x080 /* Unlink_on_close field: fake flag. */
#define MP_NOT_DURABLE 0x100 /* File is not durable. */
#define MP_TEMP 0x200 /* Backing file is a temporary. */
u_int32_t flags;
};
/*
* Flags to __memp_bh_free.
*/
#define BH_FREE_FREEMEM 0x01
#define BH_FREE_UNLOCKED 0x02
/*
* BH --
* Buffer header.
*/
struct __bh {
DB_MUTEX mutex; /* Buffer thread/process lock. */
u_int16_t ref; /* Reference count. */
u_int16_t ref_sync; /* Sync wait-for reference count. */
#define BH_CALLPGIN 0x001 /* Convert the page before use. */
#define BH_DIRTY 0x002 /* Page was modified. */
#define BH_DIRTY_CREATE 0x004 /* Page created, must be written. */
#define BH_DISCARD 0x008 /* Page is useless. */
#define BH_LOCKED 0x010 /* Page is locked (I/O in progress). */
#define BH_TRASH 0x020 /* Page is garbage. */
u_int16_t flags;
u_int32_t priority; /* LRU priority. */
SH_TAILQ_ENTRY hq; /* MPOOL hash bucket queue. */
db_pgno_t pgno; /* Underlying MPOOLFILE page number. */
roff_t mf_offset; /* Associated MPOOLFILE offset. */
/*
* !!!
* This array must be at least size_t aligned -- the DB access methods
* put PAGE and other structures into it, and then access them directly.
* (We guarantee size_t alignment to applications in the documentation,
* too.)
*/
u_int8_t buf[1]; /* Variable length data. */
};
#include "dbinc_auto/mp_ext.h"
#endif /* !_DB_MP_H_ */
|