summaryrefslogtreecommitdiff
path: root/db/dbinc/mp.h
blob: 120f917163c161d757cf9e216b8435b556ea4840 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996,2007 Oracle.  All rights reserved.
 *
 * $Id: mp.h,v 12.38 2007/06/07 16:47:01 bostic Exp $
 */

#ifndef	_DB_MP_H_
#define	_DB_MP_H_

#if defined(__cplusplus)
extern "C" {
#endif

struct __bh;		typedef struct __bh BH;
struct __bh_frozen;	typedef struct __bh_frozen_p BH_FROZEN_PAGE;
struct __bh_frozen_a;	typedef struct __bh_frozen_a BH_FROZEN_ALLOC;
struct __db_mpool_hash; typedef struct __db_mpool_hash DB_MPOOL_HASH;
struct __db_mpreg;	typedef struct __db_mpreg DB_MPREG;
struct __mpool;		typedef struct __mpool MPOOL;

				/* We require at least 20KB of cache. */
#define	DB_CACHESIZE_MIN	(20 * 1024)

/*
 * DB_MPOOLFILE initialization methods cannot be called after open is called,
 * other methods cannot be called before open is called
 */
#define	MPF_ILLEGAL_AFTER_OPEN(dbmfp, name)				\
	if (F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->dbenv, name, 1));
#define	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name)				\
	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->dbenv, name, 0));

/*
 * Cache flush operations, plus modifiers.
 */
#define	DB_SYNC_ALLOC		0x0001	/* Flush for allocation. */
#define	DB_SYNC_CACHE		0x0002	/* Flush entire cache. */
#define	DB_SYNC_CHECKPOINT	0x0004	/* Checkpoint. */
#define	DB_SYNC_FILE		0x0008	/* Flush file. */
#define	DB_SYNC_INTERRUPT_OK	0x0010	/* Allow interrupt and return OK. */
#define	DB_SYNC_QUEUE_EXTENT	0x0020	/* Flush a queue file with extents. */
#define	DB_SYNC_SUPPRESS_WRITE	0x0040	/* Ignore max-write configuration. */
#define	DB_SYNC_TRICKLE		0x0080	/* Trickle sync. */

/*
 * DB_MPOOL --
 *	Per-process memory pool structure.
 */
struct __db_mpool {
	/* These fields need to be protected for multi-threaded support. */
	db_mutex_t mutex;		/* Thread mutex. */

	/*
	 * DB_MPREG structure for the DB pgin/pgout routines.
	 *
	 * Linked list of application-specified pgin/pgout routines.
	 */
	DB_MPREG *pg_inout;
	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;

					/* List of DB_MPOOLFILE's. */
	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;

	/*
	 * The dbenv and reginfo fields are not thread protected, as they are
	 * initialized during mpool creation, and not modified again.
	 */
	DB_ENV	   *dbenv;		/* Enclosing environment. */
	REGINFO	   *reginfo;		/* Underlying cache regions. */
};

/*
 * DB_MPREG --
 *	DB_MPOOL registry of pgin/pgout functions.
 */
struct __db_mpreg {
	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */

	int32_t ftype;			/* File type. */
					/* Pgin, pgout routines. */
	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
};

/*
 * File hashing --
 *	We hash each file to hash bucket based on its fileid
 *	or, in the case of in memory files, its name.
 */

/* Number of file hash buckets, a small prime number */
#define	MPOOL_FILE_BUCKETS	17

#define	FHASH(id, len)	__ham_func5(NULL, id, (u_int32_t)(len))

#define	FNBUCKET(id, len)						\
	(FHASH(id, len) % MPOOL_FILE_BUCKETS)

/* Macros to lock/unlock the mpool region as a whole. */
#define	MPOOL_SYSTEM_LOCK(dbenv)					\
	MUTEX_LOCK(dbenv, ((MPOOL *)					\
	    (dbenv)->mp_handle->reginfo[0].primary)->mtx_region)
#define	MPOOL_SYSTEM_UNLOCK(dbenv)					\
	MUTEX_UNLOCK(dbenv, ((MPOOL *)					\
	    (dbenv)->mp_handle->reginfo[0].primary)->mtx_region)

/* Macros to lock/unlock a specific mpool region. */
#define	MPOOL_REGION_LOCK(dbenv, infop)					\
	MUTEX_LOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region)
#define	MPOOL_REGION_UNLOCK(dbenv, infop)				\
	MUTEX_UNLOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region)

/*
 * MPOOL --
 *	Shared memory pool region.
 */
struct __mpool {
	/*
	 * The memory pool can be broken up into individual pieces/files.
	 * There are two reasons for this: firstly, on Solaris you can allocate
	 * only a little more than 2GB of memory in a contiguous chunk,
	 * and I expect to see more systems with similar issues.  Secondly,
	 * applications can add / remove pieces to dynamically resize the
	 * cache.
	 *
	 * While this structure is duplicated in each piece of the cache,
	 * the first of these pieces/files describes the entire pool, the
	 * second only describe a piece of the cache.
	 */
	db_mutex_t	mtx_region;	/* Region mutex. */
	db_mutex_t	mtx_resize;	/* Resizing mutex. */

	/*
	 * The lsn field and list of underlying MPOOLFILEs are thread protected
	 * by the region lock.
	 */
	DB_LSN	  lsn;			/* Maximum checkpoint LSN. */

	/* Configuration information: protected by the region lock. */
	u_int32_t max_nreg;		/* Maximum number of regions. */
	size_t    mp_mmapsize;		/* Maximum file size for mmap. */
	int       mp_maxopenfd;		/* Maximum open file descriptors. */
	int       mp_maxwrite;		/* Maximum buffers to write. */
	db_timeout_t mp_maxwrite_sleep;	/* Sleep after writing max buffers. */

	/*
	 * The number of regions and the total number of hash buckets across
	 * all regions.
	 * These fields are not protected by a mutex because we assume that we
	 * can read a 32-bit value atomically.  They are only modified by cache
	 * resizing which holds the mpool resizing mutex to ensure that
	 * resizing is single-threaded.  See the comment in mp_resize.c for
	 * more information.
	 */
	u_int32_t nreg;			/* Number of underlying REGIONS. */
	u_int32_t nbuckets;		/* Total number of hash buckets. */

	/*
	 * The regid field is protected by the resize mutex.
	 */
	roff_t	  regids;		/* Array of underlying REGION Ids. */

	roff_t	  ftab;			/* Hash table of files. */

	/*
	 * The following fields describe the per-cache portion of the region.
	 *
	 * The htab and htab_buckets fields are not thread protected as they
	 * are initialized during mpool creation, and not modified again.
	 *
	 * The last_checked and lru_count fields are thread protected by
	 * the region lock.
	 */
	roff_t	  htab;			/* Hash table offset. */
	u_int32_t htab_buckets;		/* Number of hash table entries. */
	u_int32_t last_checked;		/* Last bucket checked for free. */
	u_int32_t lru_count;		/* Counter for buffer LRU. */
	int32_t   lru_reset;		/* Hash bucket lru reset point. */

	/*
	 * The stat fields are generally not thread protected, and cannot be
	 * trusted.  Note that st_pages is an exception, and is always updated
	 * inside a region lock (although it is sometimes read outside of the
	 * region lock).
	 */
	DB_MPOOL_STAT stat;		/* Per-cache mpool statistics. */

	/*
	 * We track page puts so that we can decide when allocation is never
	 * going to succeed.  We don't lock the field, all we care about is
	 * if it changes.
	 */
	u_int32_t  put_counter;		/* Count of page put calls. */

	/*
	 * Cache flush operations take a long time...
	 *
	 * Some cache flush operations want to ignore the app's configured
	 * max-write parameters (they are trying to quickly shut down an
	 * environment, for example).  We can't specify that as an argument
	 * to the cache region functions, because we may decide to ignore
	 * the max-write configuration after the cache operation has begun.
	 * If the variable suppress_maxwrite is set, ignore the application
	 * max-write config.
	 *
	 * We may want to interrupt cache flush operations in high-availability
	 * configurations.
	 */
#define	DB_MEMP_SUPPRESS_WRITE	0x01
#define	DB_MEMP_SYNC_INTERRUPT	0x02
	u_int32_t config_flags;

	/* Free frozen buffer headers, protected by the region lock. */
	SH_TAILQ_HEAD(__free_frozen) free_frozen;

	/* Allocated blocks of frozen buffer headers. */
	SH_TAILQ_HEAD(__alloc_frozen) alloc_frozen;
};

/*
 * NREGION --
 *	Select a cache region given the bucket number.
 */
#define	NREGION(mp, bucket)						\
	((bucket) / (mp)->htab_buckets)

/*
 * MP_HASH --
 *	 We make the assumption that early pages of the file are more likely
 *	 to be retrieved than the later pages, which means the top bits will
 *	 be more interesting for hashing as they're less likely to collide.
 *	 That said, as 512 8K pages represents a 4MB file, so only reasonably
 *	 large files will have page numbers with any other than the bottom 9
 *	 bits set.  We XOR in the MPOOL offset of the MPOOLFILE that backs the
 *	 page, since that should also be unique for the page.  We don't want
 *	 to do anything very fancy -- speed is more important to us than using
 *	 good hashing.
 *
 *	 Since moving to a dynamic hash, which boils down to using some of the
 *	 least significant bits of the hash value, we no longer want to use a
 *	 simple shift here, because it's likely with a bit shift that mf_offset
 *	 will be ignored, and pages from different files end up in the same
 *	 hash bucket.  Use a nearby prime instead.
 */
#define	MP_HASH(mf_offset, pgno)					\
	((pgno) ^ ((mf_offset) * 509))

/*
 * Inline the calculation of the mask, since we can't reliably store the mask
 * with the number of buckets in the region.
 *
 * This is equivalent to:
 *     mask = (1 << __db_log2(nbuckets)) - 1;
 */
#define	MP_MASK(nbuckets, mask) do {					\
	for (mask = 1; mask < (nbuckets); mask = (mask << 1) | 1)	\
		;							\
} while (0)

#define	MP_HASH_BUCKET(hash, nbuckets, mask, bucket) do {		\
	(bucket) = (hash) & (mask);					\
	if ((bucket) >= (nbuckets))					\
		(bucket) &= ((mask) >> 1);				\
} while (0)

#define	MP_BUCKET(mf_offset, pgno, nbuckets, bucket) do {		\
	u_int32_t __mask;						\
	MP_MASK(nbuckets, __mask);					\
	MP_HASH_BUCKET(MP_HASH(mf_offset, pgno), nbuckets,		\
	    __mask, bucket);						\
} while (0)

/*
 * MP_GET_REGION --
 *	Select the region for a given page.
 */
#define	MP_GET_REGION(dbmfp, pgno, infopp, ret) do {			\
	DB_MPOOL *__t_dbmp;						\
	MPOOL *__t_mp;							\
									\
	__t_dbmp = dbmfp->dbenv->mp_handle;				\
	__t_mp = __t_dbmp->reginfo[0].primary;				\
	if (__t_mp->max_nreg == 1) {					\
		*(infopp) = &__t_dbmp->reginfo[0];			\
	} else								\
		ret = __memp_get_bucket((dbmfp), (pgno), (infopp), NULL);\
} while (0)

/*
 * MP_GET_BUCKET --
 *	Select and lock the bucket for a given page.
 */
#define	MP_GET_BUCKET(dbmfp, pgno, infopp, hp, ret) do {		\
	DB_MPOOL *__t_dbmp;						\
	MPOOL *__t_mp;							\
	roff_t __t_mf_offset;						\
	u_int32_t __t_bucket;						\
									\
	__t_dbmp = (dbmfp)->dbenv->mp_handle;				\
	__t_mp = __t_dbmp->reginfo[0].primary;				\
	if (__t_mp->max_nreg == 1) {					\
		*(infopp) = &__t_dbmp->reginfo[0];			\
		__t_mf_offset = R_OFFSET(*(infopp), (dbmfp)->mfp);	\
		MP_BUCKET(__t_mf_offset, (pgno), __t_mp->nbuckets, __t_bucket);\
		(hp) = R_ADDR(*(infopp), __t_mp->htab);			\
		(hp) = &(hp)[__t_bucket];				\
		MUTEX_LOCK(dbenv, (hp)->mtx_hash);			\
		ret = 0;						\
	} else								\
		ret = __memp_get_bucket((dbmfp), (pgno), (infopp), &(hp));\
} while (0)

struct __db_mpool_hash {
	db_mutex_t	mtx_hash;	/* Per-bucket mutex. */
	db_mutex_t	mtx_io;		/* Buffer I/O mutex. */

	DB_HASHTAB	hash_bucket;	/* Head of bucket. */

	u_int32_t	hash_page_dirty;/* Count of dirty pages. */
	u_int32_t	hash_priority;	/* Minimum priority of bucket buffer. */

#ifndef __TEST_DB_NO_STATISTICS
	u_int32_t	hash_io_wait;	/* Count of I/O waits. */
	u_int32_t	hash_frozen;	/* Count of frozen buffers. */
	u_int32_t	hash_thawed;	/* Count of thawed buffers. */
	u_int32_t	hash_frozen_freed;/* Count of freed frozen buffers. */
#endif

	DB_LSN		old_reader;	/* Oldest snapshot reader (cached). */

#define	IO_WAITER	0x001		/* Thread is waiting on page. */
	u_int32_t	flags;
};

/*
 * The base mpool priority is 1/4th of the name space, or just under 2^30.
 * When the LRU counter wraps, we shift everybody down to a base-relative
 * value.
 */
#define	MPOOL_BASE_DECREMENT	(UINT32_MAX - (UINT32_MAX / 4))

/*
 * Mpool priorities from low to high.  Defined in terms of fractions of the
 * buffers in the pool.
 */
#define	MPOOL_PRI_VERY_LOW	-1	/* Dead duck.  Check and set to 0. */
#define	MPOOL_PRI_LOW		-2	/* Low. */
#define	MPOOL_PRI_DEFAULT	0	/* No adjustment -- special case.*/
#define	MPOOL_PRI_HIGH		10	/* With the dirty buffers. */
#define	MPOOL_PRI_DIRTY		10	/* Dirty gets a 10% boost. */
#define	MPOOL_PRI_VERY_HIGH	1	/* Add number of buffers in pool. */

/*
 * MPOOLFILE --
 *	Shared DB_MPOOLFILE information.
 */
struct __mpoolfile {
	db_mutex_t mutex;		/* MPOOLFILE mutex. */

	/* Protected by MPOOLFILE mutex. */
	u_int32_t mpf_cnt;		/* Ref count: DB_MPOOLFILEs. */
	u_int32_t block_cnt;		/* Ref count: blocks in cache. */
	db_pgno_t last_pgno;		/* Last page in the file. */
	db_pgno_t last_flushed_pgno;	/* Last page flushed to disk. */
	db_pgno_t orig_last_pgno;	/* Original last page in the file. */
	db_pgno_t maxpgno;		/* Maximum page number. */

	roff_t	  path_off;		/* File name location. */

	/* Protected by hash bucket mutex. */
	SH_TAILQ_ENTRY q;		/* List of MPOOLFILEs */

	/*
	 * The following are used for file compaction processing.
	 * They are only used when a thread is in the process
	 * of trying to move free pages to the end of the file.
	 * Other threads may look here when freeing a page.
	 * Protected by a lock on the metapage.
	 */
	u_int32_t free_ref;		/* Refcount to freelist. */
	u_int32_t free_cnt;		/* Count of free pages. */
	size_t	  free_size;		/* Allocated size of free list. */
	roff_t	  free_list;		/* Offset to free list. */

	/*
	 * We normally don't lock the deadfile field when we read it since we
	 * only care if the field is zero or non-zero.  We do lock on read when
	 * searching for a matching MPOOLFILE -- see that code for more detail.
	 */
	int32_t	  deadfile;		/* Dirty pages can be discarded. */

	u_int32_t bucket;		/* hash bucket for this file. */

	/*
	 * None of the following fields are thread protected.
	 *
	 * There are potential races with the ftype field because it's read
	 * without holding a lock.  However, it has to be set before adding
	 * any buffers to the cache that depend on it being set, so there
	 * would need to be incorrect operation ordering to have a problem.
	 */
	int32_t	  ftype;		/* File type. */

	/*
	 * There are potential races with the priority field because it's read
	 * without holding a lock.  However, a collision is unlikely and if it
	 * happens is of little consequence.
	 */
	int32_t   priority;		/* Priority when unpinning buffer. */

	/*
	 * There are potential races with the file_written field (many threads
	 * may be writing blocks at the same time), and with no_backing_file
	 * and unlink_on_close fields, as they may be set while other threads
	 * are reading them.  However, we only care if the field value is zero
	 * or non-zero, so don't lock the memory.
	 *
	 * !!!
	 * Theoretically, a 64-bit architecture could put two of these fields
	 * in a single memory operation and we could race.  I have never seen
	 * an architecture where that's a problem, and I believe Java requires
	 * that to never be the case.
	 *
	 * File_written is set whenever a buffer is marked dirty in the cache.
	 * It can be cleared in some cases, after all dirty buffers have been
	 * written AND the file has been flushed to disk.
	 */
	int32_t	  file_written;		/* File was written. */
	int32_t	  no_backing_file;	/* Never open a backing file. */
	int32_t	  unlink_on_close;	/* Unlink file on last close. */
	int32_t	  multiversion;		/* Number of DB_MULTIVERSION handles. */

	/*
	 * We do not protect the statistics in "stat" because of the cost of
	 * the mutex in the get/put routines.  There is a chance that a count
	 * will get lost.
	 */
	DB_MPOOL_FSTAT stat;		/* Per-file mpool statistics. */

	/*
	 * The remaining fields are initialized at open and never subsequently
	 * modified.
	 */
	int32_t	  lsn_off;		/* Page's LSN offset. */
	u_int32_t clear_len;		/* Bytes to clear on page create. */

	roff_t	  fileid_off;		/* File ID string location. */

	roff_t	  pgcookie_len;		/* Pgin/pgout cookie length. */
	roff_t	  pgcookie_off;		/* Pgin/pgout cookie location. */

	/*
	 * The flags are initialized at open and never subsequently modified.
	 */
#define	MP_CAN_MMAP		0x001	/* If the file can be mmap'd. */
#define	MP_DIRECT		0x002	/* No OS buffering. */
#define	MP_DURABLE_UNKNOWN	0x004	/* We don't care about durability. */
#define	MP_EXTENT		0x008	/* Extent file. */
#define	MP_FAKE_DEADFILE	0x010	/* Deadfile field: fake flag. */
#define	MP_FAKE_FILEWRITTEN	0x020	/* File_written field: fake flag. */
#define	MP_FAKE_NB		0x040	/* No_backing_file field: fake flag. */
#define	MP_FAKE_UOC		0x080	/* Unlink_on_close field: fake flag. */
#define	MP_NOT_DURABLE		0x100	/* File is not durable. */
#define	MP_TEMP			0x200	/* Backing file is a temporary. */
	u_int32_t  flags;
};

/*
 * Flags to __memp_bh_free.
 */
#define	BH_FREE_FREEMEM		0x01
#define	BH_FREE_REUSE		0x02
#define	BH_FREE_UNLOCKED	0x04

/*
 * BH --
 *	Buffer header.
 */
struct __bh {
	u_int16_t	ref;		/* Reference count. */
	u_int16_t	ref_sync;	/* Sync wait-for reference count. */

#define	BH_CALLPGIN	0x001		/* Convert the page before use. */
#define	BH_DIRTY	0x002		/* Page is modified. */
#define	BH_DIRTY_CREATE	0x004		/* Page is modified. */
#define	BH_DISCARD	0x008		/* Page is useless. */
#define	BH_FREED	0x010		/* Page was freed. */
#define	BH_FROZEN	0x020		/* Frozen buffer: allocate & re-read. */
#define	BH_LOCKED	0x040		/* Page is locked (I/O in progress). */
#define	BH_TRASH	0x080		/* Page is garbage. */
	u_int16_t	flags;

	u_int32_t	priority;	/* LRU priority. */
	SH_TAILQ_ENTRY	hq;		/* MPOOL hash bucket queue. */

	db_pgno_t	pgno;		/* Underlying MPOOLFILE page number. */
	roff_t		mf_offset;	/* Associated MPOOLFILE offset. */

	roff_t		td_off;		/* MVCC: creating TXN_DETAIL offset. */
	SH_CHAIN_ENTRY	vc;		/* MVCC: version chain. */
#ifdef DIAG_MVCC
	u_int16_t	align_off;	/* Alignment offset for diagnostics.*/
#endif

	/*
	 * !!!
	 * This array must be at least size_t aligned -- the DB access methods
	 * put PAGE and other structures into it, and then access them directly.
	 * (We guarantee size_t alignment to applications in the documentation,
	 * too.)
	 */
	u_int8_t   buf[1];		/* Variable length data. */
};

/*
 * BH_FROZEN_PAGE --
 *	Data used to find a frozen buffer header.
 */
struct __bh_frozen_p {
	BH header;
	db_pgno_t	spgno;		/* Page number in freezer file. */
};

/*
 * BH_FROZEN_ALLOC --
 *	Frozen buffer headers are allocated a page at a time in general.  This
 *	structure is allocated at the beginning of the page so that the
 *	allocation chunks can be tracked and freed (for private environments).
 */
struct __bh_frozen_a {
	SH_TAILQ_ENTRY links;
};

#define	MULTIVERSION(dbp)	((dbp)->mpf->mfp->multiversion)
#define	IS_DIRTY(p)							\
    F_ISSET((BH *)((u_int8_t *)(p) - SSZA(BH, buf)), BH_DIRTY)

#define	BH_OWNER(dbenv, bhp)						\
    ((TXN_DETAIL *)R_ADDR(&dbenv->tx_handle->reginfo, bhp->td_off))

#define	BH_OWNED_BY(dbenv, bhp, txn)	((txn) != NULL &&		\
    (bhp)->td_off != INVALID_ROFF &&					\
    (txn)->td == BH_OWNER(dbenv, bhp))

#define	BH_PRIORITY(bhp)						\
    (SH_CHAIN_SINGLETON(bhp, vc) ? (bhp)->priority :			\
     __memp_bh_priority(bhp))

#define	VISIBLE_LSN(dbenv, bhp)						\
    (&BH_OWNER(dbenv, bhp)->visible_lsn)

#define	BH_OBSOLETE(bhp, old_lsn)	((SH_CHAIN_HASNEXT(bhp, vc) ?	\
	LOG_COMPARE(&(old_lsn), VISIBLE_LSN(dbenv,			\
	SH_CHAIN_NEXTP(bhp, vc, __bh))) :				\
	(bhp->td_off == INVALID_ROFF ? 1 :				\
	LOG_COMPARE(&(old_lsn), VISIBLE_LSN(dbenv, bhp)))) > 0)

#define	MVCC_SKIP_CURADJ(dbc, pgno)					\
    (dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT) &&		\
    dbc->txn->td != NULL && __memp_skip_curadj(dbc, pgno))

#if defined(DIAG_MVCC) && defined(HAVE_MPROTECT)
#define	VM_PAGESIZE 4096
#define	MVCC_BHSIZE(mfp, sz) do {					\
	sz += VM_PAGESIZE + sizeof(BH);					\
	if (mfp->stat.st_pagesize < VM_PAGESIZE)			\
		sz += VM_PAGESIZE - mfp->stat.st_pagesize;		\
} while (0)

#define	MVCC_BHALIGN(mfp, p) do {					\
	if (mfp != NULL) {						\
		BH *__bhp;						\
		void *__orig = (p);					\
		p = ALIGNP_INC(p, VM_PAGESIZE);				\
		if ((u_int8_t *)p < (u_int8_t *)__orig + sizeof(BH))	\
			p = (u_int8_t *)p + VM_PAGESIZE;		\
		__bhp = (BH *)((u_int8_t *)p - SSZA(BH, buf));		\
		DB_ASSERT(dbenv,					\
		    ((uintptr_t)__bhp->buf & (VM_PAGESIZE - 1)) == 0);	\
		DB_ASSERT(dbenv,					\
		    (u_int8_t *)__bhp >= (u_int8_t *)__orig);		\
		DB_ASSERT(dbenv, (u_int8_t *)p + mfp->stat.st_pagesize <\
		    (u_int8_t *)__orig + len);				\
		__bhp->align_off =					\
		    (u_int16_t)((u_int8_t *)__bhp - (u_int8_t *)__orig);\
		p = __bhp;						\
	}								\
} while (0)

#define	MVCC_BHUNALIGN(mfp, p) do {					\
	if ((mfp) != NULL) {						\
		BH *bhp = (BH *)(p);					\
		(p) = ((u_int8_t *)bhp - bhp->align_off);		\
	}								\
} while (0)

#ifdef linux
#define	MVCC_MPROTECT(buf, sz, mode) do {				\
	int __ret = mprotect((buf), (sz), (mode));			\
	DB_ASSERT(dbenv, __ret == 0);					\
} while (0)
#else
#define	MVCC_MPROTECT(buf, sz, mode) do {				\
	if (!F_ISSET(dbenv, DB_ENV_PRIVATE | DB_ENV_SYSTEM_MEM)) {	\
		int __ret = mprotect((buf), (sz), (mode));		\
		DB_ASSERT(dbenv, __ret == 0);				\
	}								\
} while (0)
#endif /* linux */

#else /* defined(DIAG_MVCC) && defined(HAVE_MPROTECT) */
#define	MVCC_BHSIZE(mfp, sz) do {} while (0)
#define	MVCC_BHALIGN(mfp, p) do {} while (0)
#define	MVCC_BHUNALIGN(mfp, p) do {} while (0)
#define	MVCC_MPROTECT(buf, size, mode) do {} while (0)
#endif

/*
 * Flags to __memp_ftruncate.
 */
#define	MP_TRUNC_RECOVER	0x01

#if defined(__cplusplus)
}
#endif

#include "dbinc_auto/mp_ext.h"
#endif /* !_DB_MP_H_ */