summaryrefslogtreecommitdiff
path: root/db/dbinc/region.h
blob: 805acb1ea9c41013954737c642019c66129fad45 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1998-2004
 *	Sleepycat Software.  All rights reserved.
 *
 * $Id: region.h,v 11.50 2004/09/15 21:49:12 mjc Exp $
 */

#ifndef _DB_REGION_H_
#define	_DB_REGION_H_

/*
 * The DB environment consists of some number of "regions", which are described
 * by the following four structures:
 *
 *	REGENV	   -- shared information about the environment
 *	REGENV_REF -- file describing system memory version of REGENV
 *	REGION	   -- shared information about a single region
 *	REGINFO	   -- per-process information about a REGION
 *
 * There are three types of memory that hold regions:
 *	per-process heap (malloc)
 *	file mapped into memory (mmap, MapViewOfFile)
 *	system memory (shmget, CreateFileMapping)
 *
 * If the regions are private to a process, they're in malloc.  If they're
 * public, they're in file mapped memory, or, optionally, in system memory.
 * Regions in the filesystem are named "__db.001", "__db.002" and so on.  If
 * we're not using a private environment allocated using malloc(3), the file
 * "__db.001" will always exist, as we use it to synchronize on the regions,
 * whether they exist in file mapped memory or system memory.
 *
 * The file "__db.001" contains a REGENV structure and a linked list of some
 * number of REGION structures.  Each of the REGION structures describes and
 * locks one of the underlying shared regions used by DB.
 *
 *	__db.001
 *	+---------+
 *	|REGENV  |
 *	+---------+   +----------+
 *	|REGION   |-> | __db.002 |
 *	|	  |   +----------+
 *	+---------+   +----------+
 *	|REGION   |-> | __db.003 |
 *	|	  |   +----------+
 *	+---------+   +----------+
 *	|REGION   |-> | __db.004 |
 *	|	  |   +----------+
 *	+---------+
 *
 * The only tricky part about manipulating the regions is correctly creating
 * or joining the REGENV file, i.e., __db.001.  We have to be absolutely sure
 * that only one process creates it, and that everyone else joins it without
 * seeing inconsistent data.  Once that region is created, we can use normal
 * shared locking procedures to do mutual exclusion for all other regions.
 *
 * One of the REGION structures in the main environment region describes the
 * environment region itself.
 *
 * To lock a region, locate the REGION structure that describes it and acquire
 * the region's mutex.  There is one exception to this rule -- the lock for the
 * environment region itself is in the REGENV structure, and not in the REGION
 * that describes the environment region.  That's so that we can acquire a lock
 * without walking linked lists that could potentially change underneath us.
 * The REGION will not be moved or removed during the life of the region, and
 * so long-lived references to it can be held by the process.
 *
 * All requests to create or join a region return a REGINFO structure, which
 * is held by the caller and used to open and subsequently close the reference
 * to the region.  The REGINFO structure contains the per-process information
 * that we need to access the region.
 *
 * The one remaining complication.  If the regions (including the environment
 * region) live in system memory, and the system memory isn't "named" somehow
 * in the filesystem name space, we need some way of finding it.  Do this by
 * by writing the REGENV_REF structure into the "__db.001" file.  When we find
 * a __db.001 file that is too small to be a real, on-disk environment, we use
 * the information it contains to redirect to the real "__db.001" file/memory.
 * This currently only happens when the REGENV file is in shared system memory.
 *
 * Although DB does not currently grow regions when they run out of memory, it
 * would be possible to do so.  To grow a region, allocate a new region of the
 * appropriate size, then copy the old region over it and insert the additional
 * space into the already existing shalloc arena.  Callers may have to fix up
 * local references, but that should be easy to do.  This failed in historic
 * versions of DB because the region lock lived in the mapped memory, and when
 * it was unmapped and remapped (or copied), threads could lose track of it.
 * Once we moved that lock into a region that is never unmapped, growing should
 * work.  That all said, current versions of DB don't implement region grow
 * because some systems don't support mutex copying, e.g., from OSF1 V4.0:
 *
 *	The address of an msemaphore structure may be significant.  If the
 *	msemaphore structure contains any value copied from an msemaphore
 *	structure at a different address, the result is undefined.
 */

#if defined(__cplusplus)
extern "C" {
#endif

#define	DB_REGION_PREFIX	"__db"		/* DB file name prefix. */
#define	DB_REGION_FMT		"__db.%03d"	/* Region file name format. */
#define	DB_REGION_ENV		"__db.001"	/* Primary environment name. */
#define	DB_REGION_NAME_LENGTH	8		/* Length of file names. */

#define	INVALID_REGION_ID	0	/* Out-of-band region ID. */
#define	REGION_ID_ENV		1	/* Primary environment ID. */

typedef enum {
	INVALID_REGION_TYPE=0,		/* Region type. */
	REGION_TYPE_ENV,
	REGION_TYPE_LOCK,
	REGION_TYPE_LOG,
	REGION_TYPE_MPOOL,
	REGION_TYPE_MUTEX,
	REGION_TYPE_TXN } reg_type_t;

#define	INVALID_REGION_SEGID	-1	/* Segment IDs are either shmget(2) or
					 * Win16 segment identifiers.  They are
					 * both stored in a "long", and we need
					 * an out-of-band value.
					 */
/*
 * Nothing can live at region offset 0, because, in all cases, that's where
 * we store *something*.  Lots of code needs an out-of-band value for region
 * offsets, so we use 0.
 */
#define	INVALID_ROFF		0

/* Reference describing system memory version of REGENV. */
typedef struct __db_reg_env_ref {
	roff_t	   size;		/* Region size. */
	long	   segid;		/* UNIX shmget ID, VxWorks ID. */
} REGENV_REF;

/* Per-environment region information. */
typedef struct __db_reg_env {
	/*
	 * !!!
	 * The mutex must be the first entry in the structure to guarantee
	 * correct alignment.
	 */
	DB_MUTEX   mutex;		/* Environment mutex. */

	/*
	 * !!!
	 * Note, the magic and panic fields are NOT protected by any mutex,
	 * and for this reason cannot be anything more complicated than a
	 * zero/non-zero value.
	 */
	u_int32_t  magic;		/* Valid region magic number. */
	u_int32_t  envid;		/* Unique environment ID. */

	int	   envpanic;		/* Environment is dead. */

	int	   majver;		/* Major DB version number. */
	int	   minver;		/* Minor DB version number. */
	int	   patch;		/* Patch DB version number. */

	u_int32_t  init_flags;		/* Flags the env was initialized with.*/
	roff_t	   cipher_off;		/* Offset of cipher area */

					/* List of regions. */
	SH_LIST_HEAD(__db_regionh) regionq;

	u_int32_t  refcnt;		/* References to the environment. */

	roff_t	   rep_off;		/* Offset of the replication area. */
#define	DB_REGENV_REPLOCKED	0x0001	/* Env locked for rep backup. */
	u_int32_t  flags;		/* Shared environment flags. */
#define	DB_REGENV_TIMEOUT	30	/* Backup timeout. */
	time_t	   op_timestamp;	/* Timestamp for operations. */
	time_t	   rep_timestamp;	/* Timestamp for rep db handles. */

	size_t	   pad;			/* Guarantee that following memory is
					 * size_t aligned.  This is necessary
					 * because we're going to store the
					 * allocation region information there.
					 */
} REGENV;

/* Per-region shared region information. */
typedef struct __db_region {
	/*
	 * !!!
	 * The mutex must be the first entry in the structure to guarantee
	 * correct alignment.
	 */
	DB_MUTEX   mutex;		/* Region mutex. */

	SH_LIST_ENTRY q;		/* Linked list of REGIONs. */

	reg_type_t type;		/* Region type. */
	u_int32_t  id;			/* Region id. */

	roff_t	   size_orig;		/* Region size in bytes (original). */
	roff_t	   size;		/* Region size in bytes (adjusted). */

	roff_t	   primary;		/* Primary data structure offset. */

	long	   segid;		/* UNIX shmget(2), Win16 segment ID. */
} REGION;

/*
 * Per-process/per-attachment information about a single region.
 */
struct __db_reginfo_t {		/* __db_r_attach IN parameters. */
	DB_ENV	   *dbenv;		/* Enclosing environment. */
	reg_type_t  type;		/* Region type. */
	u_int32_t   id;			/* Region id. */

				/* __db_r_attach OUT parameters. */
	REGION	   *rp;			/* Shared region. */

	char	   *name;		/* Region file name. */

	void	   *addr_orig;		/* Region address (original). */
	void	   *addr;		/* Region address (adjusted). */
	void	   *primary;		/* Primary data structure address. */

	size_t	    max_alloc;		/* Maximum bytes allocated. */
	size_t	    allocated;		/* Bytes allocated. */

#ifdef DB_WIN32
	HANDLE	wnt_handle;		/* Win/NT HANDLE. */
#endif

#define	REGION_CREATE		0x01	/* Caller created region. */
#define	REGION_CREATE_OK	0x02	/* Caller willing to create region. */
#define	REGION_JOIN_OK		0x04	/* Caller is looking for a match. */
	u_int32_t   flags;
};

/*
 * Mutex maintenance information each subsystem region must keep track
 * of to manage resources adequately.
 */
typedef struct __db_regmaint_stat_t {
	u_int32_t	st_hint_hit;
	u_int32_t	st_hint_miss;
	u_int32_t	st_records;
	u_int32_t	st_clears;
	u_int32_t	st_destroys;
	u_int32_t	st_max_locks;
} REGMAINT_STAT;

typedef struct __db_regmaint_t {
	u_int32_t  reglocks;		/* Maximum # of mutexes we track. */
	u_int32_t  regmutex_hint;	/* Hint for next slot */
	REGMAINT_STAT stat;		/* Stats */
	roff_t	   regmutexes[1];	/* Region mutexes in use. */
} REGMAINT;

/*
 * R_ADDR	Return a per-process address for a shared region offset.
 * R_OFFSET	Return a shared region offset for a per-process address.
 */
#define	R_ADDR(dbenv, base, offset)					\
	(F_ISSET((dbenv), DB_ENV_PRIVATE) ? (void *)(offset) :		\
	(void *)((u_int8_t *)((base)->addr) + (offset)))
#define	R_OFFSET(dbenv, base, p)					\
	(F_ISSET((dbenv), DB_ENV_PRIVATE) ? (roff_t)(p) :		\
	(roff_t)((u_int8_t *)(p) - (u_int8_t *)(base)->addr))

/*
 * R_LOCK	Lock/unlock a region.
 * R_UNLOCK
 */
#define	R_LOCK(dbenv, reginfo)						\
	MUTEX_LOCK(dbenv, &(reginfo)->rp->mutex)
#define	R_UNLOCK(dbenv, reginfo)					\
	MUTEX_UNLOCK(dbenv, &(reginfo)->rp->mutex)

/* PANIC_CHECK:	Check to see if the DB environment is dead. */
#define	PANIC_CHECK(dbenv)						\
	if (!F_ISSET((dbenv), DB_ENV_NOPANIC) &&			\
	    (dbenv)->reginfo != NULL && ((REGENV *)			\
	    ((REGINFO *)(dbenv)->reginfo)->primary)->envpanic != 0)	\
		return (__db_panic_msg(dbenv));

#define	PANIC_SET(dbenv, onoff)						\
	if ((dbenv)->reginfo != NULL)					\
		((REGENV *)((REGINFO *)					\
		    (dbenv)->reginfo)->primary)->envpanic = (onoff);

/*
 * All regions are created on 8K boundaries out of sheer paranoia, so we
 * don't make some underlying VM unhappy. Make sure we don't overflow or
 * underflow.
 */
#define	OS_VMPAGESIZE		(8 * 1024)
#define	OS_VMROUNDOFF(i) {						\
	if ((i) <							\
	    (UINT32_MAX - OS_VMPAGESIZE) + 1 || (i) < OS_VMPAGESIZE)	\
		(i) += OS_VMPAGESIZE - 1;				\
	(i) -= (i) % OS_VMPAGESIZE;					\
}

#if defined(__cplusplus)
}
#endif
#endif /* !_DB_REGION_H_ */