summaryrefslogtreecommitdiff
path: root/db/dbinc/region.h
blob: 84490f804600b0d2fc927dfaae007b63ea5ae79c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1998,2007 Oracle.  All rights reserved.
 *
 * $Id: region.h,v 12.14 2007/05/17 15:15:05 bostic Exp $
 */

#ifndef _DB_REGION_H_
#define	_DB_REGION_H_

/*
 * The DB environment consists of some number of "regions", which are described
 * by the following four structures:
 *
 *	REGENV	   -- shared information about the environment
 *	REGENV_REF -- file describing system memory version of REGENV
 *	REGION	   -- shared information about a single region
 *	REGINFO	   -- per-process information about a REGION
 *
 * There are three types of memory that hold regions:
 *	per-process heap (malloc)
 *	file mapped into memory (mmap, MapViewOfFile)
 *	system memory (shmget, CreateFileMapping)
 *
 * By default, regions are created in filesystem-backed shared memory.  They
 * can also be created in system shared memory (DB_SYSTEM_MEM), or, if private
 * to a process, in heap memory (DB_PRIVATE).
 *
 * Regions in the filesystem are named "__db.001", "__db.002" and so on.  If
 * we're not using a private environment allocated in heap, "__db.001" will
 * always exist, as we use it to synchronize on the regions, whether they are
 * in filesystem-backed memory or system memory.
 *
 * The file "__db.001" contains a REGENV structure and an array of REGION
 * structures.  Each REGION structure describes an underlying chunk of
 * shared memory.
 *
 *	__db.001
 *	+---------+
 *	|REGENV   |
 *	+---------+   +----------+
 *	|REGION   |-> | __db.002 |
 *	|	  |   +----------+
 *	+---------+   +----------+
 *	|REGION   |-> | __db.003 |
 *	|	  |   +----------+
 *	+---------+   +----------+
 *	|REGION   |-> | __db.004 |
 *	|	  |   +----------+
 *	+---------+
 *
 * The tricky part about manipulating the regions is creating or joining the
 * database environment.  We have to be sure only a single thread of control
 * creates and/or recovers a database environment.  All other threads should
 * then join without seeing inconsistent data.
 *
 * We do this in two parts: first, we use the underlying O_EXCL flag to the
 * open system call to serialize creation of the __db.001 file.  The thread
 * of control creating that file then proceeds to create the remaining
 * regions in the environment, including the mutex region.  Once the mutex
 * region has been created, the creating thread of control fills in the
 * __db.001 file's magic number.  Other threads of control (the ones that
 * didn't create the __db.001 file), wait on the initialization of the
 * __db.001 file's magic number.  After it has been initialized, all threads
 * of control can proceed, using normal shared mutex locking procedures for
 * exclusion.
 *
 * REGIONs are not moved or removed during the life of the environment, and
 * so processes can have long-lived references to them.
 *
 * One of the REGION structures describes the environment region itself.
 *
 * The REGION array is not locked in any way.  It's an array so we don't have
 * to manipulate data structures after a crash -- on some systems, we have to
 * join and clean up the mutex region after application failure.  Using an
 * array means we don't have to worry about broken links or other nastiness
 * after the failure.
 *
 * All requests to create or join a region return a REGINFO structure, which
 * is held by the caller and used to open and subsequently close the reference
 * to the region.  The REGINFO structure contains the per-process information
 * that we need to access the region.
 *
 * The one remaining complication.  If the regions (including the environment
 * region) live in system memory, and the system memory isn't "named" somehow
 * in the filesystem name space, we need some way of finding it.  Do this
 * by writing the REGENV_REF structure into the "__db.001" file.  When we find
 * a __db.001 file that is too small to be a real, on-disk environment, we use
 * the information it contains to redirect to the real "__db.001" file/memory.
 * This currently only happens when the REGENV file is in shared system memory.
 *
 * Although DB does not currently grow regions when they run out of memory, it
 * would be possible to do so.  To grow a region, allocate a new region of the
 * appropriate size, then copy the old region over it and insert the additional
 * memory into the already existing shalloc arena.  Region users must reset
 * their base addresses and any local pointers into the memory, of course.
 * This failed in historic versions of DB because the region mutexes lived in
 * the mapped memory, and when it was unmapped and remapped (or copied),
 * threads could lose track of it.  Also, some systems didn't support mutex
 * copying, e.g., from OSF1 V4.0:
 *
 *	The address of an msemaphore structure may be significant.  If the
 *	msemaphore structure contains any value copied from an msemaphore
 *	structure at a different address, the result is undefined.
 *
 * All mutexes are now maintained in a separate region which is never unmapped,
 * so growing regions should be possible.
 */

#if defined(__cplusplus)
extern "C" {
#endif

/*
 * Region file naming.
 *
 * DB_REGION_FMT turns a region number into a name with a zero-padded,
 * three-digit suffix; DB_REGION_ENV is that format applied to region 1.
 * DB_REGION_NAME_LENGTH equals strlen("__db.001") and does not count a
 * terminating NUL.
 */
#define	DB_REGION_PREFIX	"__db"		/* DB file name prefix. */
#define	DB_REGION_FMT		"__db.%03d"	/* Region file name format. */
#define	DB_REGION_ENV		"__db.001"	/* Primary environment name. */
#define	DB_REGION_NAME_LENGTH	8		/* Length of file names. */

/*
 * Region IDs.  Zero is reserved as the "no region" value; the primary
 * environment region always has ID 1.
 */
#define	INVALID_REGION_ID	0	/* Out-of-band region ID. */
#define	REGION_ID_ENV		1	/* Primary environment ID. */

/*
 * reg_type_t --
 *	The kind of region a REGION/REGINFO describes.  Zero is the
 *	out-of-band (invalid) type.  The type is stored in the shared REGION
 *	structure, so each value is written out explicitly; the numbers are
 *	exactly those implied by declaration order.
 */
typedef enum {
	INVALID_REGION_TYPE	= 0,	/* Out-of-band region type. */
	REGION_TYPE_ENV		= 1,	/* Environment region. */
	REGION_TYPE_LOCK	= 2,	/* Lock region. */
	REGION_TYPE_LOG		= 3,	/* Log region. */
	REGION_TYPE_MPOOL	= 4,	/* Mpool region. */
	REGION_TYPE_MUTEX	= 5,	/* Mutex region. */
	REGION_TYPE_TXN		= 6	/* Txn region. */
} reg_type_t;

/*
 * Segment IDs are either shmget(2) or Win16 segment identifiers.  They are
 * both stored in a "long", and we need an out-of-band value.
 *
 * The expansion is parenthesized so the negative constant remains a single
 * operand wherever the macro is substituted into a larger expression.
 */
#define	INVALID_REGION_SEGID	(-1)
/*
 * Nothing can live at region offset 0, because, in all cases, that's where
 * we store *something*.  Lots of code needs an out-of-band value for region
 * offsets, so we use 0.
 */
#define	INVALID_ROFF		0	/* Out-of-band region offset. */

/*
 * REGENV_REF --
 *	Reference describing system memory version of REGENV.
 *
 * When the environment region lives in system memory, this structure is
 * what gets written into the "__db.001" file so the real region can be
 * found from the filesystem name space.
 */
typedef struct __db_reg_env_ref {
	roff_t	   size;		/* Region size. */
	long	   segid;		/* UNIX shmget ID, VxWorks ID. */
} REGENV_REF;

/*
 * REGENV --
 *	Per-environment region information, shared by every thread of
 *	control that joins the environment.
 */
typedef struct __db_reg_env {
	/*
	 * !!!
	 * The magic, panic, version and envid fields of the region are fixed
	 * in size, the timestamp field is the first field which is variable
	 * length.  These fields must never change in order, to guarantee we
	 * can always read them, no matter what Berkeley DB release we have.
	 *
	 * !!!
	 * The magic and panic fields are NOT protected by any mutex, and for
	 * this reason cannot be anything more complicated than zero/non-zero.
	 */
	u_int32_t magic;		/* Valid region magic number. */
	u_int32_t panic;		/* Environment is dead. */

	u_int32_t majver;		/* Major DB version number. */
	u_int32_t minver;		/* Minor DB version number. */
	u_int32_t patchver;		/* Patch DB version number. */

	u_int32_t envid;		/* Unique environment ID. */

	time_t	  timestamp;		/* Creation time. */

	u_int32_t init_flags;		/* Flags environment initialized with.*/

	/*
	 * The mtx_regenv mutex protects the environment reference count and
	 * memory allocation from the primary shared region (the crypto, thread
	 * control block and replication implementations allocate memory from
	 * the primary shared region).
	 *
	 * The rest of the fields are initialized at creation time, and don't
	 * need mutex protection.  The flags, op_timestamp and rep_timestamp
	 * fields are used by replication only and are protected by the
	 * replication mutex.  The rep_timestamp is not protected when it
	 * is used in recovery as that is already single threaded.
	 */
	db_mutex_t mtx_regenv;		/* Refcnt, region allocation mutex. */
	u_int32_t  refcnt;		/* References to the environment. */

	u_int32_t region_cnt;		/* Number of REGIONs. */
	roff_t	  region_off;		/* Offset of region array */

	roff_t	  cipher_off;		/* Offset of cipher area */

	roff_t	  thread_off;		/* Offset of the thread area. */

	roff_t	  rep_off;		/* Offset of the replication area. */
#define	DB_REGENV_REPLOCKED	0x0001	/* Env locked for rep backup. */
	u_int32_t flags;		/* Shared environment flags. */
	/* NOTE(review): timeout units are not stated here -- confirm. */
#define	DB_REGENV_TIMEOUT	30	/* Backup timeout. */
	time_t	  op_timestamp;		/* Timestamp for operations. */
	time_t	  rep_timestamp;	/* Timestamp for rep db handles. */

	size_t	pad;			/* Guarantee that following memory is
					 * size_t aligned.  This is necessary
					 * because we're going to store the
					 * allocation region information there.
					 */
} REGENV;

/*
 * REGION --
 *	Per-region shared region information.  One of these exists for each
 *	region in the environment; REGENV's region_cnt and region_off fields
 *	give the number of REGIONs and the offset of the REGION array.
 */
typedef struct __db_region {
	u_int32_t	id;		/* Region id. */
	reg_type_t	type;		/* Region type. */

	roff_t	size_orig;		/* Region size in bytes (original). */
	roff_t	size;			/* Region size in bytes (adjusted). */

	roff_t	primary;		/* Primary data structure offset. */

	long	segid;			/* UNIX shmget(2), Win16 segment ID. */
} REGION;

/*
 * Per-process/per-attachment information about a single region.
 *
 * The fields are grouped, as labelled below, into the values passed in to
 * __env_region_attach and the values it hands back.  The addresses stored
 * here are per-process addresses; R_ADDR and R_OFFSET use the addr field to
 * translate between them and shared region offsets.
 *
 * NOTE(review): the REGINFO type name used by PANIC_ISSET is not declared
 * in this header -- presumably it is a typedef of this struct; confirm.
 */
struct __db_reginfo_t {		/* __env_region_attach IN parameters. */
	DB_ENV	   *dbenv;		/* Enclosing environment. */
	reg_type_t  type;		/* Region type. */
	u_int32_t   id;			/* Region id. */

				/* __env_region_attach OUT parameters. */
	REGION	   *rp;			/* Shared region. */

	char	   *name;		/* Region file name. */

	void	   *addr_orig;		/* Region address (original). */
	void	   *addr;		/* Region address (adjusted). */
	void	   *primary;		/* Primary data structure address. */

	size_t	    max_alloc;		/* Maximum bytes allocated. */
	size_t	    allocated;		/* Bytes allocated. */

#ifdef DB_WIN32
	HANDLE	wnt_handle;		/* Win/NT HANDLE. */
#endif

#define	REGION_CREATE		0x01	/* Caller created region. */
#define	REGION_CREATE_OK	0x02	/* Caller willing to create region. */
#define	REGION_JOIN_OK		0x04	/* Caller is looking for a match. */
	u_int32_t   flags;		/* REGION_* flags above. */
};

/*
 * R_ADDR	Map a shared region offset to an address in this process.
 * R_OFFSET	Map an address in this process to a shared region offset.
 *
 * Unless the environment is DB_ENV_PRIVATE, an offset is a byte distance
 * from the region's (adjusted) base address, reginfop->addr.  In a private
 * environment no translation is done: the "offset" is the address itself,
 * and the macros merely cast between the two representations.
 *
 * Both macros evaluate reginfop more than once.
 */
#define	R_ADDR(reginfop, offset)					\
	(!(F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE)) ?		\
	    (void *)((u_int8_t *)((reginfop)->addr) + (offset)) :	\
	    (void *)(offset))
#define	R_OFFSET(reginfop, p)						\
	(!(F_ISSET((reginfop)->dbenv, DB_ENV_PRIVATE)) ?		\
	    (roff_t)((u_int8_t *)(p) - (u_int8_t *)(reginfop)->addr) :	\
	    (roff_t)(p))

/*
 * PANIC_ISSET, PANIC_CHECK:
 *	Check to see if the DB environment is dead.
 *
 * PANIC_ISSET is true when dbenv->reginfo is non-NULL, the panic field of
 * the REGENV found through reginfo->primary is non-zero, and DB_ENV_NOPANIC
 * is not set on dbenv.  The NULL test is first, so the REGENV is never
 * dereferenced when no region information is attached.
 *
 * PANIC_CHECK makes the *calling* function return __db_panic_msg(dbenv)
 * when PANIC_ISSET is true.
 *
 * !!!
 * PANIC_CHECK expands to an unbraced "if" statement that already carries
 * its own trailing semicolon.  Using it as the unbraced body of an if/else
 * will either fail to compile or bind the "else" to the wrong "if"; wrap
 * such uses in braces.
 */
#define	PANIC_ISSET(dbenv)						\
	((dbenv)->reginfo != NULL && ((REGENV *)			\
	    ((REGINFO *)(dbenv)->reginfo)->primary)->panic != 0 &&	\
	    !F_ISSET((dbenv), DB_ENV_NOPANIC))

#define	PANIC_CHECK(dbenv)						\
	if (PANIC_ISSET(dbenv))						\
		return (__db_panic_msg(dbenv));

#if defined(__cplusplus)
}
#endif
#endif /* !_DB_REGION_H_ */