00001 /*- 00002 * See the file LICENSE for redistribution information. 00003 * 00004 * Copyright (c) 1998, 1999, 2000 00005 * Sleepycat Software. All rights reserved. 00006 * 00007 * $Id: region_8h-source.html,v 1.1 2008/06/08 10:21:52 sebdiaz Exp $ 00008 */ 00009 00010 /* 00011 * The DB environment consists of some number of "regions", which are described 00012 * by the following four structures: 00013 * 00014 * REGENV -- shared information about the environment 00015 * REGENV_REF -- file describing system memory version of REGENV 00016 * REGION -- shared information about a single region 00017 * REGINFO -- per-process information about a REGION 00018 * 00019 * There are three types of memory that hold regions: 00020 * per-process heap (malloc) 00021 * file mapped into memory (mmap, MapViewOfFile) 00022 * system memory (shmget, CreateFileMapping) 00023 * 00024 * If the regions are private to a process, they're in malloc. If they're 00025 * public, they're in file mapped memory, or, optionally, in system memory. 00026 * Regions in the filesystem are named "__db.001", "__db.002" and so on. If 00027 * we're not using a private environment allocated using malloc(3), the file 00028 * "__db.001" will always exist, as we use it to synchronize on the regions, 00029 * whether they exist in file mapped memory or system memory. 00030 * 00031 * The file "__db.001" contains a REGENV structure and a linked list of some 00032 * number of REGION structures. Each of the REGION structures describes and 00033 * locks one of the underlying shared regions used by DB. 00034 * 00035 * __db.001 00036 * +---------+ 00037 * |REGENV | 00038 * +---------+ +----------+ 00039 * |REGION |-> | __db.002 | 00040 * | | +----------+ 00041 * +---------+ +----------+ 00042 * |REGION |-> | __db.003 | 00043 * | | +----------+ 00044 * +---------+ +----------+ 00045 * |REGION |-> | __db.004 | 00046 * | | +----------+ 00047 * +---------+ 00048 * 00049 * The only tricky part about manipulating the regions is correctly creating 00050 * or joining the REGENV file, i.e., __db.001. We have to be absolutely sure 00051 * that only one process creates it, and that everyone else joins it without 00052 * seeing inconsistent data. Once that region is created, we can use normal 00053 * shared locking procedures to do mutal exclusion for all other regions. 00054 * 00055 * One of the REGION structures in the CDB_main environment region describes the 00056 * environment region itself. 00057 * 00058 * To lock a region, locate the REGION structure that describes it and acquire 00059 * the region's mutex. There is one exception to this rule -- the lock for the 00060 * environment region itself is in the REGENV structure, and not in the REGION 00061 * that describes the environment region. That's so that we can acquire a lock 00062 * without walking linked lists that could potentially change underneath us. 00063 * The REGION will not be moved or removed during the life of the region, and 00064 * so long-lived references to it can be held by the process. 00065 * 00066 * All requests to create or join a region return a REGINFO structure, which 00067 * is held by the caller and used to open and subsequently close the reference 00068 * to the region. The REGINFO structure contains the per-process information 00069 * that we need to access the region. 00070 * 00071 * The one remaining complication. If the regions (including the environment 00072 * region) live in system memory, and the system memory isn't "named" somehow 00073 * in the filesystem name space, we need some way of finding it. Do this by 00074 * by writing the REGENV_REF structure into the "__db.001" file. When we find 00075 * a __db.001 file that is too small to be a real, on-disk environment, we use 00076 * the information it contains to redirect to the real "__db.001" file/memory. 00077 * This currently only happens when the REGENV file is in shared system memory. 00078 * 00079 * Although DB does not currently grow regions when they run out of memory, it 00080 * would be possible to do so. To grow a region, allocate a new region of the 00081 * appropriate size, then copy the old region over it and insert the additional 00082 * space into the already existing shalloc arena. Callers may have to fix up 00083 * local references, but that should be easy to do. This failed in historic 00084 * versions of DB because the region lock lived in the mapped memory, and when 00085 * it was unmapped and remapped (or copied), threads could lose track of it. 00086 * Once we moved that lock into a region that is never unmapped, growing should 00087 * work. That all said, current versions of DB don't implement region grow 00088 * because some systems don't support mutex copying, e.g., from OSF1 V4.0: 00089 * 00090 * The address of an msemaphore structure may be significant. If the 00091 * msemaphore structure contains any value copied from an msemaphore 00092 * structure at a different address, the result is undefined. 00093 */ 00094 00095 #if defined(__cplusplus) 00096 extern "C" { 00097 #endif 00098 00099 #define DB_REGION_FMT "__db.%03d" /* Region file name format. */ 00100 #define DB_REGION_NAME_NUM 5 /* First digit offset in file names. */ 00101 #define DB_REGION_NAME_LENGTH 8 /* Length of file names. */ 00102 00103 #define DB_REGION_ENV "__db.001" /* Primary environment name. */ 00104 00105 #define INVALID_REGION_SEGID -1 /* Segment IDs are either shmget(2) or 00106 * Win16 segment identifiers. They are 00107 * both stored in a "long", and we need 00108 * an out-of-band value. 00109 */ 00110 /* 00111 * Currently, region offsets are limited to 32-bits. I expect that's going 00112 * to have to be fixed in the not-too-distant future, since we won't want to 00113 * split 100Gb memory pools into that many different regions. It's typedef'd 00114 * so it won't be too painful to upgrade. 00115 */ 00116 typedef u_int32_t roff_t; 00117 00118 /* 00119 * Nothing can live at region offset 0, because, in all cases, that's where 00120 * we store *something*. Lots of code needs an out-of-band value for region 00121 * offsets, so we use 0. 00122 */ 00123 #define INVALID_ROFF 0 00124 00125 /* Reference describing system memory version of REGENV. */ 00126 typedef struct __db_reg_env_ref { 00127 roff_t size; /* Region size. */ 00128 long segid; /* UNIX shmget(2) ID. */ 00129 } REGENV_REF; 00130 00131 /* Per-environment region information. */ 00132 typedef struct __db_reg_env { 00133 /* 00134 * !!! 00135 * The mutex must be the first entry in the structure to guarantee 00136 * correct alignment. 00137 */ 00138 MUTEX mutex; /* Environment mutex. */ 00139 00140 /* 00141 * !!! 00142 * Note, the magic and panic fields are NOT protected by the mutex, 00143 * and for this reason cannot be anything more complicated than a 00144 * zero/non-zero value. 00145 * 00146 * !!! 00147 * Some 64-bit architectures (e.g., the OSF/1 Alpha processor) do not 00148 * support 32-bit atomic reads and writes, and so have an interesting 00149 * bug where sequential 32-bit values can be accidentally overwritten, 00150 * i.e., a variable protected by a lock gets overwritten by a thread 00151 * that doesn't hold the lock, simply because the variable sequentially 00152 * followed a variable that didn't need the lock for protection. We do 00153 * not want setting the panic value to be overwritten by another thread 00154 * unlocking the region, or vice-versa, for that matter. As the magic 00155 * variable is written only during region creation, list it first to 00156 * ensure this cannot happen. 00157 * 00158 * !!! 00159 * The valid region magic number must appear at the same byte offset 00160 * in both the environment and each shared region, as Windows/95 uses 00161 * it to determine if the memory has been zeroed since it was last used. 00162 */ 00163 #define DB_REGION_MAGIC 0x120897 00164 u_int32_t magic; /* Valid region magic number. */ 00165 00166 int panic; /* Environment is dead. */ 00167 00168 int majver; /* Major DB version number. */ 00169 int minver; /* Minor DB version number. */ 00170 int patch; /* Patch DB version number. */ 00171 00172 /* List of regions. */ 00173 SH_LIST_HEAD(__db_regionh) regionq; 00174 00175 u_int32_t refcnt; /* References to the environment. */ 00176 00177 size_t pad; /* Guarantee that following memory is 00178 * size_t aligned. This is necessary 00179 * because we're going to store the 00180 * allocation region information there. 00181 */ 00182 } REGENV; 00183 00184 /* Per-region shared region information. */ 00185 typedef struct __db_region { 00186 /* 00187 * !!! 00188 * The mutex must be the first entry in the structure to guarantee 00189 * correct alignment. 00190 */ 00191 MUTEX mutex; /* Region mutex. */ 00192 00193 /* 00194 * !!! 00195 * The valid region magic number must appear at the same byte offset 00196 * in both the environment and each shared region, as Windows/95 uses 00197 * it to determine if the memory has been zeroed since it was last used. 00198 */ 00199 u_int32_t magic; 00200 00201 SH_LIST_ENTRY q; /* Linked list of REGIONs. */ 00202 00203 roff_t size; /* Region size in bytes. */ 00204 00205 roff_t primary; /* Primary data structure offset. */ 00206 00207 long segid; /* UNIX shmget(2), Win16 segment ID. */ 00208 00209 #define REG_ID_INVALID 0 /* Invalid. */ 00210 #define REG_ID_ENV 1 /* Environment. */ 00211 #define REG_ID_LOCK 2 /* Lock region. */ 00212 #define REG_ID_LOG 3 /* Log region. */ 00213 #define REG_ID_MPOOL 4 /* Mpool region. */ 00214 #define REG_ID_TXN 5 /* Txn region. */ 00215 #define REG_ID_ASSIGN (REG_ID_TXN + 1)/* First assignable region number. */ 00216 int id; /* Region id. */ 00217 00218 #define REG_DEAD 0x01 /* Region may be corrupted. */ 00219 u_int32_t flags; 00220 } REGION; 00221 00222 /* 00223 * Per-process/per-attachment information about a single region. 00224 */ 00225 struct __db_reginfo_t { /* CDB___db_r_attach IN parameters. */ 00226 int id; /* Region id: used for naming. */ 00227 int mode; /* File creation mode. */ 00228 00229 /* CDB___db_r_attach OUT parameters. */ 00230 REGION *rp; /* Shared region. */ 00231 00232 char *name; /* Region file name. */ 00233 00234 void *addr; /* Region allocation address. */ 00235 void *primary; /* Primary data structure address. */ 00236 00237 void *wnt_handle; /* Win/NT HANDLE. */ 00238 00239 #define REGION_CREATE 0x01 /* Caller created region. */ 00240 #define REGION_CREATE_OK 0x02 /* Caller willing to create region. */ 00241 u_int32_t flags; 00242 }; 00243 00244 /* 00245 * R_ADDR Return a per-process address for a shared region offset. 00246 * R_OFFSET Return a shared region offset for a per-process address. 00247 * 00248 * !!! 00249 * R_OFFSET should really be returning a ptrdiff_t, but that's not yet 00250 * portable. We use u_int32_t, which restricts regions to 4Gb in size. 00251 */ 00252 #define R_ADDR(base, offset) \ 00253 ((void *)((u_int8_t *)((base)->addr) + offset)) 00254 #define R_OFFSET(base, p) \ 00255 ((u_int32_t)((u_int8_t *)(p) - (u_int8_t *)(base)->addr)) 00256 00257 /* 00258 * R_LOCK Lock/unlock a region. 00259 * R_UNLOCK 00260 */ 00261 #define R_LOCK(dbenv, reginfo) \ 00262 MUTEX_LOCK(&(reginfo)->rp->mutex, (dbenv)->lockfhp) 00263 #define R_UNLOCK(dbenv, reginfo) \ 00264 MUTEX_UNLOCK(&(reginfo)->rp->mutex) 00265 00266 /* PANIC_CHECK: Check to see if the DB environment is dead. */ 00267 #define PANIC_CHECK(dbenv) \ 00268 if (DB_GLOBAL(db_panic) && \ 00269 (dbenv)->reginfo != NULL && ((REGENV *) \ 00270 ((REGINFO *)(dbenv)->reginfo)->primary)->panic != 0) \ 00271 return (DB_RUNRECOVERY); 00272 00273 /* 00274 * All regions are created on 8K boundaries out of sheer paranoia, so that 00275 * we don't make some underlying VM unhappy. 00276 */ 00277 #define OS_ROUNDOFF(i, s) { \ 00278 (i) += (s) - 1; \ 00279 (i) -= (i) % (s); \ 00280 } 00281 #define OS_VMPAGESIZE (8 * 1024) 00282 #define OS_VMROUNDOFF(i) OS_ROUNDOFF(i, OS_VMPAGESIZE) 00283 00284 #if defined(__cplusplus) 00285 } 00286 #endif