/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2002
 *	Sleepycat Software.  All rights reserved.
 */
#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: mp_fget.c,v 1.1.1.1 2004-12-17 17:27:13 ghudson Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

#ifdef HAVE_FILESYSTEM_NOTZERO
static int __memp_fs_notzero
    __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif

/*
 * __memp_fget --
 *	Get a page from the file.
 *
 * PUBLIC: int __memp_fget
 * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
 */
int
__memp_fget(dbmfp, pgnoaddr, flags, addrp)
	DB_MPOOLFILE *dbmfp;
	db_pgno_t *pgnoaddr;
	u_int32_t flags;
	void *addrp;
{
	enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
	BH *alloc_bhp, *bhp;
	DB_ENV *dbenv;
	DB_MPOOL *dbmp;
	DB_MPOOL_HASH *hp;
	MPOOL *c_mp, *mp;
	MPOOLFILE *mfp;
	roff_t mf_offset;
	u_int32_t n_cache, st_hsearch;
	int b_incr, extending, first, ret;

	*(void **)addrp = NULL;

	dbmp = dbmfp->dbmp;
	dbenv = dbmp->dbenv;

	PANIC_CHECK(dbenv);

	mp = dbmp->reginfo[0].primary;
	mfp = dbmfp->mfp;
	mf_offset = R_OFFSET(dbmp->reginfo, mfp);
	alloc_bhp = bhp = NULL;
	hp = NULL;
	b_incr = extending = ret = 0;

	/*
	 * Validate arguments.
	 *
	 * !!!
	 * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
	 * files here, and create non-existent pages in readonly files if the
	 * flags are set, later.  The reason is that the hash access method
	 * wants to get empty pages that don't really exist in readonly files.
	 * The only alternative is for hash to write the last "bucket" all the
	 * time, which we don't want to do because one of our big goals in life
	 * is to keep database files small.  It's sleazy as hell, but we catch
	 * any attempt to actually write the file in memp_fput().
	 */
#define	OKFLAGS		(DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
	if (flags != 0) {
		if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
			return (ret);

		switch (flags) {
		case DB_MPOOL_CREATE:
			break;
		case DB_MPOOL_LAST:
			/* Get the last page number in the file. */
			R_LOCK(dbenv, dbmp->reginfo);
			*pgnoaddr = mfp->last_pgno;
			R_UNLOCK(dbenv, dbmp->reginfo);
			break;
		case DB_MPOOL_NEW:
			/*
			 * If always creating a page, skip the first search
			 * of the hash bucket.
			 */
			goto alloc;
		default:
			return (__db_ferr(dbenv, "memp_fget", 1));
		}
	}
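
	/*
	 * Editor's note -- a hedged sketch, not part of the original source:
	 * the flag chosen above determines whether *pgnoaddr is an input or
	 * an output.  With no flag or DB_MPOOL_CREATE the caller supplies
	 * the page number; with DB_MPOOL_LAST or DB_MPOOL_NEW it is filled
	 * in on return.  A hypothetical caller, using the public
	 * DB_MPOOLFILE get method that wraps this function:
	 *
	 *	pgno = 13;
	 *	ret = dbmfp->get(dbmfp, &pgno, 0, &addr);
	 *	ret = dbmfp->get(dbmfp, &pgno, DB_MPOOL_LAST, &addr);
	 *
	 * After the second call, pgno holds the file's last page number.
	 */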

	/*
	 * If mmap'ing the file and the page is not past the end of the file,
	 * just return a pointer.
	 *
	 * The page may be past the end of the file, so check the page number
	 * argument against the original length of the file.  If we previously
	 * returned pages past the original end of the file, last_pgno will
	 * have been updated to match the "new" end of the file, and checking
	 * against it would return pointers past the end of the mmap'd region.
	 *
	 * If another process has opened the file for writing since we mmap'd
	 * it, we will start playing the game by their rules, i.e. everything
	 * goes through the cache.  All pages previously returned will be safe,
	 * as long as the correct locking protocol was observed.
	 *
	 * We don't discard the map because we don't know when all of the
	 * pages will have been discarded from the process' address space.
	 * It would be possible to do so by reference counting the open
	 * pages from the mmap, but it's unclear to me that it's worth it.
	 */
	if (dbmfp->addr != NULL &&
	    F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
		*(void **)addrp =
		    R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
		++mfp->stat.st_map;
		return (0);
	}
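
	/*
	 * Editor's note -- illustrative arithmetic, not in the original: the
	 * mmap fast path above is pure pointer math.  Assuming a 4096-byte
	 * st_pagesize, page 3 resolves to byte offset 3 * 4096 = 12288 from
	 * the base of the mapped region, which is what R_ADDR() returns.
	 */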

hb_search:
	/*
	 * Determine the cache and hash bucket where this page lives and get
	 * local pointers to them.  These are reset on each pass through this
	 * code because the page number can change.
	 */
	n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
	c_mp = dbmp->reginfo[n_cache].primary;
	hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
	hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
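
	/*
	 * Editor's note -- a hedged illustration, not in the original: the
	 * NCACHE and NBUCKET macros hash the file's region offset together
	 * with the page number, so one file's pages are spread across the
	 * caches and across the buckets within a cache.  For example, in a
	 * hypothetical pool with 2 caches of 1024 buckets each, page 407 of
	 * a file might select cache 1, bucket 212, while page 408 usually
	 * lands in a different cache and bucket.
	 */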

	/* Search the hash chain for the page. */
retry:	st_hsearch = 0;
	MUTEX_LOCK(dbenv, &hp->hash_mutex);
	for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
	    bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
		++st_hsearch;
		if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
			continue;

		/*
		 * Increment the reference count.  We may discard the hash
		 * bucket lock as we evaluate and/or read the buffer, so we
		 * need to ensure it doesn't move and its contents remain
		 * unchanged.
		 */
		if (bhp->ref == UINT16_T_MAX) {
			__db_err(dbenv,
			    "%s: page %lu: reference count overflow",
			    __memp_fn(dbmfp), (u_long)bhp->pgno);
			ret = EINVAL;
			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
			goto err;
		}
		++bhp->ref;
		b_incr = 1;

		/*
		 * BH_LOCKED --
		 * I/O is in progress or sync is waiting on the buffer to write
		 * it.  Because we've incremented the buffer reference count,
		 * we know the buffer can't move.  Unlock the bucket lock, wait
		 * for the buffer to become available, reacquire the bucket.
		 */
		for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
		    !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
			/*
			 * If someone is trying to sync this buffer and the
			 * buffer is hot, they may never get in.  Give up
			 * and try again.
			 */
			if (!first && bhp->ref_sync != 0) {
				--bhp->ref;
				b_incr = 0;
				MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
				__os_yield(dbenv, 1);
				goto retry;
			}

			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
			/*
			 * Explicitly yield the processor if not the first pass
			 * through this loop -- if we don't, we might run to the
			 * end of our CPU quantum as we will simply be swapping
			 * between the two locks.
			 */
			if (!first)
				__os_yield(dbenv, 1);

			MUTEX_LOCK(dbenv, &bhp->mutex);
			/* Wait for I/O to finish... */
			MUTEX_UNLOCK(dbenv, &bhp->mutex);
			MUTEX_LOCK(dbenv, &hp->hash_mutex);
		}

		++mfp->stat.st_cache_hit;
		break;
	}
	/*
	 * Update the hash bucket search statistics -- do it now because our
	 * next search may be for a different bucket.
	 */
	++c_mp->stat.st_hash_searches;
	if (st_hsearch > c_mp->stat.st_hash_longest)
		c_mp->stat.st_hash_longest = st_hsearch;
	c_mp->stat.st_hash_examined += st_hsearch;

	/*
	 * There are 4 possible paths to this location:
	 *
	 * FIRST_MISS:
	 *	Didn't find the page in the hash bucket on our first pass:
	 *	bhp == NULL, alloc_bhp == NULL
	 *
	 * FIRST_FOUND:
	 *	Found the page in the hash bucket on our first pass:
	 *	bhp != NULL, alloc_bhp == NULL
	 *
	 * SECOND_FOUND:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and found the page in the hash bucket on
	 *	our second pass:
	 *	bhp != NULL, alloc_bhp != NULL
	 *
	 * SECOND_MISS:
	 *	Didn't find the page in the hash bucket on the first pass,
	 *	allocated space, and didn't find the page in the hash bucket
	 *	on our second pass:
	 *	bhp == NULL, alloc_bhp != NULL
	 */
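
	/*
	 * Editor's note -- an illustrative walk-through, not in the original:
	 * a typical cache miss passes through here twice.  The first pass is
	 * FIRST_MISS: we jump to alloc, allocate alloc_bhp, and return to
	 * hb_search.  The second pass is then SECOND_MISS (we instantiate
	 * the page in alloc_bhp) or, if another thread instantiated it while
	 * we were allocating, SECOND_FOUND (we free alloc_bhp and use the
	 * buffer already in the pool).
	 */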
	state = bhp == NULL ?
	    (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
	    (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
	switch (state) {
	case FIRST_FOUND:
		/* We found the buffer in our first check -- we're done. */
		break;
	case FIRST_MISS:
		/*
		 * We didn't find the buffer in our first check.  Figure out
		 * if the page exists, and allocate structures so we can add
		 * the page to the buffer pool.
		 */
		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

alloc:		/*
		 * If DB_MPOOL_NEW is set, we have to allocate a page number.
		 * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
		 * it's an error to try to get a page past the end of file.
		 */
		COMPQUIET(n_cache, 0);

		extending = ret = 0;
		R_LOCK(dbenv, dbmp->reginfo);
		switch (flags) {
		case DB_MPOOL_NEW:
			extending = 1;
			*pgnoaddr = mfp->last_pgno + 1;
			break;
		case DB_MPOOL_CREATE:
			extending = *pgnoaddr > mfp->last_pgno;
			break;
		default:
			ret = *pgnoaddr > mfp->last_pgno ?
			    DB_PAGE_NOTFOUND : 0;
			break;
		}
		R_UNLOCK(dbenv, dbmp->reginfo);
		if (ret != 0)
			goto err;

		/*
		 * !!!
		 * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
		 * not yet been initialized.
		 */
		mf_offset = R_OFFSET(dbmp->reginfo, mfp);
		n_cache = NCACHE(mp, mf_offset, *pgnoaddr);

		/* Allocate a new buffer header and data space. */
		if ((ret = __memp_alloc(dbmp,
		    &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
			goto err;
#ifdef DIAGNOSTIC
		if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
			__db_err(dbenv,
			    "Error: buffer data is NOT size_t aligned");
			ret = EINVAL;
			goto err;
		}
#endif
		/*
		 * If we are extending the file, we'll need the region lock
		 * again.
		 */
		if (extending)
			R_LOCK(dbenv, dbmp->reginfo);

		/*
		 * DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control.  (That guarantee is interesting
		 * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
		 * did not specify the page number, and so may reasonably not
		 * have any way to lock the page outside of mpool.)  Regardless,
		 * if we allocate the page, and some other thread of control
		 * requests the page by number, we will not detect that and the
		 * thread of control that allocated using DB_MPOOL_NEW may not
		 * have a chance to initialize the page.  (Note: we *could*
		 * detect this case if we set a flag in the buffer header which
		 * guaranteed that no gets of the page would succeed until the
		 * reference count went to 0, that is, until the creating
		 * thread put the page.)  What we do guarantee is that if two
		 * threads of control are both doing DB_MPOOL_NEW calls, they
		 * won't collide, that is, they won't both get the same page.
		 *
		 * There's a possibility that another thread allocated the page
		 * we were planning to allocate while we were off doing buffer
		 * allocation.  We can detect that by making sure the page
		 * number we were going to use is still available.  If it's
		 * not, then we check to see if the next available page number
		 * hashes to the same mpool region as the old one -- if it
		 * does, we can continue, otherwise, we have to start over.
		 */
		if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
			*pgnoaddr = mfp->last_pgno + 1;
			if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
				__db_shalloc_free(
				    dbmp->reginfo[n_cache].addr, alloc_bhp);
				/*
				 * flags == DB_MPOOL_NEW, so extending is set
				 * and we're holding the region locked.
				 */
				R_UNLOCK(dbenv, dbmp->reginfo);

				alloc_bhp = NULL;
				goto alloc;
			}
		}

		/*
		 * We released the region lock, so another thread might have
		 * extended the file.  Update the last_pgno and initialize
		 * the file, as necessary, if we extended the file.
		 */
		if (extending) {
#ifdef HAVE_FILESYSTEM_NOTZERO
			if (*pgnoaddr > mfp->last_pgno &&
			    __os_fs_notzero() &&
			    F_ISSET(dbmfp->fhp, DB_FH_VALID))
				ret = __memp_fs_notzero(
				    dbenv, dbmfp, mfp, pgnoaddr);
			else
				ret = 0;
#endif
			if (ret == 0 && *pgnoaddr > mfp->last_pgno)
				mfp->last_pgno = *pgnoaddr;

			R_UNLOCK(dbenv, dbmp->reginfo);
			if (ret != 0)
				goto err;
		}
		goto hb_search;
	case SECOND_FOUND:
		/*
		 * We allocated buffer space for the requested page, but then
		 * found the page in the buffer cache on our second check.
		 * That's OK -- we can use the page we found in the pool,
		 * unless DB_MPOOL_NEW is set.
		 *
		 * Free the allocated memory, we no longer need it.  Since we
		 * can't acquire the region lock while holding the hash bucket
		 * lock, we have to release the hash bucket and re-acquire it.
		 * That's OK, because we have the buffer pinned down.
		 */
		MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
		R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
		alloc_bhp = NULL;
		R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
		MUTEX_LOCK(dbenv, &hp->hash_mutex);

		/*
		 * We can't use the page we found in the pool if DB_MPOOL_NEW
		 * was set.  (For details, see the above comment beginning
		 * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
		 * any other thread of control".)  If DB_MPOOL_NEW is set, we
		 * release our pin on this particular buffer, and try to get
		 * another one.
		 */
		if (flags == DB_MPOOL_NEW) {
			--bhp->ref;
			b_incr = 0;
			goto alloc;
		}
		break;
	case SECOND_MISS:
		/*
		 * We allocated buffer space for the requested page, and found
		 * the page still missing on our second pass through the buffer
		 * cache.  Instantiate the page.
		 */
		bhp = alloc_bhp;
		alloc_bhp = NULL;

		/*
		 * Initialize all the BH and hash bucket fields so we can call
		 * __memp_bhfree if an error occurs.
		 *
		 * Append the buffer to the tail of the bucket list and update
		 * the hash bucket's priority.
		 */
		b_incr = 1;

		memset(bhp, 0, sizeof(BH));
		bhp->ref = 1;
		bhp->priority = UINT32_T_MAX;
		bhp->pgno = *pgnoaddr;
		bhp->mf_offset = mf_offset;
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		hp->hash_priority =
		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;

		/* If we extended the file, make sure the page is never lost. */
		if (extending) {
			++hp->hash_page_dirty;
			F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
		}

		/*
		 * If we created the page, zero it out.  If we didn't create
		 * the page, read from the backing file.
		 *
		 * !!!
		 * DB_MPOOL_NEW doesn't call the pgin function.
		 *
		 * If DB_MPOOL_CREATE is used, then the application's pgin
		 * function has to be able to handle pages of 0's -- if it
		 * uses DB_MPOOL_NEW, it can detect all of its page creates,
		 * and not bother.
		 *
		 * If we're running in diagnostic mode, smash any bytes on the
		 * page that are unknown quantities for the caller.
		 *
		 * Otherwise, read the page into memory, optionally creating it
		 * if DB_MPOOL_CREATE is set.
		 */
		if (extending) {
			if (mfp->clear_len == 0)
				memset(bhp->buf, 0, mfp->stat.st_pagesize);
			else {
				memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
				memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
				    mfp->stat.st_pagesize - mfp->clear_len);
#endif
			}

			if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
				F_SET(bhp, BH_CALLPGIN);

			++mfp->stat.st_page_create;
		} else {
			F_SET(bhp, BH_TRASH);
			++mfp->stat.st_cache_miss;
		}

		/* Increment buffer count referenced by MPOOLFILE. */
		MUTEX_LOCK(dbenv, &mfp->mutex);
		++mfp->block_cnt;
		MUTEX_UNLOCK(dbenv, &mfp->mutex);

		/*
		 * Initialize the mutex.  This is the last initialization step,
		 * because it's the only one that can fail -- everything else
		 * must already be set up, because jumping to the err label
		 * calls __memp_bhfree.
		 */
		if ((ret = __db_mutex_setup(dbenv,
		    &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
			goto err;
	}

	DB_ASSERT(bhp->ref != 0);

	/*
	 * If we're the only reference, update buffer and bucket priorities.
	 * We may be about to release the hash bucket lock, and everything
	 * should be correct before we do.  (We've already done this if we
	 * created the buffer, so there is no need to do it again.)
	 */
	if (state != SECOND_MISS && bhp->ref == 1) {
		bhp->priority = UINT32_T_MAX;
		SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
		SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
		hp->hash_priority =
		    SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
	}
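
	/*
	 * Editor's note -- a worked example, not in the original: the bucket
	 * list is kept ordered from lowest to highest priority, and
	 * hash_priority caches the head's (minimum) priority.  If a bucket
	 * holds buffers with priorities 3, 7 and 9, hash_priority is 3;
	 * after the head buffer is re-referenced above, it moves to the tail
	 * with priority UINT32_T_MAX and hash_priority becomes 7.
	 */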

	/*
	 * BH_TRASH --
	 * The buffer we found may need to be filled from the disk.
	 *
	 * It's possible for the read function to fail, which means we fail as
	 * well.  Note, the __memp_pgread() function discards and reacquires
	 * the hash lock, so the buffer must be pinned down so that it cannot
	 * move and its contents are unchanged.  Discard the buffer on failure
	 * unless another thread is waiting on our I/O to complete.  It's OK to
	 * leave the buffer around, as the waiting thread will see the BH_TRASH
	 * flag set, and will also attempt to discard it.  If there's a waiter,
	 * we need to decrement our reference count.
	 */
	if (F_ISSET(bhp, BH_TRASH) &&
	    (ret = __memp_pgread(dbmfp,
	    &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
		goto err;

	/*
	 * BH_CALLPGIN --
	 * The buffer was processed for being written to disk, and now has
	 * to be re-converted for use.
	 */
	if (F_ISSET(bhp, BH_CALLPGIN)) {
		if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
			goto err;
		F_CLR(bhp, BH_CALLPGIN);
	}

	MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

#ifdef DIAGNOSTIC
	/* Update the file's pinned reference count. */
	R_LOCK(dbenv, dbmp->reginfo);
	++dbmfp->pinref;
	R_UNLOCK(dbenv, dbmp->reginfo);

	/*
	 * We want to switch threads as often as possible, and at awkward
	 * times.  Yield every time we get a new page to ensure contention.
	 */
	if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
		__os_yield(dbenv, 1);
#endif

	*(void **)addrp = bhp->buf;
	return (0);

err:	/*
	 * Discard our reference.  If we're the only reference, discard the
	 * buffer entirely.  If we held a reference to a buffer, we are
	 * also still holding the hash bucket mutex.
	 */
	if (b_incr) {
		if (bhp->ref == 1)
			(void)__memp_bhfree(dbmp, hp, bhp, 1);
		else {
			--bhp->ref;
			MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
		}
	}

	/* If alloc_bhp is set, free the memory. */
	if (alloc_bhp != NULL)
		__db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);

	return (ret);
}
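
/*
 * Editor's example -- hypothetical caller, not part of the original file:
 * a full cycle through the public DB_MPOOLFILE methods, which wrap
 * __memp_fget and __memp_fput.  Every successful get pins the buffer and
 * must be matched by a put; "pagesize" here stands for whatever page size
 * the caller configured:
 *
 *	db_pgno_t pgno;
 *	void *addr;
 *	int ret;
 *
 *	pgno = 0;
 *	if ((ret = dbmfp->get(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
 *		return (ret);
 *	memset(addr, 0, pagesize);
 *	ret = dbmfp->put(dbmfp, addr, DB_MPOOL_DIRTY);
 */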

#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __memp_fs_notzero --
 *	Initialize the underlying allocated pages in the file.
 */
static int
__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
	DB_ENV *dbenv;
	DB_MPOOLFILE *dbmfp;
	MPOOLFILE *mfp;
	db_pgno_t *pgnoaddr;
{
	DB_IO db_io;
	u_int32_t i, npages;
	size_t nw;
	int ret;
	u_int8_t *page;
	char *fail;

	/*
	 * On some systems, pages allocated by writing past end-of-file are
	 * not zeroed.  Recovery could theoretically be fooled by a page
	 * showing up that contained garbage.  In order to avoid this, we
	 * have to write the pages out to disk, and flush them.  The reason
	 * for the flush is that if we don't sync, the write allocating a
	 * subsequent page might reach the disk first, and a crash at the
	 * wrong moment would leave this page as the one allocated by
	 * writing a page past it in the file.
	 *
	 * Hash is the only access method that allocates groups of pages.  We
	 * know that it will use the existence of the last page in a group to
	 * signify that the entire group is OK; so, write all the pages but
	 * the last one in the group, flush them to disk, and then write the
	 * last one to disk and flush it.
	 */
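	/*
	 * Editor's note -- a worked example, not in the original: if
	 * last_pgno is 10 and *pgnoaddr is 13, npages below is 3.  The loop
	 * writes zeroed pages 11 and 12, we sync, then we write page 13 and
	 * sync again, so the group's last page can reach the disk only after
	 * the rest of the group is safely there.
	 */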
	if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
		return (ret);

	db_io.fhp = dbmfp->fhp;
	db_io.mutexp = dbmfp->mutexp;
	db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
	db_io.buf = page;

	npages = *pgnoaddr - mfp->last_pgno;
	for (i = 1; i < npages; ++i) {
		db_io.pgno = mfp->last_pgno + i;
		if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
			fail = "write";
			goto err;
		}
	}
	if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
		fail = "sync";
		goto err;
	}

	db_io.pgno = mfp->last_pgno + npages;
	if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
		fail = "write";
		goto err;
	}
	if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
		fail = "sync";
err:		__db_err(dbenv, "%s: %s failed for page %lu",
		    __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
	}

	__os_free(dbenv, page);
	return (ret);
}
#endif