1 | /*- |
---|
2 | * See the file LICENSE for redistribution information. |
---|
3 | * |
---|
4 | * Copyright (c) 1996-2002 |
---|
5 | * Sleepycat Software. All rights reserved. |
---|
6 | */ |
---|
7 | /* |
---|
8 | * Copyright (c) 1990, 1993, 1994, 1995, 1996 |
---|
9 | * Keith Bostic. All rights reserved. |
---|
10 | */ |
---|
11 | /* |
---|
12 | * Copyright (c) 1990, 1993, 1994, 1995 |
---|
13 | * The Regents of the University of California. All rights reserved. |
---|
14 | * |
---|
15 | * This code is derived from software contributed to Berkeley by |
---|
16 | * Mike Olson. |
---|
17 | * |
---|
18 | * Redistribution and use in source and binary forms, with or without |
---|
19 | * modification, are permitted provided that the following conditions |
---|
20 | * are met: |
---|
21 | * 1. Redistributions of source code must retain the above copyright |
---|
22 | * notice, this list of conditions and the following disclaimer. |
---|
23 | * 2. Redistributions in binary form must reproduce the above copyright |
---|
24 | * notice, this list of conditions and the following disclaimer in the |
---|
25 | * documentation and/or other materials provided with the distribution. |
---|
26 | * 3. Neither the name of the University nor the names of its contributors |
---|
27 | * may be used to endorse or promote products derived from this software |
---|
28 | * without specific prior written permission. |
---|
29 | * |
---|
30 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
---|
31 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
32 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
33 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
---|
34 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
---|
35 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
---|
36 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
---|
37 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
---|
38 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
---|
39 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
---|
40 | * SUCH DAMAGE. |
---|
41 | */ |
---|
42 | |
---|
43 | #include "db_config.h" |
---|
44 | |
---|
45 | #ifndef lint |
---|
46 | static const char revid[] = "$Id: bt_search.c,v 1.1.1.1 2004-12-17 17:26:44 ghudson Exp $"; |
---|
47 | #endif /* not lint */ |
---|
48 | |
---|
49 | #ifndef NO_SYSTEM_INCLUDES |
---|
50 | #include <sys/types.h> |
---|
51 | |
---|
52 | #include <string.h> |
---|
53 | #endif |
---|
54 | |
---|
55 | #include "db_int.h" |
---|
56 | #include "dbinc/db_page.h" |
---|
57 | #include "dbinc/db_shash.h" |
---|
58 | #include "dbinc/btree.h" |
---|
59 | #include "dbinc/lock.h" |
---|
60 | |
---|
61 | /* |
---|
62 | * __bam_search -- |
---|
63 | * Search a btree for a key. |
---|
64 | * |
---|
65 | * PUBLIC: int __bam_search __P((DBC *, db_pgno_t, |
---|
66 | * PUBLIC: const DBT *, u_int32_t, int, db_recno_t *, int *)); |
---|
67 | */ |
---|
68 | int |
---|
69 | __bam_search(dbc, root_pgno, key, flags, stop, recnop, exactp) |
---|
70 | DBC *dbc; |
---|
71 | db_pgno_t root_pgno; |
---|
72 | const DBT *key; |
---|
73 | u_int32_t flags; |
---|
74 | int stop, *exactp; |
---|
75 | db_recno_t *recnop; |
---|
76 | { |
---|
77 | BTREE *t; |
---|
78 | BTREE_CURSOR *cp; |
---|
79 | DB *dbp; |
---|
80 | DB_LOCK lock; |
---|
81 | DB_MPOOLFILE *mpf; |
---|
82 | PAGE *h; |
---|
83 | db_indx_t base, i, indx, *inp, lim; |
---|
84 | db_lockmode_t lock_mode; |
---|
85 | db_pgno_t pg; |
---|
86 | db_recno_t recno; |
---|
87 | int adjust, cmp, deloffset, ret, stack; |
---|
88 | int (*func) __P((DB *, const DBT *, const DBT *)); |
---|
89 | |
---|
90 | dbp = dbc->dbp; |
---|
91 | mpf = dbp->mpf; |
---|
92 | cp = (BTREE_CURSOR *)dbc->internal; |
---|
93 | t = dbp->bt_internal; |
---|
94 | recno = 0; |
---|
95 | |
---|
96 | BT_STK_CLR(cp); |
---|
97 | |
---|
98 | /* |
---|
99 | * There are several ways we search a btree tree. The flags argument |
---|
100 | * specifies if we're acquiring read or write locks, if we position |
---|
101 | * to the first or last item in a set of duplicates, if we return |
---|
102 | * deleted items, and if we are locking pairs of pages. In addition, |
---|
103 | * if we're modifying record numbers, we have to lock the entire tree |
---|
104 | * regardless. See btree.h for more details. |
---|
105 | * |
---|
106 | * If write-locking pages, we need to know whether or not to acquire a |
---|
107 | * write lock on a page before getting it. This depends on how deep it |
---|
108 | * is in tree, which we don't know until we acquire the root page. So, |
---|
109 | * if we need to lock the root page we may have to upgrade it later, |
---|
110 | * because we won't get the correct lock initially. |
---|
111 | * |
---|
112 | * Retrieve the root page. |
---|
113 | */ |
---|
114 | try_again: |
---|
115 | pg = root_pgno == PGNO_INVALID ? cp->root : root_pgno; |
---|
116 | stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM); |
---|
117 | lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ; |
---|
118 | if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) |
---|
119 | return (ret); |
---|
120 | if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) { |
---|
121 | /* Did not read it, so we can release the lock */ |
---|
122 | (void)__LPUT(dbc, lock); |
---|
123 | return (ret); |
---|
124 | } |
---|
125 | |
---|
126 | /* |
---|
127 | * Decide if we need to save this page; if we do, write lock it. |
---|
128 | * We deliberately don't lock-couple on this call. If the tree |
---|
129 | * is tiny, i.e., one page, and two threads are busily updating |
---|
130 | * the root page, we're almost guaranteed deadlocks galore, as |
---|
131 | * each one gets a read lock and then blocks the other's attempt |
---|
132 | * for a write lock. |
---|
133 | */ |
---|
134 | if (!stack && |
---|
135 | ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) || |
---|
136 | (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { |
---|
137 | (void)mpf->put(mpf, h, 0); |
---|
138 | (void)__LPUT(dbc, lock); |
---|
139 | lock_mode = DB_LOCK_WRITE; |
---|
140 | if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) |
---|
141 | return (ret); |
---|
142 | if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) { |
---|
143 | /* Did not read it, so we can release the lock */ |
---|
144 | (void)__LPUT(dbc, lock); |
---|
145 | return (ret); |
---|
146 | } |
---|
147 | if (!((LF_ISSET(S_PARENT) && |
---|
148 | (u_int8_t)(stop + 1) >= h->level) || |
---|
149 | (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) { |
---|
150 | /* Someone else split the root, start over. */ |
---|
151 | (void)mpf->put(mpf, h, 0); |
---|
152 | (void)__LPUT(dbc, lock); |
---|
153 | goto try_again; |
---|
154 | } |
---|
155 | stack = 1; |
---|
156 | } |
---|
157 | |
---|
158 | /* Choose a comparison function. */ |
---|
159 | func = F_ISSET(dbc, DBC_OPD) ? |
---|
160 | (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) : |
---|
161 | t->bt_compare; |
---|
162 | |
---|
163 | for (;;) { |
---|
164 | inp = P_INP(dbp, h); |
---|
165 | /* |
---|
166 | * Do a binary search on the current page. If we're searching |
---|
167 | * a Btree leaf page, we have to walk the indices in groups of |
---|
168 | * two. If we're searching an internal page or a off-page dup |
---|
169 | * page, they're an index per page item. If we find an exact |
---|
170 | * match on a leaf page, we're done. |
---|
171 | */ |
---|
172 | adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX; |
---|
173 | for (base = 0, |
---|
174 | lim = NUM_ENT(h) / (db_indx_t)adjust; lim != 0; lim >>= 1) { |
---|
175 | indx = base + ((lim >> 1) * adjust); |
---|
176 | if ((ret = |
---|
177 | __bam_cmp(dbp, key, h, indx, func, &cmp)) != 0) |
---|
178 | goto err; |
---|
179 | if (cmp == 0) { |
---|
180 | if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) |
---|
181 | goto found; |
---|
182 | goto next; |
---|
183 | } |
---|
184 | if (cmp > 0) { |
---|
185 | base = indx + adjust; |
---|
186 | --lim; |
---|
187 | } |
---|
188 | } |
---|
189 | |
---|
190 | /* |
---|
191 | * No match found. Base is the smallest index greater than |
---|
192 | * key and may be zero or a last + O_INDX index. |
---|
193 | * |
---|
194 | * If it's a leaf page, return base as the "found" value. |
---|
195 | * Delete only deletes exact matches. |
---|
196 | */ |
---|
197 | if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) { |
---|
198 | *exactp = 0; |
---|
199 | |
---|
200 | if (LF_ISSET(S_EXACT)) |
---|
201 | goto notfound; |
---|
202 | |
---|
203 | if (LF_ISSET(S_STK_ONLY)) { |
---|
204 | BT_STK_NUM(dbp->dbenv, cp, h, base, ret); |
---|
205 | __LPUT(dbc, lock); |
---|
206 | (void)mpf->put(mpf, h, 0); |
---|
207 | return (ret); |
---|
208 | } |
---|
209 | |
---|
210 | /* |
---|
211 | * !!! |
---|
212 | * Possibly returning a deleted record -- DB_SET_RANGE, |
---|
213 | * DB_KEYFIRST and DB_KEYLAST don't require an exact |
---|
214 | * match, and we don't want to walk multiple pages here |
---|
215 | * to find an undeleted record. This is handled by the |
---|
216 | * calling routine. |
---|
217 | */ |
---|
218 | BT_STK_ENTER(dbp->dbenv, |
---|
219 | cp, h, base, lock, lock_mode, ret); |
---|
220 | if (ret != 0) |
---|
221 | goto err; |
---|
222 | return (0); |
---|
223 | } |
---|
224 | |
---|
225 | /* |
---|
226 | * If it's not a leaf page, record the internal page (which is |
---|
227 | * a parent page for the key). Decrement the base by 1 if it's |
---|
228 | * non-zero so that if a split later occurs, the inserted page |
---|
229 | * will be to the right of the saved page. |
---|
230 | */ |
---|
231 | indx = base > 0 ? base - O_INDX : base; |
---|
232 | |
---|
233 | /* |
---|
234 | * If we're trying to calculate the record number, sum up |
---|
235 | * all the record numbers on this page up to the indx point. |
---|
236 | */ |
---|
237 | next: if (recnop != NULL) |
---|
238 | for (i = 0; i < indx; ++i) |
---|
239 | recno += GET_BINTERNAL(dbp, h, i)->nrecs; |
---|
240 | |
---|
241 | pg = GET_BINTERNAL(dbp, h, indx)->pgno; |
---|
242 | |
---|
243 | if (LF_ISSET(S_STK_ONLY)) { |
---|
244 | if (stop == h->level) { |
---|
245 | BT_STK_NUM(dbp->dbenv, cp, h, indx, ret); |
---|
246 | __LPUT(dbc, lock); |
---|
247 | (void)mpf->put(mpf, h, 0); |
---|
248 | return (ret); |
---|
249 | } |
---|
250 | BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret); |
---|
251 | (void)mpf->put(mpf, h, 0); |
---|
252 | if ((ret = __db_lget(dbc, |
---|
253 | LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { |
---|
254 | /* |
---|
255 | * Discard our lock and return on failure. This |
---|
256 | * is OK because it only happens when descending |
---|
257 | * the tree holding read-locks. |
---|
258 | */ |
---|
259 | __LPUT(dbc, lock); |
---|
260 | return (ret); |
---|
261 | } |
---|
262 | } else if (stack) { |
---|
263 | /* Return if this is the lowest page wanted. */ |
---|
264 | if (LF_ISSET(S_PARENT) && stop == h->level) { |
---|
265 | BT_STK_ENTER(dbp->dbenv, |
---|
266 | cp, h, indx, lock, lock_mode, ret); |
---|
267 | if (ret != 0) |
---|
268 | goto err; |
---|
269 | return (0); |
---|
270 | } |
---|
271 | BT_STK_PUSH(dbp->dbenv, |
---|
272 | cp, h, indx, lock, lock_mode, ret); |
---|
273 | if (ret != 0) |
---|
274 | goto err; |
---|
275 | |
---|
276 | lock_mode = DB_LOCK_WRITE; |
---|
277 | if ((ret = |
---|
278 | __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0) |
---|
279 | goto err; |
---|
280 | } else { |
---|
281 | /* |
---|
282 | * Decide if we want to return a reference to the next |
---|
283 | * page in the return stack. If so, lock it and never |
---|
284 | * unlock it. |
---|
285 | */ |
---|
286 | if ((LF_ISSET(S_PARENT) && |
---|
287 | (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) || |
---|
288 | (h->level - 1) == LEAFLEVEL) |
---|
289 | stack = 1; |
---|
290 | |
---|
291 | (void)mpf->put(mpf, h, 0); |
---|
292 | |
---|
293 | lock_mode = stack && |
---|
294 | LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ; |
---|
295 | if ((ret = __db_lget(dbc, |
---|
296 | LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) { |
---|
297 | /* |
---|
298 | * If we fail, discard the lock we held. This |
---|
299 | * is OK because this only happens when we are |
---|
300 | * descending the tree holding read-locks. |
---|
301 | */ |
---|
302 | __LPUT(dbc, lock); |
---|
303 | goto err; |
---|
304 | } |
---|
305 | } |
---|
306 | if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) |
---|
307 | goto err; |
---|
308 | } |
---|
309 | /* NOTREACHED */ |
---|
310 | |
---|
311 | found: *exactp = 1; |
---|
312 | |
---|
313 | /* |
---|
314 | * If we're trying to calculate the record number, add in the |
---|
315 | * offset on this page and correct for the fact that records |
---|
316 | * in the tree are 0-based. |
---|
317 | */ |
---|
318 | if (recnop != NULL) |
---|
319 | *recnop = recno + (indx / P_INDX) + 1; |
---|
320 | |
---|
321 | /* |
---|
322 | * If we got here, we know that we have a Btree leaf or off-page |
---|
323 | * duplicates page. If it's a Btree leaf page, we have to handle |
---|
324 | * on-page duplicates. |
---|
325 | * |
---|
326 | * If there are duplicates, go to the first/last one. This is |
---|
327 | * safe because we know that we're not going to leave the page, |
---|
328 | * all duplicate sets that are not on overflow pages exist on a |
---|
329 | * single leaf page. |
---|
330 | */ |
---|
331 | if (TYPE(h) == P_LBTREE) { |
---|
332 | if (LF_ISSET(S_DUPLAST)) |
---|
333 | while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) && |
---|
334 | inp[indx] == inp[indx + P_INDX]) |
---|
335 | indx += P_INDX; |
---|
336 | else |
---|
337 | while (indx > 0 && |
---|
338 | inp[indx] == inp[indx - P_INDX]) |
---|
339 | indx -= P_INDX; |
---|
340 | } |
---|
341 | |
---|
342 | /* |
---|
343 | * Now check if we are allowed to return deleted items; if not, then |
---|
344 | * find the next (or previous) non-deleted duplicate entry. (We do |
---|
345 | * not move from the original found key on the basis of the S_DELNO |
---|
346 | * flag.) |
---|
347 | */ |
---|
348 | if (LF_ISSET(S_DELNO)) { |
---|
349 | deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0; |
---|
350 | if (LF_ISSET(S_DUPLAST)) |
---|
351 | while (B_DISSET(GET_BKEYDATA(dbp, |
---|
352 | h, indx + deloffset)->type) && indx > 0 && |
---|
353 | inp[indx] == inp[indx - adjust]) |
---|
354 | indx -= adjust; |
---|
355 | else |
---|
356 | while (B_DISSET(GET_BKEYDATA(dbp, |
---|
357 | h, indx + deloffset)->type) && |
---|
358 | indx < (db_indx_t)(NUM_ENT(h) - adjust) && |
---|
359 | inp[indx] == inp[indx + adjust]) |
---|
360 | indx += adjust; |
---|
361 | |
---|
362 | /* |
---|
363 | * If we weren't able to find a non-deleted duplicate, return |
---|
364 | * DB_NOTFOUND. |
---|
365 | */ |
---|
366 | if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type)) |
---|
367 | goto notfound; |
---|
368 | } |
---|
369 | |
---|
370 | if (LF_ISSET(S_STK_ONLY)) { |
---|
371 | BT_STK_NUM(dbp->dbenv, cp, h, indx, ret); |
---|
372 | __LPUT(dbc, lock); |
---|
373 | (void)mpf->put(mpf, h, 0); |
---|
374 | } else { |
---|
375 | BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret); |
---|
376 | if (ret != 0) |
---|
377 | goto err; |
---|
378 | } |
---|
379 | return (0); |
---|
380 | |
---|
381 | notfound: |
---|
382 | /* Keep the page locked for serializability. */ |
---|
383 | (void)mpf->put(mpf, h, 0); |
---|
384 | (void)__TLPUT(dbc, lock); |
---|
385 | ret = DB_NOTFOUND; |
---|
386 | |
---|
387 | err: BT_STK_POP(cp); |
---|
388 | __bam_stkrel(dbc, 0); |
---|
389 | return (ret); |
---|
390 | } |
---|
391 | |
---|
392 | /* |
---|
393 | * __bam_stkrel -- |
---|
394 | * Release all pages currently held in the stack. |
---|
395 | * |
---|
396 | * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t)); |
---|
397 | */ |
---|
398 | int |
---|
399 | __bam_stkrel(dbc, flags) |
---|
400 | DBC *dbc; |
---|
401 | u_int32_t flags; |
---|
402 | { |
---|
403 | BTREE_CURSOR *cp; |
---|
404 | DB *dbp; |
---|
405 | DB_MPOOLFILE *mpf; |
---|
406 | EPG *epg; |
---|
407 | int ret, t_ret; |
---|
408 | |
---|
409 | dbp = dbc->dbp; |
---|
410 | mpf = dbp->mpf; |
---|
411 | cp = (BTREE_CURSOR *)dbc->internal; |
---|
412 | |
---|
413 | /* |
---|
414 | * Release inner pages first. |
---|
415 | * |
---|
416 | * The caller must be sure that setting STK_NOLOCK will not effect |
---|
417 | * either serializability or recoverability. |
---|
418 | */ |
---|
419 | for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) { |
---|
420 | if (epg->page != NULL) { |
---|
421 | if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) { |
---|
422 | cp->page = NULL; |
---|
423 | LOCK_INIT(cp->lock); |
---|
424 | } |
---|
425 | if ((t_ret = |
---|
426 | mpf->put(mpf, epg->page, 0)) != 0 && ret == 0) |
---|
427 | ret = t_ret; |
---|
428 | /* |
---|
429 | * XXX |
---|
430 | * Temporary fix for #3243 -- under certain deadlock |
---|
431 | * conditions we call here again and re-free the page. |
---|
432 | * The correct fix is to never release a stack that |
---|
433 | * doesn't hold items. |
---|
434 | */ |
---|
435 | epg->page = NULL; |
---|
436 | } |
---|
437 | if (LF_ISSET(STK_NOLOCK)) |
---|
438 | (void)__LPUT(dbc, epg->lock); |
---|
439 | else |
---|
440 | (void)__TLPUT(dbc, epg->lock); |
---|
441 | } |
---|
442 | |
---|
443 | /* Clear the stack, all pages have been released. */ |
---|
444 | BT_STK_CLR(cp); |
---|
445 | |
---|
446 | return (ret); |
---|
447 | } |
---|
448 | |
---|
449 | /* |
---|
450 | * __bam_stkgrow -- |
---|
451 | * Grow the stack. |
---|
452 | * |
---|
453 | * PUBLIC: int __bam_stkgrow __P((DB_ENV *, BTREE_CURSOR *)); |
---|
454 | */ |
---|
455 | int |
---|
456 | __bam_stkgrow(dbenv, cp) |
---|
457 | DB_ENV *dbenv; |
---|
458 | BTREE_CURSOR *cp; |
---|
459 | { |
---|
460 | EPG *p; |
---|
461 | size_t entries; |
---|
462 | int ret; |
---|
463 | |
---|
464 | entries = cp->esp - cp->sp; |
---|
465 | |
---|
466 | if ((ret = __os_calloc(dbenv, entries * 2, sizeof(EPG), &p)) != 0) |
---|
467 | return (ret); |
---|
468 | memcpy(p, cp->sp, entries * sizeof(EPG)); |
---|
469 | if (cp->sp != cp->stack) |
---|
470 | __os_free(dbenv, cp->sp); |
---|
471 | cp->sp = p; |
---|
472 | cp->csp = p + entries; |
---|
473 | cp->esp = p + entries * 2; |
---|
474 | return (0); |
---|
475 | } |
---|