source: trunk/third/evolution-data-server/libdb/btree/bt_search.c @ 21189

Revision 21189, 13.1 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r21188, which included commits to RCS files with non-trunk default branches.
Line 
1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2002
5 *      Sleepycat Software.  All rights reserved.
6 */
7/*
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 *      Keith Bostic.  All rights reserved.
10 */
11/*
12 * Copyright (c) 1990, 1993, 1994, 1995
13 *      The Regents of the University of California.  All rights reserved.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * Mike Olson.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 *    notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 *    notice, this list of conditions and the following disclaimer in the
25 *    documentation and/or other materials provided with the distribution.
26 * 3. Neither the name of the University nor the names of its contributors
27 *    may be used to endorse or promote products derived from this software
28 *    without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 */
42
43#include "db_config.h"
44
45#ifndef lint
46static const char revid[] = "$Id: bt_search.c,v 1.1.1.1 2004-12-17 17:26:44 ghudson Exp $";
47#endif /* not lint */
48
49#ifndef NO_SYSTEM_INCLUDES
50#include <sys/types.h>
51
52#include <string.h>
53#endif
54
55#include "db_int.h"
56#include "dbinc/db_page.h"
57#include "dbinc/db_shash.h"
58#include "dbinc/btree.h"
59#include "dbinc/lock.h"
60
61/*
62 * __bam_search --
63 *      Search a btree for a key.
64 *
65 * PUBLIC: int __bam_search __P((DBC *, db_pgno_t,
66 * PUBLIC:     const DBT *, u_int32_t, int, db_recno_t *, int *));
67 */
68int
69__bam_search(dbc, root_pgno, key, flags, stop, recnop, exactp)
70        DBC *dbc;
71        db_pgno_t root_pgno;
72        const DBT *key;
73        u_int32_t flags;
74        int stop, *exactp;
75        db_recno_t *recnop;
76{
77        BTREE *t;
78        BTREE_CURSOR *cp;
79        DB *dbp;
80        DB_LOCK lock;
81        DB_MPOOLFILE *mpf;
82        PAGE *h;
83        db_indx_t base, i, indx, *inp, lim;
84        db_lockmode_t lock_mode;
85        db_pgno_t pg;
86        db_recno_t recno;
87        int adjust, cmp, deloffset, ret, stack;
88        int (*func) __P((DB *, const DBT *, const DBT *));
89
90        dbp = dbc->dbp;
91        mpf = dbp->mpf;
92        cp = (BTREE_CURSOR *)dbc->internal;
93        t = dbp->bt_internal;
94        recno = 0;
95
96        BT_STK_CLR(cp);
97
98        /*
99         * There are several ways we search a btree tree.  The flags argument
100         * specifies if we're acquiring read or write locks, if we position
101         * to the first or last item in a set of duplicates, if we return
102         * deleted items, and if we are locking pairs of pages.  In addition,
103         * if we're modifying record numbers, we have to lock the entire tree
104         * regardless.  See btree.h for more details.
105         *
106         * If write-locking pages, we need to know whether or not to acquire a
107         * write lock on a page before getting it.  This depends on how deep it
108         * is in tree, which we don't know until we acquire the root page.  So,
109         * if we need to lock the root page we may have to upgrade it later,
110         * because we won't get the correct lock initially.
111         *
112         * Retrieve the root page.
113         */
114try_again:
115        pg = root_pgno == PGNO_INVALID ? cp->root : root_pgno;
116        stack = LF_ISSET(S_STACK) && F_ISSET(cp, C_RECNUM);
117        lock_mode = stack ? DB_LOCK_WRITE : DB_LOCK_READ;
118        if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
119                return (ret);
120        if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) {
121                /* Did not read it, so we can release the lock */
122                (void)__LPUT(dbc, lock);
123                return (ret);
124        }
125
126        /*
127         * Decide if we need to save this page; if we do, write lock it.
128         * We deliberately don't lock-couple on this call.  If the tree
129         * is tiny, i.e., one page, and two threads are busily updating
130         * the root page, we're almost guaranteed deadlocks galore, as
131         * each one gets a read lock and then blocks the other's attempt
132         * for a write lock.
133         */
134        if (!stack &&
135            ((LF_ISSET(S_PARENT) && (u_int8_t)(stop + 1) >= h->level) ||
136            (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
137                (void)mpf->put(mpf, h, 0);
138                (void)__LPUT(dbc, lock);
139                lock_mode = DB_LOCK_WRITE;
140                if ((ret = __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
141                        return (ret);
142                if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0) {
143                        /* Did not read it, so we can release the lock */
144                        (void)__LPUT(dbc, lock);
145                        return (ret);
146                }
147                if (!((LF_ISSET(S_PARENT) &&
148                    (u_int8_t)(stop + 1) >= h->level) ||
149                    (LF_ISSET(S_WRITE) && h->level == LEAFLEVEL))) {
150                        /* Someone else split the root, start over. */
151                        (void)mpf->put(mpf, h, 0);
152                        (void)__LPUT(dbc, lock);
153                        goto try_again;
154                }
155                stack = 1;
156        }
157
158        /* Choose a comparison function. */
159        func = F_ISSET(dbc, DBC_OPD) ?
160            (dbp->dup_compare == NULL ? __bam_defcmp : dbp->dup_compare) :
161            t->bt_compare;
162
163        for (;;) {
164                inp = P_INP(dbp, h);
165                /*
166                 * Do a binary search on the current page.  If we're searching
167                 * a Btree leaf page, we have to walk the indices in groups of
168                 * two.  If we're searching an internal page or a off-page dup
169                 * page, they're an index per page item.  If we find an exact
170                 * match on a leaf page, we're done.
171                 */
172                adjust = TYPE(h) == P_LBTREE ? P_INDX : O_INDX;
173                for (base = 0,
174                    lim = NUM_ENT(h) / (db_indx_t)adjust; lim != 0; lim >>= 1) {
175                        indx = base + ((lim >> 1) * adjust);
176                        if ((ret =
177                            __bam_cmp(dbp, key, h, indx, func, &cmp)) != 0)
178                                goto err;
179                        if (cmp == 0) {
180                                if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP)
181                                        goto found;
182                                goto next;
183                        }
184                        if (cmp > 0) {
185                                base = indx + adjust;
186                                --lim;
187                        }
188                }
189
190                /*
191                 * No match found.  Base is the smallest index greater than
192                 * key and may be zero or a last + O_INDX index.
193                 *
194                 * If it's a leaf page, return base as the "found" value.
195                 * Delete only deletes exact matches.
196                 */
197                if (TYPE(h) == P_LBTREE || TYPE(h) == P_LDUP) {
198                        *exactp = 0;
199
200                        if (LF_ISSET(S_EXACT))
201                                goto notfound;
202
203                        if (LF_ISSET(S_STK_ONLY)) {
204                                BT_STK_NUM(dbp->dbenv, cp, h, base, ret);
205                                __LPUT(dbc, lock);
206                                (void)mpf->put(mpf, h, 0);
207                                return (ret);
208                        }
209
210                        /*
211                         * !!!
212                         * Possibly returning a deleted record -- DB_SET_RANGE,
213                         * DB_KEYFIRST and DB_KEYLAST don't require an exact
214                         * match, and we don't want to walk multiple pages here
215                         * to find an undeleted record.  This is handled by the
216                         * calling routine.
217                         */
218                        BT_STK_ENTER(dbp->dbenv,
219                            cp, h, base, lock, lock_mode, ret);
220                        if (ret != 0)
221                                goto err;
222                        return (0);
223                }
224
225                /*
226                 * If it's not a leaf page, record the internal page (which is
227                 * a parent page for the key).  Decrement the base by 1 if it's
228                 * non-zero so that if a split later occurs, the inserted page
229                 * will be to the right of the saved page.
230                 */
231                indx = base > 0 ? base - O_INDX : base;
232
233                /*
234                 * If we're trying to calculate the record number, sum up
235                 * all the record numbers on this page up to the indx point.
236                 */
237next:           if (recnop != NULL)
238                        for (i = 0; i < indx; ++i)
239                                recno += GET_BINTERNAL(dbp, h, i)->nrecs;
240
241                pg = GET_BINTERNAL(dbp, h, indx)->pgno;
242
243                if (LF_ISSET(S_STK_ONLY)) {
244                        if (stop == h->level) {
245                                BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
246                                __LPUT(dbc, lock);
247                                (void)mpf->put(mpf, h, 0);
248                                return (ret);
249                        }
250                        BT_STK_NUMPUSH(dbp->dbenv, cp, h, indx, ret);
251                        (void)mpf->put(mpf, h, 0);
252                        if ((ret = __db_lget(dbc,
253                            LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
254                                /*
255                                 * Discard our lock and return on failure.  This
256                                 * is OK because it only happens when descending
257                                 * the tree holding read-locks.
258                                 */
259                                __LPUT(dbc, lock);
260                                return (ret);
261                        }
262                } else if (stack) {
263                        /* Return if this is the lowest page wanted. */
264                        if (LF_ISSET(S_PARENT) && stop == h->level) {
265                                BT_STK_ENTER(dbp->dbenv,
266                                    cp, h, indx, lock, lock_mode, ret);
267                                if (ret != 0)
268                                        goto err;
269                                return (0);
270                        }
271                        BT_STK_PUSH(dbp->dbenv,
272                            cp, h, indx, lock, lock_mode, ret);
273                        if (ret != 0)
274                                goto err;
275
276                        lock_mode = DB_LOCK_WRITE;
277                        if ((ret =
278                            __db_lget(dbc, 0, pg, lock_mode, 0, &lock)) != 0)
279                                goto err;
280                } else {
281                        /*
282                         * Decide if we want to return a reference to the next
283                         * page in the return stack.  If so, lock it and never
284                         * unlock it.
285                         */
286                        if ((LF_ISSET(S_PARENT) &&
287                            (u_int8_t)(stop + 1) >= (u_int8_t)(h->level - 1)) ||
288                            (h->level - 1) == LEAFLEVEL)
289                                stack = 1;
290
291                        (void)mpf->put(mpf, h, 0);
292
293                        lock_mode = stack &&
294                            LF_ISSET(S_WRITE) ? DB_LOCK_WRITE : DB_LOCK_READ;
295                        if ((ret = __db_lget(dbc,
296                            LCK_COUPLE_ALWAYS, pg, lock_mode, 0, &lock)) != 0) {
297                                /*
298                                 * If we fail, discard the lock we held.  This
299                                 * is OK because this only happens when we are
300                                 * descending the tree holding read-locks.
301                                 */
302                                __LPUT(dbc, lock);
303                                goto err;
304                        }
305                }
306                if ((ret = mpf->get(mpf, &pg, 0, &h)) != 0)
307                        goto err;
308        }
309        /* NOTREACHED */
310
311found:  *exactp = 1;
312
313        /*
314         * If we're trying to calculate the record number, add in the
315         * offset on this page and correct for the fact that records
316         * in the tree are 0-based.
317         */
318        if (recnop != NULL)
319                *recnop = recno + (indx / P_INDX) + 1;
320
321        /*
322         * If we got here, we know that we have a Btree leaf or off-page
323         * duplicates page.  If it's a Btree leaf page, we have to handle
324         * on-page duplicates.
325         *
326         * If there are duplicates, go to the first/last one.  This is
327         * safe because we know that we're not going to leave the page,
328         * all duplicate sets that are not on overflow pages exist on a
329         * single leaf page.
330         */
331        if (TYPE(h) == P_LBTREE) {
332                if (LF_ISSET(S_DUPLAST))
333                        while (indx < (db_indx_t)(NUM_ENT(h) - P_INDX) &&
334                            inp[indx] == inp[indx + P_INDX])
335                                indx += P_INDX;
336                else
337                        while (indx > 0 &&
338                            inp[indx] == inp[indx - P_INDX])
339                                indx -= P_INDX;
340        }
341
342        /*
343         * Now check if we are allowed to return deleted items; if not, then
344         * find the next (or previous) non-deleted duplicate entry.  (We do
345         * not move from the original found key on the basis of the S_DELNO
346         * flag.)
347         */
348        if (LF_ISSET(S_DELNO)) {
349                deloffset = TYPE(h) == P_LBTREE ? O_INDX : 0;
350                if (LF_ISSET(S_DUPLAST))
351                        while (B_DISSET(GET_BKEYDATA(dbp,
352                            h, indx + deloffset)->type) && indx > 0 &&
353                            inp[indx] == inp[indx - adjust])
354                                indx -= adjust;
355                else
356                        while (B_DISSET(GET_BKEYDATA(dbp,
357                            h, indx + deloffset)->type) &&
358                            indx < (db_indx_t)(NUM_ENT(h) - adjust) &&
359                            inp[indx] == inp[indx + adjust])
360                                indx += adjust;
361
362                /*
363                 * If we weren't able to find a non-deleted duplicate, return
364                 * DB_NOTFOUND.
365                 */
366                if (B_DISSET(GET_BKEYDATA(dbp, h, indx + deloffset)->type))
367                        goto notfound;
368        }
369
370        if (LF_ISSET(S_STK_ONLY)) {
371                BT_STK_NUM(dbp->dbenv, cp, h, indx, ret);
372                __LPUT(dbc, lock);
373                (void)mpf->put(mpf, h, 0);
374        } else {
375                BT_STK_ENTER(dbp->dbenv, cp, h, indx, lock, lock_mode, ret);
376                if (ret != 0)
377                        goto err;
378        }
379        return (0);
380
381notfound:
382        /* Keep the page locked for serializability. */
383        (void)mpf->put(mpf, h, 0);
384        (void)__TLPUT(dbc, lock);
385        ret = DB_NOTFOUND;
386
387err:    BT_STK_POP(cp);
388        __bam_stkrel(dbc, 0);
389        return (ret);
390}
391
392/*
393 * __bam_stkrel --
394 *      Release all pages currently held in the stack.
395 *
396 * PUBLIC: int __bam_stkrel __P((DBC *, u_int32_t));
397 */
398int
399__bam_stkrel(dbc, flags)
400        DBC *dbc;
401        u_int32_t flags;
402{
403        BTREE_CURSOR *cp;
404        DB *dbp;
405        DB_MPOOLFILE *mpf;
406        EPG *epg;
407        int ret, t_ret;
408
409        dbp = dbc->dbp;
410        mpf = dbp->mpf;
411        cp = (BTREE_CURSOR *)dbc->internal;
412
413        /*
414         * Release inner pages first.
415         *
416         * The caller must be sure that setting STK_NOLOCK will not effect
417         * either serializability or recoverability.
418         */
419        for (ret = 0, epg = cp->sp; epg <= cp->csp; ++epg) {
420                if (epg->page != NULL) {
421                        if (LF_ISSET(STK_CLRDBC) && cp->page == epg->page) {
422                                cp->page = NULL;
423                                LOCK_INIT(cp->lock);
424                        }
425                        if ((t_ret =
426                            mpf->put(mpf, epg->page, 0)) != 0 && ret == 0)
427                                ret = t_ret;
428                        /*
429                         * XXX
430                         * Temporary fix for #3243 -- under certain deadlock
431                         * conditions we call here again and re-free the page.
432                         * The correct fix is to never release a stack that
433                         * doesn't hold items.
434                         */
435                        epg->page = NULL;
436                }
437                if (LF_ISSET(STK_NOLOCK))
438                        (void)__LPUT(dbc, epg->lock);
439                else
440                        (void)__TLPUT(dbc, epg->lock);
441        }
442
443        /* Clear the stack, all pages have been released. */
444        BT_STK_CLR(cp);
445
446        return (ret);
447}
448
449/*
450 * __bam_stkgrow --
451 *      Grow the stack.
452 *
453 * PUBLIC: int __bam_stkgrow __P((DB_ENV *, BTREE_CURSOR *));
454 */
455int
456__bam_stkgrow(dbenv, cp)
457        DB_ENV *dbenv;
458        BTREE_CURSOR *cp;
459{
460        EPG *p;
461        size_t entries;
462        int ret;
463
464        entries = cp->esp - cp->sp;
465
466        if ((ret = __os_calloc(dbenv, entries * 2, sizeof(EPG), &p)) != 0)
467                return (ret);
468        memcpy(p, cp->sp, entries * sizeof(EPG));
469        if (cp->sp != cp->stack)
470                __os_free(dbenv, cp->sp);
471        cp->sp = p;
472        cp->csp = p + entries;
473        cp->esp = p + entries * 2;
474        return (0);
475}
Note: See TracBrowser for help on using the repository browser.