source: trunk/third/evolution-data-server/libdb/btree/bt_delete.c @ 21189

Revision 21189, 12.2 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r21188, which included commits to RCS files with non-trunk default branches.
Line 
1/*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996-2002
5 *      Sleepycat Software.  All rights reserved.
6 */
7/*
8 * Copyright (c) 1990, 1993, 1994, 1995, 1996
9 *      Keith Bostic.  All rights reserved.
10 */
11/*
12 * Copyright (c) 1990, 1993, 1994, 1995
13 *      The Regents of the University of California.  All rights reserved.
14 *
15 * This code is derived from software contributed to Berkeley by
16 * Mike Olson.
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 * 1. Redistributions of source code must retain the above copyright
22 *    notice, this list of conditions and the following disclaimer.
23 * 2. Redistributions in binary form must reproduce the above copyright
24 *    notice, this list of conditions and the following disclaimer in the
25 *    documentation and/or other materials provided with the distribution.
26 * 3. Neither the name of the University nor the names of its contributors
27 *    may be used to endorse or promote products derived from this software
28 *    without specific prior written permission.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40 * SUCH DAMAGE.
41 */
42
43#include "db_config.h"
44
45#ifndef lint
46static const char revid[] = "$Id: bt_delete.c,v 1.1.1.1 2004-12-17 17:26:43 ghudson Exp $";
47#endif /* not lint */
48
49#ifndef NO_SYSTEM_INCLUDES
50#include <sys/types.h>
51
52#include <string.h>
53#endif
54
55#include "db_int.h"
56#include "dbinc/db_page.h"
57#include "dbinc/db_shash.h"
58#include "dbinc/btree.h"
59#include "dbinc/lock.h"
60
61/*
62 * __bam_ditem --
63 *      Delete one or more entries from a page.
64 *
65 * PUBLIC: int __bam_ditem __P((DBC *, PAGE *, u_int32_t));
66 */
67int
68__bam_ditem(dbc, h, indx)
69        DBC *dbc;
70        PAGE *h;
71        u_int32_t indx;
72{
73        BINTERNAL *bi;
74        BKEYDATA *bk;
75        DB *dbp;
76        DB_MPOOLFILE *mpf;
77        u_int32_t nbytes;
78        int ret;
79        db_indx_t *inp;
80
81        dbp = dbc->dbp;
82        mpf = dbp->mpf;
83        inp = P_INP(dbp, h);
84
85        switch (TYPE(h)) {
86        case P_IBTREE:
87                bi = GET_BINTERNAL(dbp, h, indx);
88                switch (B_TYPE(bi->type)) {
89                case B_DUPLICATE:
90                case B_KEYDATA:
91                        nbytes = BINTERNAL_SIZE(bi->len);
92                        break;
93                case B_OVERFLOW:
94                        nbytes = BINTERNAL_SIZE(bi->len);
95                        if ((ret =
96                            __db_doff(dbc, ((BOVERFLOW *)bi->data)->pgno)) != 0)
97                                return (ret);
98                        break;
99                default:
100                        return (__db_pgfmt(dbp->dbenv, PGNO(h)));
101                }
102                break;
103        case P_IRECNO:
104                nbytes = RINTERNAL_SIZE;
105                break;
106        case P_LBTREE:
107                /*
108                 * If it's a duplicate key, discard the index and don't touch
109                 * the actual page item.
110                 *
111                 * !!!
112                 * This works because no data item can have an index matching
113                 * any other index so even if the data item is in a key "slot",
114                 * it won't match any other index.
115                 */
116                if ((indx % 2) == 0) {
117                        /*
118                         * Check for a duplicate after us on the page.  NOTE:
119                         * we have to delete the key item before deleting the
120                         * data item, otherwise the "indx + P_INDX" calculation
121                         * won't work!
122                         */
123                        if (indx + P_INDX < (u_int32_t)NUM_ENT(h) &&
124                            inp[indx] == inp[indx + P_INDX])
125                                return (__bam_adjindx(dbc,
126                                    h, indx, indx + O_INDX, 0));
127                        /*
128                         * Check for a duplicate before us on the page.  It
129                         * doesn't matter if we delete the key item before or
130                         * after the data item for the purposes of this one.
131                         */
132                        if (indx > 0 && inp[indx] == inp[indx - P_INDX])
133                                return (__bam_adjindx(dbc,
134                                    h, indx, indx - P_INDX, 0));
135                }
136                /* FALLTHROUGH */
137        case P_LDUP:
138        case P_LRECNO:
139                bk = GET_BKEYDATA(dbp, h, indx);
140                switch (B_TYPE(bk->type)) {
141                case B_DUPLICATE:
142                        nbytes = BOVERFLOW_SIZE;
143                        break;
144                case B_OVERFLOW:
145                        nbytes = BOVERFLOW_SIZE;
146                        if ((ret = __db_doff(
147                            dbc, (GET_BOVERFLOW(dbp, h, indx))->pgno)) != 0)
148                                return (ret);
149                        break;
150                case B_KEYDATA:
151                        nbytes = BKEYDATA_SIZE(bk->len);
152                        break;
153                default:
154                        return (__db_pgfmt(dbp->dbenv, PGNO(h)));
155                }
156                break;
157        default:
158                return (__db_pgfmt(dbp->dbenv, PGNO(h)));
159        }
160
161        /* Delete the item and mark the page dirty. */
162        if ((ret = __db_ditem(dbc, h, indx, nbytes)) != 0)
163                return (ret);
164        if ((ret = mpf->set(mpf, h, DB_MPOOL_DIRTY)) != 0)
165                return (ret);
166
167        return (0);
168}
169
170/*
171 * __bam_adjindx --
172 *      Adjust an index on the page.
173 *
174 * PUBLIC: int __bam_adjindx __P((DBC *, PAGE *, u_int32_t, u_int32_t, int));
175 */
176int
177__bam_adjindx(dbc, h, indx, indx_copy, is_insert)
178        DBC *dbc;
179        PAGE *h;
180        u_int32_t indx, indx_copy;
181        int is_insert;
182{
183        DB *dbp;
184        DB_MPOOLFILE *mpf;
185        db_indx_t copy, *inp;
186        int ret;
187
188        dbp = dbc->dbp;
189        mpf = dbp->mpf;
190        inp = P_INP(dbp, h);
191
192        /* Log the change. */
193        if (DBC_LOGGING(dbc)) {
194            if ((ret = __bam_adj_log(dbp, dbc->txn, &LSN(h), 0,
195                PGNO(h), &LSN(h), indx, indx_copy, (u_int32_t)is_insert)) != 0)
196                        return (ret);
197        } else
198                LSN_NOT_LOGGED(LSN(h));
199
200        /* Shuffle the indices and mark the page dirty. */
201        if (is_insert) {
202                copy = inp[indx_copy];
203                if (indx != NUM_ENT(h))
204                        memmove(&inp[indx + O_INDX], &inp[indx],
205                            sizeof(db_indx_t) * (NUM_ENT(h) - indx));
206                inp[indx] = copy;
207                ++NUM_ENT(h);
208        } else {
209                --NUM_ENT(h);
210                if (indx != NUM_ENT(h))
211                        memmove(&inp[indx], &inp[indx + O_INDX],
212                            sizeof(db_indx_t) * (NUM_ENT(h) - indx));
213        }
214        if ((ret = mpf->set(mpf, h, DB_MPOOL_DIRTY)) != 0)
215                return (ret);
216
217        return (0);
218}
219
220/*
221 * __bam_dpages --
222 *      Delete a set of locked pages.
223 *
224 * PUBLIC: int __bam_dpages __P((DBC *, EPG *));
225 */
226int
227__bam_dpages(dbc, stack_epg)
228        DBC *dbc;
229        EPG *stack_epg;
230{
231        BTREE_CURSOR *cp;
232        BINTERNAL *bi;
233        DB *dbp;
234        DBT a, b;
235        DB_LOCK c_lock, p_lock;
236        DB_MPOOLFILE *mpf;
237        EPG *epg;
238        PAGE *child, *parent;
239        db_indx_t nitems;
240        db_pgno_t pgno, root_pgno;
241        db_recno_t rcnt;
242        int done, ret, t_ret;
243
244        dbp = dbc->dbp;
245        mpf = dbp->mpf;
246        cp = (BTREE_CURSOR *)dbc->internal;
247
248        /*
249         * We have the entire stack of deletable pages locked.
250         *
251         * Btree calls us with a pointer to the beginning of a stack, where
252         * the first page in the stack is to have a single item deleted, and
253         * the rest of the pages are to be removed.
254         *
255         * Recno calls us with a pointer into the middle of the stack, where
256         * the referenced page is to have a single item deleted, and pages
257         * after the stack reference are to be removed.
258         *
259         * First, discard any pages that we don't care about.
260         */
261        ret = 0;
262        for (epg = cp->sp; epg < stack_epg; ++epg) {
263                if ((t_ret = mpf->put(mpf, epg->page, 0)) != 0 && ret == 0)
264                        ret = t_ret;
265                (void)__TLPUT(dbc, epg->lock);
266        }
267        if (ret != 0)
268                goto err;
269
270        /*
271         * !!!
272         * There is an interesting deadlock situation here.  We have to relink
273         * the leaf page chain around the leaf page being deleted.  Consider
274         * a cursor walking through the leaf pages, that has the previous page
275         * read-locked and is waiting on a lock for the page we're deleting.
276         * It will deadlock here.  Before we unlink the subtree, we relink the
277         * leaf page chain.
278         */
279        if ((ret = __db_relink(dbc, DB_REM_PAGE, cp->csp->page, NULL, 1)) != 0)
280                goto err;
281
282        /*
283         * Delete the last item that references the underlying pages that are
284         * to be deleted, and adjust cursors that reference that page.  Then,
285         * save that page's page number and item count and release it.  If
286         * the application isn't retaining locks because it's running without
287         * transactions, this lets the rest of the tree get back to business
288         * immediately.
289         */
290        if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
291                goto err;
292        if ((ret = __bam_ca_di(dbc, PGNO(epg->page), epg->indx, -1)) != 0)
293                goto err;
294
295        pgno = PGNO(epg->page);
296        nitems = NUM_ENT(epg->page);
297
298        if ((ret = mpf->put(mpf, epg->page, 0)) != 0)
299                goto err_inc;
300        (void)__TLPUT(dbc, epg->lock);
301
302        /* Free the rest of the pages in the stack. */
303        while (++epg <= cp->csp) {
304                /*
305                 * Delete page entries so they will be restored as part of
306                 * recovery.  We don't need to do cursor adjustment here as
307                 * the pages are being emptied by definition and so cannot
308                 * be referenced by a cursor.
309                 */
310                if (NUM_ENT(epg->page) != 0) {
311                        DB_ASSERT(NUM_ENT(epg->page) == 1);
312
313                        if ((ret = __bam_ditem(dbc, epg->page, epg->indx)) != 0)
314                                goto err;
315                }
316
317                if ((ret = __db_free(dbc, epg->page)) != 0) {
318                        epg->page = NULL;
319                        goto err_inc;
320                }
321                (void)__TLPUT(dbc, epg->lock);
322        }
323
324        if (0) {
325err_inc:        ++epg;
326err:            for (; epg <= cp->csp; ++epg) {
327                        if (epg->page != NULL)
328                                (void)mpf->put(mpf, epg->page, 0);
329                        (void)__TLPUT(dbc, epg->lock);
330                }
331                BT_STK_CLR(cp);
332                return (ret);
333        }
334        BT_STK_CLR(cp);
335
336        /*
337         * If we just deleted the next-to-last item from the root page, the
338         * tree can collapse one or more levels.  While there remains only a
339         * single item on the root page, write lock the last page referenced
340         * by the root page and copy it over the root page.
341         */
342        root_pgno = cp->root;
343        if (pgno != root_pgno || nitems != 1)
344                return (0);
345
346        for (done = 0; !done;) {
347                /* Initialize. */
348                parent = child = NULL;
349                LOCK_INIT(p_lock);
350                LOCK_INIT(c_lock);
351
352                /* Lock the root. */
353                pgno = root_pgno;
354                if ((ret =
355                    __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &p_lock)) != 0)
356                        goto stop;
357                if ((ret = mpf->get(mpf, &pgno, 0, &parent)) != 0)
358                        goto stop;
359
360                if (NUM_ENT(parent) != 1)
361                        goto stop;
362
363                switch (TYPE(parent)) {
364                case P_IBTREE:
365                        /*
366                         * If this is overflow, then try to delete it.
367                         * The child may or may not still point at it.
368                         */
369                        bi = GET_BINTERNAL(dbp, parent, 0);
370                        if (B_TYPE(bi->type) == B_OVERFLOW)
371                                if ((ret = __db_doff(dbc,
372                                    ((BOVERFLOW *)bi->data)->pgno)) != 0)
373                                        goto stop;
374                        pgno = bi->pgno;
375                        break;
376                case P_IRECNO:
377                        pgno = GET_RINTERNAL(dbp, parent, 0)->pgno;
378                        break;
379                default:
380                        goto stop;
381                }
382
383                /* Lock the child page. */
384                if ((ret =
385                    __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, &c_lock)) != 0)
386                        goto stop;
387                if ((ret = mpf->get(mpf, &pgno, 0, &child)) != 0)
388                        goto stop;
389
390                /* Log the change. */
391                if (DBC_LOGGING(dbc)) {
392                        memset(&a, 0, sizeof(a));
393                        a.data = child;
394                        a.size = dbp->pgsize;
395                        memset(&b, 0, sizeof(b));
396                        b.data = P_ENTRY(dbp, parent, 0);
397                        b.size = TYPE(parent) == P_IRECNO ? RINTERNAL_SIZE :
398                            BINTERNAL_SIZE(((BINTERNAL *)b.data)->len);
399                        if ((ret = __bam_rsplit_log(dbp, dbc->txn,
400                            &child->lsn, 0, PGNO(child), &a, PGNO(parent),
401                            RE_NREC(parent), &b, &parent->lsn)) != 0)
402                                goto stop;
403                } else
404                        LSN_NOT_LOGGED(child->lsn);
405
406                /*
407                 * Make the switch.
408                 *
409                 * One fixup -- internal pages below the top level do not store
410                 * a record count, so we have to preserve it if we're not
411                 * converting to a leaf page.  Note also that we are about to
412                 * overwrite the parent page, including its LSN.  This is OK
413                 * because the log message we wrote describing this update
414                 * stores its LSN on the child page.  When the child is copied
415                 * onto the parent, the correct LSN is copied into place.
416                 */
417                COMPQUIET(rcnt, 0);
418                if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
419                        rcnt = RE_NREC(parent);
420                memcpy(parent, child, dbp->pgsize);
421                PGNO(parent) = root_pgno;
422                if (F_ISSET(cp, C_RECNUM) && LEVEL(child) > LEAFLEVEL)
423                        RE_NREC_SET(parent, rcnt);
424
425                /* Mark the pages dirty. */
426                if ((ret = mpf->set(mpf, parent, DB_MPOOL_DIRTY)) != 0)
427                        goto stop;
428                if ((ret = mpf->set(mpf, child, DB_MPOOL_DIRTY)) != 0)
429                        goto stop;
430
431                /* Adjust the cursors. */
432                if ((ret = __bam_ca_rsplit(dbc, PGNO(child), root_pgno)) != 0)
433                        goto stop;
434
435                /*
436                 * Free the page copied onto the root page and discard its
437                 * lock.  (The call to __db_free() discards our reference
438                 * to the page.)
439                 */
440                if ((ret = __db_free(dbc, child)) != 0) {
441                        child = NULL;
442                        goto stop;
443                }
444                child = NULL;
445
446                if (0) {
447stop:                   done = 1;
448                }
449                (void)__TLPUT(dbc, p_lock);
450                if (parent != NULL &&
451                    (t_ret = mpf->put(mpf, parent, 0)) != 0 && ret == 0)
452                        ret = t_ret;
453                (void)__TLPUT(dbc, c_lock);
454                if (child != NULL &&
455                    (t_ret = mpf->put(mpf, child, 0)) != 0 && ret == 0)
456                        ret = t_ret;
457        }
458
459        return (ret);
460}
Note: See TracBrowser for help on using the repository browser.