|
12 | 12 | * Portions Copyright (c) 1994, Regents of the University of California
|
13 | 13 | *
|
14 | 14 | * IDENTIFICATION
|
15 |
| - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.97 2003/02/23 06:17:13 tgl Exp $ |
| 15 | + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.98 2003/02/23 22:43:08 tgl Exp $ |
16 | 16 | *
|
17 | 17 | *-------------------------------------------------------------------------
|
18 | 18 | */
|
@@ -572,121 +572,110 @@ btbulkdelete(PG_FUNCTION_ARGS)
|
572 | 572 | IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1);
|
573 | 573 | void *callback_state = (void *) PG_GETARG_POINTER(2);
|
574 | 574 | IndexBulkDeleteResult *result;
|
575 |
| - BlockNumber num_pages; |
576 | 575 | double tuples_removed;
|
577 | 576 | double num_index_tuples;
|
578 |
| - IndexScanDesc scan; |
579 |
| - BTScanOpaque so; |
580 |
| - ItemPointer current; |
| 577 | + OffsetNumber deletable[BLCKSZ / sizeof(OffsetNumber)]; |
| 578 | + int ndeletable; |
| 579 | + Buffer buf; |
| 580 | + BlockNumber num_pages; |
581 | 581 |
|
582 | 582 | tuples_removed = 0;
|
583 | 583 | num_index_tuples = 0;
|
584 | 584 |
|
585 | 585 | /*
|
586 |
| - * We use a standard IndexScanDesc scan object, but to speed up the |
587 |
| - * loop, we skip most of the wrapper layers of index_getnext and |
588 |
| - * instead call _bt_step directly. This implies holding buffer lock |
589 |
| - * on a target page throughout the loop over the page's tuples. |
590 |
| - * |
591 |
| - * Whenever we step onto a new page, we have to trade in the read |
592 |
| - * lock acquired by _bt_first or _bt_step for an exclusive write lock |
593 |
| - * (fortunately, _bt_relbuf doesn't care which kind of lock it's |
594 |
| - * releasing when it comes time for _bt_step to release our lock). |
| 586 | + * The outer loop iterates over index leaf pages, the inner over items |
| 587 | + * on a leaf page. We issue just one _bt_delitems() call per page, |
| 588 | + * so as to minimize WAL traffic. |
595 | 589 | *
|
596 |
| - * Note that we exclusive-lock every leaf page, or at least every one |
597 |
| - * containing data items. It sounds attractive to only exclusive-lock |
| 590 | + * Note that we exclusive-lock every leaf page containing data items, |
| 591 | + * in sequence left to right. It sounds attractive to only exclusive-lock |
598 | 592 | * those containing items we need to delete, but unfortunately that
|
599 | 593 | * is not safe: we could then pass a stopped indexscan, which could
|
600 | 594 | * in rare cases lead to deleting the item it needs to find when it
|
601 | 595 | * resumes. (See _bt_restscan --- this could only happen if an indexscan
|
602 | 596 | * stops on a deletable item and then a page split moves that item
|
603 | 597 | * into a page further to its right, which the indexscan will have no
|
604 |
| - * pin on.) |
| 598 | + * pin on.) We can skip obtaining exclusive lock on empty pages |
| 599 | + * though, since no indexscan could be stopped on those. |
605 | 600 | */
|
606 |
| - scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL); |
607 |
| - so = (BTScanOpaque) scan->opaque; |
608 |
| - current = &(scan->currentItemData); |
609 |
| - |
610 |
| - /* Use _bt_first to get started, then _bt_step to remaining tuples */ |
611 |
| - if (_bt_first(scan, ForwardScanDirection)) |
| 601 | + buf = _bt_get_endpoint(rel, 0, false); |
| 602 | + if (BufferIsValid(buf)) /* check for empty index */ |
612 | 603 | {
|
613 |
| - Buffer buf; |
614 |
| - BlockNumber lockedBlock = InvalidBlockNumber; |
615 |
| - |
616 |
| - /* we have the buffer pinned and read-locked */ |
617 |
| - buf = so->btso_curbuf; |
618 |
| - Assert(BufferIsValid(buf)); |
619 |
| - |
620 |
| - do |
| 604 | + for (;;) |
621 | 605 | {
|
622 | 606 | Page page;
|
623 |
| - BlockNumber blkno; |
624 |
| - OffsetNumber offnum; |
625 |
| - BTItem btitem; |
626 | 607 | BTPageOpaque opaque;
|
627 |
| - IndexTuple itup; |
628 |
| - ItemPointer htup; |
| 608 | + OffsetNumber offnum, |
| 609 | + minoff, |
| 610 | + maxoff; |
| 611 | + BlockNumber nextpage; |
629 | 612 |
|
630 | 613 | CHECK_FOR_INTERRUPTS();
|
631 | 614 |
|
632 |
| - /* current is the next index tuple */ |
| 615 | + ndeletable = 0; |
633 | 616 | page = BufferGetPage(buf);
|
634 |
| - blkno = ItemPointerGetBlockNumber(current); |
635 |
| - |
636 |
| - /* |
637 |
| - * Make sure we have a super-exclusive write lock on this page. |
638 |
| - * |
639 |
| - * We assume that only concurrent insertions, not deletions, |
640 |
| - * can occur while we're not holding the page lock (the |
641 |
| - * caller should hold a suitable relation lock to ensure |
642 |
| - * this). Therefore, no items can escape being scanned because |
643 |
| - * of this temporary lock release. |
644 |
| - */ |
645 |
| - if (blkno != lockedBlock) |
| 617 | + opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
| 618 | + minoff = P_FIRSTDATAKEY(opaque); |
| 619 | + maxoff = PageGetMaxOffsetNumber(page); |
| 620 | + /* We probably cannot see deleted pages, but skip 'em if so */ |
| 621 | + if (minoff <= maxoff && !P_ISDELETED(opaque)) |
646 | 622 | {
|
| 623 | + /* |
| 624 | + * Trade in the initial read lock for a super-exclusive |
| 625 | + * write lock on this page. |
| 626 | + */ |
647 | 627 | LockBuffer(buf, BUFFER_LOCK_UNLOCK);
|
648 | 628 | LockBufferForCleanup(buf);
|
649 |
| - lockedBlock = blkno; |
650 | 629 | /*
|
651 |
| - * If the page was formerly rightmost but was split while we |
652 |
| - * didn't hold the lock, and ip_posid is pointing to item |
653 |
| - * 1, then ip_posid now points at the high key not a valid |
654 |
| - * data item. In this case we need to step forward. |
| 630 | + * Recompute minoff/maxoff, both of which could have changed |
| 631 | + * while we weren't holding the lock. |
655 | 632 | */
|
656 |
| - opaque = (BTPageOpaque) PageGetSpecialPointer(page); |
657 |
| - if (current->ip_posid < P_FIRSTDATAKEY(opaque)) |
658 |
| - current->ip_posid = P_FIRSTDATAKEY(opaque); |
659 |
| - } |
660 |
| - |
661 |
| - offnum = ItemPointerGetOffsetNumber(current); |
662 |
| - btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum)); |
663 |
| - itup = &btitem->bti_itup; |
664 |
| - htup = &(itup->t_tid); |
665 |
| - |
666 |
| - if (callback(htup, callback_state)) |
667 |
| - { |
668 |
| - /* Okay to delete the item from the page */ |
669 |
| - _bt_itemdel(rel, buf, current); |
670 |
| - |
671 |
| - /* Mark buffer dirty, but keep the lock and pin */ |
672 |
| - WriteNoReleaseBuffer(buf); |
673 |
| - |
674 |
| - tuples_removed += 1; |
675 |
| - |
| 633 | + minoff = P_FIRSTDATAKEY(opaque); |
| 634 | + maxoff = PageGetMaxOffsetNumber(page); |
676 | 635 | /*
|
677 |
| - * We now need to back up the scan one item, so that the next |
678 |
| - * cycle will re-examine the same offnum on this page (which |
679 |
| - * now holds the next item). |
| 636 | + * Scan over all items to see which ones need to be deleted |
| 637 | + * according to the callback function. |
680 | 638 | */
|
681 |
| - current->ip_posid--; |
| 639 | + for (offnum = minoff; |
| 640 | + offnum <= maxoff; |
| 641 | + offnum = OffsetNumberNext(offnum)) |
| 642 | + { |
| 643 | + BTItem btitem; |
| 644 | + ItemPointer htup; |
| 645 | + |
| 646 | + btitem = (BTItem) PageGetItem(page, |
| 647 | + PageGetItemId(page, offnum)); |
| 648 | + htup = &(btitem->bti_itup.t_tid); |
| 649 | + if (callback(htup, callback_state)) |
| 650 | + { |
| 651 | + deletable[ndeletable++] = offnum; |
| 652 | + tuples_removed += 1; |
| 653 | + } |
| 654 | + else |
| 655 | + num_index_tuples += 1; |
| 656 | + } |
| 657 | + } |
| 658 | + /* |
| 659 | + * If we need to delete anything, do it and write the buffer; |
| 660 | + * else just release the buffer. |
| 661 | + */ |
| 662 | + nextpage = opaque->btpo_next; |
| 663 | + if (ndeletable > 0) |
| 664 | + { |
| 665 | + _bt_delitems(rel, buf, deletable, ndeletable); |
| 666 | + _bt_wrtbuf(rel, buf); |
682 | 667 | }
|
683 | 668 | else
|
684 |
| - num_index_tuples += 1; |
685 |
| - } while (_bt_step(scan, &buf, ForwardScanDirection)); |
| 669 | + { |
| 670 | + _bt_relbuf(rel, buf); |
| 671 | + } |
| 672 | + /* And advance to next page, if any */ |
| 673 | + if (nextpage == P_NONE) |
| 674 | + break; |
| 675 | + buf = _bt_getbuf(rel, nextpage, BT_READ); |
| 676 | + } |
686 | 677 | }
|
687 | 678 |
|
688 |
| - index_endscan(scan); |
689 |
| - |
690 | 679 | /* return statistics */
|
691 | 680 | num_pages = RelationGetNumberOfBlocks(rel);
|
692 | 681 |
|
@@ -765,7 +754,7 @@ btvacuumcleanup(PG_FUNCTION_ARGS)
|
765 | 754 | }
|
766 | 755 | }
|
767 | 756 | else if ((opaque->btpo_flags & BTP_HALF_DEAD) ||
|
768 |
| - P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) |
| 757 | + P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page)) |
769 | 758 | {
|
770 | 759 | /* Empty, try to delete */
|
771 | 760 | int ndel;
|
|
0 commit comments