@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.92 2002/09/04 20:31:10 momjian Exp $
+ *    $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.93 2002/10/20 20:47:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -603,12 +603,21 @@ btbulkdelete(PG_FUNCTION_ARGS)
      * loop, we skip most of the wrapper layers of index_getnext and
      * instead call _bt_step directly.  This implies holding buffer lock
      * on a target page throughout the loop over the page's tuples.
-     * Initially, we have a read lock acquired by _bt_step when we stepped
-     * onto the page.  If we find a tuple we need to delete, we trade in
-     * the read lock for an exclusive write lock; after that, we hold the
-     * write lock until we step off the page (fortunately, _bt_relbuf
-     * doesn't care which kind of lock it's releasing).  This should
-     * minimize the amount of work needed per page.
+     *
+     * Whenever we step onto a new page, we have to trade in the read
+     * lock acquired by _bt_first or _bt_step for an exclusive write lock
+     * (fortunately, _bt_relbuf doesn't care which kind of lock it's
+     * releasing when it comes time for _bt_step to release our lock).
+     *
+     * Note that we exclusive-lock every leaf page, or at least every one
+     * containing data items.  It sounds attractive to only exclusive-lock
+     * those containing items we need to delete, but unfortunately that
+     * is not safe: we could then pass a stopped indexscan, which could
+     * in rare cases lead to deleting the item it needs to find when it
+     * resumes.  (See _bt_restscan --- this could only happen if an indexscan
+     * stops on a deletable item and then a page split moves that item
+     * into a page further to its right, which the indexscan will have no
+     * pin on.)
      */
     scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
     so = (BTScanOpaque) scan->opaque;
@@ -620,7 +629,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
         Buffer      buf;
         BlockNumber lockedBlock = InvalidBlockNumber;

-        /* we have the buffer pinned and locked */
+        /* we have the buffer pinned and read-locked */
         buf = so->btso_curbuf;
         Assert(BufferIsValid(buf));

@@ -637,65 +646,59 @@ btbulkdelete(PG_FUNCTION_ARGS)
             CHECK_FOR_INTERRUPTS();

             /* current is the next index tuple */
+            page = BufferGetPage(buf);
             blkno = ItemPointerGetBlockNumber(current);
+
+            /*
+             * Make sure we have a super-exclusive write lock on this page.
+             *
+             * We assume that only concurrent insertions, not deletions,
+             * can occur while we're not holding the page lock (the
+             * caller should hold a suitable relation lock to ensure
+             * this).  Therefore, no items can escape being scanned because
+             * of this temporary lock release.
+             */
+            if (blkno != lockedBlock)
+            {
+                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+                LockBufferForCleanup(buf);
+                lockedBlock = blkno;
+                /*
+                 * If the page was formerly rightmost but was split while we
+                 * didn't hold the lock, and ip_posid is pointing to item
+                 * 1, then ip_posid now points at the high key not a valid
+                 * data item.  In this case we need to step forward.
+                 */
+                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+                if (current->ip_posid < P_FIRSTDATAKEY(opaque))
+                    current->ip_posid = P_FIRSTDATAKEY(opaque);
+            }
+
             offnum = ItemPointerGetOffsetNumber(current);
-            page = BufferGetPage(buf);
             btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
             itup = &btitem->bti_itup;
             htup = &(itup->t_tid);

             if (callback(htup, callback_state))
             {
-                /*
-                 * If this is first deletion on this page, trade in read
-                 * lock for a really-exclusive write lock.  Then, step
-                 * back one and re-examine the item, because other
-                 * backends might have inserted item(s) while we weren't
-                 * holding the lock!
-                 *
-                 * We assume that only concurrent insertions, not deletions,
-                 * can occur while we're not holding the page lock (the
-                 * caller should hold a suitable relation lock to ensure
-                 * this).  Therefore, the item we want to delete is either
-                 * in the same slot as before, or some slot to its right.
-                 * Rechecking the same slot is necessary and sufficient to
-                 * get back in sync after any insertions.
-                 */
-                if (blkno != lockedBlock)
-                {
-                    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-                    LockBufferForCleanup(buf);
-                    lockedBlock = blkno;
-                }
-                else
-                {
-                    /* Okay to delete the item from the page */
-                    _bt_itemdel(rel, buf, current);
-
-                    /* Mark buffer dirty, but keep the lock and pin */
-                    WriteNoReleaseBuffer(buf);
-
-                    tuples_removed += 1;
-                }
+                /* Okay to delete the item from the page */
+                _bt_itemdel(rel, buf, current);
+
+                /* Mark buffer dirty, but keep the lock and pin */
+                WriteNoReleaseBuffer(buf);
+
+                tuples_removed += 1;

                 /*
-                 * In either case, we now need to back up the scan one
-                 * item, so that the next cycle will re-examine the same
-                 * offnum on this page.
+                 * We now need to back up the scan one item, so that the next
+                 * cycle will re-examine the same offnum on this page (which
+                 * now holds the next item).
                  *
                  * For now, just hack the current-item index.  Will need to
                  * be smarter when deletion includes removal of empty
                  * index pages.
-                 *
-                 * We must decrement ip_posid in all cases but one: if the
-                 * page was formerly rightmost but was split while we
-                 * didn't hold the lock, and ip_posid is pointing to item
-                 * 1, then ip_posid now points at the high key not a valid
-                 * data item.  In this case we do want to step forward.
                  */
-                opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-                if (current->ip_posid >= P_FIRSTDATAKEY(opaque))
-                    current->ip_posid--;
+                current->ip_posid--;
             }
             else
                 num_index_tuples += 1;
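
[Annotation, not part of the patch] The lock upgrade in the hunk above is deliberately non-atomic: LockBuffer(buf, BUFFER_LOCK_UNLOCK) drops the shared lock before LockBufferForCleanup(buf) acquires the super-exclusive one, and concurrent insertions can land on the page in between (the comment's stated assumption is that the caller's relation lock rules out concurrent deletions). That is exactly why the new code revalidates ip_posid after relocking. Below is a minimal standalone C sketch of the same unlock, relock, revalidate discipline; the in-memory page, its item array, and the revalidation rule are hypothetical stand-ins, a POSIX rwlock replaces the buffer-manager lock, and the pin-waiting behavior of a cleanup lock is not modeled.

/*
 * Illustration only: hypothetical in-memory "page" standing in for a
 * PostgreSQL buffer.  pthread rwlocks cannot upgrade read->write
 * atomically, and neither can the buffer manager, hence the revalidation.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t page_lock = PTHREAD_RWLOCK_INITIALIZER;
static int page_items[16] = {101, 102, 103};
static int page_nitems = 3;

/*
 * Delete the item the scan stopped on, given only its slot number.
 * Assumption mirrored from the patch: while we hold no lock, other
 * threads may insert into the page (moving items right) but never
 * delete, so searching from *pos rightward is sufficient.
 */
static void delete_at(int *pos)
{
    int target;

    pthread_rwlock_rdlock(&page_lock);
    target = page_items[*pos];      /* remember which item we mean */

    /* drop the shared lock and reacquire exclusively (not atomic!) */
    pthread_rwlock_unlock(&page_lock);
    pthread_rwlock_wrlock(&page_lock);

    /* revalidate: an inserter may have shifted our item to the right */
    while (*pos < page_nitems && page_items[*pos] != target)
        (*pos)++;

    if (*pos < page_nitems)
    {
        for (int i = *pos; i < page_nitems - 1; i++)
            page_items[i] = page_items[i + 1];
        page_nitems--;
        printf("deleted %d from slot %d\n", target, *pos);
    }
    pthread_rwlock_unlock(&page_lock);
}

int main(void)
{
    int pos = 1;                    /* pretend the scan stopped on 102 */

    delete_at(&pos);
    return 0;
}

The essential point carried over from the patch is that any position computed under the old lock must be rechecked once the stronger lock is finally held.
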
@@ -717,6 +720,16 @@ btbulkdelete(PG_FUNCTION_ARGS)

 /*
  * Restore scan position when btgettuple is called to continue a scan.
+ *
+ * This is nontrivial because concurrent insertions might have moved the
+ * index tuple we stopped on.  We assume the tuple can only have moved to
+ * the right from our stop point, because we kept a pin on the buffer,
+ * and so no deletion can have occurred on that page.
+ *
+ * On entry, we have a pin but no read lock on the buffer that contained
+ * the index tuple we stopped the scan on.  On exit, we have pin and read
+ * lock on the buffer that now contains that index tuple, and the scandesc's
+ * current position is updated to point at it.
  */
 static void
 _bt_restscan(IndexScanDesc scan)
@@ -729,13 +742,14 @@ _bt_restscan(IndexScanDesc scan)
     OffsetNumber offnum = ItemPointerGetOffsetNumber(current),
                 maxoff;
     BTPageOpaque opaque;
+    Buffer      nextbuf;
     ItemPointerData target = so->curHeapIptr;
     BTItem      item;
     BlockNumber blkno;

     /*
-     * Get back the read lock we were holding on the buffer.  (We still
-     * have a reference-count pin on it, so need not get that.)
+     * Reacquire read lock on the buffer.  (We should still have
+     * a reference-count pin on it, so need not get that.)
      */
     LockBuffer(buf, BT_READ);

@@ -747,7 +761,7 @@ _bt_restscan(IndexScanDesc scan)
      * We use this as flag when first index tuple on page is deleted but
      * we do not move left (this would slowdown vacuum) - so we set
      * current->ip_posid before first index tuple on the current page
-     * (_bt_step will move it right)...
+     * (_bt_step will move it right)... XXX still needed?
      */
     if (!ItemPointerIsValid(&target))
     {
@@ -758,7 +772,7 @@ _bt_restscan(IndexScanDesc scan)

     /*
      * The item we were on may have moved right due to insertions.  Find it
-     * again.
+     * again.  We use the heap TID to identify the item uniquely.
      */
     for (;;)
     {
@@ -774,28 +788,33 @@ _bt_restscan(IndexScanDesc scan)
                 target.ip_blkid.bi_lo &&
                 item->bti_itup.t_tid.ip_posid == target.ip_posid)
             {
+                /* Found it */
                 current->ip_posid = offnum;
                 return;
             }
         }

         /*
-         * By here, the item we're looking for moved right at least one
-         * page
+         * The item we're looking for moved right at least one page, so
+         * move right.  We are careful here to pin and read-lock the next
+         * page before releasing the current one.  This ensures that a
+         * concurrent btbulkdelete scan cannot pass our position --- if it
+         * did, it might be able to reach and delete our target item before
+         * we can find it again.
          */
         if (P_RIGHTMOST(opaque))
             elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
                  "\n\tRecreate index %s.", RelationGetRelationName(rel));

         blkno = opaque->btpo_next;
+        nextbuf = _bt_getbuf(rel, blkno, BT_READ);
         _bt_relbuf(rel, buf);
-        buf = _bt_getbuf(rel, blkno, BT_READ);
+        so->btso_curbuf = buf = nextbuf;
         page = BufferGetPage(buf);
         maxoff = PageGetMaxOffsetNumber(page);
         opaque = (BTPageOpaque) PageGetSpecialPointer(page);
         offnum = P_FIRSTDATAKEY(opaque);
         ItemPointerSet(current, blkno, offnum);
-        so->btso_curbuf = buf;
     }
 }

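
[Annotation, not part of the patch] The ordering change at the bottom of _bt_restscan is the heart of the fix: _bt_getbuf on the right sibling now runs before _bt_relbuf on the current page, so the scan always holds at least one page, and a concurrent btbulkdelete, which also sweeps left to right, can never slip past it. Here is a standalone C sketch of that hand-over-hand (lock-coupling) step; the Page struct and key field are hypothetical stand-ins for btree leaf pages chained by btpo_next, rwlocks stand in for buffer locks, and buffer pins are not modeled.

/*
 * Illustration only: hypothetical Page struct standing in for btree
 * leaf pages chained by btpo_next.
 */
#include <pthread.h>
#include <stdio.h>

typedef struct Page
{
    pthread_rwlock_t lock;
    int              first_key;
    struct Page     *next;          /* right sibling, like btpo_next */
} Page;

/*
 * Step to the right sibling with lock coupling: take the next page's
 * lock BEFORE releasing the current one, so a left-to-right sweeper
 * (like btbulkdelete) can never overtake us in the gap.  Returns NULL
 * at the rightmost page, leaving the caller's lock intact.
 */
static Page *step_right(Page *cur)
{
    Page *next = cur->next;

    if (next == NULL)
        return NULL;
    pthread_rwlock_rdlock(&next->lock); /* acquire right sibling first */
    pthread_rwlock_unlock(&cur->lock);  /* only then let go of current */
    return next;
}

int main(void)
{
    static Page b = {PTHREAD_RWLOCK_INITIALIZER, 200, NULL};
    static Page a = {PTHREAD_RWLOCK_INITIALIZER, 100, &b};
    Page *p = &a;

    pthread_rwlock_rdlock(&p->lock);
    for (;;)
    {
        Page *next = step_right(p);

        if (next == NULL)
            break;
        p = next;
        printf("stepped onto page with first key %d\n", p->first_key);
    }
    pthread_rwlock_unlock(&p->lock);    /* release the rightmost page */
    return 0;
}

Because both traversals move strictly left to right, taking the sibling's lock before releasing the current one cannot deadlock; it simply serializes the two scans at each page boundary.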