
Commit 5749f6e

Rewrite btree vacuuming to fold the former bulkdelete and cleanup operations into a single mostly-physical-order scan of the index. This requires some ticklish interlocking considerations, but should create no material performance impact on normal index operations (at least given the already-committed changes to make scans work a page at a time). VACUUM itself should get significantly faster in any index that's degenerated to a very nonlinear page order. Also, we save one pass over the index entirely, except in the case where there were no deletions to do and so only one pass happened anyway.

Original patch by Heikki Linnakangas, rework by Tom Lane.
1 parent 09cb5c0 commit 5749f6e

File tree

10 files changed: +682 −243 lines changed


src/backend/access/nbtree/README

Lines changed: 27 additions & 5 deletions

@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.11 2006/05/07 01:21:30 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.12 2006/05/08 00:00:09 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -293,10 +293,32 @@ as part of the atomic update for the delete (either way, the metapage has
 to be the last page locked in the update to avoid deadlock risks).  This
 avoids race conditions if two such operations are executing concurrently.
 
-VACUUM needs to do a linear scan of an index to search for empty leaf
-pages and half-dead parent pages that can be deleted, as well as deleted
-pages that can be reclaimed because they are older than all open
-transactions.
+VACUUM needs to do a linear scan of an index to search for deleted pages
+that can be reclaimed because they are older than all open transactions.
+For efficiency's sake, we'd like to use the same linear scan to search for
+deletable tuples.  Before Postgres 8.2, btbulkdelete scanned the leaf pages
+in index order, but it is possible to visit them in physical order instead.
+The tricky part of this is to avoid missing any deletable tuples in the
+presence of concurrent page splits: a page split could easily move some
+tuples from a page not yet passed over by the sequential scan to a
+lower-numbered page already passed over.  (This wasn't a concern for the
+index-order scan, because splits always split right.)  To implement this,
+we provide a "vacuum cycle ID" mechanism that makes it possible to
+determine whether a page has been split since the current btbulkdelete
+cycle started.  If btbulkdelete finds a page that has been split since
+it started, and has a right-link pointing to a lower page number, then
+it temporarily suspends its sequential scan and visits that page instead.
+It must continue to follow right-links and vacuum dead tuples until
+reaching a page that either hasn't been split since btbulkdelete started,
+or is above the location of the outer sequential scan.  Then it can resume
+the sequential scan.  This ensures that all tuples are visited.  It may be
+that some tuples are visited twice, but that has no worse effect than an
+inaccurate index tuple count (and we can't guarantee an accurate count
+anyway in the face of concurrent activity).  Note that this still works
+if the has-been-recently-split test has a small probability of false
+positives, so long as it never gives a false negative.  This makes it
+possible to implement the test with a small counter value stored on each
+index page.
 
 WAL considerations
 ------------------

src/backend/access/nbtree/nbtinsert.c

Lines changed: 22 additions & 2 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.136 2006/04/25 22:46:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.137 2006/05/08 00:00:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -700,14 +700,18 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
 
 	/* if we're splitting this page, it won't be the root when we're done */
+	/* also, clear the SPLIT_END flag in both pages */
 	lopaque->btpo_flags = oopaque->btpo_flags;
-	lopaque->btpo_flags &= ~BTP_ROOT;
+	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END);
 	ropaque->btpo_flags = lopaque->btpo_flags;
 	lopaque->btpo_prev = oopaque->btpo_prev;
 	lopaque->btpo_next = BufferGetBlockNumber(rbuf);
 	ropaque->btpo_prev = BufferGetBlockNumber(buf);
 	ropaque->btpo_next = oopaque->btpo_next;
 	lopaque->btpo.level = ropaque->btpo.level = oopaque->btpo.level;
+	/* Since we already have write-lock on both pages, ok to read cycleid */
+	lopaque->btpo_cycleid = _bt_vacuum_cycleid(rel);
+	ropaque->btpo_cycleid = lopaque->btpo_cycleid;
 
 	/*
 	 * If the page we're splitting is not the rightmost page at its level in
@@ -836,6 +840,21 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		sopaque = (BTPageOpaque) PageGetSpecialPointer(spage);
 		if (sopaque->btpo_prev != ropaque->btpo_prev)
 			elog(PANIC, "right sibling's left-link doesn't match");
+		/*
+		 * Check to see if we can set the SPLIT_END flag in the right-hand
+		 * split page; this can save some I/O for vacuum since it need not
+		 * proceed to the right sibling.  We can set the flag if the right
+		 * sibling has a different cycleid: that means it could not be part
+		 * of a group of pages that were all split off from the same ancestor
+		 * page.  If you're confused, imagine that page A splits to A B and
+		 * then again, yielding A C B, while vacuum is in progress.  Tuples
+		 * originally in A could now be in either B or C, hence vacuum must
+		 * examine both pages.  But if D, our right sibling, has a different
+		 * cycleid then it could not contain any tuples that were in A when
+		 * the vacuum started.
+		 */
+		if (sopaque->btpo_cycleid != ropaque->btpo_cycleid)
+			ropaque->btpo_flags |= BTP_SPLIT_END;
 	}
 
 	/*
@@ -1445,6 +1464,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	rootopaque->btpo_flags = BTP_ROOT;
 	rootopaque->btpo.level =
 		((BTPageOpaque) PageGetSpecialPointer(lpage))->btpo.level + 1;
+	rootopaque->btpo_cycleid = 0;
 
 	/* update metapage data */
 	metad->btm_root = rootblknum;

src/backend/access/nbtree/nbtpage.c

Lines changed: 12 additions & 3 deletions

@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.96 2006/04/25 22:46:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.97 2006/05/08 00:00:10 tgl Exp $
  *
  * NOTES
  *	  Postgres btree pages look like ordinary relation pages.  The opaque
@@ -206,6 +206,7 @@ _bt_getroot(Relation rel, int access)
 		rootopaque->btpo_prev = rootopaque->btpo_next = P_NONE;
 		rootopaque->btpo_flags = (BTP_LEAF | BTP_ROOT);
 		rootopaque->btpo.level = 0;
+		rootopaque->btpo_cycleid = 0;
 
 		/* NO ELOG(ERROR) till meta is updated */
 		START_CRIT_SECTION();
@@ -544,7 +545,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 			 * Release the file-extension lock; it's now OK for someone else to
 			 * extend the relation some more.  Note that we cannot release this
 			 * lock before we have buffer lock on the new page, or we risk a race
-			 * condition against btvacuumcleanup --- see comments therein.
+			 * condition against btvacuumscan --- see comments therein.
 			 */
 			if (needLock)
 				UnlockRelationForExtension(rel, ExclusiveLock);
@@ -608,7 +609,7 @@ _bt_pageinit(Page page, Size size)
 /*
  *	_bt_page_recyclable() -- Is an existing page recyclable?
  *
- * This exists to make sure _bt_getbuf and btvacuumcleanup have the same
+ * This exists to make sure _bt_getbuf and btvacuumscan have the same
  * policy about whether a page is safe to re-use.
  */
 bool
@@ -651,13 +652,21 @@ _bt_delitems(Relation rel, Buffer buf,
 			 OffsetNumber *itemnos, int nitems)
 {
 	Page		page = BufferGetPage(buf);
+	BTPageOpaque opaque;
 
 	/* No ereport(ERROR) until changes are logged */
 	START_CRIT_SECTION();
 
 	/* Fix the page */
 	PageIndexMultiDelete(page, itemnos, nitems);
 
+	/*
+	 * We can clear the vacuum cycle ID since this page has certainly
+	 * been processed by the current vacuum scan.
+	 */
+	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+	opaque->btpo_cycleid = 0;
+
 	MarkBufferDirty(buf);
 
 	/* XLOG stuff */
