postgrespro
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
Lines changed: 1 addition & 6 deletions
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
Lines changed: 4 additions & 85 deletions
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
Lines changed: 45 additions & 17 deletions
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.54 2006/03/05 15:58:21 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/index/genam.c,v 1.55 2006/05/07 01:21:30 tgl Exp $
  *
  * NOTES
  *	  many of the old access method routines have been turned into
@@ -90,8 +90,6 @@ RelationGetIndexScan(Relation indexRelation,
 	scan->have_lock = false;			/* ditto */
 	scan->kill_prior_tuple = false;
 	scan->ignore_killed_tuples = true;	/* default setting */
-	scan->keys_are_unique = false;		/* may be set by index AM */
-	scan->got_tuple = false;
 
 	scan->opaque = NULL;
 
@@ -102,9 +100,6 @@ RelationGetIndexScan(Relation indexRelation,
 	scan->xs_ctup.t_data = NULL;
 	scan->xs_cbuf = InvalidBuffer;
 
-	scan->unique_tuple_pos = 0;
-	scan->unique_tuple_mark = 0;
-
 	pgstat_initstats(&scan->xs_pgstat_info, indexRelation);
 
 	/*
 
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.92 2006/05/02 22:25:10 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/index/indexam.c,v 1.93 2006/05/07 01:21:30 tgl Exp $
  *
  * INTERFACE ROUTINES
  *		index_open		- open an index relation by relation OID
@@ -362,10 +362,6 @@ index_rescan(IndexScanDesc scan, ScanKey key)
 	}
 
 	scan->kill_prior_tuple = false;		/* for safety */
-	scan->keys_are_unique = false;		/* may be set by index AM */
-	scan->got_tuple = false;
-	scan->unique_tuple_pos = 0;
-	scan->unique_tuple_mark = 0;
 
 	FunctionCall2(procedure,
 				  PointerGetDatum(scan),
@@ -417,8 +413,6 @@ index_markpos(IndexScanDesc scan)
 	SCAN_CHECKS;
 	GET_SCAN_PROCEDURE(ammarkpos);
 
-	scan->unique_tuple_mark = scan->unique_tuple_pos;
-
 	FunctionCall1(procedure, PointerGetDatum(scan));
 }
 
@@ -440,13 +434,6 @@ index_restrpos(IndexScanDesc scan)
 
 	scan->kill_prior_tuple = false;		/* for safety */
 
-	/*
-	 * We do not reset got_tuple; so if the scan is actually being
-	 * short-circuited by index_getnext, the effective position restoration is
-	 * done by restoring unique_tuple_pos.
-	 */
-	scan->unique_tuple_pos = scan->unique_tuple_mark;
-
 	FunctionCall1(procedure, PointerGetDatum(scan));
 }
 
@@ -456,8 +443,7 @@ index_restrpos(IndexScanDesc scan)
  * The result is the next heap tuple satisfying the scan keys and the
  * snapshot, or NULL if no more matching tuples exist.	On success,
  * the buffer containing the heap tuple is pinned (the pin will be dropped
- * at the next index_getnext or index_endscan).  The index TID corresponding
- * to the heap tuple can be obtained if needed from scan->currentItemData.
+ * at the next index_getnext or index_endscan).
  * ----------------
  */
 HeapTuple
@@ -469,65 +455,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
 	SCAN_CHECKS;
 	GET_SCAN_PROCEDURE(amgettuple);
 
-	/*
-	 * If we already got a tuple and it must be unique, there's no need to
-	 * make the index AM look through any additional tuples.  (This can save a
-	 * useful amount of work in scenarios where there are many dead tuples due
-	 * to heavy update activity.)
-	 *
-	 * To do this we must keep track of the logical scan position
-	 * (before/on/after tuple).  Also, we have to be sure to release scan
-	 * resources before returning NULL; if we fail to do so then a multi-index
-	 * scan can easily run the system out of free buffers.	We can release
-	 * index-level resources fairly cheaply by calling index_rescan.  This
-	 * means there are two persistent states as far as the index AM is
-	 * concerned: on-tuple and rescanned.  If we are actually asked to
-	 * re-fetch the single tuple, we have to go through a fresh indexscan
-	 * startup, which penalizes that (infrequent) case.
-	 */
-	if (scan->keys_are_unique && scan->got_tuple)
-	{
-		int			new_tuple_pos = scan->unique_tuple_pos;
-
-		if (ScanDirectionIsForward(direction))
-		{
-			if (new_tuple_pos <= 0)
-				new_tuple_pos++;
-		}
-		else
-		{
-			if (new_tuple_pos >= 0)
-				new_tuple_pos--;
-		}
-		if (new_tuple_pos == 0)
-		{
-			/*
-			 * We are moving onto the unique tuple from having been off it. We
-			 * just fall through and let the index AM do the work. Note we
-			 * should get the right answer regardless of scan direction.
-			 */
-			scan->unique_tuple_pos = 0; /* need to update position */
-		}
-		else
-		{
-			/*
-			 * Moving off the tuple; must do amrescan to release index-level
-			 * pins before we return NULL.	Since index_rescan will reset my
-			 * state, must save and restore...
-			 */
-			int			unique_tuple_mark = scan->unique_tuple_mark;
-
-			index_rescan(scan, NULL /* no change to key */ );
-
-			scan->keys_are_unique = true;
-			scan->got_tuple = true;
-			scan->unique_tuple_pos = new_tuple_pos;
-			scan->unique_tuple_mark = unique_tuple_mark;
-
-			return NULL;
-		}
-	}
-
 	/* just make sure this is false... */
 	scan->kill_prior_tuple = false;
 
@@ -588,14 +515,6 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
 	}
 
 	/* Success exit */
-	scan->got_tuple = true;
-
-	/*
-	 * If we just fetched a known-unique tuple, then subsequent calls will go
-	 * through the short-circuit code above.  unique_tuple_pos has been
-	 * initialized to 0, which is the correct state ("on row").
-	 */
-
 	return heapTuple;
 }
 
@@ -608,8 +527,8 @@ index_getnext(IndexScanDesc scan, ScanDirection direction)
  * (which most callers of this routine will probably want to suppress by
  * setting scan->ignore_killed_tuples = false).
  *
- * On success (TRUE return), the found index TID is in scan->currentItemData,
- * and its heap TID is in scan->xs_ctup.t_self.  scan->xs_cbuf is untouched.
+ * On success (TRUE return), the heap TID of the found index entry is in
+ * scan->xs_ctup.t_self.  scan->xs_cbuf is untouched.
  * ----------------
  */
 bool
 
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.10 2006/04/25 22:46:05 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/nbtree/README,v 1.11 2006/05/07 01:21:30 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -67,13 +67,22 @@ move right until we find a page whose right-link matches the page we
 came from.  (Actually, it's even harder than that; see deletion discussion
 below.)
 
-Read locks on a page are held for as long as a scan is examining a page.
-But nbtree.c arranges to drop the read lock, but not the buffer pin,
-on the current page of a scan before control leaves nbtree.  When we
-come back to resume the scan, we have to re-grab the read lock and
-then move right if the current item moved (see _bt_restscan()).  Keeping
-the pin ensures that the current item cannot move left or be deleted
-(see btbulkdelete).
+Page read locks are held only for as long as a scan is examining a page.
+To minimize lock/unlock traffic, an index scan always searches a leaf page
+to identify all the matching items at once, copying their heap tuple IDs
+into backend-local storage.  The heap tuple IDs are then processed while
+not holding any page lock within the index.  We do continue to hold a pin
+on the leaf page, to protect against concurrent deletions (see below).
+In this state the scan is effectively stopped "between" pages, either
+before or after the page it has pinned.  This is safe in the presence of
+concurrent insertions and even page splits, because items are never moved
+across pre-existing page boundaries --- so the scan cannot miss any items
+it should have seen, nor accidentally return the same item twice.  The scan
+must remember the page's right-link at the time it was scanned, since that
+is the page to move right to; if we instead followed the current right-link,
+we would re-scan any items moved by a page split.  We don't similarly
+remember the left-link, since it's best to use the most up-to-date
+left-link when trying to move left (see detailed move-left algorithm below).
 
 In most cases we release our lock and pin on a page before attempting
 to acquire pin and lock on the page we are moving to.  In a few places
@@ -119,14 +128,33 @@ item doesn't fit on the split page where it needs to go!
 The deletion algorithm
 ----------------------
 
-Deletions of leaf items are handled by getting a super-exclusive lock on
-the target page, so that no other backend has a pin on the page when the
-deletion starts.  This means no scan is pointing at the page, so no other
-backend can lose its place due to the item deletion.
-
-The above does not work for deletion of items in internal pages, since
-other backends keep no lock nor pin on a page they have descended past.
-Instead, when a backend is ascending the tree using its stack, it must
+Before deleting a leaf item, we get a super-exclusive lock on the target
+page, so that no other backend has a pin on the page when the deletion
+starts.  This is not necessary for correctness in terms of the btree index
+operations themselves; as explained above, index scans logically stop
+"between" pages and so can't lose their place.  The reason we do it is to
+provide an interlock between non-full VACUUM and indexscans.  Since VACUUM
+deletes index entries before deleting tuples, the super-exclusive lock
+guarantees that VACUUM can't delete any heap tuple that an indexscanning
+process might be about to visit.  (This guarantee works only for simple
+indexscans that visit the heap in sync with the index scan, not for bitmap
+scans.  We only need the guarantee when using non-MVCC snapshot rules such
+as SnapshotNow, so in practice this is only important for system catalog
+accesses.)
+
+Because a page can be split even while someone holds a pin on it, it is
+possible that an indexscan will return items that are no longer stored on
+the page it has a pin on, but rather somewhere to the right of that page.
+To ensure that VACUUM can't prematurely remove such heap tuples, we require
+btbulkdelete to obtain super-exclusive lock on every leaf page in the index
+(even pages that don't contain any deletable tuples).  This guarantees that
+the btbulkdelete call cannot return while any indexscan is still holding
+a copy of a deleted index tuple.  Note that this requirement does not say
+that btbulkdelete must visit the pages in any particular order.
+
+There is no such interlocking for deletion of items in internal pages,
+since backends keep neither lock nor pin on a page they have descended past.
+Hence, when a backend is ascending the tree using its stack, it must
 be prepared for the possibility that the item it wants is to the left of
 the recorded position (but it can't have moved left out of the recorded
 page).  Since we hold a lock on the lower page (per L&Y) until we have
@@ -201,7 +229,7 @@ accordingly.  Searches and forward scans simply follow the right-link
 until they find a non-dead page --- this will be where the deleted page's
 key-space moved to.
 
-Stepping left in a backward scan is complicated because we must consider
+Moving left in a backward scan is complicated because we must consider
 the possibility that the left sibling was just split (meaning we must find
 the rightmost page derived from the left sibling), plus the possibility
 that the page we were just on has now been deleted and hence isn't in the