
Commit 39673ca

Rewrite hashbulkdelete() to make it amenable to new bucket locking scheme. A pleasant side effect is that it is *much* faster when deleting a large fraction of the indexed tuples, because of elimination of redundant hash_step activity induced by hash_adjscans. Various other continuing code cleanup.
1 parent 5f65345 commit 39673ca

File tree

6 files changed (+231, -76 lines)

src/backend/access/hash/hash.c

Lines changed: 158 additions & 20 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.65 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.66 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -449,40 +449,178 @@ hashbulkdelete(PG_FUNCTION_ARGS)
 	BlockNumber num_pages;
 	double		tuples_removed;
 	double		num_index_tuples;
-	IndexScanDesc iscan;
+	uint32		deleted_tuples;
+	uint32		tuples_remaining;
+	uint32		orig_ntuples;
+	Bucket		orig_maxbucket;
+	Bucket		cur_maxbucket;
+	Bucket		cur_bucket;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	HashMetaPageData local_metapage;
 
+	/*
+	 * keep track of counts in both float form (to return) and integer form
+	 * (to update hashm_ntuples).  It'd be better to make hashm_ntuples a
+	 * double, but that will have to wait for an initdb.
+	 */
 	tuples_removed = 0;
 	num_index_tuples = 0;
+	deleted_tuples = 0;
+	tuples_remaining = 0;
 
 	/*
-	 * XXX generic implementation --- should be improved!
+	 * Read the metapage to fetch original bucket and tuple counts.  Also,
+	 * we keep a copy of the last-seen metapage so that we can use its
+	 * hashm_spares[] values to compute bucket page addresses.  This is a
+	 * bit hokey but perfectly safe, since the interesting entries in the
+	 * spares array cannot change under us; and it beats rereading the
+	 * metapage for each bucket.
 	 */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+	orig_maxbucket = metap->hashm_maxbucket;
+	orig_ntuples = metap->hashm_ntuples;
+	memcpy(&local_metapage, metap, sizeof(local_metapage));
+	_hash_relbuf(rel, metabuf, HASH_READ);
+
+	/* Scan the buckets that we know exist */
+	cur_bucket = 0;
+	cur_maxbucket = orig_maxbucket;
+
+loop_top:
+	while (cur_bucket <= cur_maxbucket)
+	{
+		BlockNumber bucket_blkno;
+		BlockNumber blkno;
+		bool		bucket_dirty = false;
 
-	/* walk through the entire index */
-	iscan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
-	/* including killed tuples */
-	iscan->ignore_killed_tuples = false;
+		/* Get address of bucket's start page */
+		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
 
-	while (index_getnext_indexitem(iscan, ForwardScanDirection))
-	{
-		if (callback(&iscan->xs_ctup.t_self, callback_state))
+		/* XXX lock bucket here */
+
+		/* Scan each page in bucket */
+		blkno = bucket_blkno;
+		while (BlockNumberIsValid(blkno))
 		{
-			ItemPointerData indextup = iscan->currentItemData;
+			Buffer		buf;
+			Page		page;
+			HashPageOpaque opaque;
+			OffsetNumber offno;
+			OffsetNumber maxoffno;
+			bool		page_dirty = false;
+
+			buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+			page = BufferGetPage(buf);
+			_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+			Assert(opaque->hasho_bucket == cur_bucket);
+
+			/* Scan each tuple in page */
+			offno = FirstOffsetNumber;
+			maxoffno = PageGetMaxOffsetNumber(page);
+			while (offno <= maxoffno)
+			{
+				HashItem	hitem;
+				ItemPointer htup;
+
+				hitem = (HashItem) PageGetItem(page,
+											   PageGetItemId(page, offno));
+				htup = &(hitem->hash_itup.t_tid);
+				if (callback(htup, callback_state))
+				{
+					ItemPointerData indextup;
+
+					/* adjust any active scans that will be affected */
+					/* (this should be unnecessary) */
+					ItemPointerSet(&indextup, blkno, offno);
+					_hash_adjscans(rel, &indextup);
+
+					/* delete the item from the page */
+					PageIndexTupleDelete(page, offno);
+					bucket_dirty = page_dirty = true;
+
+					/* don't increment offno, instead decrement maxoffno */
+					maxoffno = OffsetNumberPrev(maxoffno);
+
+					tuples_removed += 1;
+					deleted_tuples += 1;
+				}
+				else
+				{
+					offno = OffsetNumberNext(offno);
+
+					num_index_tuples += 1;
+					tuples_remaining += 1;
+				}
+			}
 
-			/* adjust any active scans that will be affected by deletion */
-			/* (namely, my own scan) */
-			_hash_adjscans(rel, &indextup);
+			/*
+			 * Write or free page if needed, advance to next page.  We want
+			 * to preserve the invariant that overflow pages are nonempty.
+			 */
+			blkno = opaque->hasho_nextblkno;
+
+			if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
+				_hash_freeovflpage(rel, buf);
+			else if (page_dirty)
+				_hash_wrtbuf(rel, buf);
+			else
+				_hash_relbuf(rel, buf, HASH_WRITE);
+		}
 
-			/* delete the data from the page */
-			_hash_pagedel(rel, &indextup);
+		/* If we deleted anything, try to compact free space */
+		if (bucket_dirty)
+			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
 
-			tuples_removed += 1;
-		}
+		/* XXX unlock bucket here */
+
+		/* Advance to next bucket */
+		cur_bucket++;
+	}
+
+	/* Write-lock metapage and check for split since we started */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+
+	if (cur_maxbucket != metap->hashm_maxbucket)
+	{
+		/* There's been a split, so process the additional bucket(s) */
+		cur_maxbucket = metap->hashm_maxbucket;
+		memcpy(&local_metapage, metap, sizeof(local_metapage));
+		_hash_relbuf(rel, metabuf, HASH_WRITE);
+		goto loop_top;
+	}
+
+	/* Okay, we're really done.  Update tuple count in metapage. */
+
+	if (orig_maxbucket == metap->hashm_maxbucket &&
+		orig_ntuples == metap->hashm_ntuples)
+	{
+		/*
+		 * No one has split or inserted anything since start of scan,
+		 * so believe our count as gospel.
+		 */
+		metap->hashm_ntuples = tuples_remaining;
+	}
+	else
+	{
+		/*
+		 * Otherwise, our count is untrustworthy since we may have
+		 * double-scanned tuples in split buckets.  Proceed by
+		 * dead-reckoning.
+		 */
+		if (metap->hashm_ntuples > deleted_tuples)
+			metap->hashm_ntuples -= deleted_tuples;
 		else
-			num_index_tuples += 1;
+			metap->hashm_ntuples = 0;
+		num_index_tuples = metap->hashm_ntuples;
 	}
 
-	index_endscan(iscan);
+	_hash_wrtbuf(rel, metabuf);
 
 	/* return statistics */
 	num_pages = RelationGetNumberOfBlocks(rel);
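
A note on the deletion idiom above: PageIndexTupleDelete() compacts the page's line-pointer array, so after a deletion the next tuple slides into the current offset. That is why the loop holds offno still after a delete and shrinks maxoffno instead of advancing ("don't increment offno, instead decrement maxoffno"). A minimal standalone sketch of the same idiom over a plain C array — hypothetical helpers, not the PostgreSQL page API:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical predicate standing in for the bulk-delete callback */
static bool
should_delete(int item)
{
	return item % 3 == 0;
}

/* compacting delete: shift the tail left by one, as PageIndexTupleDelete does */
static void
delete_at(int *items, int *nitems, int i)
{
	for (int j = i; j < *nitems - 1; j++)
		items[j] = items[j + 1];
	(*nitems)--;
}

int
main(void)
{
	int		items[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
	int		nitems = 9;
	int		i = 0;

	while (i < nitems)
	{
		if (should_delete(items[i]))
			delete_at(items, &nitems, i);	/* stay put: next item slid into slot i */
		else
			i++;							/* advance only past survivors */
	}

	for (int j = 0; j < nitems; j++)
		printf("%d ", items[j]);			/* prints: 1 2 4 5 7 8 */
	printf("\n");
	return 0;
}

The loop_top retry at the bottom of the function serves a related purpose: a split that happens mid-scan moves tuples into buckets above cur_maxbucket, so the function re-reads the metapage under write lock and loops again until no new buckets have appeared.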

src/backend/access/hash/hashovfl.c

Lines changed: 6 additions & 9 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.39 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -444,11 +444,13 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
  *	first page in the bucket chain.  The read page works backward and
  *	the write page works forward; the procedure terminates when the
  *	read page and write page are the same page.
+ *
+ *	Caller must hold exclusive lock on the target bucket.
  */
 void
 _hash_squeezebucket(Relation rel,
-					HashMetaPage metap,
-					Bucket bucket)
+					Bucket bucket,
+					BlockNumber bucket_blkno)
 {
 	Buffer		wbuf;
 	Buffer		rbuf = 0;
@@ -466,7 +468,7 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * start squeezing into the base bucket page.
 	 */
-	wblkno = BUCKET_TO_BLKNO(bucket);
+	wblkno = bucket_blkno;
 	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
 	wpage = BufferGetPage(wbuf);
 	_hash_checkpage(wpage, LH_BUCKET_PAGE);
@@ -484,11 +486,6 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * find the last page in the bucket chain by starting at the base
 	 * bucket page and working forward.
-	 *
-	 * XXX if chains tend to be long, we should probably move forward using
-	 * HASH_READ and then _hash_chgbufaccess to HASH_WRITE when we reach
-	 * the end.  if they are short we probably don't care very much.  if
-	 * the hash function is working at all, they had better be short..
 	 */
 	ropaque = wopaque;
 	do
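
The comment block above _hash_squeezebucket() describes a two-cursor compaction: the write cursor starts at the bucket's primary page and moves forward, the read cursor starts at the end of the overflow chain and moves backward, and tuples migrate from back to front until the cursors meet. A shape-only sketch with a hypothetical fixed-capacity page type (not the PostgreSQL buffer API):

#define PAGE_CAPACITY 4

/* hypothetical in-memory stand-in for a page in a bucket chain */
typedef struct SketchPage
{
	int		items[PAGE_CAPACITY];
	int		nitems;
} SketchPage;

/*
 * Squeeze chain pages[0..npages-1]; returns the new chain length.
 * Pages beyond the returned length end up empty, mirroring how the
 * real code hands emptied overflow pages to _hash_freeovflpage().
 */
static int
squeeze_chain(SketchPage *pages, int npages)
{
	int		w = 0;				/* write cursor: walks forward */
	int		r = npages - 1;		/* read cursor: walks backward */

	while (w < r)
	{
		if (pages[w].nitems == PAGE_CAPACITY)
		{
			w++;				/* write page full, advance */
			continue;
		}
		if (pages[r].nitems == 0)
		{
			r--;				/* read page drained, step back */
			continue;
		}
		/* move one tuple from the back of the chain to free space at the front */
		pages[w].items[pages[w].nitems++] = pages[r].items[--pages[r].nitems];
	}
	return r + 1;
}

Filling from the front with tuples taken from the back both shortens the chain and preserves the invariant, noted in hash.c above, that overflow pages stay nonempty.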

src/backend/access/hash/hashpage.c

Lines changed: 10 additions & 6 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -143,7 +143,7 @@ _hash_metapinit(Relation rel)
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -456,6 +456,8 @@ _hash_splitbucket(Relation rel,
 	Buffer		ovflbuf;
 	BlockNumber oblkno;
 	BlockNumber nblkno;
+	BlockNumber start_oblkno;
+	BlockNumber start_nblkno;
 	bool		null;
 	Datum		datum;
 	HashItem	hitem;
@@ -475,8 +477,10 @@ _hash_splitbucket(Relation rel,
 	_hash_checkpage((Page) metap, LH_META_PAGE);
 
 	/* get the buffers & pages */
-	oblkno = BUCKET_TO_BLKNO(obucket);
-	nblkno = BUCKET_TO_BLKNO(nbucket);
+	start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
+	start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
+	oblkno = start_oblkno;
+	nblkno = start_nblkno;
 	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
 	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
 	opage = BufferGetPage(obuf);
@@ -571,7 +575,7 @@ _hash_splitbucket(Relation rel,
 			 */
 			_hash_wrtbuf(rel, obuf);
 			_hash_wrtbuf(rel, nbuf);
-			_hash_squeezebucket(rel, metap, obucket);
+			_hash_squeezebucket(rel, obucket, start_oblkno);
 			return;
 		}
@@ -639,7 +643,7 @@ _hash_splitbucket(Relation rel,
 		if (!BlockNumberIsValid(oblkno))
 		{
 			_hash_wrtbuf(rel, nbuf);
-			_hash_squeezebucket(rel, metap, obucket);
+			_hash_squeezebucket(rel, obucket, start_oblkno);
 			return;
 		}
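
The mechanical change running through this file is that BUCKET_TO_BLKNO now takes the metapage. Under the overflow-page layout introduced in the preceding commit, a bucket's primary page address depends on how many overflow pages were allocated ahead of that bucket's splitpoint group, which is what hashm_spares[] records. A sketch of that computation — illustrative only; the real macro lives in hash.h, and the exact formula here is an assumption:

#include <stdint.h>

typedef uint32_t BlockNumber;
typedef uint32_t Bucket;

/* smallest i such that 2^i >= num, in the spirit of _hash_log2() */
static uint32_t
sketch_log2(uint32_t num)
{
	uint32_t	i = 0;
	uint32_t	limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

/*
 * Bucket B's primary page sits after the metapage (block 0), the B
 * lower-numbered bucket pages, and the overflow pages allocated before
 * B's splitpoint group — the last being the cumulative count kept in
 * spares[].
 */
static BlockNumber
sketch_bucket_to_blkno(const uint32_t *spares, Bucket b)
{
	uint32_t	splitpoint = sketch_log2(b + 1);

	return b + (b ? spares[splitpoint - 1] : 0) + 1;
}

This is also why hashbulkdelete() can cache local_metapage and keep computing valid bucket addresses: the hashm_spares[] entries relevant to already-created buckets never change after the fact.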

src/backend/access/hash/hashsearch.c

Lines changed: 14 additions & 11 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.31 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.32 2003/09/02 02:18:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,8 +19,10 @@
 
 
 /*
- *	_hash_search() -- Finds the page/bucket that the contains the
- *	scankey and loads it into *bufP.  the buffer has a read lock.
+ *	_hash_search() -- Find the bucket that contains the scankey
+ *	and fetch its primary bucket page into *bufP.
+ *
+ *	the buffer has a read lock.
  */
 void
 _hash_search(Relation rel,
@@ -30,22 +32,23 @@ _hash_search(Relation rel,
 			 HashMetaPage metap)
 {
 	BlockNumber blkno;
-	Datum		keyDatum;
 	Bucket		bucket;
 
-	if (scankey == (ScanKey) NULL ||
-		(keyDatum = scankey[0].sk_argument) == (Datum) NULL)
+	if (scankey == NULL)
 	{
 		/*
-		 * If the scankey argument is NULL, all tuples will satisfy the
+		 * If the scankey is empty, all tuples will satisfy the
 		 * scan so we start the scan at the first bucket (bucket 0).
 		 */
 		bucket = 0;
 	}
 	else
-		bucket = _hash_call(rel, metap, keyDatum);
+	{
+		Assert(!(scankey[0].sk_flags & SK_ISNULL));
+		bucket = _hash_call(rel, metap, scankey[0].sk_argument);
+	}
 
-	blkno = BUCKET_TO_BLKNO(bucket);
+	blkno = BUCKET_TO_BLKNO(metap, bucket);
 
 	*bufP = _hash_getbuf(rel, blkno, HASH_READ);
 }
@@ -330,7 +333,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 	if (allbuckets && bucket < metap->hashm_maxbucket)
 	{
 		++bucket;
-		blkno = BUCKET_TO_BLKNO(bucket);
+		blkno = BUCKET_TO_BLKNO(metap, bucket);
 		buf = _hash_getbuf(rel, blkno, HASH_READ);
 		page = BufferGetPage(buf);
 		_hash_checkpage(page, LH_BUCKET_PAGE);
@@ -380,7 +383,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 	if (allbuckets && bucket > 0)
 	{
 		--bucket;
-		blkno = BUCKET_TO_BLKNO(bucket);
+		blkno = BUCKET_TO_BLKNO(metap, bucket);
 		buf = _hash_getbuf(rel, blkno, HASH_READ);
 		page = BufferGetPage(buf);
 		_hash_checkpage(page, LH_BUCKET_PAGE);
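
_hash_call(), used by _hash_search() above, reduces a key's hash value to a bucket number. The standard linear-hashing computation — exposed in later PostgreSQL releases as _hash_hashkey2bucket(); its use at this point in history is an assumption — masks with the table size rounded up to a power of two, and falls back to the smaller mask when the resulting bucket has not been created by a split yet:

#include <stdint.h>

typedef uint32_t Bucket;

/*
 * Linear-hashing bucket choice: highmask covers the next power of two
 * above the current bucket count; a result past maxbucket names a
 * bucket that does not exist yet, so wrap back with the previous mask.
 */
static Bucket
sketch_hashkey_to_bucket(uint32_t hashkey, Bucket maxbucket,
						 uint32_t highmask, uint32_t lowmask)
{
	Bucket		bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket = bucket & lowmask;
	return bucket;
}

For example, with six buckets (maxbucket = 5, highmask = 7, lowmask = 3), a key hashing to 14 gives 14 & 7 = 6, which exceeds maxbucket, so it wraps to 14 & 3 = 2 — exactly the bucket that will later split to create bucket 6.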
