
Commit 39673ca

Rewrite hashbulkdelete() to make it amenable to new bucket locking scheme. A pleasant side effect is that it is *much* faster when deleting a large fraction of the indexed tuples, because of elimination of redundant hash_step activity induced by hash_adjscans. Various other continuing code cleanup.
1 parent 5f65345 commit 39673ca

File tree

6 files changed (+231, -76 lines)

src/backend/access/hash/hash.c

Lines changed: 158 additions & 20 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.65 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.66 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -449,40 +449,178 @@ hashbulkdelete(PG_FUNCTION_ARGS)
 	BlockNumber num_pages;
 	double		tuples_removed;
 	double		num_index_tuples;
-	IndexScanDesc iscan;
+	uint32		deleted_tuples;
+	uint32		tuples_remaining;
+	uint32		orig_ntuples;
+	Bucket		orig_maxbucket;
+	Bucket		cur_maxbucket;
+	Bucket		cur_bucket;
+	Buffer		metabuf;
+	HashMetaPage metap;
+	HashMetaPageData local_metapage;
 
+	/*
+	 * keep track of counts in both float form (to return) and integer form
+	 * (to update hashm_ntuples).  It'd be better to make hashm_ntuples a
+	 * double, but that will have to wait for an initdb.
+	 */
 	tuples_removed = 0;
 	num_index_tuples = 0;
+	deleted_tuples = 0;
+	tuples_remaining = 0;
 
 	/*
-	 * XXX generic implementation --- should be improved!
+	 * Read the metapage to fetch original bucket and tuple counts.  Also,
+	 * we keep a copy of the last-seen metapage so that we can use its
+	 * hashm_spares[] values to compute bucket page addresses.  This is a
+	 * bit hokey but perfectly safe, since the interesting entries in the
+	 * spares array cannot change under us; and it beats rereading the
+	 * metapage for each bucket.
 	 */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+	orig_maxbucket = metap->hashm_maxbucket;
+	orig_ntuples = metap->hashm_ntuples;
+	memcpy(&local_metapage, metap, sizeof(local_metapage));
+	_hash_relbuf(rel, metabuf, HASH_READ);
+
+	/* Scan the buckets that we know exist */
+	cur_bucket = 0;
+	cur_maxbucket = orig_maxbucket;
+
+loop_top:
+	while (cur_bucket <= cur_maxbucket)
+	{
+		BlockNumber bucket_blkno;
+		BlockNumber blkno;
+		bool		bucket_dirty = false;
 
-	/* walk through the entire index */
-	iscan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
-	/* including killed tuples */
-	iscan->ignore_killed_tuples = false;
+		/* Get address of bucket's start page */
+		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
 
-	while (index_getnext_indexitem(iscan, ForwardScanDirection))
-	{
-		if (callback(&iscan->xs_ctup.t_self, callback_state))
+		/* XXX lock bucket here */
+
+		/* Scan each page in bucket */
+		blkno = bucket_blkno;
+		while (BlockNumberIsValid(blkno))
 		{
-			ItemPointerData indextup = iscan->currentItemData;
+			Buffer		buf;
+			Page		page;
+			HashPageOpaque opaque;
+			OffsetNumber offno;
+			OffsetNumber maxoffno;
+			bool		page_dirty = false;
+
+			buf = _hash_getbuf(rel, blkno, HASH_WRITE);
+			page = BufferGetPage(buf);
+			_hash_checkpage(page, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE);
+			opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+			Assert(opaque->hasho_bucket == cur_bucket);
+
+			/* Scan each tuple in page */
+			offno = FirstOffsetNumber;
+			maxoffno = PageGetMaxOffsetNumber(page);
+			while (offno <= maxoffno)
+			{
+				HashItem	hitem;
+				ItemPointer htup;
+
+				hitem = (HashItem) PageGetItem(page,
+											   PageGetItemId(page, offno));
+				htup = &(hitem->hash_itup.t_tid);
+				if (callback(htup, callback_state))
+				{
+					ItemPointerData indextup;
+
+					/* adjust any active scans that will be affected */
+					/* (this should be unnecessary) */
+					ItemPointerSet(&indextup, blkno, offno);
+					_hash_adjscans(rel, &indextup);
+
+					/* delete the item from the page */
+					PageIndexTupleDelete(page, offno);
+					bucket_dirty = page_dirty = true;
+
+					/* don't increment offno, instead decrement maxoffno */
+					maxoffno = OffsetNumberPrev(maxoffno);
+
+					tuples_removed += 1;
+					deleted_tuples += 1;
+				}
+				else
+				{
+					offno = OffsetNumberNext(offno);
+
+					num_index_tuples += 1;
+					tuples_remaining += 1;
+				}
+			}
 
-			/* adjust any active scans that will be affected by deletion */
-			/* (namely, my own scan) */
-			_hash_adjscans(rel, &indextup);
+			/*
+			 * Write or free page if needed, advance to next page.  We want
+			 * to preserve the invariant that overflow pages are nonempty.
+			 */
+			blkno = opaque->hasho_nextblkno;
+
+			if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
+				_hash_freeovflpage(rel, buf);
+			else if (page_dirty)
+				_hash_wrtbuf(rel, buf);
+			else
+				_hash_relbuf(rel, buf, HASH_WRITE);
+		}
 
-			/* delete the data from the page */
-			_hash_pagedel(rel, &indextup);
+		/* If we deleted anything, try to compact free space */
+		if (bucket_dirty)
+			_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
 
-			tuples_removed += 1;
-		}
+		/* XXX unlock bucket here */
+
+		/* Advance to next bucket */
+		cur_bucket++;
+	}
+
+	/* Write-lock metapage and check for split since we started */
+	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_WRITE);
+	metap = (HashMetaPage) BufferGetPage(metabuf);
+	_hash_checkpage((Page) metap, LH_META_PAGE);
+
+	if (cur_maxbucket != metap->hashm_maxbucket)
+	{
+		/* There's been a split, so process the additional bucket(s) */
+		cur_maxbucket = metap->hashm_maxbucket;
+		memcpy(&local_metapage, metap, sizeof(local_metapage));
+		_hash_relbuf(rel, metabuf, HASH_WRITE);
+		goto loop_top;
+	}
+
+	/* Okay, we're really done.  Update tuple count in metapage. */
+
+	if (orig_maxbucket == metap->hashm_maxbucket &&
+		orig_ntuples == metap->hashm_ntuples)
+	{
+		/*
+		 * No one has split or inserted anything since start of scan,
+		 * so believe our count as gospel.
+		 */
+		metap->hashm_ntuples = tuples_remaining;
+	}
+	else
+	{
+		/*
+		 * Otherwise, our count is untrustworthy since we may have
+		 * double-scanned tuples in split buckets.  Proceed by
+		 * dead-reckoning.
+		 */
+		if (metap->hashm_ntuples > deleted_tuples)
+			metap->hashm_ntuples -= deleted_tuples;
 		else
-			num_index_tuples += 1;
+			metap->hashm_ntuples = 0;
+		num_index_tuples = metap->hashm_ntuples;
 	}
 
-	index_endscan(iscan);
+	_hash_wrtbuf(rel, metabuf);
 
 	/* return statistics */
 	num_pages = RelationGetNumberOfBlocks(rel);
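
A note on the deletion idiom above: PageIndexTupleDelete() compacts the page's line-pointer array, so after a deletion the next tuple slides into the current offset. That is why the loop holds offno still after a delete and shrinks maxoffno instead of advancing ("don't increment offno, instead decrement maxoffno"). A minimal standalone sketch of the same idiom over a plain C array — hypothetical helpers, not the PostgreSQL page API:

#include <stdbool.h>
#include <stdio.h>

/* hypothetical predicate standing in for the bulk-delete callback */
static bool
should_delete(int item)
{
	return item % 3 == 0;
}

/* compacting delete: shift the tail left by one, as PageIndexTupleDelete does */
static void
delete_at(int *items, int *nitems, int i)
{
	for (int j = i; j < *nitems - 1; j++)
		items[j] = items[j + 1];
	(*nitems)--;
}

int
main(void)
{
	int		items[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
	int		nitems = 9;
	int		i = 0;

	while (i < nitems)
	{
		if (should_delete(items[i]))
			delete_at(items, &nitems, i);	/* stay put: next item slid into slot i */
		else
			i++;							/* advance only past survivors */
	}

	for (int j = 0; j < nitems; j++)
		printf("%d ", items[j]);			/* prints: 1 2 4 5 7 8 */
	printf("\n");
	return 0;
}

The loop_top retry at the bottom of the function serves a related purpose: a split that happens mid-scan moves tuples into buckets above cur_maxbucket, so the function re-reads the metapage under write lock and loops again until no new buckets have appeared.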

src/backend/access/hash/hashovfl.c

Lines changed: 6 additions & 9 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.38 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashovfl.c,v 1.39 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  Overflow pages look like ordinary relation pages.
@@ -444,11 +444,13 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno)
  *	first page in the bucket chain.  The read page works backward and
  *	the write page works forward; the procedure terminates when the
  *	read page and write page are the same page.
+ *
+ *	Caller must hold exclusive lock on the target bucket.
  */
 void
 _hash_squeezebucket(Relation rel,
-					HashMetaPage metap,
-					Bucket bucket)
+					Bucket bucket,
+					BlockNumber bucket_blkno)
 {
 	Buffer		wbuf;
 	Buffer		rbuf = 0;
@@ -466,7 +468,7 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * start squeezing into the base bucket page.
 	 */
-	wblkno = BUCKET_TO_BLKNO(bucket);
+	wblkno = bucket_blkno;
 	wbuf = _hash_getbuf(rel, wblkno, HASH_WRITE);
 	wpage = BufferGetPage(wbuf);
 	_hash_checkpage(wpage, LH_BUCKET_PAGE);
@@ -484,11 +486,6 @@ _hash_squeezebucket(Relation rel,
 	/*
 	 * find the last page in the bucket chain by starting at the base
 	 * bucket page and working forward.
-	 *
-	 * XXX if chains tend to be long, we should probably move forward using
-	 * HASH_READ and then _hash_chgbufaccess to HASH_WRITE when we reach
-	 * the end.  if they are short we probably don't care very much.  if
-	 * the hash function is working at all, they had better be short..
 	 */
 	ropaque = wopaque;
 	do
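
The comment block above _hash_squeezebucket() describes a two-cursor compaction: the write cursor starts at the bucket's primary page and moves forward, the read cursor starts at the end of the overflow chain and moves backward, and tuples migrate from back to front until the cursors meet. A shape-only sketch with a hypothetical fixed-capacity page type (not the PostgreSQL buffer API):

#define PAGE_CAPACITY 4

/* hypothetical in-memory stand-in for a page in a bucket chain */
typedef struct SketchPage
{
	int		items[PAGE_CAPACITY];
	int		nitems;
} SketchPage;

/*
 * Squeeze chain pages[0..npages-1]; returns the new chain length.
 * Pages beyond the returned length end up empty, mirroring how the
 * real code hands emptied overflow pages to _hash_freeovflpage().
 */
static int
squeeze_chain(SketchPage *pages, int npages)
{
	int		w = 0;				/* write cursor: walks forward */
	int		r = npages - 1;		/* read cursor: walks backward */

	while (w < r)
	{
		if (pages[w].nitems == PAGE_CAPACITY)
		{
			w++;				/* write page full, advance */
			continue;
		}
		if (pages[r].nitems == 0)
		{
			r--;				/* read page drained, step back */
			continue;
		}
		/* move one tuple from the back of the chain to free space at the front */
		pages[w].items[pages[w].nitems++] = pages[r].items[--pages[r].nitems];
	}
	return r + 1;
}

Filling from the front with tuples taken from the back both shortens the chain and preserves the invariant, noted in hash.c above, that overflow pages stay nonempty.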

src/backend/access/hash/hashpage.c

Lines changed: 10 additions & 6 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.39 2003/09/01 20:26:34 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashpage.c,v 1.40 2003/09/02 02:18:38 tgl Exp $
  *
  * NOTES
  *	  Postgres hash pages look like ordinary relation pages.  The opaque
@@ -143,7 +143,7 @@ _hash_metapinit(Relation rel)
 	 */
 	for (i = 0; i <= 1; i++)
 	{
-		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(i), HASH_WRITE);
+		buf = _hash_getbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
 		pg = BufferGetPage(buf);
 		_hash_pageinit(pg, BufferGetPageSize(buf));
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -456,6 +456,8 @@ _hash_splitbucket(Relation rel,
 	Buffer		ovflbuf;
 	BlockNumber oblkno;
 	BlockNumber nblkno;
+	BlockNumber start_oblkno;
+	BlockNumber start_nblkno;
 	bool		null;
 	Datum		datum;
 	HashItem	hitem;
@@ -475,8 +477,10 @@ _hash_splitbucket(Relation rel,
 	_hash_checkpage((Page) metap, LH_META_PAGE);
 
 	/* get the buffers & pages */
-	oblkno = BUCKET_TO_BLKNO(obucket);
-	nblkno = BUCKET_TO_BLKNO(nbucket);
+	start_oblkno = BUCKET_TO_BLKNO(metap, obucket);
+	start_nblkno = BUCKET_TO_BLKNO(metap, nbucket);
+	oblkno = start_oblkno;
+	nblkno = start_nblkno;
 	obuf = _hash_getbuf(rel, oblkno, HASH_WRITE);
 	nbuf = _hash_getbuf(rel, nblkno, HASH_WRITE);
 	opage = BufferGetPage(obuf);
@@ -571,7 +575,7 @@ _hash_splitbucket(Relation rel,
 			 */
 			_hash_wrtbuf(rel, obuf);
 			_hash_wrtbuf(rel, nbuf);
-			_hash_squeezebucket(rel, metap, obucket);
+			_hash_squeezebucket(rel, obucket, start_oblkno);
 			return;
 		}
@@ -639,7 +643,7 @@ _hash_splitbucket(Relation rel,
 		if (!BlockNumberIsValid(oblkno))
 		{
 			_hash_wrtbuf(rel, nbuf);
-			_hash_squeezebucket(rel, metap, obucket);
+			_hash_squeezebucket(rel, obucket, start_oblkno);
 			return;
 		}
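
The mechanical change running through this file is that BUCKET_TO_BLKNO now takes the metapage. Under the overflow-page layout introduced in the preceding commit, a bucket's primary page address depends on how many overflow pages were allocated ahead of that bucket's splitpoint group, which is what hashm_spares[] records. A sketch of that computation — illustrative only; the real macro lives in hash.h, and the exact formula here is an assumption:

#include <stdint.h>

typedef uint32_t BlockNumber;
typedef uint32_t Bucket;

/* smallest i such that 2^i >= num, in the spirit of _hash_log2() */
static uint32_t
sketch_log2(uint32_t num)
{
	uint32_t	i = 0;
	uint32_t	limit = 1;

	while (limit < num)
	{
		limit <<= 1;
		i++;
	}
	return i;
}

/*
 * Bucket B's primary page sits after the metapage (block 0), the B
 * lower-numbered bucket pages, and the overflow pages allocated before
 * B's splitpoint group — the last being the cumulative count kept in
 * spares[].
 */
static BlockNumber
sketch_bucket_to_blkno(const uint32_t *spares, Bucket b)
{
	uint32_t	splitpoint = sketch_log2(b + 1);

	return b + (b ? spares[splitpoint - 1] : 0) + 1;
}

This is also why hashbulkdelete() can cache local_metapage and keep computing valid bucket addresses: the hashm_spares[] entries relevant to already-created buckets never change after the fact.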

src/backend/access/hash/hashsearch.c

Lines changed: 14 additions & 11 deletions

@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.31 2003/08/04 02:39:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hashsearch.c,v 1.32 2003/09/02 02:18:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,8 +19,10 @@
 
 
 /*
- *	_hash_search() -- Finds the page/bucket that the contains the
- *	scankey and loads it into *bufP.  the buffer has a read lock.
+ *	_hash_search() -- Find the bucket that contains the scankey
+ *	and fetch its primary bucket page into *bufP.
+ *
+ *	the buffer has a read lock.
  */
 void
 _hash_search(Relation rel,
@@ -30,22 +32,23 @@ _hash_search(Relation rel,
 			 HashMetaPage metap)
 {
 	BlockNumber blkno;
-	Datum		keyDatum;
 	Bucket		bucket;
 
-	if (scankey == (ScanKey) NULL ||
-		(keyDatum = scankey[0].sk_argument) == (Datum) NULL)
+	if (scankey == NULL)
 	{
 		/*
-		 * If the scankey argument is NULL, all tuples will satisfy the
+		 * If the scankey is empty, all tuples will satisfy the
 		 * scan so we start the scan at the first bucket (bucket 0).
 		 */
 		bucket = 0;
 	}
 	else
-		bucket = _hash_call(rel, metap, keyDatum);
+	{
+		Assert(!(scankey[0].sk_flags & SK_ISNULL));
+		bucket = _hash_call(rel, metap, scankey[0].sk_argument);
+	}
 
-	blkno = BUCKET_TO_BLKNO(bucket);
+	blkno = BUCKET_TO_BLKNO(metap, bucket);
 
 	*bufP = _hash_getbuf(rel, blkno, HASH_READ);
 }
@@ -330,7 +333,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 	if (allbuckets && bucket < metap->hashm_maxbucket)
 	{
 		++bucket;
-		blkno = BUCKET_TO_BLKNO(bucket);
+		blkno = BUCKET_TO_BLKNO(metap, bucket);
 		buf = _hash_getbuf(rel, blkno, HASH_READ);
 		page = BufferGetPage(buf);
 		_hash_checkpage(page, LH_BUCKET_PAGE);
@@ -380,7 +383,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir, Buffer metabuf)
 	if (allbuckets && bucket > 0)
 	{
 		--bucket;
-		blkno = BUCKET_TO_BLKNO(bucket);
+		blkno = BUCKET_TO_BLKNO(metap, bucket);
 		buf = _hash_getbuf(rel, blkno, HASH_READ);
 		page = BufferGetPage(buf);
 		_hash_checkpage(page, LH_BUCKET_PAGE);
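
_hash_call(), used by _hash_search() above, reduces a key's hash value to a bucket number. The standard linear-hashing computation — exposed in later PostgreSQL releases as _hash_hashkey2bucket(); its use at this point in history is an assumption — masks with the table size rounded up to a power of two, and falls back to the smaller mask when the resulting bucket has not been created by a split yet:

#include <stdint.h>

typedef uint32_t Bucket;

/*
 * Linear-hashing bucket choice: highmask covers the next power of two
 * above the current bucket count; a result past maxbucket names a
 * bucket that does not exist yet, so wrap back with the previous mask.
 */
static Bucket
sketch_hashkey_to_bucket(uint32_t hashkey, Bucket maxbucket,
						 uint32_t highmask, uint32_t lowmask)
{
	Bucket		bucket = hashkey & highmask;

	if (bucket > maxbucket)
		bucket = bucket & lowmask;
	return bucket;
}

For example, with six buckets (maxbucket = 5, highmask = 7, lowmask = 3), a key hashing to 14 gives 14 & 7 = 6, which exceeds maxbucket, so it wraps to 14 & 3 = 2 — exactly the bucket that will later split to create bucket 6.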
