Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7a36937

Browse files
committed
Reimplement hash index locking algorithms, per my recent proposal to
pghackers. This fixes the problem recently reported by Markus KrÌutner (hash bucket split corrupts the state of scans being done concurrently), and I believe it also fixes all the known problems with deadlocks in hash index operations. Hash indexes are still not really ready for prime time (since they aren't WAL-logged), but this is a step forward.
1 parent ca43f71 commit 7a36937

File tree

11 files changed

+1039
-1023
lines changed

11 files changed

+1039
-1023
lines changed

src/backend/access/hash/README

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
$Header: /cvsroot/pgsql/src/backend/access/hash/README,v 1.2 2003/09/02 03:29:01 tgl Exp $
1+
$Header: /cvsroot/pgsql/src/backend/access/hash/README,v 1.3 2003/09/04 22:06:27 tgl Exp $
22

33
This directory contains an implementation of hash indexing for Postgres.
44

@@ -229,8 +229,8 @@ existing bucket in two, thereby lowering the fill ratio:
229229
check split still needed
230230
if split not needed anymore, drop locks and exit
231231
decide which bucket to split
232-
Attempt to X-lock new bucket number (shouldn't fail, but...)
233232
Attempt to X-lock old bucket number (definitely could fail)
233+
Attempt to X-lock new bucket number (shouldn't fail, but...)
234234
if above fail, drop locks and exit
235235
update meta page to reflect new number of buckets
236236
write/release meta page
@@ -261,12 +261,6 @@ not be overfull and split attempts will stop. (We could make a successful
261261
splitter loop to see if the index is still overfull, but it seems better to
262262
distribute the split overhead across successive insertions.)
263263

264-
It may be wise to make the initial exclusive-lock-page-zero operation a
265-
conditional one as well, although the odds of a deadlock failure are quite
266-
low. (AFAICS it could only deadlock against a VACUUM operation that is
267-
trying to X-lock a bucket that the current process has a stopped indexscan
268-
in.)
269-
270264
A problem is that if a split fails partway through (eg due to insufficient
271265
disk space) the index is left corrupt. The probability of that could be
272266
made quite low if we grab a free page or two before we update the meta

src/backend/access/hash/hash.c

Lines changed: 88 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.67 2003/09/02 18:13:29 tgl Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.68 2003/09/04 22:06:27 tgl Exp $
1212
*
1313
* NOTES
1414
* This file contains only the public interface routines.
@@ -27,9 +27,6 @@
2727
#include "miscadmin.h"
2828

2929

30-
bool BuildingHash = false;
31-
32-
3330
/* Working state for hashbuild and its callback */
3431
typedef struct
3532
{
@@ -61,9 +58,6 @@ hashbuild(PG_FUNCTION_ARGS)
6158
double reltuples;
6259
HashBuildState buildstate;
6360

64-
/* set flag to disable locking */
65-
BuildingHash = true;
66-
6761
/*
6862
* We expect to be called exactly once for any index relation. If
6963
* that's not the case, big trouble's what we have.
@@ -82,9 +76,6 @@ hashbuild(PG_FUNCTION_ARGS)
8276
reltuples = IndexBuildHeapScan(heap, index, indexInfo,
8377
hashbuildCallback, (void *) &buildstate);
8478

85-
/* all done */
86-
BuildingHash = false;
87-
8879
/*
8980
* Since we just counted the tuples in the heap, we update its stats
9081
* in pg_class to guarantee that the planner takes advantage of the
@@ -212,10 +203,18 @@ hashgettuple(PG_FUNCTION_ARGS)
212203
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
213204
ScanDirection dir = (ScanDirection) PG_GETARG_INT32(1);
214205
HashScanOpaque so = (HashScanOpaque) scan->opaque;
206+
Relation rel = scan->indexRelation;
215207
Page page;
216208
OffsetNumber offnum;
217209
bool res;
218210

211+
/*
212+
* We hold pin but not lock on current buffer while outside the hash AM.
213+
* Reacquire the read lock here.
214+
*/
215+
if (BufferIsValid(so->hashso_curbuf))
216+
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_NOLOCK, HASH_READ);
217+
219218
/*
220219
* If we've already initialized this scan, we can just advance it in
221220
* the appropriate direction. If we haven't done so yet, we call a
@@ -267,6 +266,10 @@ hashgettuple(PG_FUNCTION_ARGS)
267266
}
268267
}
269268

269+
/* Release read lock on current buffer, but keep it pinned */
270+
if (BufferIsValid(so->hashso_curbuf))
271+
_hash_chgbufaccess(rel, so->hashso_curbuf, HASH_READ, HASH_NOLOCK);
272+
270273
PG_RETURN_BOOL(res);
271274
}
272275

@@ -285,6 +288,8 @@ hashbeginscan(PG_FUNCTION_ARGS)
285288

286289
scan = RelationGetIndexScan(rel, keysz, scankey);
287290
so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
291+
so->hashso_bucket_valid = false;
292+
so->hashso_bucket_blkno = 0;
288293
so->hashso_curbuf = so->hashso_mrkbuf = InvalidBuffer;
289294
scan->opaque = so;
290295

@@ -303,28 +308,38 @@ hashrescan(PG_FUNCTION_ARGS)
303308
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
304309
ScanKey scankey = (ScanKey) PG_GETARG_POINTER(1);
305310
HashScanOpaque so = (HashScanOpaque) scan->opaque;
306-
ItemPointer iptr;
311+
Relation rel = scan->indexRelation;
307312

308-
/* we hold a read lock on the current page in the scan */
309-
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
313+
/* if we are called from beginscan, so is still NULL */
314+
if (so)
310315
{
311-
_hash_relbuf(scan->indexRelation, so->hashso_curbuf, HASH_READ);
316+
/* release any pins we still hold */
317+
if (BufferIsValid(so->hashso_curbuf))
318+
_hash_dropbuf(rel, so->hashso_curbuf);
312319
so->hashso_curbuf = InvalidBuffer;
313-
ItemPointerSetInvalid(iptr);
314-
}
315-
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
316-
{
317-
_hash_relbuf(scan->indexRelation, so->hashso_mrkbuf, HASH_READ);
320+
321+
if (BufferIsValid(so->hashso_mrkbuf))
322+
_hash_dropbuf(rel, so->hashso_mrkbuf);
318323
so->hashso_mrkbuf = InvalidBuffer;
319-
ItemPointerSetInvalid(iptr);
324+
325+
/* release lock on bucket, too */
326+
if (so->hashso_bucket_blkno)
327+
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
328+
so->hashso_bucket_blkno = 0;
320329
}
321330

331+
/* set positions invalid (this will cause _hash_first call) */
332+
ItemPointerSetInvalid(&(scan->currentItemData));
333+
ItemPointerSetInvalid(&(scan->currentMarkData));
334+
322335
/* Update scan key, if a new one is given */
323336
if (scankey && scan->numberOfKeys > 0)
324337
{
325338
memmove(scan->keyData,
326339
scankey,
327340
scan->numberOfKeys * sizeof(ScanKeyData));
341+
if (so)
342+
so->hashso_bucket_valid = false;
328343
}
329344

330345
PG_RETURN_VOID();
@@ -337,32 +352,32 @@ Datum
337352
hashendscan(PG_FUNCTION_ARGS)
338353
{
339354
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
340-
ItemPointer iptr;
341-
HashScanOpaque so;
355+
HashScanOpaque so = (HashScanOpaque) scan->opaque;
356+
Relation rel = scan->indexRelation;
342357

343-
so = (HashScanOpaque) scan->opaque;
358+
/* don't need scan registered anymore */
359+
_hash_dropscan(scan);
344360

345-
/* release any locks we still hold */
346-
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
347-
{
348-
_hash_relbuf(scan->indexRelation, so->hashso_curbuf, HASH_READ);
349-
so->hashso_curbuf = InvalidBuffer;
350-
ItemPointerSetInvalid(iptr);
351-
}
361+
/* release any pins we still hold */
362+
if (BufferIsValid(so->hashso_curbuf))
363+
_hash_dropbuf(rel, so->hashso_curbuf);
364+
so->hashso_curbuf = InvalidBuffer;
352365

353-
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
354-
{
355-
if (BufferIsValid(so->hashso_mrkbuf))
356-
_hash_relbuf(scan->indexRelation, so->hashso_mrkbuf, HASH_READ);
357-
so->hashso_mrkbuf = InvalidBuffer;
358-
ItemPointerSetInvalid(iptr);
359-
}
366+
if (BufferIsValid(so->hashso_mrkbuf))
367+
_hash_dropbuf(rel, so->hashso_mrkbuf);
368+
so->hashso_mrkbuf = InvalidBuffer;
360369

361-
/* don't need scan registered anymore */
362-
_hash_dropscan(scan);
370+
/* release lock on bucket, too */
371+
if (so->hashso_bucket_blkno)
372+
_hash_droplock(rel, so->hashso_bucket_blkno, HASH_SHARE);
373+
so->hashso_bucket_blkno = 0;
363374

364375
/* be tidy */
365-
pfree(scan->opaque);
376+
ItemPointerSetInvalid(&(scan->currentItemData));
377+
ItemPointerSetInvalid(&(scan->currentMarkData));
378+
379+
pfree(so);
380+
scan->opaque = NULL;
366381

367382
PG_RETURN_VOID();
368383
}
@@ -374,25 +389,21 @@ Datum
374389
hashmarkpos(PG_FUNCTION_ARGS)
375390
{
376391
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
377-
ItemPointer iptr;
378-
HashScanOpaque so;
379-
380-
so = (HashScanOpaque) scan->opaque;
392+
HashScanOpaque so = (HashScanOpaque) scan->opaque;
393+
Relation rel = scan->indexRelation;
381394

382-
/* release lock on old marked data, if any */
383-
if (ItemPointerIsValid(iptr = &(scan->currentMarkData)))
384-
{
385-
_hash_relbuf(scan->indexRelation, so->hashso_mrkbuf, HASH_READ);
386-
so->hashso_mrkbuf = InvalidBuffer;
387-
ItemPointerSetInvalid(iptr);
388-
}
395+
/* release pin on old marked data, if any */
396+
if (BufferIsValid(so->hashso_mrkbuf))
397+
_hash_dropbuf(rel, so->hashso_mrkbuf);
398+
so->hashso_mrkbuf = InvalidBuffer;
399+
ItemPointerSetInvalid(&(scan->currentMarkData));
389400

390-
/* bump lock on currentItemData and copy to currentMarkData */
401+
/* bump pin count on currentItemData and copy to currentMarkData */
391402
if (ItemPointerIsValid(&(scan->currentItemData)))
392403
{
393-
so->hashso_mrkbuf = _hash_getbuf(scan->indexRelation,
404+
so->hashso_mrkbuf = _hash_getbuf(rel,
394405
BufferGetBlockNumber(so->hashso_curbuf),
395-
HASH_READ);
406+
HASH_NOLOCK);
396407
scan->currentMarkData = scan->currentItemData;
397408
}
398409

@@ -406,26 +417,21 @@ Datum
406417
hashrestrpos(PG_FUNCTION_ARGS)
407418
{
408419
IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
409-
ItemPointer iptr;
410-
HashScanOpaque so;
420+
HashScanOpaque so = (HashScanOpaque) scan->opaque;
421+
Relation rel = scan->indexRelation;
411422

412-
so = (HashScanOpaque) scan->opaque;
423+
/* release pin on current data, if any */
424+
if (BufferIsValid(so->hashso_curbuf))
425+
_hash_dropbuf(rel, so->hashso_curbuf);
426+
so->hashso_curbuf = InvalidBuffer;
427+
ItemPointerSetInvalid(&(scan->currentItemData));
413428

414-
/* release lock on current data, if any */
415-
if (ItemPointerIsValid(iptr = &(scan->currentItemData)))
416-
{
417-
_hash_relbuf(scan->indexRelation, so->hashso_curbuf, HASH_READ);
418-
so->hashso_curbuf = InvalidBuffer;
419-
ItemPointerSetInvalid(iptr);
420-
}
421-
422-
/* bump lock on currentMarkData and copy to currentItemData */
429+
/* bump pin count on currentMarkData and copy to currentItemData */
423430
if (ItemPointerIsValid(&(scan->currentMarkData)))
424431
{
425-
so->hashso_curbuf = _hash_getbuf(scan->indexRelation,
432+
so->hashso_curbuf = _hash_getbuf(rel,
426433
BufferGetBlockNumber(so->hashso_mrkbuf),
427-
HASH_READ);
428-
434+
HASH_NOLOCK);
429435
scan->currentItemData = scan->currentMarkData;
430436
}
431437

@@ -474,7 +480,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
474480
orig_maxbucket = metap->hashm_maxbucket;
475481
orig_ntuples = metap->hashm_ntuples;
476482
memcpy(&local_metapage, metap, sizeof(local_metapage));
477-
_hash_relbuf(rel, metabuf, HASH_READ);
483+
_hash_relbuf(rel, metabuf);
478484

479485
/* Scan the buckets that we know exist */
480486
cur_bucket = 0;
@@ -490,7 +496,12 @@ hashbulkdelete(PG_FUNCTION_ARGS)
490496
/* Get address of bucket's start page */
491497
bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
492498

493-
/* XXX lock bucket here */
499+
/* Exclusive-lock the bucket so we can shrink it */
500+
_hash_getlock(rel, bucket_blkno, HASH_EXCLUSIVE);
501+
502+
/* Shouldn't have any active scans locally, either */
503+
if (_hash_has_active_scan(rel, cur_bucket))
504+
elog(ERROR, "hash index has active scan during VACUUM");
494505

495506
/* Scan each page in bucket */
496507
blkno = bucket_blkno;
@@ -522,13 +533,6 @@ hashbulkdelete(PG_FUNCTION_ARGS)
522533
htup = &(hitem->hash_itup.t_tid);
523534
if (callback(htup, callback_state))
524535
{
525-
ItemPointerData indextup;
526-
527-
/* adjust any active scans that will be affected */
528-
/* (this should be unnecessary) */
529-
ItemPointerSet(&indextup, blkno, offno);
530-
_hash_adjscans(rel, &indextup);
531-
532536
/* delete the item from the page */
533537
PageIndexTupleDelete(page, offno);
534538
bucket_dirty = page_dirty = true;
@@ -547,24 +551,22 @@ hashbulkdelete(PG_FUNCTION_ARGS)
547551
}
548552

549553
/*
550-
* Write or free page if needed, advance to next page. We want
551-
* to preserve the invariant that overflow pages are nonempty.
554+
* Write page if needed, advance to next page.
552555
*/
553556
blkno = opaque->hasho_nextblkno;
554557

555-
if (PageIsEmpty(page) && (opaque->hasho_flag & LH_OVERFLOW_PAGE))
556-
_hash_freeovflpage(rel, buf);
557-
else if (page_dirty)
558+
if (page_dirty)
558559
_hash_wrtbuf(rel, buf);
559560
else
560-
_hash_relbuf(rel, buf, HASH_WRITE);
561+
_hash_relbuf(rel, buf);
561562
}
562563

563564
/* If we deleted anything, try to compact free space */
564565
if (bucket_dirty)
565566
_hash_squeezebucket(rel, cur_bucket, bucket_blkno);
566567

567-
/* XXX unlock bucket here */
568+
/* Release bucket lock */
569+
_hash_droplock(rel, bucket_blkno, HASH_EXCLUSIVE);
568570

569571
/* Advance to next bucket */
570572
cur_bucket++;
@@ -580,7 +582,7 @@ hashbulkdelete(PG_FUNCTION_ARGS)
580582
/* There's been a split, so process the additional bucket(s) */
581583
cur_maxbucket = metap->hashm_maxbucket;
582584
memcpy(&local_metapage, metap, sizeof(local_metapage));
583-
_hash_relbuf(rel, metabuf, HASH_WRITE);
585+
_hash_relbuf(rel, metabuf);
584586
goto loop_top;
585587
}
586588

0 commit comments

Comments
 (0)