@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.65 2007/04/09 22:03:57 tgl Exp $
+ *    $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.66 2007/04/19 20:24:04 tgl Exp $
  *
  * NOTES
  *    Postgres hash pages look like ordinary relation pages.  The opaque
@@ -36,7 +36,8 @@
 #include "utils/lsyscache.h"
 
 
-static BlockNumber _hash_alloc_buckets(Relation rel, uint32 nblocks);
+static bool _hash_alloc_buckets(Relation rel, BlockNumber firstblock,
+                    uint32 nblocks);
 static void _hash_splitbucket(Relation rel, Buffer metabuf,
                   Bucket obucket, Bucket nbucket,
                   BlockNumber start_oblkno,
@@ -104,8 +105,9 @@ _hash_droplock(Relation rel, BlockNumber whichlock, int access)
  *      requested buffer and its reference count has been incremented
  *      (ie, the buffer is "locked and pinned").
  *
- *      blkno == P_NEW is allowed, but it is caller's responsibility to
- *      ensure that only one process can extend the index at a time.
+ *      P_NEW is disallowed because this routine should only be used
+ *      to access pages that are known to be before the filesystem EOF.
+ *      Extending the index should be done with _hash_getnewbuf.
  *
  *      All call sites should call either _hash_checkpage or _hash_pageinit
  *      on the returned page, depending on whether the block is expected
@@ -116,6 +118,9 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
 {
     Buffer      buf;
 
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+
     buf = ReadBuffer(rel, blkno);
 
     if (access != HASH_NOLOCK)
@@ -125,6 +130,51 @@ _hash_getbuf(Relation rel, BlockNumber blkno, int access)
     return buf;
 }
 
+/*
+ *  _hash_getnewbuf() -- Get a new page at the end of the index.
+ *
+ *      This has the same API as _hash_getbuf, except that we are adding
+ *      a page to the index, and hence expect the page to be past the
+ *      logical EOF.  (However, we have to support the case where it isn't,
+ *      since a prior try might have crashed after extending the filesystem
+ *      EOF but before updating the metapage to reflect the added page.)
+ *
+ *      It is caller's responsibility to ensure that only one process can
+ *      extend the index at a time.
+ *
+ *      All call sites should call _hash_pageinit on the returned page.
+ *      Also, it's difficult to imagine why access would not be HASH_WRITE.
+ */
+Buffer
+_hash_getnewbuf(Relation rel, BlockNumber blkno, int access)
+{
+    BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+    Buffer      buf;
+
+    if (blkno == P_NEW)
+        elog(ERROR, "hash AM does not use P_NEW");
+    if (blkno > nblocks)
+        elog(ERROR, "access to noncontiguous page in hash index \"%s\"",
+             RelationGetRelationName(rel));
+
+    /* smgr insists we use P_NEW to extend the relation */
+    if (blkno == nblocks)
+    {
+        buf = ReadBuffer(rel, P_NEW);
+        if (BufferGetBlockNumber(buf) != blkno)
+            elog(ERROR, "unexpected hash relation size: %u, should be %u",
+                 BufferGetBlockNumber(buf), blkno);
+    }
+    else
+        buf = ReadBuffer(rel, blkno);
+
+    if (access != HASH_NOLOCK)
+        LockBuffer(buf, access);
+
+    /* ref count and lock type are correct */
+    return buf;
+}
+
 /*
  *  _hash_relbuf() -- release a locked buffer.
  *
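The intended call pattern for the new function mirrors what _hash_metapinit does in the hunks below: the caller computes the target block number in advance and must already hold the exclusive right to extend the index. A minimal sketch of that pattern (backend-context code, not standalone; _hash_wrtbuf is the hash AM's existing write-and-release helper):

    /* Sketch: fetch the page at a precomputed block number, then
     * initialize it.  _hash_getnewbuf handles the P_NEW extension
     * internally when blkno equals the current relation size. */
    Buffer      metabuf;
    Page        pg;

    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
    pg = BufferGetPage(metabuf);
    _hash_pageinit(pg, BufferGetPageSize(metabuf));
    /* ... set up page contents ... */
    _hash_wrtbuf(rel, metabuf);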
@@ -238,12 +288,11 @@ _hash_metapinit(Relation rel)
 
     /*
      * We initialize the metapage, the first two bucket pages, and the
-     * first bitmap page in sequence, using P_NEW to cause smgrextend()
-     * calls to occur.  This ensures that the smgr level has the right
-     * idea of the physical index length.
+     * first bitmap page in sequence, using _hash_getnewbuf to cause
+     * smgrextend() calls to occur.  This ensures that the smgr level
+     * has the right idea of the physical index length.
      */
-    metabuf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-    Assert(BufferGetBlockNumber(metabuf) == HASH_METAPAGE);
+    metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE);
     pg = BufferGetPage(metabuf);
     _hash_pageinit(pg, BufferGetPageSize(metabuf));
 
@@ -301,8 +350,7 @@ _hash_metapinit(Relation rel)
      */
     for (i = 0; i <= 1; i++)
     {
-        buf = _hash_getbuf(rel, P_NEW, HASH_WRITE);
-        Assert(BufferGetBlockNumber(buf) == BUCKET_TO_BLKNO(metap, i));
+        buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE);
         pg = BufferGetPage(buf);
         _hash_pageinit(pg, BufferGetPageSize(buf));
         pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
@@ -350,7 +398,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     Bucket      old_bucket;
     Bucket      new_bucket;
     uint32      spare_ndx;
-    BlockNumber firstblock = InvalidBlockNumber;
     BlockNumber start_oblkno;
     BlockNumber start_nblkno;
     uint32      maxbucket;
@@ -402,39 +449,15 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE)
         goto fail;
 
-    /*
-     * If the split point is increasing (hashm_maxbucket's log base 2
-     * increases), we need to allocate a new batch of bucket pages.
-     */
-    new_bucket = metap->hashm_maxbucket + 1;
-    spare_ndx = _hash_log2(new_bucket + 1);
-    if (spare_ndx > metap->hashm_ovflpoint)
-    {
-        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
-        /*
-         * The number of buckets in the new splitpoint is equal to the
-         * total number already in existence, i.e. new_bucket.  Currently
-         * this maps one-to-one to blocks required, but someday we may need
-         * a more complicated calculation here.
-         */
-        firstblock = _hash_alloc_buckets(rel, new_bucket);
-        if (firstblock == InvalidBlockNumber)
-            goto fail;          /* can't split due to BlockNumber overflow */
-    }
-
     /*
      * Determine which bucket is to be split, and attempt to lock the old
      * bucket.  If we can't get the lock, give up.
      *
      * The lock protects us against other backends, but not against our own
      * backend.  Must check for active scans separately.
-     *
-     * Ideally we would lock the new bucket too before proceeding, but if we
-     * are about to cross a splitpoint then the BUCKET_TO_BLKNO mapping isn't
-     * correct yet.  For simplicity we update the metapage first and then
-     * lock.  This should be okay because no one else should be trying to lock
-     * the new bucket yet...
      */
+    new_bucket = metap->hashm_maxbucket + 1;
+
     old_bucket = (new_bucket & metap->hashm_lowmask);
 
     start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
@@ -445,6 +468,45 @@ _hash_expandtable(Relation rel, Buffer metabuf)
     if (!_hash_try_getlock(rel, start_oblkno, HASH_EXCLUSIVE))
         goto fail;
 
+    /*
+     * Likewise lock the new bucket (should never fail).
+     *
+     * Note: it is safe to compute the new bucket's blkno here, even though
+     * we may still need to update the BUCKET_TO_BLKNO mapping.  This is
+     * because the current value of hashm_spares[hashm_ovflpoint] correctly
+     * shows where we are going to put a new splitpoint's worth of buckets.
+     */
+    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+
+    if (_hash_has_active_scan(rel, new_bucket))
+        elog(ERROR, "scan in progress on supposedly new bucket");
+
+    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
+        elog(ERROR, "could not get lock on supposedly new bucket");
+
+    /*
+     * If the split point is increasing (hashm_maxbucket's log base 2
+     * increases), we need to allocate a new batch of bucket pages.
+     */
+    spare_ndx = _hash_log2(new_bucket + 1);
+    if (spare_ndx > metap->hashm_ovflpoint)
+    {
+        Assert(spare_ndx == metap->hashm_ovflpoint + 1);
+        /*
+         * The number of buckets in the new splitpoint is equal to the
+         * total number already in existence, i.e. new_bucket.  Currently
+         * this maps one-to-one to blocks required, but someday we may need
+         * a more complicated calculation here.
+         */
+        if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket))
+        {
+            /* can't split due to BlockNumber overflow */
+            _hash_droplock(rel, start_oblkno, HASH_EXCLUSIVE);
+            _hash_droplock(rel, start_nblkno, HASH_EXCLUSIVE);
+            goto fail;
+        }
+    }
+
     /*
      * Okay to proceed with split.  Update the metapage bucket mapping info.
      *
@@ -477,20 +539,6 @@ _hash_expandtable(Relation rel, Buffer metabuf)
         metap->hashm_ovflpoint = spare_ndx;
     }
 
-    /* now we can compute the new bucket's primary block number */
-    start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
-
-    /* if we added a splitpoint, should match result of _hash_alloc_buckets */
-    if (firstblock != InvalidBlockNumber &&
-        firstblock != start_nblkno)
-        elog(PANIC, "unexpected hash relation size: %u, should be %u",
-             firstblock, start_nblkno);
-
-    Assert(!_hash_has_active_scan(rel, new_bucket));
-
-    if (!_hash_try_getlock(rel, start_nblkno, HASH_EXCLUSIVE))
-        elog(PANIC, "could not get lock on supposedly new bucket");
-
     /* Done mucking with metapage */
     END_CRIT_SECTION();
 
@@ -539,7 +587,7 @@ _hash_expandtable(Relation rel, Buffer metabuf)
  * This does not need to initialize the new bucket pages; we'll do that as
  * each one is used by _hash_expandtable().  But we have to extend the logical
  * EOF to the end of the splitpoint; this keeps smgr's idea of the EOF in
- * sync with ours, so that overflow-page allocation works correctly.
+ * sync with ours, so that we don't get complaints from smgr.
  *
  * We do this by writing a page of zeroes at the end of the splitpoint range.
  * We expect that the filesystem will ensure that the intervening pages read
@@ -554,37 +602,30 @@ _hash_expandtable(Relation rel, Buffer metabuf)
  * for the purpose.  OTOH, adding a splitpoint is a very infrequent operation,
  * so it may not be worth worrying about.
  *
- * Returns the first block number in the new splitpoint's range, or
- * InvalidBlockNumber if allocation failed due to BlockNumber overflow.
+ * Returns TRUE if successful, or FALSE if allocation failed due to
+ * BlockNumber overflow.
  */
-static BlockNumber
-_hash_alloc_buckets(Relation rel, uint32 nblocks)
+static bool
+_hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks)
 {
-    BlockNumber firstblock;
     BlockNumber lastblock;
     char        zerobuf[BLCKSZ];
 
-    /*
-     * Since we hold metapage lock, no one else is either splitting or
-     * allocating a new page in _hash_getovflpage(); hence it's safe to
-     * assume that the relation length isn't changing under us.
-     */
-    firstblock = RelationGetNumberOfBlocks(rel);
     lastblock = firstblock + nblocks - 1;
 
     /*
      * Check for overflow in block number calculation; if so, we cannot
      * extend the index anymore.
     */
     if (lastblock < firstblock || lastblock == InvalidBlockNumber)
-        return InvalidBlockNumber;
+        return false;
 
     MemSet(zerobuf, 0, sizeof(zerobuf));
 
-    /* Note: we assume RelationGetNumberOfBlocks did RelationOpenSmgr for us */
+    RelationOpenSmgr(rel);
     smgrextend(rel->rd_smgr, lastblock, zerobuf, rel->rd_istemp);
 
-    return firstblock;
+    return true;
 }
 
 
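The overflow guard retained in this hunk can be exercised on its own. A hypothetical standalone sketch, with BlockNumber and InvalidBlockNumber re-declared locally to match their PostgreSQL definitions (uint32 and 0xFFFFFFFF):

    #include <stdio.h>
    #include <stdint.h>

    typedef uint32_t BlockNumber;
    #define InvalidBlockNumber ((BlockNumber) 0xFFFFFFFF)

    /* Mirrors the check in _hash_alloc_buckets: reject the batch if the
     * last block wraps around or lands on the reserved invalid value. */
    static int
    alloc_would_overflow(BlockNumber firstblock, uint32_t nblocks)
    {
        BlockNumber lastblock = firstblock + nblocks - 1;

        return (lastblock < firstblock || lastblock == InvalidBlockNumber);
    }

    int
    main(void)
    {
        printf("%d\n", alloc_would_overflow(100, 64));          /* 0: fine */
        printf("%d\n", alloc_would_overflow(0xFFFFFFF0, 0x20)); /* 1: wraps */
        return 0;
    }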