Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 81c4508

Browse files
committed
Fix race condition between hot standby and restoring a full-page image.
There was a window in RestoreBackupBlock where a page would be zeroed out, but not yet locked. If a backend pinned and locked the page in that window, it saw the zeroed page instead of the old page or new page contents, which could lead to missing rows in a result set, or errors. To fix, replace RBM_ZERO with RBM_ZERO_AND_LOCK, which atomically pins, zeroes, and locks the page, if it's not in the buffer cache already. In stable branches, the old RBM_ZERO constant is renamed to RBM_DO_NOT_USE, to avoid breaking any 3rd party extensions that might use RBM_ZERO. More importantly, this avoids renumbering the other enum values, which would cause even bigger confusion in extensions that use ReadBufferExtended, but haven't been recompiled. Backpatch to all supported versions; this has been racy since hot standby was introduced.
1 parent 35fed51 commit 81c4508

File tree

5 files changed

+78
-32
lines changed

5 files changed

+78
-32
lines changed

src/backend/access/hash/hashpage.c

+7-6
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,8 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno)
155155
if (blkno == P_NEW)
156156
elog(ERROR, "hash AM does not use P_NEW");
157157

158-
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO, NULL);
159-
160-
LockBuffer(buf, HASH_WRITE);
158+
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK,
159+
NULL);
161160

162161
/* ref count and lock type are correct */
163162

@@ -198,11 +197,13 @@ _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum)
198197
if (BufferGetBlockNumber(buf) != blkno)
199198
elog(ERROR, "unexpected hash relation size: %u, should be %u",
200199
BufferGetBlockNumber(buf), blkno);
200+
LockBuffer(buf, HASH_WRITE);
201201
}
202202
else
203-
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO, NULL);
204-
205-
LockBuffer(buf, HASH_WRITE);
203+
{
204+
buf = ReadBufferExtended(rel, forkNum, blkno, RBM_ZERO_AND_LOCK,
205+
NULL);
206+
}
206207

207208
/* ref count and lock type are correct */
208209

src/backend/access/heap/heapam.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -7556,7 +7556,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
75567556
{
75577557
XLogReadBufferForRedoExtended(lsn, record, 0,
75587558
target_node, MAIN_FORKNUM, blkno,
7559-
RBM_ZERO, false, &buffer);
7559+
RBM_ZERO_AND_LOCK, false, &buffer);
75607560
page = BufferGetPage(buffer);
75617561
PageInit(page, BufferGetPageSize(buffer), 0);
75627562
action = BLK_NEEDS_REDO;
@@ -7683,7 +7683,7 @@ heap_xlog_multi_insert(XLogRecPtr lsn, XLogRecord *record)
76837683
{
76847684
XLogReadBufferForRedoExtended(lsn, record, 0,
76857685
rnode, MAIN_FORKNUM, blkno,
7686-
RBM_ZERO, false, &buffer);
7686+
RBM_ZERO_AND_LOCK, false, &buffer);
76877687
page = BufferGetPage(buffer);
76887688
PageInit(page, BufferGetPageSize(buffer), 0);
76897689
action = BLK_NEEDS_REDO;
@@ -7876,7 +7876,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool hot_update)
78767876
{
78777877
XLogReadBufferForRedoExtended(lsn, record, 1,
78787878
rnode, MAIN_FORKNUM, newblk,
7879-
RBM_ZERO, false, &nbuffer);
7879+
RBM_ZERO_AND_LOCK, false, &nbuffer);
78807880
page = (Page) BufferGetPage(nbuffer);
78817881
PageInit(page, BufferGetPageSize(nbuffer), 0);
78827882
newaction = BLK_NEEDS_REDO;

src/backend/access/transam/xlogutils.c

+27-17
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,13 @@ XLogReadBufferForRedo(XLogRecPtr lsn, XLogRecord *record, int block_index,
287287
* XLogReadBufferForRedoExtended
288288
* Like XLogReadBufferForRedo, but with extra options.
289289
*
290-
* If mode is RBM_ZERO or RBM_ZERO_ON_ERROR, if the page doesn't exist, the
291-
* relation is extended with all-zeroes pages up to the referenced block
292-
* number. In RBM_ZERO mode, the return value is always BLK_NEEDS_REDO.
290+
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
291+
* with all-zeroes pages up to the referenced block number. In
292+
* RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK modes, the return value
293+
* is always BLK_NEEDS_REDO.
294+
*
295+
* (The RBM_ZERO_AND_CLEANUP_LOCK mode is redundant with the get_cleanup_lock
296+
* parameter. Do not use an inconsistent combination!)
293297
*
294298
* If 'get_cleanup_lock' is true, a "cleanup lock" is acquired on the buffer
295299
* using LockBufferForCleanup(), instead of a regular exclusive lock.
@@ -312,10 +316,13 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record,
312316
*buf = XLogReadBufferExtended(rnode, forkno, blkno, mode);
313317
if (BufferIsValid(*buf))
314318
{
315-
if (get_cleanup_lock)
316-
LockBufferForCleanup(*buf);
317-
else
318-
LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
319+
if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK)
320+
{
321+
if (get_cleanup_lock)
322+
LockBufferForCleanup(*buf);
323+
else
324+
LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
325+
}
319326
if (lsn <= PageGetLSN(BufferGetPage(*buf)))
320327
return BLK_DONE;
321328
else
@@ -341,16 +348,17 @@ XLogReadBufferForRedoExtended(XLogRecPtr lsn, XLogRecord *record,
341348
* The returned buffer is exclusively-locked.
342349
*
343350
* For historical reasons, instead of a ReadBufferMode argument, this only
344-
* supports RBM_ZERO (init == true) and RBM_NORMAL (init == false) modes.
351+
* supports RBM_ZERO_AND_LOCK (init == true) and RBM_NORMAL (init == false)
352+
* modes.
345353
*/
346354
Buffer
347355
XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
348356
{
349357
Buffer buf;
350358

351359
buf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno,
352-
init ? RBM_ZERO : RBM_NORMAL);
353-
if (BufferIsValid(buf))
360+
init ? RBM_ZERO_AND_LOCK : RBM_NORMAL);
361+
if (BufferIsValid(buf) && !init)
354362
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
355363

356364
return buf;
@@ -369,8 +377,8 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
369377
* dropped or truncated. If we don't see evidence of that later in the WAL
370378
* sequence, we'll complain at the end of WAL replay.)
371379
*
372-
* In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
373-
* relation is extended with all-zeroes pages up to the given block number.
380+
* In RBM_ZERO_* modes, if the page doesn't exist, the relation is extended
381+
* with all-zeroes pages up to the given block number.
374382
*
375383
* In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
376384
* exist, and we don't check for all-zeroes. Thus, no log entry is made
@@ -424,14 +432,20 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
424432
do
425433
{
426434
if (buffer != InvalidBuffer)
435+
{
436+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
437+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
427438
ReleaseBuffer(buffer);
439+
}
428440
buffer = ReadBufferWithoutRelcache(rnode, forknum,
429441
P_NEW, mode, NULL);
430442
}
431443
while (BufferGetBlockNumber(buffer) < blkno);
432444
/* Handle the corner case that P_NEW returns non-consecutive pages */
433445
if (BufferGetBlockNumber(buffer) != blkno)
434446
{
447+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
448+
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
435449
ReleaseBuffer(buffer);
436450
buffer = ReadBufferWithoutRelcache(rnode, forknum, blkno,
437451
mode, NULL);
@@ -537,12 +551,8 @@ RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk,
537551
Page page;
538552

539553
buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
540-
RBM_ZERO);
554+
get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK);
541555
Assert(BufferIsValid(buffer));
542-
if (get_cleanup_lock)
543-
LockBufferForCleanup(buffer);
544-
else
545-
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
546556

547557
page = (Page) BufferGetPage(buffer);
548558

src/backend/storage/buffer/bufmgr.c

+37-4
Original file line numberDiff line numberDiff line change
@@ -499,14 +499,19 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
499499
* valid, the page is zeroed instead of throwing an error. This is intended
500500
* for non-critical data, where the caller is prepared to repair errors.
501501
*
502-
* In RBM_ZERO mode, if the page isn't in buffer cache already, it's filled
503-
* with zeros instead of reading it from disk. Useful when the caller is
504-
* going to fill the page from scratch, since this saves I/O and avoids
502+
* In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
503+
* filled with zeros instead of reading it from disk. Useful when the caller
504+
* is going to fill the page from scratch, since this saves I/O and avoids
505505
* unnecessary failure if the page-on-disk has corrupt page headers.
506+
* The page is returned locked to ensure that the caller has a chance to
507+
* initialize the page before it's made visible to others.
506508
* Caution: do not use this mode to read a page that is beyond the relation's
507509
* current physical EOF; that is likely to cause problems in md.c when
508510
* the page is modified and written out. P_NEW is OK, though.
509511
*
512+
* RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
513+
* a cleanup-strength lock on the page.
514+
*
510515
* RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
511516
*
512517
* If strategy is not NULL, a nondefault buffer access strategy is used.
@@ -648,6 +653,18 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
648653
isExtend,
649654
found);
650655

656+
/*
657+
* In RBM_ZERO_AND_LOCK mode the caller expects the page to
658+
* be locked on return.
659+
*/
660+
if (!isLocalBuf)
661+
{
662+
if (mode == RBM_ZERO_AND_LOCK)
663+
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
664+
else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
665+
LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
666+
}
667+
651668
return BufferDescriptorGetBuffer(bufHdr);
652669
}
653670

@@ -729,7 +746,7 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
729746
* Read in the page, unless the caller intends to overwrite it and
730747
* just wants us to allocate a buffer.
731748
*/
732-
if (mode == RBM_ZERO)
749+
if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
733750
MemSet((char *) bufBlock, 0, BLCKSZ);
734751
else
735752
{
@@ -771,6 +788,22 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
771788
}
772789
}
773790

791+
/*
792+
* In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
793+
* the page as valid, to make sure that no other backend sees the zeroed
794+
* page before the caller has had a chance to initialize it.
795+
*
796+
* Since no-one else can be looking at the page contents yet, there is no
797+
* difference between an exclusive lock and a cleanup-strength lock.
798+
* (Note that we cannot use LockBuffer() or LockBufferForCleanup() here,
799+
* because they assert that the buffer is already valid.)
800+
*/
801+
if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
802+
!isLocalBuf)
803+
{
804+
LWLockAcquire(bufHdr->content_lock, LW_EXCLUSIVE);
805+
}
806+
774807
if (isLocalBuf)
775808
{
776809
/* Only need to adjust flags */

src/include/storage/bufmgr.h

+4-2
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ typedef enum BufferAccessStrategyType
3636
typedef enum
3737
{
3838
RBM_NORMAL, /* Normal read */
39-
RBM_ZERO, /* Don't read from disk, caller will
40-
* initialize */
39+
RBM_ZERO_AND_LOCK, /* Don't read from disk, caller will
40+
* initialize. Also locks the page. */
41+
RBM_ZERO_AND_CLEANUP_LOCK, /* Like RBM_ZERO_AND_LOCK, but locks the page
42+
* in "cleanup" mode */
4143
RBM_ZERO_ON_ERROR, /* Read, but return an all-zeros page on error */
4244
RBM_NORMAL_NO_LOG /* Don't log page as invalid during WAL
4345
* replay; otherwise same as RBM_NORMAL */

0 commit comments

Comments
 (0)