Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 412ad7a

Browse files
committed
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
1 parent 86459b3 commit 412ad7a

File tree

11 files changed

+120
-28
lines changed

11 files changed

+120
-28
lines changed

src/backend/access/transam/multixact.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -3088,8 +3088,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30883088
* crash/basebackup, even though the state of the data directory would
30893089
* require it.
30903090
*/
3091-
Assert(!MyProc->delayChkpt);
3092-
MyProc->delayChkpt = true;
3091+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3092+
MyProc->delayChkpt |= DELAY_CHKPT_START;
30933093

30943094
/* WAL log truncation */
30953095
WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3115,7 +3115,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
31153115
/* Then offsets */
31163116
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
31173117

3118-
MyProc->delayChkpt = false;
3118+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
31193119

31203120
END_CRIT_SECTION();
31213121
LWLockRelease(MultiXactTruncationLock);

src/backend/access/transam/twophase.c

+7-5
Original file line numberDiff line numberDiff line change
@@ -475,7 +475,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
475475
}
476476
proc->xid = xid;
477477
Assert(proc->xmin == InvalidTransactionId);
478-
proc->delayChkpt = false;
478+
proc->delayChkpt = 0;
479479
proc->statusFlags = 0;
480480
proc->pid = 0;
481481
proc->databaseId = databaseid;
@@ -1164,7 +1164,8 @@ EndPrepare(GlobalTransaction gxact)
11641164

11651165
START_CRIT_SECTION();
11661166

1167-
MyProc->delayChkpt = true;
1167+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
1168+
MyProc->delayChkpt |= DELAY_CHKPT_START;
11681169

11691170
XLogBeginInsert();
11701171
for (record = records.head; record != NULL; record = record->next)
@@ -1207,7 +1208,7 @@ EndPrepare(GlobalTransaction gxact)
12071208
* checkpoint starting after this will certainly see the gxact as a
12081209
* candidate for fsyncing.
12091210
*/
1210-
MyProc->delayChkpt = false;
1211+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
12111212

12121213
/*
12131214
* Remember that we have this GlobalTransaction entry locked for us. If
@@ -2266,7 +2267,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
22662267
START_CRIT_SECTION();
22672268

22682269
/* See notes in RecordTransactionCommit */
2269-
MyProc->delayChkpt = true;
2270+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
2271+
MyProc->delayChkpt |= DELAY_CHKPT_START;
22702272

22712273
/*
22722274
* Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2314,7 +2316,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
23142316
TransactionIdCommitTree(xid, nchildren, children);
23152317

23162318
/* Checkpoint can proceed now */
2317-
MyProc->delayChkpt = false;
2319+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
23182320

23192321
END_CRIT_SECTION();
23202322

src/backend/access/transam/xact.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -1387,8 +1387,9 @@ RecordTransactionCommit(void)
13871387
* This makes checkpoint's determination of which xacts are delayChkpt
13881388
* a bit fuzzy, but it doesn't matter.
13891389
*/
1390+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
13901391
START_CRIT_SECTION();
1391-
MyProc->delayChkpt = true;
1392+
MyProc->delayChkpt |= DELAY_CHKPT_START;
13921393

13931394
SetCurrentTransactionStopTimestamp();
13941395

@@ -1489,7 +1490,7 @@ RecordTransactionCommit(void)
14891490
*/
14901491
if (markXidCommitted)
14911492
{
1492-
MyProc->delayChkpt = false;
1493+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
14931494
END_CRIT_SECTION();
14941495
}
14951496

src/backend/access/transam/xlog.c

+14-2
Original file line numberDiff line numberDiff line change
@@ -6517,18 +6517,30 @@ CreateCheckPoint(int flags)
65176517
* and we will correctly flush the update below. So we cannot miss any
65186518
* xacts we need to wait for.
65196519
*/
6520-
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
6520+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
65216521
if (nvxids > 0)
65226522
{
65236523
do
65246524
{
65256525
pg_usleep(10000L); /* wait for 10 msec */
6526-
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
6526+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
6527+
DELAY_CHKPT_START));
65276528
}
65286529
pfree(vxids);
65296530

65306531
CheckPointGuts(checkPoint.redo, flags);
65316532

6533+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
6534+
if (nvxids > 0)
6535+
{
6536+
do
6537+
{
6538+
pg_usleep(10000L); /* wait for 10 msec */
6539+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
6540+
DELAY_CHKPT_COMPLETE));
6541+
}
6542+
pfree(vxids);
6543+
65326544
/*
65336545
* Take a snapshot of running transactions and write this to WAL. This
65346546
* allows us to reconstruct the state of running transactions during

src/backend/access/transam/xloginsert.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -1011,7 +1011,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
10111011
/*
10121012
* Ensure no checkpoint can change our view of RedoRecPtr.
10131013
*/
1014-
Assert(MyProc->delayChkpt);
1014+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
10151015

10161016
/*
10171017
* Update RedoRecPtr so that we can make the right decision

src/backend/catalog/storage.c

+28-1
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
325325

326326
RelationPreTruncate(rel);
327327

328+
/*
329+
* Make sure that a concurrent checkpoint can't complete while truncation
330+
* is in progress.
331+
*
332+
* The truncation operation might drop buffers that the checkpoint
333+
* otherwise would have flushed. If it does, then it's essential that
334+
* the files actually get truncated on disk before the checkpoint record
335+
* is written. Otherwise, if replay begins from that checkpoint, the
336+
* to-be-truncated blocks might still exist on disk but have older
337+
* contents than expected, which can cause replay to fail. It's OK for
338+
* the blocks to not exist on disk at all, but not for them to have the
339+
* wrong contents.
340+
*/
341+
Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
342+
MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
343+
328344
/*
329345
* We WAL-log the truncation before actually truncating, which means
330346
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
363379
XLogFlush(lsn);
364380
}
365381

366-
/* Do the real work to truncate relation forks */
382+
/*
383+
* This will first remove any buffers from the buffer pool that should no
384+
* longer exist after truncation is complete, and then truncate the
385+
* corresponding files on disk.
386+
*/
367387
smgrtruncate(RelationGetSmgr(rel), forks, nforks, blocks);
368388

389+
/* We've done all the critical work, so checkpoints are OK now. */
390+
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
391+
369392
/*
370393
* Update upper-level FSM pages to account for the truncation. This is
371394
* important because the just-truncated pages were likely marked as
372395
* all-free, and would be preferentially selected.
396+
*
397+
* NB: There's no point in delaying checkpoints until this is done.
398+
* Because the FSM is not WAL-logged, we have to be prepared for the
399+
* possibility of corruption after a crash anyway.
373400
*/
374401
if (need_fsm_vacuum)
375402
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);

src/backend/storage/buffer/bufmgr.c

+4-2
Original file line numberDiff line numberDiff line change
@@ -3911,7 +3911,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
39113911
* essential that CreateCheckPoint waits for virtual transactions
39123912
* rather than full transactionids.
39133913
*/
3914-
MyProc->delayChkpt = delayChkpt = true;
3914+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3915+
MyProc->delayChkpt |= DELAY_CHKPT_START;
3916+
delayChkpt = true;
39153917
lsn = XLogSaveBufferForHint(buffer, buffer_std);
39163918
}
39173919

@@ -3944,7 +3946,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
39443946
UnlockBufHdr(bufHdr, buf_state);
39453947

39463948
if (delayChkpt)
3947-
MyProc->delayChkpt = false;
3949+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
39483950

39493951
if (dirtied)
39503952
{

src/backend/storage/ipc/procarray.c

+19-7
Original file line numberDiff line numberDiff line change
@@ -698,7 +698,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
698698

699699
proc->lxid = InvalidLocalTransactionId;
700700
proc->xmin = InvalidTransactionId;
701-
proc->delayChkpt = false; /* be sure this is cleared in abort */
701+
702+
/* be sure this is cleared in abort */
703+
proc->delayChkpt = 0;
704+
702705
proc->recoveryConflictPending = false;
703706

704707
/* must be cleared with xid/xmin: */
@@ -737,7 +740,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
737740
proc->xid = InvalidTransactionId;
738741
proc->lxid = InvalidLocalTransactionId;
739742
proc->xmin = InvalidTransactionId;
740-
proc->delayChkpt = false; /* be sure this is cleared in abort */
743+
744+
/* be sure this is cleared in abort */
745+
proc->delayChkpt = 0;
746+
741747
proc->recoveryConflictPending = false;
742748

743749
/* must be cleared with xid/xmin: */
@@ -3053,7 +3059,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
30533059
* delaying checkpoint because they have critical actions in progress.
30543060
*
30553061
* Constructs an array of VXIDs of transactions that are currently in commit
3056-
* critical sections, as shown by having delayChkpt set in their PGPROC.
3062+
* critical sections, as shown by having specified delayChkpt bits set in their
3063+
* PGPROC.
30573064
*
30583065
* Returns a palloc'd array that should be freed by the caller.
30593066
* *nvxids is the number of valid entries.
@@ -3067,13 +3074,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
30673074
* for clearing of delayChkpt to propagate is unimportant for correctness.
30683075
*/
30693076
VirtualTransactionId *
3070-
GetVirtualXIDsDelayingChkpt(int *nvxids)
3077+
GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
30713078
{
30723079
VirtualTransactionId *vxids;
30733080
ProcArrayStruct *arrayP = procArray;
30743081
int count = 0;
30753082
int index;
30763083

3084+
Assert(type != 0);
3085+
30773086
/* allocate what's certainly enough result space */
30783087
vxids = (VirtualTransactionId *)
30793088
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -3085,7 +3094,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
30853094
int pgprocno = arrayP->pgprocnos[index];
30863095
PGPROC *proc = &allProcs[pgprocno];
30873096

3088-
if (proc->delayChkpt)
3097+
if ((proc->delayChkpt & type) != 0)
30893098
{
30903099
VirtualTransactionId vxid;
30913100

@@ -3111,12 +3120,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
31113120
* those numbers should be small enough for it not to be a problem.
31123121
*/
31133122
bool
3114-
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
3123+
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
31153124
{
31163125
bool result = false;
31173126
ProcArrayStruct *arrayP = procArray;
31183127
int index;
31193128

3129+
Assert(type != 0);
3130+
31203131
LWLockAcquire(ProcArrayLock, LW_SHARED);
31213132

31223133
for (index = 0; index < arrayP->numProcs; index++)
@@ -3127,7 +3138,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
31273138

31283139
GET_VXID_FROM_PGPROC(vxid, *proc);
31293140

3130-
if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
3141+
if ((proc->delayChkpt & type) != 0 &&
3142+
VirtualTransactionIdIsValid(vxid))
31313143
{
31323144
int i;
31333145

src/backend/storage/lmgr/proc.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ InitProcess(void)
393393
MyProc->roleId = InvalidOid;
394394
MyProc->tempNamespaceId = InvalidOid;
395395
MyProc->isBackgroundWorker = IsBackgroundWorker;
396-
MyProc->delayChkpt = false;
396+
MyProc->delayChkpt = 0;
397397
MyProc->statusFlags = 0;
398398
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
399399
if (IsAutoVacuumWorkerProcess())
@@ -578,7 +578,7 @@ InitAuxiliaryProcess(void)
578578
MyProc->roleId = InvalidOid;
579579
MyProc->tempNamespaceId = InvalidOid;
580580
MyProc->isBackgroundWorker = IsBackgroundWorker;
581-
MyProc->delayChkpt = false;
581+
MyProc->delayChkpt = 0;
582582
MyProc->statusFlags = 0;
583583
MyProc->lwWaiting = false;
584584
MyProc->lwWaitMode = 0;

src/include/storage/proc.h

+36-1
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,41 @@ struct XidCache
8686
*/
8787
#define INVALID_PGPROCNO PG_INT32_MAX
8888

89+
/*
90+
* Flags for PGPROC.delayChkpt
91+
*
92+
* These flags can be used to delay the start or completion of a checkpoint
93+
* for short periods. A flag is in effect if the corresponding bit is set in
94+
* the PGPROC of any backend.
95+
*
96+
* For our purposes here, a checkpoint has three phases: (1) determine the
97+
* location to which the redo pointer will be moved, (2) write all the
98+
* data durably to disk, and (3) WAL-log the checkpoint.
99+
*
100+
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
101+
* to phase 2. This is useful when we are performing a WAL-logged modification
102+
* of data that will be flushed to disk in phase 2. By setting this flag
103+
* before writing WAL and clearing it after we've both written WAL and
104+
* performed the corresponding modification, we ensure that if the WAL record
105+
* is inserted prior to the new redo point, the corresponding data changes will
106+
* also be flushed to disk before the checkpoint can complete. (In the
107+
* extremely common case where the data being modified is in shared buffers
108+
* and we acquire an exclusive content lock on the relevant buffers before
109+
* writing WAL, this mechanism is not needed, because phase 2 will block
110+
* until we release the content lock and then flush the modified data to
111+
* disk.)
112+
*
113+
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
114+
* to phase 3. This is useful if we are performing a WAL-logged operation that
115+
* might invalidate buffers, such as relation truncation. In this case, we need
116+
* to ensure that any buffers which were invalidated and thus not flushed by
117+
* the checkpoint are actually destroyed on disk. Replay can cope with a file
118+
* or block that doesn't exist, but not with a block that has the wrong
119+
* contents.
120+
*/
121+
#define DELAY_CHKPT_START (1<<0)
122+
#define DELAY_CHKPT_COMPLETE (1<<1)
123+
89124
typedef enum
90125
{
91126
PROC_WAIT_STATUS_OK,
@@ -191,7 +226,7 @@ struct PGPROC
191226
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
192227
* started */
193228

194-
bool delayChkpt; /* true if this proc delays checkpoint start */
229+
int delayChkpt; /* for DELAY_CHKPT_* flags */
195230

196231
uint8 statusFlags; /* this backend's status flags, see PROC_*
197232
* above. mirrored in

src/include/storage/procarray.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void);
5959
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
6060
extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
6161

62-
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
63-
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
62+
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
63+
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
64+
int nvxids, int type);
6465

6566
extern PGPROC *BackendPidGetProc(int pid);
6667
extern PGPROC *BackendPidGetProcWithLock(int pid);

0 commit comments

Comments
 (0)