Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 1ce14b6

Browse files
committed
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
1 parent c0f99bb commit 1ce14b6

File tree

11 files changed

+120
-28
lines changed

11 files changed

+120
-28
lines changed

src/backend/access/transam/multixact.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3071,8 +3071,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30713071
* crash/basebackup, even though the state of the data directory would
30723072
* require it.
30733073
*/
3074-
Assert(!MyProc->delayChkpt);
3075-
MyProc->delayChkpt = true;
3074+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3075+
MyProc->delayChkpt |= DELAY_CHKPT_START;
30763076

30773077
/* WAL log truncation */
30783078
WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3098,7 +3098,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30983098
/* Then offsets */
30993099
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
31003100

3101-
MyProc->delayChkpt = false;
3101+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
31023102

31033103
END_CRIT_SECTION();
31043104
LWLockRelease(MultiXactTruncationLock);

src/backend/access/transam/twophase.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -476,7 +476,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
476476
}
477477
pgxact->xid = xid;
478478
pgxact->xmin = InvalidTransactionId;
479-
proc->delayChkpt = false;
479+
proc->delayChkpt = 0;
480480
pgxact->vacuumFlags = 0;
481481
proc->pid = 0;
482482
proc->databaseId = databaseid;
@@ -1170,7 +1170,8 @@ EndPrepare(GlobalTransaction gxact)
11701170

11711171
START_CRIT_SECTION();
11721172

1173-
MyProc->delayChkpt = true;
1173+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
1174+
MyProc->delayChkpt |= DELAY_CHKPT_START;
11741175

11751176
XLogBeginInsert();
11761177
for (record = records.head; record != NULL; record = record->next)
@@ -1213,7 +1214,7 @@ EndPrepare(GlobalTransaction gxact)
12131214
* checkpoint starting after this will certainly see the gxact as a
12141215
* candidate for fsyncing.
12151216
*/
1216-
MyProc->delayChkpt = false;
1217+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
12171218

12181219
/*
12191220
* Remember that we have this GlobalTransaction entry locked for us. If
@@ -2286,7 +2287,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
22862287
START_CRIT_SECTION();
22872288

22882289
/* See notes in RecordTransactionCommit */
2289-
MyProc->delayChkpt = true;
2290+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
2291+
MyProc->delayChkpt |= DELAY_CHKPT_START;
22902292

22912293
/*
22922294
* Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2334,7 +2336,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
23342336
TransactionIdCommitTree(xid, nchildren, children);
23352337

23362338
/* Checkpoint can proceed now */
2337-
MyProc->delayChkpt = false;
2339+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
23382340

23392341
END_CRIT_SECTION();
23402342

src/backend/access/transam/xact.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1308,8 +1308,9 @@ RecordTransactionCommit(void)
13081308
* This makes checkpoint's determination of which xacts are delayChkpt
13091309
* a bit fuzzy, but it doesn't matter.
13101310
*/
1311+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
13111312
START_CRIT_SECTION();
1312-
MyProc->delayChkpt = true;
1313+
MyProc->delayChkpt |= DELAY_CHKPT_START;
13131314

13141315
SetCurrentTransactionStopTimestamp();
13151316

@@ -1410,7 +1411,7 @@ RecordTransactionCommit(void)
14101411
*/
14111412
if (markXidCommitted)
14121413
{
1413-
MyProc->delayChkpt = false;
1414+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
14141415
END_CRIT_SECTION();
14151416
}
14161417

src/backend/access/transam/xlog.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9022,18 +9022,30 @@ CreateCheckPoint(int flags)
90229022
* and we will correctly flush the update below. So we cannot miss any
90239023
* xacts we need to wait for.
90249024
*/
9025-
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
9025+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
90269026
if (nvxids > 0)
90279027
{
90289028
do
90299029
{
90309030
pg_usleep(10000L); /* wait for 10 msec */
9031-
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9031+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
9032+
DELAY_CHKPT_START));
90329033
}
90339034
pfree(vxids);
90349035

90359036
CheckPointGuts(checkPoint.redo, flags);
90369037

9038+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
9039+
if (nvxids > 0)
9040+
{
9041+
do
9042+
{
9043+
pg_usleep(10000L); /* wait for 10 msec */
9044+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
9045+
DELAY_CHKPT_COMPLETE));
9046+
}
9047+
pfree(vxids);
9048+
90379049
/*
90389050
* Take a snapshot of running transactions and write this to WAL. This
90399051
* allows us to reconstruct the state of running transactions during

src/backend/access/transam/xloginsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -904,7 +904,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
904904
/*
905905
* Ensure no checkpoint can change our view of RedoRecPtr.
906906
*/
907-
Assert(MyProc->delayChkpt);
907+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
908908

909909
/*
910910
* Update RedoRecPtr so that we can make the right decision

src/backend/catalog/storage.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
325325

326326
RelationPreTruncate(rel);
327327

328+
/*
329+
* Make sure that a concurrent checkpoint can't complete while truncation
330+
* is in progress.
331+
*
332+
* The truncation operation might drop buffers that the checkpoint
333+
* otherwise would have flushed. If it does, then it's essential that
334+
* the files actually get truncated on disk before the checkpoint record
335+
* is written. Otherwise, if replay begins from that checkpoint, the
336+
* to-be-truncated blocks might still exist on disk but have older
337+
* contents than expected, which can cause replay to fail. It's OK for
338+
* the blocks to not exist on disk at all, but not for them to have the
339+
* wrong contents.
340+
*/
341+
Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
342+
MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
343+
328344
/*
329345
* We WAL-log the truncation before actually truncating, which means
330346
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
363379
XLogFlush(lsn);
364380
}
365381

366-
/* Do the real work to truncate relation forks */
382+
/*
383+
* This will first remove any buffers from the buffer pool that should no
384+
* longer exist after truncation is complete, and then truncate the
385+
* corresponding files on disk.
386+
*/
367387
smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
368388

389+
/* We've done all the critical work, so checkpoints are OK now. */
390+
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
391+
369392
/*
370393
* Update upper-level FSM pages to account for the truncation. This is
371394
* important because the just-truncated pages were likely marked as
372395
* all-free, and would be preferentially selected.
396+
*
397+
* NB: There's no point in delaying checkpoints until this is done.
398+
* Because the FSM is not WAL-logged, we have to be prepared for the
399+
* possibility of corruption after a crash anyway.
373400
*/
374401
if (need_fsm_vacuum)
375402
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);

src/backend/storage/buffer/bufmgr.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3647,7 +3647,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
36473647
* essential that CreateCheckpoint waits for virtual transactions
36483648
* rather than full transactionids.
36493649
*/
3650-
MyProc->delayChkpt = delayChkpt = true;
3650+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3651+
MyProc->delayChkpt |= DELAY_CHKPT_START;
3652+
delayChkpt = true;
36513653
lsn = XLogSaveBufferForHint(buffer, buffer_std);
36523654
}
36533655

@@ -3680,7 +3682,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
36803682
UnlockBufHdr(bufHdr, buf_state);
36813683

36823684
if (delayChkpt)
3683-
MyProc->delayChkpt = false;
3685+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
36843686

36853687
if (dirtied)
36863688
{

src/backend/storage/ipc/procarray.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -434,7 +434,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
434434
pgxact->xmin = InvalidTransactionId;
435435
/* must be cleared with xid/xmin: */
436436
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
437-
proc->delayChkpt = false; /* be sure this is cleared in abort */
437+
438+
/* be sure this is cleared in abort */
439+
proc->delayChkpt = 0;
440+
438441
proc->recoveryConflictPending = false;
439442

440443
Assert(pgxact->nxids == 0);
@@ -456,7 +459,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
456459
pgxact->xmin = InvalidTransactionId;
457460
/* must be cleared with xid/xmin: */
458461
pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK;
459-
proc->delayChkpt = false; /* be sure this is cleared in abort */
462+
463+
/* be sure this is cleared in abort */
464+
proc->delayChkpt = 0;
465+
460466
proc->recoveryConflictPending = false;
461467

462468
/* Clear the subtransaction-XID cache too while holding the lock */
@@ -2272,7 +2278,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
22722278
* delaying checkpoint because they have critical actions in progress.
22732279
*
22742280
* Constructs an array of VXIDs of transactions that are currently in commit
2275-
* critical sections, as shown by having delayChkpt set in their PGPROC.
2281+
* critical sections, as shown by having specified delayChkpt bits set in their
2282+
* PGPROC.
22762283
*
22772284
* Returns a palloc'd array that should be freed by the caller.
22782285
* *nvxids is the number of valid entries.
@@ -2286,13 +2293,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
22862293
* for clearing of delayChkpt to propagate is unimportant for correctness.
22872294
*/
22882295
VirtualTransactionId *
2289-
GetVirtualXIDsDelayingChkpt(int *nvxids)
2296+
GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
22902297
{
22912298
VirtualTransactionId *vxids;
22922299
ProcArrayStruct *arrayP = procArray;
22932300
int count = 0;
22942301
int index;
22952302

2303+
Assert(type != 0);
2304+
22962305
/* allocate what's certainly enough result space */
22972306
vxids = (VirtualTransactionId *)
22982307
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -2304,7 +2313,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
23042313
int pgprocno = arrayP->pgprocnos[index];
23052314
PGPROC *proc = &allProcs[pgprocno];
23062315

2307-
if (proc->delayChkpt)
2316+
if ((proc->delayChkpt & type) != 0)
23082317
{
23092318
VirtualTransactionId vxid;
23102319

@@ -2330,12 +2339,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
23302339
* those numbers should be small enough for it not to be a problem.
23312340
*/
23322341
bool
2333-
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
2342+
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
23342343
{
23352344
bool result = false;
23362345
ProcArrayStruct *arrayP = procArray;
23372346
int index;
23382347

2348+
Assert(type != 0);
2349+
23392350
LWLockAcquire(ProcArrayLock, LW_SHARED);
23402351

23412352
for (index = 0; index < arrayP->numProcs; index++)
@@ -2346,7 +2357,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
23462357

23472358
GET_VXID_FROM_PGPROC(vxid, *proc);
23482359

2349-
if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
2360+
if ((proc->delayChkpt & type) != 0 &&
2361+
VirtualTransactionIdIsValid(vxid))
23502362
{
23512363
int i;
23522364

src/backend/storage/lmgr/proc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,7 +396,7 @@ InitProcess(void)
396396
MyProc->roleId = InvalidOid;
397397
MyProc->tempNamespaceId = InvalidOid;
398398
MyProc->isBackgroundWorker = IsBackgroundWorker;
399-
MyProc->delayChkpt = false;
399+
MyProc->delayChkpt = 0;
400400
MyPgXact->vacuumFlags = 0;
401401
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
402402
if (IsAutoVacuumWorkerProcess())
@@ -578,7 +578,7 @@ InitAuxiliaryProcess(void)
578578
MyProc->roleId = InvalidOid;
579579
MyProc->tempNamespaceId = InvalidOid;
580580
MyProc->isBackgroundWorker = IsBackgroundWorker;
581-
MyProc->delayChkpt = false;
581+
MyProc->delayChkpt = 0;
582582
MyPgXact->vacuumFlags = 0;
583583
MyProc->lwWaiting = false;
584584
MyProc->lwWaitMode = 0;

src/include/storage/proc.h

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,41 @@ struct XidCache
8383
*/
8484
#define INVALID_PGPROCNO PG_INT32_MAX
8585

86+
/*
87+
* Flags for PGPROC.delayChkpt
88+
*
89+
* These flags can be used to delay the start or completion of a checkpoint
90+
* for short periods. A flag is in effect if the corresponding bit is set in
91+
* the PGPROC of any backend.
92+
*
93+
* For our purposes here, a checkpoint has three phases: (1) determine the
94+
* location to which the redo pointer will be moved, (2) write all the
95+
* data durably to disk, and (3) WAL-log the checkpoint.
96+
*
97+
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
98+
* to phase 2. This is useful when we are performing a WAL-logged modification
99+
* of data that will be flushed to disk in phase 2. By setting this flag
100+
* before writing WAL and clearing it after we've both written WAL and
101+
* performed the corresponding modification, we ensure that if the WAL record
102+
* is inserted prior to the new redo point, the corresponding data changes will
103+
* also be flushed to disk before the checkpoint can complete. (In the
104+
* extremely common case where the data being modified is in shared buffers
105+
* and we acquire an exclusive content lock on the relevant buffers before
106+
* writing WAL, this mechanism is not needed, because phase 2 will block
107+
* until we release the content lock and then flush the modified data to
108+
* disk.)
109+
*
110+
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
111+
* to phase 3. This is useful if we are performing a WAL-logged operation that
112+
* might invalidate buffers, such as relation truncation. In this case, we need
113+
* to ensure that any buffers which were invalidated and thus not flushed by
114+
* the checkpoint are actually destroyed on disk. Replay can cope with a file
115+
* or block that doesn't exist, but not with a block that has the wrong
116+
* contents.
117+
*/
118+
#define DELAY_CHKPT_START (1<<0)
119+
#define DELAY_CHKPT_COMPLETE (1<<1)
120+
86121
/*
87122
* Each backend has a PGPROC struct in shared memory. There is also a list of
88123
* currently-unused PGPROC structs that will be reallocated to new backends.
@@ -149,7 +184,7 @@ struct PGPROC
149184
LOCKMASK heldLocks; /* bitmask for lock types already held on this
150185
* lock object by this backend */
151186

152-
bool delayChkpt; /* true if this proc delays checkpoint start */
187+
int delayChkpt; /* for DELAY_CHKPT_* flags */
153188

154189
/*
155190
* Info to allow us to wait for synchronous replication, if needed.

src/include/storage/procarray.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,9 @@ extern TransactionId GetOldestXmin(Relation rel, int flags);
9292
extern TransactionId GetOldestActiveTransactionId(void);
9393
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
9494

95-
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
96-
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
95+
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
96+
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
97+
int nvxids, int type);
9798

9899
extern PGPROC *BackendPidGetProc(int pid);
99100
extern PGPROC *BackendPidGetProcWithLock(int pid);

0 commit comments

Comments
 (0)