Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit bbace56

Browse files
committed
Fix possible recovery trouble if TRUNCATE overlaps a checkpoint.
If TRUNCATE causes some buffers to be invalidated and thus the checkpoint does not flush them, TRUNCATE must also ensure that the corresponding files are truncated on disk. Otherwise, a replay from the checkpoint might find that the buffers exist but have the wrong contents, which may cause replay to fail. Report by Teja Mupparti. Patch by Kyotaro Horiguchi, per a design suggestion from Heikki Linnakangas, with some changes to the comments by me. Review of this and a prior patch that approached the issue differently by Heikki Linnakangas, Andres Freund, Álvaro Herrera, Masahiko Sawada, and Tom Lane. Discussion: http://postgr.es/m/BYAPR06MB6373BF50B469CA393C614257ABF00@BYAPR06MB6373.namprd06.prod.outlook.com
1 parent 81045e1 commit bbace56

File tree

11 files changed

+120
-28
lines changed

11 files changed

+120
-28
lines changed

src/backend/access/transam/multixact.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3075,8 +3075,8 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
30753075
* crash/basebackup, even though the state of the data directory would
30763076
* require it.
30773077
*/
3078-
Assert(!MyProc->delayChkpt);
3079-
MyProc->delayChkpt = true;
3078+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3079+
MyProc->delayChkpt |= DELAY_CHKPT_START;
30803080

30813081
/* WAL log truncation */
30823082
WriteMTruncateXlogRec(newOldestMultiDB,
@@ -3102,7 +3102,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
31023102
/* Then offsets */
31033103
PerformOffsetsTruncation(oldestMulti, newOldestMulti);
31043104

3105-
MyProc->delayChkpt = false;
3105+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
31063106

31073107
END_CRIT_SECTION();
31083108
LWLockRelease(MultiXactTruncationLock);

src/backend/access/transam/twophase.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid,
474474
}
475475
proc->xid = xid;
476476
Assert(proc->xmin == InvalidTransactionId);
477-
proc->delayChkpt = false;
477+
proc->delayChkpt = 0;
478478
proc->statusFlags = 0;
479479
proc->pid = 0;
480480
proc->databaseId = databaseid;
@@ -1165,7 +1165,8 @@ EndPrepare(GlobalTransaction gxact)
11651165

11661166
START_CRIT_SECTION();
11671167

1168-
MyProc->delayChkpt = true;
1168+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
1169+
MyProc->delayChkpt |= DELAY_CHKPT_START;
11691170

11701171
XLogBeginInsert();
11711172
for (record = records.head; record != NULL; record = record->next)
@@ -1208,7 +1209,7 @@ EndPrepare(GlobalTransaction gxact)
12081209
* checkpoint starting after this will certainly see the gxact as a
12091210
* candidate for fsyncing.
12101211
*/
1211-
MyProc->delayChkpt = false;
1212+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
12121213

12131214
/*
12141215
* Remember that we have this GlobalTransaction entry locked for us. If
@@ -2275,7 +2276,8 @@ RecordTransactionCommitPrepared(TransactionId xid,
22752276
START_CRIT_SECTION();
22762277

22772278
/* See notes in RecordTransactionCommit */
2278-
MyProc->delayChkpt = true;
2279+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
2280+
MyProc->delayChkpt |= DELAY_CHKPT_START;
22792281

22802282
/*
22812283
* Emit the XLOG commit record. Note that we mark 2PC commits as
@@ -2323,7 +2325,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
23232325
TransactionIdCommitTree(xid, nchildren, children);
23242326

23252327
/* Checkpoint can proceed now */
2326-
MyProc->delayChkpt = false;
2328+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
23272329

23282330
END_CRIT_SECTION();
23292331

src/backend/access/transam/xact.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1335,8 +1335,9 @@ RecordTransactionCommit(void)
13351335
* This makes checkpoint's determination of which xacts are delayChkpt
13361336
* a bit fuzzy, but it doesn't matter.
13371337
*/
1338+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
13381339
START_CRIT_SECTION();
1339-
MyProc->delayChkpt = true;
1340+
MyProc->delayChkpt |= DELAY_CHKPT_START;
13401341

13411342
SetCurrentTransactionStopTimestamp();
13421343

@@ -1437,7 +1438,7 @@ RecordTransactionCommit(void)
14371438
*/
14381439
if (markXidCommitted)
14391440
{
1440-
MyProc->delayChkpt = false;
1441+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
14411442
END_CRIT_SECTION();
14421443
}
14431444

src/backend/access/transam/xlog.c

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9228,18 +9228,30 @@ CreateCheckPoint(int flags)
92289228
* and we will correctly flush the update below. So we cannot miss any
92299229
* xacts we need to wait for.
92309230
*/
9231-
vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
9231+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);
92329232
if (nvxids > 0)
92339233
{
92349234
do
92359235
{
92369236
pg_usleep(10000L); /* wait for 10 msec */
9237-
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9237+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
9238+
DELAY_CHKPT_START));
92389239
}
92399240
pfree(vxids);
92409241

92419242
CheckPointGuts(checkPoint.redo, flags);
92429243

9244+
vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_COMPLETE);
9245+
if (nvxids > 0)
9246+
{
9247+
do
9248+
{
9249+
pg_usleep(10000L); /* wait for 10 msec */
9250+
} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids,
9251+
DELAY_CHKPT_COMPLETE));
9252+
}
9253+
pfree(vxids);
9254+
92439255
/*
92449256
* Take a snapshot of running transactions and write this to WAL. This
92459257
* allows us to reconstruct the state of running transactions during

src/backend/access/transam/xloginsert.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,7 @@ XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
925925
/*
926926
* Ensure no checkpoint can change our view of RedoRecPtr.
927927
*/
928-
Assert(MyProc->delayChkpt);
928+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) != 0);
929929

930930
/*
931931
* Update RedoRecPtr so that we can make the right decision

src/backend/catalog/storage.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,22 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
325325

326326
RelationPreTruncate(rel);
327327

328+
/*
329+
* Make sure that a concurrent checkpoint can't complete while truncation
330+
* is in progress.
331+
*
332+
* The truncation operation might drop buffers that the checkpoint
333+
* otherwise would have flushed. If it does, then it's essential that
334+
* the files actually get truncated on disk before the checkpoint record
335+
* is written. Otherwise, if replay begins from that checkpoint, the
336+
* to-be-truncated blocks might still exist on disk but have older
337+
* contents than expected, which can cause replay to fail. It's OK for
338+
* the blocks to not exist on disk at all, but not for them to have the
339+
* wrong contents.
340+
*/
341+
Assert((MyProc->delayChkpt & DELAY_CHKPT_COMPLETE) == 0);
342+
MyProc->delayChkpt |= DELAY_CHKPT_COMPLETE;
343+
328344
/*
329345
* We WAL-log the truncation before actually truncating, which means
330346
* trouble if the truncation fails. If we then crash, the WAL replay
@@ -363,13 +379,24 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
363379
XLogFlush(lsn);
364380
}
365381

366-
/* Do the real work to truncate relation forks */
382+
/*
383+
* This will first remove any buffers from the buffer pool that should no
384+
* longer exist after truncation is complete, and then truncate the
385+
* corresponding files on disk.
386+
*/
367387
smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
368388

389+
/* We've done all the critical work, so checkpoints are OK now. */
390+
MyProc->delayChkpt &= ~DELAY_CHKPT_COMPLETE;
391+
369392
/*
370393
* Update upper-level FSM pages to account for the truncation. This is
371394
* important because the just-truncated pages were likely marked as
372395
* all-free, and would be preferentially selected.
396+
*
397+
* NB: There's no point in delaying checkpoints until this is done.
398+
* Because the FSM is not WAL-logged, we have to be prepared for the
399+
* possibility of corruption after a crash anyway.
373400
*/
374401
if (need_fsm_vacuum)
375402
FreeSpaceMapVacuumRange(rel, nblocks, InvalidBlockNumber);

src/backend/storage/buffer/bufmgr.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3946,7 +3946,9 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
39463946
* essential that CreateCheckpoint waits for virtual transactions
39473947
* rather than full transactionids.
39483948
*/
3949-
MyProc->delayChkpt = delayChkpt = true;
3949+
Assert((MyProc->delayChkpt & DELAY_CHKPT_START) == 0);
3950+
MyProc->delayChkpt |= DELAY_CHKPT_START;
3951+
delayChkpt = true;
39503952
lsn = XLogSaveBufferForHint(buffer, buffer_std);
39513953
}
39523954

@@ -3979,7 +3981,7 @@ MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
39793981
UnlockBufHdr(bufHdr, buf_state);
39803982

39813983
if (delayChkpt)
3982-
MyProc->delayChkpt = false;
3984+
MyProc->delayChkpt &= ~DELAY_CHKPT_START;
39833985

39843986
if (dirtied)
39853987
{

src/backend/storage/ipc/procarray.c

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -689,7 +689,10 @@ ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
689689

690690
proc->lxid = InvalidLocalTransactionId;
691691
proc->xmin = InvalidTransactionId;
692-
proc->delayChkpt = false; /* be sure this is cleared in abort */
692+
693+
/* be sure this is cleared in abort */
694+
proc->delayChkpt = 0;
695+
693696
proc->recoveryConflictPending = false;
694697

695698
/* must be cleared with xid/xmin: */
@@ -728,7 +731,10 @@ ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
728731
proc->xid = InvalidTransactionId;
729732
proc->lxid = InvalidLocalTransactionId;
730733
proc->xmin = InvalidTransactionId;
731-
proc->delayChkpt = false; /* be sure this is cleared in abort */
734+
735+
/* be sure this is cleared in abort */
736+
proc->delayChkpt = 0;
737+
732738
proc->recoveryConflictPending = false;
733739

734740
/* must be cleared with xid/xmin: */
@@ -3043,7 +3049,8 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
30433049
* delaying checkpoint because they have critical actions in progress.
30443050
*
30453051
* Constructs an array of VXIDs of transactions that are currently in commit
3046-
* critical sections, as shown by having delayChkpt set in their PGPROC.
3052+
* critical sections, as shown by having the specified delayChkpt bits set in their
3053+
* PGPROC.
30473054
*
30483055
* Returns a palloc'd array that should be freed by the caller.
30493056
* *nvxids is the number of valid entries.
@@ -3057,13 +3064,15 @@ GetOldestSafeDecodingTransactionId(bool catalogOnly)
30573064
* for clearing of delayChkpt to propagate is unimportant for correctness.
30583065
*/
30593066
VirtualTransactionId *
3060-
GetVirtualXIDsDelayingChkpt(int *nvxids)
3067+
GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
30613068
{
30623069
VirtualTransactionId *vxids;
30633070
ProcArrayStruct *arrayP = procArray;
30643071
int count = 0;
30653072
int index;
30663073

3074+
Assert(type != 0);
3075+
30673076
/* allocate what's certainly enough result space */
30683077
vxids = (VirtualTransactionId *)
30693078
palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
@@ -3075,7 +3084,7 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
30753084
int pgprocno = arrayP->pgprocnos[index];
30763085
PGPROC *proc = &allProcs[pgprocno];
30773086

3078-
if (proc->delayChkpt)
3087+
if ((proc->delayChkpt & type) != 0)
30793088
{
30803089
VirtualTransactionId vxid;
30813090

@@ -3101,12 +3110,14 @@ GetVirtualXIDsDelayingChkpt(int *nvxids)
31013110
* those numbers should be small enough for it not to be a problem.
31023111
*/
31033112
bool
3104-
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
3113+
HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
31053114
{
31063115
bool result = false;
31073116
ProcArrayStruct *arrayP = procArray;
31083117
int index;
31093118

3119+
Assert(type != 0);
3120+
31103121
LWLockAcquire(ProcArrayLock, LW_SHARED);
31113122

31123123
for (index = 0; index < arrayP->numProcs; index++)
@@ -3117,7 +3128,8 @@ HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
31173128

31183129
GET_VXID_FROM_PGPROC(vxid, *proc);
31193130

3120-
if (proc->delayChkpt && VirtualTransactionIdIsValid(vxid))
3131+
if ((proc->delayChkpt & type) != 0 &&
3132+
VirtualTransactionIdIsValid(vxid))
31213133
{
31223134
int i;
31233135

src/backend/storage/lmgr/proc.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,7 @@ InitProcess(void)
394394
MyProc->roleId = InvalidOid;
395395
MyProc->tempNamespaceId = InvalidOid;
396396
MyProc->isBackgroundWorker = IsBackgroundWorker;
397-
MyProc->delayChkpt = false;
397+
MyProc->delayChkpt = 0;
398398
MyProc->statusFlags = 0;
399399
/* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
400400
if (IsAutoVacuumWorkerProcess())
@@ -579,7 +579,7 @@ InitAuxiliaryProcess(void)
579579
MyProc->roleId = InvalidOid;
580580
MyProc->tempNamespaceId = InvalidOid;
581581
MyProc->isBackgroundWorker = IsBackgroundWorker;
582-
MyProc->delayChkpt = false;
582+
MyProc->delayChkpt = 0;
583583
MyProc->statusFlags = 0;
584584
MyProc->lwWaiting = false;
585585
MyProc->lwWaitMode = 0;

src/include/storage/proc.h

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,41 @@ struct XidCache
8686
*/
8787
#define INVALID_PGPROCNO PG_INT32_MAX
8888

89+
/*
90+
* Flags for PGPROC.delayChkpt
91+
*
92+
* These flags can be used to delay the start or completion of a checkpoint
93+
* for short periods. A flag is in effect if the corresponding bit is set in
94+
* the PGPROC of any backend.
95+
*
96+
* For our purposes here, a checkpoint has three phases: (1) determine the
97+
* location to which the redo pointer will be moved, (2) write all the
98+
* data durably to disk, and (3) WAL-log the checkpoint.
99+
*
100+
* Setting DELAY_CHKPT_START prevents the system from moving from phase 1
101+
* to phase 2. This is useful when we are performing a WAL-logged modification
102+
* of data that will be flushed to disk in phase 2. By setting this flag
103+
* before writing WAL and clearing it after we've both written WAL and
104+
* performed the corresponding modification, we ensure that if the WAL record
105+
* is inserted prior to the new redo point, the corresponding data changes will
106+
* also be flushed to disk before the checkpoint can complete. (In the
107+
* extremely common case where the data being modified is in shared buffers
108+
* and we acquire an exclusive content lock on the relevant buffers before
109+
* writing WAL, this mechanism is not needed, because phase 2 will block
110+
* until we release the content lock and then flush the modified data to
111+
* disk.)
112+
*
113+
* Setting DELAY_CHKPT_COMPLETE prevents the system from moving from phase 2
114+
* to phase 3. This is useful if we are performing a WAL-logged operation that
115+
* might invalidate buffers, such as relation truncation. In this case, we need
116+
* to ensure that any buffers which were invalidated and thus not flushed by
117+
* the checkpoint are actually destroyed on disk. Replay can cope with a file
118+
* or block that doesn't exist, but not with a block that has the wrong
119+
* contents.
120+
*/
121+
#define DELAY_CHKPT_START (1<<0)
122+
#define DELAY_CHKPT_COMPLETE (1<<1)
123+
89124
typedef enum
90125
{
91126
PROC_WAIT_STATUS_OK,
@@ -191,7 +226,7 @@ struct PGPROC
191226
pg_atomic_uint64 waitStart; /* time at which wait for lock acquisition
192227
* started */
193228

194-
bool delayChkpt; /* true if this proc delays checkpoint start */
229+
int delayChkpt; /* for DELAY_CHKPT_* flags */
195230

196231
uint8 statusFlags; /* this backend's status flags, see PROC_*
197232
* above. mirrored in

src/include/storage/procarray.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,9 @@ extern TransactionId GetOldestActiveTransactionId(void);
5959
extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
6060
extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin);
6161

62-
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids);
63-
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids);
62+
extern VirtualTransactionId *GetVirtualXIDsDelayingChkpt(int *nvxids, int type);
63+
extern bool HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids,
64+
int nvxids, int type);
6465

6566
extern PGPROC *BackendPidGetProc(int pid);
6667
extern PGPROC *BackendPidGetProcWithLock(int pid);

0 commit comments

Comments
 (0)