diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index fc30a52d496a..cdaac9c9df24 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -462,6 +462,11 @@ typedef struct XLogCtlData /* Fake LSN counter, for unlogged relations. */ pg_atomic_uint64 unloggedLSN; + /* + * Approximation of the last WAL segment number that is known to have been + * installed by InstallXLogFileSegment(). + */ + pg_atomic_uint64 last_known_installed_segno; /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */ pg_time_t lastSegSwitchTime; @@ -3224,7 +3229,28 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, errmsg("could not open file \"%s\": %m", path))); } else + { + /* + * The file is there, but it is possible that InstallXLogFileSegment() + * has recently renamed it and not yet made the new name durable. We + * don't want to be able to flush data into a file whose name might + * not survive power loss, since it would become unreachable in + * recovery. Since InstallXLogFileSegment() holds ControlFileLock, + * acquiring it here is enough to wait for any durable_rename() call + * that might have started before we opened the file. + * + * We can skip that if we can already see that the WAL space we need + * is fully synchronized. We may see a slightly out of date value + * since we haven't acquired the lock yet, but that's OK, it just + * means we might take the lock when we don't need to. + */ + if (pg_atomic_read_u64(&XLogCtl->last_known_installed_segno) < logsegno) + { + LWLockAcquire(ControlFileLock, LW_SHARED); + LWLockRelease(ControlFileLock); + } return fd; + } /* * Initialize an empty (all zeroes) segment. NOTE: it is possible that @@ -3576,6 +3602,11 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, XLogFilePath(path, tli, *segno, wal_segment_size); + /* + * Acquire and keep the ControlFileLock held *until* we have renamed the + * target segment durably. 
See XLogFileInitInternal() for details as to why + * it is dangerous otherwise. + */ LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); if (!XLogCtl->InstallXLogFileSegmentActive) { @@ -3612,6 +3643,8 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, return false; } + pg_atomic_write_u64(&XLogCtl->last_known_installed_segno, *segno); + LWLockRelease(ControlFileLock); return true; @@ -4970,6 +5003,7 @@ XLOGShmemInit(void) char *allocptr; int i; ControlFileData *localControlFile; + XLogSegNo lastKnownInstalledSegno = 0; #ifdef WAL_DEBUG @@ -5017,6 +5051,12 @@ XLOGShmemInit(void) { memcpy(ControlFile, localControlFile, sizeof(ControlFileData)); pfree(localControlFile); + /* + * A decent approximation for the last known installed WAL segment + * number can be the segment in which the checkpoint record resides, + * especially in cases where we have had a clean shutdown. + */ + XLByteToSeg(ControlFile->checkPoint, lastKnownInstalledSegno, wal_segment_size); } /* @@ -5071,6 +5111,7 @@ XLOGShmemInit(void) pg_atomic_init_u64(&XLogCtl->logWriteResult, InvalidXLogRecPtr); pg_atomic_init_u64(&XLogCtl->logFlushResult, InvalidXLogRecPtr); pg_atomic_init_u64(&XLogCtl->unloggedLSN, InvalidXLogRecPtr); + pg_atomic_init_u64(&XLogCtl->last_known_installed_segno, lastKnownInstalledSegno); } /* diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 0e8299dd5564..3d10df0abd70 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -773,6 +773,10 @@ fsync_fname(const char *fname, bool isdir) * might not be on the same filesystem. Therefore this routine does not * support renaming across directories. * + * Note that there is a window between the rename and the fsync(s). If "newfile" + * is opened, written to and then fdatasynced, and if there is a crash before + * the fsync(s) hits disk, the written data could be lost. + * * Log errors with the caller specified severity. * * Returns 0 if the operation succeeded, -1 otherwise. 
Note that errno is not