Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d8ac77a

Browse files
committed
Recursively fsync() the data directory after a crash.
Otherwise, if there's another crash, some writes from after the first crash might make it to disk while writes from before the crash fail to make it to disk. This could lead to data corruption. Back-patch to all supported versions. Abhijit Menon-Sen, reviewed by Andres Freund and slightly revised by me.
1 parent 997066f commit d8ac77a

File tree

3 files changed

+159
-0
lines changed

3 files changed

+159
-0
lines changed

src/backend/access/transam/xlog.c

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,8 @@ static void WALInsertLockAcquireExclusive(void);
826826
static void WALInsertLockRelease(void);
827827
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
828828

829+
/* Recursively fsync a data directory and its contents; no-op when enableFsync is off. */
static void fsync_pgdata(char *datadir);
830+
829831
/*
830832
* Insert an XLOG record having the specified RMID and info bytes,
831833
* with the body of the record being the data chunk(s) described by
@@ -6116,6 +6118,18 @@ StartupXLOG(void)
61166118
(errmsg("database system was interrupted; last known up at %s",
61176119
str_time(ControlFile->time))));
61186120

6121+
/*
6122+
* If we previously crashed, there might be data which we had written,
6123+
* intending to fsync it, but which we had not actually fsync'd yet.
6124+
* Therefore, a power failure in the near future might cause earlier
6125+
* unflushed writes to be lost, even though more recent data written to
6126+
* disk from here on would be persisted. To avoid that, fsync the entire
6127+
* data directory.
6128+
*/
6129+
if (ControlFile->state != DB_SHUTDOWNED &&
6130+
ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6131+
fsync_pgdata(data_directory);
6132+
61196133
/* This is just to allow attaching to startup process with a debugger */
61206134
#ifdef XLOG_REPLAY_DELAY
61216135
if (ControlFile->state != DB_SHUTDOWNED)
@@ -11338,3 +11352,31 @@ SetWalWriterSleeping(bool sleeping)
1133811352
xlogctl->WalWriterSleeping = sleeping;
1133911353
SpinLockRelease(&xlogctl->info_lck);
1134011354
}
11355+
11356+
/*
11357+
* Issue fsync recursively on PGDATA and all its contents.
11358+
*/
11359+
static void
11360+
fsync_pgdata(char *datadir)
11361+
{
11362+
if (!enableFsync)
11363+
return;
11364+
11365+
/*
11366+
* If possible, hint to the kernel that we're soon going to fsync
11367+
* the data directory and its contents.
11368+
*/
11369+
#if defined(HAVE_SYNC_FILE_RANGE) || \
11370+
(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
11371+
walkdir(datadir, pre_sync_fname);
11372+
#endif
11373+
11374+
/*
11375+
* Now we do the fsync()s in the same order.
11376+
*
11377+
* It's important to fsync the destination directory itself as individual
11378+
* file fsyncs don't guarantee that the directory entry for the file is
11379+
* synced.
11380+
*/
11381+
walkdir(datadir, fsync_fname);
11382+
}

src/backend/storage/file/fd.c

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2438,3 +2438,118 @@ looks_like_temp_rel_name(const char *name)
24382438
return false;
24392439
return true;
24402440
}
2441+
2442+
/*
2443+
* Hint to the OS that it should get ready to fsync() this file.
2444+
*
2445+
* Adapted from pre_sync_fname in initdb.c
2446+
*/
2447+
void
2448+
pre_sync_fname(char *fname, bool isdir)
2449+
{
2450+
int fd;
2451+
2452+
fd = open(fname, O_RDONLY | PG_BINARY);
2453+
2454+
/*
2455+
* Some OSs don't allow us to open directories at all (Windows returns
2456+
* EACCES)
2457+
*/
2458+
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
2459+
return;
2460+
2461+
if (fd < 0)
2462+
ereport(FATAL,
2463+
(errmsg("could not open file \"%s\" before fsync",
2464+
fname)));
2465+
2466+
pg_flush_data(fd, 0, 0);
2467+
2468+
close(fd);
2469+
}
2470+
2471+
/*
2472+
* walkdir: recursively walk a directory, applying the action to each
2473+
* regular file and directory (including the named directory itself)
2474+
* and following symbolic links.
2475+
*
2476+
* NB: There is another version of walkdir in initdb.c, but that version
2477+
* behaves differently with respect to symbolic links. Caveat emptor!
2478+
*/
2479+
void
2480+
walkdir(char *path, void (*action) (char *fname, bool isdir))
2481+
{
2482+
DIR *dir;
2483+
struct dirent *de;
2484+
2485+
dir = AllocateDir(path);
2486+
while ((de = ReadDir(dir, path)) != NULL)
2487+
{
2488+
char subpath[MAXPGPATH];
2489+
struct stat fst;
2490+
2491+
CHECK_FOR_INTERRUPTS();
2492+
2493+
if (strcmp(de->d_name, ".") == 0 ||
2494+
strcmp(de->d_name, "..") == 0)
2495+
continue;
2496+
2497+
snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
2498+
2499+
if (lstat(subpath, &fst) < 0)
2500+
ereport(ERROR,
2501+
(errcode_for_file_access(),
2502+
errmsg("could not stat file \"%s\": %m", subpath)));
2503+
2504+
if (S_ISREG(fst.st_mode))
2505+
(*action) (subpath, false);
2506+
else if (S_ISDIR(fst.st_mode))
2507+
walkdir(subpath, action);
2508+
#ifndef WIN32
2509+
else if (S_ISLNK(fst.st_mode))
2510+
#else
2511+
else if (pg_win32_is_junction(subpath))
2512+
#endif
2513+
{
2514+
#if defined(HAVE_READLINK) || defined(WIN32)
2515+
char linkpath[MAXPGPATH];
2516+
int len;
2517+
struct stat lst;
2518+
2519+
len = readlink(subpath, linkpath, sizeof(linkpath)-1);
2520+
if (len < 0)
2521+
ereport(ERROR,
2522+
(errcode_for_file_access(),
2523+
errmsg("could not read symbolic link \"%s\": %m",
2524+
subpath)));
2525+
2526+
if (len >= sizeof(linkpath)-1)
2527+
ereport(ERROR,
2528+
(errmsg("symbolic link \"%s\" target is too long",
2529+
subpath)));
2530+
2531+
linkpath[len] = '\0';
2532+
2533+
if (lstat(linkpath, &lst) == 0)
2534+
{
2535+
if (S_ISREG(lst.st_mode))
2536+
(*action) (linkpath, false);
2537+
else if (S_ISDIR(lst.st_mode))
2538+
walkdir(subpath, action);
2539+
}
2540+
else if (errno != ENOENT)
2541+
ereport(ERROR,
2542+
(errcode_for_file_access(),
2543+
errmsg("could not stat file \"%s\": %m", linkpath)));
2544+
#else
2545+
ereport(WARNING,
2546+
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
2547+
errmsg("this platform does not support symbolic links; ignoring \"%s\"",
2548+
subpath)));
2549+
#endif
2550+
}
2551+
}
2552+
FreeDir(dir);
2553+
2554+
(*action) (path, true);
2555+
}

src/include/storage/fd.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ extern int pg_fsync_writethrough(int fd);
114114
extern int pg_fdatasync(int fd);
115115
extern int pg_flush_data(int fd, off_t offset, off_t amount);
116116
extern void fsync_fname(char *fname, bool isdir);
117+
/* Hint to the OS that fname is about to be fsync'd; isdir marks directories. */
extern void pre_sync_fname(char *fname, bool isdir);
118+
/* Recursively apply action to every file and directory under path, and to path itself. */
extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
117119

118120
/* Filename components for OpenTemporaryFile */
119121
#define PG_TEMP_FILES_DIR "pgsql_tmp"

0 commit comments

Comments
 (0)