Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit b966dd6

Browse files
committed
Add fsync capability to initdb, and use sync_file_range() if available.
Historically we have not worried about fsync'ing anything during initdb (in fact, initdb intentionally passes -F to each backend launch to prevent it from fsync'ing). But with filesystems getting more aggressive about caching data, that's not such a good plan anymore. Make initdb do a pass over the finished data directory tree to fsync everything. For testing purposes, the -N/--nosync flag can be used to restore the old behavior. Also, testing shows that on Linux, sync_file_range() is much faster than posix_fadvise() for hinting to the kernel that an fsync is coming, apparently because the latter blocks on a rather small request queue while the former doesn't. So use this function if available in initdb, and also in the backend's pg_flush_data() (where it currently will affect only the speed of CREATE DATABASE's cloning step). We will later make pg_regress invoke initdb with the --nosync flag to avoid slowing down cases such as "make check" in contrib. But let's not do so until we've shaken out any portability issues in this patch. Jeff Davis, reviewed by Andres Freund
1 parent 1a9405d commit b966dd6

File tree

7 files changed

+258
-5
lines changed

7 files changed

+258
-5
lines changed

configure

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19254,7 +19254,8 @@ fi
1925419254

1925519255

1925619256

19257-
for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l
19257+
19258+
for ac_func in cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l
1925819259
do
1925919260
as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
1926019261
{ $as_echo "$as_me:$LINENO: checking for $ac_func" >&5

configure.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1207,7 +1207,7 @@ PGAC_VAR_INT_TIMEZONE
12071207
AC_FUNC_ACCEPT_ARGTYPES
12081208
PGAC_FUNC_GETTIMEOFDAY_1ARG
12091209

1210-
AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink towlower utime utimes wcstombs wcstombs_l])
1210+
AC_CHECK_FUNCS([cbrt dlopen fdatasync getifaddrs getpeerucred getrlimit memmove poll pstat readlink setproctitle setsid sigprocmask symlink sync_file_range towlower utime utimes wcstombs wcstombs_l])
12111211

12121212
AC_REPLACE_FUNCS(fseeko)
12131213
case $host_os in

doc/src/sgml/ref/initdb.sgml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,21 @@ PostgreSQL documentation
219219
</listitem>
220220
</varlistentry>
221221

222+
<varlistentry>
223+
<term><option>-N</option></term>
224+
<term><option>--nosync</option></term>
225+
<listitem>
226+
<para>
227+
By default, <command>initdb</command> will wait for all files to be
228+
written safely to disk. This option causes <command>initdb</command>
229+
to return without waiting, which is faster, but means that a
230+
subsequent operating system crash can leave the data directory
231+
corrupt. Generally, this option is useful for testing, but should not
232+
be used when creating a production installation.
233+
</para>
234+
</listitem>
235+
</varlistentry>
236+
222237
<varlistentry>
223238
<term><option>--pwfile=<replaceable>filename</></option></term>
224239
<listitem>

src/backend/storage/file/fd.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -336,12 +336,15 @@ pg_fdatasync(int fd)
336336
/*
337337
* pg_flush_data --- advise OS that the data described won't be needed soon
338338
*
339-
* Not all platforms have posix_fadvise; treat as noop if not available.
339+
* Not all platforms have sync_file_range or posix_fadvise; treat as no-op
340+
* if not available.
340341
*/
341342
int
342343
pg_flush_data(int fd, off_t offset, off_t amount)
343344
{
344-
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
345+
#if defined(HAVE_SYNC_FILE_RANGE)
346+
return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE);
347+
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
345348
return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
346349
#else
347350
return 0;

src/bin/initdb/initdb.c

Lines changed: 229 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "postgres_fe.h"
5050

5151
#include <dirent.h>
52+
#include <fcntl.h>
5253
#include <sys/stat.h>
5354
#include <unistd.h>
5455
#include <locale.h>
@@ -116,6 +117,7 @@ static const char *authmethodhost = "";
116117
static const char *authmethodlocal = "";
117118
static bool debug = false;
118119
static bool noclean = false;
120+
static bool do_sync = true;
119121
static bool show_setting = false;
120122
static char *xlog_dir = "";
121123

@@ -160,6 +162,9 @@ static char *authwarning = NULL;
160162
/*
161163
* Centralized knowledge of switches to pass to backend
162164
*
165+
* Note: we run the backend with -F (fsync disabled) and then do a single
166+
* pass of fsync'ing at the end. This is faster than fsync'ing each step.
167+
*
163168
* Note: in the shell-script version, we also passed PGDATA as a -D switch,
164169
* but here it is more convenient to pass it as an environment variable
165170
* (no quoting to worry about).
@@ -182,6 +187,9 @@ static char **filter_lines_with_token(char **lines, const char *token);
182187
#endif
183188
static char **readfile(const char *path);
184189
static void writefile(char *path, char **lines);
190+
static void walkdir(char *path, void (*action)(char *fname, bool isdir));
191+
static void pre_sync_fname(char *fname, bool isdir);
192+
static void fsync_fname(char *fname, bool isdir);
185193
static FILE *popen_check(const char *command, const char *mode);
186194
static void exit_nicely(void);
187195
static char *get_id(void);
@@ -209,6 +217,7 @@ static void load_plpgsql(void);
209217
static void vacuum_db(void);
210218
static void make_template0(void);
211219
static void make_postgres(void);
220+
static void perform_fsync(void);
212221
static void trapsig(int signum);
213222
static void check_ok(void);
214223
static char *escape_quotes(const char *src);
@@ -489,6 +498,174 @@ writefile(char *path, char **lines)
489498
}
490499
}
491500

501+
/*
502+
* walkdir: recursively walk a directory, applying the action to each
503+
* regular file and directory (including the named directory itself).
504+
*
505+
* Adapted from copydir() in copydir.c.
506+
*/
507+
static void
508+
walkdir(char *path, void (*action) (char *fname, bool isdir))
509+
{
510+
DIR *dir;
511+
struct dirent *direntry;
512+
char subpath[MAXPGPATH];
513+
514+
dir = opendir(path);
515+
if (dir == NULL)
516+
{
517+
fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"),
518+
progname, path, strerror(errno));
519+
exit_nicely();
520+
}
521+
522+
while (errno = 0, (direntry = readdir(dir)) != NULL)
523+
{
524+
struct stat fst;
525+
526+
if (strcmp(direntry->d_name, ".") == 0 ||
527+
strcmp(direntry->d_name, "..") == 0)
528+
continue;
529+
530+
snprintf(subpath, MAXPGPATH, "%s/%s", path, direntry->d_name);
531+
532+
if (lstat(subpath, &fst) < 0)
533+
{
534+
fprintf(stderr, _("%s: could not stat file \"%s\": %s\n"),
535+
progname, subpath, strerror(errno));
536+
exit_nicely();
537+
}
538+
539+
if (S_ISDIR(fst.st_mode))
540+
walkdir(subpath, action);
541+
else if (S_ISREG(fst.st_mode))
542+
(*action) (subpath, false);
543+
}
544+
545+
#ifdef WIN32
546+
/*
547+
* This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
548+
* released version
549+
*/
550+
if (GetLastError() == ERROR_NO_MORE_FILES)
551+
errno = 0;
552+
#endif
553+
554+
if (errno)
555+
{
556+
fprintf(stderr, _("%s: could not read directory \"%s\": %s\n"),
557+
progname, path, strerror(errno));
558+
exit_nicely();
559+
}
560+
561+
closedir(dir);
562+
563+
/*
564+
* It's important to fsync the destination directory itself as individual
565+
* file fsyncs don't guarantee that the directory entry for the file is
566+
* synced. Recent versions of ext4 have made the window much wider but
567+
* it's been an issue for ext3 and other filesystems in the past.
568+
*/
569+
(*action) (path, true);
570+
}
571+
572+
/*
 * pre_sync_fname: tell the OS that "fname" is about to be fsync'ed.
 *
 * fname is the path to hint about; isdir says whether it names a directory.
 * The function compiles to a no-op unless sync_file_range() or
 * posix_fadvise() with POSIX_FADV_DONTNEED is available.  A directory that
 * cannot be opened (EISDIR or EACCES) is silently skipped; any other open
 * failure is reported and followed by exit_nicely().
 */
static void
pre_sync_fname(char *fname, bool isdir)
{
#if defined(HAVE_SYNC_FILE_RANGE) || \
	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
	int			hintfd = open(fname, O_RDONLY | PG_BINARY);

	if (hintfd < 0)
	{
		/*
		 * Some OSs don't allow us to open directories at all (Windows
		 * returns EACCES); that is not worth complaining about.
		 */
		if (isdir && (errno == EISDIR || errno == EACCES))
			return;

		fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
				progname, fname, strerror(errno));
		exit_nicely();
	}

	/*
	 * sync_file_range is preferred, with posix_fadvise as the fallback.
	 * The result is deliberately ignored: this is only a hint.
	 */
#if defined(HAVE_SYNC_FILE_RANGE)
	sync_file_range(hintfd, 0, 0, SYNC_FILE_RANGE_WRITE);
#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	posix_fadvise(hintfd, 0, 0, POSIX_FADV_DONTNEED);
#endif

	close(hintfd);
#endif
}
611+
612+
/*
613+
* fsync a file or directory
614+
*
615+
* Try to fsync directories but ignore errors that indicate the OS
616+
* just doesn't allow/require fsyncing directories.
617+
*
618+
* Adapted from fsync_fname() in copydir.c.
619+
*/
620+
static void
621+
fsync_fname(char *fname, bool isdir)
622+
{
623+
int fd;
624+
int returncode;
625+
626+
/*
627+
* Some OSs require directories to be opened read-only whereas other
628+
* systems don't allow us to fsync files opened read-only; so we need both
629+
* cases here
630+
*/
631+
if (!isdir)
632+
fd = open(fname, O_RDWR | PG_BINARY);
633+
else
634+
fd = open(fname, O_RDONLY | PG_BINARY);
635+
636+
/*
637+
* Some OSs don't allow us to open directories at all (Windows returns
638+
* EACCES)
639+
*/
640+
if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
641+
return;
642+
643+
else if (fd < 0)
644+
{
645+
fprintf(stderr, _("%s: could not open file \"%s\": %s\n"),
646+
progname, fname, strerror(errno));
647+
exit_nicely();
648+
}
649+
650+
returncode = fsync(fd);
651+
652+
/* Some OSs don't allow us to fsync directories at all */
653+
if (returncode != 0 && isdir && errno == EBADF)
654+
{
655+
close(fd);
656+
return;
657+
}
658+
659+
if (returncode != 0)
660+
{
661+
fprintf(stderr, _("%s: could not fsync file \"%s\": %s\n"),
662+
progname, fname, strerror(errno));
663+
exit_nicely();
664+
}
665+
666+
close(fd);
667+
}
668+
492669
/*
493670
* Open a subcommand with suitable error messaging
494671
*/
@@ -2092,6 +2269,47 @@ make_postgres(void)
20922269
check_ok();
20932270
}
20942271

2272+
/*
2273+
* fsync everything down to disk
2274+
*/
2275+
static void
2276+
perform_fsync(void)
2277+
{
2278+
char pdir[MAXPGPATH];
2279+
2280+
fputs(_("syncing data to disk ... "), stdout);
2281+
fflush(stdout);
2282+
2283+
/*
2284+
* We need to name the parent of PGDATA. get_parent_directory() isn't
2285+
* enough here, because it can result in an empty string.
2286+
*/
2287+
snprintf(pdir, MAXPGPATH, "%s/..", pg_data);
2288+
canonicalize_path(pdir);
2289+
2290+
/*
2291+
* Hint to the OS so that we're going to fsync each of these files soon.
2292+
*/
2293+
2294+
/* first the parent of the PGDATA directory */
2295+
pre_sync_fname(pdir, true);
2296+
2297+
/* then recursively through the directory */
2298+
walkdir(pg_data, pre_sync_fname);
2299+
2300+
/*
2301+
* Now, do the fsync()s in the same order.
2302+
*/
2303+
2304+
/* first the parent of the PGDATA directory */
2305+
fsync_fname(pdir, true);
2306+
2307+
/* then recursively through the directory */
2308+
walkdir(pg_data, fsync_fname);
2309+
2310+
check_ok();
2311+
}
2312+
20952313

20962314
/*
20972315
* signal handler in case we are interrupted.
@@ -2532,6 +2750,7 @@ usage(const char *progname)
25322750
printf(_(" -d, --debug generate lots of debugging output\n"));
25332751
printf(_(" -L DIRECTORY where to find the input files\n"));
25342752
printf(_(" -n, --noclean do not clean up after errors\n"));
2753+
printf(_(" -N, --nosync do not wait for changes to be written safely to disk\n"));
25352754
printf(_(" -s, --show show internal settings\n"));
25362755
printf(_("\nOther options:\n"));
25372756
printf(_(" -V, --version output version information, then exit\n"));
@@ -2621,6 +2840,7 @@ main(int argc, char *argv[])
26212840
{"debug", no_argument, NULL, 'd'},
26222841
{"show", no_argument, NULL, 's'},
26232842
{"noclean", no_argument, NULL, 'n'},
2843+
{"nosync", no_argument, NULL, 'N'},
26242844
{"xlogdir", required_argument, NULL, 'X'},
26252845
{NULL, 0, NULL, 0}
26262846
};
@@ -2676,7 +2896,7 @@ main(int argc, char *argv[])
26762896

26772897
/* process command-line options */
26782898

2679-
while ((c = getopt_long(argc, argv, "dD:E:L:nU:WA:sT:X:", long_options, &option_index)) != -1)
2899+
while ((c = getopt_long(argc, argv, "dD:E:L:nNU:WA:sT:X:", long_options, &option_index)) != -1)
26802900
{
26812901
switch (c)
26822902
{
@@ -2719,6 +2939,9 @@ main(int argc, char *argv[])
27192939
noclean = true;
27202940
printf(_("Running in noclean mode. Mistakes will not be cleaned up.\n"));
27212941
break;
2942+
case 'N':
2943+
do_sync = false;
2944+
break;
27222945
case 'L':
27232946
share_path = xstrdup(optarg);
27242947
break;
@@ -3310,6 +3533,11 @@ main(int argc, char *argv[])
33103533

33113534
make_postgres();
33123535

3536+
if (do_sync)
3537+
perform_fsync();
3538+
else
3539+
printf(_("\nSync to disk skipped.\nThe data directory might become corrupt if the operating system crashes.\n"));
3540+
33133541
if (authwarning != NULL)
33143542
fprintf(stderr, "%s", authwarning);
33153543

src/include/pg_config.h.in

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,9 @@
511511
/* Define to 1 if you have the `symlink' function. */
512512
#undef HAVE_SYMLINK
513513

514+
/* Define to 1 if you have the `sync_file_range' function. */
515+
#undef HAVE_SYNC_FILE_RANGE
516+
514517
/* Define to 1 if you have the syslog interface. */
515518
#undef HAVE_SYSLOG
516519

src/include/pg_config.h.win32

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,9 @@
420420
/* Define to 1 if you have the `symlink' function. */
421421
#define HAVE_SYMLINK 1
422422

423+
/* Define to 1 if you have the `sync_file_range' function. */
424+
/* #undef HAVE_SYNC_FILE_RANGE */
425+
423426
/* Define to 1 if you have the `sysconf' function. */
424427
/* #undef HAVE_SYSCONF */
425428

0 commit comments

Comments
 (0)