Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit ead2163

Browse files
committed
Fix a couple of problems pointed out by Fujii Masao in the 2008-Apr-05 patch for pg_stop_backup. First, it is possible that the history file name is not alphabetically later than the last WAL file name, so we should explicitly check that both have been archived. Second, the previous coding would wait forever if a checkpoint had managed to remove the WAL file before we look for it.
Simon Riggs, plus some code cleanup by me.
1 parent bc01b45 commit ead2163

File tree

1 file changed

+70
-22
lines changed
  • src/backend/access/transam

1 file changed

+70
-22
lines changed

src/backend/access/transam/xlog.c

+70-22
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.317 2008/08/11 11:05:10 heikki Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.318 2008/09/08 16:42:15 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -391,7 +391,8 @@ static bool InRedo = false;
391391

392392
static void XLogArchiveNotify(const char *xlog);
393393
static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
394-
static bool XLogArchiveCheckDone(const char *xlog, bool create_if_missing);
394+
static bool XLogArchiveCheckDone(const char *xlog);
395+
static bool XLogArchiveIsBusy(const char *xlog);
395396
static void XLogArchiveCleanup(const char *xlog);
396397
static void readRecoveryCommandFile(void);
397398
static void exitArchiveRecovery(TimeLineID endTLI,
@@ -1137,7 +1138,7 @@ XLogArchiveNotifySeg(uint32 log, uint32 seg)
11371138
* create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
11381139
*/
11391140
static bool
1140-
XLogArchiveCheckDone(const char *xlog, bool create_if_missing)
1141+
XLogArchiveCheckDone(const char *xlog)
11411142
{
11421143
char archiveStatusPath[MAXPGPATH];
11431144
struct stat stat_buf;
@@ -1162,12 +1163,54 @@ XLogArchiveCheckDone(const char *xlog, bool create_if_missing)
11621163
return true;
11631164

11641165
/* Retry creation of the .ready file */
1165-
if (create_if_missing)
1166-
XLogArchiveNotify(xlog);
1167-
1166+
XLogArchiveNotify(xlog);
11681167
return false;
11691168
}
11701169

1170+
/*
1171+
* XLogArchiveIsBusy
1172+
*
1173+
* Check to see if an XLOG segment file is still unarchived.
1174+
* This is almost but not quite the inverse of XLogArchiveCheckDone: in
1175+
* the first place we aren't chartered to recreate the .ready file, and
1176+
* in the second place we should consider that if the file is already gone
1177+
* then it's not busy. (This check is needed to handle the race condition
1178+
* that a checkpoint already deleted the no-longer-needed file.)
1179+
*/
1180+
static bool
1181+
XLogArchiveIsBusy(const char *xlog)
1182+
{
1183+
char archiveStatusPath[MAXPGPATH];
1184+
struct stat stat_buf;
1185+
1186+
/* First check for .done --- this means archiver is done with it */
1187+
StatusFilePath(archiveStatusPath, xlog, ".done");
1188+
if (stat(archiveStatusPath, &stat_buf) == 0)
1189+
return false;
1190+
1191+
/* check for .ready --- this means archiver is still busy with it */
1192+
StatusFilePath(archiveStatusPath, xlog, ".ready");
1193+
if (stat(archiveStatusPath, &stat_buf) == 0)
1194+
return true;
1195+
1196+
/* Race condition --- maybe archiver just finished, so recheck */
1197+
StatusFilePath(archiveStatusPath, xlog, ".done");
1198+
if (stat(archiveStatusPath, &stat_buf) == 0)
1199+
return false;
1200+
1201+
/*
1202+
* Check to see if the WAL file has been removed by checkpoint,
1203+
* which implies it has already been archived, and explains why we
1204+
* can't see a status file for it.
1205+
*/
1206+
snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1207+
if (stat(archiveStatusPath, &stat_buf) != 0 &&
1208+
errno == ENOENT)
1209+
return false;
1210+
1211+
return true;
1212+
}
1213+
11711214
/*
11721215
* XLogArchiveCleanup
11731216
*
@@ -2499,14 +2542,14 @@ RestoreArchivedFile(char *path, const char *xlogfname,
24992542
*
25002543
* We initialise this with the filename of an InvalidXLogRecPtr, which
25012544
* will prevent the deletion of any WAL files from the archive
2502-
* because of the alphabetic sorting property of WAL filenames.
2545+
* because of the alphabetic sorting property of WAL filenames.
25032546
*
25042547
* Once we have successfully located the redo pointer of the checkpoint
25052548
* from which we start recovery we never request a file prior to the redo
25062549
* pointer of the last restartpoint. When redo begins we know that we
25072550
* have successfully located it, so there is no need for additional
25082551
* status flags to signify the point when we can begin deleting WAL files
2509-
* from the archive.
2552+
* from the archive.
25102553
*/
25112554
if (InRedo)
25122555
{
@@ -2740,7 +2783,7 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
27402783
strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
27412784
strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
27422785
{
2743-
if (XLogArchiveCheckDone(xlde->d_name, true))
2786+
if (XLogArchiveCheckDone(xlde->d_name))
27442787
{
27452788
snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
27462789

@@ -2807,7 +2850,7 @@ CleanupBackupHistory(void)
28072850
strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
28082851
".backup") == 0)
28092852
{
2810-
if (XLogArchiveCheckDone(xlde->d_name, true))
2853+
if (XLogArchiveCheckDone(xlde->d_name))
28112854
{
28122855
ereport(DEBUG2,
28132856
(errmsg("removing transaction log backup history file \"%s\"",
@@ -6623,6 +6666,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
66236666
(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
66246667
(errmsg("must be superuser to run a backup"))));
66256668

6669+
if (!XLogArchivingActive())
6670+
ereport(ERROR,
6671+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
6672+
errmsg("WAL archiving is not active"),
6673+
errhint("archive_mode must be enabled at server start.")));
6674+
66266675
/*
66276676
* OK to clear forcePageWrites
66286677
*/
@@ -6721,25 +6770,23 @@ pg_stop_backup(PG_FUNCTION_ARGS)
67216770
CleanupBackupHistory();
67226771

67236772
/*
6724-
* Wait until the history file has been archived. We assume that the
6725-
* alphabetic sorting property of the WAL files ensures the last WAL
6726-
* file is guaranteed archived by the time the history file is archived.
6773+
* Wait until both the last WAL file filled during backup and the history
6774+
* file have been archived. We assume that the alphabetic sorting
6775+
* property of the WAL files ensures any earlier WAL files are safely
6776+
* archived as well.
67276777
*
67286778
* We wait forever, since archive_command is supposed to work and
6729-
* we assume the admin wanted his backup to work completely. If you
6730-
* don't wish to wait, you can SET statement_timeout = xx;
6731-
*
6732-
* If the status file is missing, we assume that is because it was
6733-
* set to .ready before we slept, then while asleep it has been set
6734-
* to .done and then removed by a concurrent checkpoint.
6779+
* we assume the admin wanted his backup to work completely. If you
6780+
* don't wish to wait, you can set statement_timeout.
67356781
*/
67366782
BackupHistoryFileName(histfilepath, ThisTimeLineID, _logId, _logSeg,
67376783
startpoint.xrecoff % XLogSegSize);
67386784

67396785
seconds_before_warning = 60;
67406786
waits = 0;
67416787

6742-
while (!XLogArchiveCheckDone(histfilepath, false))
6788+
while (XLogArchiveIsBusy(stopxlogfilename) ||
6789+
XLogArchiveIsBusy(histfilepath))
67436790
{
67446791
CHECK_FOR_INTERRUPTS();
67456792

@@ -6748,8 +6795,9 @@ pg_stop_backup(PG_FUNCTION_ARGS)
67486795
if (++waits >= seconds_before_warning)
67496796
{
67506797
seconds_before_warning *= 2; /* This wraps in >10 years... */
6751-
elog(WARNING, "pg_stop_backup() waiting for archive to complete "
6752-
"(%d seconds delay)", waits);
6798+
ereport(WARNING,
6799+
(errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
6800+
waits)));
67536801
}
67546802
}
67556803

0 commit comments

Comments
 (0)