Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 8348413

Browse files
committed
Fix replay of create database records on standby
Crash recovery on standby may encounter missing directories when replaying database-creation WAL records. Prior to this patch, the standby would fail to recover in such a case; however, the directories could be legitimately missing. Consider the following sequence of commands: CREATE DATABASE; DROP DATABASE; DROP TABLESPACE. If, after replaying the last WAL record and removing the tablespace directory, the standby crashes and has to replay the create database record again, crash recovery must be able to continue. A fix for this problem was already attempted in 49d9cfc, but it was reverted because of design issues. This new version is based on Robert Haas' proposal: any missing tablespaces are created during recovery before reaching consistency. Tablespaces are created as real directories, and should be deleted by later replay. CheckRecoveryConsistency ensures they have disappeared. The problems detected by this new code are reported as PANIC, except when allow_in_place_tablespaces is set to ON, in which case they are WARNING. Apart from making tests possible, this gives users an escape hatch in case things don't go as planned. Author: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Author: Asim R Praveen <apraveen@pivotal.io> Author: Paul Guo <paulguo@gmail.com> Reviewed-by: Anastasia Lubennikova <lubennikovaav@gmail.com> (older versions) Reviewed-by: Fujii Masao <masao.fujii@oss.nttdata.com> (older versions) Reviewed-by: Michaël Paquier <michael@paquier.xyz> Diagnosed-by: Paul Guo <paulguo@gmail.com> Discussion: https://postgr.es/m/CAEET0ZGx9AvioViLf7nbR_8tH9-=27DN5xWJ2P9-ROH16e4JUA@mail.gmail.com
1 parent fc4e5af commit 8348413

File tree

4 files changed

+305
-31
lines changed

4 files changed

+305
-31
lines changed

src/backend/access/transam/xlogrecovery.c

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include "access/xlogutils.h"
4343
#include "catalog/pg_control.h"
4444
#include "commands/tablespace.h"
45+
#include "common/file_utils.h"
4546
#include "miscadmin.h"
4647
#include "pgstat.h"
4748
#include "postmaster/bgwriter.h"
@@ -2008,6 +2009,47 @@ xlogrecovery_redo(XLogReaderState *record, TimeLineID replayTLI)
20082009
}
20092010
}
20102011

2012+
/*
2013+
* Verify that, in non-test mode, ./pg_tblspc doesn't contain any real
2014+
* directories.
2015+
*
2016+
* Replay of database creation XLOG records for databases that were later
2017+
* dropped can create fake directories in pg_tblspc. By the time consistency
2018+
* is reached these directories should have been removed; here we verify
2019+
* that this did indeed happen. This is to be called at the point where
2020+
* consistent state is reached.
2021+
*
2022+
* allow_in_place_tablespaces turns the PANIC into a WARNING, which is
2023+
* useful for testing purposes, and also allows for an escape hatch in case
2024+
* things go south.
2025+
*/
2026+
static void
2027+
CheckTablespaceDirectory(void)
2028+
{
2029+
DIR *dir;
2030+
struct dirent *de;
2031+
2032+
dir = AllocateDir("pg_tblspc");
2033+
while ((de = ReadDir(dir, "pg_tblspc")) != NULL)
2034+
{
2035+
char path[MAXPGPATH + 10];
2036+
2037+
/* Skip entries of non-oid names */
2038+
if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
2039+
continue;
2040+
2041+
snprintf(path, sizeof(path), "pg_tblspc/%s", de->d_name);
2042+
2043+
if (get_dirent_type(path, de, false, ERROR) != PGFILETYPE_LNK)
2044+
ereport(allow_in_place_tablespaces ? WARNING : PANIC,
2045+
(errcode(ERRCODE_DATA_CORRUPTED),
2046+
errmsg("unexpected directory entry \"%s\" found in %s",
2047+
de->d_name, "pg_tblspc/"),
2048+
errdetail("All directory entries in pg_tblspc/ should be symbolic links."),
2049+
errhint("Remove those directories, or set allow_in_place_tablespaces to ON transiently to let recovery complete.")));
2050+
}
2051+
}
2052+
20112053
/*
20122054
* Checks if recovery has reached a consistent state. When consistency is
20132055
* reached and we have a valid starting standby snapshot, tell postmaster
@@ -2068,6 +2110,14 @@ CheckRecoveryConsistency(void)
20682110
*/
20692111
XLogCheckInvalidPages();
20702112

2113+
/*
2114+
* Check that pg_tblspc doesn't contain any real directories. Replay
2115+
* of Database/CREATE_* records may have created fictitious tablespace
2116+
* directories that should have been removed by the time consistency
2117+
* was reached.
2118+
*/
2119+
CheckTablespaceDirectory();
2120+
20712121
reachedConsistency = true;
20722122
ereport(LOG,
20732123
(errmsg("consistent recovery state reached at %X/%X",

src/backend/commands/dbcommands.c

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "access/tableam.h"
3131
#include "access/xact.h"
3232
#include "access/xloginsert.h"
33+
#include "access/xlogrecovery.h"
3334
#include "access/xlogutils.h"
3435
#include "catalog/catalog.h"
3536
#include "catalog/dependency.h"
@@ -47,6 +48,7 @@
4748
#include "commands/defrem.h"
4849
#include "commands/seclabel.h"
4950
#include "commands/tablespace.h"
51+
#include "common/file_perm.h"
5052
#include "mb/pg_wchar.h"
5153
#include "miscadmin.h"
5254
#include "pgstat.h"
@@ -135,6 +137,7 @@ static void CreateDirAndVersionFile(char *dbpath, Oid dbid, Oid tsid,
135137
bool isRedo);
136138
static void CreateDatabaseUsingFileCopy(Oid src_dboid, Oid dboid, Oid src_tsid,
137139
Oid dst_tsid);
140+
static void recovery_create_dbdir(char *path, bool only_tblspc);
138141

139142
/*
140143
* Create a new database using the WAL_LOG strategy.
@@ -3008,6 +3011,45 @@ get_database_name(Oid dbid)
30083011
return result;
30093012
}
30103013

3014+
/*
3015+
* recovery_create_dbdir()
3016+
*
3017+
* During recovery, there's a case where we validly need to recover a missing
3018+
* tablespace directory so that recovery can continue. This happens when
3019+
* recovery wants to create a database but the holding tablespace has been
3020+
* removed before the server stopped. Since we expect that the directory will
3021+
* be gone before reaching recovery consistency, and we have no knowledge about
3022+
* the tablespace other than its OID here, we create a real directory under
3023+
* pg_tblspc here instead of restoring the symlink.
3024+
*
3025+
* If only_tblspc is true, then the requested directory must be in pg_tblspc/
3026+
*/
3027+
static void
3028+
recovery_create_dbdir(char *path, bool only_tblspc)
3029+
{
3030+
struct stat st;
3031+
3032+
Assert(RecoveryInProgress());
3033+
3034+
if (stat(path, &st) == 0)
3035+
return;
3036+
3037+
if (only_tblspc && strstr(path, "pg_tblspc/") == NULL)
3038+
elog(PANIC, "requested to created invalid directory: %s", path);
3039+
3040+
if (reachedConsistency && !allow_in_place_tablespaces)
3041+
ereport(PANIC,
3042+
errmsg("missing directory \"%s\"", path));
3043+
3044+
elog(reachedConsistency ? WARNING : DEBUG1,
3045+
"creating missing directory: %s", path);
3046+
3047+
if (pg_mkdir_p(path, pg_dir_create_mode) != 0)
3048+
ereport(PANIC,
3049+
errmsg("could not create missing directory \"%s\": %m", path));
3050+
}
3051+
3052+
30113053
/*
30123054
* DATABASE resource manager's routines
30133055
*/
@@ -3025,6 +3067,7 @@ dbase_redo(XLogReaderState *record)
30253067
(xl_dbase_create_file_copy_rec *) XLogRecGetData(record);
30263068
char *src_path;
30273069
char *dst_path;
3070+
char *parent_path;
30283071
struct stat st;
30293072

30303073
src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id);
@@ -3044,6 +3087,34 @@ dbase_redo(XLogReaderState *record)
30443087
dst_path)));
30453088
}
30463089

3090+
/*
3091+
* If the parent of the target path doesn't exist, create it now. This
3092+
* enables us to create the target underneath later. Note that if
3093+
* the database dir is not in a tablespace, the parent will always
3094+
* exist, so this never runs in that case.
3095+
*/
3096+
parent_path = pstrdup(dst_path);
3097+
get_parent_directory(parent_path);
3098+
if (stat(parent_path, &st) < 0)
3099+
{
3100+
if (errno != ENOENT)
3101+
ereport(FATAL,
3102+
errmsg("could not stat directory \"%s\": %m",
3103+
dst_path));
3104+
3105+
recovery_create_dbdir(parent_path, true);
3106+
}
3107+
pfree(parent_path);
3108+
3109+
/*
3110+
* There's a case where the copy source directory is missing for the
3111+
* same reason as above.  Create the empty source directory so that
3112+
* copydir below doesn't fail. The directory will be dropped soon by
3113+
* recovery.
3114+
*/
3115+
if (stat(src_path, &st) < 0 && errno == ENOENT)
3116+
recovery_create_dbdir(src_path, false);
3117+
30473118
/*
30483119
* Force dirty buffers out to disk, to ensure source database is
30493120
* up-to-date for the copy.
@@ -3068,9 +3139,15 @@ dbase_redo(XLogReaderState *record)
30683139
xl_dbase_create_wal_log_rec *xlrec =
30693140
(xl_dbase_create_wal_log_rec *) XLogRecGetData(record);
30703141
char *dbpath;
3142+
char *parent_path;
30713143

30723144
dbpath = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id);
30733145

3146+
/* create the parent directory if needed and valid */
3147+
parent_path = pstrdup(dbpath);
3148+
get_parent_directory(parent_path);
3149+
recovery_create_dbdir(parent_path, true);
3150+
30743151
/* Create the database directory with the version file. */
30753152
CreateDirAndVersionFile(dbpath, xlrec->db_id, xlrec->tablespace_id,
30763153
true);

src/backend/commands/tablespace.c

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -156,8 +156,6 @@ TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo)
156156
/* Directory creation failed? */
157157
if (MakePGDirectory(dir) < 0)
158158
{
159-
char *parentdir;
160-
161159
/* Failure other than not exists or not in WAL replay? */
162160
if (errno != ENOENT || !isRedo)
163161
ereport(ERROR,
@@ -166,36 +164,16 @@ TablespaceCreateDbspace(Oid spcNode, Oid dbNode, bool isRedo)
166164
dir)));
167165

168166
/*
169-
* Parent directories are missing during WAL replay, so
170-
* continue by creating simple parent directories rather
171-
* than a symlink.
167+
* During WAL replay, it's conceivable that several levels
168+
* of directories are missing if tablespaces are dropped
169+
* further ahead of the WAL stream than we're currently
170+
* replaying. An easy way forward is to create them as
171+
* plain directories and hope they are removed by further
172+
* WAL replay if necessary. If this also fails, there is
173+
* trouble we cannot get out of, so just report that and
174+
* bail out.
172175
*/
173-
174-
/* create two parents up if not exist */
175-
parentdir = pstrdup(dir);
176-
get_parent_directory(parentdir);
177-
get_parent_directory(parentdir);
178-
/* Can't create parent and it doesn't already exist? */
179-
if (MakePGDirectory(parentdir) < 0 && errno != EEXIST)
180-
ereport(ERROR,
181-
(errcode_for_file_access(),
182-
errmsg("could not create directory \"%s\": %m",
183-
parentdir)));
184-
pfree(parentdir);
185-
186-
/* create one parent up if not exist */
187-
parentdir = pstrdup(dir);
188-
get_parent_directory(parentdir);
189-
/* Can't create parent and it doesn't already exist? */
190-
if (MakePGDirectory(parentdir) < 0 && errno != EEXIST)
191-
ereport(ERROR,
192-
(errcode_for_file_access(),
193-
errmsg("could not create directory \"%s\": %m",
194-
parentdir)));
195-
pfree(parentdir);
196-
197-
/* Create database directory */
198-
if (MakePGDirectory(dir) < 0)
176+
if (pg_mkdir_p(dir, pg_dir_create_mode) < 0)
199177
ereport(ERROR,
200178
(errcode_for_file_access(),
201179
errmsg("could not create directory \"%s\": %m",

0 commit comments

Comments
 (0)