
Commit 0d01c5b

Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
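A minimal standalone C sketch of the restart rule the message describes, not code from postmaster.c: the StartupStatusEnum values mirror the enum this commit adds, while classify_startup_exit() and should_restart() are hypothetical helpers introduced only for illustration.

/*
 * Hypothetical sketch, assuming the state tracking introduced by this
 * commit.  Not part of postmaster.c.
 */
#include <stdbool.h>

typedef enum
{
	STARTUP_NOT_RUNNING,
	STARTUP_RUNNING,
	STARTUP_SIGNALED,			/* postmaster sent it SIGQUIT or SIGKILL */
	STARTUP_CRASHED				/* it died on its own */
} StartupStatusEnum;

/*
 * When the startup process exits with a nonzero status, classify the exit:
 * if we had signaled it ourselves, the death is expected and recovery may
 * be retried; otherwise recovery itself failed and respawning would just
 * loop forever.
 */
StartupStatusEnum
classify_startup_exit(StartupStatusEnum status)
{
	return (status == STARTUP_SIGNALED) ? STARTUP_NOT_RUNNING
										: STARTUP_CRASHED;
}

/* Restart only if the startup process did not crash on its own. */
bool
should_restart(StartupStatusEnum status, bool restart_after_crash)
{
	return status != STARTUP_CRASHED && restart_after_crash;
}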
1 parent cf0c446 commit 0d01c5b


src/backend/postmaster/postmaster.c

Lines changed: 37 additions & 14 deletions
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
 			PgStatPID = 0,
 			SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+	STARTUP_NOT_RUNNING,
+	STARTUP_RUNNING,
+	STARTUP_SIGNALED,			/* we sent it a SIGQUIT or SIGKILL */
+	STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define NoShutdown		0
 #define SmartShutdown	1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int	Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;	/* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false;	/* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash). It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1238,6 +1246,7 @@ PostmasterMain(int argc, char *argv[])
 	 */
 	StartupPID = StartupDataBase();
 	Assert(StartupPID != 0);
+	StartupStatus = STARTUP_RUNNING;
 	pmState = PM_STARTUP;
 
 	/* Some workers may be scheduled to start now */
@@ -2583,6 +2592,7 @@ reaper(SIGNAL_ARGS)
 			if (Shutdown > NoShutdown &&
 				(EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
 			{
+				StartupStatus = STARTUP_NOT_RUNNING;
 				pmState = PM_WAIT_BACKENDS;
 				/* PostmasterStateMachine logic does the rest */
 				continue;
@@ -2605,16 +2615,18 @@ reaper(SIGNAL_ARGS)
 			/*
 			 * After PM_STARTUP, any unexpected exit (including FATAL exit) of
 			 * the startup process is catastrophic, so kill other children,
-			 * and set RecoveryError so we don't try to reinitialize after
-			 * they're gone.  Exception: if FatalError is already set, that
-			 * implies we previously sent the startup process a SIGQUIT, so
+			 * and set StartupStatus so we don't try to reinitialize after
+			 * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+			 * then we previously sent the startup process a SIGQUIT; so
 			 * that's probably the reason it died, and we do want to try to
 			 * restart in that case.
 			 */
 			if (!EXIT_STATUS_0(exitstatus))
 			{
-				if (!FatalError)
-					RecoveryError = true;
+				if (StartupStatus == STARTUP_SIGNALED)
+					StartupStatus = STARTUP_NOT_RUNNING;
+				else
+					StartupStatus = STARTUP_CRASHED;
 				HandleChildCrash(pid, exitstatus,
 								 _("startup process"));
 				continue;
@@ -2623,6 +2635,7 @@ reaper(SIGNAL_ARGS)
 			/*
 			 * Startup succeeded, commence normal operations
 			 */
+			StartupStatus = STARTUP_NOT_RUNNING;
 			FatalError = false;
 			Assert(AbortStartTime == 0);
 			ReachedNormalRunning = true;
@@ -3170,14 +3183,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
 	/* Take care of the startup process too */
 	if (pid == StartupPID)
+	{
 		StartupPID = 0;
+		StartupStatus = STARTUP_CRASHED;
+	}
 	else if (StartupPID != 0 && take_action)
 	{
 		ereport(DEBUG2,
 				(errmsg_internal("sending %s to process %d",
 								 (SendStop ? "SIGSTOP" : "SIGQUIT"),
 								 (int) StartupPID)));
 		signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+		StartupStatus = STARTUP_SIGNALED;
 	}
 
 	/* Take care of the bgwriter too */
@@ -3569,13 +3586,14 @@ PostmasterStateMachine(void)
 	}
 
 	/*
-	 * If recovery failed, or the user does not want an automatic restart
-	 * after backend crashes, wait for all non-syslogger children to exit, and
-	 * then exit postmaster. We don't try to reinitialize when recovery fails,
-	 * because more than likely it will just fail again and we will keep
-	 * trying forever.
+	 * If the startup process failed, or the user does not want an automatic
+	 * restart after backend crashes, wait for all non-syslogger children to
+	 * exit, and then exit postmaster.  We don't try to reinitialize when the
+	 * startup process fails, because more than likely it will just fail again
+	 * and we will keep trying forever.
 	 */
-	if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+	if (pmState == PM_NO_CHILDREN &&
+		(StartupStatus == STARTUP_CRASHED || !restart_after_crash))
 		ExitPostmaster(1);
 
 	/*
@@ -3595,6 +3613,7 @@ PostmasterStateMachine(void)
 
 		StartupPID = StartupDataBase();
 		Assert(StartupPID != 0);
+		StartupStatus = STARTUP_RUNNING;
 		pmState = PM_STARTUP;
 		/* crash recovery started, reset SIGKILL flag */
 		AbortStartTime = 0;
@@ -3726,7 +3745,11 @@ TerminateChildren(int signal)
 {
 	SignalChildren(signal);
 	if (StartupPID != 0)
+	{
 		signal_child(StartupPID, signal);
+		if (signal == SIGQUIT || signal == SIGKILL)
+			StartupStatus = STARTUP_SIGNALED;
+	}
 	if (BgWriterPID != 0)
 		signal_child(BgWriterPID, signal);
 	if (CheckpointerPID != 0)
