
Commit 45811be

Fix postmaster's handling of a startup-process crash.
Ordinarily, a failure (unexpected exit status) of the startup subprocess should be considered fatal, so the postmaster should just close up shop and quit. However, if we sent the startup process a SIGQUIT or SIGKILL signal, the failure is hardly "unexpected", and we should attempt restart; this is necessary for recovery from ordinary backend crashes in hot-standby scenarios. I attempted to implement the latter rule with a two-line patch in commit 442231d, but it now emerges that that patch was a few bricks shy of a load: it failed to distinguish the case of a signaled startup process from the case where the new startup process crashes before reaching database consistency. That resulted in infinitely respawning a new startup process only to have it crash again.

To handle this properly, we really must track whether we have sent the *current* startup process a kill signal. Rather than add yet another ad-hoc boolean to the postmaster's state, I chose to unify this with the existing RecoveryError flag into an enum tracking the startup process's state. That seems more consistent with the postmaster's general state machine design.

Back-patch to 9.0, like the previous patch.
1 parent 6ba365a commit 45811be
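The behavior the message describes can be condensed into a small standalone sketch (plain C, compilable on its own; it is not postmaster code, and the helpers startup_exited_abnormally and postmaster_should_exit are invented names for illustration). It uses only the enum added by the patch and shows the two outcomes: a startup process we signaled ourselves is eligible for restart, while one that crashed on its own leaves STARTUP_CRASHED set, so the postmaster exits instead of respawning forever.

#include <stdbool.h>
#include <stdio.h>

/* The enum introduced by this commit (copied from the diff below). */
typedef enum
{
    STARTUP_NOT_RUNNING,
    STARTUP_RUNNING,
    STARTUP_SIGNALED,           /* we sent it a SIGQUIT or SIGKILL */
    STARTUP_CRASHED
} StartupStatusEnum;

static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;

/* Roughly mirrors the reaper() change: the startup process exited abnormally. */
static void
startup_exited_abnormally(void)
{
    if (StartupStatus == STARTUP_SIGNALED)
        StartupStatus = STARTUP_NOT_RUNNING;    /* we killed it; restarting is fine */
    else
        StartupStatus = STARTUP_CRASHED;        /* it crashed on its own; remember that */
}

/* Roughly mirrors the PostmasterStateMachine() test once all children are gone. */
static bool
postmaster_should_exit(bool restart_after_crash)
{
    return StartupStatus == STARTUP_CRASHED || !restart_after_crash;
}

int
main(void)
{
    /* Case 1: a regular backend crashed, so the postmaster SIGQUITs the
     * startup process before it dies (hot-standby scenario). */
    StartupStatus = STARTUP_RUNNING;
    StartupStatus = STARTUP_SIGNALED;
    startup_exited_abnormally();
    printf("signaled startup process died: exit postmaster? %d\n",
           postmaster_should_exit(true));       /* prints 0: reinitialize */

    /* Case 2: the respawned startup process crashes again before reaching
     * consistency, with no signal from us. */
    StartupStatus = STARTUP_RUNNING;
    startup_exited_abnormally();
    printf("startup process crashed on its own: exit postmaster? %d\n",
           postmaster_should_exit(true));       /* prints 1: close up shop */

    return 0;
}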

File tree

1 file changed (+40 −16 lines)

src/backend/postmaster/postmaster.c

+40 −16
@@ -249,6 +249,17 @@ static pid_t StartupPID = 0,
             PgStatPID = 0,
             SysLoggerPID = 0;
 
+/* Startup process's status */
+typedef enum
+{
+    STARTUP_NOT_RUNNING,
+    STARTUP_RUNNING,
+    STARTUP_SIGNALED,           /* we sent it a SIGQUIT or SIGKILL */
+    STARTUP_CRASHED
+} StartupStatusEnum;
+
+static StartupStatusEnum StartupStatus = STARTUP_NOT_RUNNING;
+
 /* Startup/shutdown state */
 #define         NoShutdown      0
 #define         SmartShutdown   1
@@ -258,7 +269,6 @@ static pid_t StartupPID = 0,
 static int  Shutdown = NoShutdown;
 
 static bool FatalError = false; /* T if recovering from backend crash */
-static bool RecoveryError = false;      /* T if WAL recovery failed */
 
 /*
  * We use a simple state machine to control startup, shutdown, and
@@ -301,8 +311,6 @@ static bool RecoveryError = false; /* T if WAL recovery failed */
  * states, nor in PM_SHUTDOWN states (because we don't enter those states
  * when trying to recover from a crash). It can be true in PM_STARTUP state,
  * because we don't clear it until we've successfully started WAL redo.
- * Similarly, RecoveryError means that we have crashed during recovery, and
- * should not try to restart.
  */
 typedef enum
 {
@@ -1246,6 +1254,7 @@ PostmasterMain(int argc, char *argv[])
      */
     StartupPID = StartupDataBase();
     Assert(StartupPID != 0);
+    StartupStatus = STARTUP_RUNNING;
     pmState = PM_STARTUP;
 
     /* Some workers may be scheduled to start now */
@@ -1666,7 +1675,7 @@ ServerLoop(void)
 
         /* If we have lost the archiver, try to start a new one. */
         if (PgArchPID == 0 && PgArchStartupAllowed())
-            PgArchPID = pgarch_start();
+            PgArchPID = pgarch_start();
 
         /* If we need to signal the autovacuum launcher, do so now */
         if (avlauncher_needs_signal)
@@ -2591,6 +2600,7 @@ reaper(SIGNAL_ARGS)
             if (Shutdown > NoShutdown &&
                 (EXIT_STATUS_0(exitstatus) || EXIT_STATUS_1(exitstatus)))
             {
+                StartupStatus = STARTUP_NOT_RUNNING;
                 pmState = PM_WAIT_BACKENDS;
                 /* PostmasterStateMachine logic does the rest */
                 continue;
@@ -2600,6 +2610,7 @@ reaper(SIGNAL_ARGS)
             {
                 ereport(LOG,
                         (errmsg("shutdown at recovery target")));
+                StartupStatus = STARTUP_NOT_RUNNING;
                 Shutdown = SmartShutdown;
                 TerminateChildren(SIGTERM);
                 pmState = PM_WAIT_BACKENDS;
@@ -2624,16 +2635,18 @@ reaper(SIGNAL_ARGS)
             /*
              * After PM_STARTUP, any unexpected exit (including FATAL exit) of
              * the startup process is catastrophic, so kill other children,
-             * and set RecoveryError so we don't try to reinitialize after
-             * they're gone.  Exception: if FatalError is already set, that
-             * implies we previously sent the startup process a SIGQUIT, so
+             * and set StartupStatus so we don't try to reinitialize after
+             * they're gone.  Exception: if StartupStatus is STARTUP_SIGNALED,
+             * then we previously sent the startup process a SIGQUIT; so
              * that's probably the reason it died, and we do want to try to
              * restart in that case.
              */
             if (!EXIT_STATUS_0(exitstatus))
             {
-                if (!FatalError)
-                    RecoveryError = true;
+                if (StartupStatus == STARTUP_SIGNALED)
+                    StartupStatus = STARTUP_NOT_RUNNING;
+                else
+                    StartupStatus = STARTUP_CRASHED;
                 HandleChildCrash(pid, exitstatus,
                                  _("startup process"));
                 continue;
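For comparison, a minimal sketch of the two predicates this hunk swaps (hypothetical helper names, not postmaster code). The old test keyed off the shared FatalError flag, which is not cleared until WAL redo starts, so a respawned startup process that crashed again before reaching consistency still looked like a signaled exit and was respawned forever; the per-process StartupStatus makes that second crash visible.

#include <stdbool.h>
#include <stdio.h>

typedef enum
{
    STARTUP_NOT_RUNNING,
    STARTUP_RUNNING,
    STARTUP_SIGNALED,           /* we sent it a SIGQUIT or SIGKILL */
    STARTUP_CRASHED
} StartupStatusEnum;

/* Old rule: a real recovery failure only if FatalError was not already set.
 * During reinitialization after a backend crash FatalError is still true,
 * so a repeat crash of the startup process was never flagged. */
static bool
old_startup_death_is_fatal(bool FatalError)
{
    return !FatalError;
}

/* New rule: a real failure unless we know we signaled *this* startup
 * process with SIGQUIT/SIGKILL ourselves. */
static bool
new_startup_death_is_fatal(StartupStatusEnum status)
{
    return status != STARTUP_SIGNALED;
}

int
main(void)
{
    /* The respawned startup process crashes again: FatalError is still true,
     * and we did not signal it. */
    printf("old rule flags failure: %d\n",
           old_startup_death_is_fatal(true));               /* 0 -> respawn loop */
    printf("new rule flags failure: %d\n",
           new_startup_death_is_fatal(STARTUP_RUNNING));    /* 1 -> postmaster exits */
    return 0;
}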
@@ -2642,6 +2655,7 @@ reaper(SIGNAL_ARGS)
             /*
              * Startup succeeded, commence normal operations
              */
+            StartupStatus = STARTUP_NOT_RUNNING;
             FatalError = false;
             Assert(AbortStartTime == 0);
             ReachedNormalRunning = true;
@@ -2962,7 +2976,7 @@ CleanupBackgroundWorker(int pid,
         ReportBackgroundWorkerPID(rw);  /* report child death */
 
         LogChildExit(EXIT_STATUS_0(exitstatus) ? DEBUG1 : LOG,
-                     namebuf, pid, exitstatus);
+                     namebuf, pid, exitstatus);
 
         return true;
     }
@@ -3190,14 +3204,18 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
 
     /* Take care of the startup process too */
     if (pid == StartupPID)
+    {
         StartupPID = 0;
+        StartupStatus = STARTUP_CRASHED;
+    }
     else if (StartupPID != 0 && take_action)
     {
         ereport(DEBUG2,
                 (errmsg_internal("sending %s to process %d",
                                  (SendStop ? "SIGSTOP" : "SIGQUIT"),
                                  (int) StartupPID)));
         signal_child(StartupPID, (SendStop ? SIGSTOP : SIGQUIT));
+        StartupStatus = STARTUP_SIGNALED;
     }
 
     /* Take care of the bgwriter too */
@@ -3589,13 +3607,14 @@ PostmasterStateMachine(void)
     }
 
     /*
-     * If recovery failed, or the user does not want an automatic restart
-     * after backend crashes, wait for all non-syslogger children to exit, and
-     * then exit postmaster. We don't try to reinitialize when recovery fails,
-     * because more than likely it will just fail again and we will keep
-     * trying forever.
+     * If the startup process failed, or the user does not want an automatic
+     * restart after backend crashes, wait for all non-syslogger children to
+     * exit, and then exit postmaster.  We don't try to reinitialize when the
+     * startup process fails, because more than likely it will just fail again
+     * and we will keep trying forever.
      */
-    if (pmState == PM_NO_CHILDREN && (RecoveryError || !restart_after_crash))
+    if (pmState == PM_NO_CHILDREN &&
+        (StartupStatus == STARTUP_CRASHED || !restart_after_crash))
         ExitPostmaster(1);
 
     /*
@@ -3615,6 +3634,7 @@ PostmasterStateMachine(void)
 
         StartupPID = StartupDataBase();
         Assert(StartupPID != 0);
+        StartupStatus = STARTUP_RUNNING;
         pmState = PM_STARTUP;
         /* crash recovery started, reset SIGKILL flag */
         AbortStartTime = 0;
@@ -3746,7 +3766,11 @@ TerminateChildren(int signal)
 {
     SignalChildren(signal);
     if (StartupPID != 0)
+    {
         signal_child(StartupPID, signal);
+        if (signal == SIGQUIT || signal == SIGKILL)
+            StartupStatus = STARTUP_SIGNALED;
+    }
     if (BgWriterPID != 0)
         signal_child(BgWriterPID, signal);
     if (CheckpointerPID != 0)
