Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit e6a442c

Browse files
committed
Restructure the shutdown procedure for the archiver process to allow it to finish archiving everything (when there's no error), and to eliminate various hazards as best we can. This fixes a previous 8.3 patch that caused the postmaster to kill and then restart the archiver during shutdown (!?).

The new behavior is that the archiver is allowed to run unmolested until the bgwriter has exited; then it is sent SIGUSR2 to tell it to do a final archiving cycle and quit. We only SIGQUIT the archiver if we want a panic stop; this is important since SIGQUIT will also be sent to any active archive_command.

The postmaster also now doesn't SIGQUIT the stats collector until the bgwriter is done, since the bgwriter can send stats messages in 8.3.

The postmaster will not exit until both the archiver and stats collector are gone; this provides some defense (not too bulletproof) against conflicting archiver or stats collector processes being started by a new postmaster instance. We continue the prior practice that the archiver will check for postmaster death immediately before issuing any archive_command; that gives some additional protection against conflicting archivers.

Also, modify the archiver process to notice SIGTERM and refuse to issue any more archive commands if it gets it. The postmaster doesn't ever send it SIGTERM; we assume that any such signal came from init and is a notice of impending whole-system shutdown. In this situation it seems imprudent to try to start new archive commands --- if they aren't extremely quick they're likely to get SIGKILL'd by init.

All per discussion.
1 parent 21a00dc commit e6a442c

File tree

2 files changed

+138
-52
lines changed

2 files changed

+138
-52
lines changed

src/backend/postmaster/pgarch.c

+75-16
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
*
2020
*
2121
* IDENTIFICATION
22-
* $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.37 2008/01/01 19:45:51 momjian Exp $
22+
* $PostgreSQL: pgsql/src/backend/postmaster/pgarch.c,v 1.38 2008/01/11 00:54:08 tgl Exp $
2323
*
2424
*-------------------------------------------------------------------------
2525
*/
@@ -77,12 +77,15 @@
7777
* ----------
7878
*/
7979
static time_t last_pgarch_start_time;
80+
static time_t last_sigterm_time = 0;
8081

8182
/*
8283
* Flags set by interrupt handlers for later service in the main loop.
8384
*/
8485
static volatile sig_atomic_t got_SIGHUP = false;
86+
static volatile sig_atomic_t got_SIGTERM = false;
8587
static volatile sig_atomic_t wakened = false;
88+
static volatile sig_atomic_t ready_to_stop = false;
8689

8790
/* ----------
8891
* Local function forward declarations
@@ -95,7 +98,9 @@ static pid_t pgarch_forkexec(void);
9598
NON_EXEC_STATIC void PgArchiverMain(int argc, char *argv[]);
9699
static void pgarch_exit(SIGNAL_ARGS);
97100
static void ArchSigHupHandler(SIGNAL_ARGS);
101+
static void ArchSigTermHandler(SIGNAL_ARGS);
98102
static void pgarch_waken(SIGNAL_ARGS);
103+
static void pgarch_waken_stop(SIGNAL_ARGS);
99104
static void pgarch_MainLoop(void);
100105
static void pgarch_ArchiverCopyLoop(void);
101106
static bool pgarch_archiveXlog(char *xlog);
@@ -236,16 +241,16 @@ PgArchiverMain(int argc, char *argv[])
236241

237242
/*
238243
* Ignore all signals usually bound to some action in the postmaster,
239-
* except for SIGHUP, SIGUSR1 and SIGQUIT.
244+
* except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.
240245
*/
241246
pqsignal(SIGHUP, ArchSigHupHandler);
242247
pqsignal(SIGINT, SIG_IGN);
243-
pqsignal(SIGTERM, SIG_IGN);
248+
pqsignal(SIGTERM, ArchSigTermHandler);
244249
pqsignal(SIGQUIT, pgarch_exit);
245250
pqsignal(SIGALRM, SIG_IGN);
246251
pqsignal(SIGPIPE, SIG_IGN);
247252
pqsignal(SIGUSR1, pgarch_waken);
248-
pqsignal(SIGUSR2, SIG_IGN);
253+
pqsignal(SIGUSR2, pgarch_waken_stop);
249254
pqsignal(SIGCHLD, SIG_DFL);
250255
pqsignal(SIGTTIN, SIG_DFL);
251256
pqsignal(SIGTTOU, SIG_DFL);
@@ -267,28 +272,47 @@ PgArchiverMain(int argc, char *argv[])
267272
static void
268273
pgarch_exit(SIGNAL_ARGS)
269274
{
270-
/*
271-
* For now, we just nail the doors shut and get out of town. It might
272-
* seem cleaner to finish up any pending archive copies, but there's a
273-
* nontrivial risk that init will kill us partway through.
274-
*/
275-
exit(0);
275+
/* SIGQUIT means curl up and die ... */
276+
exit(1);
276277
}
277278

278-
/* SIGHUP: set flag to re-read config file at next convenient time */
279+
/* SIGHUP signal handler for archiver process */
279280
static void
280281
ArchSigHupHandler(SIGNAL_ARGS)
281282
{
283+
/* set flag to re-read config file at next convenient time */
282284
got_SIGHUP = true;
283285
}
284286

287+
/* SIGTERM signal handler for archiver process */
288+
static void
289+
ArchSigTermHandler(SIGNAL_ARGS)
290+
{
291+
/*
292+
* The postmaster never sends us SIGTERM, so we assume that this means
293+
* that init is trying to shut down the whole system. If we hang around
294+
* too long we'll get SIGKILL'd. Set flag to prevent starting any more
295+
* archive commands.
296+
*/
297+
got_SIGTERM = true;
298+
}
299+
285300
/* SIGUSR1 signal handler for archiver process */
286301
static void
287302
pgarch_waken(SIGNAL_ARGS)
288303
{
304+
/* set flag that there is work to be done */
289305
wakened = true;
290306
}
291307

308+
/* SIGUSR2 signal handler for archiver process */
309+
static void
310+
pgarch_waken_stop(SIGNAL_ARGS)
311+
{
312+
/* set flag to do a final cycle and shut down afterwards */
313+
ready_to_stop = true;
314+
}
315+
292316
/*
293317
* pgarch_MainLoop
294318
*
@@ -298,6 +322,7 @@ static void
298322
pgarch_MainLoop(void)
299323
{
300324
time_t last_copy_time = 0;
325+
bool time_to_stop;
301326

302327
/*
303328
* We run the copy loop immediately upon entry, in case there are
@@ -309,15 +334,36 @@ pgarch_MainLoop(void)
309334

310335
do
311336
{
337+
/* When we get SIGUSR2, we do one more archive cycle, then exit */
338+
time_to_stop = ready_to_stop;
339+
312340
/* Check for config update */
313341
if (got_SIGHUP)
314342
{
315343
got_SIGHUP = false;
316344
ProcessConfigFile(PGC_SIGHUP);
317345
}
318346

347+
/*
348+
* If we've gotten SIGTERM, we normally just sit and do nothing until
349+
* SIGUSR2 arrives. However, that means a random SIGTERM would
350+
* disable archiving indefinitely, which doesn't seem like a good
351+
* idea. If more than 60 seconds pass since SIGTERM, exit anyway,
352+
* so that the postmaster can start a new archiver if needed.
353+
*/
354+
if (got_SIGTERM)
355+
{
356+
time_t curtime = time(NULL);
357+
358+
if (last_sigterm_time == 0)
359+
last_sigterm_time = curtime;
360+
else if ((unsigned int) (curtime - last_sigterm_time) >=
361+
(unsigned int) 60)
362+
break;
363+
}
364+
319365
/* Do what we're here for */
320-
if (wakened)
366+
if (wakened || time_to_stop)
321367
{
322368
wakened = false;
323369
pgarch_ArchiverCopyLoop();
@@ -334,7 +380,8 @@ pgarch_MainLoop(void)
334380
* sleep into 1-second increments, and check for interrupts after each
335381
* nap.
336382
*/
337-
while (!(wakened || got_SIGHUP))
383+
while (!(wakened || ready_to_stop || got_SIGHUP ||
384+
!PostmasterIsAlive(true)))
338385
{
339386
time_t curtime;
340387

@@ -344,7 +391,13 @@ pgarch_MainLoop(void)
344391
(unsigned int) PGARCH_AUTOWAKE_INTERVAL)
345392
wakened = true;
346393
}
347-
} while (PostmasterIsAlive(true));
394+
395+
/*
396+
* The archiver quits either when the postmaster dies (not expected)
397+
* or after completing one more archiving cycle after receiving
398+
* SIGUSR2.
399+
*/
400+
} while (PostmasterIsAlive(true) && !time_to_stop);
348401
}
349402

350403
/*
@@ -377,8 +430,14 @@ pgarch_ArchiverCopyLoop(void)
377430

378431
for (;;)
379432
{
380-
/* Abandon processing if we notice our postmaster has died */
381-
if (!PostmasterIsAlive(true))
433+
/*
434+
* Do not initiate any more archive commands after receiving
435+
* SIGTERM, nor after the postmaster has died unexpectedly.
436+
* The first condition is to try to keep from having init
437+
* SIGKILL the command, and the second is to avoid conflicts
438+
* with another archiver spawned by a newer postmaster.
439+
*/
440+
if (got_SIGTERM || !PostmasterIsAlive(true))
382441
return;
383442

384443
if (pgarch_archiveXlog(xlog))

src/backend/postmaster/postmaster.c

+63-36
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@
3737
*
3838
*
3939
* IDENTIFICATION
40-
* $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.550 2008/01/01 19:45:51 momjian Exp $
40+
* $PostgreSQL: pgsql/src/backend/postmaster/postmaster.c,v 1.551 2008/01/11 00:54:09 tgl Exp $
4141
*
4242
* NOTES
4343
*
@@ -244,7 +244,7 @@ static bool FatalError = false; /* T if recovering from backend crash */
244244
* Notice that this state variable does not distinguish *why* we entered
245245
* PM_WAIT_BACKENDS or later states --- Shutdown and FatalError must be
246246
* consulted to find that out. FatalError is never true in PM_RUN state, nor
247-
* in PM_SHUTDOWN state (because we don't enter that state when trying to
247+
* in PM_SHUTDOWN states (because we don't enter those states when trying to
248248
* recover from a crash). It can be true in PM_STARTUP state, because we
249249
* don't clear it until we've successfully recovered.
250250
*/
@@ -255,6 +255,7 @@ typedef enum
255255
PM_RUN, /* normal "database is alive" state */
256256
PM_WAIT_BACKENDS, /* waiting for live backends to exit */
257257
PM_SHUTDOWN, /* waiting for bgwriter to do shutdown ckpt */
258+
PM_SHUTDOWN_2, /* waiting for archiver to finish */
258259
PM_WAIT_DEAD_END, /* waiting for dead_end children to exit */
259260
PM_NO_CHILDREN /* all important children have exited */
260261
} PMState;
@@ -1312,12 +1313,8 @@ ServerLoop(void)
13121313
start_autovac_launcher = false; /* signal processed */
13131314
}
13141315

1315-
/*
1316-
* If we have lost the archiver, try to start a new one. We do this
1317-
* even if we are shutting down, to allow archiver to take care of any
1318-
* remaining WAL files.
1319-
*/
1320-
if (XLogArchivingActive() && PgArchPID == 0 && pmState >= PM_RUN)
1316+
/* If we have lost the archiver, try to start a new one */
1317+
if (XLogArchivingActive() && PgArchPID == 0 && pmState == PM_RUN)
13211318
PgArchPID = pgarch_start();
13221319

13231320
/* If we have lost the stats collector, try to start a new one */
@@ -2175,12 +2172,31 @@ reaper(SIGNAL_ARGS)
21752172
* checkpoint. (If for some reason it didn't, recovery will
21762173
* occur on next postmaster start.)
21772174
*
2178-
* At this point we should have no normal children left (else
2179-
* we'd not be in PM_SHUTDOWN state) but we might have
2180-
* dead_end children.
2175+
* At this point we should have no normal backend children
2176+
* left (else we'd not be in PM_SHUTDOWN state) but we might
2177+
* have dead_end children to wait for.
2178+
*
2179+
* If we have an archiver subprocess, tell it to do a last
2180+
* archive cycle and quit; otherwise we can go directly to
2181+
* PM_WAIT_DEAD_END state.
21812182
*/
21822183
Assert(Shutdown > NoShutdown);
2183-
pmState = PM_WAIT_DEAD_END;
2184+
2185+
if (PgArchPID != 0)
2186+
{
2187+
/* Waken archiver for the last time */
2188+
signal_child(PgArchPID, SIGUSR2);
2189+
pmState = PM_SHUTDOWN_2;
2190+
}
2191+
else
2192+
pmState = PM_WAIT_DEAD_END;
2193+
2194+
/*
2195+
* We can also shut down the stats collector now; there's
2196+
* nothing left for it to do.
2197+
*/
2198+
if (PgStatPID != 0)
2199+
signal_child(PgStatPID, SIGQUIT);
21842200
}
21852201
else
21862202
{
@@ -2227,16 +2243,19 @@ reaper(SIGNAL_ARGS)
22272243
/*
22282244
* Was it the archiver? If so, just try to start a new one; no need
22292245
* to force reset of the rest of the system. (If fail, we'll try
2230-
* again in future cycles of the main loop.)
2246+
* again in future cycles of the main loop.) But if we were waiting
2247+
* for it to shut down, advance to the next shutdown step.
22312248
*/
22322249
if (pid == PgArchPID)
22332250
{
22342251
PgArchPID = 0;
22352252
if (!EXIT_STATUS_0(exitstatus))
22362253
LogChildExit(LOG, _("archiver process"),
22372254
pid, exitstatus);
2238-
if (XLogArchivingActive() && pmState >= PM_RUN)
2255+
if (XLogArchivingActive() && pmState == PM_RUN)
22392256
PgArchPID = pgarch_start();
2257+
else if (pmState == PM_SHUTDOWN_2)
2258+
pmState = PM_WAIT_DEAD_END;
22402259
continue;
22412260
}
22422261

@@ -2563,6 +2582,11 @@ PostmasterStateMachine(void)
25632582
* change causes ServerLoop to stop creating new ones.
25642583
*/
25652584
pmState = PM_WAIT_DEAD_END;
2585+
2586+
/*
2587+
* We already SIGQUIT'd the archiver and stats processes,
2588+
* if any, when we entered FatalError state.
2589+
*/
25662590
}
25672591
else
25682592
{
@@ -2591,13 +2615,13 @@ PostmasterStateMachine(void)
25912615
*/
25922616
FatalError = true;
25932617
pmState = PM_WAIT_DEAD_END;
2618+
2619+
/* Kill the archiver and stats collector too */
2620+
if (PgArchPID != 0)
2621+
signal_child(PgArchPID, SIGQUIT);
2622+
if (PgStatPID != 0)
2623+
signal_child(PgStatPID, SIGQUIT);
25942624
}
2595-
/* Tell pgarch to shut down too; nothing left for it to do */
2596-
if (PgArchPID != 0)
2597-
signal_child(PgArchPID, SIGQUIT);
2598-
/* Tell pgstat to shut down too; nothing left for it to do */
2599-
if (PgStatPID != 0)
2600-
signal_child(PgStatPID, SIGQUIT);
26012625
}
26022626
}
26032627
}
@@ -2606,16 +2630,26 @@ PostmasterStateMachine(void)
26062630
{
26072631
/*
26082632
* PM_WAIT_DEAD_END state ends when the BackendList is entirely empty
2609-
* (ie, no dead_end children remain).
2633+
* (ie, no dead_end children remain), and the archiver and stats
2634+
* collector are gone too.
2635+
*
2636+
* The reason we wait for those two is to protect them against a new
2637+
* postmaster starting conflicting subprocesses; this isn't an
2638+
* ironclad protection, but it at least helps in the
2639+
* shutdown-and-immediately-restart scenario. Note that they have
2640+
* already been sent appropriate shutdown signals, either during a
2641+
* normal state transition leading up to PM_WAIT_DEAD_END, or during
2642+
* FatalError processing.
26102643
*/
2611-
if (!DLGetHead(BackendList))
2644+
if (DLGetHead(BackendList) == NULL &&
2645+
PgArchPID == 0 && PgStatPID == 0)
26122646
{
26132647
/* These other guys should be dead already */
26142648
Assert(StartupPID == 0);
26152649
Assert(BgWriterPID == 0);
26162650
Assert(WalWriterPID == 0);
26172651
Assert(AutoVacPID == 0);
2618-
/* archiver, stats, and syslogger are not considered here */
2652+
/* syslogger is not considered here */
26192653
pmState = PM_NO_CHILDREN;
26202654
}
26212655
}
@@ -2628,14 +2662,9 @@ PostmasterStateMachine(void)
26282662
* we got SIGTERM from init --- there may well not be time for recovery
26292663
* before init decides to SIGKILL us.)
26302664
*
2631-
* Note: we do not wait around for exit of the archiver or stats
2632-
* processes. They've been sent SIGQUIT by this point (either when we
2633-
* entered PM_SHUTDOWN state, or when we set FatalError, and at least one
2634-
* of those must have happened by now). In any case they contain logic to
2635-
* commit hara-kiri if they notice the postmaster is gone. Since they
2636-
* aren't connected to shared memory, they pose no problem for shutdown.
2637-
* The syslogger is not considered either, since it's intended to survive
2638-
* till the postmaster exits.
2665+
* Note that the syslogger continues to run. It will exit when it sees
2666+
* EOF on its input pipe, which happens when there are no more upstream
2667+
* processes.
26392668
*/
26402669
if (Shutdown > NoShutdown && pmState == PM_NO_CHILDREN)
26412670
{
@@ -2652,10 +2681,8 @@ PostmasterStateMachine(void)
26522681
}
26532682

26542683
/*
2655-
* If we need to recover from a crash, wait for all shmem-connected
2656-
* children to exit, then reset shmem and StartupDataBase. (We can ignore
2657-
* the archiver and stats processes here since they are not connected to
2658-
* shmem.)
2684+
* If we need to recover from a crash, wait for all non-syslogger
2685+
* children to exit, then reset shmem and StartupDataBase.
26592686
*/
26602687
if (FatalError && pmState == PM_NO_CHILDREN)
26612688
{
@@ -3782,7 +3809,7 @@ sigusr1_handler(SIGNAL_ARGS)
37823809
}
37833810

37843811
if (CheckPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER) &&
3785-
PgArchPID != 0 && Shutdown <= SmartShutdown)
3812+
PgArchPID != 0)
37863813
{
37873814
/*
37883815
* Send SIGUSR1 to archiver process, to wake it up and begin archiving

0 commit comments

Comments
 (0)