Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7e784d1

Browse files
committed
Improve client error messages for immediate-stop situations.
Up to now, if the DBA issued "pg_ctl stop -m immediate", the message sent to clients was the same as for a crash-and-restart situation. This is confusing, not least because the message claims that the database will soon be up again, something we have no business predicting.

Improve things so that we can generate distinct messages for the two cases (and also recognize an ad-hoc SIGQUIT, should somebody try that). To do that, add a field to pmsignal.c's shared memory data structure that the postmaster sets just before broadcasting SIGQUIT to its children. No interlocking seems to be necessary; the intervening signal-sending and signal-receipt should sufficiently serialize accesses to the field. Hence, this isn't any riskier than the existing usages of pmsignal.c.

We might in future extend this idea to improve other postmaster-to-children signal scenarios, although none of them currently seem to be as badly overloaded as SIGQUIT.

Discussion: https://postgr.es/m/559291.1608587013@sss.pgh.pa.us
1 parent 90fbf7c commit 7e784d1

File tree

4 files changed

+86
-16
lines changed

4 files changed

+86
-16
lines changed

src/backend/postmaster/postmaster.c

+4
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,7 @@ int ReservedBackends;
218218
/* The socket(s) we're listening to. */
219219
#define MAXLISTEN 64
220220
static pgsocket ListenSocket[MAXLISTEN];
221+
221222
/*
222223
* These globals control the behavior of the postmaster in case some
223224
* backend dumps core. Normally, it kills all peers of the dead backend
@@ -2887,6 +2888,8 @@ pmdie(SIGNAL_ARGS)
28872888
sd_notify(0, "STOPPING=1");
28882889
#endif
28892890

2891+
/* tell children to shut down ASAP */
2892+
SetQuitSignalReason(PMQUIT_FOR_STOP);
28902893
TerminateChildren(SIGQUIT);
28912894
pmState = PM_WAIT_BACKENDS;
28922895

@@ -3464,6 +3467,7 @@ HandleChildCrash(int pid, int exitstatus, const char *procname)
34643467
LogChildExit(LOG, procname, pid, exitstatus);
34653468
ereport(LOG,
34663469
(errmsg("terminating any other active server processes")));
3470+
SetQuitSignalReason(PMQUIT_FOR_CRASH);
34673471
}
34683472

34693473
/* Process background workers. */

src/backend/storage/ipc/pmsignal.c

+37-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*-------------------------------------------------------------------------
22
*
33
* pmsignal.c
4-
* routines for signaling the postmaster from its child processes
4+
* routines for signaling between the postmaster and its child processes
55
*
66
*
77
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@@ -55,6 +55,10 @@
5555
* but carries the extra information that the child is a WAL sender.
5656
* WAL senders too start in ACTIVE state, but switch to WALSENDER once they
5757
* start streaming the WAL (and they never go back to ACTIVE after that).
58+
*
59+
* We also have a shared-memory field that is used for communication in
60+
* the opposite direction, from postmaster to children: it tells why the
61+
* postmaster has broadcasted SIGQUIT signals, if indeed it has done so.
5862
*/
5963

6064
#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */
@@ -65,8 +69,10 @@
6569
/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */
6670
struct PMSignalData
6771
{
68-
/* per-reason flags */
72+
/* per-reason flags for signaling the postmaster */
6973
sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
74+
/* global flags for signals from postmaster to children */
75+
QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
7076
/* per-child-process flags */
7177
int num_child_flags; /* # of entries in PMChildFlags[] */
7278
int next_child_flag; /* next slot to try to assign */
@@ -134,6 +140,7 @@ PMSignalShmemInit(void)
134140

135141
if (!found)
136142
{
143+
/* initialize all flags to zeroes */
137144
MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
138145
PMSignalState->num_child_flags = MaxLivePostmasterChildren();
139146
}
@@ -171,6 +178,34 @@ CheckPostmasterSignal(PMSignalReason reason)
171178
return false;
172179
}
173180

181+
/*
182+
* SetQuitSignalReason - broadcast the reason for a system shutdown.
183+
* Should be called by postmaster before sending SIGQUIT to children.
184+
*
185+
* Note: in a crash-and-restart scenario, the "reason" field gets cleared
186+
* as a part of rebuilding shared memory; the postmaster need not do it
187+
* explicitly.
188+
*/
189+
void
190+
SetQuitSignalReason(QuitSignalReason reason)
191+
{
192+
PMSignalState->sigquit_reason = reason;
193+
}
194+
195+
/*
196+
* GetQuitSignalReason - obtain the reason for a system shutdown.
197+
* Called by child processes when they receive SIGQUIT.
198+
* If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT.
199+
*/
200+
QuitSignalReason
201+
GetQuitSignalReason(void)
202+
{
203+
/* This is called in signal handlers, so be extra paranoid. */
204+
if (!IsUnderPostmaster || PMSignalState == NULL)
205+
return PMQUIT_NOT_SENT;
206+
return PMSignalState->sigquit_reason;
207+
}
208+
174209

175210
/*
176211
* AssignPostmasterChildSlot - select an unused slot for a new postmaster

src/backend/tcop/postgres.c

+32-13
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
#include "rewrite/rewriteHandler.h"
6868
#include "storage/bufmgr.h"
6969
#include "storage/ipc.h"
70+
#include "storage/pmsignal.h"
7071
#include "storage/proc.h"
7172
#include "storage/procsignal.h"
7273
#include "storage/sinval.h"
@@ -2752,8 +2753,8 @@ drop_unnamed_stmt(void)
27522753
/*
27532754
* quickdie() occurs when signaled SIGQUIT by the postmaster.
27542755
*
2755-
* Some backend has bought the farm,
2756-
* so we need to stop what we're doing and exit.
2756+
* Either some backend has bought the farm, or we've been told to shut down
2757+
* "immediately"; so we need to stop what we're doing and exit.
27572758
*/
27582759
void
27592760
quickdie(SIGNAL_ARGS)
@@ -2788,18 +2789,36 @@ quickdie(SIGNAL_ARGS)
27882789
* wrong, so there's not much to lose. Assuming the postmaster is still
27892790
* running, it will SIGKILL us soon if we get stuck for some reason.
27902791
*
2791-
* Ideally this should be ereport(FATAL), but then we'd not get control
2792-
* back...
2792+
* Ideally these should be ereport(FATAL), but then we'd not get control
2793+
* back to force the correct type of process exit.
27932794
*/
2794-
ereport(WARNING,
2795-
(errcode(ERRCODE_CRASH_SHUTDOWN),
2796-
errmsg("terminating connection because of crash of another server process"),
2797-
errdetail("The postmaster has commanded this server process to roll back"
2798-
" the current transaction and exit, because another"
2799-
" server process exited abnormally and possibly corrupted"
2800-
" shared memory."),
2801-
errhint("In a moment you should be able to reconnect to the"
2802-
" database and repeat your command.")));
2795+
switch (GetQuitSignalReason())
2796+
{
2797+
case PMQUIT_NOT_SENT:
2798+
/* Hmm, SIGQUIT arrived out of the blue */
2799+
ereport(WARNING,
2800+
(errcode(ERRCODE_ADMIN_SHUTDOWN),
2801+
errmsg("terminating connection because of unexpected SIGQUIT signal")));
2802+
break;
2803+
case PMQUIT_FOR_CRASH:
2804+
/* A crash-and-restart cycle is in progress */
2805+
ereport(WARNING,
2806+
(errcode(ERRCODE_CRASH_SHUTDOWN),
2807+
errmsg("terminating connection because of crash of another server process"),
2808+
errdetail("The postmaster has commanded this server process to roll back"
2809+
" the current transaction and exit, because another"
2810+
" server process exited abnormally and possibly corrupted"
2811+
" shared memory."),
2812+
errhint("In a moment you should be able to reconnect to the"
2813+
" database and repeat your command.")));
2814+
break;
2815+
case PMQUIT_FOR_STOP:
2816+
/* Immediate-mode stop */
2817+
ereport(WARNING,
2818+
(errcode(ERRCODE_ADMIN_SHUTDOWN),
2819+
errmsg("terminating connection due to immediate shutdown command")));
2820+
break;
2821+
}
28032822

28042823
/*
28052824
* We DO NOT want to run proc_exit() or atexit() callbacks -- we're here

src/include/storage/pmsignal.h

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*-------------------------------------------------------------------------
22
*
33
* pmsignal.h
4-
* routines for signaling the postmaster from its child processes
4+
* routines for signaling between the postmaster and its child processes
55
*
66
*
77
* Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
@@ -45,6 +45,16 @@ typedef enum
4545
NUM_PMSIGNALS /* Must be last value of enum! */
4646
} PMSignalReason;
4747

48+
/*
49+
* Reasons why the postmaster would send SIGQUIT to its children.
50+
*/
51+
typedef enum
52+
{
53+
PMQUIT_NOT_SENT = 0, /* postmaster hasn't sent SIGQUIT */
54+
PMQUIT_FOR_CRASH, /* some other backend bought the farm */
55+
PMQUIT_FOR_STOP /* immediate stop was commanded */
56+
} QuitSignalReason;
57+
4858
/* PMSignalData is an opaque struct, details known only within pmsignal.c */
4959
typedef struct PMSignalData PMSignalData;
5060

@@ -55,6 +65,8 @@ extern Size PMSignalShmemSize(void);
5565
extern void PMSignalShmemInit(void);
5666
extern void SendPostmasterSignal(PMSignalReason reason);
5767
extern bool CheckPostmasterSignal(PMSignalReason reason);
68+
extern void SetQuitSignalReason(QuitSignalReason reason);
69+
extern QuitSignalReason GetQuitSignalReason(void);
5870
extern int AssignPostmasterChildSlot(void);
5971
extern bool ReleasePostmasterChildSlot(int slot);
6072
extern bool IsPostmasterChildWalSender(int slot);

0 commit comments

Comments
 (0)