Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 39b547f

Browse files
committed
Prevent freshly-started backend from ignoring SIGUSR1, per race condition
observed by Inoue. Also, don't call ProcRemove() from postmaster if we have detected a backend crash --- too risky if shared memory is corrupted. It's not needed anyway, considering we are going to reinitialize shared memory and semaphores as soon as the last child is dead.
1 parent e4d97cb commit 39b547f

File tree

2 files changed

+74
-63
lines changed

2 files changed

+74
-63
lines changed

src/backend/postmaster/postmaster.c

Lines changed: 57 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
*
1212
*
1313
* IDENTIFICATION
14-
* $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.200 2000/12/18 18:45:04 momjian Exp $
14+
* $Header: /cvsroot/pgsql/src/backend/postmaster/postmaster.c,v 1.201 2000/12/20 21:51:52 tgl Exp $
1515
*
1616
* NOTES
1717
*
@@ -180,7 +180,7 @@ static time_t checkpointed = 0;
180180

181181
static int Shutdown = NoShutdown;
182182

183-
static bool FatalError = false;
183+
static bool FatalError = false; /* T if recovering from backend crash */
184184

185185
/*
186186
* State for assigning random salts and cancel keys.
@@ -649,7 +649,7 @@ PostmasterMain(int argc, char *argv[])
649649
pqsignal(SIGTERM, pmdie); /* wait for children and ShutdownDataBase */
650650
pqsignal(SIGALRM, SIG_IGN); /* ignored */
651651
pqsignal(SIGPIPE, SIG_IGN); /* ignored */
652-
pqsignal(SIGUSR1, SIG_IGN); /* ignored */
652+
pqsignal(SIGUSR1, pmdie); /* currently ignored, but see note in pmdie */
653653
pqsignal(SIGUSR2, pmdie); /* send SIGUSR2, don't die */
654654
pqsignal(SIGCHLD, reaper); /* handle child termination */
655655
pqsignal(SIGTTIN, SIG_IGN); /* ignored */
@@ -1329,6 +1329,18 @@ pmdie(SIGNAL_ARGS)
13291329

13301330
switch (postgres_signal_arg)
13311331
{
1332+
case SIGUSR1:
1333+
/*
1334+
* Currently the postmaster ignores SIGUSR1 (maybe it should
1335+
* do something useful instead?) But we must have some handler
1336+
* installed for SIGUSR1, not just set it to SIG_IGN. Else, a
1337+
* freshly spawned backend would likewise have it set to SIG_IGN,
1338+
* which would mean the backend would ignore any attempt to kill
1339+
* it before it had gotten as far as setting up its own handler.
1340+
*/
1341+
errno = save_errno;
1342+
return;
1343+
13321344
case SIGUSR2:
13331345

13341346
/*
@@ -1511,7 +1523,7 @@ reaper(SIGNAL_ARGS)
15111523
ExitPostmaster(1);
15121524
}
15131525
StartupPID = 0;
1514-
FatalError = false;
1526+
FatalError = false; /* done with recovery */
15151527
if (Shutdown > NoShutdown)
15161528
{
15171529
if (ShutdownPID > 0)
@@ -1539,12 +1551,7 @@ reaper(SIGNAL_ARGS)
15391551
/*
15401552
* Wait for all children exit, then reset shmem and StartupDataBase.
15411553
*/
1542-
if (DLGetHead(BackendList))
1543-
{
1544-
errno = save_errno;
1545-
return;
1546-
}
1547-
if (StartupPID > 0 || ShutdownPID > 0)
1554+
if (DLGetHead(BackendList) || StartupPID > 0 || ShutdownPID > 0)
15481555
{
15491556
errno = save_errno;
15501557
return;
@@ -1595,21 +1602,18 @@ CleanupProc(int pid,
15951602
Dlelem *curr,
15961603
*next;
15971604
Backend *bp;
1598-
int sig;
15991605

16001606
if (DebugLvl)
1601-
{
16021607
fprintf(stderr, "%s: CleanupProc: pid %d exited with status %d\n",
16031608
progname, pid, exitstatus);
1604-
}
16051609

16061610
/*
16071611
* If a backend dies in an ugly way (i.e. exit status not 0) then we
16081612
* must signal all other backends to quickdie. If exit status is zero
16091613
* we assume everything is hunky dory and simply remove the backend
16101614
* from the active backend list.
16111615
*/
1612-
if (!exitstatus)
1616+
if (exitstatus == 0)
16131617
{
16141618
curr = DLGetHead(BackendList);
16151619
while (curr)
@@ -1628,73 +1632,78 @@ CleanupProc(int pid,
16281632
if (pid == CheckPointPID)
16291633
{
16301634
CheckPointPID = 0;
1631-
checkpointed = time(NULL);
1635+
if (!FatalError)
1636+
checkpointed = time(NULL);
16321637
}
16331638
else
1634-
ProcRemove(pid);
1639+
{
1640+
/* Why is this done here, and not by the backend itself? */
1641+
if (!FatalError)
1642+
ProcRemove(pid);
1643+
}
16351644

16361645
return;
16371646
}
16381647

16391648
if (!FatalError)
16401649
{
1650+
/* Make log entry unless we did so already */
16411651
tnow = time(NULL);
16421652
fprintf(stderr, "Server process (pid %d) exited with status %d at %s"
16431653
"Terminating any active server processes...\n",
16441654
pid, exitstatus, ctime(&tnow));
16451655
fflush(stderr);
16461656
}
1647-
FatalError = true;
1657+
16481658
curr = DLGetHead(BackendList);
16491659
while (curr)
16501660
{
16511661
next = DLGetSucc(curr);
16521662
bp = (Backend *) DLE_VAL(curr);
1653-
1654-
/*
1655-
* SIGUSR1 is the special signal that says exit without proc_exit
1656-
* and let the user know what's going on. ProcSemaphoreKill()
1657-
* cleans up the backends semaphore. If SendStop is set (-s on
1658-
* command line), then we send a SIGSTOP so that we can core dumps
1659-
* from all backends by hand.
1660-
*/
1661-
sig = (SendStop) ? SIGSTOP : SIGUSR1;
16621663
if (bp->pid != pid)
16631664
{
1664-
if (DebugLvl)
1665-
fprintf(stderr, "%s: CleanupProc: sending %s to process %d\n",
1666-
progname,
1667-
(sig == SIGUSR1)
1668-
? "SIGUSR1" : "SIGSTOP",
1669-
bp->pid);
1670-
kill(bp->pid, sig);
1665+
/*
1666+
* This backend is still alive. Unless we did so already,
1667+
* tell it to commit hara-kiri.
1668+
*
1669+
* SIGUSR1 is the special signal that says exit without proc_exit
1670+
* and let the user know what's going on. But if SendStop is set
1671+
* (-s on command line), then we send SIGSTOP instead, so that we
1672+
* can get core dumps from all backends by hand.
1673+
*/
1674+
if (!FatalError)
1675+
{
1676+
if (DebugLvl)
1677+
fprintf(stderr, "%s: CleanupProc: sending %s to process %d\n",
1678+
progname,
1679+
(SendStop ? "SIGSTOP" : "SIGUSR1"),
1680+
bp->pid);
1681+
kill(bp->pid, (SendStop ? SIGSTOP : SIGUSR1));
1682+
}
16711683
}
16721684
else
16731685
{
1674-
16751686
/*
1676-
* I don't like that we call ProcRemove() here, assuming that
1677-
* shmem may be corrupted! But is there another way to free
1678-
* backend semaphores? Actually, I believe that we need not in
1679-
* per backend semaphore at all (we use them to wait on lock
1680-
* only, couldn't we just sigpause?), so probably we'll remove
1681-
* this call from here someday. -- vadim 04-10-1999
1687+
* Found entry for freshly-dead backend, so remove it.
1688+
*
1689+
* Don't call ProcRemove() here, since shmem may be corrupted!
1690+
* We are going to reinitialize shmem and semaphores anyway
1691+
* once all the children are dead, so no need for it.
16821692
*/
1683-
if (pid == CheckPointPID)
1684-
{
1685-
CheckPointPID = 0;
1686-
checkpointed = 0;
1687-
}
1688-
else
1689-
ProcRemove(pid);
1690-
16911693
DLRemove(curr);
16921694
free(bp);
16931695
DLFreeElem(curr);
16941696
}
16951697
curr = next;
16961698
}
16971699

1700+
if (pid == CheckPointPID)
1701+
{
1702+
CheckPointPID = 0;
1703+
checkpointed = 0;
1704+
}
1705+
1706+
FatalError = true;
16981707
}
16991708

17001709
/*

src/backend/tcop/postgres.c

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.197 2000/12/18 18:45:05 momjian Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.198 2000/12/20 21:51:52 tgl Exp $
1212
*
1313
* NOTES
1414
* this is the "main" module of the postgres backend and
@@ -1462,21 +1462,12 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
14621462
Assert(DataDir);
14631463

14641464
/*
1465-
* 1. Set BlockSig and UnBlockSig masks. 2. Set up signal handlers. 3.
1466-
* Allow only SIGUSR1 signal (we never block it) during
1467-
* initialization.
1465+
* Set up signal handlers and masks.
14681466
*
1469-
* Note that postmaster already blocked ALL signals to make us happy.
1467+
* Note that postmaster blocked all signals before forking child process,
1468+
* so there is no race condition whereby we might receive a signal before
1469+
* we have set up the handler.
14701470
*/
1471-
pqinitmask();
1472-
1473-
#ifdef HAVE_SIGPROCMASK
1474-
sigdelset(&BlockSig, SIGUSR1);
1475-
#else
1476-
BlockSig &= ~(sigmask(SIGUSR1));
1477-
#endif
1478-
1479-
PG_SETMASK(&BlockSig); /* block everything except SIGUSR1 */
14801471

14811472
pqsignal(SIGHUP, SigHupHandler); /* set flag to read config file */
14821473
pqsignal(SIGINT, QueryCancelHandler); /* cancel current query */
@@ -1499,6 +1490,17 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
14991490
pqsignal(SIGTTOU, SIG_DFL);
15001491
pqsignal(SIGCONT, SIG_DFL);
15011492

1493+
pqinitmask();
1494+
1495+
/* We allow SIGUSR1 (quickdie) at all times */
1496+
#ifdef HAVE_SIGPROCMASK
1497+
sigdelset(&BlockSig, SIGUSR1);
1498+
#else
1499+
BlockSig &= ~(sigmask(SIGUSR1));
1500+
#endif
1501+
1502+
PG_SETMASK(&BlockSig); /* block everything except SIGUSR1 */
1503+
15021504

15031505
if (IsUnderPostmaster)
15041506
{
@@ -1649,7 +1651,7 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[], const cha
16491651
if (!IsUnderPostmaster)
16501652
{
16511653
puts("\nPOSTGRES backend interactive interface ");
1652-
puts("$Revision: 1.197 $ $Date: 2000/12/18 18:45:05 $\n");
1654+
puts("$Revision: 1.198 $ $Date: 2000/12/20 21:51:52 $\n");
16531655
}
16541656

16551657
/*

0 commit comments

Comments
 (0)