Emit cascaded standby message on shutdown only when appropriate.

simonat2ndQuadrant · simonat2ndQuadrant · commit dde70cc31368 · 2011-09-07T09:09:47.000+01:00
Adds an additional test for active walsenders and closes a race
condition that occurs when we fail over while a new walsender is connecting.

Reported and fixed by Fujii Masao. Review by Heikki Linnakangas
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
@@ -2328,10 +2328,11 @@ reaper(SIGNAL_ARGS)
 			 * XXX should avoid the need for disconnection. When we do,
 			 * am_cascading_walsender should be replaced with RecoveryInProgress()
 			 */
-			if (max_wal_senders > 0)
+			if (max_wal_senders > 0 && CountChildren(BACKEND_TYPE_WALSND) > 0)
 			{
 				ereport(LOG,
-						(errmsg("terminating all walsender processes to force cascaded standby(s) to update timeline and reconnect")));
+						(errmsg("terminating all walsender processes to force cascaded "
+								"standby(s) to update timeline and reconnect")));
 				SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);
 			}
 
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
@@ -368,6 +368,35 @@ StartReplication(StartReplicationCmd *cmd)
 	MarkPostmasterChildWalSender();
 	SendPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE);
 
+	/*
+	 * When promoting a cascading standby, postmaster sends SIGUSR2 to
+	 * any cascading walsenders to kill them. But there is a corner-case where
+	 * such walsender fails to receive SIGUSR2 and survives a standby promotion
+	 * unexpectedly. This happens when postmaster sends SIGUSR2 before
+	 * the walsender marks itself as a WAL sender, because postmaster sends
+	 * SIGUSR2 to only the processes marked as a WAL sender.
+	 *
+	 * To avoid this corner-case, if recovery is NOT in progress even though
+	 * the walsender is a cascading one, we do the same thing as the SIGUSR2
+	 * signal handler does, i.e., set walsender_ready_to_stop to true, which
+	 * causes the walsender to end later.
+	 *
+	 * When terminating cascading walsenders, usually postmaster writes
+	 * the log message announcing the terminations. But there is a race condition
+	 * here. If there is no walsender except this process before reaching here,
+	 * postmaster thinks that there is no walsender and suppresses that
+	 * log message. To handle this case, we always emit that log message here.
+	 * This might cause duplicate log messages, but that is unlikely to happen,
+	 * so it's not worth writing some code to suppress them.
+	 */
+	if (am_cascading_walsender && !RecoveryInProgress())
+	{
+		ereport(LOG,
+				(errmsg("terminating walsender process to force cascaded standby "
+						"to update timeline and reconnect")));
+		walsender_ready_to_stop = true;
+	}
+
 	/*
 	 * We assume here that we're logging enough information in the WAL for
 	 * log-shipping, since this is checked in PostmasterMain().

Original file line number	Diff line number	Diff line change
`@@ -2328,10 +2328,11 @@ reaper(SIGNAL_ARGS)`
`2328`	`2328`	`* XXX should avoid the need for disconnection. When we do,`
`2329`	`2329`	`* am_cascading_walsender should be replaced with RecoveryInProgress()`
`2330`	`2330`	`*/`
`2331`		`- if (max_wal_senders > 0)`
	`2331`	`+ if (max_wal_senders > 0 && CountChildren(BACKEND_TYPE_WALSND) > 0)`
`2332`	`2332`	`{`
`2333`	`2333`	`ereport(LOG,`
`2334`		`- (errmsg("terminating all walsender processes to force cascaded standby(s) to update timeline and reconnect")));`
	`2334`	`+ (errmsg("terminating all walsender processes to force cascaded "`
	`2335`	`+ "standby(s) to update timeline and reconnect")));`
`2335`	`2336`	`SignalSomeChildren(SIGUSR2, BACKEND_TYPE_WALSND);`
`2336`	`2337`	`}`
`2337`	`2338`