Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0475a97

Browse files
Quick exit on log stream child exit in pg_basebackup
If the log streaming child process (thread on Windows) dies during backup then the whole backup will be aborted at the end of the backup. Instead, trap ungraceful termination of the log streaming child and exit early. This also adds a TAP test for simulating this by terminating the responsible backend. Reviewed-by: Michael Paquier <michael@paquier.xyz> Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Reviewed-by: Magnus Hagander <magnus@hagander.net> Discussion: https://postgr.es/m/0F69E282-97F9-4DB7-8D6D-F927AA6340C8@yesql.se Discussion: https://postgr.es/m/VI1PR83MB0189818B82C19059CB62E26199A89@VI1PR83MB0189.EURPRD83.prod.outlook.com
1 parent c7d7e12 commit 0475a97

File tree

2 files changed

+79
-2
lines changed

2 files changed

+79
-2
lines changed

src/bin/pg_basebackup/pg_basebackup.c

+45-2
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,8 @@ static int bgpipe[2] = {-1, -1};
174174
/* Handle to child process */
175175
static pid_t bgchild = -1;
176176
static bool in_log_streamer = false;
177+
/* Flag to indicate if child process exited unexpectedly */
178+
static volatile sig_atomic_t bgchild_exited = false;
177179

178180
/* End position for xlog streaming, empty string if unknown yet */
179181
static XLogRecPtr xlogendptr;
@@ -277,6 +279,18 @@ disconnect_atexit(void)
277279
}
278280

279281
#ifndef WIN32
282+
/*
283+
* If the bgchild exits prematurely and raises a SIGCHLD signal, we can abort
284+
* processing rather than wait until the backup has finished and error out at
285+
* that time. On Windows, we use a background thread which can communicate
286+
* without the need for a signal handler.
287+
*/
288+
static void
289+
sigchld_handler(SIGNAL_ARGS)
290+
{
291+
bgchild_exited = true;
292+
}
293+
280294
/*
281295
* On windows, our background thread dies along with the process. But on
282296
* Unix, if we have started a subprocess, we want to kill it off so it
@@ -285,7 +299,7 @@ disconnect_atexit(void)
285299
static void
286300
kill_bgchild_atexit(void)
287301
{
288-
if (bgchild > 0)
302+
if (bgchild > 0 && !bgchild_exited)
289303
kill(bgchild, SIGTERM);
290304
}
291305
#endif
@@ -572,17 +586,28 @@ LogStreamerMain(logstreamer_param *param)
572586
stream.do_sync);
573587

574588
if (!ReceiveXlogStream(param->bgconn, &stream))
575-
589+
{
576590
/*
577591
* Any errors will already have been reported in the function process,
578592
* but we need to tell the parent that we didn't shutdown in a nice
579593
* way.
580594
*/
595+
#ifdef WIN32
596+
/*
597+
* In order to signal the main thread of an ungraceful exit we
598+
* set the same flag that we use on Unix to signal SIGCHLD.
599+
*/
600+
bgchild_exited = true;
601+
#endif
581602
return 1;
603+
}
582604

583605
if (!stream.walmethod->finish())
584606
{
585607
pg_log_error("could not finish writing WAL files: %m");
608+
#ifdef WIN32
609+
bgchild_exited = true;
610+
#endif
586611
return 1;
587612
}
588613

@@ -1134,6 +1159,12 @@ ReceiveCopyData(PGconn *conn, WriteDataCallback callback,
11341159
exit(1);
11351160
}
11361161

1162+
if (bgchild_exited)
1163+
{
1164+
pg_log_error("background process terminated unexpectedly");
1165+
exit(1);
1166+
}
1167+
11371168
(*callback) (r, copybuf, callback_data);
11381169

11391170
PQfreemem(copybuf);
@@ -2882,6 +2913,18 @@ main(int argc, char **argv)
28822913
}
28832914
atexit(disconnect_atexit);
28842915

2916+
#ifndef WIN32
2917+
/*
2918+
* Trap SIGCHLD to be able to handle the WAL stream process exiting. There
2919+
* is no SIGCHLD on Windows, there we rely on the background thread setting
2920+
* the signal variable on unexpected but graceful exit. If the WAL stream
2921+
* thread crashes on Windows it will bring down the entire process as it's
2922+
* a thread, so there is nothing to catch should that happen. A crash on
2923+
* UNIX will be caught by the signal handler.
2924+
*/
2925+
pqsignal(SIGCHLD, sigchld_handler);
2926+
#endif
2927+
28852928
/*
28862929
* Set umask so that directories/files are created with the same
28872930
* permissions as directories/files in the source data directory.

src/bin/pg_basebackup/t/010_pg_basebackup.pl

+34
Original file line numberDiff line numberDiff line change
@@ -776,4 +776,38 @@
776776
rmtree("$tempdir/backup_gzip3");
777777
}
778778

779+
# Test background stream process terminating before the basebackup has
780+
# finished, the main process should exit gracefully with an error message on
781+
# stderr. To reduce the risk of timing related issues we invoke the base
782+
# backup with rate throttling enabled.
783+
$node->safe_psql('postgres',
784+
q{CREATE TABLE t AS SELECT a FROM generate_series(1,10000) AS a;});
785+
786+
my $sigchld_bb_timeout = IPC::Run::timer(60);
787+
my ($sigchld_bb_stdin, $sigchld_bb_stdout, $sigchld_bb_stderr) = ('', '', '');
788+
my $sigchld_bb = IPC::Run::start(
789+
[
790+
@pg_basebackup_defs, '--wal-method=stream', '-D', "$tempdir/sigchld",
791+
'--max-rate=32', '-d', $node->connstr('postgres')
792+
],
793+
'<',
794+
\$sigchld_bb_stdin,
795+
'>',
796+
\$sigchld_bb_stdout,
797+
'2>',
798+
\$sigchld_bb_stderr,
799+
$sigchld_bb_timeout);
800+
801+
is($node->poll_query_until('postgres',
802+
"SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE " .
803+
"application_name = '010_pg_basebackup.pl' AND wait_event = 'WalSenderMain' " .
804+
"AND backend_type = 'walsender' AND query ~ 'START_REPLICATION'"),
805+
"1",
806+
"Walsender killed");
807+
808+
ok(pump_until($sigchld_bb, $sigchld_bb_timeout, \$sigchld_bb_stderr,
809+
qr/background process terminated unexpectedly/),
810+
'background process exit message');
811+
$sigchld_bb->finish();
812+
779813
done_testing();

0 commit comments

Comments
 (0)