|
333 | 333 | $node_standby3->append_conf('postgresql.conf', "primary_slot_name = 'rep3'");
|
334 | 334 | $node_standby3->start;
|
335 | 335 | $node_primary3->wait_for_catchup($node_standby3);
|
336 |
| -my $senderpid = $node_primary3->safe_psql('postgres', |
337 |
| - "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'"); |
338 |
| - |
339 |
| -# We've seen occasional cases where multiple walsender pids are active. An |
340 |
| -# immediate shutdown may hide evidence of a locking bug. So if multiple |
341 |
| -# walsenders are observed, shut down in fast mode, and collect some more |
342 |
| -# information. |
343 |
| -if (not like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid")) |
| 336 | + |
| 337 | +my $senderpid; |
| 338 | + |
| 339 | +# We've seen occasional cases where multiple walsender pids are active. It |
| 340 | +# could be that we're just observing process shutdown being slow. To collect |
| 341 | +# more information, retry a couple of times, printing a bit of debugging
| 342 | +# information on each iteration. For now, report a test failure even if
| 343 | +# later iterations succeed.
| 344 | +my $i = 0; |
| 345 | +while (1) |
344 | 346 | {
|
345 | 347 | my ($stdout, $stderr);
|
| 348 | + |
| 349 | + $senderpid = $node_primary3->safe_psql('postgres', |
| 350 | + "SELECT pid FROM pg_stat_activity WHERE backend_type = 'walsender'"); |
| 351 | + |
| 352 | + last if like($senderpid, qr/^[0-9]+$/, "have walsender pid $senderpid"); |
| 353 | + |
| 354 | + # show information about all active connections |
346 | 355 | $node_primary3->psql('postgres',
|
347 | 356 | "\\a\\t\nSELECT * FROM pg_stat_activity",
|
348 | 357 | stdout => \$stdout, stderr => \$stderr);
|
349 | 358 | diag $stdout, $stderr;
|
350 |
| - $node_primary3->stop('fast'); |
351 |
| - $node_standby3->stop('fast'); |
352 |
| - die "could not determine walsender pid, can't continue"; |
| 359 | + |
| 360 | + # unlikely that the problem would resolve after 15s, so give up at that point
| 361 | + if ($i++ == 150) |
| 362 | + { |
| 363 | + # An immediate shutdown may hide evidence of a locking bug. If |
| 364 | + # retrying didn't resolve the issue, shut down in fast mode. |
| 365 | + $node_primary3->stop('fast'); |
| 366 | + $node_standby3->stop('fast'); |
| 367 | + die "could not determine walsender pid, can't continue"; |
| 368 | + } |
| 369 | + |
| 370 | + usleep(100_000); |
353 | 371 | }
|
354 | 372 |
|
355 | 373 | my $receiverpid = $node_standby3->safe_psql('postgres',
|
|
0 commit comments