Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit fb093e4

Browse files
Allow PostgresNode.pm tests to wait for catchup
Add methods to the core test framework PostgresNode.pm to allow us to test that standby nodes have caught up with the master, as well as basic LSN handling. Used in tests recovery/t/001_stream_rep.pl and recovery/t/004_timeline_switch.pl. Craig Ringer, reviewed by Aleksander Alekseev and Simon Riggs
1 parent 579f700 commit fb093e4

File tree

3 files changed

+175
-25
lines changed

3 files changed

+175
-25
lines changed

src/test/perl/PostgresNode.pm

Lines changed: 170 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1121,7 +1121,6 @@ sub psql
11211121
my $exc_save = $@;
11221122
if ($exc_save)
11231123
{
1124-
11251124
# IPC::Run::run threw an exception. re-throw unless it's a
11261125
# timeout, which we'll handle by testing is_expired
11271126
die $exc_save
@@ -1173,7 +1172,7 @@ sub psql
11731172
if $ret == 1;
11741173
die "connection error: '$$stderr'\nwhile running '@psql_params'"
11751174
if $ret == 2;
1176-
die "error running SQL: '$$stderr'\nwhile running '@psql_params'"
1175+
die "error running SQL: '$$stderr'\nwhile running '@psql_params' with sql '$sql'"
11771176
if $ret == 3;
11781177
die "psql returns $ret: '$$stderr'\nwhile running '@psql_params'";
11791178
}
@@ -1325,6 +1324,175 @@ sub run_log
13251324
TestLib::run_log(@_);
13261325
}
13271326

1327+
=pod $node->lsn(mode)
1328+
1329+
Look up xlog positions on the server:
1330+
1331+
* insert position (master only, error on replica)
1332+
* write position (master only, error on replica)
1333+
* flush position
1334+
* receive position (always undef on master)
1335+
* replay position
1336+
1337+
mode must be specified.
1338+
1339+
=cut
1340+
1341+
sub lsn
1342+
{
1343+
my ($self, $mode) = @_;
1344+
my %modes = ('insert' => 'pg_current_xlog_insert_location()',
1345+
'flush' => 'pg_current_xlog_flush_location()',
1346+
'write' => 'pg_current_xlog_location()',
1347+
'receive' => 'pg_last_xlog_receive_location()',
1348+
'replay' => 'pg_last_xlog_replay_location()');
1349+
1350+
$mode = '<undef>' if !defined($mode);
1351+
die "unknown mode for 'lsn': '$mode', valid modes are " . join(', ', keys %modes)
1352+
if !defined($modes{$mode});
1353+
1354+
my $result = $self->safe_psql('postgres', "SELECT $modes{$mode}");
1355+
chomp($result);
1356+
if ($result eq '')
1357+
{
1358+
return undef;
1359+
}
1360+
else
1361+
{
1362+
return $result;
1363+
}
1364+
}
1365+
1366+
=pod $node->wait_for_catchup(standby_name, mode, target_lsn)
1367+
1368+
Wait for the node with application_name standby_name (usually from node->name)
1369+
until its replication position in pg_stat_replication equals or passes the
1370+
supplied target_lsn. By default
1371+
the replay_location is waited for, but 'mode' may be specified to wait for any
1372+
of sent|write|flush|replay.
1373+
1374+
If there is no active replication connection from this peer, waits until
1375+
poll_query_until times out.
1376+
1377+
Requires that the 'postgres' db exists and is accessible.
1378+
1379+
target_lsn may be any arbitrary lsn, but is typically $master_node->lsn('insert').
1380+
1381+
This is not a test. It die()s on failure.
1382+
1383+
=cut
1384+
1385+
sub wait_for_catchup
1386+
{
1387+
my ($self, $standby_name, $mode, $target_lsn) = @_;
1388+
$mode = defined($mode) ? $mode : 'replay';
1389+
my %valid_modes = ( 'sent' => 1, 'write' => 1, 'flush' => 1, 'replay' => 1 );
1390+
die "unknown mode $mode for 'wait_for_catchup', valid modes are " . join(', ', keys(%valid_modes)) unless exists($valid_modes{$mode});
1391+
# Allow passing of a PostgresNode instance as shorthand
1392+
if ( blessed( $standby_name ) && $standby_name->isa("PostgresNode") )
1393+
{
1394+
$standby_name = $standby_name->name;
1395+
}
1396+
die 'target_lsn must be specified' unless defined($target_lsn);
1397+
print "Waiting for replication conn " . $standby_name . "'s " . $mode . "_location to pass " . $target_lsn . " on " . $self->name . "\n";
1398+
my $query = qq[SELECT '$target_lsn' <= ${mode}_location FROM pg_catalog.pg_stat_replication WHERE application_name = '$standby_name';];
1399+
$self->poll_query_until('postgres', $query)
1400+
or die "timed out waiting for catchup, current position is " . ($self->safe_psql('postgres', $query) || '(unknown)');
1401+
print "done\n";
1402+
}
1403+
1404+
=pod $node->wait_for_slot_catchup(slot_name, mode, target_lsn)
1405+
1406+
Wait for the named replication slot to equal or pass the supplied target_lsn.
1407+
The position used is the restart_lsn unless mode is given, in which case it may
1408+
be 'restart' or 'confirmed_flush'.
1409+
1410+
Requires that the 'postgres' db exists and is accessible.
1411+
1412+
This is not a test. It die()s on failure.
1413+
1414+
If the slot is not active, will time out after poll_query_until's timeout.
1415+
1416+
target_lsn may be any arbitrary lsn, but is typically $master_node->lsn('insert').
1417+
1418+
Note that for logical slots, restart_lsn is held down by the oldest in-progress tx.
1419+
1420+
=cut
1421+
1422+
sub wait_for_slot_catchup
1423+
{
1424+
my ($self, $slot_name, $mode, $target_lsn) = @_;
1425+
$mode = defined($mode) ? $mode : 'restart';
1426+
if (!($mode eq 'restart' || $mode eq 'confirmed_flush'))
1427+
{
1428+
die "valid modes are restart, confirmed_flush";
1429+
}
1430+
die 'target lsn must be specified' unless defined($target_lsn);
1431+
print "Waiting for replication slot " . $slot_name . "'s " . $mode . "_lsn to pass " . $target_lsn . " on " . $self->name . "\n";
1432+
my $query = qq[SELECT '$target_lsn' <= ${mode}_lsn FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name';];
1433+
$self->poll_query_until('postgres', $query)
1434+
or die "timed out waiting for catchup, current position is " . ($self->safe_psql('postgres', $query) || '(unknown)');
1435+
print "done\n";
1436+
}
1437+
1438+
=pod $node->query_hash($dbname, $query, @columns)
1439+
1440+
Execute $query on $dbname, replacing any appearance of the string __COLUMNS__
1441+
within the query with a comma-separated list of @columns.
1442+
1443+
If __COLUMNS__ does not appear in the query, its result columns must EXACTLY
1444+
match the order and number (but not necessarily alias) of supplied @columns.
1445+
1446+
The query must return zero or one rows.
1447+
1448+
Return a hash-ref representation of the results of the query, with any empty
1449+
or null results as defined keys with an empty-string value. There is no way
1450+
to differentiate between null and empty-string result fields.
1451+
1452+
If the query returns zero rows, return a hash with all columns empty. There
1453+
is no way to differentiate between zero rows returned and a row with only
1454+
null columns.
1455+
1456+
=cut
1457+
1458+
sub query_hash
1459+
{
1460+
my ($self, $dbname, $query, @columns) = @_;
1461+
die 'calls in array context for multi-row results not supported yet' if (wantarray);
1462+
# Replace __COLUMNS__ if found
1463+
substr($query, index($query, '__COLUMNS__'), length('__COLUMNS__')) = join(', ', @columns)
1464+
if index($query, '__COLUMNS__') >= 0;
1465+
my $result = $self->safe_psql($dbname, $query);
1466+
# hash slice, see http://stackoverflow.com/a/16755894/398670 .
1467+
#
1468+
# Fills the hash with empty strings produced by x-operator element
1469+
# duplication if result is an empty row
1470+
#
1471+
my %val;
1472+
@val{@columns} = $result ne '' ? split(qr/\|/, $result) : ('',) x scalar(@columns);
1473+
return \%val;
1474+
}
1475+
1476+
=pod $node->slot(slot_name)
1477+
1478+
Return hash-ref of replication slot data for the named slot, or a hash-ref with
1479+
all values '' if not found. Does not differentiate between null and empty string
1480+
for fields, no field is ever undef.
1481+
1482+
The restart_lsn and confirmed_flush_lsn fields are returned verbatim, and also
1483+
as a 2-list of [highword, lowword] integers. Since we rely on Perl 5.8.8 we can't
1483+
"use bigint", it's from 5.20, and we can't assume we have Math::BigInt from CPAN
1485+
either.
1486+
1487+
=cut
1488+
1489+
sub slot
1490+
{
1491+
my ($self, $slot_name) = @_;
1492+
my @columns = ('plugin', 'slot_type', 'datoid', 'database', 'active', 'active_pid', 'xmin', 'catalog_xmin', 'restart_lsn');
1493+
return $self->query_hash('postgres', "SELECT __COLUMNS__ FROM pg_catalog.pg_replication_slots WHERE slot_name = '$slot_name'", @columns);
1494+
}
1495+
13281496
=pod
13291497
13301498
=back

src/test/recovery/t/001_stream_rep.pl

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,16 +40,8 @@
4040
"CREATE TABLE tab_int AS SELECT generate_series(1,1002) AS a");
4141

4242
# Wait for standbys to catch up
43-
my $applname_1 = $node_standby_1->name;
44-
my $applname_2 = $node_standby_2->name;
45-
my $caughtup_query =
46-
"SELECT pg_current_xlog_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$applname_1';";
47-
$node_master->poll_query_until('postgres', $caughtup_query)
48-
or die "Timed out while waiting for standby 1 to catch up";
49-
$caughtup_query =
50-
"SELECT pg_last_xlog_replay_location() <= replay_location FROM pg_stat_replication WHERE application_name = '$applname_2';";
51-
$node_standby_1->poll_query_until('postgres', $caughtup_query)
52-
or die "Timed out while waiting for standby 2 to catch up";
43+
$node_master->wait_for_catchup($node_standby_1, 'replay', $node_master->lsn('insert'));
44+
$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $node_standby_1->lsn('replay'));
5345

5446
my $result =
5547
$node_standby_1->safe_psql('postgres', "SELECT count(*) FROM tab_int");

src/test/recovery/t/004_timeline_switch.pl

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,9 @@
3232
# Create some content on master
3333
$node_master->safe_psql('postgres',
3434
"CREATE TABLE tab_int AS SELECT generate_series(1,1000) AS a");
35-
my $until_lsn =
36-
$node_master->safe_psql('postgres', "SELECT pg_current_xlog_location();");
3735

3836
# Wait until standby has replayed enough data on standby 1
39-
my $caughtup_query =
40-
"SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()";
41-
$node_standby_1->poll_query_until('postgres', $caughtup_query)
42-
or die "Timed out while waiting for standby to catch up";
37+
$node_master->wait_for_catchup($node_standby_1, 'replay', $node_master->lsn('write'));
4338

4439
# Stop and remove master, and promote standby 1, switching it to a new timeline
4540
$node_master->teardown_node;
@@ -50,7 +45,7 @@
5045
my $connstr_1 = $node_standby_1->connstr;
5146
$node_standby_2->append_conf(
5247
'recovery.conf', qq(
53-
primary_conninfo='$connstr_1'
48+
primary_conninfo='$connstr_1 application_name=@{[$node_standby_2->name]}'
5449
standby_mode=on
5550
recovery_target_timeline='latest'
5651
));
@@ -60,12 +55,7 @@
6055
# to ensure that the timeline switch has been done.
6156
$node_standby_1->safe_psql('postgres',
6257
"INSERT INTO tab_int VALUES (generate_series(1001,2000))");
63-
$until_lsn = $node_standby_1->safe_psql('postgres',
64-
"SELECT pg_current_xlog_location();");
65-
$caughtup_query =
66-
"SELECT '$until_lsn'::pg_lsn <= pg_last_xlog_replay_location()";
67-
$node_standby_2->poll_query_until('postgres', $caughtup_query)
68-
or die "Timed out while waiting for standby to catch up";
58+
$node_standby_1->wait_for_catchup($node_standby_2, 'replay', $node_standby_1->lsn('write'));
6959

7060
my $result =
7161
$node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");

0 commit comments

Comments
 (0)