Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit cae25d1

Browse files
committed
Improve stability of recovery test 035_standby_logical_decoding
This commit tweaks a couple of things in 035_standby_logical_decoding to hopefully stabilize it:
- Autovacuum is now disabled, as it could hold a global xmin with a transaction.
- Conflicts are generated with command sequences that removed rows (on catalogs, shared or non-shared, or just plain relations) followed by a VACUUM. This was unstable because these did not check that the horizon moved between the SQL commands and the VACUUM.

The logic is refactored as follows, to ensure that VACUUM removes dead rows before testing for slot invalidation on a standby (idea suggested by Andres Freund):
-- Grab the current horizon.
-- Launch SQL commands removing rows.
-- Check that the snapshot horizon has been updated.
-- Launch VACUUM on the relation whose rows have been removed by the first step.

Note that there are still some issues because of standby snapshot WAL records generated by the bgwriter, but this makes the test much more stable.

Per reports from buildfarm members dikkop and skink, with analysis and tests from Alexander Lakhin.

While on it, fix a couple of incorrect comments.

Author: Bertrand Drouvot
Reviewed-by: Alexander Lakhin, Michael Paquier
Discussion: https://postgr.es/m/OSZPR01MB6310ED3CEDB531BCEDBC6AF2FD479@OSZPR01MB6310.jpnprd01.prod.outlook.com
Discussion: https://postgr.es/m/bf67e076-b163-9ba3-4ade-b9fc51a3a8f6@gmail.com
Backpatch-through: 16
1 parent 07b53de commit cae25d1

File tree

1 file changed

+56
-34
lines changed

1 file changed

+56
-34
lines changed

src/test/recovery/t/035_standby_logical_decoding.pl

Lines changed: 56 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,40 @@ sub check_for_invalidation
240240
) or die "Timed out waiting confl_active_logicalslot to be updated";
241241
}
242242

243+
# Launch $sql query, wait for a new snapshot that has a newer horizon and
244+
# launch a VACUUM. $vac_option is the set of options to be passed to the
245+
# VACUUM command, $sql the sql to launch before triggering the vacuum and
246+
# $to_vac the relation to vacuum.
247+
#
248+
# Note that pg_current_snapshot() is used to get the horizon. It does
249+
# not generate a Transaction/COMMIT WAL record, decreasing the risk of
250+
# seeing an xl_running_xacts that would advance an active replication slot's
251+
# catalog_xmin. Advancing the active replication slot's catalog_xmin
252+
# would break some tests that expect the active slot to conflict with
253+
# the catalog xmin horizon.
254+
sub wait_until_vacuum_can_remove
255+
{
256+
my ($vac_option, $sql, $to_vac) = @_;
257+
258+
# Get the current xid horizon.
259+
my $xid_horizon = $node_primary->safe_psql('testdb',
260+
qq[select pg_snapshot_xmin(pg_current_snapshot());]);
261+
262+
# Launch our sql.
263+
$node_primary->safe_psql('testdb', qq[$sql]);
264+
265+
# Wait until we get a newer horizon.
266+
$node_primary->poll_query_until('testdb',
267+
"SELECT (select pg_snapshot_xmin(pg_current_snapshot())::text::int - $xid_horizon) > 0"
268+
) or die "new snapshot does not have a newer horizon";
269+
270+
# Launch the vacuum command and insert into flush_wal (see CREATE
271+
# TABLE flush_wal for the reason).
272+
$node_primary->safe_psql(
273+
'testdb', qq[VACUUM $vac_option verbose $to_vac;
274+
INSERT INTO flush_wal DEFAULT VALUES;]);
275+
}
276+
243277
########################
244278
# Initialize primary node
245279
########################
@@ -250,6 +284,7 @@ sub check_for_invalidation
250284
wal_level = 'logical'
251285
max_replication_slots = 4
252286
max_wal_senders = 4
287+
autovacuum = off
253288
});
254289
$node_primary->dump_info;
255290
$node_primary->start;
@@ -470,13 +505,9 @@ sub check_for_invalidation
470505
0, 1);
471506

472507
# This should trigger the conflict
473-
$node_primary->safe_psql(
474-
'testdb', qq[
475-
CREATE TABLE conflict_test(x integer, y text);
476-
DROP TABLE conflict_test;
477-
VACUUM full pg_class;
478-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
479-
]);
508+
wait_until_vacuum_can_remove(
509+
'full', 'CREATE TABLE conflict_test(x integer, y text);
510+
DROP TABLE conflict_test;', 'pg_class');
480511

481512
$node_primary->wait_for_replay_catchup($node_standby);
482513

@@ -550,18 +581,15 @@ sub check_for_invalidation
550581
my $logstart = -s $node_standby->logfile;
551582

552583
# One way to produce recovery conflict is to create/drop a relation and
553-
# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
584+
# launch a vacuum on pg_class with hot_standby_feedback turned off on the
585+
# standby.
554586
reactive_slots_change_hfs_and_wait_for_xmins('vacuum_full_', 'row_removal_',
555587
0, 1);
556588

557589
# This should trigger the conflict
558-
$node_primary->safe_psql(
559-
'testdb', qq[
560-
CREATE TABLE conflict_test(x integer, y text);
561-
DROP TABLE conflict_test;
562-
VACUUM pg_class;
563-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
564-
]);
590+
wait_until_vacuum_can_remove(
591+
'', 'CREATE TABLE conflict_test(x integer, y text);
592+
DROP TABLE conflict_test;', 'pg_class');
565593

566594
$node_primary->wait_for_replay_catchup($node_standby);
567595

@@ -587,19 +615,16 @@ sub check_for_invalidation
587615
# get the position to search from in the standby logfile
588616
$logstart = -s $node_standby->logfile;
589617

590-
# One way to produce recovery conflict is to create/drop a relation and
591-
# launch a vacuum on pg_class with hot_standby_feedback turned off on the standby.
618+
# One way to produce recovery conflict on a shared catalog table is to
619+
# create/drop a role and launch a vacuum on pg_authid with
620+
# hot_standby_feedback turned off on the standby.
592621
reactive_slots_change_hfs_and_wait_for_xmins('row_removal_',
593622
'shared_row_removal_', 0, 1);
594623

595624
# Trigger the conflict
596-
$node_primary->safe_psql(
597-
'testdb', qq[
598-
CREATE ROLE create_trash;
599-
DROP ROLE create_trash;
600-
VACUUM pg_authid;
601-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
602-
]);
625+
wait_until_vacuum_can_remove(
626+
'', 'CREATE ROLE create_trash;
627+
DROP ROLE create_trash;', 'pg_authid');
603628

604629
$node_primary->wait_for_replay_catchup($node_standby);
605630

@@ -630,14 +655,11 @@ sub check_for_invalidation
630655
'no_conflict_', 0, 1);
631656

632657
# This should not trigger a conflict
633-
$node_primary->safe_psql(
634-
'testdb', qq[
635-
CREATE TABLE conflict_test(x integer, y text);
636-
INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
637-
UPDATE conflict_test set x=1, y=1;
638-
VACUUM conflict_test;
639-
INSERT INTO flush_wal DEFAULT VALUES; -- see create table flush_wal
640-
]);
658+
wait_until_vacuum_can_remove(
659+
'', 'CREATE TABLE conflict_test(x integer, y text);
660+
INSERT INTO conflict_test(x,y) SELECT s, s::text FROM generate_series(1,4) s;
661+
UPDATE conflict_test set x=1, y=1;', 'conflict_test');
662+
641663
$node_primary->wait_for_replay_catchup($node_standby);
642664

643665
# message should not be issued
@@ -670,7 +692,7 @@ sub check_for_invalidation
670692

671693
##################################################
672694
# Recovery conflict: Invalidate conflicting slots, including in-use slots
673-
# Scenario 4: conflict due to on-access pruning.
695+
# Scenario 5: conflict due to on-access pruning.
674696
##################################################
675697

676698
# get the position to search from in the standby logfile
@@ -710,7 +732,7 @@ sub check_for_invalidation
710732

711733
##################################################
712734
# Recovery conflict: Invalidate conflicting slots, including in-use slots
713-
# Scenario 5: incorrect wal_level on primary.
735+
# Scenario 6: incorrect wal_level on primary.
714736
##################################################
715737

716738
# get the position to search from in the standby logfile

0 commit comments

Comments
 (0)