Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0446d3b

Browse files
committed
Backpatch 031_recovery_conflict.pl.
The prior commit showed that the introduction of recovery conflict tests was a good idea. Without these tests it's hard to know that the fix didn't break something... 031_recovery_conflict.pl was introduced in 9f8a050 and extended in 21e1844. Discussion: https://postgr.es/m/20220413002626.udl7lll7f3o7nre7@alap3.anarazel.de Backpatch: 10-14
1 parent 57c5ad1 commit 0446d3b

File tree

1 file changed

+371
-0
lines changed

1 file changed

+371
-0
lines changed
Lines changed: 371 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,371 @@
1+
# Copyright (c) 2021-2022, PostgreSQL Global Development Group
2+
3+
# Test that connections to a hot standby are correctly canceled when a
4+
# recovery conflict is detected Also, test that statistics in
5+
# pg_stat_database_conflicts are populated correctly
6+
7+
use strict;
8+
use warnings;
9+
use PostgreSQL::Test::Cluster;
10+
use PostgreSQL::Test::Utils;
11+
use Test::More;
12+
13+
14+
# Set up nodes
15+
my $node_primary = PostgreSQL::Test::Cluster->new('primary');
16+
$node_primary->init(allows_streaming => 1);
17+
18+
my $tablespace1 = "test_recovery_conflict_tblspc";
19+
20+
$node_primary->append_conf(
21+
'postgresql.conf', qq[
22+
23+
# Doesn't currently exist pre 15, but might be backpatched later
24+
#allow_in_place_tablespaces = on
25+
#temp_tablespaces = $tablespace1
26+
27+
log_temp_files = 0
28+
29+
# for deadlock test
30+
max_prepared_transactions = 10
31+
32+
# wait some to test the wait paths as well, but not long for obvious reasons
33+
max_standby_streaming_delay = 50ms
34+
35+
# Some of the recovery conflict logging code only gets exercised after
36+
# deadlock_timeout. The test doesn't rely on that additional output, but it's
37+
# nice to get some minimal coverage of that code.
38+
#log_recovery_conflict_waits = on # doesn't exist < 14.
39+
deadlock_timeout = 10ms
40+
]);
41+
$node_primary->start;
42+
43+
my $backup_name = 'my_backup';
44+
45+
# See allow_in_place_tablespaces comment above
46+
#$node_primary->safe_psql('postgres',
47+
# qq[CREATE TABLESPACE $tablespace1 LOCATION '']);
48+
49+
$node_primary->backup($backup_name);
50+
my $node_standby = PostgreSQL::Test::Cluster->new('standby');
51+
$node_standby->init_from_backup($node_primary, $backup_name,
52+
has_streaming => 1);
53+
54+
$node_standby->start;
55+
56+
my $test_db = "test_db";
57+
58+
# use a new database, to trigger database recovery conflict
59+
$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db");
60+
61+
# test schema / data
62+
my $table1 = "test_recovery_conflict_table1";
63+
my $table2 = "test_recovery_conflict_table2";
64+
$node_primary->safe_psql(
65+
$test_db, qq[
66+
CREATE TABLE ${table1}(a int, b int);
67+
INSERT INTO $table1 SELECT i % 3, 0 FROM generate_series(1,20) i;
68+
CREATE TABLE ${table2}(a int, b int);
69+
]);
70+
my $primary_lsn = $node_primary->lsn('flush');
71+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
72+
73+
74+
# a longrunning psql that we can use to trigger conflicts
75+
my $psql_timeout = IPC::Run::timer($PostgreSQL::Test::Utils::timeout_default);
76+
my %psql_standby = ('stdin' => '', 'stdout' => '');
77+
$psql_standby{run} =
78+
$node_standby->background_psql($test_db, \$psql_standby{stdin},
79+
\$psql_standby{stdout},
80+
$psql_timeout);
81+
$psql_standby{stdout} = '';
82+
83+
my $expected_conflicts = 0;
84+
85+
86+
## RECOVERY CONFLICT 1: Buffer pin conflict
87+
my $sect = "buffer pin conflict";
88+
$expected_conflicts++;
89+
90+
# Aborted INSERT on primary that will be cleaned up by vacuum. Has to be old
91+
# enough so that there's not a snapshot conflict before the buffer pin
92+
# conflict.
93+
94+
$node_primary->safe_psql(
95+
$test_db,
96+
qq[
97+
BEGIN;
98+
INSERT INTO $table1 VALUES (1,0);
99+
ROLLBACK;
100+
-- ensure flush, rollback doesn't do so
101+
BEGIN; LOCK $table1; COMMIT;
102+
]);
103+
104+
$primary_lsn = $node_primary->lsn('flush');
105+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
106+
107+
my $cursor1 = "test_recovery_conflict_cursor";
108+
109+
# DECLARE and use a cursor on standby, causing buffer with the only block of
110+
# the relation to be pinned on the standby
111+
$psql_standby{stdin} .= qq[
112+
BEGIN;
113+
DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
114+
FETCH FORWARD FROM $cursor1;
115+
];
116+
# FETCH FORWARD should have returned a 0 since all values of b in the table
117+
# are 0
118+
ok(pump_until_standby(qr/^0$/m),
119+
"$sect: cursor with conflicting pin established");
120+
121+
# to check the log starting now for recovery conflict messages
122+
my $log_location = -s $node_standby->logfile;
123+
124+
# VACUUM on the primary
125+
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
126+
127+
# Wait for catchup. Existing connection will be terminated before replay is
128+
# finished, so waiting for catchup ensures that there is no race between
129+
# encountering the recovery conflict which causes the disconnect and checking
130+
# the logfile for the terminated connection.
131+
$primary_lsn = $node_primary->lsn('flush');
132+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
133+
134+
check_conflict_log("User was holding shared buffer pin for too long");
135+
reconnect_and_clear();
136+
check_conflict_stat("bufferpin");
137+
138+
139+
## RECOVERY CONFLICT 2: Snapshot conflict
140+
$sect = "snapshot conflict";
141+
$expected_conflicts++;
142+
143+
$node_primary->safe_psql($test_db,
144+
qq[INSERT INTO $table1 SELECT i, 0 FROM generate_series(1,20) i]);
145+
$primary_lsn = $node_primary->lsn('flush');
146+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
147+
148+
# DECLARE and FETCH from cursor on the standby
149+
$psql_standby{stdin} .= qq[
150+
BEGIN;
151+
DECLARE $cursor1 CURSOR FOR SELECT b FROM $table1;
152+
FETCH FORWARD FROM $cursor1;
153+
];
154+
ok( pump_until(
155+
$psql_standby{run}, $psql_timeout,
156+
\$psql_standby{stdout}, qr/^0$/m,),
157+
"$sect: cursor with conflicting snapshot established");
158+
159+
# Do some HOT updates
160+
$node_primary->safe_psql($test_db,
161+
qq[UPDATE $table1 SET a = a + 1 WHERE a > 2;]);
162+
163+
# VACUUM, pruning those dead tuples
164+
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
165+
166+
# Wait for attempted replay of PRUNE records
167+
$primary_lsn = $node_primary->lsn('flush');
168+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
169+
170+
check_conflict_log(
171+
"User query might have needed to see row versions that must be removed");
172+
reconnect_and_clear();
173+
check_conflict_stat("snapshot");
174+
175+
176+
## RECOVERY CONFLICT 3: Lock conflict
177+
$sect = "lock conflict";
178+
$expected_conflicts++;
179+
180+
# acquire lock to conflict with
181+
$psql_standby{stdin} .= qq[
182+
BEGIN;
183+
LOCK TABLE $table1 IN ACCESS SHARE MODE;
184+
SELECT 1;
185+
];
186+
ok(pump_until_standby(qr/^1$/m), "$sect: conflicting lock acquired");
187+
188+
# DROP TABLE containing block which standby has in a pinned buffer
189+
$node_primary->safe_psql($test_db, qq[DROP TABLE $table1;]);
190+
191+
$primary_lsn = $node_primary->lsn('flush');
192+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
193+
194+
check_conflict_log("User was holding a relation lock for too long");
195+
reconnect_and_clear();
196+
check_conflict_stat("lock");
197+
198+
199+
# See allow_in_place_tablespaces comment above
200+
### RECOVERY CONFLICT 4: Tablespace conflict
201+
#$sect = "tablespace conflict";
202+
#$expected_conflicts++;
203+
#
204+
## DECLARE a cursor for a query which, with sufficiently low work_mem, will
205+
## spill tuples into temp files in the temporary tablespace created during
206+
## setup.
207+
#$psql_standby{stdin} .= qq[
208+
# BEGIN;
209+
# SET work_mem = '64kB';
210+
# DECLARE $cursor1 CURSOR FOR
211+
# SELECT count(*) FROM generate_series(1,6000);
212+
# FETCH FORWARD FROM $cursor1;
213+
# ];
214+
#ok(pump_until_standby(qr/^6000$/m),
215+
# "$sect: cursor with conflicting temp file established");
216+
#
217+
## Drop the tablespace currently containing spill files for the query on the
218+
## standby
219+
#$node_primary->safe_psql($test_db, qq[DROP TABLESPACE $tablespace1;]);
220+
#
221+
#$primary_lsn = $node_primary->lsn('flush');
222+
#$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
223+
#
224+
#check_conflict_log(
225+
# "User was or might have been using tablespace that must be dropped");
226+
#reconnect_and_clear();
227+
#check_conflict_stat("tablespace");
228+
229+
230+
## RECOVERY CONFLICT 5: Deadlock
231+
$sect = "startup deadlock";
232+
$expected_conflicts++;
233+
234+
# Generate a few dead rows, to later be cleaned up by vacuum. Then acquire a
235+
# lock on another relation in a prepared xact, so it's held continuously by
236+
# the startup process. The standby psql will block acquiring that lock while
237+
# holding a pin that vacuum needs, triggering the deadlock.
238+
$node_primary->safe_psql(
239+
$test_db,
240+
qq[
241+
CREATE TABLE $table1(a int, b int);
242+
INSERT INTO $table1 VALUES (1);
243+
BEGIN;
244+
INSERT INTO $table1(a) SELECT generate_series(1, 100) i;
245+
ROLLBACK;
246+
BEGIN;
247+
LOCK TABLE $table2;
248+
PREPARE TRANSACTION 'lock';
249+
INSERT INTO $table1(a) VALUES (170);
250+
SELECT txid_current();
251+
]);
252+
253+
$primary_lsn = $node_primary->lsn('flush');
254+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
255+
256+
$psql_standby{stdin} .= qq[
257+
BEGIN;
258+
-- hold pin
259+
DECLARE $cursor1 CURSOR FOR SELECT a FROM $table1;
260+
FETCH FORWARD FROM $cursor1;
261+
-- wait for lock held by prepared transaction
262+
SELECT * FROM $table2;
263+
];
264+
ok( pump_until(
265+
$psql_standby{run}, $psql_timeout,
266+
\$psql_standby{stdout}, qr/^1$/m,),
267+
"$sect: cursor holding conflicting pin, also waiting for lock, established"
268+
);
269+
270+
# just to make sure we're waiting for lock already
271+
ok( $node_standby->poll_query_until(
272+
'postgres', qq[
273+
SELECT 'waiting' FROM pg_locks WHERE locktype = 'relation' AND NOT granted;
274+
], 'waiting'),
275+
"$sect: lock acquisition is waiting");
276+
277+
# VACUUM will prune away rows, causing a buffer pin conflict, while standby
278+
# psql is waiting on lock
279+
$node_primary->safe_psql($test_db, qq[VACUUM $table1;]);
280+
$primary_lsn = $node_primary->lsn('flush');
281+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
282+
283+
check_conflict_log("User transaction caused buffer deadlock with recovery.");
284+
reconnect_and_clear();
285+
check_conflict_stat("deadlock");
286+
287+
# clean up for next tests
288+
$node_primary->safe_psql($test_db, qq[ROLLBACK PREPARED 'lock';]);
289+
290+
291+
# Check that expected number of conflicts show in pg_stat_database. Needs to
292+
# be tested before database is dropped, for obvious reasons.
293+
is( $node_standby->safe_psql(
294+
$test_db,
295+
qq[SELECT conflicts FROM pg_stat_database WHERE datname='$test_db';]),
296+
$expected_conflicts,
297+
qq[$expected_conflicts recovery conflicts shown in pg_stat_database]);
298+
299+
300+
## RECOVERY CONFLICT 6: Database conflict
301+
$sect = "database conflict";
302+
303+
$node_primary->safe_psql('postgres', qq[DROP DATABASE $test_db;]);
304+
305+
$primary_lsn = $node_primary->lsn('flush');
306+
$node_primary->wait_for_catchup($node_standby, 'replay', $primary_lsn);
307+
308+
check_conflict_log("User was connected to a database that must be dropped");
309+
310+
311+
# explicitly shut down psql instances gracefully - to avoid hangs or worse on
312+
# windows
313+
$psql_standby{stdin} .= "\\q\n";
314+
$psql_standby{run}->finish;
315+
316+
$node_standby->stop();
317+
$node_primary->stop();
318+
319+
320+
done_testing();
321+
322+
323+
sub pump_until_standby
324+
{
325+
my $match = shift;
326+
327+
return pump_until($psql_standby{run}, $psql_timeout,
328+
\$psql_standby{stdout}, $match);
329+
}
330+
331+
sub reconnect_and_clear
332+
{
333+
# If psql isn't dead already, tell it to quit as \q, when already dead,
334+
# causes IPC::Run to unhelpfully error out with "ack Broken pipe:".
335+
$psql_standby{run}->pump_nb();
336+
if ($psql_standby{run}->pumpable())
337+
{
338+
$psql_standby{stdin} .= "\\q\n";
339+
}
340+
$psql_standby{run}->finish;
341+
342+
# restart
343+
$psql_standby{run}->run();
344+
$psql_standby{stdin} = '';
345+
$psql_standby{stdout} = '';
346+
347+
# Run query to ensure connection has finished re-establishing
348+
$psql_standby{stdin} .= qq[SELECT 1;\n];
349+
die unless pump_until_standby(qr/^1$/m);
350+
$psql_standby{stdout} = '';
351+
}
352+
353+
sub check_conflict_log
354+
{
355+
my $message = shift;
356+
my $old_log_location = $log_location;
357+
358+
$log_location = $node_standby->wait_for_log(qr/$message/, $log_location);
359+
360+
cmp_ok($log_location, '>', $old_log_location,
361+
"$sect: logfile contains terminated connection due to recovery conflict"
362+
);
363+
}
364+
365+
sub check_conflict_stat
366+
{
367+
# Stats can't easily be checked before 15, requires waiting for stats to
368+
# be reported to stats collector and then those messages need to be
369+
# processed. Dealt with here to reduce intra-branch difference in the
370+
# tests.
371+
}

0 commit comments

Comments
 (0)