Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 867b38a

Browse files
committed
Fix rare EdgePartition test failure.
The partition sometimes excludes node 1 instead of node 3.
1 parent 2f75017 commit 867b38a

File tree

3 files changed

+27
-18
lines changed

3 files changed

+27
-18
lines changed

tests/lib/test_helper.py

+4-4
Original file line number | Diff line number | Diff line change
@@ -257,9 +257,9 @@ def performRandomFailure(self, node, wait=0, nodes_wait_for_commit=[], node_wait
257257
failure = FailureClass(node)
258258

259259
log.info('simulating failure {} on node "{}"'.format(FailureClass.__name__, node))
260-
return self.performFailure(failure, wait, nodes_wait_for_commit, node_wait_for_online, stop_load, nodes_assert_commit_during_failure)
260+
return self.performFailure(failure, wait, nodes_wait_for_commit, nodes_wait_for_online, stop_load, nodes_assert_commit_during_failure)
261261

262-
def performFailure(self, failure, wait=0, nodes_wait_for_commit=[], node_wait_for_online=None, stop_load=False, nodes_assert_commit_during_failure=[]):
262+
def performFailure(self, failure, wait=0, nodes_wait_for_commit=[], nodes_wait_for_online=[], stop_load=False, nodes_assert_commit_during_failure=[]):
263263

264264
time.sleep(TEST_WARMING_TIME)
265265

@@ -291,9 +291,9 @@ def performFailure(self, failure, wait=0, nodes_wait_for_commit=[], node_wait_fo
291291
self.client.get_aggregates(clean=False)
292292
self.client.stop()
293293

294-
if node_wait_for_online is not None:
294+
for node_wait_for_online in nodes_wait_for_online:
295295
self.awaitOnline(node_wait_for_online)
296-
else:
296+
if (len(nodes_wait_for_online) == 0):
297297
time.sleep(TEST_RECOVERY_TIME)
298298

299299
if stop_load:

tests/test_recovery_random.py

+14-5
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ def test_random_disasters(self):
5353
aggs_failure, aggs = self.performRandomFailure(
5454
f'node{node_number}',
5555
nodes_wait_for_commit=[n for n in range(3)],
56-
node_wait_for_online=f"dbname=regression user=postgres host={self.host_ip} port={port}",
56+
nodes_wait_for_online=[f"dbname=regression user=postgres host={self.host_ip} port={port}"],
5757
stop_load=True,
5858
nodes_assert_commit_during_failure=
5959
nodes_assert_commit_during_failure)
@@ -76,8 +76,17 @@ def test_edge_partition(self):
7676
log.info('### test_edge_partition ###')
7777

7878
aggs_failure, aggs = self.performFailure(
79-
EdgePartition('node1', 'node3'), node_wait_for_online=
80-
f"dbname=regression user=postgres host={self.host_ip} port=15434",
79+
EdgePartition('node1', 'node3'),
80+
# clique selection picks up the min mask, so in 1-2-3 sausage 12
81+
# will eventually be the live nodes. However, there is a small risk
82+
# of 3 successfully voting for 23 before 1 understands what's going
83+
# on, in which case 1 is put into recovery which doesn't finish in
84+
# 10s of the test given that the load is not stopped. This actually
85+
# happened in CI. To avoid test failure, wait for both 1 and 3 to be
86+
# online.
87+
nodes_wait_for_online=[
88+
f"dbname=regression user=postgres host={self.host_ip} port=15434",
89+
f"dbname=regression user=postgres host={self.host_ip} port=15432"],
8190
stop_load=True)
8291

8392
self.assertTrue(('commit' in aggs_failure[0]['transfer']['finish']) or
@@ -94,8 +103,8 @@ def _test_single_failure(self):
94103

95104
failure = CrashRecoverNode('node3')
96105
aggs_failure, aggs = self.performFailure(
97-
failure, node_wait_for_online=
98-
"dbname=regression user=postgres host=127.0.0.1 port=15434",
106+
failure,
107+
nodes_wait_for_online=["dbname=regression user=postgres host=127.0.0.1 port=15434"],
99108
stop_load=True)
100109

101110
self.assertCommits(aggs_failure[:2])

tests/test_referee.py

+9-9
Original file line number | Diff line number | Diff line change
@@ -76,7 +76,7 @@ def test_neighbor_restart(self):
7676
log.info('### test_neighbor_restart ###')
7777

7878
aggs_failure, aggs = self.performFailure(
79-
RestartNode('node2'), node_wait_for_online=self.dsns[1],
79+
RestartNode('node2'), nodes_wait_for_online=[self.dsns[1]],
8080
stop_load=True)
8181

8282
self.assertCommits(aggs_failure[:1])
@@ -90,8 +90,8 @@ def test_node_crash(self):
9090
log.info('### test_node_crash ###')
9191

9292
aggs_failure, aggs = self.performFailure(
93-
CrashRecoverNode('node2'), node_wait_for_online=
94-
self.dsns[1],
93+
CrashRecoverNode('node2'), nodes_wait_for_online=
94+
[self.dsns[1]],
9595
stop_load=True)
9696

9797
self.assertCommits(aggs_failure[:1])
@@ -105,8 +105,8 @@ def test_partition_referee(self):
105105
log.info('### test_partition_referee ###')
106106

107107
aggs_failure, aggs = self.performFailure(
108-
SingleNodePartition('node2'), node_wait_for_online=
109-
self.dsns[1],
108+
SingleNodePartition('node2'), nodes_wait_for_online=
109+
[self.dsns[1]],
110110
stop_load=True)
111111

112112
self.assertCommits(aggs_failure[:1])
@@ -123,8 +123,8 @@ def test_double_failure_referee(self):
123123
log.info('### test_double_failure_referee ###')
124124

125125
aggs_failure, aggs = self.performFailure(
126-
SingleNodePartition('node2'), node_wait_for_online=
127-
self.dsns[1], stop_load=True)
126+
SingleNodePartition('node2'), nodes_wait_for_online=
127+
[self.dsns[1]], stop_load=True)
128128

129129
self.assertCommits(aggs_failure[:1])
130130
self.assertNoCommits(aggs_failure[1:])
@@ -134,8 +134,8 @@ def test_double_failure_referee(self):
134134
self.assertIsolation(aggs)
135135

136136
aggs_failure, aggs = self.performFailure(
137-
SingleNodePartition('node1'), node_wait_for_online=
138-
self.dsns[0], stop_load=True)
137+
SingleNodePartition('node1'), nodes_wait_for_online=
138+
[self.dsns[0]], stop_load=True)
139139

140140
self.assertNoCommits(aggs_failure[:1])
141141
self.assertCommits(aggs_failure[1:])

0 commit comments

Comments
 (0)