Commit 9be4ce4

Make deadlock-parallel isolation test more robust.
This test failed fairly reproducibly on some CLOBBER_CACHE_ALWAYS buildfarm animals. The cause seems to be that if a parallel worker is slow enough to reach its lock wait, it may not be released by the first deadlock check run, and then later deadlock checks might decide to unblock the d2 session instead of the d1 session, leaving us in an undetected deadlock state (since the isolationtester client is waiting for d1 to complete first).

Fix by introducing an additional lock wait at the end of the d2a1 step, ensuring that the deadlock checker will recognize that d1 has to be unblocked before d2a1 completes.

Also reduce max_parallel_workers_per_gather to 3 in this test. With the default max_worker_processes value, we were only getting one parallel worker for the d2a1 step, which is not the case I hoped to test. We should get 3 for d1a2 and 2 for d2a1, as the code stands; and maybe 3 for d2a1 if somebody figures out why the last parallel worker slot isn't free already.

Discussion: https://postgr.es/m/22195.1566077308@sss.pgh.pa.us
1 parent d78d452 commit 9be4ce4
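For context, the lock_share() and lock_excl() calls used throughout this test are thin wrappers around PostgreSQL's transaction-scoped advisory-lock functions. Only the first line of the spec's setup block is visible in the diff below, so the following is a hedged sketch of what those helpers presumably look like, for orientation rather than the verbatim definitions:

    -- Hypothetical reconstruction of the helpers created in the spec's setup block.
    -- The first argument is the advisory-lock key; the second is unused by the lock
    -- call and exists only so the function can be applied to every row of bigt,
    -- forcing each parallel worker to acquire the lock.  PARALLEL SAFE is what
    -- allows parallel workers to run them at all.
    create function lock_share(int, int) returns int language sql as
      'select pg_advisory_xact_lock_shared($1); select 1;' parallel safe;
    create function lock_excl(int, int) returns int language sql as
      'select pg_advisory_xact_lock($1); select 1;' parallel safe;

Read this way, d1a1's new lock_excl(3,x) call means session d1 holds advisory lock 3 exclusively for the life of its transaction, which is exactly what the added final step of d2a1 waits on.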

File tree

2 files changed, +43 -12 lines

src/test/isolation/expected/deadlock-parallel.out

+13 -6

@@ -1,10 +1,10 @@
 Parsed test spec with 4 sessions
 
 starting permutation: d1a1 d2a2 e1l e2l d1a2 d2a1 d1c e1c d2c e2c
-step d1a1: SELECT lock_share(1,x) FROM bigt LIMIT 1;
-lock_share
+step d1a1: SELECT lock_share(1,x), lock_excl(3,x) FROM bigt LIMIT 1;
+lock_share lock_excl
 
-1
+1 1
 step d2a2: select lock_share(2,x) FROM bigt LIMIT 1;
 lock_share
 
@@ -16,15 +16,19 @@ step d1a2: SET force_parallel_mode = on;
 SET parallel_tuple_cost = 0;
 SET min_parallel_table_scan_size = 0;
 SET parallel_leader_participation = off;
-SET max_parallel_workers_per_gather = 4;
+SET max_parallel_workers_per_gather = 3;
 SELECT sum(lock_share(2,x)) FROM bigt; <waiting ...>
 step d2a1: SET force_parallel_mode = on;
 SET parallel_setup_cost = 0;
 SET parallel_tuple_cost = 0;
 SET min_parallel_table_scan_size = 0;
 SET parallel_leader_participation = off;
-SET max_parallel_workers_per_gather = 4;
-SELECT sum(lock_share(1,x)) FROM bigt; <waiting ...>
+SET max_parallel_workers_per_gather = 3;
+SELECT sum(lock_share(1,x)) FROM bigt;
+SET force_parallel_mode = off;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+SELECT lock_share(3,x) FROM bigt LIMIT 1; <waiting ...>
 step d1a2: <... completed>
 sum
 
@@ -38,6 +42,9 @@ step d2a1: <... completed>
 sum
 
 10000
+lock_share
+
+1
 step e1c: COMMIT;
 step d2c: COMMIT;
 step e2l: <... completed>

src/test/isolation/specs/deadlock-parallel.spec

+30 -6

@@ -15,6 +15,25 @@
 # The deadlock detector resolves the deadlock by reversing the d1-e2 edge,
 # unblocking d1.
 
+# However ... it's not actually that well-defined whether the deadlock
+# detector will prefer to unblock d1 or d2. It depends on which backend
+# is first to run DeadLockCheck after the deadlock condition is created:
+# that backend will search outwards from its own wait condition, and will
+# first find a loop involving the *other* lock. We encourage that to be
+# one of the d2a1 parallel workers, which will therefore unblock d1a2
+# workers, by setting a shorter deadlock_timeout in session d2. But on
+# slow machines, one or more d1a2 workers may not yet have reached their
+# lock waits, so that they're not unblocked by the first DeadLockCheck.
+# The next DeadLockCheck may choose to unblock the d2a1 workers instead,
+# which would allow d2a1 to complete before d1a2, causing the test to
+# freeze up because isolationtester isn't expecting that completion order.
+# (In effect, we have an undetectable deadlock because d2 is waiting for
+# d1's completion, but on the client side.) To fix this, introduce an
+# additional lock (advisory lock 3), which is initially taken by d1 and
+# then d2a1 will wait for it after completing the main part of the test.
+# In this way, the deadlock detector can see that d1 must be completed
+# first, regardless of timing.
+
 setup
 {
 create function lock_share(int,int) returns int language sql as
@@ -39,15 +58,15 @@ setup { BEGIN isolation level repeatable read;
 SET force_parallel_mode = off;
 SET deadlock_timeout = '10s';
 }
-# this lock will be taken in the leader, so it will persist:
-step "d1a1" { SELECT lock_share(1,x) FROM bigt LIMIT 1; }
+# these locks will be taken in the leader, so they will persist:
+step "d1a1" { SELECT lock_share(1,x), lock_excl(3,x) FROM bigt LIMIT 1; }
 # this causes all the parallel workers to take locks:
 step "d1a2" { SET force_parallel_mode = on;
 SET parallel_setup_cost = 0;
 SET parallel_tuple_cost = 0;
 SET min_parallel_table_scan_size = 0;
 SET parallel_leader_participation = off;
-SET max_parallel_workers_per_gather = 4;
+SET max_parallel_workers_per_gather = 3;
 SELECT sum(lock_share(2,x)) FROM bigt; }
 step "d1c" { COMMIT; }
 
@@ -58,14 +77,19 @@ setup { BEGIN isolation level repeatable read;
 }
 # this lock will be taken in the leader, so it will persist:
 step "d2a2" { select lock_share(2,x) FROM bigt LIMIT 1; }
-# this causes all the parallel workers to take locks:
+# this causes all the parallel workers to take locks;
+# after which, make the leader take lock 3 to prevent client-driven deadlock
 step "d2a1" { SET force_parallel_mode = on;
 SET parallel_setup_cost = 0;
 SET parallel_tuple_cost = 0;
 SET min_parallel_table_scan_size = 0;
 SET parallel_leader_participation = off;
-SET max_parallel_workers_per_gather = 4;
-SELECT sum(lock_share(1,x)) FROM bigt; }
+SET max_parallel_workers_per_gather = 3;
+SELECT sum(lock_share(1,x)) FROM bigt;
+SET force_parallel_mode = off;
+RESET parallel_setup_cost;
+RESET parallel_tuple_cost;
+SELECT lock_share(3,x) FROM bigt LIMIT 1; }
 step "d2c" { COMMIT; }
 
 session "e1"
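The ordering guarantee added by advisory lock 3 can be reproduced outside the isolation framework with two plain psql sessions. This is a minimal illustrative sketch using the core advisory-lock functions directly rather than the test's wrappers:

    -- Session A stands in for d1: it holds advisory lock 3 exclusively for the
    -- duration of its transaction, as d1a1 now does via lock_excl(3,x).
    BEGIN;
    SELECT pg_advisory_xact_lock(3);

    -- Session B stands in for the tail of d2a1: a shared request on the same key
    -- blocks until session A's transaction ends, so B cannot complete first.
    BEGIN;
    SELECT pg_advisory_xact_lock_shared(3);  -- blocks here

    -- Back in session A: committing releases the lock and lets session B continue.
    COMMIT;

Because d2a1 now ends with exactly this kind of wait, the wait-for graph always shows d2 blocked behind d1, so any resolution the deadlock checker picks must let d1 complete first, which is the completion order isolationtester expects.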
