
Commit ff423db

committed
Adding edges for all locked objects in deadlock detection.
And more accurate description of current implementation drawbacks.
1 parent af0374f commit ff423db

2 files changed: 36 additions, 20 deletions


pg_shardman--0.0.2.sql

Lines changed: 23 additions & 8 deletions
@@ -2137,9 +2137,13 @@ create type process as (node int, pid int);
 -- View to build lock graph which can be used to detect global deadlock.
 -- Application_name is assumed pgfdw:$system_id:$coord_pid
 -- gid is assumed $pid:$count:$sys_id:$xid:$participants_count
+-- Currently we are oblivious about lock modes and report any wait -> hold edge
+-- on the same object, and therefore might produce false loops. Furthermore,
+-- we have no idea about locking queues here. Probably it is better to use
+-- pg_blocking_pids, but it seems to ignore prepared xacts.
 CREATE VIEW lock_graph(wait, hold) AS
--- If xact is already prepared, we take node and pid of the coordinator.
 -- local dependencies
+-- If xact is already prepared, we take node and pid of the coordinator.
 SELECT
 	ROW(shardman.get_my_id(),
 		wait.pid)::shardman.process,
@@ -2152,11 +2156,19 @@ CREATE VIEW lock_graph(wait, hold) AS
 FROM pg_locks wait, pg_locks hold LEFT OUTER JOIN pg_prepared_xacts twopc
 	ON twopc.transaction=hold.transactionid
 WHERE
-	NOT wait.granted AND wait.pid IS NOT NULL AND hold.granted
-	-- this select captures waitings on xid and on, hm, tuples
-	AND (wait.transactionid=hold.transactionid OR
-		(wait.page=hold.page AND wait.tuple=hold.tuple))
-	AND (hold.pid IS NOT NULL OR twopc.gid IS NOT NULL) -- ???
+	NOT wait.granted AND wait.pid IS NOT NULL AND hold.granted AND
+	-- waiter waits for the object the holder locks
+	wait.database IS NOT DISTINCT FROM hold.database AND
+	wait.relation IS NOT DISTINCT FROM hold.relation AND
+	wait.page IS NOT DISTINCT FROM hold.page AND
+	wait.tuple IS NOT DISTINCT FROM hold.tuple AND
+	wait.virtualxid IS NOT DISTINCT FROM hold.virtualxid AND
+	wait.transactionid IS NOT DISTINCT FROM hold.transactionid AND -- waiting on xid
+	wait.classid IS NOT DISTINCT FROM hold.classid AND
+	wait.objid IS NOT DISTINCT FROM hold.objid AND
+	wait.objsubid IS NOT DISTINCT FROM hold.objsubid AND
+	-- this is most probably a truism, but who knows
+	(hold.pid IS NOT NULL OR twopc.gid IS NOT NULL)
 UNION ALL
 -- if this fdw backend is busy, potentially waiting, add edge coordinator -> fdw
 SELECT ROW(shardman.get_node_by_sysid(split_part(application_name, ':', 2)::bigint),
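The new predicate matches waiter and holder NULL-safely on every lock-target column (`IS NOT DISTINCT FROM` treats NULL as equal to NULL, unlike plain `=`). As a cross-check, here is a minimal Python sketch of the same edge-building rule; the lock dicts and helper names are illustrative stand-ins for `pg_locks` rows, not pg_shardman code:

```python
# Sketch of the wait -> hold edge rule used by the lock_graph view.
# `IS NOT DISTINCT FROM` makes NULL match NULL, which in Python
# corresponds to None == None. Field names mirror pg_locks columns.
LOCK_FIELDS = ("database", "relation", "page", "tuple",
               "virtualxid", "transactionid", "classid", "objid", "objsubid")

def same_object(wait: dict, hold: dict) -> bool:
    """True when waiter and holder reference the same lock target,
    comparing every column NULL-safely (None matches None)."""
    return all(wait.get(f) == hold.get(f) for f in LOCK_FIELDS)

def lock_graph_edges(locks):
    """Emit (waiter_pid, holder_pid) edges. Like the view, this is
    oblivious to lock modes, so false loops are possible."""
    waiting = [l for l in locks if not l["granted"] and l["pid"] is not None]
    holding = [l for l in locks if l["granted"]]
    return [(w["pid"], h["pid"])
            for w in waiting for h in holding if same_object(w, h)]
```

With plain `=` semantics, two locks whose `relation` is NULL would silently fail to match; the NULL-safe comparison keeps such edges in the graph.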
@@ -2276,7 +2288,7 @@ BEGIN
 THEN
 	IF clock_timestamp() > failure_timestamp + rm_node_timeout_sec * interval '1 sec'
 	THEN
-		RAISE NOTICE 'Removing node % because of % timeout expiration', failed_node_id, rm_node_timeout_sec;
+		RAISE NOTICE 'Removing node % because of % sec timeout expiration', failed_node_id, rm_node_timeout_sec;
 		PERFORM shardman.broadcast(format('0:SELECT shardman.rm_node(%s, force=>true);', failed_node_id));
 		PERFORM shardman.broadcast('0:SELECT shardman.recover_xacts();');
 		failed_node_id := null;
@@ -2304,7 +2316,10 @@ BEGIN
 AND loop_end - loop_begin = prev_loop_end - prev_loop_begin
 AND deadlock_path[loop_begin:loop_end] = prev_deadlock_path[prev_loop_begin:prev_loop_end]
 THEN
-	-- Try to cancel random node in loop
+	-- Try to cancel random node in loop.
+	-- If the victim is not executing an active query at the moment,
+	-- pg_cancel_backend can't do anything with the xact; because of that,
+	-- we probably need to repeat it several times.
 	victim := deadlock_path[loop_begin + ((loop_end - loop_begin)*random())::integer];
 	RAISE NOTICE 'Detect deadlock: cancel process % at node %', victim.pid, victim.node;
 	PERFORM shardman.broadcast(format('%s:SELECT pg_cancel_backend(%s);',
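Since `pg_cancel_backend` only interrupts a query that is currently executing, a single cancel can be a no-op on an idle victim. The retry idea from the comment above can be sketched in Python (a hedged illustration: `cancel_fn` and `still_deadlocked` are hypothetical stand-ins, not shardman APIs):

```python
def cancel_until_resolved(victim_pid, cancel_fn, still_deadlocked, max_attempts=5):
    """Repeat the cancel until the deadlock loop disappears or we give up.
    pg_cancel_backend has no effect while the victim is between queries,
    hence the retry loop."""
    for _ in range(max_attempts):
        cancel_fn(victim_pid)       # e.g. broadcast pg_cancel_backend(pid)
        if not still_deadlocked():  # re-collect lock graphs and re-check
            return True
    return False
```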

readme.md

Lines changed: 13 additions & 12 deletions
@@ -592,15 +592,15 @@ monitor(check_timeout_sec int = 5, rm_node_timeout_sec int = 60)
 Monitor cluster for presence of distributed deadlocks and node failures. This
 function is intended to be executed at shardlord and is redirected to shardlord
 been launched at any other node. It starts infinite loop which polls all
-clusters nodes, collecting local *lock graphs* from all nodes. Period of poll
-is specified by `check_timeout_sec` parameter (default value is 5 seconds).
-Local lock graphs are combined into global lock graph which is analyzed for the
+clusters nodes, collecting local *lock graphs* from all nodes. Period of poll is
+specified by `check_timeout_sec` parameter (default value is 5 seconds). Local
+lock graphs are combined into global lock graph which is analyzed for the
 presence of loops. A loop in the lock graph means distributed deadlock. Monitor
 function tries to resolve deadlock by canceling one or more backends involved in
 the deadlock loop (using `pg_cancel_backend` function, which doesn't actually
-terminate backend but tries to cancel current query). As far as not all
-backends are blocked in active query state, it may be needed send cancel several
-times. Canceled backend is randomly chosen within deadlock loop.
+terminate backend but tries to cancel current query). Canceled backend is
+randomly chosen within deadlock loop. Since not all deadlock members are
+stuck in 'active query' state, it might be needed to send cancel several times.

 Since local graphs collected from all nodes do not form consistent global
 snapshot, false positives are possible: edges in deadlock loop correspond to
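The loop search that `monitor` performs over the merged graph can be illustrated with a short Python sketch (a simplified model, not the actual PL/pgSQL implementation; processes are `(node, pid)` tuples and `edges` maps each waiter to its holders):

```python
def find_deadlock(edges):
    """Detect a cycle in the merged wait-for graph via depth-first search.
    `edges` maps a (node, pid) process to the processes it waits for.
    Returns one cycle as a list whose first and last entries coincide,
    or None when the graph is loop-free."""
    WHITE, GRAY, BLACK = 0, 1, 2   # unvisited / on current path / done
    color, stack = {}, []

    def dfs(v):
        color[v] = GRAY
        stack.append(v)
        for w in edges.get(v, ()):
            if color.get(w, WHITE) == GRAY:          # back edge: loop found
                return stack[stack.index(w):] + [w]
            if color.get(w, WHITE) == WHITE:
                cycle = dfs(w)
                if cycle:
                    return cycle
        color[v] = BLACK
        stack.pop()
        return None

    for v in list(edges):
        if color.get(v, WHITE) == WHITE:
            cycle = dfs(v)
            if cycle:
                return cycle
    return None
```

A returned path corresponds to the deadlock loop from which monitor then picks a random victim to cancel; because the per-node graphs are not a consistent snapshot, a reported loop may be a false positive, as the text above notes.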
@@ -627,12 +627,13 @@ if it is not available, performs voting among all nodes.
 ```plpgsql
 wipe_state(drop_slots_with_fire bool DEFAULT true)
 ```
-Remove unilaterally all publications, subscriptions and replication slots
-created on the worker node by `pg_shardman`. PostgreSQL forbids to drop
-replication slot with active connection; if `drop_slots_with_fire` is true, we
-will try to kill the walsenders before dropping the slots. Also, immediately
-after transaction commit set `synchronous_standby_names` GUC to empty string --
-this is a non-transactional action and there is a very small chance it won't be
+Remove unilaterally all publications, subscriptions, replication slots, foreign
+servers and user mappings created on the worker node by
+`pg_shardman`. PostgreSQL forbids to drop replication slot with active
+connection; if `drop_slots_with_fire` is true, we will try to kill the
+walsenders before dropping the slots. Also, immediately after transaction commit
+set `synchronous_standby_names` GUC to empty string -- this is a
+non-transactional action and there is a very small chance it won't be
 completed. You probably want to run it before `DROP EXTENSION pg_shardman`.
 Data is not touched by this command.
638639
