Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit e7cfaa5

Browse files
committed
Add monitor_deadlocks function
1 parent 9bc108b commit e7cfaa5

File tree

1 file changed

+40
-25
lines changed

1 file changed

+40
-25
lines changed

pg_shardman--1.0.sql

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1996,34 +1996,49 @@ END;
19961996
$$ LANGUAGE plpgsql;
19971997

19981998

1999-
-- Detect a distributed deadlock and return the set of processes involved in it. If there is no deadlock then this view is empty.
2000-
--
2001-
-- This query is based on the algorithm described by Knuth for detecting a cycle in a linked list. In one column, keep track of the children,
2002-
-- the children's children, the children's children's children, etc. In another column, keep track of the grandchildren, the grandchildren's grandchildren,
2003-
-- the grandchildren's grandchildren's grandchildren, etc.
2004-
--
2005-
-- For the initial selection, the distance between Child and Grandchild columns is 1. Every selection from union all increases the depth of Child by 1, and that of Grandchild by 2.
2006-
-- The distance between them increases by 1.
2007-
--
2008-
-- If there is any loop, since the distance only increases by 1 each time, at some point after Child is in the loop, the distance will be a multiple of the cycle length.
2009-
-- When that happens, the Child and the Grandchild columns are the same. Use that as an additional condition to stop the recursion, and detect it in the rest of your code as an error.
2010-
CREATE VIEW deadlock AS
1999+
-- Detects a distributed deadlock and returns the path in the lock graph that forms the deadlock loop
2000+
CREATE FUNCTION detect_deadlock() RETURNS shardman.process[] AS $$
-- Walks the global lock graph and returns one path (array of processes) whose
-- last element already occurs earlier in the same path, i.e. a path that closes a loop.
-- NOTE(review): as a LANGUAGE sql function with a non-set return type this
-- presumably yields only the first qualifying row, or NULL when no row has Loop set -- confirm.
20112001
-- LinkTable: one row per lock-graph edge; Parent is the waiting process, Child the holding one.
WITH RECURSIVE LinkTable AS (SELECT wait AS Parent, hold AS Child FROM shardman.deserialize_lock_graph(shardman.global_lock_graph())),
20122002
cte AS (
2013-
SELECT lt1.Parent, lt1.Child, lt2.Child AS Grandchild
2014-
FROM LinkTable lt1
2015-
INNER JOIN LinkTable lt2 on lt2.Parent = lt1.Child
2016-
UNION ALL
2017-
SELECT cte.Parent, lt1.Child, lt3.Child AS Grandchild
2018-
FROM cte
2019-
INNER JOIN LinkTable lt1 ON lt1.Parent = cte.Child
2020-
INNER JOIN LinkTable lt2 ON lt2.Parent = cte.Grandchild
2021-
INNER JOIN LinkTable lt3 ON lt3.Parent = lt2.Child
2022-
WHERE cte.Child <> cte.Grandchild
2003+
-- Seed: every edge starts its own path; AllParents initially holds just the edge's Child.
SELECT Child, Parent, ARRAY[Child] AS AllParents, false AS Loop
2004+
FROM LinkTable
2005+
UNION ALL
2006+
-- Step: follow an edge whose Parent is the path's current Child and append that edge's Child.
-- Loop becomes true when the appended process was already present in the path.
SELECT c.Child, c.Parent, p.AllParents||c.Child, c.Child=ANY(p.AllParents)
2007+
-- "NOT p.Loop" stops extending a path once it has closed a loop, which bounds the recursion.
FROM LinkTable c JOIN cte p ON c.Parent = p.Child AND NOT p.Loop
20232008
)
2024-
SELECT DISTINCT Parent
2025-
FROM cte
2026-
WHERE Child = Grandchild;
2009+
-- Only paths that closed a loop are reported; their last element repeats an earlier one.
SELECT AllParents FROM cte WHERE Loop;
2010+
$$ LANGUAGE sql;
2011+
2012+
-- Monitor the cluster for the presence of distributed deadlocks and cancel the corresponding queries
2013+
CREATE FUNCTION monitor_deadlocks(timeout_sec int = 5) RETURNS void AS $$
-- Polls shardman.detect_deadlock() in an endless loop and, when the reported
-- path shares at least one process with the path reported by the previous
-- poll, cancels the backend of one randomly chosen process of the loop.
--
-- Parameters:
--   timeout_sec  pause between two polls, in seconds. Must be non-negative and
--                not NULL; otherwise the loop would spin without any pause
--                (pg_sleep returns immediately for NULL or negative input).
--
-- Returns: void. On the node that executes the loop the function does not
-- return on its own; it runs until it is cancelled or fails.
--
-- NOTE(review): shardman.redirect_to_shardlord presumably forwards the call to
-- the shardlord and returns true when it did so -- confirm against its definition.
DECLARE
    prev_deadlock_path shardman.process[];  -- path seen by the previous poll; NULL on the first pass
    deadlock_path shardman.process[];       -- path seen by the current poll; NULL when nothing was reported
    victim shardman.process;
    loop_begin int;
    loop_end int;
BEGIN
    IF shardman.redirect_to_shardlord(format('monitor_deadlocks(%s)', timeout_sec))
    THEN
        RETURN;
    END IF;

    -- Reject intervals that would turn the polling loop into a busy loop.
    IF timeout_sec IS NULL OR timeout_sec < 0
    THEN
        RAISE EXCEPTION 'monitor_deadlocks: timeout_sec must be a non-negative number of seconds, got %', timeout_sec;
    END IF;

    LOOP
        deadlock_path := shardman.detect_deadlock();
        -- Act only when two consecutive polls overlap: "&&" is NULL (hence not
        -- taken) while either side is NULL, so a single sighting is never enough.
        IF deadlock_path && prev_deadlock_path
        THEN
            -- The last element of the path repeats an earlier element; the
            -- slice between the first occurrence and the end is the loop itself.
            loop_end := array_upper(deadlock_path, 1);
            loop_begin := array_position(deadlock_path, deadlock_path[loop_end]);
            -- random() is in [0, 1) and the cast rounds, so the index falls in
            -- loop_begin..loop_end; both ends denote the same process.
            victim := deadlock_path[loop_begin + ((loop_end - loop_begin)*random())::integer];
            RAISE NOTICE 'Detect deadlock: cancel process % at node %', victim.pid, victim.node;
            -- NOTE(review): the "<node>:<sql>" command format is what
            -- shardman.broadcast presumably expects -- confirm against its definition.
            PERFORM shardman.broadcast(format('%s:SELECT pg_cancel_backend(%s);',
                                              victim.node, victim.pid));
        END IF;
        prev_deadlock_path := deadlock_path;
        PERFORM pg_sleep(timeout_sec);
    END LOOP;
END;
$$ LANGUAGE plpgsql;
20272042

20282043

20292044
-- View for monitoring logical replication lag.

0 commit comments

Comments
 (0)