@@ -2137,9 +2137,13 @@ create type process as (node int, pid int);
2137
2137
-- View to build lock graph which can be used to detect global deadlock.
2138
2138
-- Application_name is assumed pgfdw:$system_id:$coord_pid
2139
2139
-- gid is assumed $pid:$count:$sys_id:$xid:$participants_count
2140
+ -- Currently we are oblivious about lock modes and report any wait -> hold edge
2141
+ -- on the same object and therefore might produce false loops. Furthermore,
2142
+ -- we have no idea about locking queues here. Probably it is better to use
2143
+ -- pg_blocking_pids, but it seems to ignore prepared xacts.
2140
2144
CREATE VIEW lock_graph (wait, hold) AS
2141
- -- If xact is already prepared, we take node and pid of the coordinator.
2142
2145
-- local dependencies
2146
+ -- If xact is already prepared, we take node and pid of the coordinator.
2143
2147
SELECT
2144
2148
ROW(shardman .get_my_id (),
2145
2149
wait .pid )::shardman .process ,
@@ -2152,11 +2156,19 @@ CREATE VIEW lock_graph(wait, hold) AS
2152
2156
FROM pg_locks wait, pg_locks hold LEFT OUTER JOIN pg_prepared_xacts twopc
2153
2157
ON twopc .transaction = hold .transactionid
2154
2158
WHERE
2155
- NOT wait .granted AND wait .pid IS NOT NULL AND hold .granted
2156
- -- this select captures waitings on xid and on, hm, tuples
2157
- AND (wait .transactionid = hold .transactionid OR
2158
- (wait .page = hold .page AND wait .tuple = hold .tuple ))
2159
- AND (hold .pid IS NOT NULL OR twopc .gid IS NOT NULL ) -- ???
2159
+ NOT wait .granted AND wait .pid IS NOT NULL AND hold .granted AND
2160
+ -- waiter waits for the same object that the holder locks
2161
+ wait .database IS NOT DISTINCT FROM hold .database AND
2162
+ wait .relation IS NOT DISTINCT FROM hold .relation AND
2163
+ wait .page IS NOT DISTINCT FROM hold .page AND
2164
+ wait .tuple IS NOT DISTINCT FROM hold .tuple AND
2165
+ wait .virtualxid IS NOT DISTINCT FROM hold .virtualxid AND
2166
+ wait .transactionid IS NOT DISTINCT FROM hold .transactionid AND -- waiting on xid
2167
+ wait .classid IS NOT DISTINCT FROM hold .classid AND
2168
+ wait .objid IS NOT DISTINCT FROM hold .objid AND
2169
+ wait .objsubid IS NOT DISTINCT FROM hold .objsubid AND
2170
+ -- this is most probably a truism, but who knows
2171
+ (hold .pid IS NOT NULL OR twopc .gid IS NOT NULL )
2160
2172
UNION ALL
2161
2173
-- if this fdw backend is busy, potentially waiting, add edge coordinator -> fdw
2162
2174
SELECT ROW(shardman .get_node_by_sysid (split_part(application_name, ' :' , 2 )::bigint ),
@@ -2276,7 +2288,7 @@ BEGIN
2276
2288
THEN
2277
2289
IF clock_timestamp() > failure_timestamp + rm_node_timeout_sec * interval ' 1 sec'
2278
2290
THEN
2279
- RAISE NOTICE ' Removing node % because of % timeout expiration' , failed_node_id, rm_node_timeout_sec;
2291
+ RAISE NOTICE ' Removing node % because of % sec timeout expiration' , failed_node_id, rm_node_timeout_sec;
2280
2292
PERFORM shardman .broadcast (format(' 0:SELECT shardman.rm_node(%s, force=>true);' , failed_node_id));
2281
2293
PERFORM shardman .broadcast (' 0:SELECT shardman.recover_xacts();' );
2282
2294
failed_node_id := null ;
@@ -2304,7 +2316,10 @@ BEGIN
2304
2316
AND loop_end - loop_begin = prev_loop_end - prev_loop_begin
2305
2317
AND deadlock_path[loop_begin:loop_end] = prev_deadlock_path[prev_loop_begin:prev_loop_end]
2306
2318
THEN
2307
- -- Try to cancel random node in loop
2319
+ -- Try to cancel random node in loop.
2320
+ -- If the victim is not executing an active query at the moment,
2321
+ -- pg_cancel_backend can't do anything with xact; because of that,
2322
+ -- we probably need to repeat it several times
2308
2323
victim := deadlock_path[loop_begin + ((loop_end - loop_begin)* random())::integer ];
2309
2324
RAISE NOTICE ' Detect deadlock: cancel process % at node %' , victim .pid , victim .node ;
2310
2325
PERFORM shardman .broadcast (format(' %s:SELECT pg_cancel_backend(%s);' ,
0 commit comments