Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit f6691f2

Browse files
committed
recover_xacts comments and docs.
1 parent 20fa71b commit f6691f2

File tree

2 files changed

+59
-26
lines changed

2 files changed

+59
-26
lines changed

pg_shardman--0.0.2.sql

Lines changed: 36 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1562,7 +1562,19 @@ END
15621562
$$ LANGUAGE plpgsql;
15631563

15641564

1565-
-- Commit or rollback not completed distributed transactions
1565+
-- Commit or rollback not completed distributed transactions.
1566+
-- All nodes must be alive for this to do something.
1567+
-- If coordinator is still in the cluster, we just try asking it.
1568+
-- If not, and there is only one participant, we simply commit the xact.
1569+
-- If n_participants > 1, and xact is prepared everywhere, commit it.
1570+
-- Otherwise, check WAL of every node; if COMMIT is found, COMMIT, if ABORT
1571+
-- is found, ABORT.
1572+
--
1573+
-- Currently this function is not too hasty because
1574+
-- * We make totally independent decisions for each 'prepare'.
1575+
-- * We never know the participants and poll all nodes in the cluster.
1576+
-- * If coordinator is excluded, we sequentially examine WAL of *all* nodes to
1577+
-- learn the outcome, even where xact is prepared.
15661578
CREATE FUNCTION recover_xacts() RETURNS void AS $$
15671579
DECLARE
15681580
node_id int;
@@ -1590,7 +1602,9 @@ BEGIN
15901602

15911603
FOR node_id IN SELECT id FROM shardman.nodes
15921604
LOOP
1593-
cmds := format('%s%s:SELECT string_agg(''%s=>''||gid, '','') FROM pg_prepared_xacts;', cmds, node_id, node_id);
1605+
cmds := format($cmd$
1606+
%s%s:SELECT coalesce(string_agg('%s=>' || gid, ','), '') FROM pg_prepared_xacts;$cmd$,
1607+
cmds, node_id, node_id);
15941608
END LOOP;
15951609

15961610
-- Collected prepared xacts from all nodes
@@ -1625,7 +1639,7 @@ BEGIN
16251639
n_prepared := n_prepared + counter::int;
16261640
END LOOP;
16271641

1628-
IF n_prepared=n_participants
1642+
IF n_prepared = n_participants
16291643
THEN
16301644
RAISE NOTICE 'Commit distributed transaction % which is prepared at all participant nodes', gid;
16311645
finish := format('%s%s:COMMIT PREPARED %L;', finish, xact_node_id, gid);
@@ -1635,16 +1649,19 @@ BEGIN
16351649

16361650
IF EXISTS (SELECT * FROM pg_proc WHERE proname='pg_prepared_xact_status')
16371651
THEN
1638-
-- Without coordinator there is no standard way to get status of this distributed transaction.
1639-
-- Use PGPRO-EE pg_prepared_xact_status() function if available
1652+
-- Without coordinator there is no standard way to get
1653+
-- status of this distributed transaction. Use PGPRO-EE
1654+
-- pg_prepared_xact_status() function if available
16401655
cmds := '';
16411656
FOR node_id IN SELECT id FROM shardman.nodes
16421657
LOOP
1643-
cmds := format('%s%s:SELECT pg_prepared_xact_status(%L);', cmds, node_id, gid);
1658+
cmds := format('%s%s:SELECT pg_prepared_xact_status(%L);',
1659+
cmds, node_id, gid);
16441660
END LOOP;
16451661
SELECT shardman.broadcast(cmds) INTO resp;
16461662

1647-
-- Collect information about distributed transaction status at all nodes
1663+
-- Collect information about distributed transaction
1664+
-- status at all nodes
16481665
do_commit := false;
16491666
do_rollback := false;
16501667
FOREACH status IN ARRAY string_to_array(resp, ',')
@@ -1662,9 +1679,10 @@ BEGIN
16621679
THEN
16631680
IF do_rollack
16641681
THEN
1665-
RAISE NOTICE 'Inconsistent state of transaction %', gid;
1682+
RAISE WARNING 'Inconsistent state of transaction %',
1683+
gid;
16661684
ELSE
1667-
RAISE NOTICE 'Commit transaction %s at node % because if was committed at one of participants',
1685+
RAISE NOTICE 'Committing transaction %s at node % because it was committed at one of participants',
16681686
gid, xact_node_id;
16691687
finish := format('%s%s:COMMIT PREPARED %L;', finish, xact_node_id, gid);
16701688
END IF;
@@ -1674,24 +1692,29 @@ BEGIN
16741692
gid, xact_node_id;
16751693
finish := format('%s%s:ROLLBACK PREPARED %L;', finish, xact_node_id, gid);
16761694
ELSE
1677-
RAISE NOTICE 'Can not make any decision concerning distributes transaction %', gid;
1695+
RAISE NOTICE 'Can''t make any decision concerning distributed transaction %', gid;
16781696
END IF;
16791697
END IF;
16801698
END IF;
16811699
ELSE
1682-
RAISE NOTICE 'Commit transaction % with single participant %', gid, xact_node_id;
1700+
RAISE NOTICE 'Committing transaction % with single participant %', gid, xact_node_id;
16831701
finish := format('%s%s:COMMIT PREPARED %L;', finish, xact_node_id, gid);
16841702
END IF;
16851703
ELSE
16861704
-- Check status of transaction at coordinator
1687-
SELECT shardman.broadcast(format('%s:SELECT txid_status(%s);', coordinator, xid)) INTO status;
1688-
RAISE NOTICE 'Status of distributed transaction % is % at coordinator %', gid, status, coordinator;
1705+
SELECT shardman.broadcast(format('%s:SELECT txid_status(%s);', coordinator, xid))
1706+
INTO status;
1707+
RAISE NOTICE 'Status of distributed transaction % is % at coordinator %',
1708+
gid, status, coordinator;
16891709
IF status='committed'
16901710
THEN
16911711
finish := format('%s%s:COMMIT PREPARED %L;', finish, xact_node_id, gid);
16921712
ELSIF status='aborted'
16931713
THEN
16941714
finish := format('%s%s:ROLLBACK PREPARED %L;', finish, xact_node_id, gid);
1715+
ELSEIF status IS NULL
1716+
THEN
1717+
RAISE WARNING ''
16951718
END IF;
16961719
END IF;
16971720
END LOOP;

readme.md

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -265,7 +265,7 @@ it between replication groups. Functions
265265
`shardman.rebalance_replicas(replica_pattern text = '%')` try to
266266
distribute partitions/replicas whose names are `LIKE` the given pattern uniformly
267267
between all nodes of their replication groups. In `pg_shardman`, partitions are
268-
called `${sharded_table_num}_${part_num}`. For instance, if you have sharded
268+
called `$sharded_table_num_$part_num`. For instance, if you have sharded
269269
table `horns`, issuing `shardman.rebalance('horns%')` should be enough to
270270
rebalance its partitions. These functions move partitions/replicas sequentially,
271271
one at a time. We presume it was done to minimize the impact on the normal
@@ -284,8 +284,8 @@ destination node; for replica it is also necessary to specify the source node.
284284

285285
### Transactions
286286

287-
Atomicity and durability of transactions touching only single node is handled as
288-
usual in PostgreSQL, which does a pretty good job at that. However, without
287+
Atomicity and durability of transactions touching only single node is handled by
288+
vanilla PostgreSQL, which does a pretty good job at that. However, without
289289
special arrangements the result of a cross-node transaction might be non-atomic: if
290290
coordinator (node where transaction started) has committed it on some nodes and
291291
then something went wrong (e.g. it failed), the transaction will be aborted on
@@ -302,10 +302,10 @@ consistent behaviour of all nodes.
302302
The problem is that presently `PREPARE` is not transferred via logical
303303
replication to replicas, which means that in case of permanent node failure we
304304
still might lose part of a distributed transaction and get a non-atomic result if
305-
primary has prepared the transaction, but failed without committing it and now
306-
replica has no idea about it. Properly implemented `PREPARE` going over logical
307-
replication will also mitigate the possibility of losing transactions described
308-
in 'Replication' section, and we plan to do that.
305+
primary has prepared the transaction, but died without committing it and now
306+
replica has no idea about the xact. Properly implemented `PREPARE` going over
307+
logical replication will also mitigate the possibility of losing transactions
308+
described in 'Replication' section, and we plan to do that.
309309

310310
Similarly, if transactions affect only single nodes, plain PostgreSQL isolation
311311
rules are applicable. However, for distributed transactions we need distributed
@@ -336,7 +336,7 @@ is the following.
336336
* Make sure failed node is really turned off, and never make it online without
337337
erasing its state -- otherwise stale reads and inconsistent writes on it
338338
are possible.
339-
* Run `select shardman.rm_node(${failed_node_id}, force => true)` to exclude it
339+
* Run `select shardman.rm_node($failed_node_id, force => true)` to exclude it
340340
and promote replicas. The most advanced replica is chosen and the state of other
341341
replicas is synchronized.
342342
* Run `select shardman.recover_xacts()` to resolve possibly hung 2PC
@@ -626,16 +626,26 @@ If some node is unreachable then monitor function prints correspondent error
626626
message and retries access until `rm_node_timeout_sec` timeout expiration. After
627627
that, the node is removed from the cluster using the `shardman.rm_node` function. If
628628
redundancy level is non-zero, then primary partitions from the disabled node are
629-
replaced with replicas. Finally shardman performs recovery of distributed
629+
replaced with replicas. Finally `pg_shardman` performs recovery of distributed
630630
transactions which coordinators were at failed node. It is done using
631-
`shardman.recover_xacts` function which collects status of distributed
631+
`shardman.recover_xacts()` function which collects status of distributed
632632
transaction at all participants and tries to make decision whether it should be
633633
committed or aborted.
634634
If `rm_node_timeout_sec` is `NULL`, `monitor` will not remove nodes.
635635

636-
Function `shardman.recover_xacts` can be also implicitly invoked by database administrator after abnormal cluster restart to recover
637-
not completed distributed transactions. First of all it tries to obtain status of distributed transaction from its coordinator and only
638-
if it is not available, performs voting among all nodes.
636+
Function `shardman.recover_xacts()` can be also manually invoked by database
637+
administrator on shardlord after abnormal cluster restart to recover not
638+
completed distributed transactions. If the coordinator is still in the cluster,
639+
we ask it about the transaction outcome. Otherwise, we query every node's opinion
640+
on the xact; if there is at least one commit (and no aborts), we commit it, if
641+
there is at least one abort (and no commits), we abort it. All nodes in the
642+
cluster must be online to let this function resolve the transaction. Patched
643+
Postgres is needed for proper work of this function.
644+
645+
Another limitation of `shardman.recover_xacts` is that we currently don't
646+
control recycling of WAL and clog used to check for completed transaction
647+
status. Though unlikely, theoretically it is possible that we won't be able to
648+
learn it and resolve the transaction.
639649

640650
```plpgsql
641651
wipe_state(drop_slots_with_fire bool DEFAULT true)

0 commit comments

Comments
 (0)