@@ -262,7 +262,7 @@ BEGIN
262
262
-- Broadcast create table commands
263
263
PERFORM shardman .broadcast (create_tables);
264
264
-- Broadcast create hash partitions command
265
- PERFORM shardman .broadcast (create_partitions);
265
+ PERFORM shardman .broadcast (create_partitions, iso_level => ' read committed ' );
266
266
-- Broadcast create foreign table commands
267
267
PERFORM shardman .broadcast (create_fdws);
268
268
-- Broadcast replace hash partition commands
@@ -376,21 +376,11 @@ BEGIN
376
376
rm_node_id),
377
377
ignore_errors := true);
378
378
379
- -- Remove node from metadata right away. We require from the user that after
380
- -- calling rm_node the node must never be accessed, so it makes sense to
381
- -- reflect metadata accordingly -- otherwise, we if fail somewhere down the
382
- -- road below, the user would have been tempted to change her mind and not
383
- -- to call rm_node again; it should be our responsibility to clean the things
384
- -- up in recovery() in case of failure.
385
- -- We want to see this change, so make sure we are running in READ COMMITTED.
386
379
-- We set node_id of node's parts to NULL, meaning they are waiting for
387
380
-- promotion. Replicas are removed with cascade.
388
- ASSERT current_setting(' transaction_isolation' ) = ' read committed' ,
389
- ' rm_node must be executed with READ COMMITTED isolation level' ;
390
- PERFORM shardman .broadcast (format(
391
- ' {0:UPDATE shardman.partitions SET node_id=null WHERE node_id=%s;
392
- DELETE FROM shardman.nodes WHERE id=%s;}' ,
393
- rm_node_id, rm_node_id));
381
+ UPDATE shardman .partitions SET node_id = NULL WHERE node_id= rm_node_id;
382
+ DELETE FROM shardman .nodes WHERE id = rm_node_id;
383
+
394
384
395
385
-- Remove all subscriptions and publications related to removed node
396
386
FOR node IN SELECT * FROM shardman .nodes WHERE replication_group= repl_group
@@ -489,16 +479,16 @@ BEGIN
489
479
WHERE id<> rm_node_id ORDER BY random() LIMIT 1 ;
490
480
END IF;
491
481
492
- -- Partition is successfully promoted, update metadata. It is important
493
- -- to commit that before sending new mappings, because otherwise if we
494
- -- fail during the latter, news about promoted replica will be lost;
495
- -- next time we might choose another replica to promote with some new
496
- -- data already written to previously promoted replica. Syncing
497
- -- replicas doesn't help us much here if we don't lock tables.
498
- PERFORM shardman .broadcast (format(
499
- ' {0:UPDATE shardman.partitions SET node_id=%s WHERE part_name = %L ;
500
- DELETE FROM shardman.replicas WHERE part_name = %L AND node_id = %s} ' ,
501
- new_master_id, part . part_name , part . part_name , new_master_id)) ;
482
+ -- Partition is successfully promoted, update metadata. XXX: we should
483
+ -- commit that before sending new mappings, because otherwise if we fail
484
+ -- during the latter, news about promoted replica will be lost; next
485
+ -- time we might choose another replica to promote with some new data
486
+ -- already written to previously promoted replica. Syncing replicas
487
+ -- doesn't help us much here if we don't lock tables.
488
+ UPDATE shardman .partitions SET node_id = new_master_id
489
+ WHERE part_name = part . part_name ;
490
+ DELETE FROM shardman .replicas WHERE part_name = part . part_name AND
491
+ node_id = new_master_id;
502
492
503
493
-- Update pathman partition map at all nodes
504
494
FOR node IN SELECT * FROM shardman .nodes WHERE id<> rm_node_id
@@ -611,15 +601,14 @@ BEGIN
611
601
-- Create parent table at all nodes
612
602
create_tables := format(' %s{%s:%s}' ,
613
603
create_tables, node .id , create_table);
614
- -- Create partitions using pathman at all nodes
615
604
create_partitions := format(' %s%s:select create_hash_partitions(%L,%L,%L);' ,
616
605
create_partitions, node .id , rel_name, expr, part_count);
617
606
END LOOP;
618
607
619
608
-- Broadcast create table commands
620
609
PERFORM shardman .broadcast (create_tables);
621
610
-- Broadcast create hash partitions command
622
- PERFORM shardman .broadcast (create_partitions);
611
+ PERFORM shardman .broadcast (create_partitions, iso_level => ' read committed ' );
623
612
624
613
-- Get list of nodes in random order
625
614
SELECT ARRAY(SELECT id from shardman .nodes ORDER BY random()) INTO node_ids;
@@ -650,7 +639,7 @@ BEGIN
650
639
END LOOP;
651
640
652
641
-- Broadcast create foreign table commands
653
- PERFORM shardman .broadcast (create_fdws);
642
+ PERFORM shardman .broadcast (create_fdws, iso_level => ' read committed ' );
654
643
-- Broadcast replace hash partition commands
655
644
PERFORM shardman .broadcast (replace_parts);
656
645
@@ -673,6 +662,7 @@ DECLARE
673
662
repl_group text ;
674
663
pubs text = ' ' ;
675
664
subs text = ' ' ;
665
+ sub text = ' ' ;
676
666
sub_options text = ' ' ;
677
667
BEGIN
678
668
IF shardman .redirect_to_shardlord (format(' set_redundancy(%L, %L)' , rel_name,
@@ -709,8 +699,12 @@ BEGIN
709
699
-- Establish publications and subscriptions for this partition
710
700
pubs := format(' %s%s:ALTER PUBLICATION node_%s ADD TABLE %I;' ,
711
701
pubs, part .node_id , repl_node, part .part_name );
712
- subs := format(' %s%s:ALTER SUBSCRIPTION sub_%s_%s REFRESH PUBLICATION%s;' ,
713
- subs, repl_node, repl_node, part .node_id , sub_options);
702
+ sub := format(' %s:ALTER SUBSCRIPTION sub_%s_%s REFRESH PUBLICATION%s;' ,
703
+ repl_node, repl_node, part .node_id , sub_options);
704
+ -- ignore duplicates
705
+ IF position(sub in subs) = 0 THEN
706
+ subs := subs || sub;
707
+ END IF;
714
708
END LOOP;
715
709
END IF;
716
710
END LOOP;
@@ -1410,6 +1404,7 @@ BEGIN
1410
1404
WHERE r .part_name = part .part_name ORDER BY random() LIMIT 1 ;
1411
1405
IF new_master_id IS NOT NULL
1412
1406
THEN -- exists some replica for this part, promote it
1407
+ RAISE DEBUG ' [SHMN] Promoting part % on node %' , part .part_name , new_master_id;
1413
1408
-- If there are more than one replica of this partition, we need to
1414
1409
-- synchronize them
1415
1410
IF shardman .get_redundancy_of_partition (part .part_name ) > 1
@@ -1425,16 +1420,20 @@ BEGIN
1425
1420
WHERE id<> rm_node_id ORDER BY random() LIMIT 1 ;
1426
1421
END IF;
1427
1422
1428
- -- Update metadata. It is important to commit that before sending new
1423
+ -- Update metadata. XXX We should commit that before sending new
1429
1424
-- mappings, because otherwise if we fail during the latter, news about
1430
1425
-- promoted replica will be lost; next time we might choose another
1431
1426
-- replica to promote with some new data already written to previously
1432
1427
-- promoted replica. Syncing replicas doesn't help us much here if we
1433
1428
-- don't lock tables.
1434
- PERFORM shardman .broadcast (format(
1435
- ' {0:UPDATE shardman.partitions SET node_id=%s WHERE part_name = %L;
1436
- DELETE FROM shardman.replicas WHERE part_name = %L AND node_id = %s}' ,
1437
- new_master_id, part .part_name , part .part_name , new_master_id));
1429
+ UPDATE shardman .partitions SET node_id= new_master_id
1430
+ WHERE part_name = part .part_name ;
1431
+ DELETE FROM shardman .replicas r WHERE r .part_name = part .part_name AND
1432
+ r .node_id = new_master_id;
1433
+ -- PERFORM shardman.broadcast(format(
1434
+ -- '{0:UPDATE shardman.partitions SET node_id=%s WHERE part_name = %L;
1435
+ -- DELETE FROM shardman.replicas WHERE part_name = %L AND node_id = %s}',
1436
+ -- new_master_id, part.part_name, part.part_name, new_master_id));
1438
1437
END LOOP;
1439
1438
1440
1439
-- Fix replication channels
@@ -1601,7 +1600,8 @@ BEGIN
1601
1600
SELECT * INTO t FROM shardman .tables WHERE relation= part .relation ;
1602
1601
PERFORM shardman .broadcast (format(
1603
1602
' %s:SELECT create_hash_partitions(%L,%L,%L);' ,
1604
- src_node .id , t .relation , t .sharding_key , t .partitions_count ));
1603
+ src_node .id , t .relation , t .sharding_key , t .partitions_count ),
1604
+ iso_level => ' read committed' );
1605
1605
END IF;
1606
1606
RAISE NOTICE ' Replace % with % at node %' ,
1607
1607
part .part_name , fdw_part_name, src_node .id ;
@@ -1628,7 +1628,8 @@ BEGIN
1628
1628
SELECT * INTO t FROM shardman .tables WHERE relation= part .relation ;
1629
1629
PERFORM shardman .broadcast (format(
1630
1630
' %s:SELECT create_hash_partitions(%L, %L, %L);' ,
1631
- src_node .id , t .relation , t .sharding_key , t .partitions_count ));
1631
+ src_node .id , t .relation , t .sharding_key , t .partitions_count ),
1632
+ iso_level => ' read committed' );
1632
1633
ELSE
1633
1634
RAISE NOTICE ' Replace % with % at node %' ,
1634
1635
fdw_part_name, part .part_name , src_node .id ;
@@ -1827,7 +1828,9 @@ $$ LANGUAGE plpgsql;
1827
1828
1828
1829
-- Commit or rollback not completed distributed transactions.
1829
1830
-- All nodes must be alive for this to do something.
1830
- -- If coordinator is still in the cluster, we just try asking it.
1831
+ -- If coordinator is still in the cluster, we just try asking it:
1832
+ -- if the xact committed on it, we commit it everywhere; if aborted, abort
1833
+ -- everywhere.
1831
1834
-- If not, and there is only one participant, we simply commit the xact.
1832
1835
-- If n_participants > 1, and xact is prepared everywhere, commit it.
1833
1836
-- Otherwise, check WAL of every node; if COMMIT is found, COMMIT, if ABORT
@@ -1870,7 +1873,8 @@ BEGIN
1870
1873
cmds, node_id, node_id);
1871
1874
END LOOP;
1872
1875
1873
- -- Collected prepared xacts from all nodes
1876
+ -- Collect prepared xacts from all nodes. They arrive as a comma-separated
1877
+ -- list of $node_id=>$gid pairs
1874
1878
xacts := string_to_array(shardman .broadcast (cmds), ' ,' );
1875
1879
-- empty string means no prepared xacts
1876
1880
xacts := array_remove(xacts, ' ' );
@@ -1880,7 +1884,7 @@ BEGIN
1880
1884
xact_node_id := split_part(xact, ' =>' , 1 );
1881
1885
gid := split_part(xact, ' =>' , 2 );
1882
1886
sysid := split_part(gid, ' :' , 3 )::bigint ;
1883
- xid := split_part(gid, ' :' , 4 )::bigint ;
1887
+ xid := split_part(gid, ' :' , 4 )::bigint ; -- coordinator's xid
1884
1888
SELECT id INTO coordinator FROM shardman .nodes WHERE system_id= sysid;
1885
1889
IF coordinator IS NULL
1886
1890
THEN
@@ -1982,7 +1986,7 @@ BEGIN
1982
1986
finish := format(' %s%s:ROLLBACK PREPARED %L;' , finish, xact_node_id, gid);
1983
1987
ELSEIF status IS NULL
1984
1988
THEN
1985
- RAISE WARNING ' Transaction % at coordinator % is too old to perform 2PC resolution' ,
1989
+ RAISE WARNING ' Transaction % at coordinator % is too old to perform 2PC resolution or still in progress ' ,
1986
1990
gid, coordinator;
1987
1991
END IF;
1988
1992
END IF;
@@ -2085,13 +2089,19 @@ RETURNS text AS 'pg_shardman' LANGUAGE C STRICT;
2085
2089
-- previous was already executed.
2086
2090
-- If super_connstr is true, super connstring is used everywhere, usual
2087
2091
-- connstr otherwise.
2092
+
2093
+ -- If iso_level is specified, cmd is wrapped in BEGIN TRANSACTION ISOLATION
2094
+ -- LEVEL iso_level; ... END;
2095
+ -- This allows setting the isolation level; however, you won't be able to get
2096
+ -- results this way. Not declared STRICT, presumably because iso_level defaults to NULL.
2088
2097
CREATE FUNCTION broadcast (cmds text ,
2089
2098
ignore_errors bool = false,
2090
2099
two_phase bool = false,
2091
2100
sync_commit_on bool = false,
2092
2101
sequential bool = false,
2093
- super_connstr bool = false)
2094
- RETURNS text AS ' pg_shardman' LANGUAGE C STRICT;
2102
+ super_connstr bool = false,
2103
+ iso_level text = null )
2104
+ RETURNS text AS ' pg_shardman' LANGUAGE C;
2095
2105
2096
2106
-- Options to postgres_fdw are specified in two places: user & password in user
2097
2107
-- mapping and everything else in create server. The problem is that we use
0 commit comments