Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 59fb1ff

Browse files
committed
Test set_redundancy and rebalance, bunch of little fixes, readme.
* Default replication group is now node id, a saner behaviour. * Checking that we have enough nodes in RG to create replicas.
1 parent 0f3866d commit 59fb1ff

File tree

3 files changed

+335
-179
lines changed

3 files changed

+335
-179
lines changed

pg_shardman--0.0.2.sql

Lines changed: 63 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@ CREATE TABLE nodes (
3232
system_id bigint NOT NULL UNIQUE,
3333
super_connection_string text UNIQUE NOT NULL,
3434
connection_string text UNIQUE NOT NULL,
35-
replication_group text NOT NULL -- group of nodes within which shard replicas are allocated
35+
-- group of nodes within which shard replicas are allocated
36+
replication_group text NOT NULL
3637
);
3738

3839
-- List of sharded tables
@@ -72,7 +73,7 @@ CREATE TABLE replicas (
7273
-- * It allows to set up pgbouncer, as replication can't go through it.
7374
-- If conn_string is null, super_conn_string is used everywhere.
7475
CREATE FUNCTION add_node(super_conn_string text, conn_string text = NULL,
75-
repl_group text = 'default') RETURNS int AS $$
76+
repl_group text = NULL) RETURNS int AS $$
7677
DECLARE
7778
new_node_id int;
7879
node shardman.nodes;
@@ -113,18 +114,22 @@ BEGIN
113114
-- Insert new node in nodes table
114115
INSERT INTO shardman.nodes (system_id, super_connection_string,
115116
connection_string, replication_group)
116-
VALUES (0, super_conn_string, conn_string_effective, repl_group)
117+
VALUES (0, super_conn_string, conn_string_effective, '')
117118
RETURNING id INTO new_node_id;
118119

119-
-- We have to update system_id after insert, because otherwise broadcast
120-
-- will not work
120+
-- We have to update system_id along with dependent repl_group after insert,
121+
-- because otherwise broadcast will not work.
121122
sys_id := shardman.broadcast(
122123
format('%s:SELECT shardman.get_system_identifier();',
123124
new_node_id))::bigint;
124125
IF EXISTS(SELECT 1 FROM shardman.nodes where system_id = sys_id) THEN
125126
RAISE EXCEPTION 'Node with system id % is already in the cluster', sys_id;
126127
END IF;
127-
UPDATE shardman.nodes SET system_id=sys_id WHERE id=new_node_id;
128+
UPDATE shardman.nodes SET system_id = sys_id WHERE id = new_node_id;
129+
-- By default, use system id as repl group name
130+
UPDATE shardman.nodes SET replication_group =
131+
(CASE WHEN repl_group IS NULL THEN sys_id::text ELSE repl_group END)
132+
WHERE id = new_node_id;
128133

129134
-- Adjust replication channels within replication group.
130135
-- We need all-to-all replication channels between all group members.
@@ -137,10 +142,10 @@ BEGIN
137142
%s:CREATE PUBLICATION node_%s;
138143
%s:SELECT pg_create_logical_replication_slot(''node_%s'', ''pgoutput'');
139144
%s:SELECT pg_create_logical_replication_slot(''node_%s'', ''pgoutput'');',
140-
pubs, node.id, new_node_id,
141-
new_node_id, node.id,
142-
node.id, new_node_id,
143-
new_node_id, node.id);
145+
pubs, node.id, new_node_id,
146+
new_node_id, node.id,
147+
node.id, new_node_id,
148+
new_node_id, node.id);
144149
-- Add to new node subscriptions to existing nodes and add subscription
145150
-- to new node to all existing nodes
146151
-- sub name is sub_$subnodeid_pubnodeid to avoid application_name collision
@@ -202,18 +207,16 @@ BEGIN
202207
-- Broadcast command for creating user mapping for this servers
203208
PERFORM shardman.broadcast(usms);
204209

205-
-- Create FDWs at new node for all existed partitions
210+
-- Create FDWs at new node for all existing partitions
206211
FOR t IN SELECT * from shardman.tables WHERE sharding_key IS NOT NULL
207212
LOOP
208213
create_tables := format('%s{%s:%s}',
209214
create_tables, new_node_id, t.create_sql);
210215
create_partitions := format('%s%s:SELECT create_hash_partitions(%L,%L,%L);',
211216
create_partitions, new_node_id, t.relation, t.sharding_key, t.partitions_count);
212217
SELECT shardman.reconstruct_table_attrs(t.relation) INTO table_attrs;
213-
FOR part IN SELECT * from shardman.partitions WHERE relation=t.relation
218+
FOR part IN SELECT * FROM shardman.partitions WHERE relation=t.relation
214219
LOOP
215-
SELECT connection_string INTO conn_string from shardman.nodes WHERE id=part.node_id;
216-
SELECT * FROM shardman.conninfo_to_postgres_fdw_opts(conn_str) INTO server_opts, um_opts;
217220
srv_name := format('node_%s', part.node_id);
218221
fdw_part_name := format('%s_fdw', part.part_name);
219222
create_fdws := format('%s%s:CREATE FOREIGN TABLE %I %s SERVER %s OPTIONS (table_name %L);',
@@ -460,14 +463,27 @@ BEGIN
460463
END
461464
$$ LANGUAGE plpgsql;
462465

463-
466+
-- Bail out with ERROR if some replication group has too few members to host
467+
-- 'redundancy' replicas
468+
CREATE FUNCTION check_max_replicas(redundancy int) RETURNS void AS $$
469+
DECLARE
470+
rg record;
471+
BEGIN
472+
FOR rg IN SELECT count(*), replication_group FROM shardman.nodes
473+
GROUP BY replication_group LOOP
474+
IF rg.count < redundancy + 1 THEN
475+
RAISE EXCEPTION 'Requested redundancy % is too high: replication group % has % members', redundancy, rg.replication_group, rg.count;
476+
END IF;
477+
END LOOP;
478+
END
479+
$$ LANGUAGE plpgsql;
464480

465481
-- Shard table with hash partitions. Parameters are the same as in pathman.
466482
-- It also scatters partitions across all nodes.
467483
-- This function expects that empty table is created at shardlord.
468484
-- It can be executed only at shardlord and there is no need to redirect this
469485
-- function to shardlord.
470-
CREATE FUNCTION create_hash_partitions(rel regclass, expr text, part_count int,
486+
CREATE FUNCTION create_hash_partitions(rel_name name, expr text, part_count int,
471487
redundancy int = 0)
472488
RETURNS void AS $$
473489
DECLARE
@@ -483,7 +499,6 @@ DECLARE
483499
create_partitions text = '';
484500
create_fdws text = '';
485501
replace_parts text = '';
486-
rel_name text = rel::text;
487502
i int;
488503
n_nodes int;
489504
BEGIN
@@ -496,6 +511,9 @@ BEGIN
496511
RAISE EXCEPTION 'Please add some nodes first';
497512
END IF;
498513

514+
-- Check right away to avoid unnecessary recover()
515+
PERFORM shardman.check_max_replicas(redundancy);
516+
499517
-- Generate SQL statement creating this table
500518
SELECT shardman.gen_create_table_sql(rel_name) INTO create_table;
501519

@@ -522,7 +540,7 @@ BEGIN
522540
n_nodes := array_length(node_ids, 1);
523541

524542
-- Reconstruct table attributes from parent table
525-
SELECT shardman.reconstruct_table_attrs(rel_name) INTO table_attrs;
543+
SELECT shardman.reconstruct_table_attrs(rel_name::regclass) INTO table_attrs;
526544

527545
FOR i IN 0..part_count-1
528546
LOOP
@@ -553,15 +571,15 @@ BEGIN
553571

554572
IF redundancy <> 0
555573
THEN
556-
PERFORM shardman.set_redundancy(rel, redundancy, copy_data => false);
574+
PERFORM shardman.set_redundancy(rel_name, redundancy, copy_data => false);
557575
END IF;
558576
END
559577
$$ LANGUAGE plpgsql;
560578

561579
-- Provide requested level of redundancy. 0 means no redundancy.
562580
-- If existing level of redundancy is greater than specified, then right now this
563581
-- function does nothing.
564-
CREATE FUNCTION set_redundancy(rel regclass, redundancy int, copy_data bool = true)
582+
CREATE FUNCTION set_redundancy(rel_name name, redundancy int, copy_data bool = true)
565583
RETURNS void AS $$
566584
DECLARE
567585
part shardman.partitions;
@@ -570,20 +588,21 @@ DECLARE
570588
repl_group text;
571589
pubs text = '';
572590
subs text = '';
573-
rel_name text = rel::text;
574591
sub_options text = '';
575592
BEGIN
576593
IF shardman.redirect_to_shardlord(format('set_redundancy(%L, %L)', rel_name, redundancy))
577594
THEN
578595
RETURN;
579596
END IF;
580597

598+
PERFORM shardman.check_max_replicas(redundancy);
599+
581600
IF NOT copy_data THEN
582601
sub_options := ' WITH (copy_data=false)';
583602
END IF;
584603

585604
-- Loop through all partitions of this table
586-
FOR part IN SELECT * from shardman.partitions where relation=rel_name
605+
FOR part IN SELECT * FROM shardman.partitions WHERE relation=rel_name
587606
LOOP
588607
-- Count number of replicas of this partition
589608
SELECT count(*) INTO n_replicas FROM shardman.replicas WHERE part_name=part.part_name;
@@ -665,10 +684,9 @@ $$ LANGUAGE plpgsql;
665684

666685

667686
-- Remove table from all nodes.
668-
CREATE FUNCTION rm_table(rel regclass)
687+
CREATE FUNCTION rm_table(rel_name name)
669688
RETURNS void AS $$
670689
DECLARE
671-
rel_name text = rel::text;
672690
node_id int;
673691
pname text;
674692
drop1 text = '';
@@ -702,10 +720,11 @@ BEGIN
702720
END
703721
$$ LANGUAGE plpgsql;
704722

705-
-- Move partition to other node. This function is able to move partition only within replication group.
706-
-- It creates temporary logical replication channel to copy partition to new location.
707-
-- Until logical replication almost caught-up access to old partition is now denied.
708-
-- Then we revoke all access to this table until copy is completed and all FDWs are updated.
723+
-- Move partition to other node. This function can move partition only within
724+
-- replication group. It creates temporary logical replication channel to copy
725+
-- partition to new location. Until logical replication almost caught-up access
726+
-- to old partition is denied. Then we revoke all access to this table until
727+
-- copy is completed and all FDWs are updated.
709728
CREATE FUNCTION mv_partition(mv_part_name text, dst_node_id int)
710729
RETURNS void AS $$
711730
DECLARE
@@ -750,12 +769,12 @@ BEGIN
750769
-- Check if destination belongs to the same replication group as source
751770
IF dst_repl_group<>src_repl_group AND shardman.get_redundancy_of_partition(mv_part_name)>0
752771
THEN
753-
RAISE EXCEPTION 'Can not move partition % to different replication group', mv_part_name;
772+
RAISE EXCEPTION 'Unable to move partition % to different replication group', mv_part_name;
754773
END IF;
755774

756775
IF EXISTS(SELECT * FROM shardman.replicas WHERE part_name=mv_part_name AND node_id=dst_node_id)
757776
THEN
758-
RAISE EXCEPTION 'Can not move partition % to node % with existed replica', mv_part_name, dst_node_id;
777+
RAISE EXCEPTION 'Unable to move partition % to node % with existing replica', mv_part_name, dst_node_id;
759778
END IF;
760779

761780
-- Copy partition data to new location
@@ -845,8 +864,8 @@ $$ LANGUAGE sql;
845864

846865
-- Get minimal redundancy of the specified relation.
847866
-- This command can be executed only at shardlord.
848-
CREATE FUNCTION get_min_redundancy(rel regclass) returns bigint AS $$
849-
SELECT min(redundancy) FROM (SELECT count(*) redundancy FROM shardman.replicas WHERE relation=rel::text GROUP BY part_name) s;
867+
CREATE FUNCTION get_min_redundancy(rel_name name) RETURNS bigint AS $$
868+
SELECT min(redundancy) FROM (SELECT count(*) redundancy FROM shardman.replicas WHERE relation=rel_name GROUP BY part_name) s;
850869
$$ LANGUAGE sql;
851870

852871
-- Execute command at all shardman nodes.
@@ -896,7 +915,7 @@ $$ LANGUAGE sql;
896915
-- It is not able to move partitions between replication groups.
897916
-- This function intentionally moves one partition at a time to minimize
898917
-- influence on system performance.
899-
CREATE FUNCTION rebalance(table_pattern text = '%') RETURNS void AS $$
918+
CREATE FUNCTION rebalance(part_pattern text = '%') RETURNS void AS $$
900919
DECLARE
901920
dst_node int;
902921
src_node int;
@@ -906,7 +925,7 @@ DECLARE
906925
repl_group text;
907926
done bool;
908927
BEGIN
909-
IF shardman.redirect_to_shardlord(format('rebalance(%L)', table_pattern))
928+
IF shardman.redirect_to_shardlord(format('rebalance(%L)', part_pattern))
910929
THEN
911930
RETURN;
912931
END IF;
@@ -919,21 +938,21 @@ BEGIN
919938
-- Select node in this group with minimal number of partitions
920939
SELECT node_id, count(*) n_parts INTO dst_node, min_count
921940
FROM shardman.partitions p JOIN shardman.nodes n ON p.node_id=n.id
922-
WHERE n.replication_group=repl_group AND p.relation LIKE table_pattern
941+
WHERE n.replication_group=repl_group AND p.relation LIKE part_pattern
923942
GROUP BY node_id
924943
ORDER BY n_parts ASC LIMIT 1;
925944
-- Select node in this group with maximal number of partitions
926945
SELECT node_id, count(*) n_parts INTO src_node,max_count
927946
FROM shardman.partitions p JOIN shardman.nodes n ON p.node_id=n.id
928-
WHERE n.replication_group=repl_group AND p.relation LIKE table_pattern
947+
WHERE n.replication_group=repl_group AND p.relation LIKE part_pattern
929948
GROUP BY node_id
930949
ORDER BY n_parts DESC LIMIT 1;
931950
-- If the difference in the number of partitions on these nodes is greater
932951
-- than 1, then move random partition
933952
IF max_count - min_count > 1 THEN
934953
SELECT p.part_name INTO mv_part_name
935954
FROM shardman.partitions p
936-
WHERE p.node_id=src_node AND p.relation LIKE table_pattern AND
955+
WHERE p.node_id=src_node AND p.relation LIKE part_pattern AND
937956
NOT EXISTS(SELECT * from shardman.replicas r
938957
WHERE r.node_id=dst_node AND r.part_name=p.part_name)
939958
ORDER BY random() LIMIT 1;
@@ -947,8 +966,8 @@ BEGIN
947966
END
948967
$$ LANGUAGE plpgsql;
949968

950-
-- Share table between all nodes. This function should be executed at shardlord. The empty table should be present at shardlord,
951-
-- but not at nodes.
969+
-- Share table between all nodes. This function should be executed at
970+
-- shardlord. The empty table should be present at shardlord, but not at nodes.
952971
CREATE FUNCTION create_shared_table(rel regclass, master_node_id int = 1) RETURNS void AS $$
953972
DECLARE
954973
node shardman.nodes;
@@ -1121,7 +1140,7 @@ $$ LANGUAGE plpgsql;
11211140
-- It is not able to move replicas between replication groups.
11221141
-- This function intentionally moves one replica at a time to minimize
11231142
-- influence on system performance.
1124-
CREATE FUNCTION rebalance_replicas(table_pattern text = '%') RETURNS void AS $$
1143+
CREATE FUNCTION rebalance_replicas(replica_pattern text = '%') RETURNS void AS $$
11251144
DECLARE
11261145
dst_node int;
11271146
src_node int;
@@ -1131,7 +1150,7 @@ DECLARE
11311150
repl_group text;
11321151
done bool;
11331152
BEGIN
1134-
IF shardman.redirect_to_shardlord(format('rebalance_replicas(%L)', table_pattern))
1153+
IF shardman.redirect_to_shardlord(format('rebalance_replicas(%L)', replica_pattern))
11351154
THEN
11361155
RETURN;
11371156
END IF;
@@ -1144,21 +1163,21 @@ BEGIN
11441163
-- Select node in this group with minimal number of replicas
11451164
SELECT node_id, count(*) n_parts INTO dst_node, min_count
11461165
FROM shardman.replicas r JOIN shardman.nodes n ON r.node_id=n.id
1147-
WHERE n.replication_group=repl_group AND r.relation LIKE table_pattern
1166+
WHERE n.replication_group=repl_group AND r.relation LIKE replica_pattern
11481167
GROUP BY node_id
11491168
ORDER BY n_parts ASC LIMIT 1;
11501169
-- Select node in this group with maximal number of replicas
11511170
SELECT node_id, count(*) n_parts INTO src_node,max_count
11521171
FROM shardman.replicas r JOIN shardman.nodes n ON r.node_id=n.id
1153-
WHERE n.replication_group=repl_group AND r.relation LIKE table_pattern
1172+
WHERE n.replication_group=repl_group AND r.relation LIKE replica_pattern
11541173
GROUP BY node_id
11551174
ORDER BY n_parts DESC LIMIT 1;
11561175
-- If the difference in the number of replicas on these nodes is greater
11571176
-- than 1, then move random partition
11581177
IF max_count - min_count > 1 THEN
11591178
SELECT src.part_name INTO mv_part_name
11601179
FROM shardman.replicas src
1161-
WHERE src.node_id=src_node AND src.relation LIKE table_pattern
1180+
WHERE src.node_id=src_node AND src.relation LIKE replica_pattern
11621181
AND NOT EXISTS(SELECT * FROM shardman.replicas dst
11631182
WHERE dst.node_id=dst_node AND dst.part_name=src.part_name)
11641183
AND NOT EXISTS(SELECT * FROM shardman.partitions p

0 commit comments

Comments
 (0)