Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0aae49e

Browse files
committed
move_part
1 parent c79b6e0 commit 0aae49e

File tree

6 files changed

+347
-210
lines changed

6 files changed

+347
-210
lines changed

init.sql

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ $$;
2424

2525
-- available commands
2626
CREATE TYPE cmd AS ENUM ('add_node', 'rm_node', 'create_hash_partitions',
27-
'move_primary', 'create_replica');
27+
'move_part', 'create_replica');
2828
-- command status
2929
CREATE TYPE cmd_status AS ENUM ('waiting', 'canceled', 'failed', 'in progress',
3030
'success');
@@ -96,16 +96,19 @@ BEGIN
9696
END
9797
$$ LANGUAGE plpgsql;
9898

99-
-- Move master partition to another node. Params:
99+
-- Move primary or replica partition to another node. Params:
100100
-- 'part_name' is name of the partition to move
101101
-- 'dest' is id of the destination node
102-
CREATE FUNCTION move_primary(part_name text, dest int) RETURNS int AS $$
102+
-- 'src' is id of the node with partition. If NULL, primary partition is moved.
103+
CREATE FUNCTION move_part(part_name text, dest int, src int DEFAULT NULL)
104+
RETURNS int AS $$
103105
DECLARE
104106
c_id int;
105107
BEGIN
106-
INSERT INTO @extschema@.cmd_log VALUES (DEFAULT, 'move_primary')
108+
INSERT INTO @extschema@.cmd_log VALUES (DEFAULT, 'move_part')
107109
RETURNING id INTO c_id;
108110
INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, part_name);
111+
INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, src);
109112
INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, dest);
110113
RETURN c_id;
111114
END $$ LANGUAGE plpgsql;
@@ -124,21 +127,6 @@ BEGIN
124127
RETURN c_id;
125128
END $$ LANGUAGE plpgsql;
126129

127-
-- Move primary or replica partition to another node. Params:
128-
-- 'part_name' is name of the partition to move
129-
-- 'src' is id of the node with partition
130-
-- 'dest' is id of the destination node
131-
CREATE FUNCTION move_part(part_name text, src int, dest int) RETURNS int AS $$
132-
DECLARE
133-
c_id int;
134-
BEGIN
135-
INSERT INTO @extschema@.cmd_log VALUES (DEFAULT, 'move_primary')
136-
RETURNING id INTO c_id;
137-
INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, part_name);
138-
INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, dest);
139-
RETURN c_id;
140-
END $$ LANGUAGE plpgsql;
141-
142130
-- Internal functions
143131

144132
-- Called on shardmaster bgw start. Add itself to nodes table, set id, create

shard.sql

Lines changed: 129 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -106,91 +106,149 @@ CREATE TRIGGER new_primary AFTER INSERT ON shardman.partitions
106106
-- fire trigger only on worker nodes
107107
ALTER TABLE shardman.partitions ENABLE REPLICA TRIGGER new_primary;
108108

109-
-- Executed on node with new primary, see mp_rebuild_lr
110-
CREATE FUNCTION primary_moved_create_data_pub(p_name name, src int, dst int)
109+
-- Executed on prev replica after partition move, see mp_rebuild_lr
110+
CREATE FUNCTION part_moved_prev(p_name name, src int, dst int)
111111
RETURNS void AS $$
112112
DECLARE
113-
-- Metadata is not yet updated, so taking nxt from src node
114-
replica int := nxt FROM shardman.partitions
115-
WHERE part_name = p_name AND owner = src;
116-
new_pubname text := shardman.get_data_pubname(p_name, dst);
117-
new_subname text := shardman.get_data_subname(p_name, dst, replica);
113+
me int := shardman.get_node_id();
114+
lname text := shardman.get_data_lname(p_name, me, dst);
118115
BEGIN
119-
PERFORM shardman.create_repslot(new_pubname);
120-
-- Create publication for new data channel
121-
EXECUTE format('DROP PUBLICATION IF EXISTS %I', new_pubname);
122-
EXECUTE format('CREATE PUBLICATION %I FOR TABLE %I',
123-
new_pubname, p_name);
124-
-- Make this channel sync
125-
PERFORM shardman.ensure_sync_standby(new_subname);
116+
PERFORM shardman.create_repslot(lname);
117+
-- Create publication for new data channel prev replica -> dst, make it sync
118+
EXECUTE format('DROP PUBLICATION IF EXISTS %I', lname);
119+
EXECUTE format('CREATE PUBLICATION %I FOR TABLE %I', lname, p_name);
120+
PERFORM shardman.ensure_sync_standby(lname);
126121
END $$ LANGUAGE plpgsql STRICT;
127122

128-
-- Executed on nearest replica after primary moved, see mp_rebuild_lr
129-
CREATE FUNCTION primary_moved_create_data_sub(p_name name, src int, dst int)
123+
-- Executed on node with new part, see mp_rebuild_lr
124+
CREATE FUNCTION part_moved_dst(p_name name, src int, dst int)
130125
RETURNS void AS $$
131126
DECLARE
132-
-- Metadata is not yet updated, so taking nxt from src node
133-
replica int := nxt FROM shardman.partitions
134-
WHERE part_name = p_name AND owner = src;
135-
new_pubname text := shardman.get_data_pubname(p_name, dst);
136-
new_subname text := shardman.get_data_subname(p_name, dst, replica);
137-
cp_logname text := shardman.get_cp_logname(p_name, src, dst);
138-
new_connstr text := shardman.get_worker_node_connstr(dst);
127+
next_rep int := nxt FROM shardman.partitions WHERE part_name = p_name
128+
AND owner = src;
129+
prev_rep int := prv FROM shardman.partitions WHERE part_name = p_name
130+
AND owner = src;
131+
next_lname text;
132+
prev_lname text;
133+
prev_connstr text;
139134
BEGIN
140-
-- Drop subscription used for copy
141-
PERFORM shardman.eliminate_sub(cp_logname);
142-
-- Create subscription for new data channel
135+
ASSERT dst = shardman.get_node_id(), 'part_moved_dst must be called on dst';
136+
IF next_rep IS NOT NULL THEN -- we need to setup channel dst -> next replica
137+
next_lname := shardman.get_data_lname(p_name, dst, next_rep);
138+
-- This must be first write in the transaction!
139+
PERFORM shardman.create_repslot(next_lname);
140+
EXECUTE format('DROP PUBLICATION IF EXISTS %I', next_lname);
141+
EXECUTE format('CREATE PUBLICATION %I FOR TABLE %I',
142+
next_lname, p_name);
143+
-- Make this channel sync
144+
PERFORM shardman.ensure_sync_standby(next_lname);
145+
END IF;
146+
147+
IF prev_rep IS NOT NULL THEN -- we need to setup channel prev replica -> dst
148+
prev_lname := shardman.get_data_lname(p_name, prev_rep, dst);
149+
prev_connstr := shardman.get_worker_node_connstr(prev_rep);
150+
PERFORM shardman.eliminate_sub(prev_lname);
151+
EXECUTE format(
152+
'CREATE SUBSCRIPTION %I connection %L
153+
PUBLICATION %I with (create_slot = false, slot_name = %L, copy_data = false);',
154+
prev_lname, prev_connstr, prev_lname, prev_lname);
155+
END IF;
156+
END $$ LANGUAGE plpgsql STRICT;
157+
158+
-- Executed on next replica after partition move, see mp_rebuild_lr
159+
CREATE FUNCTION part_moved_next(p_name name, src int, dst int)
160+
RETURNS void AS $$
161+
DECLARE
162+
me int := shardman.get_node_id();
163+
lname text := shardman.get_data_lname(p_name, dst, me);
164+
dst_connstr text := shardman.get_worker_node_connstr(dst);
165+
BEGIN
166+
-- Create subscription for new data channel dst -> next replica
143167
-- It should never exist at this moment, but just in case...
144-
PERFORM shardman.eliminate_sub(new_subname);
168+
PERFORM shardman.eliminate_sub(lname);
145169
EXECUTE format(
146170
'CREATE SUBSCRIPTION %I connection %L
147171
PUBLICATION %I with (create_slot = false, slot_name = %L, copy_data = false);',
148-
new_subname, new_connstr, new_pubname, new_pubname);
172+
lname, dst_connstr, lname, lname);
149173
END $$ LANGUAGE plpgsql STRICT;
150174

151175
-- Update metadata according to primary move
152-
CREATE FUNCTION primary_moved() RETURNS TRIGGER AS $$
176+
CREATE FUNCTION part_moved() RETURNS TRIGGER AS $$
153177
DECLARE
154-
cp_logname text := shardman.get_cp_logname(OLD.part_name, OLD.owner, NEW.owner);
155-
my_id int := shardman.get_node_id();
178+
cp_logname text := shardman.get_cp_logname(NEW.part_name, OLD.owner, NEW.owner);
179+
me int := shardman.get_node_id();
180+
prev_src_lname text;
181+
src_next_lname text;
156182
BEGIN
157-
RAISE DEBUG '[SHARDMAN] primary_moved trigger called for part %, owner %->%',
183+
ASSERT NEW.owner != OLD.owner, 'part_moved handles only moved parts';
184+
RAISE DEBUG '[SHARDMAN] part_moved trigger called for part %, owner % -> %',
158185
NEW.part_name, OLD.owner, NEW.owner;
159-
ASSERT NEW.owner != OLD.owner, 'primary_moved handles only moved parts';
160186
ASSERT NEW.nxt = OLD.nxt OR (NEW.nxt IS NULL AND OLD.nxt IS NULL),
161-
'both primary and replica must not be moved in one update';
162-
IF my_id = OLD.owner THEN -- src node
187+
'both part and replica must not be moved in one update';
188+
ASSERT NEW.prv = OLD.prv OR (NEW.prv IS NULL AND OLD.prv IS NULL),
189+
'both part and replica must not be moved in one update';
190+
IF NEW.prv IS NOT NULL THEN
191+
prev_src_lname := shardman.get_data_lname(NEW.part_name, NEW.prv, OLD.owner);
192+
END IF;
193+
IF NEW.nxt IS NOT NULL THEN
194+
src_next_lname := shardman.get_data_lname(NEW.part_name, OLD.owner, NEW.nxt);
195+
END IF;
196+
197+
IF me = OLD.owner THEN -- src node
163198
-- Drop publication & repslot used for copy
164199
PERFORM shardman.drop_repslot_and_pub(cp_logname);
165-
-- On src node, replace its partition with foreign one
166-
PERFORM shardman.replace_usual_part_with_foreign(NEW);
167-
ELSEIF my_id = NEW.owner THEN -- dst node
200+
-- If primary part was moved, replace on src node its partition with
201+
-- foreign one
202+
IF NEW.prv IS NULL THEN
203+
PERFORM shardman.replace_usual_part_with_foreign(NEW);
204+
ELSE
205+
-- On the other hand, if prev replica existed, drop sub for old
206+
-- channel prev -> src
207+
PERFORM shardman.eliminate_sub(src_next_lname);
208+
END IF;
209+
IF NEW.nxt IS NOT NULL THEN
210+
-- If next replica existed, drop pub for old channel src -> next
211+
PERFORM shardman.drop_repslot_and_pub(src_next_lname);
212+
PERFORM shardman.remove_sync_standby(src_next_lname);
213+
END IF;
214+
-- Drop old table anyway;
215+
EXECUTE format('DROP TABLE IF EXISTS %I', NEW.part_name);
216+
217+
ELSEIF me = NEW.owner THEN -- dst node
168218
-- Drop subscription used for copy
169219
PERFORM shardman.eliminate_sub(cp_logname);
170-
-- And replace moved table with foreign one
171-
PERFORM shardman.replace_foreign_part_with_usual(NEW);
172-
ELSE -- other nodes
173-
-- just update foreign server
174-
PERFORM shardman.update_fdw_server(NEW);
220+
-- If primary part was moved, replace moved table with foreign one
221+
IF NEW.prev IS NULL THEN
222+
PERFORM shardman.replace_foreign_part_with_usual(NEW);
223+
END IF;
224+
ELSEIF me = NEW.prv THEN -- node with prev replica
225+
-- Drop pub for old channel prev -> src
226+
PERFORM shardman.drop_repslot_and_pub(prev_src_lname);
227+
PERFORM shardman.remove_sync_standby(prev_src_lname);
228+
ELSEIF me = NEW.nxt THEN -- node with next replica
229+
-- Drop sub for old channel src -> next
230+
PERFORM shardman.eliminate_sub(src_next_lname);
175231
END IF;
232+
233+
-- And update fdw almost everywhere
234+
PERFORM shardman.update_fdw_server(NEW);
176235
RETURN NULL;
177236
END
178237
$$ LANGUAGE plpgsql;
179-
CREATE TRIGGER primary_moved AFTER UPDATE ON shardman.partitions
238+
CREATE TRIGGER part_moved AFTER UPDATE ON shardman.partitions
180239
FOR EACH ROW WHEN (OLD.prv is NULL AND NEW.prv IS NULL -- it is primary
181240
AND OLD.owner != NEW.owner -- and it is really moved
182241
AND OLD.part_name = NEW.part_name) -- sanity check
183-
EXECUTE PROCEDURE primary_moved();
242+
EXECUTE PROCEDURE part_moved();
184243
-- fire trigger only on worker nodes
185-
ALTER TABLE shardman.partitions ENABLE REPLICA TRIGGER primary_moved;
244+
ALTER TABLE shardman.partitions ENABLE REPLICA TRIGGER part_moved;
186245

187246
-- Executed on newtail node, see cr_rebuild_lr
188247
CREATE FUNCTION replica_created_drop_cp_sub(
189248
part_name name, oldtail int, newtail int) RETURNS void AS $$
190249
DECLARE
191250
cp_logname text := shardman.get_cp_logname(part_name, oldtail, newtail);
192251
BEGIN
193-
PERFORM shardman.readonly_replica_on(part_name::regclass);
194252
-- Drop subscription used for copy
195253
PERFORM shardman.eliminate_sub(cp_logname);
196254
END $$ LANGUAGE plpgsql;
@@ -200,39 +258,37 @@ CREATE FUNCTION replica_created_create_data_pub(
200258
part_name name, oldtail int, newtail int) RETURNS void AS $$
201259
DECLARE
202260
cp_logname text := shardman.get_cp_logname(part_name, oldtail, newtail);
203-
oldtail_pubname name := shardman.get_data_pubname(part_name, oldtail);
204-
newtail_subname name := shardman.get_data_subname(part_name, oldtail, newtail);
261+
lname name := shardman.get_data_lname(part_name, oldtail, newtail);
205262
BEGIN
206263
-- Repslot for new data channel. Must be first, since we "cannot create
207264
-- logical replication slot in transaction that has performed writes"
208-
PERFORM shardman.create_repslot(oldtail_pubname);
265+
PERFORM shardman.create_repslot(lname);
209266
-- Drop publication & repslot used for copy
210267
PERFORM shardman.drop_repslot_and_pub(cp_logname);
211268
-- Create publication for new data channel
212-
EXECUTE format('DROP PUBLICATION IF EXISTS %I', oldtail_pubname);
213-
EXECUTE format('CREATE PUBLICATION %I FOR TABLE %I',
214-
oldtail_pubname, part_name);
269+
EXECUTE format('DROP PUBLICATION IF EXISTS %I', lname);
270+
EXECUTE format('CREATE PUBLICATION %I FOR TABLE %I', lname, part_name);
215271
-- Make this channel sync
216-
PERFORM shardman.ensure_sync_standby(newtail_subname);
272+
PERFORM shardman.ensure_sync_standby(lname);
217273
-- Now it is safe to make old tail writable again
218274
PERFORM shardman.readonly_table_off(part_name::regclass);
219275
END $$ LANGUAGE plpgsql;
220276

221-
-- Executed on oldtail node, see cr_rebuild_lr
277+
-- Executed on newtail node, see cr_rebuild_lr
222278
CREATE FUNCTION replica_created_create_data_sub(
223279
part_name name, oldtail int, newtail int) RETURNS void AS $$
224280
DECLARE
225-
oldtail_pubname name := shardman.get_data_pubname(part_name, oldtail);
281+
lname name := shardman.get_data_lname(part_name, oldtail, newtail);
226282
oldtail_connstr text := shardman.get_worker_node_connstr(oldtail);
227-
newtail_subname name := shardman.get_data_subname(part_name, oldtail, newtail);
228283
BEGIN
284+
PERFORM shardman.readonly_replica_on(part_name::regclass);
229285
-- Create subscription for new data channel
230286
-- It should never exist at this moment, but just in case...
231-
PERFORM shardman.eliminate_sub(newtail_subname);
287+
PERFORM shardman.eliminate_sub(lname);
232288
EXECUTE format(
233289
'CREATE SUBSCRIPTION %I connection %L
234290
PUBLICATION %I with (create_slot = false, slot_name = %L, copy_data = false);',
235-
newtail_subname, oldtail_connstr, oldtail_pubname, oldtail_pubname);
291+
lname, oldtail_connstr, lname, lname);
236292
END $$ LANGUAGE plpgsql;
237293

238294
-- TODO
@@ -581,27 +637,26 @@ BEGIN
581637
RETURN format('shardman_copy_%s_%s_%s', part_name, src, dst);
582638
END $$ LANGUAGE plpgsql STRICT;
583639

584-
-- Convention about pub and repslot name used for data replication from part
585-
-- on pub_node node to any part. We don't change pub and repslot while
586-
-- switching subs, so sub node is not included here.
587-
CREATE FUNCTION get_data_pubname(part_name text, pub_node int)
588-
RETURNS name AS $$
589-
BEGIN
590-
RETURN format('shardman_data_%s_%s', part_name, pub_node);
591-
END $$ LANGUAGE plpgsql STRICT;
592-
593-
-- Convention about sub and application_name used for data replication. We do
594-
-- recreate sub while switching pub, so pub node is included here.
595-
-- See comment to replica_created on why we don't reuse subs.
596-
CREATE FUNCTION get_data_subname(part_name text, pub_node int, sub_node int)
640+
-- Convention about pub, repslot, sub and application_name used for data
641+
-- replication. We do recreate sub while switching pub, so pub node is included
642+
-- here, and recreate pub when switching sub, so including both in the name. See
643+
-- comment to replica_created on why we don't reuse pubs and subs.
644+
CREATE FUNCTION get_data_lname(part_name text, pub_node int, sub_node int)
597645
RETURNS name AS $$
598646
BEGIN
599647
RETURN format('shardman_data_%s_%s_%s', part_name, pub_node, sub_node);
600648
END $$ LANGUAGE plpgsql STRICT;
601649

602650
-- Make sure that standby_name is present in synchronous_standby_names. If not,
603651
-- add it via ALTER SYSTEM and SIGHUP postmaster to reread conf.
604-
CREATE FUNCTION ensure_sync_standby(newtail_subname text) RETURNS void as $$
652+
CREATE FUNCTION ensure_sync_standby(standby text) RETURNS void as $$
653+
BEGIN
654+
RAISE DEBUG '[SHARDMAN] imagine standby % added', standby;
655+
END $$ LANGUAGE plpgsql STRICT;
656+
657+
-- Remove 'standby' from in synchronous_standby_names, if it is there, and SIGHUP
658+
-- postmaster.
659+
CREATE FUNCTION remove_sync_standby(standby text) RETURNS void as $$
605660
BEGIN
606-
RAISE DEBUG '[SHARDMAN] imagine standby updated';
661+
RAISE DEBUG '[SHARDMAN] imagine standby % removed', standby;
607662
END $$ LANGUAGE plpgsql STRICT;

src/include/pg_shardman.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ extern char *get_worker_node_connstr(int32 node_id);
4949
extern int32 get_primary_owner(const char *part_name);
5050
extern int32 get_reptail_owner(const char *part_name);
5151
extern int32 get_next_node(const char *part_name, int32 node_id);
52+
extern int32 get_prev_node(const char *part_name, int32 node_id, bool *part_exists);
5253
extern char *get_partition_relation(const char *part_name);
5354

5455
#endif /* PG_SHARDMAN_H */

src/include/shard.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#include "pg_shardman.h"
55

66
extern void create_hash_partitions(Cmd *cmd);
7-
extern void move_primary(Cmd *cmd);
7+
extern void move_part(Cmd *cmd);
88
extern void create_replica(Cmd *cmd);
99

1010
#endif /* SHARD_H */

0 commit comments

Comments
 (0)