@@ -54,7 +54,7 @@ ALTER TABLE shardman.tables ENABLE REPLICA TRIGGER new_table_worker_side;
54
54
CREATE FUNCTION new_table_master_side () RETURNS TRIGGER AS $$
55
55
BEGIN
56
56
INSERT INTO shardman .partitions
57
- SELECT part_name, 0 , NULL , NULL , NEW .relation AS relation, NEW . initial_node AS owner
57
+ SELECT part_name, NEW . initial_node AS owner , NULL , NULL , NEW .relation AS relation
58
58
FROM (SELECT part_name FROM shardman .gen_part_names (
59
59
NEW .relation , NEW .partitions_count ))
60
60
AS partnames;
@@ -68,21 +68,18 @@ CREATE TRIGGER new_table_master_side AFTER INSERT ON shardman.tables
68
68
-- Partitions
69
69
-- ----------------------------------------------------------
70
70
71
- -- Primary shard and its replicas compose a doubly-linked list with 0 shard in
72
- -- the beginning.
71
+ -- Primary shard and its replicas compose a doubly-linked list: nxt refers to
72
+ -- the node containing next replica, prv to node with previous replica (or
73
+ -- primary, if we are the first replica). If prv is NULL, this is primary
74
+ -- replica. We don't number parts separately since we are not ever going to
75
+ -- allow several copies of the same partition on one node.
73
76
-- One row per copy of a partition. Copies of the same partition (primary and
-- its replicas) are chained through prv/nxt, both of which hold node ids.
CREATE TABLE partitions (
    part_name text,
    -- node on which partition lies
    owner int NOT NULL,
    -- node with the previous copy in the chain; NULL means this row is the
    -- primary
    prv int,
    -- node with the next copy in the chain
    nxt int,
    relation text NOT NULL,
    -- at most one copy of a partition per node
    PRIMARY KEY (part_name, owner),
    FOREIGN KEY (owner) REFERENCES nodes(id),
    FOREIGN KEY (prv) REFERENCES nodes(id),
    FOREIGN KEY (nxt) REFERENCES nodes(id),
    FOREIGN KEY (relation) REFERENCES tables(relation)
);
87
84
88
85
-- ----------------------------------------------------------
@@ -96,43 +93,31 @@ CREATE TABLE partitions (
96
93
-- it.
97
94
CREATE FUNCTION new_primary() RETURNS TRIGGER AS $$
DECLARE
    me int;
BEGIN
    -- Row-level AFTER INSERT handler for rows describing a primary partition.
    RAISE DEBUG ' [SHARDMAN] new_primary trigger called for part %, owner %',
        NEW.part_name, NEW.owner;
    me := shardman.get_node_id();
    -- Nodes that do not own the part hand the row over to
    -- replace_usual_part_with_foreign; the owner does nothing.
    IF NEW.owner != me THEN
        PERFORM shardman.replace_usual_part_with_foreign(NEW);
    END IF;
    -- result of an AFTER row trigger is ignored
    RETURN NULL;
END
$$ LANGUAGE plpgsql;

-- Rows with NULL prv are primaries; replicas are handled by another trigger.
CREATE TRIGGER new_primary AFTER INSERT ON shardman.partitions
    FOR EACH ROW WHEN (NEW.prv IS NULL) EXECUTE PROCEDURE new_primary();
-- fire trigger only on worker nodes
ALTER TABLE shardman.partitions ENABLE REPLICA TRIGGER new_primary;
109
108
110
- -- Replace foreign table-partition with local. The latter must exist!
111
- -- Foreign table will be dropped.
112
- CREATE FUNCTION replace_foreign_part_with_usual (part partitions)
113
- RETURNS void AS $$
114
- DECLARE
115
- fdw_part_name name;
116
- BEGIN
117
- ASSERT to_regclass(part .part_name ) IS NOT NULL ;
118
- SELECT shardman .get_fdw_part_name (part .part_name ) INTO fdw_part_name;
119
- EXECUTE format(' SELECT replace_hash_partition(%L, %L);' ,
120
- fdw_part_name, part .part_name );
121
- EXECUTE format(' DROP FOREIGN TABLE %I;' , fdw_part_name);
122
- END $$ LANGUAGE plpgsql;
123
-
124
109
-- Update metadata according to primary move
125
110
CREATE FUNCTION primary_moved () RETURNS TRIGGER AS $$
126
111
DECLARE
127
- cp_logname text := format(' shardman_copy_%s_%s_%s' ,
128
- OLD .part_name , OLD .owner , NEW .owner );
112
+ cp_logname text := shardman .get_cp_logname (OLD .part_name , OLD .owner , NEW .owner );
129
113
my_id int := shardman .get_node_id ();
130
114
BEGIN
115
+ RAISE DEBUG ' [SHARDMAN] primary_moved trigger called for part %, owner %->%' ,
116
+ NEW .part_name , OLD .owner , NEW .owner ;
131
117
ASSERT NEW .owner != OLD .owner , ' primary_moved handles only moved parts' ;
132
118
IF my_id = OLD .owner THEN -- src node
133
119
-- Drop publication & repslot used for copy
134
- EXECUTE format(' DROP PUBLICATION IF EXISTS %I' , cp_logname);
135
- PERFORM shardman .drop_repslot (cp_logname, true);
120
+ PERFORM shardman .drop_repslot_and_pub (cp_logname);
136
121
-- On src node, replace its partition with foreign one
137
122
PERFORM shardman .replace_usual_part_with_foreign (NEW);
138
123
ELSEIF my_id = NEW .owner THEN -- dst node
@@ -148,44 +133,75 @@ BEGIN
148
133
END
149
134
$$ LANGUAGE plpgsql;
150
135
CREATE TRIGGER primary_moved AFTER UPDATE ON shardman .partitions
151
- FOR EACH ROW EXECUTE PROCEDURE primary_moved();
136
+ FOR EACH ROW WHEN (OLD .prv is NULL AND NEW .prv IS NULL -- it is primary
137
+ AND OLD .owner != NEW .owner -- and it is really moved
138
+ AND OLD .part_name = NEW .part_name ) -- sanity check
139
+ EXECUTE PROCEDURE primary_moved();
152
140
-- fire trigger only on worker nodes
153
141
ALTER TABLE shardman .partitions ENABLE REPLICA TRIGGER primary_moved;
154
142
155
143
-- Update metadata according to new replica creation.
144
+ -- Old tail part is still read-only when this is called. There are two main jobs
145
+ -- to do: set up LR sync channel between old tail and new replica and update fdw
146
+ -- everywhere. For the former we could configure already existing channel used
147
+ -- for partition copy, but we will not do that, because
148
+ -- * It is not easier than creating new pub & sub: we have to rename pub, drop
149
+ -- and create repslot (there is no way to rename it), rename sub, alter sub's
150
+ -- slot_name, alter sub's publication, probably rename sub application name,
151
+ -- probably run REFRESH (which requires alive pub just as CREATE SUBSCRIPTION)
152
+ -- and hope that everything will be ok. Not sure about refreshing, though -- I
153
+ -- don't know whether it is ok to skip it if tables didn't change. Doc says it
154
+ -- should be executed.
155
+ -- * Since it is not possible to rename repslot and it is not possible to
156
+ -- specify since which lsn start replication, tables must be synced anyway
157
+ -- during these operations, so what is the point of reusing old sub? And copypart
158
+ -- in shard.c really cares that tables are synced at this moment and src is
159
+ -- read-only.
156
160
CREATE FUNCTION replica_created() RETURNS TRIGGER AS $$
DECLARE
    -- pub, sub and repslot name that was used for copying the part from the
    -- old tail (NEW.prv) to the new replica (NEW.owner)
    cp_logname text := shardman.get_cp_logname(NEW.part_name, NEW.prv, NEW.owner);
    -- pub and repslot name of the data channel leaving the old tail
    oldtail_pubname name := shardman.get_data_pubname(NEW.part_name, NEW.prv);
    oldtail_connstr text := shardman.get_worker_node_connstr(NEW.prv);
    -- sub name of the data channel on the new tail
    newtail_subname name := shardman.get_data_subname(NEW.part_name, NEW.prv, NEW.owner);
    my_id int := shardman.get_node_id();
BEGIN
    RAISE DEBUG ' [SHARDMAN] replica_created trigger called';
    IF my_id = NEW.prv THEN -- old tail node
        -- Drop publication & repslot used for copy
        PERFORM shardman.drop_repslot_and_pub(cp_logname);
        -- Create publication & repslot for new data channel
        PERFORM shardman.create_repslot(oldtail_pubname);
        EXECUTE format(' DROP PUBLICATION IF EXISTS %I', oldtail_pubname);
        EXECUTE format(' CREATE PUBLICATION %I FOR TABLE %I',
                       oldtail_pubname, NEW.part_name);
        -- Make this channel sync
        PERFORM shardman.ensure_sync_standby(newtail_subname);
        -- Now it is safe to make old tail writable again. The read-only
        -- object is the old tail partition, so it is named through the new
        -- row; a bare "relation" is not a variable in this function and
        -- would fail at run time.
        -- NOTE(review): assumes readonly_table_off takes a regclass --
        -- drop the cast if it accepts a plain name.
        PERFORM shardman.readonly_table_off(NEW.part_name::regclass);
    ELSIF my_id = NEW.owner THEN -- created replica, i.e. new tail node
        -- Drop subscription used for copy
        PERFORM shardman.eliminate_sub(cp_logname);
        -- And create subscription for new data channel
        -- It should never exist at this moment, but just in case...
        PERFORM shardman.eliminate_sub(newtail_subname);
        -- The slot was already created on the old tail under the publication
        -- name, hence create_slot = false and slot_name = publication name.
        EXECUTE format(
            ' CREATE SUBSCRIPTION %I connection %L
            PUBLICATION %I with (create_slot = false, slot_name = %L);',
            newtail_subname, oldtail_connstr, oldtail_pubname, oldtail_pubname);
        -- Now fdw connstring to this part should include only primary and myself
        PERFORM shardman.update_fdw_server(NEW);
    ELSE -- other nodes
        -- just update fdw connstr to add new replica
        PERFORM shardman.update_fdw_server(NEW);
    END IF;
    -- result of an AFTER row trigger is ignored
    RETURN NULL;
END
$$ LANGUAGE plpgsql;

-- Rows with non-NULL prv describe replicas.
CREATE TRIGGER replica_created AFTER INSERT ON shardman.partitions
    FOR EACH ROW WHEN (NEW.prv IS NOT NULL) EXECUTE PROCEDURE replica_created();
-- fire trigger only on worker nodes
ALTER TABLE shardman.partitions ENABLE REPLICA TRIGGER replica_created;
187
204
188
-
189
205
-- Otherwise partitioned tables on worker nodes will not be dropped properly,
190
206
-- see pathman's docs.
191
207
ALTER EVENT TRIGGER pathman_ddl_trigger ENABLE ALWAYS;
@@ -194,8 +210,8 @@ ALTER EVENT TRIGGER pathman_ddl_trigger ENABLE ALWAYS;
194
210
-- Funcs related to fdw
195
211
-- ----------------------------------------------------------
196
212
197
- -- We use _fdw suffix for foreign tables to avoid interleaving with real
198
- -- ones.
213
+ -- Convention: we use _fdw suffix for foreign tables to avoid interleaving with
214
+ -- real ones.
199
215
CREATE FUNCTION get_fdw_part_name (part_name name) RETURNS name AS $$
200
216
BEGIN
201
217
RETURN format(' %s_fdw' , part_name);
@@ -324,6 +340,20 @@ BEGIN
324
340
EXECUTE format(' DROP TABLE %I' , part .part_name );
325
341
END $$ LANGUAGE plpgsql;
326
342
343
-- Put the local table part.part_name in place of the foreign table that
-- currently stands in for this partition, then drop the foreign table.
-- The local table must already exist.
CREATE FUNCTION replace_foreign_part_with_usual(part partitions)
    RETURNS void AS $$
DECLARE
    foreign_name name;
BEGIN
    -- the table we are switching to has to be there
    ASSERT to_regclass(part.part_name) IS NOT NULL;
    foreign_name := shardman.get_fdw_part_name(part.part_name);
    -- the call goes through dynamic SQL with both names passed as literals
    EXECUTE format(' SELECT replace_hash_partition(%L, %L);',
                   foreign_name, part.part_name);
    EXECUTE format(' DROP FOREIGN TABLE %I;', foreign_name);
END $$ LANGUAGE plpgsql;
356
+
327
357
-- Options to postgres_fdw are specified in two places: user & password in user
328
358
-- mapping and everything else in create server. The problem is that we use
329
359
-- single connstring, however user mapping and server doesn't understand this
@@ -493,3 +523,36 @@ BEGIN
493
523
AS range(num)) AS range;
494
524
END
495
525
$$ LANGUAGE plpgsql;
526
+
527
-- Convention about pub, sub and repslot name used for copying part part_name
-- from src node to dst node: shardman_copy_<part_name>_<src>_<dst>.
-- Returns NULL if any argument is NULL (STRICT). The result is coerced to
-- type name, so an over-long value is presumably cut to the identifier length
-- limit.
CREATE FUNCTION get_cp_logname(part_name text, src int, dst int)
    RETURNS name AS $$
BEGIN
    -- No whitespace inside the literal: the result is used verbatim as an
    -- object name.
    RETURN format('shardman_copy_%s_%s_%s', part_name, src, dst);
END $$ LANGUAGE plpgsql STRICT;
534
+
535
-- Convention about pub and repslot name used for data replication from part
-- on pub_node node to any part: shardman_data_<part_name>_<pub_node>. We don't
-- change pub and repslot while switching subs, so sub node is not included
-- here.
-- Returns NULL if any argument is NULL (STRICT).
CREATE FUNCTION get_data_pubname(part_name text, pub_node int)
    RETURNS name AS $$
BEGIN
    -- No whitespace inside the literal: the result doubles as a replication
    -- slot name, and those may contain only lower case letters, digits and
    -- underscores.
    RETURN format('shardman_data_%s_%s', part_name, pub_node);
END $$ LANGUAGE plpgsql STRICT;
543
+
544
-- Convention about sub and application_name used for data replication:
-- shardman_data_<part_name>_<pub_node>_<sub_node>. We do recreate sub while
-- switching pub, so pub node is included here.
-- See comment to replica_created on why we don't reuse subs.
-- Returns NULL if any argument is NULL (STRICT).
CREATE FUNCTION get_data_subname(part_name text, pub_node int, sub_node int)
    RETURNS name AS $$
BEGIN
    -- No whitespace inside the literal, same as the other naming helpers.
    RETURN format('shardman_data_%s_%s_%s', part_name, pub_node, sub_node);
END $$ LANGUAGE plpgsql STRICT;
552
+
553
-- Intended contract: make sure that the given standby name is present in
-- synchronous_standby_names and, if it is not, add it via ALTER SYSTEM and
-- SIGHUP postmaster to reread conf.
-- At the moment this is a placeholder: the configuration is left untouched and
-- only a debug message is emitted.
CREATE FUNCTION ensure_sync_standby(newtail_subname text) RETURNS void AS $$
BEGIN
    RAISE DEBUG ' [SHARDMAN] imagine standby updated';
    RETURN;
END $$ LANGUAGE plpgsql STRICT;
0 commit comments