@@ -252,84 +252,101 @@ CREATE TRIGGER part_moved AFTER UPDATE ON shardman.partitions
252
252
ALTER TABLE shardman .partitions ENABLE REPLICA TRIGGER part_moved;
253
253
254
254
255
-- Partition removed: drop LR channel and promote replica if primary was
-- removed. Since for now we support only 2 copies (1 replica), we promote
-- replica immediately if needed. Case with several replicas is much more
-- complex because we need to rebuild LR channels, so later we will have
-- separate cmd promote_replica(), while part deletion will just perform
-- cleanup. Here we do nothing if we are removing the last copy of data, the
-- caller is responsible for tracking that.
--
-- Fired AFTER DELETE, once per removed row of shardman.partitions, on both
-- shardlord and workers. Each node reacts only to its own role in the
-- removed primary<->replica pair (OLD.owner / OLD.prv / OLD.nxt).
CREATE FUNCTION part_removed() RETURNS TRIGGER AS $$
DECLARE
	replica_removed bool := OLD.prv IS NOT NULL; -- replica or primary removed?
	-- if primary removed, is there replica that we will promote?
	replica_exists bool := OLD.nxt IS NOT NULL;
	prim_repl_lname text; -- channel between primary and replica
	me int := shardman.my_id();
	new_primary shardman.partitions;
	drop_slot_delay int := 2; -- seconds to let the other side drop its sub first
BEGIN
	RAISE DEBUG '[SHMN] part_removed trigger called for part %, owner %',
		OLD.part_name, OLD.owner;

	ASSERT (OLD.prv IS NULL OR OLD.nxt IS NULL), 'We currently do not support redundancy level > 2';

	-- get log channel name and part we will promote, if any
	IF replica_removed THEN
		prim_repl_lname := shardman.get_data_lname(OLD.part_name, OLD.prv, OLD.owner);
	ELSE -- Primary is removed
		IF replica_exists THEN -- Primary removed, and it has replica
			prim_repl_lname := shardman.get_data_lname(OLD.part_name, OLD.owner,
													   OLD.nxt);
			-- This replica is new primary
			SELECT * FROM shardman.partitions
			WHERE owner = OLD.nxt AND part_name = OLD.part_name INTO new_primary;
			-- whole record nullability seems to be non-working
			ASSERT new_primary.part_name IS NOT NULL;
		END IF;
	END IF;

	IF me = OLD.owner THEN -- part dropped on us
		IF replica_removed THEN -- replica removed on us
			PERFORM shardman.eliminate_sub(prim_repl_lname);
		ELSE -- primary removed on us
			-- BUGFIX: was 'IF replica_exists IS NOT NULL' which is always
			-- true for an initialized bool, so on removal of a replica-less
			-- primary we used to sleep and call drop_repslot_and_pub /
			-- remove_sync_standby / replace_usual_part_with_foreign with NULLs.
			IF replica_exists THEN
				-- If next replica existed, drop pub & rs for data channel
				-- Wait sometime to let replica first remove subscription
				PERFORM pg_sleep(drop_slot_delay);
				PERFORM shardman.drop_repslot_and_pub(prim_repl_lname);
				PERFORM shardman.remove_sync_standby(prim_repl_lname);
				-- replace removed table with foreign one on promoted replica
				PERFORM shardman.replace_usual_part_with_foreign(new_primary);
			END IF;
		END IF;
		-- Drop old table anyway
		EXECUTE format('DROP TABLE IF EXISTS %I', OLD.part_name);
	ELSEIF me = OLD.prv THEN -- node with primary for which replica was dropped
		-- Wait sometime to let other node first remove subscription
		PERFORM pg_sleep(drop_slot_delay);
		-- Drop pub & rs for data channel
		PERFORM shardman.drop_repslot_and_pub(prim_repl_lname);
		PERFORM shardman.remove_sync_standby(prim_repl_lname);
	ELSEIF me = OLD.nxt THEN -- node with replica for which primary was dropped
		-- Drop sub for data channel
		PERFORM shardman.eliminate_sub(prim_repl_lname);
		-- This replica is promoted to primary node, so drop trigger disabling
		-- writes to the table and replace fdw with normal part
		PERFORM shardman.readonly_replica_off(OLD.part_name);
		-- Replace FDW with local partition
		PERFORM shardman.replace_foreign_part_with_usual(new_primary);
	END IF;

	-- Repoint fdw servers at the promoted replica. Only meaningful when a
	-- primary was removed AND a replica exists to take over; without the
	-- replica_exists guard new_primary is a NULL record here.
	IF NOT replica_removed AND replica_exists AND shardman.me_worker() THEN
		-- update fdw almost everywhere
		PERFORM shardman.update_fdw_server(new_primary);
	END IF;

	IF shardman.me_lord() THEN
		-- update partitions table: promote replica immediately after primary
		-- removal or remove link to dropped replica.
		IF replica_removed THEN
			UPDATE shardman.partitions SET nxt = NULL
			WHERE owner = OLD.prv AND part_name = OLD.part_name;
		ELSE
			UPDATE shardman.partitions SET prv = NULL
			WHERE owner = OLD.nxt AND part_name = OLD.part_name;
		END IF;
	END IF;

	RETURN NULL;
END
$$ LANGUAGE plpgsql;
327
344
328
-- Row-level trigger: react to each partition record deletion.
CREATE TRIGGER part_removed AFTER DELETE ON shardman.partitions
	FOR EACH ROW
	EXECUTE PROCEDURE part_removed();
-- fire trigger on both shardlord and worker nodes
ALTER TABLE shardman.partitions ENABLE ALWAYS TRIGGER part_removed;
333
350
334
351
335
352
0 commit comments