
Commit 605a856

rm_node works
1 parent 88ae538 commit 605a856

File tree: 4 files changed, +156 -57 lines

  bin/common.sh
  bin/shardman_init.sh
  pg_shardman--0.0.1.sql
  src/pg_shardman.c

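For orientation, a minimal usage sketch of what this commit enables (not part of the diff; the node id 2 is illustrative, and the shardman schema follows the references used in the code below). On the master, removal is requested through the new SQL interface and then executed asynchronously by the shardmaster background worker:

  -- queue a removal command; the shardmaster bgworker picks it up from shardman.cmd_log
  SELECT shardman.rm_node(2);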

bin/common.sh

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ set -e
 # Params
 
 pgpath=~/postgres/install/vanilla/
+pathmanpath=~/postgres/pg_pathman
 
 master_datadir=~/postgres/data1
 master_port=5432

bin/shardman_init.sh

Lines changed: 4 additions & 0 deletions
@@ -3,6 +3,10 @@
 script_dir=`dirname "$(readlink -f "$0")"`
 source "${script_dir}/common.sh"
 
+echo $PATH
+cd $pathmanpath
+USE_PGXS=1 make install
+
 cd "${script_dir}/.."
 make clean
 make install

pg_shardman--0.0.1.sql

Lines changed: 63 additions & 10 deletions
@@ -13,13 +13,33 @@ BEGIN
 END
 $$;
 
+-- active is the normal mode, others needed only for proper node add and removal
+CREATE TYPE node_status AS ENUM ('active', 'add_in_progress', 'rm_in_progress');
+
 -- list of nodes present in the cluster
 CREATE TABLE nodes (
     id serial PRIMARY KEY,
     connstring text,
-    active bool NOT NULL -- if false, we haven't yet finished adding it
+    status node_status NOT NULL
 );
 
+-- Master is removing us, so reset our state, removing all subscriptions. A
+-- tricky part: we can't DROP SUBSCRIPTION here, because that would mean
+-- shooting ourselves (sending SIGTERM to the replication apply worker) in the
+-- foot. So for now we just disable the subscription; the worker will stop at
+-- the end of the transaction. Later we should delete subscriptions fully.
+CREATE FUNCTION rm_node_worker_side() RETURNS TRIGGER AS $$
+BEGIN
+    PERFORM shardman.pg_shardman_cleanup(false);
+    RETURN NULL;
+END
+$$ language plpgsql;
+CREATE TRIGGER rm_node_worker_side AFTER UPDATE ON shardman.nodes
+    FOR EACH ROW WHEN (OLD.status = 'active' AND NEW.status = 'rm_in_progress')
+    EXECUTE PROCEDURE rm_node_worker_side();
+-- fire trigger only on worker nodes
+ALTER TABLE shardman.nodes ENABLE REPLICA TRIGGER rm_node_worker_side;
+
 -- Currently it is used just to store node id, in general we can keep any local
 -- node metadata here. If is ever used extensively, probably hstore suits better.
 CREATE TABLE local_meta (
@@ -29,16 +49,17 @@ CREATE TABLE local_meta (
 INSERT INTO @extschema@.local_meta VALUES ('node_id', NULL);
 
 -- available commands
-CREATE TYPE cmd AS ENUM ('add_node', 'remove_node');
+CREATE TYPE cmd AS ENUM ('add_node', 'rm_node');
 -- command status
 CREATE TYPE cmd_status AS ENUM ('waiting', 'canceled', 'failed', 'in progress', 'success');
 
 CREATE TABLE cmd_log (
     id bigserial PRIMARY KEY,
     cmd_type cmd NOT NULL,
     status cmd_status DEFAULT 'waiting' NOT NULL,
-    -- only for add_node cmd -- generated id for newly added node. Cleaner
-    -- to keep that is separate table...
+    -- only for add_node cmd -- generated id for newly added node. Exists only
+    -- when node adding is in progress or node is active. Cleaner to keep this
+    -- in separate table...
     node_id int REFERENCES nodes(id)
 );
 
@@ -99,7 +120,8 @@ DECLARE
 BEGIN
     SELECT node_id FROM @extschema@.cmd_log INTO n_id WHERE id = cmd_id;
     IF n_id IS NULL THEN
-        INSERT INTO @extschema@.nodes VALUES (DEFAULT, quote_literal(connstring), false)
+        INSERT INTO @extschema@.nodes
+            VALUES (DEFAULT, quote_literal(connstring), 'add_in_progress')
             RETURNING id INTO n_id;
         UPDATE @extschema@.cmd_log SET node_id = n_id WHERE id = cmd_id;
     END IF;
@@ -115,12 +137,25 @@ BEGIN
     EXECUTE format('SELECT EXISTS (SELECT * FROM pg_replication_slots
                    WHERE slot_name=%L)', slot_name) INTO slot_exists;
     IF NOT slot_exists THEN
-        EXECUTE format('SELECT * FROM pg_create_logical_replication_slot(%L, %L)',
+        EXECUTE format('SELECT pg_create_logical_replication_slot(%L, %L)',
                        slot_name, 'pgoutput');
     END IF;
 END
 $$ LANGUAGE plpgsql;
 
+-- Drop replication slot, if it exists
+CREATE FUNCTION drop_repslot(slot_name text) RETURNS void AS $$
+DECLARE
+    slot_exists bool;
+BEGIN
+    EXECUTE format('SELECT EXISTS (SELECT * FROM pg_replication_slots
+                   WHERE slot_name=%L)', slot_name) INTO slot_exists;
+    IF slot_exists THEN
+        EXECUTE format('SELECT pg_drop_replication_slot(%L)', slot_name);
+    END IF;
+END
+$$ LANGUAGE plpgsql;
+
 -- Remove all our logical replication stuff in case of drop extension.
 -- Dropping extension cleanup is not that easy:
 -- - pg offers event triggers sql_drop, dd_command_end and ddl_command_start
@@ -133,10 +168,12 @@ $$ LANGUAGE plpgsql;
 -- is deleting.
 -- - because of that I resort to C function which examines parse tree and if
 -- it is our extension is deleting, it calls plpgsql cleanup func
-CREATE OR REPLACE FUNCTION pg_shardman_cleanup() RETURNS void AS $$
+CREATE OR REPLACE FUNCTION pg_shardman_cleanup(drop_subs bool DEFAULT true)
+    RETURNS void AS $$
 DECLARE
     pub record;
     sub record;
+    rs record;
 BEGIN
     FOR pub IN SELECT pubname FROM pg_publication WHERE pubname LIKE 'shardman_%' LOOP
         EXECUTE format('DROP PUBLICATION %I', pub.pubname);
@@ -145,7 +182,13 @@ BEGIN
         -- we are managing rep slots manually, so we need to detach it beforehand
         EXECUTE format('ALTER SUBSCRIPTION %I DISABLE', sub.subname);
         EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE)', sub.subname);
-        EXECUTE format('DROP SUBSCRIPTION %I', sub.subname);
+        IF drop_subs THEN
+            EXECUTE format('DROP SUBSCRIPTION %I', sub.subname);
+        END IF;
+    END LOOP;
+    FOR rs IN SELECT slot_name FROM pg_replication_slots
+        WHERE slot_name LIKE 'shardman_%' AND slot_type = 'logical' LOOP
+        EXECUTE format('SELECT pg_drop_replication_slot(%L)', rs.slot_name);
     END LOOP;
 END;
 $$ LANGUAGE plpgsql;
@@ -173,8 +216,7 @@ $$ LANGUAGE sql;
 
 -- Interface functions
 
--- TODO: during the initial connection, ensure that nodes id (if any) is not
--- present in the cluster
+-- Add a node. Its state will be reset, all shardman data lost.
 CREATE FUNCTION add_node(connstring text) RETURNS void AS $$
 DECLARE
     c_id int;
@@ -184,3 +226,14 @@ BEGIN
     INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, connstring);
 END
 $$ LANGUAGE plpgsql;
+
+-- Remove node. Its state will be reset, all shardman data lost.
+CREATE FUNCTION rm_node(node_id int) RETURNS void AS $$
+DECLARE
+    c_id int;
+BEGIN
+    INSERT INTO @extschema@.cmd_log VALUES (DEFAULT, 'rm_node')
+        RETURNING id INTO c_id;
+    INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, node_id);
+END
+$$ LANGUAGE plpgsql;
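
As the comment on rm_node_worker_side explains, the worker being removed only disables its shardman subscriptions rather than dropping them (pg_shardman_cleanup(false)). A hedged way to check this on the removed worker, assuming its subscriptions follow the 'shardman_%' naming that the cleanup function matches:

  -- on the worker node after removal: subscriptions remain but are disabled
  SELECT subname, subenabled
  FROM pg_subscription
  WHERE subname LIKE 'shardman_%';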

src/pg_shardman.c

Lines changed: 88 additions & 47 deletions
@@ -49,10 +49,13 @@ static void shardmaster_sigterm(SIGNAL_ARGS);
 static void shardmaster_sigusr1(SIGNAL_ARGS);
 static void check_for_sigterm(void);
 static void pg_shardman_installed_local(void);
+
 static void add_node(Cmd *cmd);
 static int insert_node(const char *connstring, int64 cmd_id);
 static bool node_in_cluster(int id);
-static void activate_node(int64 cmd_id, int node_id);
+
+static void rm_node(Cmd *cmd);
+static bool is_node_active(int node_id);
 
 /* flags set by signal handlers */
 static volatile sig_atomic_t got_sigterm = false;
@@ -135,8 +138,8 @@ _PG_init()
     shardmaster_worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
         BGWORKER_BACKEND_DATABASE_CONNECTION;
     shardmaster_worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
-    /* shardmaster_worker.bgw_restart_time = 10; */
-    shardmaster_worker.bgw_restart_time = BGW_NEVER_RESTART;
+    shardmaster_worker.bgw_restart_time = 10;
+    /* shardmaster_worker.bgw_restart_time = BGW_NEVER_RESTART; */
     sprintf(shardmaster_worker.bgw_library_name, "pg_shardman");
     sprintf(shardmaster_worker.bgw_function_name, "shardmaster_main");
     shardmaster_worker.bgw_notify_pid = 0;
@@ -181,6 +184,8 @@ shardmaster_main(Datum main_arg)
             shmn_elog(LOG, "%s", *opts);
         if (strcmp(cmd->cmd_type, "add_node") == 0)
             add_node(cmd);
+        else if (strcmp(cmd->cmd_type, "rm_node") == 0)
+            rm_node(cmd);
         else
             shmn_elog(FATAL, "Unknown cmd type %s", cmd->cmd_type);
     }
@@ -371,7 +376,7 @@ pg_shardman_installed_local(void)
     {
         installed = false;
         shmn_elog(WARNING, "pg_shardman library is preloaded, but extenstion"
-                  "is not created");
+                  " is not created");
     }
     PopActiveSnapshot();
     CommitTransactionCommand();
@@ -419,14 +424,18 @@ check_for_sigterm(void)
 
 /*
  * Adding node consists of
- * - verifying that the node is not present in the cluster at the moment
- * - extension recreation
- * - repl slot recreation
- * - subscription creation
- * - setting node id
- * - adding node to 'nodes' table
+ * - verifying the node is not 'active' in the cluster, i.e. 'nodes' table
+ * - adding node to the 'nodes' as not active, get its new id
+ * - reinstalling extension
+ * - recreating repslot
+ * - recreating subscription
+ * - setting node id on the node itself
+ * - marking node as active and cmd as success
+ * We do all this stuff to make all actions idempotent, to be able to retry
+ * them in case of any failure.
  */
-static void add_node(Cmd *cmd)
+void
+add_node(Cmd *cmd)
 {
     PGconn *conn = NULL;
     const char *connstring = cmd->opts[0];
@@ -473,7 +482,7 @@ static void add_node(Cmd *cmd)
 
     if (!PQgetisnull(res, 0, 0))
     {
-        /* Node is in cluster. Is it active in our cluster? */
+        /* Node is in cluster. Was it there before we started adding? */
        node_id = atoi(PQgetvalue(res, 0, 0));
        PQclear(res);
        if (node_in_cluster(node_id))
@@ -529,12 +538,24 @@ static void add_node(Cmd *cmd)
                   PQerrorMessage(conn));
         goto attempt_failed;
     }
-    pg_shardman_installed = PQntuples(res) == 1 && !PQgetisnull(res, 0, 0);
+
     PQclear(res);
+    PQfinish(conn);
+
+    /*
+     * Mark add_node cmd as success and node as active, we must do that in
+     * one txn.
+     */
+    sql = psprintf(
+        "update shardman.nodes set status = 'active' where id = %d;"
+        "update shardman.cmd_log set status = 'success' where id = %ld;",
+        node_id, cmd->id);
+    void_spi(sql);
+    pfree(sql);
 
     /* done */
-    PQfinish(conn);
-    activate_node(cmd->id, node_id);
+    elog(INFO, "Node %s successfully added, it is assigned id %d",
+         connstring, node_id);
     return;
 
 attempt_failed: /* clean resources, sleep, check sigusr1 and try again */
@@ -544,6 +565,7 @@ static void add_node(Cmd *cmd)
     PQfinish(conn);
 
     shmn_elog(LOG, "Attempt to execute add_node failed, sleeping and retrying");
+    /* TODO: sleep using waitlatch? */
     pg_usleep(shardman_cmd_retry_naptime * 1000L);
 }
 
@@ -581,53 +603,72 @@ insert_node(const char *connstring, int64 cmd_id)
 }
 
 /*
- * Returns true, if node 'id' is active in our cluster, false otherwise.
+ * Returns true, if node 'id' is in the cluster and not in add_in_progress state
  */
 static bool
 node_in_cluster(int id)
 {
-    int e;
-    const char *sql = "select id from shardman.nodes where active;";
-    bool res = false;
-    HeapTuple tuple;
-    TupleDesc rowdesc;
-    uint64 i;
-    bool isnull;
+    char *sql = psprintf(
+        "select id from shardman.nodes where id = %d and status != 'add_in_progress';",
+        id);
+    bool res;
 
     SPI_PROLOG;
-    e = SPI_execute(sql, true, 0);
-    if (e < 0)
+    if (SPI_execute(sql, true, 0) < 0)
         shmn_elog(FATAL, "Stmt failed: %s", sql);
+    pfree(sql);
+    res = SPI_processed == 1;
 
-    rowdesc = SPI_tuptable->tupdesc;
-    for (i = 0; i < SPI_processed; i++)
-    {
-        tuple = SPI_tuptable->vals[i];
-        if (id == DatumGetInt32(SPI_getbinval(tuple, rowdesc,
-                                              SPI_fnumber(rowdesc, "id"),
-                                              &isnull)))
-            res = true;
-    }
     SPI_EPILOG;
-
     return res;
 }
 
 /*
- * Mark add_node cmd as success and node as active, we must do that in one txn
+ * Remove node, losing all data on it. We
+ * - ensure that there is an active node with the given id in the cluster
+ * - mark node as rm_in_progress and commit so this reaches node via LR
+ * - wait a bit to let it unsubscribe
+ * - drop replication slot, remove node row and mark cmd as success
+ * Everything is idempotent. Note that we are not allowed to remove repl slot
+ * when the walsender connection is alive, that's why we sleep here.
  */
-void activate_node(int64 cmd_id, int node_id)
+void
+rm_node(Cmd *cmd)
 {
-    int e;
-    char *sql = psprintf(
-        "update shardman.nodes set active = true where id = %d;"
-        "update shardman.cmd_log set status = 'success' where id = %ld;",
-        node_id, cmd_id);
+    int node_id = atoi(cmd->opts[0]);
+    char *sql;
 
-    SPI_PROLOG;
-    e = SPI_exec(sql, 0);
+    if (!node_in_cluster(node_id))
+    {
+        shmn_elog(WARNING, "node %d not in cluster, won't rm it.", node_id);
+        update_cmd_status(cmd->id, "failed");
+        return;
+    }
+
+    sql = psprintf(
+        "update shardman.nodes set status = 'rm_in_progress' where id = %d;",
+        node_id);
+    void_spi(sql);
    pfree(sql);
-    if (e < 0)
-        shmn_elog(FATAL, "Stmt failed: %s", sql);
-    SPI_EPILOG;
+
+    /* Let node drop the subscription */
+    pg_usleep(2 * 1000000L);
+
+    /*
+     * It is extremely unlikely that the node still keeps the walsender process
+     * connected but ignored our node status update, so this should succeed.
+     * If not, bgw exits, but postmaster will restart us to try again.
+     * TODO: at this stage, user can't cancel command at all, this should be
+     * fixed.
+     */
+    sql = psprintf(
+        "select shardman.drop_repslot('shardman_meta_sub_%d');"
+        /* keep the cmd_log fk constraint quiet */
+        "update shardman.cmd_log set node_id = null where node_id = %d;"
+        "delete from shardman.nodes where id = %d;"
+        "update shardman.cmd_log set status = 'success' where id = %ld;",
+        node_id, node_id, node_id, cmd->id);
+    void_spi(sql);
+    pfree(sql);
+    elog(INFO, "Node %d successfully removed", node_id);
 }
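
A hedged sketch of how the removal could be observed on the master afterwards, based on the cmd_log table and the 'shardman_meta_sub_<id>' slot naming used above (column lists and final states are as defined in this commit; exact timing depends on the sleeps in rm_node):

  -- the rm_node command should end up with status 'success'
  -- (or 'failed' if the node was not an active cluster member)
  SELECT id, cmd_type, status, node_id FROM shardman.cmd_log ORDER BY id DESC;

  -- the node's row is gone and its logical replication slot has been dropped
  SELECT id, connstring, status FROM shardman.nodes;
  SELECT slot_name, active FROM pg_replication_slots WHERE slot_name LIKE 'shardman_%';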
