First, some terminology:
'shardlord' or 'lord' is the postgres instance and the background worker (bgw)
 spinning on it which manages sharding. In some places it is still called
 'shardmaster' or 'master'.
'worker nodes' or 'workers' are the other nodes, which hold the data.
'sharded table' is a table managed by shardman.
'shard' or 'partition' is any table containing a part of a sharded table.
'primary' is the main partition of a sharded table, i.e. the only writable
 partition.
'replica' is a secondary partition of a sharded table, i.e. a read-only
 partition.
'cluster' is either the whole system of shardlord and workers, or a cluster in
 the PostgreSQL sense; which one applies should be clear from the context.

For quick setup, see the scripts in the bin/ directory. The setup is configured
in the file common.sh. shardman_init.sh performs initdb for the shardlord &
workers, deploys example configs and creates the extension; shardman_start.sh
reinstalls the extension, which is useful for development.

Both the shardlord and the workers require the extension to be built and
installed. We depend on the pg_pathman extension, so it must be installed too.
The PostgreSQL location for building is derived from pg_config; you can also
specify the path to it in the PG_CONFIG var. The whole process of building and
copying the files to the PG server is just:

git clone
cd pg_shardman
make install

To actually install the extension, add pg_shardman and pg_pathman to
shared_preload_libraries, restart the server and run

create extension pg_shardman cascade;

Have a look at the postgresql.conf.common.template and
postgresql.conf.lord.template example configuration files. The former contains
all of shardman's GUCs and the important PostgreSQL GUCs for both shardlord and
workers; the latter is for the shardlord only -- in particular, shardman.master
defines whether the instance is a shardlord or not.

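For illustration, the relevant fragment of a shardlord's postgresql.conf might
look like this (a sketch only; take the actual settings from the template
files, and note that the boolean value for shardman.master is an assumption
here):

shared_preload_libraries = 'pg_pathman, pg_shardman'
shardman.master = on        # this instance is the shardlord
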
Immediately after starting the server with the shardman library preloaded, but
before creating the extension, you will get on the shardlord a warning like

WARNING: pg_shardman library is preloaded on shardlord, but extension is not
         created

This is normal, as we have a kind of circular dependency here: it is pointless
to create the extension without the library, but the library also uses SQL
objects, so the shardlord won't start without the extension installed.

Currently the extension schema is fixed; it is, who would have thought,
'shardman'.

Now you can issue commands to the shardlord. All shardman commands (cmds) you
issue return immediately, because technically they just submit the cmd to the
shardlord, which learns about them and starts the actual execution. At any time
you can cancel the currently executing command by sending SIGUSR1 to the
shardlord. This is not yet implemented as a handy SQL function, but you can use
the cancel_cmd.sh script from the bin/ directory. Every submitted cmd returns a
unique command id, which is used to check the cmd status later by querying the
shardman.cmd_log and shardman.cmd_opts tables:

CREATE TABLE cmd_log (
    id bigserial PRIMARY KEY,
    cmd_type cmd NOT NULL,
    status cmd_status DEFAULT 'waiting' NOT NULL
);
CREATE TABLE cmd_opts (
    id bigserial PRIMARY KEY,
    cmd_id bigint REFERENCES cmd_log(id),
    opt text
);

We will unite them into a convenient view someday. Command status is an enum
with mostly obvious values ('waiting', 'canceled', 'failed', 'in progress',
'success', 'done'). You might wonder what the difference between 'success' and
'done' is. We set the latter when the command is not atomic itself, but
consists of several atomic steps, some of which may have executed successfully
while some failed.

Currently cmd_log can be seen and commands issued only on the shardlord, but
that's easy to change.

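For example, to check the status and saved options of a submitted command
(assuming it returned id 1):

select c.status, c.cmd_type, o.opt
  from shardman.cmd_log c left join shardman.cmd_opts o on o.cmd_id = c.id
  where c.id = 1;
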
Let's get to the actual commands.

add_node(connstring text)
Add a node with the given connstring to the cluster. The node is assigned a
unique id. If the node previously contained shardman state from an old cluster
(not the one managed by the current shardlord), this state will be lost.

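For instance, on the shardlord (the connstring here is a made-up example):

select shardman.add_node('host=worker1 port=5432 dbname=postgres');
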
rm_node(node_id int)
Remove the node from the cluster. Its shardman state will be reset. We don't
delete tables with data and foreign tables, though.

You can see all cluster nodes at any time by examining the shardman.nodes
table:
-- active is the normal state; the others are needed only for proper node
-- addition and removal
CREATE TYPE worker_node_status AS ENUM (
    'active', 'add_in_progress', 'rm_in_progress', 'removed');
CREATE TABLE nodes (
    id serial PRIMARY KEY,
    connstring text NOT NULL UNIQUE,
    worker_status worker_node_status,
    -- While currently we don't support master and worker roles on one node,
    -- potentially a node can be either worker, master or both, so we need
    -- 2 bits. One bool with NULL might be fine, but it seems a bit
    -- counter-intuitive.
    worker bool NOT NULL DEFAULT true,
    master bool NOT NULL DEFAULT false,
    -- cmd by which the node was added
    added_by bigint REFERENCES shardman.cmd_log(id)
);

create_hash_partitions(
    node_id int, relation text, expr text, partitions_count int,
    rebalance bool DEFAULT true)
Hash-shard table 'relation' lying on node 'node_id' by key 'expr', creating
'partitions_count' shards. As you have probably noticed, the signature mirrors
the pathman function of the same name. If 'rebalance' is false, we just
partition the table locally, making the other nodes aware of it. If it is true,
we also immediately run the 'rebalance' function on the table to distribute the
partitions, see below.

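A hypothetical example: shard table 'pt', which lies on node 1, by column 'id'
into 10 partitions and spread them around:

select shardman.create_hash_partitions(1, 'pt', 'id', 10);
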
There are two tables describing the state of sharded tables (no pun intended),
shardman.tables and shardman.partitions:
CREATE TABLE tables (
    relation text PRIMARY KEY, -- table name
    expr text NOT NULL,
    partitions_count int NOT NULL,
    create_sql text NOT NULL, -- sql to create the table
    -- Node on which the table was partitioned at the beginning. Used only
    -- during initial tables inflation to distinguish between the table owner
    -- and other nodes; probably cleaner to keep it in a separate table.
    initial_node int NOT NULL REFERENCES nodes(id)
);
-- A primary shard and its replicas compose a doubly-linked list: nxt refers to
-- the node containing the next replica, prv to the node with the previous
-- replica (or the primary, if we are the first replica). If prv is NULL, this
-- is the primary. We don't number the parts separately, since we are never
-- going to allow several copies of the same partition on one node.
CREATE TABLE partitions (
    part_name text,
    owner int NOT NULL REFERENCES nodes(id), -- node the partition lies on
    prv int REFERENCES nodes(id),
    nxt int REFERENCES nodes(id),
    relation text NOT NULL REFERENCES tables(relation),
    PRIMARY KEY (part_name, owner)
);

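For instance, to see where a shard and its replicas live ('pt_0' here is a
hypothetical pathman-style partition name):

select part_name, owner, prv, nxt
  from shardman.partitions where part_name = 'pt_0';
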
move_part(part_name text, dest int, src int DEFAULT NULL)
Move shard 'part_name' from node 'src' to node 'dest'. If 'src' is NULL, the
primary shard is moved. The cmd fails if there is already a replica of this
shard on 'dest'.

create_replica(part_name text, dest int)
Create a replica of shard 'part_name' on node 'dest'. The cmd fails if there is
already a replica of this shard on 'dest'.

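Continuing the hypothetical example: move the primary of shard 'pt_0' to node 2
and create its replica on node 3:

select shardman.move_part('pt_0', 2);
select shardman.create_replica('pt_0', 3);
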
rebalance(relation text)
Evenly distribute the partitions of table 'relation' across all nodes.
Currently this is a pretty dumb function: it just tries to move each shard once
to a node chosen in round-robin manner, completely ignoring the current
distribution. Since the dest node can already have a replica of a partition, it
is not uncommon to see warnings about failed moves during execution. After
completion the cmd status is 'done', not 'success'.

set_replevel(relation text, replevel int)
Add replicas to the shards of sharded table 'relation' until each one has
'replevel' replicas. Replica deletion is not implemented yet. Note that it is
pointless to set 'replevel' to more than the number of active workers minus
one, since we forbid several replicas of the same shard on one node. Nodes for
replicas are chosen randomly. As in 'rebalance', we are fully oblivious of the
current shard distribution, so you will see a bunch of warnings about failed
replica creation -- one for each time the random choice hits a node that
already has a replica of the shard.
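
A sketch for the running 'pt' example: even out the shard distribution, then
ensure one replica per shard:

select shardman.rebalance('pt');
select shardman.set_replevel('pt', 1);
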
Dropping of sharded tables, as well as replica deletion, is not implemented
yet.

Limitations:
* We can't switch the shardlord for now.
* The shardlord itself can't be a worker node for now.
* ALTER TABLE for sharded tables is not supported.