
Commit ed73124

Readme written.
1 parent 7e7064e commit ed73124

10 files changed (+201, -35 lines)

bin/common.sh

Lines changed: 3 additions & 4 deletions
@@ -70,10 +70,9 @@ function run_demo()
     psql -c "select shardman.add_node('port=5435');"
     psql -c "select shardman.add_node('port=5436');"

-    psql -p 5433 -c "drop table if exists pt_0;" # drop replica
     psql -c "select shardman.create_hash_partitions(2, 'pt', 'id', 6);"

-    # psql -c "select shardman.create_replica('pt_0', 3);"
-    # psql -c "select shardman.create_replica('pt_0', 5);"
-    # psql -c "select shardman.move_part('pt_0', 4, 3);"
+    psql -c "select shardman.create_replica('pt_0', 3);"
+    psql -c "select shardman.create_replica('pt_0', 5);"
+    psql -c "select shardman.move_part('pt_0', 4, 3);"
 }
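
Note: with these calls uncommented, the demo ends up with replicas of shard pt_0 and a moved partition. A quick way to eyeball the result -- a sketch, since the exact node ids depend on your setup -- is to query the metadata table described in readme.txt from the shardlord:

select part_name, owner, prv, nxt from shardman.partitions where relation = 'pt';

Rows with prv IS NULL are primaries; the others are replicas, linked through prv/nxt.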

bin/shardman_init.sh

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ for datadir in $master_datadir "${worker_datadirs[@]}"; do
 done

 cat postgresql.conf.common.template >> ${master_datadir}/postgresql.conf
-cat postgresql.conf.master.template >> ${master_datadir}/postgresql.conf
+cat postgresql.conf.lord.template >> ${master_datadir}/postgresql.conf
 for worker_datadir in "${worker_datadirs[@]}"; do
     cat postgresql.conf.common.template >> ${worker_datadir}/postgresql.conf
     cat postgresql.conf.worker.template >> ${worker_datadir}/postgresql.conf

init.sql

Lines changed: 4 additions & 2 deletions
@@ -59,24 +59,26 @@ CREATE TABLE cmd_opts (
 -- Interface functions

 -- Add a node. Its state will be reset, all shardman data lost.
-CREATE FUNCTION add_node(connstring text) RETURNS void AS $$
+CREATE FUNCTION add_node(connstring text) RETURNS int AS $$
 DECLARE
     c_id int;
 BEGIN
     INSERT INTO @extschema@.cmd_log VALUES (DEFAULT, 'add_node')
         RETURNING id INTO c_id;
     INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, connstring);
+    RETURN c_id;
 END
 $$ LANGUAGE plpgsql;

 -- Remove node. Its state will be reset, all shardman data lost.
-CREATE FUNCTION rm_node(node_id int) RETURNS void AS $$
+CREATE FUNCTION rm_node(node_id int) RETURNS int AS $$
 DECLARE
     c_id int;
 BEGIN
     INSERT INTO @extschema@.cmd_log VALUES (DEFAULT, 'rm_node')
         RETURNING id INTO c_id;
     INSERT INTO @extschema@.cmd_opts VALUES (DEFAULT, c_id, node_id);
+    RETURN c_id;
 END
 $$ LANGUAGE plpgsql;
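
Since add_node() and rm_node() now return the command id instead of void, a client can poll the command's progress directly. A minimal sketch (the returned id depends on the cmd_log sequence; 1 is just an example):

select shardman.add_node('port=5433');            -- returns the cmd id, e.g. 1
select status from shardman.cmd_log where id = 1;  -- 'waiting', 'in progress', 'success', ...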

postgresql.conf.common.template

Lines changed: 14 additions & 4 deletions
@@ -1,13 +1,23 @@
 shared_preload_libraries = 'pg_pathman, pg_shardman'

 # print node ids in log messages
+# TODO: works rarely, we need another way to hook into the logging process
 log_line_prefix = '%m %z'

-# just to suppress logging
-autovacuum = off
-
 log_min_messages = DEBUG1
 # client_min_messages = NOTICE
 client_min_messages = WARNING

-wal_level = logical
+# We use logical replication for sending metadata from shardlord to workers
+# and for data replication.
+wal_level = logical # necessary for logical replication
+# On shardlord, this must be at least the max number of workers + some reserve
+# for initial tablesync. On workers, this must be at least the max number of
+# shards on the node with a 'prev' replica + the max number of shards on the
+# node with a 'next' replica (yes, with overlapping) + some reserve for
+# initial tablesync.
+max_replication_slots = 100
+# The same goes for max_wal_senders: on shardlord it should be equal to
+# max_replication_slots. On workers, it must be at least the max number of
+# shards on the node which have a 'next' replica.
+max_wal_senders = 50
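
To make the sizing rules above concrete with made-up numbers: a shardlord driving 10 workers needs max_replication_slots of at least 10 plus a few spare slots for initial tablesync, and max_wal_senders equal to that; a worker holding 8 shards whose 'prev' replica lives elsewhere and 8 shards whose 'next' replica lives elsewhere needs at least 8 + 8 slots plus the same reserve. The 100 and 50 in the template are simply generous defaults.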

postgresql.conf.lord.template

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+shardman.master = on # is this instance the shardlord?
+shardman.master_dbname = ars # lord's dbname
+shardman.master_connstring = 'port=5432' # lord's connstring
+shardman.cmd_retry_naptime = 500 # sleep milliseconds after failure
+shardman.poll_interval = 500 # long operations poll frequency

postgresql.conf.master.template

Lines changed: 0 additions & 6 deletions
This file was deleted.

postgresql.conf.worker.template

Lines changed: 4 additions & 1 deletion
@@ -1 +1,4 @@
-# worker-specific part
+# At least number of shards with 'prev' replica + reserve for initial tablesync.
+max_logical_replication_workers = 50
+# At least max_logical_replication_workers + 1
+max_worker_processes = 60
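
With the same made-up numbers as above (8 shards having a 'prev' replica), a worker would need max_logical_replication_workers of at least 8 plus a reserve for initial tablesync, and max_worker_processes at least one more than that; 50 and 60 just leave plenty of headroom.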

readme.txt

Lines changed: 166 additions & 12 deletions
@@ -1,20 +1,174 @@
-How to build:
-PostgreSQL location is derived from pg_config, you can also specify path to it
-in PG_CONFIG var.
+First, some terminology:
+'shardlord' or 'lord' is the postgres instance, and the background worker (bgw)
+running on it, which manages sharding. In some places it is still called
+'shardmaster' or 'master'.
+'worker nodes' or 'workers' are the other nodes, which hold the data.
+'sharded table' is a table managed by shardman.
+'shard' or 'partition' is any table containing part of a sharded table.
+'primary' is the main partition of a sharded table, i.e. the only writable
+partition.
+'replica' is a secondary partition of a sharded table, i.e. a read-only partition.
+'cluster' is either the whole system of shardlord and workers, or a cluster in
+the PostgreSQL sense; which one should be clear from the context.
+
+For a quick setup, see the scripts in the bin/ directory. The setup is configured
+in common.sh. shardman_init.sh performs initdb for the shardlord & workers,
+deploys example configs and creates the extension; shardman_start.sh reinstalls
+the extension, which is useful for development.
+
+Both the shardlord and the workers need the extension built and installed. We
+depend on the pg_pathman extension, so it must be installed too.
+The PostgreSQL location for building is derived from pg_config; you can also
+specify the path to it in the PG_CONFIG var. The whole process of building and
+copying the files to the PG server is just:

 git clone
 cd pg_shardman
-make
 make install

-add to postgresql.conf
-shared_preload_libraries = '$libdir/pg_shardman'
+To actually install the extension, add pg_shardman and pg_pathman to
+shared_preload_libraries, restart the server and run
+
+create extension pg_shardman cascade;
+
+Have a look at the postgresql.conf.common.template and postgresql.conf.lord.template
+example configuration files. The former contains all of shardman's GUCs and the
+important PostgreSQL ones for both the shardlord and the workers; the latter is
+for the shardlord only -- in particular, shardman.master defines whether the
+instance is the shardlord or not.
+
+Immediately after starting the server with the shardman library preloaded, but
+before creating the extension, you will see on the shardlord a warning like
+
+WARNING: pg_shardman library is preloaded on shardlord, but extenstion is not
+created
+
+This is normal, as we have a kind of circular dependency here: it is pointless
+to create the extension without the library, and the library also uses SQL
+objects, so the shardlord won't start without the extension installed.
+
+Currently the extension schema is fixed; it is, who would have thought,
+'shardman'.
+
+Now you can issue commands to the shardlord. All shardman commands (cmds) you
+issue return immediately because they technically just submit the cmd to the
+shardlord; it learns about them and starts the actual execution. At any time you
+can cancel the currently executing command by sending SIGUSR1 to the shardlord.
+This is not yet implemented as a handy SQL function, but you can use the
+cancel_cmd.sh script from the bin/ directory. All submitted cmds return a unique
+command id which is used to check the cmd status later by querying the
+shardman.cmd_log and shardman.cmd_opts tables:
+
+CREATE TABLE cmd_log (
+    id bigserial PRIMARY KEY,
+    cmd_type cmd NOT NULL,
+    status cmd_status DEFAULT 'waiting' NOT NULL
+);
+CREATE TABLE cmd_opts (
+    id bigserial PRIMARY KEY,
+    cmd_id bigint REFERENCES cmd_log(id),
+    opt text
+);
+
+We will unite them into a convenient view someday. Command status is an enum
+with mostly obvious values ('waiting', 'canceled', 'failed', 'in progress',
+'success', 'done'). You might wonder what the difference between 'success' and
+'done' is. We set the latter when the command is not atomic itself, but consists
+of several atomic steps, some of which were probably executed successfully and
+some of which failed.
+
+Currently cmd_log can be seen and commands issued only on the shardlord, but
+that's easy to change.
+
+Let's get to the actual commands.
+
+add_node(connstring text)
+Add a node with the given connstring to the cluster. The node is assigned a
+unique id. If the node previously contained shardman state from an old cluster
+(not the one managed by the current shardlord), this state will be lost.
+
+rm_node(node_id int)
+Remove a node from the cluster. Its shardman state will be reset. We don't
+delete tables with data and foreign tables, though.
+
+You can see all cluster nodes at any time by examining the shardman.nodes table:
+-- active is the normal mode, the others are needed only for proper node add and removal
+CREATE TYPE worker_node_status AS ENUM (
+    'active', 'add_in_progress', 'rm_in_progress', 'removed');
+CREATE TABLE nodes (
+    id serial PRIMARY KEY,
+    connstring text NOT NULL UNIQUE,
+    worker_status worker_node_status,
+    -- While currently we don't support master and worker roles on one node,
+    -- potentially a node can be a worker, a master or both, so we need 2 bits.
+    -- One bool with NULL might be fine, but it seems a bit counter-intuitive.
+    worker bool NOT NULL DEFAULT true,
+    master bool NOT NULL DEFAULT false,
+    -- cmd by which the node was added
+    added_by bigint REFERENCES shardman.cmd_log(id)
+);
+
+create_hash_partitions(
+    node_id int, relation text, expr text, partitions_count int,
+    rebalance bool DEFAULT true)
+Hash-shard table 'relation' lying on node 'node_id' by key 'expr', creating
+'partitions_count' shards. As you probably noticed, the signature mirrors
+pathman's function of the same name. If 'rebalance' is false, we just partition
+the table locally, making the other nodes aware of it. If it is true, we also
+immediately run the 'rebalance' function on the table to distribute the
+partitions, see below.
+
+There are two tables describing the state of sharded tables (no pun intended),
+shardman.tables and shardman.partitions:
+CREATE TABLE tables (
+    relation text PRIMARY KEY, -- table name
+    expr text NOT NULL,
+    partitions_count int NOT NULL,
+    create_sql text NOT NULL, -- sql to create the table
+    -- Node on which the table was partitioned at the beginning. Used only during
+    -- initial table inflation to distinguish between the table owner and other
+    -- nodes; probably cleaner to keep it in a separate table.
+    initial_node int NOT NULL REFERENCES nodes(id)
+);
+-- The primary shard and its replicas compose a doubly-linked list: nxt refers to
+-- the node containing the next replica, prv to the node with the previous
+-- replica (or the primary, if we are the first replica). If prv is NULL, this is
+-- the primary replica. We don't number parts separately since we are never going
+-- to allow several copies of the same partition on one node.
+CREATE TABLE partitions (
+    part_name text,
+    owner int NOT NULL REFERENCES nodes(id), -- node on which the partition lies
+    prv int REFERENCES nodes(id),
+    nxt int REFERENCES nodes(id),
+    relation text NOT NULL REFERENCES tables(relation),
+    PRIMARY KEY (part_name, owner)
+);
+
+move_part(part_name text, dest int, src int DEFAULT NULL)
+Move shard 'part_name' from node 'src' to node 'dest'. If 'src' is NULL, the
+primary shard is moved. The cmd fails if there is already a replica of this
+shard on 'dest'.
+
+create_replica(part_name text, dest int)
+Create a replica of shard 'part_name' on node 'dest'. The cmd fails if there is
+already a replica of this shard on 'dest'.
+
+rebalance(relation text)
+Evenly distribute the partitions of table 'relation' across all nodes. Currently
+this is a pretty dumb function: it just tries to move each shard once to a node
+chosen in round-robin manner, completely ignoring the current distribution.
+Since the dest node can already have a replica of the partition, it is not
+uncommon to see warnings about failed moves during execution. After completion
+the cmd status is 'done', not 'success'.

-restart postgres server and run
-drop extension if exists pg_shardman;
-create extension pg_shardman;
+set_replevel(relation text, replevel int)
+Add replicas to the shards of sharded table 'relation' until we reach 'replevel'
+replicas for each one. Replica deletion is not implemented yet. Note that it is
+pointless to set replevel to more than the number of active workers - 1, since
+we don't forbid several replicas on one node. Nodes for replicas are chosen
+randomly. As in 'rebalance', we are fully oblivious of the current shard
+distribution, so you will see a bunch of warnings about failing replica
+creation -- one for each time random chooses a node with an already existing
+replica.

-The master itself can't be worker node for now, because it requires special
-handling of LR channels setup.
+Dropping sharded tables, as well as replica deletion, is not implemented yet.

-ALTER TABLE for sharded tables is not supported for now.
+Limitations:
+* We can't switch the shardlord for now.
+* The shardlord itself can't be a worker node for now.
+* ALTER TABLE for sharded tables is not supported.
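
To tie the commands above together, here is a sketch of a session on the shardlord. The ports, node ids, table name and column are made up, and every call merely enqueues a cmd and returns its id, so check shardman.cmd_log before relying on the result:

select shardman.add_node('port=5433');
select shardman.add_node('port=5434');
select * from shardman.nodes;                              -- wait until both workers are 'active'

-- shard table 'pt' (already created on node 1) by column 'id' into 6 parts
select shardman.create_hash_partitions(1, 'pt', 'id', 6);

select shardman.create_replica('pt_0', 2);                 -- one extra copy of shard pt_0
select shardman.set_replevel('pt', 1);                     -- ensure one replica per shard

select * from shardman.cmd_log order by id;                -- check what happened
select * from shardman.partitions where relation = 'pt';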

shard.sql

Lines changed: 1 addition & 1 deletion
@@ -632,7 +632,7 @@ CREATE FUNCTION reconstruct_table_attrs(relation regclass)
 ------------------------------------------------------------

 -- Drop (locally) all partitions of given table, if they exist
-CREATE OR REPLACE FUNCTION drop_parts(relation text, partitions_count int)
+CREATE FUNCTION drop_parts(relation text, partitions_count int)
 RETURNS void as $$
 DECLARE
     r record;

src/pg_shardman.c

Lines changed: 3 additions & 4 deletions
@@ -152,7 +152,7 @@ _PG_init()
     shardmaster_worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
         BGWORKER_BACKEND_DATABASE_CONNECTION;
     shardmaster_worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
-    shardmaster_worker.bgw_restart_time = 10;
+    shardmaster_worker.bgw_restart_time = shardman_cmd_retry_naptime;
     /* shardmaster_worker.bgw_restart_time = BGW_NEVER_RESTART; */
     sprintf(shardmaster_worker.bgw_library_name, "pg_shardman");
     sprintf(shardmaster_worker.bgw_function_name, "shardmaster_main");
@@ -401,14 +401,13 @@ pg_shardman_installed_local(void)
     if (get_extension_oid("pg_shardman", true) == InvalidOid)
     {
         installed = false;
-        shmn_elog(WARNING, "pg_shardman library is preloaded, but extenstion"
-                  " is not created");
+        shmn_elog(WARNING, "pg_shardman library is preloaded on shardlord, but"
+                  " extenstion is not created");
     }
     PopActiveSnapshot();
     CommitTransactionCommand();

     /* shardmaster won't run without extension */
-    /* TODO: unregister bgw? */
     if (!installed)
         proc_exit(1);
 }

0 commit comments
