
Commit 33ec057

C part for movempart done.
We also now delete existing tables while doing create_hash_partitions; this should probably be added as an option to pathman.
1 parent: b68f1de

5 files changed: +198, -17 lines

bin/shardman_start.sh

Lines changed: 1 addition & 1 deletion

@@ -9,7 +9,7 @@ make clean
 restart_nodes # make sure nodes run
 # first workers, then master
 for port in "${worker_ports[@]}" $master_port; do
-    psql -p $port -c "drop extension if exists pg_shardman;"
+    psql -p $port -c "drop extension if exists pg_shardman cascade;"
 done

 restart_nodes

pg_shardman--0.0.1.sql

Lines changed: 12 additions & 6 deletions

@@ -59,16 +59,24 @@ CREATE TABLE tables (
 );

 -- On adding new table, create this table on non-owner nodes using provided sql
--- and partition it.
+-- and partition it. We destroy all existing tables with needed names.
 CREATE FUNCTION new_table_worker_side() RETURNS TRIGGER AS $$
+DECLARE
+	partition_names text[];
+	pname text;
 BEGIN
 	IF NEW.initial_node != (SELECT shardman.get_node_id()) THEN
-		EXECUTE format ('DROP TABLE IF EXISTS %I CASCADE;', NEW.relation);
+		EXECUTE format('DROP TABLE IF EXISTS %I CASCADE;', NEW.relation);
+		partition_names :=
+			(SELECT ARRAY(SELECT part_name FROM shardman.gen_part_names(
+				NEW.relation, NEW.partitions_count)));
+		FOREACH pname IN ARRAY partition_names LOOP
+			EXECUTE format('DROP TABLE IF EXISTS %I', pname);
+		END LOOP;
 		EXECUTE format('%s', NEW.create_sql);
 		EXECUTE format('select create_hash_partitions(%L, %L, %L, true, %L);',
 					   NEW.relation, NEW.expr, NEW.partitions_count,
-					   (SELECT ARRAY(SELECT part_name FROM shardman.gen_part_names(
-						   NEW.relation, NEW.partitions_count))));
+					   partition_names);
 	END IF;
 	RETURN NULL;
 END

@@ -424,8 +432,6 @@ BEGIN
 	RETURN NULL;
 END;
 $$ LANGUAGE plpgsql;
-CREATE FUNCTION ae_lock_table(relation regclass) RETURNS void
-	AS 'pg_shardman' LANGUAGE C;
 -- And make it writable again
 CREATE FUNCTION readonly_table_off(relation regclass)
 	RETURNS void AS $$
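
For illustration, here is roughly what the worker-side trigger ends up executing on a non-owner node; the table pt, the expression id, the partition count 4 and the names pt_0 .. pt_3 are made-up example values (the real statements come from NEW.create_sql and shardman.gen_part_names):

    -- drop a conflicting local table and any leftovers under the generated
    -- partition names (example names; gen_part_names supplies the real ones)
    DROP TABLE IF EXISTS pt CASCADE;
    DROP TABLE IF EXISTS pt_0;
    DROP TABLE IF EXISTS pt_1;
    DROP TABLE IF EXISTS pt_2;
    DROP TABLE IF EXISTS pt_3;
    -- recreate the parent from the stored create_sql, then let pg_pathman
    -- partition it, reusing the precomputed partition names
    CREATE TABLE pt (id int, payload text);
    SELECT create_hash_partitions('pt', 'id', 4, true,
                                  ARRAY['pt_0', 'pt_1', 'pt_2', 'pt_3']);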

src/include/pg_shardman.h

Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ extern bool shardman_master;
 extern char *shardman_master_dbname;
 extern char *shardman_master_connstring;
 extern int shardman_cmd_retry_naptime;
+extern int shardman_poll_interval;

 typedef struct Cmd
 {

src/pg_shardman.c

Lines changed: 16 additions & 0 deletions

@@ -56,6 +56,7 @@ bool shardman_master;
 char *shardman_master_dbname;
 char *shardman_master_connstring;
 int shardman_cmd_retry_naptime;
+int shardman_poll_interval;

 /* Just global vars. */
 /* Connection to local server for LISTEN notifications. Is is global for easy

@@ -70,6 +71,7 @@ void
 _PG_init()
 {
 	BackgroundWorker shardmaster_worker;
+	char *desc;

 	if (!process_shared_preload_libraries_in_progress)
 	{

@@ -121,6 +123,20 @@ _PG_init()
 							0,
 							NULL, NULL, NULL);

+	desc = "Unfortunately, some actions are not yet implemented using proper "
+		"notifications and we need to poll the target node to learn progress. "
+		"This variable specifies how often (in milliseconds) we do that.";
+	DefineCustomIntVariable("shardman.poll_interval",
+							desc,
+							NULL,
+							&shardman_poll_interval,
+							10000,
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL, NULL, NULL);
+
 	if (shardman_master)
 	{
 		/* register shardmaster */
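
Since shardman.poll_interval is declared PGC_SIGHUP, it can be tuned without restarting the server; a minimal sketch (the 2000 ms value is just an example):

    -- poll the target node every 2 seconds instead of the default 10000 ms;
    -- a reload is enough because the GUC is PGC_SIGHUP
    ALTER SYSTEM SET shardman.poll_interval = 2000;
    SELECT pg_reload_conf();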

src/shard.c

Lines changed: 168 additions & 10 deletions

@@ -7,6 +7,9 @@
  */
 #include "postgres.h"
 #include "libpq-fe.h"
+#include "access/xlogdefs.h"
+#include "utils/pg_lsn.h"
+#include "utils/builtins.h"
 #include "lib/ilist.h"

 #include <unistd.h>

@@ -40,11 +43,15 @@ typedef enum
 	EXECMOVEMPART_DONE /* the work is done, never invoke me again */
 } ExecMoveMPartRes;

-/* Current step of 1 master partition move */
+/*
+ * Current step of 1 master partition move. See comments to corresponding
+ * funcs, e.g. start_tablesync.
+ */
 typedef enum
 {
 	MOVEMPARTSTEP_START_TABLESYNC,
-	MOVEMPARTSTEP_WAIT_TABLESYNC
+	MOVEMPARTSTEP_START_FINALSYNC,
+	MOVEMPARTSTEP_FINALIZE
 } MoveMPartStep;

 typedef struct

@@ -72,7 +79,12 @@ typedef struct
 	char *src_create_pub_and_rs_sql; /* create publ and repslot on src */
 	char *relation; /* name of sharded relation */
 	char *dst_create_tab_and_sub_sql; /* create table and sub on dst */
+	char *substate_sql; /* get current state of subscription */
+	char *readonly_sql; /* make src table read-only */
+	char *received_lsn_sql; /* get last received lsn on dst */
+	char *update_metadata_sql;

+	XLogRecPtr sync_point; /* when dst reached this point, it is synced */
 	MoveMPartStep curstep; /* current step */
 	ExecMoveMPartRes exec_res; /* result of the last iteration */
 	MoveMPartRes res; /* result of the whole move */

@@ -92,6 +104,8 @@ static int calc_timeout(slist_head *timeout_states);
 static void epoll_subscribe(int epfd, MoveMPartState *mmps);
 static void exec_move_mpart(MoveMPartState *mmps);
 static int start_tablesync(MoveMPartState *mmpts);
+static int start_finalsync(MoveMPartState *mmpts);
+static int finalize(MoveMPartState *mmpts);
 static int ensure_pqconn(MoveMPartState *mmpts, int nodes);
 static int ensure_pqconn_intern(PGconn **conn, const char *connstr,
 								MoveMPartState *mmps);

@@ -229,7 +243,7 @@ create_hash_partitions(Cmd *cmd)
  * - Sleep & check in connection to the dest waiting for completion of the
  *   initial sync. Later this should be substituted with listen/notify.
  * - When done, lock writes (better lock reads too) on source and remember
- *   current wal lsn on it.
+ *   pg_current_wal_lsn() on it.
  * - Now final sync has started, remember that at least in ram.
  * - Sleep & check in connection to dest waiting for completion of final sync,
  *   i.e. when received_lsn is equal to remembered lsn on src.

@@ -346,6 +360,21 @@ init_mmp_state(MoveMPartState *mmps, const char *part_name, int32 dst_node)
 		mmps->part_name, mmps->relation,
 		mmps->logname,
 		mmps->logname, mmps->src_connstr, mmps->logname, mmps->logname);
+	mmps->substate_sql = psprintf(
+		"select srsubstate from pg_subscription_rel srel join pg_subscription"
+		" s on srel.srsubid = s.oid where subname = '%s';",
+		mmps->logname
+		);
+	mmps->readonly_sql = psprintf(
+		"select shardman.readonly_table_on('%s')", mmps->part_name
+		);
+	mmps->received_lsn_sql = psprintf(
+		"select received_lsn from pg_stat_subscription where subname = '%s'",
+		mmps->logname
+		);
+	mmps->update_metadata_sql = psprintf(
+		"update shardman.partitions set owner = %d where part_name = '%s';",
+		mmps->dst_node, mmps->part_name);

 	mmps->curstep = MOVEMPARTSTEP_START_TABLESYNC;
 	mmps->res = MOVEMPART_IN_PROGRESS;

@@ -405,7 +434,6 @@ move_mparts(MoveMPartState *mmpss, int nparts)
 	if ((epfd = epoll_create1(0)) == -1)
 		shmn_elog(FATAL, "epoll_create1 failed");

-	/* TODO: check for signals */
 	while (unfinished_moves > 0 && !got_sigusr1 && !got_sigterm)
 	{
 		timeout = calc_timeout(&timeout_states);

@@ -567,13 +595,13 @@ exec_move_mpart(MoveMPartState *mmps)
 	{
 		if (start_tablesync(mmps) == -1)
 			return;
-		else
-			mmps->curstep = MOVEMPARTSTEP_WAIT_TABLESYNC;
 	}
-
-	shmn_elog(DEBUG1, "Partition %s is moved", mmps->part_name);
-	mmps->res = MOVEMPART_SUCCESS;
-	mmps->exec_res = EXECMOVEMPART_DONE;
+	if (mmps->curstep == MOVEMPARTSTEP_START_FINALSYNC)
+	{
+		if (start_finalsync(mmps) == -1)
+			return;
+	}
+	finalize(mmps);
 }

 /*

@@ -625,6 +653,136 @@ start_tablesync(MoveMPartState *mmps)
 	shmn_elog(DEBUG1, "mmp %s: table & sub created on dst, tablesync started",
 			  mmps->part_name);

+	mmps->curstep = MOVEMPARTSTEP_START_FINALSYNC;
+	return 0;
+}
+
+/*
+ * - wait until initial sync is done;
+ * - make src read only and save its pg_current_wal_lsn() in mmps;
+ * - now we are ready to wait for the final sync.
+ * Returns -1 if anything goes wrong and 0 otherwise. The current wal lsn is
+ * saved in mmps.
+ */
+int
+start_finalsync(MoveMPartState *mmps)
+{
+	PGresult *res;
+	int ntups;
+	char substate;
+	char *sync_point;
+
+	if (ensure_pqconn(mmps, ENSURE_PQCONN_SRC | ENSURE_PQCONN_DST) == -1)
+		return -1;
+
+	res = PQexec(mmps->dst_conn, mmps->substate_sql);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		shmn_elog(NOTICE, "Failed to learn sub status on dst: %s",
+				  PQerrorMessage(mmps->dst_conn));
+		reset_pqconn_and_res(&mmps->dst_conn, res);
+		configure_retry(mmps, shardman_cmd_retry_naptime);
+		return -1;
+	}
+	ntups = PQntuples(res);
+	if (ntups != 1)
+	{
+		shmn_elog(WARNING, "mmp %s: num of subrels != 1", mmps->part_name);
+		/*
+		 * Since several or 0 subrels is an absolutely wrong situation, we
+		 * start from the beginning.
+		 */
+		mmps->curstep = MOVEMPARTSTEP_START_TABLESYNC;
+		configure_retry(mmps, shardman_cmd_retry_naptime);
+		return -1;
+	}
+	substate = PQgetvalue(res, 0, 0)[0];
+	if (substate != 'r')
+	{
+		shmn_elog(DEBUG1, "mmp %s: init sync is not yet finished, its state"
+				  " is %c", mmps->part_name, substate);
+		configure_retry(mmps, shardman_poll_interval);
+		return -1;
+	}
+	shmn_elog(DEBUG1, "mmp %s: init sync finished", mmps->part_name);
+	PQclear(res);
+
+	res = PQexec(mmps->src_conn, mmps->readonly_sql);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		shmn_elog(NOTICE, "Failed to make src table read only: %s",
+				  PQerrorMessage(mmps->src_conn));
+		reset_pqconn_and_res(&mmps->src_conn, res);
+		configure_retry(mmps, shardman_cmd_retry_naptime);
+		return -1;
+	}
+	shmn_elog(DEBUG1, "mmp %s: src made read only", mmps->part_name);
+	PQclear(res);
+
+	res = PQexec(mmps->src_conn, "select pg_current_wal_lsn();");
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		shmn_elog(NOTICE, "Failed to get current lsn on src: %s",
+				  PQerrorMessage(mmps->src_conn));
+		reset_pqconn_and_res(&mmps->src_conn, res);
+		configure_retry(mmps, shardman_cmd_retry_naptime);
+		return -1;
+	}
+	sync_point = PQgetvalue(res, 0, 0);
+	mmps->sync_point = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid,
+										CStringGetDatum(sync_point)));
+	shmn_elog(DEBUG1, "mmp %s: sync lsn is %s", mmps->part_name, sync_point);
+	PQclear(res);
+
+	mmps->curstep = MOVEMPARTSTEP_FINALIZE;
+	return 0;
+}
+
+/*
+ * Wait until final sync is done and update metadata. Returns -1 if anything
+ * goes wrong and 0 otherwise.
+ */
+int
+finalize(MoveMPartState *mmps)
+{
+	PGresult *res;
+	XLogRecPtr received_lsn;
+	char *received_lsn_str;
+
+	if (ensure_pqconn(mmps, ENSURE_PQCONN_DST) == -1)
+		return -1;
+
+	res = PQexec(mmps->dst_conn, mmps->received_lsn_sql);
+	if (PQresultStatus(res) != PGRES_TUPLES_OK)
+	{
+		shmn_elog(NOTICE, "Failed to learn received_lsn on dst: %s",
+				  PQerrorMessage(mmps->dst_conn));
+		reset_pqconn_and_res(&mmps->dst_conn, res);
+		configure_retry(mmps, shardman_cmd_retry_naptime);
+		return -1;
+	}
+	received_lsn_str = PQgetvalue(res, 0, 0);
+	shmn_elog(DEBUG1, "mmp %s: received_lsn is %s", mmps->part_name,
+			  received_lsn_str);
+	received_lsn = DatumGetLSN(DirectFunctionCall1Coll(
+								   pg_lsn_in, InvalidOid,
+								   CStringGetDatum(received_lsn_str)));
+	PQclear(res);
+	if (received_lsn < mmps->sync_point)
+	{
+		shmn_elog(DEBUG1, "mmp %s: final sync is not yet finished,"
+				  " received_lsn is %lu, but we wait for %lu",
+				  mmps->part_name, received_lsn, mmps->sync_point);
+		configure_retry(mmps, shardman_poll_interval);
+		return -1;
+	}
+
+	void_spi(mmps->update_metadata_sql);
+
+	shmn_elog(DEBUG1, "Partition %s successfully moved", mmps->part_name);
+	mmps->res = MOVEMPART_SUCCESS;
+	mmps->exec_res = EXECMOVEMPART_DONE;
 	return 0;
 }
