 */
#include "postgres.h"
#include "libpq-fe.h"
+ #include "lib/ilist.h"
+
+ #include <time.h>
+ #include <limits.h>
+ #include <sys/epoll.h>

#include "shard.h"
+ #include "timeutils.h"
+
+ typedef enum
+ {
+     MOVEMPART_IN_PROGRESS,
+     MOVEMPART_FAILED,
+     MOVEMPART_SUCCESS
+ } MoveMPartResult;
+
+ /* result of one iteration of processing */
+ typedef enum
+ {
+     EXECMOVEMPART_EPOLL,    /* add me to epoll on fd_to_epoll for EPOLLIN */
+     EXECMOVEMPART_WAKEMEUP, /* wake me up again at waketm */
+     EXECMOVEMPART_DONE      /* the work is done, never invoke me again */
+ } ExecMoveMPartRes;
+
+ typedef struct
+ {
+     const char *part_name;  /* partition name */
+     int32 src_node;         /* node we are moving the partition from */
+     int32 dst_node;         /* node we are moving the partition to */
+     char *src_connstr;
+     char *dst_connstr;
+     struct timespec waketm; /* wake me up at waketm to do the job */
+     /* We need to epoll only the socket to dst, to wait for the copy. */
+     /* exec_move_mpart sets the fd here when it wants to be woken up by epoll. */
+     int fd_to_epoll;
+     int fd_in_epoll_set;    /* socket *currently* in the epoll set; -1 if none */
+     MoveMPartResult result;
+ } MoveMPartState;
+
+ typedef struct
+ {
+     slist_node list_node;
+     MoveMPartState *mmps;
+ } MoveMPartStateNode;
+
+ static void init_mmp_state(MoveMPartState *mmps, const char *part_name,
+                            int32 dst_node);
+ static void move_mparts(MoveMPartState *mmpss, int nparts);
+ static int calc_timeout(slist_head *timeout_states);
+ static ExecMoveMPartRes exec_move_mpart(MoveMPartState *mmps);

/*
 * Steps are:
@@ -124,13 +172,14 @@ create_hash_partitions(Cmd *cmd)
        cmd_canceled(cmd);
}

-
/*
 * Move master partition to specified node. We
 * - Disable subscription on destination, otherwise we can't drop repl slot on
 *   source.
 * - Idempotently create publication and repl slot on source.
- * - Idempotently create table and subscription on destination.
+ * - Idempotently create table and async subscription on destination.
+ *   We use an async subscription, because a sync one would block the table while
+ *   the copy is in progress. But with async, we have to lock the table after the initial sync.
 * - Now that the initial copy has started, remember that (at least in RAM) to retry
 *   from this point if the network fails.
 * - Sleep & check in connection to the dest waiting for completion of the
@@ -145,9 +194,150 @@ create_hash_partitions(Cmd *cmd)
 * If we don't save progress (whether initial sync started or done, lsn,
 * etc), we have to start everything from scratch if the master reboots. This
 * is arguably fine.
+ *
 */
void
move_mpart(Cmd *cmd)
+ {
+     char *part_name = cmd->opts[0];
+     int32 dst_node = atoi(cmd->opts[1]);
+
+     MoveMPartState *mmps = palloc(sizeof(MoveMPartState));
+     init_mmp_state(mmps, part_name, dst_node);
+
+     move_mparts(mmps, 1);
+     update_cmd_status(cmd->id, "success");
+ }
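To make the steps listed in the comment above move_mpart() concrete, here is a rough sketch (outside the diff, not part of this commit) of the kind of idempotent SQL the source and destination nodes would run for one partition. The object names (pub_part_0, slot_part_0, sub_part_0, part_0), the exec_or_warn() helper and the empty table shell are invented for the example and are not shardman's actual commands.

/* Illustrative sketch only; names and helper are hypothetical. */
#include <stdio.h>
#include "libpq-fe.h"

static void
exec_or_warn(PGconn *conn, const char *sql)
{
    PGresult *res = PQexec(conn, sql);

    /* "Idempotent" here simply means tolerating "already exists" errors. */
    if (PQresultStatus(res) != PGRES_COMMAND_OK)
        fprintf(stderr, "%s: %s", sql, PQerrorMessage(conn));
    PQclear(res);
}

static void
start_initial_copy_sketch(PGconn *src, PGconn *dst, const char *src_connstr)
{
    char create_sub[1024];

    /* Destination first: a subscription left over from a previous attempt
     * must be disabled, or the replication slot on source can't be dropped. */
    exec_or_warn(dst, "ALTER SUBSCRIPTION sub_part_0 DISABLE;");

    /* Source: publication for the partition and a logical replication slot. */
    exec_or_warn(src, "CREATE PUBLICATION pub_part_0 FOR TABLE part_0;");
    exec_or_warn(src,
        "SELECT pg_create_logical_replication_slot('slot_part_0', 'pgoutput');");

    /* Destination: table (schema must match the source; elided here) and a
     * subscription that reuses the already created slot. */
    exec_or_warn(dst, "CREATE TABLE IF NOT EXISTS part_0 ();");
    snprintf(create_sub, sizeof(create_sub),
             "CREATE SUBSCRIPTION sub_part_0 CONNECTION '%s' "
             "PUBLICATION pub_part_0 "
             "WITH (create_slot = false, slot_name = 'slot_part_0');",
             src_connstr);
    exec_or_warn(dst, create_sub);
}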
+
+
+ /*
+  * Fill MoveMPartState, retrieving the needed data. If something goes wrong,
+  * we don't bother to fill the rest of the fields.
+  */
+ void
+ init_mmp_state(MoveMPartState *mmps, const char *part_name, int32 dst_node)
+ {
+     int e;
+
+     mmps->part_name = part_name;
+     if ((mmps->src_node = get_partition_owner(part_name)) == -1)
+     {
+         shmn_elog(WARNING, "Partition %s doesn't exist, not moving it",
+                   part_name);
+         mmps->result = MOVEMPART_FAILED;
+         return;
+     }
+     mmps->dst_node = dst_node;
+
+     /* src_connstr is surely not NULL, since src_node is referenced by
+        part_name */
+     mmps->src_connstr = get_worker_node_connstr(mmps->src_node);
+     mmps->dst_connstr = get_worker_node_connstr(mmps->dst_node);
+     if (mmps->dst_connstr == NULL)
+     {
+         shmn_elog(WARNING, "Node %d doesn't exist, not moving %s to it",
+                   mmps->dst_node, part_name);
+         mmps->result = MOVEMPART_FAILED;
+         return;
+     }
+
+     /* The task is ready to be processed right now */
+     if ((e = clock_gettime(CLOCK_MONOTONIC, &mmps->waketm)) == -1)
+     {
+         shmn_elog(FATAL, "clock_gettime failed, %s", strerror(errno));
+     }
+     mmps->fd_in_epoll_set = -1;
+
+     mmps->result = MOVEMPART_IN_PROGRESS;
+ }
+
+ /* Move partitions as specified in the mmpss array of states */
+ void
+ move_mparts(MoveMPartState *mmpss, int nparts)
+ {
+     /* list of sleeping mmp states we need to wake up after the specified timeout */
+     slist_head timeout_states = SLIST_STATIC_INIT(timeout_states);
+     slist_iter iter;
+
+     int timeout;              /* at least one task will be ready after timeout millis */
+     int unfinished_moves = 0; /* number of not yet failed or succeeded tasks */
+     int i;
+     int e;
+     int epfd;
+
+     for (i = 0; i < nparts; i++)
+     {
+         if (mmpss[i].result != MOVEMPART_FAILED)
+         {
+             /* In the beginning, all tasks are ready immediately */
+             MoveMPartStateNode *mmps_node = palloc(sizeof(MoveMPartStateNode));
+             elog(DEBUG4, "Adding task %s to timeout list", mmpss[i].part_name);
+             mmps_node->mmps = &mmpss[i];
+             slist_push_head(&timeout_states, &mmps_node->list_node);
+             unfinished_moves++;
+         }
+     }
+
+     if ((epfd = epoll_create1(0)) == -1)
+     {
+         shmn_elog(FATAL, "epoll_create1 failed");
+     }
+
+     while (unfinished_moves > 0)
+     {
+         timeout = calc_timeout(&timeout_states);
+         unfinished_moves--;
+     }
+ }
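The while loop above is still a stub (it only decrements the counter). Purely as an illustration of how the pieces are meant to fit together, the sketch below shows one way the epoll descriptor, calc_timeout() and the ExecMoveMPartRes contract could interact. It assumes the types and functions declared in this diff plus <errno.h>; the dispatch details are guesses, not the committed implementation, and run_task_once() is a hypothetical helper.

/* Hypothetical helper, not in this commit: run one task and act on its answer.
 * Returns 1 if the task finished (successfully or not), 0 otherwise. */
static int
run_task_once(int epfd, MoveMPartState *mmps)
{
    switch (exec_move_mpart(mmps))
    {
        case EXECMOVEMPART_EPOLL:
            /* The task waits for data from dst: watch its socket for EPOLLIN. */
            if (mmps->fd_to_epoll != mmps->fd_in_epoll_set)
            {
                struct epoll_event ev;

                ev.events = EPOLLIN;
                ev.data.ptr = mmps;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, mmps->fd_to_epoll, &ev) == -1)
                    shmn_elog(FATAL, "epoll_ctl failed, %s", strerror(errno));
                mmps->fd_in_epoll_set = mmps->fd_to_epoll;
            }
            return 0;

        case EXECMOVEMPART_WAKEMEUP:
            /* Nothing to watch; calc_timeout() will honour mmps->waketm. */
            return 0;

        case EXECMOVEMPART_DONE:
        default:
            return 1;
    }
}

/* The stub loop could then become, roughly: */
while (unfinished_moves > 0)
{
    struct epoll_event evs[16];
    int timeout = calc_timeout(&timeout_states);
    int nfds = epoll_wait(epfd, evs, 16, timeout);
    int i;

    if (nfds == -1 && errno != EINTR)
        shmn_elog(FATAL, "epoll_wait failed, %s", strerror(errno));
    for (i = 0; i < nfds; i++)
        unfinished_moves -= run_task_once(epfd, (MoveMPartState *) evs[i].data.ptr);
    /* ...tasks whose waketm has expired would be run here as well. */
}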
+
+ /* Calculate when we need to wake up if no epoll events happen */
+ int
+ calc_timeout(slist_head *timeout_states)
+ {
+     slist_iter iter;
+     struct timespec curtm;
+     int e;
+     int timeout = -1;         /* if no tasks wait for us, don't wake up at all */
+
+     slist_foreach(iter, timeout_states)
+     {
+         MoveMPartStateNode *mmps_node =
+             slist_container(MoveMPartStateNode, list_node, iter.cur);
+         MoveMPartState *mmps = mmps_node->mmps;
+
+         shmn_elog(DEBUG1, "Peeking into task %s wake time", mmps->part_name);
+         if ((e = clock_gettime(CLOCK_MONOTONIC, &curtm)) == -1)
+         {
+             shmn_elog(FATAL, "clock_gettime failed, %s", strerror(errno));
+         }
+         if (timespeccmp(curtm, mmps->waketm) >= 0)
+         {
+             shmn_elog(DEBUG1, "Task %s is already ready", mmps->part_name);
+             timeout = 0;
+             return timeout;
+         }
+         else
+         {
+             int diff = Max(0, timespec_diff_millis(mmps->waketm, curtm));
+
+             if (timeout == -1)
+                 timeout = diff;
+             else
+                 timeout = Min(timeout, diff);
+             shmn_elog(DEBUG1, "Timeout set to %d due to task %s",
+                       timeout, mmps->part_name);
+         }
+     }
+
+     return timeout;
+ }
+
+ /*
+  * Actually run MoveMPart state machine. Return value says when (if ever)
+  * we want to be executed again.
+  */
+ ExecMoveMPartRes
+ exec_move_mpart(MoveMPartState *mmps)
{

}
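The body of exec_move_mpart() is left empty in this commit. The sketch below is meant only to illustrate the ExecMoveMPartRes contract with a non-blocking libpq check against the destination; the connection is passed in explicitly because the committed MoveMPartState has no field for it, and the query it waits for and the 5-second retry delay are invented for the example.

/* Hypothetical sketch of one state-machine step; not the committed logic. */
static ExecMoveMPartRes
exec_move_mpart_sketch(MoveMPartState *mmps, PGconn *dst_conn)
{
    PGresult *res;

    /* Assume a query that completes once the initial sync is done was issued
     * earlier on dst_conn with PQsendQuery(). */
    if (!PQconsumeInput(dst_conn))
    {
        /* Connection trouble: ask to be woken up later and retry. */
        clock_gettime(CLOCK_MONOTONIC, &mmps->waketm);
        mmps->waketm.tv_sec += 5;
        return EXECMOVEMPART_WAKEMEUP;
    }

    if (PQisBusy(dst_conn))
    {
        /* Still waiting for the destination: epoll our socket for input. */
        mmps->fd_to_epoll = PQsocket(dst_conn);
        return EXECMOVEMPART_EPOLL;
    }

    /* The answer has arrived: drain the results and finish. */
    while ((res = PQgetResult(dst_conn)) != NULL)
        PQclear(res);
    mmps->result = MOVEMPART_SUCCESS;
    return EXECMOVEMPART_DONE;
}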