Commit b1767bb

dmq: add sender connection counters.
Counters allow us to notice a send connection reset, which could have dropped a request -- and thus we should stop waiting for the response. However, they don't save us from potential deadlocks: if a loop of nodes fills all queues in one direction with requests (the reverse direction must constantly blink for that to happen), nobody will be able to send replies. So eventually we should have 'connection lost in one direction, close it in the other' logic. But such deadlocks seem hardly possible in practice, so not today.

gather() is adapted to the new API, but nobody uses it yet. It no longer relies on whether the node itself and the others are enabled/disabled.

This also adds a separate DMQ_N_MASK_POS macro, as we can actually have a small number of DMQ_MAX_DESTINATIONS together with large mask positions.
1 parent 8b1be93 commit b1767bb
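
Since nothing in the tree calls the counter-aware gather() yet, here is a minimal sketch of the intended caller pattern, pieced together from the new gather() comment and the dmq API below; the stream name and surrounding variables are hypothetical:

    /* hypothetical caller: snapshot send-conn counters before pushing requests */
    MtmMessage *messages[DMQ_N_MASK_POS];
    int         sconn_cnt[DMQ_N_MASK_POS];
    int         n_messages;

    dmq_stream_subscribe("some-stream");            /* stream name is illustrative */
    dmq_get_sendconn_cnt(participants, sconn_cnt);  /* remember counters first */
    /* ... dmq_push() the requests to everyone in participants ... */

    /*
     * Passing the snapshot lets gather() stop waiting for a peer whose send
     * connection was reset in the meantime (its request may have been dropped).
     */
    gather(participants, messages, &n_messages, sconn_cnt);
    dmq_stream_unsubscribe("some-stream");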

File tree

6 files changed: +143 -33 lines

src/commit.c

+28-27
@@ -334,7 +334,7 @@ MtmTwoPhaseCommit()
 	AllowTempIn2PC = true;
 	CommitTransactionCommand();
 	gtx->state.status = GTXPrepared;
-	gather(participants, (MtmMessage **) messages, &n_messages, false);
+	gather(participants, (MtmMessage **) messages, &n_messages, NULL);
 	dmq_stream_unsubscribe(stream);
 
 	/* check prepare responses */
@@ -381,7 +381,7 @@ MtmTwoPhaseCommit()
 	gtx->state.status = GTXPreCommitted;
 	gtx->state.accepted = (GlobalTxTerm) {1, 0};
 	mtm_log(MtmTxFinish, "TXFINISH: %s precommitted", gid);
-	gather(participants, (MtmMessage **) messages, &n_messages, false);
+	gather(participants, (MtmMessage **) messages, &n_messages, NULL);
 
 	/* check ballots in answers */
 	for (i = 0; i < n_messages; i++)
@@ -442,8 +442,17 @@ MtmTwoPhaseCommit()
 	return true;
 }
 
+/*
+ * For each counterparty in participants, either receives and pushes to
+ * messages a msg from it or waits until recv connection to it dies.
+ * If sendconn_cnt is not NULL, it must contain sender conn counters from
+ * dmq_get_sendconn_cnt. In this case, we stop waiting for counterparty when
+ * notice sender connection reset.
+ * sendconn_cnt must be passed whenever you've sent request via dmq and
+ * expect reply to it, otherwise you risk hanging here infinitely.
+ */
 void
-gather(uint64 participants, MtmMessage **messages, int *msg_count, bool ignore_disabled)
+gather(uint64 participants, MtmMessage **messages, int *msg_count, int *sendconn_cnt)
 {
 	*msg_count = 0;
 	while (participants != 0)
@@ -467,29 +476,12 @@ gather(uint64 participants, MtmMessage **messages, int *msg_count, bool ignore_d
 			mtm_log(MtmTxTrace,
 					"gather: got message from node%d", sender_mask_pos + 1);
 		}
-		else
+		else if (sender_mask_pos != -1)
 		{
-			/*
-			 * If queue is detached then the neignbour node is probably
-			 * disconnected. Let's wait when it became disabled as we can
-			 * became offline by this time.
-			 */
-			int i;
-			nodemask_t enabled = MtmGetEnabledNodeMask(ignore_disabled);
-
-			for (i = 0; i < MTM_MAX_NODES; i++)
-			{
-				if (BIT_CHECK(participants, i) && !BIT_CHECK(enabled, i))
-				{
-					BIT_CLEAR(participants, i);
-					mtm_log(MtmTxTrace,
-							"GatherPrecommit: dropping node%d from tx participants",
-							i + 1);
-				}
-			}
+			/* dead recv conn */
+			BIT_CLEAR(participants, sender_mask_pos);
 		}
-
-		if (wait)
+		else /* WOULDBLOCK */
 		{
 			/* XXX cache that */
 			rc = WaitLatch(MyLatch, WL_LATCH_SET | WL_TIMEOUT, 100.0,
@@ -498,9 +490,18 @@ gather(uint64 participants, MtmMessage **messages, int *msg_count, bool ignore_d
 			if (rc & WL_POSTMASTER_DEATH)
 				proc_exit(1);
 
+			/* XXX tell the caller about latch reset */
 			if (rc & WL_LATCH_SET)
 				ResetLatch(MyLatch);
+
+			/* waited a bit fruitlessly -- probably sender conn died? */
+			if (rc & WL_TIMEOUT)
+			{
+				participants = dmq_purge_failed_participants(participants,
+															 sendconn_cnt);
+			}
 		}
+
 	}
 }
 
@@ -542,7 +543,7 @@ MtmExplicitPrepare(char *gid)
 
 	mtm_log(MtmTxFinish, "TXFINISH: %s prepared", gid);
 
-	gather(participants, (MtmMessage **) messages, &n_messages, false);
+	gather(participants, (MtmMessage **) messages, &n_messages, NULL);
 	dmq_stream_unsubscribe(stream);
 
 	for (i = 0; i < n_messages; i++)
@@ -593,13 +594,13 @@ MtmExplicitFinishPrepared(bool isTopLevel, char *gid, bool isCommit)
 
 	SetPreparedTransactionState(gid, MULTIMASTER_PRECOMMITTED, false);
 	mtm_log(MtmTxFinish, "TXFINISH: %s precommitted", gid);
-	gather(participants, (MtmMessage **) messages, &n_messages, false);
+	gather(participants, (MtmMessage **) messages, &n_messages, NULL);
 
 	FinishPreparedTransaction(gid, true, false);
 
 	/* XXX: make this conditional */
 	mtm_log(MtmTxFinish, "TXFINISH: %s committed", gid);
-	gather(participants, (MtmMessage **) messages, &n_messages, false);
+	gather(participants, (MtmMessage **) messages, &n_messages, NULL);
 
 	dmq_stream_unsubscribe(gid);
 }

src/dmq.c

+107-2
@@ -51,7 +51,8 @@
 #define DMQ_MQ_MAGIC 0x646d71
 
 /* XXX: move to common */
-#define BIT_CHECK(mask, bit) (((mask) & ((int64)1 << (bit))) != 0)
+#define BIT_CLEAR(mask, bit) ((mask) &= ~((uint64)1 << (bit)))
+#define BIT_CHECK(mask, bit) (((mask) & ((uint64)1 << (bit))) != 0)
 
 /*
  * Shared data structures to hold current connections topology.
@@ -83,6 +84,7 @@ typedef struct
 	PGconn	   *pgconn;
 	DmqConnState state;
 	int			pos;
+	int8		mask_pos;
 } DmqDestination;
 
 typedef struct
@@ -101,6 +103,43 @@ struct DmqSharedState
 	pid_t		sender_pid;
 	dsm_handle	out_dsm;
 	DmqDestination destinations[DMQ_MAX_DESTINATIONS];
+	/*
+	 * Stores counters incremented on each reconnect to destination, indexed
+	 * by receiver mask_pos. This allows to detect conn failures to avoid
+	 * infinite waiting for response when request could have been dropped,
+	 * c.f. dmq_fill_sconn_cnt and dmq_purge_failed_participants.
+	 *
+	 * XXX This mechanism is unreliable and ugly.
+	 * Unreliable, because though it saves from infinite waiting for reply, it
+	 * doesn't save from potential deadlocks. Deadlocks may arise whenever a
+	 * loop of nodes makes request-response sequences because request A->B and
+	 * A's response to B's request go via the same TCP channel; thus, if all
+	 * queues of loop in one direction are filled with requests, nobody will
+	 * be able to answer.
+	 *
+	 * We could control the situation by ensuring 1) all possible requests
+	 * node sends at time could fit in the output buffers 2) node never
+	 * repeats the request until the previous one is delivered or dropped.
+	 * However, to honor the 2), we must terminate send connection whenever
+	 * receive conn failed (and thus we gonna to retry the requests) to flush
+	 * previous possible undelivered requests, which we don't do currently.
+	 *
+	 * Ultimate non-deadlockable solution without such hacks would be to
+	 * divide the job between different channels: A sends its requests to B
+	 * and recevies responses from it via one TCP channel, and B sends its
+	 * requests to A and receives responses via another one. Probably this is
+	 * not worthwhile though as it would make dmq more complicated and
+	 * increase number of shm_mqs.
+	 *
+	 * Besides, the counters are ugly because they require the external code
+	 * to remember sender counters before request and check them while
+	 * waiting for reply; moreover, this checking must be based on timouts
+	 * as nobody would wake the clients on send conn failures.
+	 *
+	 * No locks are used because we don't care much about correct/up-to-date
+	 * reads, though well aligned ints are atomic anyway.
	 */
+	volatile int sconn_cnt[DMQ_MAX_DESTINATIONS];
 
 	/* receivers stuff */
 	int			n_receivers;
@@ -115,6 +154,9 @@ struct DmqSharedState
 
 } *dmq_state;
 
+/* special value for sconn_cnt[] meaning the connection is dead */
+#define DMQSCONN_DEAD 0
+
 static HTAB *dmq_subscriptions;
 
 /* Backend-local i/o queues. */
@@ -332,6 +374,11 @@ dmq_sender_main(Datum main_arg)
 	WaitEventSet *set;
 	DmqDestination conns[DMQ_MAX_DESTINATIONS];
 	int			heartbeat_send_timeout = DatumGetInt32(main_arg);
+	/*
+	 * Seconds dmq_state->sconn_cnt to save the counter value when
+	 * conn is dead.
+	 */
+	int			sconn_cnt[DMQ_MAX_DESTINATIONS];
 
 	double		prev_timer_at = dmq_now();
 
@@ -403,6 +450,8 @@ dmq_sender_main(Datum main_arg)
 				conns[i] = *dest;
 				Assert(conns[i].pgconn == NULL);
 				conns[i].state = Idle;
+				sconn_cnt[dest->mask_pos] = 0;
+				dmq_state->sconn_cnt[dest->mask_pos] = DMQSCONN_DEAD;
 				prev_timer_at = 0;	/* do not wait for timer event */
 			}
 			/* close connection to deleted destination */
@@ -443,6 +492,7 @@
 				if (ret < 0)
 				{
 					conns[conn_id].state = Idle;
+					dmq_state->sconn_cnt[conns[conn_id].mask_pos] = DMQSCONN_DEAD;
 
 					mtm_log(DmqStateFinal,
 							"[DMQ] failed to send message to %s: %s",
@@ -561,6 +611,7 @@
 				if (ret < 0)
 				{
 					conns[conn_id].state = Idle;
+					dmq_state->sconn_cnt[conns[conn_id].mask_pos] = DMQSCONN_DEAD;
 
 					mtm_log(DmqStateFinal,
 							"[DMQ] failed to send heartbeat to %s: %s",
@@ -684,9 +735,13 @@
 				}
 				if (!PQisBusy(conns[conn_id].pgconn))
 				{
+					int8		mask_pos = conns[conn_id].mask_pos;
+
 					conns[conn_id].state = Active;
 					DeleteWaitEvent(set, event.pos);
 					PQsetnonblocking(conns[conn_id].pgconn, 1);
+					sconn_cnt[mask_pos]++;
+					dmq_state->sconn_cnt[mask_pos] = sconn_cnt[mask_pos];
 
 					mtm_log(DmqStateFinal,
 							"[DMQ] Connected to %s",
@@ -702,6 +757,7 @@
 				if (!PQconsumeInput(conns[conn_id].pgconn))
 				{
 					conns[conn_id].state = Idle;
+					dmq_state->sconn_cnt[conns[conn_id].mask_pos] = DMQSCONN_DEAD;
 
 					mtm_log(DmqStateFinal,
 							"[DMQ] connection error with %s: %s",
@@ -1444,6 +1500,9 @@ dmq_detach_receiver(char *sender_name)
 	dmq_local.inhandles[handle_id].name[0] = '\0';
 }
 
+/*
+ * Subscribes caller to msgs from stream_name.
+ */
 void
 dmq_stream_subscribe(char *stream_name)
 {
@@ -1503,6 +1562,24 @@ dmq_stream_unsubscribe(char *stream_name)
 	Assert(found);
 }
 
+/*
+ * Fills (preallocated) sconn_cnt with current values of sender
+ * connection counters to guys in participants mask (as registered in
+ * dmq_destination_add), which allows to reveal connection failures, possibly
+ * resulted in losing request -- and thus stop hopeless waiting for response.
+ */
+void
+dmq_get_sendconn_cnt(uint64 participants, int *sconn_cnt)
+{
+	int			i;
+
+	for (i = 0; i < DMQ_N_MASK_POS; i++)
+	{
+		if (BIT_CHECK(participants, i))
+			sconn_cnt[i] = dmq_state->sconn_cnt[i];
+	}
+}
+
 bool
 dmq_pop(int8 *sender_mask_pos, StringInfo msg, uint64 mask)
 {
@@ -1639,9 +1716,36 @@ dmq_pop_nb(int8 *sender_mask_pos, StringInfo msg, uint64 mask, bool *wait)
 	return false;
 }
 
+/*
+ * Accepts bitmask of participants and sconn_cnt counters with send
+ * connections counters as passed to (and the latter filled by)
+ * dmq_fill_sconn_cnt, returns this mask after unsetting bits for those
+ * counterparties with whom we've lost the send connection since.
+ */
+uint64
+dmq_purge_failed_participants(uint64 participants, int *sconn_cnt)
+{
+	int			i;
+	uint64		res = participants;
+
+	for (i = 0; i < DMQ_N_MASK_POS; i++)
+	{
+		if (BIT_CHECK(participants, i) &&
+			(sconn_cnt[i] == DMQSCONN_DEAD ||
+			 sconn_cnt[i] != dmq_state->sconn_cnt[i]))
+			BIT_CLEAR(res, i);
+	}
+	return res;
+}
+
+/*
+ * recv_mask_pos is short (< DMQ_N_MASK_POS) variant of
 * receiver_name, used to track connection failures -- it must match mask_pos
+ * in dmq_attach_receiver to work!
+ */
 DmqDestinationId
 dmq_destination_add(char *connstr, char *sender_name, char *receiver_name,
-					int recv_timeout)
+					int8 recv_mask_pos, int recv_timeout)
 {
 	DmqDestinationId dest_id;
 	pid_t		sender_pid;
@@ -1658,6 +1762,7 @@ dmq_destination_add(char *connstr, char *sender_name, char *receiver_name,
 			strncpy(dest->connstr, connstr, DMQ_CONNSTR_MAX_LEN);
 			dest->recv_timeout = recv_timeout;
 			dest->active = true;
+			dest->mask_pos = recv_mask_pos;
 			break;
 		}
 	}
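
The check in dmq_purge_failed_participants reads more easily with a concrete (made-up) counter history; the walkthrough below is illustrative commentary, not part of the commit:

    /*
     * Say dmq_get_sendconn_cnt() remembered sconn_cnt[i] == 3 just before the
     * request was pushed to peer i. While the caller waits:
     *
     *   remembered == DMQSCONN_DEAD           -> send conn was already down when
     *                                            the request was pushed;
     *   shared dmq_state->sconn_cnt[i] == 3   -> conn has stayed up, keep waiting;
     *   shared == DMQSCONN_DEAD (0)           -> conn died and was not re-established;
     *   shared == 4                           -> conn died and reconnected.
     *
     * In all but the 'stayed up' case the request may have been dropped, so
     * bit i is cleared from the returned mask.
     */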

src/include/commit.h

+1-1
@@ -28,6 +28,6 @@ extern void MtmXactCallback(XactEvent event, void *arg);
 extern bool MtmExplicitPrepare(char *gid);
 extern void MtmExplicitFinishPrepared(bool isTopLevel, char *gid, bool isCommit);
 
-extern void gather(uint64 participants, MtmMessage **messages, int *msg_count, bool ignore_mtm_disabled);
+extern void gather(uint64 participants, MtmMessage **messages, int *msg_count, int *sendconn_cnt);
 
 #endif

src/include/dmq.h

+5-1
@@ -10,8 +10,10 @@ typedef int8 DmqDestinationId;
 
 extern void dmq_init(int send_timeout);
 
+#define DMQ_N_MASK_POS 16	/* ought to be >= MTM_MAX_NODES */
 extern DmqDestinationId dmq_destination_add(char *connstr, char *sender_name,
-											char *receiver_name, int ping_period);
+											char *receiver_name, int8 recv_mask_pos,
+											int ping_period);
 extern void dmq_destination_drop(char *receiver_name);
 
 extern void dmq_attach_receiver(char *sender_name, int8 mask_pos);
@@ -22,8 +24,10 @@ extern void dmq_terminate_receiver(char *name);
 extern void dmq_stream_subscribe(char *stream_name);
 extern void dmq_stream_unsubscribe(char *stream_name);
 
+extern void dmq_get_sendconn_cnt(uint64 participants, int *sconn_cnt);
 extern bool dmq_pop(int8 *sender_mask_pos, StringInfo msg, uint64 mask);
 extern bool dmq_pop_nb(int8 *sender_mask_pos, StringInfo msg, uint64 mask, bool *wait);
+extern uint64 dmq_purge_failed_participants(uint64 participants, int *sconn_cnt);
 
 extern void dmq_push(DmqDestinationId dest_id, char *stream_name, char *msg);
 extern void dmq_push_buffer(DmqDestinationId dest_id, char *stream_name, const void *buffer, size_t len);
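
The new recv_mask_pos argument only works if it matches the mask_pos handed to dmq_attach_receiver for the same peer, as the comment added in src/dmq.c warns. A sketch of that pairing, modelled on the node_id - 1 convention used in src/state.c below; the variable names and the exact receive-side call site are illustrative, not taken from this commit:

    /* both sides of the pairing must use the same mask position (node_id - 1) */
    dest_id = dmq_destination_add(peer_connstr, my_dmq_name, peer_dmq_name,
                                  node_id - 1, MtmHeartbeatRecvTimeout);
    dmq_attach_receiver(peer_dmq_name, node_id - 1);	/* receive side, same slot */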

src/resolver.c

+1-1
@@ -204,7 +204,7 @@ scatter_status_requests(MtmConfig *mtm_cfg)
 	 * ars: we might collect here responses from previous resolving
 	 * sessions, which would fill acks with non MtmLastTermResponse messages.
 	 */
-	gather(connected, (MtmMessage **) acks, &n_acks, true);
+	gather(connected, (MtmMessage **) acks, &n_acks, NULL);
 
 	for (i = 0; i < n_acks; i++)
 	{

src/state.c

+1-1
@@ -1471,7 +1471,7 @@ start_node_workers(int node_id, MtmConfig *new_cfg, Datum arg)
 
 	/* Add dmq destination */
 	dest = dmq_destination_add(dmq_connstr, dmq_my_name, dmq_node_name,
-							   MtmHeartbeatRecvTimeout);
+							   node_id - 1, MtmHeartbeatRecvTimeout);
 
 	LWLockAcquire(Mtm->lock, LW_EXCLUSIVE);
 	Mtm->peers[node_id - 1].dmq_dest_id = dest;
