Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c857a98

Browse files
committed
Send heartbeat to disabled node
1 parent 8b327d3 commit c857a98

File tree

3 files changed

+16
-7
lines changed

3 files changed

+16
-7
lines changed

contrib/mmts/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
FROM kelvich/postgres_cluster
2-
2+
# RUN sysctl -w kernel.core_pattern=core
33
RUN cd /pg/src/contrib/raftable && make clean && make install
44

55
RUN mkdir /pg/mmts

contrib/mmts/arbiter.c

+13-4
Original file line numberDiff line numberDiff line change
@@ -318,10 +318,17 @@ static void MtmCheckResponse(MtmArbiterMessage* resp)
318318
&& Mtm->status != MTM_RECOVERY
319319
&& Mtm->nodes[MtmNodeId-1].lastStatusChangeTime + MSEC_TO_USEC(MtmNodeDisableDelay) < MtmGetSystemTime())
320320
{
321-
elog(WARNING, "Node %d thinks that I was dead, while I am %s (message %s)", resp->node, MtmNodeStatusMnem[Mtm->status], messageKindText[resp->code]);
321+
elog(WARNING, "Node %d thinks that I am dead, while I am %s (message %s)", resp->node, MtmNodeStatusMnem[Mtm->status], messageKindText[resp->code]);
322322
BIT_SET(Mtm->disabledNodeMask, MtmNodeId-1);
323323
MtmSwitchClusterMode(MTM_RECOVERY);
324-
}
324+
} else if (BIT_CHECK(Mtm->disabledNodeMask, resp->node-1) && sockets[resp->node-1] < 0) {
325+
/* We received a heartbeat from a disabled node to which we have no open socket.
326+
* It looks like the node has been restarted.
327+
* Try to reconnect to it.
328+
*/
329+
elog(WARNING, "Receive heartbeat from disabled node %d", resp->node);
330+
BIT_SET(Mtm->reconnectMask, resp->node-1);
331+
}
325332
}
326333

327334
static void MtmScheduleHeartbeat()
@@ -355,7 +362,8 @@ static void MtmSendHeartbeat()
355362
if (i+1 != MtmNodeId) {
356363
if (!BIT_CHECK(busy_mask, i)
357364
&& (Mtm->status != MTM_ONLINE
358-
|| (sockets[i] >= 0 && !BIT_CHECK(Mtm->disabledNodeMask, i))
365+
|| sockets[i] >= 0
366+
|| !BIT_CHECK(Mtm->disabledNodeMask, i)
359367
|| BIT_CHECK(Mtm->reconnectMask, i)))
360368
{
361369
if (!MtmSendToNode(i, &msg, sizeof(msg))) {
@@ -885,6 +893,8 @@ static void MtmReceiver(Datum arg)
885893
Mtm->nodes[node-1].connectivityMask = msg->connectivityMask;
886894
Mtm->nodes[node-1].lastHeartbeat = MtmGetSystemTime();
887895

896+
MtmCheckResponse(msg);
897+
888898
switch (msg->code) {
889899
case MSG_HEARTBEAT:
890900
MTM_LOG2("Receive HEARTBEAT from node %d with timestamp %ld delay %ld",
@@ -964,7 +974,6 @@ static void MtmReceiver(Datum arg)
964974
messageKindText[msg->code], ts->xid, ts->gid, node);
965975
continue;
966976
}
967-
MtmCheckResponse(msg);
968977
BIT_SET(ts->votedMask, node-1);
969978

970979
if (MtmIsCoordinator(ts)) {

contrib/mmts/multimaster.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -3157,8 +3157,8 @@ MtmReplicationRowFilterHook(struct PGLogicalRowFilterArgs* args)
31573157
}
31583158

31593159
/*
3160-
* Filter received transacyions at destination side.
3161-
* This function is executed by receiver, so there are no race conditions and it is possible to update nodes[i].restaetLSN without lock
3160+
* Filter received transactions at destination side.
3161+
* This function is executed by receiver, so there are no race conditions and it is possible to update nodes[i].restartLSN without lock
31623162
*/
31633163
bool MtmFilterTransaction(char* record, int size)
31643164
{

0 commit comments

Comments
 (0)