Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 4993bce

Browse files
knizhnikkelvich
authored and committed
Force reconnection of arbiter after recovery completion
1 parent 4ef69a3 commit 4993bce

File tree

2 files changed

+21
-14
lines changed

2 files changed

+21
-14
lines changed

arbiter.c

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -350,14 +350,19 @@ static void MtmSendHeartbeat()
350350

351351
for (i = 0; i < Mtm->nAllNodes; i++)
352352
{
353-
if (i+1 != MtmNodeId && !BIT_CHECK(busy_mask, i)
354-
&& (Mtm->status != MTM_ONLINE
355-
|| (sockets[i] >= 0 && !BIT_CHECK(Mtm->disabledNodeMask, i) && !BIT_CHECK(Mtm->reconnectMask, i))))
356-
{
357-
if (!MtmSendToNode(i, &msg, sizeof(msg))) {
358-
elog(LOG, "Arbiter failed to send heartbeat to node %d", i+1);
359-
} else {
360-
MTM_LOG2("Send heartbeat to node %d with timestamp %ld", i+1, now);
353+
if (i+1 != MtmNodeId) {
354+
if (!BIT_CHECK(busy_mask, i)
355+
&& (Mtm->status != MTM_ONLINE
356+
|| (sockets[i] >= 0 && !BIT_CHECK(Mtm->disabledNodeMask, i))
357+
|| BIT_CHECK(Mtm->reconnectMask, i)))
358+
{
359+
if (!MtmSendToNode(i, &msg, sizeof(msg))) {
360+
elog(LOG, "Arbiter failed to send heartbeat to node %d", i+1);
361+
} else {
362+
MTM_LOG2("Send heartbeat to node %d with timestamp %ld", i+1, now);
363+
}
364+
} else {
365+
MTM_LOG1("Do not send hearbeat to node %d, busy mask %ld, status %d", i+1, busy_mask, Mtm->status);
361366
}
362367
}
363368
}

multimaster.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,7 @@ void MtmRecoveryCompleted(void)
13481348
MtmLock(LW_EXCLUSIVE);
13491349
Mtm->recoverySlot = 0;
13501350
BIT_CLEAR(Mtm->disabledNodeMask, MtmNodeId-1);
1351+
Mtm->reconnectMask |= Mtm->connectivityMask; /* try to reestablish all connections */
13511352
Mtm->nodes[MtmNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
13521353
for (i = 0; i < Mtm->nAllNodes; i++) {
13531354
Mtm->nodes[i].lastHeartbeat = 0; /* defuse watchdog until first heartbeat is received */
@@ -1468,6 +1469,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
14681469
void MtmSwitchClusterMode(MtmNodeStatus mode)
14691470
{
14701471
Mtm->status = mode;
1472+
Mtm->nodes[MtmNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
14711473
MTM_LOG1("Switch to %s mode", MtmNodeStatusMnem[mode]);
14721474
/* ??? Something else to do here? */
14731475
}
@@ -1602,11 +1604,10 @@ bool MtmRefreshClusterStatus(bool nowait, int testNodeId)
16021604
if (disabled) {
16031605
timestamp_t now = MtmGetSystemTime();
16041606
for (i = 0, mask = disabled; mask != 0; i++, mask >>= 1) {
1605-
if (i+1 != MtmNodeId
1606-
&& (mask & 1) != 0
1607-
&& Mtm->nodes[i].lastStatusChangeTime + MSEC_TO_USEC(MtmNodeDisableDelay) < now)
1608-
{
1609-
MtmDisableNode(i+1);
1607+
if (mask & 1) {
1608+
if (Mtm->nodes[i].lastStatusChangeTime + MSEC_TO_USEC(MtmNodeDisableDelay) < now) {
1609+
MtmDisableNode(i+1);
1610+
}
16101611
}
16111612
}
16121613
}
@@ -1681,6 +1682,7 @@ void MtmOnNodeDisconnect(int nodeId)
16811682
MtmLock(LW_EXCLUSIVE);
16821683
BIT_SET(Mtm->connectivityMask, nodeId-1);
16831684
BIT_SET(Mtm->reconnectMask, nodeId-1);
1685+
MTM_LOG1("Disconnect node %d connectivity mask %lx", nodeId, Mtm->connectivityMask);
16841686
MtmUnlock();
16851687

16861688
if (!RaftableSet(psprintf("node-mask-%d", MtmNodeId), &Mtm->connectivityMask, sizeof Mtm->connectivityMask, false))
@@ -1725,7 +1727,7 @@ void MtmOnNodeConnect(int nodeId)
17251727
BIT_CLEAR(Mtm->reconnectMask, nodeId-1);
17261728
MtmUnlock();
17271729

1728-
MTM_LOG1("Reconnect node %d", nodeId);
1730+
MTM_LOG1("Reconnect node %d, connectivityMask=%lx", nodeId, Mtm->connectivityMask);
17291731
RaftableSet(psprintf("node-mask-%d", MtmNodeId), &Mtm->connectivityMask, sizeof Mtm->connectivityMask, false);
17301732
}
17311733

0 commit comments

Comments
 (0)