Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 734b737

Browse files
knizhnikkelvich
authored and committed
Detect zombies
1 parent 92927df commit 734b737

File tree

2 files changed

+18
-5
lines changed

2 files changed

+18
-5
lines changed

arbiter.c

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -694,7 +694,10 @@ static void MtmTransReceiver(Datum arg)
694694
MtmTransState* ts = (MtmTransState*)hash_search(MtmXid2State, &msg->dxid, HASH_FIND, NULL);
695695
Assert(ts != NULL);
696696
Assert(msg->node > 0 && msg->node <= nNodes && msg->node != MtmNodeId);
697-
697+
698+
if (BIT_CHECK(msg->disabledNodeMask, MtmNodeId-1) && Mtm->status != MTM_RECOVERY) {
699+
elog(PANIC, "Node %d thinks that I was dead: perform hara-kiri not to be a zombie", msg->node);
700+
}
698701
Mtm->nodes[msg->node-1].oldestSnapshot = msg->oldestSnapshot;
699702

700703
if (MtmIsCoordinator(ts)) {

multimaster.c

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -139,6 +139,8 @@ HTAB* MtmXid2State;
139139
static HTAB* MtmGid2State;
140140
static HTAB* MtmLocalTables;
141141

142+
static bool MtmIsRecoverySession;
143+
142144
static MtmCurrentTrans MtmTx;
143145

144146
static TransactionManager MtmTM = {
@@ -1022,7 +1024,15 @@ static int64 MtmGetSlotLag(int nodeId)
10221024
*/
10231025
bool MtmIsRecoveredNode(int nodeId)
10241026
{
1025-
return BIT_CHECK(Mtm->disabledNodeMask, nodeId-1);
1027+
if (BIT_CHECK(Mtm->disabledNodeMask, nodeId-1)) {
1028+
if (!MtmIsRecoverySession) {
1029+
elog(ERROR, "Node %d is marked as disabled but is not in recovery mode", nodeId);
1030+
}
1031+
return true;
1032+
} else {
1033+
MtmIsRecoverySession = false; /* recovery is completed */
1034+
return false;
1035+
}
10261036
}
10271037

10281038

@@ -1871,17 +1881,17 @@ static void
18711881
MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
18721882
{
18731883
ListCell *param;
1874-
bool isRecoverySession = false;
1884+
MtmIsRecoverySession = false;
18751885
foreach(param, args->in_params)
18761886
{
18771887
DefElem *elem = lfirst(param);
18781888
if (strcmp("mtm_replication_mode", elem->defname) == 0) {
1879-
isRecoverySession = elem->arg != NULL && strVal(elem->arg) != NULL && strcmp(strVal(elem->arg), "recovery") == 0;
1889+
MtmIsRecoverySession = elem->arg != NULL && strVal(elem->arg) != NULL && strcmp(strVal(elem->arg), "recovery") == 0;
18801890
break;
18811891
}
18821892
}
18831893
MtmLock(LW_EXCLUSIVE);
1884-
if (isRecoverySession) {
1894+
if (MtmIsRecoverySession) {
18851895
elog(WARNING, "%d: Node %d start recovery of node %d", MyProcPid, MtmNodeId, MtmReplicationNodeId);
18861896
if (!BIT_CHECK(Mtm->disabledNodeMask, MtmReplicationNodeId-1)) {
18871897
BIT_SET(Mtm->disabledNodeMask, MtmReplicationNodeId-1);

0 commit comments

Comments
 (0)