Implement internal heartbeat for multimaster

knizhnik · kelvich · commit de99b01e5c92 · 2017-11-13T00:17:01.000+03:00
diff --git a/arbiter.c b/arbiter.c
@@ -44,6 +44,7 @@
 #include "utils/array.h"
 #include "utils/builtins.h"
 #include "utils/memutils.h"
+#include "utils/timeout.h"
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
 #include "postmaster/autovacuum.h"
@@ -101,22 +102,23 @@ typedef struct
 
 static int*      sockets;
 static int       gateway;
+static volatile sig_atomic_t send_heartbeat; /* set by MtmSendHeartbeat (timeout handler), cleared by the sender loop */
 
 static void MtmTransSender(Datum arg);
 static void MtmTransReceiver(Datum arg);
 
-/*
- * static char const* const messageText[] = 
- * {
- *	"INVALID",
- *	"HANDSHAKE",
- *	"READY",
- *	"PREPARE",
- *	"PREPARED",
- *	"ABORTED",
- *	"STATUS"
- *};
- */
+/* Printable names of arbiter message codes, looked up as messageText[ts->cmd]; presumably must mirror MtmMessageCode order - verify against multimaster.h */
+static char const* const messageText[] = 
+{
+	"INVALID",
+	"HANDSHAKE",
+	"READY",
+	"PREPARE",
+	"PREPARED",
+	"ABORTED",
+	"STATUS",
+	"HEARTBEAT"
+};
 
 static BackgroundWorker MtmSender = {
 	"mtm-sender",
@@ -513,14 +515,19 @@ static void MtmAppendBuffer(MtmBuffer* txBuffer, TransactionId xid, int node, Mt
 		}
 		buf->used = 0;
 	}
-	MTM_LOG3("Send %s message CSN=%ld to node %d from node %d for global transaction %d/local transaction %d", 
-			 messageText[ts->cmd], ts->csn, node+1, MtmNodeId, ts->gtid.xid, ts->xid);
-
-	Assert(ts->cmd != MSG_INVALID);
-	buf->data[buf->used].code = ts->cmd;
 	buf->data[buf->used].dxid = xid;
-	buf->data[buf->used].sxid = ts->xid;
-	buf->data[buf->used].csn  = ts->csn;
+
+	if (ts != NULL) { 
+		MTM_LOG3("Send %s message CSN=%ld to node %d from node %d for global transaction %d/local transaction %d", 
+				 messageText[ts->cmd], ts->csn, node+1, MtmNodeId, ts->gtid.xid, ts->xid);
+		Assert(ts->cmd != MSG_INVALID);
+		buf->data[buf->used].code = ts->cmd;
+		buf->data[buf->used].sxid = ts->xid;
+		buf->data[buf->used].csn  = ts->csn;
+	} else { 
+		buf->data[buf->used].code = MSG_HEARTBEAT;
+		MTM_LOG3("Send HEARTBEAT message to node %d from node %d\n", node+1, MtmNodeId);
+	}
 	buf->data[buf->used].node = MtmNodeId;
 	buf->data[buf->used].disabledNodeMask = Mtm->disabledNodeMask;
 	buf->data[buf->used].oldestSnapshot = Mtm->nodes[MtmNodeId-1].oldestSnapshot;
@@ -533,15 +540,21 @@ static void MtmBroadcastMessage(MtmBuffer* txBuffer, MtmTransState* ts)
 	int n = 1;
 	for (i = 0; i < Mtm->nAllNodes; i++)
 	{
-		if (!BIT_CHECK(Mtm->disabledNodeMask, i) && TransactionIdIsValid(ts->xids[i])) { 
+		if (!BIT_CHECK(Mtm->disabledNodeMask, i) && (ts == NULL || TransactionIdIsValid(ts->xids[i]))) { 
 			Assert(i+1 != MtmNodeId);
-			MtmAppendBuffer(txBuffer, ts->xids[i], i, ts);
+			MtmAppendBuffer(txBuffer, ts ? ts->xids[i] : InvalidTransactionId, i, ts);
 			n += 1;
 		}
 	}
 	Assert(n == Mtm->nLiveNodes);
 }
 
+/* Timeout callback registered by MtmTransSender: request a heartbeat broadcast and wake the sender loop. */
+static void MtmSendHeartbeat(void)
+{
+	send_heartbeat = true; /* NOTE(review): no call arming USER_TIMEOUT is visible in this patch - confirm the timer is started */
+	PGSemaphoreUnlock(&Mtm->votingSemaphore); /* the sender loop blocks on this semaphore before checking the flag */
+}
 
 static void MtmTransSender(Datum arg)
 {
@@ -556,6 +569,8 @@ static void MtmTransSender(Datum arg)
 	sigfillset(&sset);
 	sigprocmask(SIG_UNBLOCK, &sset, NULL);
 
+	RegisterTimeout(USER_TIMEOUT, MtmSendHeartbeat);
+
 	MtmOpenConnections();
 
 	for (i = 0; i < nNodes; i++) { 
@@ -567,6 +582,10 @@ static void MtmTransSender(Datum arg)
 		PGSemaphoreLock(&Mtm->votingSemaphore);
 		CHECK_FOR_INTERRUPTS();
 
+		if (send_heartbeat) {
+			send_heartbeat = false;
+			MtmBroadcastMessage(txBuffer, NULL);
+		}			
 		/* 
 		 * Use shared lock to improve locality,
 		 * because all other process modifying this list are using exclusive lock 
@@ -700,15 +719,22 @@ static void MtmTransReceiver(Datum arg)
 
 				for (j = 0; j < nResponses; j++) { 
 					MtmArbiterMessage* msg = &rxBuffer[i].data[j];
-					MtmTransState* ts = (MtmTransState*)hash_search(MtmXid2State, &msg->dxid, HASH_FIND, NULL);
-					Assert(ts != NULL);
+					MtmTransState* ts;
+
 					Assert(msg->node > 0 && msg->node <= nNodes && msg->node != MtmNodeId);
+					Mtm->nodes[msg->node-1].oldestSnapshot = msg->oldestSnapshot;
+					Mtm->nodes[msg->node-1].lastHeartbeat = MtmGetSystemTime();
+
+					if (msg->code == MSG_HEARTBEAT) { 
+						continue;
+					}
+					ts = (MtmTransState*)hash_search(MtmXid2State, &msg->dxid, HASH_FIND, NULL);
+					Assert(ts != NULL);
 
 					if (BIT_CHECK(msg->disabledNodeMask, MtmNodeId-1) && Mtm->status != MTM_RECOVERY) { 
 						elog(PANIC, "Node %d thinks that I was dead: perform hara-kiri not to be a zombie", msg->node);
 					}
-					Mtm->nodes[msg->node-1].oldestSnapshot = msg->oldestSnapshot;
-
+					
 					if (MtmIsCoordinator(ts)) {
 						switch (msg->code) { 
 						  case MSG_READY:
@@ -768,7 +794,7 @@ static void MtmTransReceiver(Datum arg)
 					} else { 
 						switch (msg->code) { 
 						  case MSG_PREPARE:
-							Assert(ts->status == TRANSACTION_STATUS_IN_PROGRESS);									
+							Assert(ts->status == TRANSACTION_STATUS_IN_PROGRESS); 	
 							ts->status = TRANSACTION_STATUS_UNKNOWN;
 							ts->csn = MtmAssignCSN();
 							MtmAdjustSubtransactions(ts);
diff --git a/multimaster.c b/multimaster.c
@@ -191,6 +191,8 @@ int   MtmReconnectAttempts;
 int   MtmNodeDisableDelay;
 int   MtmTransSpillThreshold;
 int   MtmMaxNodes;
+int   MtmHeartbeatSendTimeout;
+int   MtmHeartbeatRecvTimeout;
 bool  MtmUseRaftable;
 bool  MtmUseDtm;
 
@@ -741,6 +743,27 @@ MtmPrePrepareTransaction(MtmCurrentTrans* x)
 
 }
 
+/*
+ * Check heartbeats: disconnect each enabled peer that has been silent for longer than multimaster.heartbeat_recv_timeout
+ * (lastHeartbeat == 0 disarms the check). Call without the Mtm lock held: MtmOnNodeDisconnect() may take it itself.
+ */
+static void MtmWatchdog(void)
+{
+	int i, n = Mtm->nAllNodes;
+	timestamp_t now = MtmGetSystemTime();
+	timestamp_t maxSilence = MSEC_TO_USEC((timestamp_t)MtmHeartbeatRecvTimeout); /* widen before scaling msec to usec */
+	for (i = 0; i < n; i++) { 
+		/* Read the shared timestamp once so that the timeout check and the logged age agree */
+		timestamp_t last = Mtm->nodes[i].lastHeartbeat;
+		if (i+1 != MtmNodeId && !BIT_CHECK(Mtm->disabledNodeMask, i) && last != 0 && now > last + maxSilence) {
+			elog(WARNING, "Disable node %d because last heartbeat was received %d msec ago", 
+				 i+1, (int)USEC_TO_MSEC(now - last));
+			MtmOnNodeDisconnect(i+1);
+		}
+	}
+}
+
+
 static void
 MtmPostPrepareTransaction(MtmCurrentTrans* x)
 { 
@@ -770,14 +793,24 @@ MtmPostPrepareTransaction(MtmCurrentTrans* x)
 		MtmUnlock();
 		MtmResetTransaction(x);
 	} else { 
-		time_t timeout = Max(Mtm2PCMinTimeout, (ts->csn - ts->snapshot)*Mtm2PCPrepareRatio/100000); /* usec->msec and percents */ 
+		time_t transTimeout = Max(Mtm2PCMinTimeout, (ts->csn - ts->snapshot)*Mtm2PCPrepareRatio/100000); /* usec->msec and percents */ 
+		time_t timeout = transTimeout < MtmHeartbeatRecvTimeout ? transTimeout : MtmHeartbeatRecvTimeout;
+		timestamp_t deadline = MtmGetSystemTime() + MSEC_TO_USEC(transTimeout);
 		int result = 0;
 		int nConfigChanges = Mtm->nConfigChanges;
 		/* wait votes from all nodes */
-		while (!ts->votingCompleted && !(result & WL_TIMEOUT)) {
+		while (!ts->votingCompleted) {
 			MtmUnlock();
+			MtmWatchdog();
 			result = WaitLatch(&MyProc->procLatch, WL_LATCH_SET|WL_TIMEOUT, timeout);
-			ResetLatch(&MyProc->procLatch);			
+			if (result & WL_TIMEOUT) { 
+				if (MtmGetSystemTime() > deadline) { 
+					MtmLock(LW_SHARED);
+					break;
+				}
+			} else { 
+				ResetLatch(&MyProc->procLatch);			
+			} 
 			MtmLock(LW_SHARED);
 		}
 		if (!ts->votingCompleted) { 
@@ -1022,6 +1055,22 @@ void MtmHandleApplyError(void)
 }
 
 
+static void MtmDisableNode(int nodeId) /* nodeId is 1-based; sets the node's disabled bit, stamps the change time and decrements nLiveNodes */
+{
+	BIT_SET(Mtm->disabledNodeMask, nodeId-1);
+	Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
+	Mtm->nodes[nodeId-1].lastHeartbeat = 0; /* zero disarms MtmWatchdog for this node until the receiver records a new heartbeat */
+	Mtm->nLiveNodes -= 1;			/* no already-disabled check here: callers are expected to test the mask first */
+}
+	
+static void MtmEnableNode(int nodeId) /* nodeId is 1-based; clears the node's disabled bit, stamps the change time and increments nLiveNodes */
+{ 
+	BIT_CLEAR(Mtm->disabledNodeMask, nodeId-1);
+	Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
+	Mtm->nodes[nodeId-1].lastHeartbeat = 0; /* zero disarms MtmWatchdog so a stale timestamp cannot disable the node again at once */
+	Mtm->nLiveNodes += 1;			/* no already-enabled check here: callers are expected to test the mask first */
+}
+
 void MtmRecoveryCompleted(void)
 {
 	MTM_LOG1("Recovery of node %d is completed", MtmNodeId);
@@ -1116,9 +1165,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
 				MTM_LOG1("%d: node %d is caugth-up without locking cluster", MyProcPid, nodeId);	
 				/* We are lucky: caugth-up without locking cluster! */
 			}
-			BIT_CLEAR(Mtm->disabledNodeMask, nodeId-1);
-			Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
-			Mtm->nLiveNodes += 1;
+			MtmEnableNode(nodeId);
 			Mtm->nConfigChanges += 1;
 			caughtUp = true;
 		} else if (!BIT_CHECK(Mtm->nodeLockerMask, nodeId-1)
@@ -1261,17 +1308,13 @@ bool MtmRefreshClusterStatus(bool nowait)
 		mask = ~clique & (((nodemask_t)1 << Mtm->nAllNodes)-1) & ~Mtm->disabledNodeMask; /* new disabled nodes mask */
 		for (i = 0; mask != 0; i++, mask >>= 1) {
 			if (mask & 1) { 
-				Mtm->nLiveNodes -= 1;
-				BIT_SET(Mtm->disabledNodeMask, i);
-				Mtm->nodes[i].lastStatusChangeTime = MtmGetSystemTime();
+				MtmDisableNode(i+1);
 			}
 		}
 		mask = clique & Mtm->disabledNodeMask; /* new enabled nodes mask */		
 		for (i = 0; mask != 0; i++, mask >>= 1) {
 			if (mask & 1) { 
-				Mtm->nLiveNodes += 1;
-				BIT_CLEAR(Mtm->disabledNodeMask, i);
-				Mtm->nodes[i].lastStatusChangeTime = MtmGetSystemTime();
+				MtmEnableNode(i+1);
 			}
 		}
 		MtmCheckQuorum();
@@ -1316,7 +1359,6 @@ void MtmOnNodeDisconnect(int nodeId)
 		/* Avoid false detection of node failure and prevent node status blinking */
 		return;
 	}
-
 	BIT_SET(Mtm->connectivityMask, nodeId-1);
 	BIT_SET(Mtm->reconnectMask, nodeId-1);
 	RaftableSet(psprintf("node-mask-%d", MtmNodeId), &Mtm->connectivityMask, sizeof Mtm->connectivityMask, false);
@@ -1327,9 +1369,7 @@ void MtmOnNodeDisconnect(int nodeId)
 	if (!MtmRefreshClusterStatus(false)) { 
 		MtmLock(LW_EXCLUSIVE);
 		if (!BIT_CHECK(Mtm->disabledNodeMask, nodeId-1)) { 
-			Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
-			BIT_SET(Mtm->disabledNodeMask, nodeId-1);
-			Mtm->nLiveNodes -= 1;
+			MtmDisableNode(nodeId);
 			MtmCheckQuorum();
 			/* Interrupt voting for active transaction and abort them */
 			for (ts = Mtm->transListHead; ts != NULL; ts = ts->next) { 
@@ -1503,6 +1543,7 @@ static void MtmInitialize()
 			Mtm->nodes[i].lastStatusChangeTime = MtmGetSystemTime();
 			Mtm->nodes[i].con = MtmConnections[i];
 			Mtm->nodes[i].flushPos = 0;
+			Mtm->nodes[i].lastHeartbeat = 0;
 		}
 		PGSemaphoreCreate(&Mtm->votingSemaphore);
 		PGSemaphoreReset(&Mtm->votingSemaphore);
@@ -1627,6 +1668,36 @@ _PG_init(void)
 	if (!process_shared_preload_libraries_in_progress)
 		return;
 
+	DefineCustomIntVariable(
+		"multimaster.heartbeat_send_timeout", 
+		"Timeout in milliseconds of sending heartbeat messages",
+		"Period of broadcasting heartbeat messages by abiter to all nodes",
+		&MtmHeartbeatSendTimeout,
+		1000,
+		1,
+		INT_MAX,
+		PGC_BACKEND,
+		0,
+		NULL,
+		NULL,
+		NULL
+	);
+
+	DefineCustomIntVariable(
+		"multimaster.heartbeat_recv_timeout", 
+		"Timeout in milliseconds of receiving heartbeat messages",
+		"If no heartbeat message is received from node within this period, it assumed to be dead",
+		&MtmHeartbeatRecvTimeout,
+		2000,
+		1,
+		INT_MAX,
+		PGC_BACKEND,
+		0,
+		NULL,
+		NULL,
+		NULL
+	);
+
 	DefineCustomIntVariable(
 		"multimaster.gc_period",
 		"Number of distributed transactions after which garbage collection is started",
@@ -2056,9 +2127,7 @@ void MtmDropNode(int nodeId, bool dropSlot)
 		{ 
 			elog(ERROR, "NodeID %d is out of range [1,%d]", nodeId, Mtm->nLiveNodes);
 		}
-		Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
-		BIT_SET(Mtm->disabledNodeMask, nodeId-1);
-		Mtm->nLiveNodes -= 1;
+		MtmDisableNode(nodeId);
 		MtmCheckQuorum();
 		if (!MtmIsBroadcast())
 		{
@@ -2110,17 +2179,13 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
 	if (MtmIsRecoverySession) {
 		MTM_LOG1("%d: Node %d start recovery of node %d", MyProcPid, MtmNodeId, MtmReplicationNodeId);
 		if (!BIT_CHECK(Mtm->disabledNodeMask,  MtmReplicationNodeId-1)) {
-			Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
-			BIT_SET(Mtm->disabledNodeMask,  MtmReplicationNodeId-1);
-			Mtm->nLiveNodes -= 1;			
+			MtmDisableNode(MtmReplicationNodeId);
 			MtmCheckQuorum();
 		}
 	} else if (BIT_CHECK(Mtm->disabledNodeMask,  MtmReplicationNodeId-1)) {
 		if (recoveryCompleted) { 
 			MTM_LOG1("Node %d consider that recovery of node %d is completed: start normal replication", MtmNodeId, MtmReplicationNodeId); 
-			Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
-			BIT_CLEAR(Mtm->disabledNodeMask,  MtmReplicationNodeId-1);
-			Mtm->nLiveNodes += 1;
+			MtmEnableNode(MtmReplicationNodeId);
 			MtmCheckQuorum();
 		} else {
 			elog(ERROR, "Disabled node %d tries to reconnect without recovery", MtmReplicationNodeId); 
diff --git a/multimaster.h b/multimaster.h
@@ -92,7 +92,8 @@ typedef enum
 	MSG_PREPARE,
 	MSG_PREPARED,
 	MSG_ABORTED,
-	MSG_STATUS
+	MSG_STATUS,
+	MSG_HEARTBEAT
 } MtmMessageCode;
 
 typedef enum
@@ -127,6 +128,7 @@ typedef struct
 	timestamp_t lastStatusChangeTime;
 	timestamp_t receiverStartTime;
 	timestamp_t senderStartTime;
+	timestamp_t lastHeartbeat;
 	int         senderPid;
 	int         receiverPid;
 	XLogRecPtr  flushPos;
@@ -218,6 +220,8 @@ extern int   MtmReconnectAttempts;
 extern int   MtmKeepaliveTimeout;
 extern int   MtmNodeDisableDelay;
 extern int   MtmTransSpillThreshold;
+extern int   MtmHeartbeatSendTimeout;
+extern int   MtmHeartbeatRecvTimeout;
 extern bool  MtmUseDtm;
 extern HTAB* MtmXid2State;