
Commit 53da0eb

Avoid false detection of lost heartbeats in watchdog
1 parent f36de77 commit 53da0eb

File tree

5 files changed: +111 −35 lines


contrib/mmts/arbiter.c

+12 −4

@@ -219,12 +219,18 @@ static int MtmWaitSocket(int sd, bool forWrite, time_t timeoutMsec)
 	struct timeval tv;
 	fd_set set;
 	int rc;
-	tv.tv_sec = timeoutMsec/1000;
-	tv.tv_usec = timeoutMsec%1000*1000;
+	timestamp_t deadline = MtmGetSystemTime() + MSEC_TO_USEC(timeoutMsec);
 	FD_ZERO(&set);
 	FD_SET(sd, &set);
 	do {
+		timestamp_t now;
 		MtmCheckHeartbeat();
+		now = MtmGetSystemTime();
+		if (now > deadline) {
+			return 0;
+		}
+		tv.tv_sec = (deadline - now)/USECS_PER_SEC;
+		tv.tv_usec = (deadline - now)%USECS_PER_SEC;
 	} while ((rc = select(sd+1, forWrite ? NULL : &set, forWrite ? &set : NULL, NULL, &tv)) < 0 && errno == EINTR);
 	return rc;
 }

@@ -371,7 +377,7 @@ static void MtmSendHeartbeat()
 			elog(LOG, "Arbiter failed to send heartbeat to node %d", i+1);
 		} else {
 			if (last_heartbeat_to_node[i] + MSEC_TO_USEC(MtmHeartbeatSendTimeout)*2 < now) {
-				MTM_LOG1("Last hearbeat to node %d was sent %ld microseconds ago", i+1, now - last_heartbeat_to_node[i]);
+				MTM_LOG1("Last heartbeat to node %d was sent %ld microseconds ago", i+1, now - last_heartbeat_to_node[i]);
 			}
 			last_heartbeat_to_node[i] = now;
 			/* Connectivity mask can be cleared by MtmWatchdog: in this case sockets[i] >= 0 */

@@ -865,6 +871,7 @@ static void MtmReceiver(Datum arg)
 				MtmDisconnect(i);
 			}
 		}
+		now = MtmGetSystemTime();
 		for (j = 0; j < n; j++) {
 			if (events[j].events & EPOLLIN)
 #else

@@ -882,6 +889,7 @@ static void MtmReceiver(Datum arg)
 		if (n < 0) {
 			elog(ERROR, "Arbiter failed to select sockets: %d", errno);
 		}
+		now = MtmGetSystemTime();
 		for (i = 0; i < nNodes; i++) {
 			if (sockets[i] >= 0 && FD_ISSET(sockets[i], &events))
 #endif

@@ -1126,7 +1134,7 @@ static void MtmReceiver(Datum arg)
 				}
 			}
 			if (Mtm->status == MTM_ONLINE) {
-				now = MtmGetSystemTime();
+				/* "now" is time of performing select, so that delays in processing should not cause false detection */
 				if (now > lastHeartbeatCheck + MSEC_TO_USEC(MtmHeartbeatRecvTimeout)) {
 					if (!MtmWatchdog(now)) {
 						for (i = 0; i < nNodes; i++) {
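Editor's note: the MtmWaitSocket change above converts the relative timeout into an absolute deadline computed once, and recomputes the remaining interval before every retry, so EINTR wakeups and the MtmCheckHeartbeat() call inside the loop no longer restart the full timeout. Likewise, MtmReceiver now captures "now" right after epoll/select returns, so the heartbeat watchdog measures network silence rather than message-processing time. A minimal standalone sketch of the deadline pattern, in plain POSIX with illustrative names (not the multimaster code itself):

/* Sketch of a deadline-based select() retry loop (illustrative only). */
#include <errno.h>
#include <stdint.h>
#include <sys/select.h>
#include <sys/time.h>

static int64_t now_usec(void)
{
	struct timeval tv;
	gettimeofday(&tv, NULL);
	return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}

/* Wait until fd becomes readable or timeout_msec elapses.
 * Returns select()'s result; 0 means the overall deadline expired. */
static int wait_readable(int fd, long timeout_msec)
{
	int64_t deadline = now_usec() + (int64_t)timeout_msec * 1000;
	fd_set  set;
	int     rc;

	do {
		int64_t        remaining = deadline - now_usec();
		struct timeval tv;

		if (remaining <= 0)
			return 0;                /* deadline reached: report timeout, do not restart it */

		tv.tv_sec  = remaining / 1000000;
		tv.tv_usec = remaining % 1000000;

		FD_ZERO(&set);
		FD_SET(fd, &set);
		rc = select(fd + 1, &set, NULL, NULL, &tv);
	} while (rc < 0 && errno == EINTR);  /* interrupted: retry with the shrunken remainder */

	return rc;
}

Returning 0 on an expired deadline matches select()'s own timeout convention, so callers need no extra case.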

contrib/mmts/multimaster.c

+92 −25

@@ -243,7 +243,6 @@ static int MtmMaxRecoveryLag;
 static int MtmGcPeriod;
 static bool MtmIgnoreTablesWithoutPk;
 static int MtmLockCount;
-static int MtmSenderStarted;
 
 static ExecutorStart_hook_type PreviousExecutorStartHook;
 static ExecutorFinish_hook_type PreviousExecutorFinishHook;

@@ -261,7 +260,11 @@ static void MtmProcessUtility(Node *parsetree, const char *queryString,
 /*
  * -------------------------------------------
  * Synchronize access to MTM structures.
- * Using LWLock seems to be more efficient (at our benchmarks)
+ * Using LWLock seems to be more efficient (at our benchmarks)
+ * Multimaster uses trash of 2N+1 lwlocks, where N is number of nodes.
+ * locks[0] is used to synchronize access to multimaster state,
+ * locks[1..N] are used to provide exclusive access to replication session for each node
+ * locks[N+1..2*N] are used to synchronize access to distributed lock graph at each node
  * -------------------------------------------
  */
 void MtmLock(LWLockMode mode)
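Editor's note: the expanded comment above documents the 2N+1 lock layout; the index arithmetic it describes could be written as below. This is a sketch only — the macro names are hypothetical and do not appear in the source.

/* Hypothetical helpers for the 2N+1 lock layout described above:
 * slot 0 guards shared multimaster state, slots 1..N the per-node replication
 * sessions, and slots N+1..2N the per-node distributed lock graphs. */
#define MTM_STATE_LOCK                           0
#define MTM_REPLICATION_LOCK(nodeId)             (nodeId)              /* nodeId in 1..N */
#define MTM_DEADLOCK_GRAPH_LOCK(nodeId, nNodes)  ((nNodes) + (nodeId))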
@@ -316,6 +319,9 @@ timestamp_t MtmGetSystemTime(void)
 	return (timestamp_t)tv.tv_sec*USECS_PER_SEC + tv.tv_usec;
 }
 
+/*
+ * Get adjusted system time: taking in account time shift
+ */
 timestamp_t MtmGetCurrentTime(void)
 {
 	return MtmGetSystemTime() + Mtm->timeShift;

@@ -610,13 +616,16 @@ MtmAdjustOldestXid(TransactionId xid)
 	}
 	return xid;
 }
+
 /*
  * -------------------------------------------
- * Transaction list manipulation
+ * Transaction list manipulation.
+ * All distributed transactions are linked in L1-list ordered by transaction start time.
+ * This list is inspected by MtmAdjustOldestXid and transactions which are not used in any snapshot at any node
+ * are removed from the list and from the hash.
  * -------------------------------------------
  */
 
-
 static void MtmTransactionListAppend(MtmTransState* ts)
 {
 	if (!ts->isEnqueued) {

@@ -1293,6 +1302,9 @@ MtmEndTransaction(MtmCurrentTrans* x, bool commit)
 	}
 }
 
+/*
+ * Send arbiter's message
+ */
 void MtmSendMessage(MtmArbiterMessage* msg)
 {
 	SpinLockAcquire(&Mtm->queueSpinlock);

@@ -1315,6 +1327,11 @@ void MtmSendMessage(MtmArbiterMessage* msg)
 	SpinLockRelease(&Mtm->queueSpinlock);
 }
 
+/*
+ * Send arbiter's 2PC message. Right now only responses to coordinates are
+ * sent through arbiter. Brodcasts from coordinator to noes are done
+ * using logical decoding.
+ */
 void MtmSend2PCMessage(MtmTransState* ts, MtmMessageCode cmd)
 {
 	MtmArbiterMessage msg;

@@ -1347,6 +1364,11 @@ void MtmSend2PCMessage(MtmTransState* ts, MtmMessageCode cmd)
 	}
 }
 
+/*
+ * Broadcase poll state message to all nodes.
+ * This function is used to gather information about state of prepared transaction
+ * at node startup or after crash of some node.
+ */
 static void MtmBroadcastPollMessage(MtmTransState* ts)
 {
 	int i;

@@ -1370,7 +1392,9 @@ static void MtmBroadcastPollMessage(MtmTransState* ts)
 }
 
 /*
- * Restore state of recovered prepared transaction in memory
+ * Restore state of recovered prepared transaction in memory.
+ * This function is called at system startup to make it possible to
+ * handle this prepared transactions in normal way.
  */
 static void MtmLoadPreparedTransactions(void)
 {

@@ -1426,6 +1450,10 @@ static void MtmStartRecovery()
 	MtmUnlock();
 }
 
+
+/*
+ * Prepare context for applying transaction at replica
+ */
 void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t globalSnapshot)
 {
 	MtmTx.gtid = *gtid;

@@ -1479,6 +1507,13 @@ XidStatus MtmGetCurrentTransactionStatus(void)
 	return MtmTx.status;
 }
 
+/*
+ * Perform atomic exchange of global transaction status.
+ * The problem is that because of concurrent applying transactions at replica by multiple
+ * threads we can proceed ABORT request before PREPARE - when transaction is not yet
+ * applied at this node and there is MtmTransState associated with this transactions.
+ * We remember information about status of this transaction in MtmTransMap.
+ */
 XidStatus MtmExchangeGlobalTransactionStatus(char const* gid, XidStatus new_status)
 {
 	MtmTransMap* tm;

@@ -1526,6 +1561,9 @@ csn_t MtmGetTransactionCSN(TransactionId xid)
 	return csn;
 }
 
+/*
+ * Wakeup coordinator's backend when voting is completed
+ */
 void MtmWakeUpBackend(MtmTransState* ts)
 {
 	if (!ts->votingCompleted) {

@@ -1536,6 +1574,10 @@ void MtmWakeUpBackend(MtmTransState* ts)
 	}
 }
 
+
+/*
+ * Abort the transaction if it is not yet aborted
+ */
 void MtmAbortTransaction(MtmTransState* ts)
 {
 	Assert(MtmLockCount != 0); /* should be invoked with exclsuive lock */

@@ -1561,6 +1603,11 @@ void MtmAbortTransaction(MtmTransState* ts)
  * -------------------------------------------
  */
 
+/*
+ * Handle critical errors while applying transaction at replica.
+ * Such errors should cause shutdown of this cluster node to allow other nodes to continue serving client requests.
+ * Other error will be just reported and ignored
+ */
 void MtmHandleApplyError(void)
 {
 	ErrorData *edata = CopyErrorData();

@@ -1570,13 +1617,15 @@ void MtmHandleApplyError(void)
 	  case ERRCODE_IO_ERROR:
 	  case ERRCODE_DATA_CORRUPTED:
 	  case ERRCODE_INDEX_CORRUPTED:
+		/* Should we really treate this errors as fatal?
 	  case ERRCODE_SYSTEM_ERROR:
 	  case ERRCODE_INTERNAL_ERROR:
 	  case ERRCODE_OUT_OF_MEMORY:
+		*/
 		elog(WARNING, "Node is excluded from cluster because of non-recoverable error %d, %s, pid=%u",
 			 edata->sqlerrcode, edata->message, getpid());
-		// MtmSwitchClusterMode(MTM_OUT_OF_SERVICE);
-		// kill(PostmasterPid, SIGQUIT);
+		MtmSwitchClusterMode(MTM_OUT_OF_SERVICE);
+		kill(PostmasterPid, SIGQUIT);
 		break;
 	}
 	FreeErrorData(edata);

@@ -1643,6 +1692,9 @@ static void MtmEnableNode(int nodeId)
 	elog(WARNING, "Enable node %d at xlog position %lx", nodeId, GetXLogInsertRecPtr());
 }
 
+/*
+ * Function call when recovery of node is completed
+ */
 void MtmRecoveryCompleted(void)
 {
 	int i;

@@ -1712,7 +1764,7 @@ static int64 MtmGetSlotLag(int nodeId)
 
 /*
  * This function is called by WAL sender when start sending new transaction.
- * It returns true if specified node is in recovery mode. In this case we should send all transactions from WAL,
+ * It returns true if specified node is in recovery mode. In this case we should send to it all transactions from WAL,
  * not only coordinated by self node as in normal mode.
 */
 bool MtmIsRecoveredNode(int nodeId)

@@ -1728,7 +1780,13 @@ bool MtmIsRecoveredNode(int nodeId)
 		}
 	}
 
-
+/*
+ * Check if wal sender replayed all transactions from WAL log.
+ * It can never happen if there are many active transactions.
+ * In this case we wait until gap between sent and current position in the
+ * WAL becomes smaller than threshold value MtmMinRecoveryLag and
+ * after it prohibit start of new transactions until WAL is completely replayed.
+ */
 bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
 {
 	bool caughtUp = false;

@@ -1822,7 +1880,7 @@ MtmCheckClusterLock()
 			MtmLock(LW_EXCLUSIVE);
 			continue;
 		} else {
-			/* All lockers are synchronized their logs */
+			/* All lockers have synchronized their logs */
 			/* Remove lock and mark them as recovered */
 			MTM_LOG1("Complete recovery of %d nodes (node mask %lx)", Mtm->nLockers, (long) Mtm->nodeLockerMask);
 			Assert(Mtm->walSenderLockerMask == 0);

@@ -2186,7 +2244,8 @@ static void MtmInitialize()
 		Mtm->nAllNodes = MtmNodes;
 		Mtm->disabledNodeMask = 0;
 		Mtm->connectivityMask = 0;
-		Mtm->pglogicalNodeMask = 0;
+		Mtm->pglogicalReceiverMask = 0;
+		Mtm->pglogicalSenderMask = 0;
 		Mtm->walSenderLockerMask = 0;
 		Mtm->nodeLockerMask = 0;
 		Mtm->reconnectMask = 0;

@@ -2900,8 +2959,8 @@ _PG_fini(void)
 void MtmReceiverStarted(int nodeId)
 {
 	MtmLock(LW_EXCLUSIVE);
-	if (!BIT_CHECK(Mtm->pglogicalNodeMask, nodeId-1)) {
-		BIT_SET(Mtm->pglogicalNodeMask, nodeId-1);
+	if (!BIT_CHECK(Mtm->pglogicalReceiverMask, nodeId-1)) {
+		BIT_SET(Mtm->pglogicalReceiverMask, nodeId-1);
 		if (BIT_CHECK(Mtm->disabledNodeMask, nodeId-1)) {
 			MtmEnableNode(nodeId);
 			MtmCheckQuorum();

@@ -3014,7 +3073,8 @@ MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shut
 			Mtm->nReceivers = 0;
 			Mtm->nSenders = 0;
 			Mtm->recoveryCount += 1;
-			Mtm->pglogicalNodeMask = 0;
+			Mtm->pglogicalReceiverMask = 0;
+			Mtm->pglogicalSenderMask = 0;
 			MtmUnlock();
 			return REPLMODE_RECOVERY;
 		}

@@ -3155,19 +3215,21 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
 			MtmEnableNode(MtmReplicationNodeId);
 			MtmCheckQuorum();
 		} else {
-			/* Force arbiter to reestablish connection with this nodem send heartbeat to inform this node that it was disabled and should perform recovery */
+			/* Force arbiter to reestablish connection with this node, send heartbeat to inform this node that it was disabled and should perform recovery */
 			BIT_SET(Mtm->reconnectMask, MtmReplicationNodeId-1);
 			MtmUnlock();
 			elog(ERROR, "Disabled node %d tries to reconnect without recovery", MtmReplicationNodeId);
 		}
 	} else {
 		MTM_LOG1("Node %d start logical replication to node %d in normal mode", MtmNodeId, MtmReplicationNodeId);
 	}
-	elog(LOG, "Start %d senders and %d receivers from %d cluster status %s", Mtm->nSenders+1, Mtm->nReceivers, Mtm->nLiveNodes-1, MtmNodeStatusMnem[Mtm->status]);
-	MtmSenderStarted = 1;
-	if (++Mtm->nSenders == Mtm->nLiveNodes-1 && Mtm->nReceivers == Mtm->nLiveNodes-1 && Mtm->status == MTM_CONNECTED) {
-		/* All logical replication connections from and to this node are established, so we can switch cluster to online mode */
-		MtmSwitchClusterMode(MTM_ONLINE);
+	if (!BIT_CHECK(Mtm->pglogicalSenderMask, MtmReplicationNodeId-1)) {
+		elog(LOG, "Start %d senders and %d receivers from %d cluster status %s", Mtm->nSenders+1, Mtm->nReceivers, Mtm->nLiveNodes-1, MtmNodeStatusMnem[Mtm->status]);
+		BIT_SET(Mtm->pglogicalSenderMask, MtmReplicationNodeId-1);
+		if (++Mtm->nSenders == Mtm->nLiveNodes-1 && Mtm->nReceivers == Mtm->nLiveNodes-1 && Mtm->status == MTM_CONNECTED) {
+			/* All logical replication connections from and to this node are established, so we can switch cluster to online mode */
+			MtmSwitchClusterMode(MTM_ONLINE);
+		}
 	}
 	BIT_SET(Mtm->reconnectMask, MtmReplicationNodeId-1); /* arbiter should try to reestablish connection with this node */
 	MtmUnlock();

@@ -3227,14 +3289,15 @@ void MtmUpdateLsnMapping(int node_id, XLogRecPtr end_lsn)
 static void
 MtmReplicationShutdownHook(struct PGLogicalShutdownHookArgs* args)
 {
-	if (MtmReplicationNodeId >= 0) {
-		MtmLock(LW_EXCLUSIVE);
-		Mtm->nSenders -= MtmSenderStarted;
-		MtmUnlock();
+	MtmLock(LW_EXCLUSIVE);
+	if (MtmReplicationNodeId >= 0 && BIT_CHECK(Mtm->pglogicalSenderMask, MtmReplicationNodeId-1)) {
+		BIT_CLEAR(Mtm->pglogicalSenderMask, MtmReplicationNodeId-1);
+		Mtm->nSenders -= 1;
 		MTM_LOG1("Logical replication to node %d is stopped", MtmReplicationNodeId);
 		/* MtmOnNodeDisconnect(MtmReplicationNodeId); */
-		MtmReplicationNodeId = -1; /* defuse on_proc_exit hook */
+		MtmReplicationNodeId = -1; /* defuse MtmOnProcExit hook */
 	}
+	MtmUnlock();
 }
 
 /*

@@ -3949,6 +4012,10 @@ MtmGenerateGid(char* gid)
 	sprintf(gid, "MTM-%d-%d-%d", MtmNodeId, MyProcPid, ++localCount);
 }
 
+/*
+ * Replace normal commit with two-phase commit.
+ * It is called either for commit of standalone command either for commit of transaction block.
+ */
 static bool MtmTwoPhaseCommit(MtmCurrentTrans* x)
 {
 //	if (MyXactAccessedTempRel)
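Editor's note: the hunks above replace the per-backend MtmSenderStarted flag with a shared pglogicalSenderMask, so the nSenders counter is adjusted at most once per node: a sender is counted only when its bit is not yet set at startup, and uncounted only when its bit is still set at shutdown. A condensed sketch of that guard pattern (standalone and illustrative; the bit macros and variables are defined here for the example, and in the module itself the updates happen under the exclusive MtmLock):

#include <stdint.h>

typedef uint64_t nodemask_t;

/* Bit helpers in the style used by multimaster (definitions here are illustrative). */
#define BIT_CHECK(mask, bit) (((mask) >> (bit)) & 1)
#define BIT_SET(mask, bit)   ((mask) |= (nodemask_t)1 << (bit))
#define BIT_CLEAR(mask, bit) ((mask) &= ~((nodemask_t)1 << (bit)))

static nodemask_t senderMask;   /* which node IDs already have a running sender */
static int        nSenders;     /* must stay equal to the number of set bits    */

/* Register a sender for nodeId (1-based); counted only the first time. */
static void sender_started(int nodeId)
{
	if (!BIT_CHECK(senderMask, nodeId - 1)) {
		BIT_SET(senderMask, nodeId - 1);
		nSenders += 1;
	}
}

/* Unregister a sender; decrement only if this node was actually counted. */
static void sender_stopped(int nodeId)
{
	if (BIT_CHECK(senderMask, nodeId - 1)) {
		BIT_CLEAR(senderMask, nodeId - 1);
		nSenders -= 1;
	}
}

Keeping the mask in shared state rather than in a backend-local flag makes the counter robust to repeated startup/shutdown hook invocations for the same node.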

contrib/mmts/multimaster.h

+2 −1

@@ -263,7 +263,8 @@ typedef struct
 	TransactionId oldestXid;          /* XID of oldest transaction visible by any active transaction (local or global) */
 	nodemask_t disabledNodeMask;      /* bitmask of disabled nodes */
 	nodemask_t connectivityMask;      /* bitmask of disconnected nodes */
-	nodemask_t pglogicalNodeMask;     /* bitmask of started pglogic receivers */
+	nodemask_t pglogicalReceiverMask; /* bitmask of started pglogic receivers */
+	nodemask_t pglogicalSenderMask;   /* bitmask of started pglogic senders */
 	nodemask_t walSenderLockerMask;   /* Mask of WAL-senders IDs locking the cluster */
 	nodemask_t nodeLockerMask;        /* Mask of node IDs which WAL-senders are locking the cluster */
 	nodemask_t reconnectMask;         /* Mask of nodes connection to which has to be reestablished by sender */
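Editor's note: the header now tracks receiver and sender connections in two separate nodemask_t bitmasks, one bit per node, so each direction can be checked independently. The module itself keeps nSenders/nReceivers counters alongside the masks, but the same "everything connected" condition could be derived from the masks directly; a hedged illustration with invented helper names:

#include <stdbool.h>
#include <stdint.h>

typedef uint64_t nodemask_t;

/* Count set bits: number of nodes whose receiver (or sender) is running. */
static int nodemask_count(nodemask_t mask)
{
	int n = 0;
	while (mask) {
		mask &= mask - 1;   /* clear the lowest set bit */
		n++;
	}
	return n;
}

/* From this node's point of view the cluster is fully connected when every
 * other live node has both a started receiver and a started sender. */
static bool all_connected(nodemask_t receiverMask, nodemask_t senderMask, int nLiveNodes)
{
	return nodemask_count(receiverMask) == nLiveNodes - 1
		&& nodemask_count(senderMask) == nLiveNodes - 1;
}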

contrib/mmts/pglogical_proto.c

+1 −1

@@ -211,7 +211,7 @@ pglogical_write_commit(StringInfo out, PGLogicalOutputData *data,
 		return;
 	}
 	if (isRecovery) {
-		MTM_LOG1("PGLOGICAL_SEND recover transaction: event=%d, gid=%s, xid=%d, commit_lsn=%lx, txn->end_lsn=%lx, xlog=%lx",
+		MTM_LOG2("PGLOGICAL_SEND recover transaction: event=%d, gid=%s, xid=%d, commit_lsn=%lx, txn->end_lsn=%lx, xlog=%lx",
 				 flags, txn->gid, txn->xid, commit_lsn, txn->end_lsn, GetXLogInsertRecPtr());
 	}
 	if (flags == PGLOGICAL_ABORT_PREPARED) {

0 commit comments
