Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 97c667c

Browse files
knizhnik authored and kelvich committed
Improve deadlock detection algorithm by taking into account hidden dependencies between transactions caused by lack of vacant workers in apply pool
1 parent 46a5c82 commit 97c667c

File tree

4 files changed

+69
-26
lines changed

4 files changed

+69
-26
lines changed

bgwpool.c

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ static void BgwPoolMainLoop(Datum arg)
3535
work = malloc(size);
3636
pool->pending -= 1;
3737
pool->active += 1;
38+
if (pool->lastPeakTime == 0 && pool->active == pool->nWorkers && pool->pending != 0) {
39+
pool->lastPeakTime = MtmGetSystemTime();
40+
}
3841
if (pool->head + size + 4 > pool->size) {
3942
memcpy(work, pool->queue, size);
4043
pool->head = INTALIGN(size);
@@ -48,17 +51,19 @@ static void BgwPoolMainLoop(Datum arg)
4851
if (pool->producerBlocked) {
4952
pool->producerBlocked = false;
5053
PGSemaphoreUnlock(&pool->overflow);
54+
pool->lastPeakTime = 0;
5155
}
5256
SpinLockRelease(&pool->lock);
5357
pool->executor(id, work, size);
5458
free(work);
5559
SpinLockAcquire(&pool->lock);
5660
pool->active -= 1;
61+
pool->lastPeakTime = 0;
5762
SpinLockRelease(&pool->lock);
5863
}
5964
}
6065

61-
void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize)
66+
void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize, size_t nWorkers)
6267
{
6368
pool->queue = (char*)ShmemAlloc(queueSize);
6469
pool->executor = executor;
@@ -73,8 +78,15 @@ void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, si
7378
pool->size = queueSize;
7479
pool->active = 0;
7580
pool->pending = 0;
81+
pool->nWorkers = nWorkers;
82+
pool->lastPeakTime = 0;
7683
strcpy(pool->dbname, dbname);
7784
}
85+
86+
timestamp_t BgwGetLastPeekTime(BgwPool* pool)
87+
{
88+
return pool->lastPeakTime;
89+
}
7890

7991
void BgwPoolStart(int nWorkers, BgwPoolConstructor constructor)
8092
{
@@ -123,12 +135,18 @@ void BgwPoolExecute(BgwPool* pool, void* work, size_t size)
123135
if ((pool->head <= pool->tail && pool->size - pool->tail < size + 4 && pool->head < size)
124136
|| (pool->head > pool->tail && pool->head - pool->tail < size + 4))
125137
{
126-
pool->producerBlocked = true;
138+
if (pool->lastPeakTime == 0) {
139+
pool->lastPeakTime = MtmGetSystemTime();
140+
}
141+
pool->producerBlocked = true;
127142
SpinLockRelease(&pool->lock);
128143
PGSemaphoreLock(&pool->overflow);
129144
SpinLockAcquire(&pool->lock);
130145
} else {
131146
pool->pending += 1;
147+
if (pool->lastPeakTime == 0 && pool->active == pool->nWorkers && pool->pending != 0) {
148+
pool->lastPeakTime = MtmGetSystemTime();
149+
}
132150
*(int*)&pool->queue[pool->tail] = size;
133151
if (pool->size - pool->tail >= size + 4) {
134152
memcpy(&pool->queue[pool->tail+4], work, size);

bgwpool.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77

88
typedef void(*BgwPoolExecutor)(int id, void* work, size_t size);
99

10+
typedef uint64 timestamp_t;
11+
1012
#define MAX_DBNAME_LEN 30
1113
#define MULTIMASTER_BGW_RESTART_TIMEOUT 1 /* seconds */
1214

@@ -21,6 +23,8 @@ typedef struct
2123
size_t size;
2224
size_t active;
2325
size_t pending;
26+
size_t nWorkers;
27+
time_t lastPeakTime;
2428
bool producerBlocked;
2529
char dbname[MAX_DBNAME_LEN];
2630
char* queue;
@@ -30,10 +34,12 @@ typedef BgwPool*(*BgwPoolConstructor)(void);
3034

3135
extern void BgwPoolStart(int nWorkers, BgwPoolConstructor constructor);
3236

33-
extern void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize);
37+
extern void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, size_t queueSize, size_t nWorkers);
3438

3539
extern void BgwPoolExecute(BgwPool* pool, void* work, size_t size);
3640

3741
extern size_t BgwPoolGetQueueSize(BgwPool* pool);
3842

43+
extern timestamp_t BgwGetLastPeekTime(BgwPool* pool);
44+
3945
#endif

multimaster.c

Lines changed: 33 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -255,13 +255,18 @@ void MtmUnlockNode(int nodeId)
255255
*/
256256

257257

258-
timestamp_t MtmGetCurrentTime(void)
258+
timestamp_t MtmGetSystemTime(void)
259259
{
260260
struct timeval tv;
261261
gettimeofday(&tv, NULL);
262262
return (timestamp_t)tv.tv_sec*USEC + tv.tv_usec + Mtm->timeShift;
263263
}
264264

265+
timestamp_t MtmGetCurrentTime(void)
266+
{
267+
return MtmGetSystemTime() + Mtm->timeShift;
268+
}
269+
265270
void MtmSleep(timestamp_t interval)
266271
{
267272
struct timespec ts;
@@ -1045,7 +1050,7 @@ void MtmRecoveryCompleted(void)
10451050
MtmLock(LW_EXCLUSIVE);
10461051
Mtm->recoverySlot = 0;
10471052
BIT_CLEAR(Mtm->disabledNodeMask, MtmNodeId-1);
1048-
Mtm->nodes[MtmNodeId-1].lastStatusChangeTime = time(NULL);
1053+
Mtm->nodes[MtmNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
10491054
/* Mode will be changed to online once all logical receivers are connected */
10501055
MtmSwitchClusterMode(MTM_CONNECTED);
10511056
MtmUnlock();
@@ -1134,7 +1139,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
11341139
/* We are lucky: caught up without locking cluster! */
11351140
}
11361141
BIT_CLEAR(Mtm->disabledNodeMask, nodeId-1);
1137-
Mtm->nodes[nodeId-1].lastStatusChangeTime = time(NULL);
1142+
Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
11381143
Mtm->nNodes += 1;
11391144
caughtUp = true;
11401145
} else if (!BIT_CHECK(Mtm->nodeLockerMask, nodeId-1)
@@ -1279,15 +1284,15 @@ bool MtmRefreshClusterStatus(bool nowait)
12791284
if (mask & 1) {
12801285
Mtm->nNodes -= 1;
12811286
BIT_SET(Mtm->disabledNodeMask, i);
1282-
Mtm->nodes[i].lastStatusChangeTime = time(NULL);
1287+
Mtm->nodes[i].lastStatusChangeTime = MtmGetSystemTime();
12831288
}
12841289
}
12851290
mask = clique & Mtm->disabledNodeMask; /* new enabled nodes mask */
12861291
for (i = 0; mask != 0; i++, mask >>= 1) {
12871292
if (mask & 1) {
12881293
Mtm->nNodes += 1;
12891294
BIT_CLEAR(Mtm->disabledNodeMask, i);
1290-
Mtm->nodes[i].lastStatusChangeTime = time(NULL);
1295+
Mtm->nodes[i].lastStatusChangeTime = MtmGetSystemTime();
12911296
}
12921297
}
12931298
MtmCheckQuorum();
@@ -1327,7 +1332,7 @@ void MtmOnNodeDisconnect(int nodeId)
13271332
{
13281333
MtmTransState *ts;
13291334

1330-
if (Mtm->nodes[nodeId-1].lastStatusChangeTime + MtmNodeDisableDelay > time(NULL)) {
1335+
if (Mtm->nodes[nodeId-1].lastStatusChangeTime + MSEC_TO_USEC(MtmNodeDisableDelay) > MtmGetSystemTime()) {
13311336
/* Avoid false detection of node failure and prevent node status blinking */
13321337
return;
13331338
}
@@ -1342,7 +1347,7 @@ void MtmOnNodeDisconnect(int nodeId)
13421347
if (!MtmRefreshClusterStatus(false)) {
13431348
MtmLock(LW_EXCLUSIVE);
13441349
if (!BIT_CHECK(Mtm->disabledNodeMask, nodeId-1)) {
1345-
Mtm->nodes[nodeId-1].lastStatusChangeTime = time(NULL);
1350+
Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
13461351
BIT_SET(Mtm->disabledNodeMask, nodeId-1);
13471352
Mtm->nNodes -= 1;
13481353
MtmCheckQuorum();
@@ -1510,14 +1515,14 @@ static void MtmInitialize()
15101515
for (i = 0; i < MtmNodes; i++) {
15111516
Mtm->nodes[i].oldestSnapshot = 0;
15121517
Mtm->nodes[i].transDelay = 0;
1513-
Mtm->nodes[i].lastStatusChangeTime = time(NULL);
1518+
Mtm->nodes[i].lastStatusChangeTime = MtmGetSystemTime();
15141519
Mtm->nodes[i].con = MtmConnections[i];
15151520
Mtm->nodes[i].flushPos = 0;
15161521
}
15171522
PGSemaphoreCreate(&Mtm->votingSemaphore);
15181523
PGSemaphoreReset(&Mtm->votingSemaphore);
15191524
SpinLockInit(&Mtm->spinlock);
1520-
BgwPoolInit(&Mtm->pool, MtmExecutor, MtmDatabaseName, MtmQueueSize);
1525+
BgwPoolInit(&Mtm->pool, MtmExecutor, MtmDatabaseName, MtmQueueSize, MtmWorkers);
15211526
RegisterXactCallback(MtmXactCallback, NULL);
15221527
MtmTx.snapshot = INVALID_CSN;
15231528
MtmTx.xid = InvalidTransactionId;
@@ -1681,10 +1686,10 @@ _PG_init(void)
16811686

16821687
DefineCustomIntVariable(
16831688
"multimaster.node_disable_delay",
1684-
"Minamal amount of time (sec) between node status change",
1689+
"Minamal amount of time (msec) between node status change",
16851690
"This delay is used to avoid false detection of node failure and to prevent blinking of node status node",
16861691
&MtmNodeDisableDelay,
1687-
1,
1692+
1000,
16881693
1,
16891694
INT_MAX,
16901695
PGC_BACKEND,
@@ -2032,7 +2037,7 @@ void MtmDropNode(int nodeId, bool dropSlot)
20322037
{
20332038
elog(ERROR, "NodeID %d is out of range [1,%d]", nodeId, Mtm->nNodes);
20342039
}
2035-
Mtm->nodes[nodeId-1].lastStatusChangeTime = time(NULL);
2040+
Mtm->nodes[nodeId-1].lastStatusChangeTime = MtmGetSystemTime();
20362041
BIT_SET(Mtm->disabledNodeMask, nodeId-1);
20372042
Mtm->nNodes -= 1;
20382043
MtmCheckQuorum();
@@ -2083,15 +2088,15 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
20832088
if (MtmIsRecoverySession) {
20842089
MTM_LOG1("%d: Node %d start recovery of node %d", MyProcPid, MtmNodeId, MtmReplicationNodeId);
20852090
if (!BIT_CHECK(Mtm->disabledNodeMask, MtmReplicationNodeId-1)) {
2086-
Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime = time(NULL);
2091+
Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
20872092
BIT_SET(Mtm->disabledNodeMask, MtmReplicationNodeId-1);
20882093
Mtm->nNodes -= 1;
20892094
MtmCheckQuorum();
20902095
}
20912096
} else if (BIT_CHECK(Mtm->disabledNodeMask, MtmReplicationNodeId-1)) {
20922097
if (recoveryCompleted) {
20932098
MTM_LOG1("Node %d consider that recovery of node %d is completed: start normal replication", MtmNodeId, MtmReplicationNodeId);
2094-
Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime = time(NULL);
2099+
Mtm->nodes[MtmReplicationNodeId-1].lastStatusChangeTime = MtmGetSystemTime();
20952100
BIT_CLEAR(Mtm->disabledNodeMask, MtmReplicationNodeId-1);
20962101
Mtm->nNodes += 1;
20972102
MtmCheckQuorum();
@@ -2238,7 +2243,7 @@ mtm_poll_node(PG_FUNCTION_ARGS)
22382243
}
22392244
if (!nowait) {
22402245
/* Just wait some time until logical repication channels will be reestablished */
2241-
MtmSleep(MtmNodeDisableDelay);
2246+
MtmSleep(MSEC_TO_USEC(MtmNodeDisableDelay));
22422247
}
22432248
PG_RETURN_BOOL(online);
22442249
}
@@ -2297,7 +2302,7 @@ mtm_get_nodes_state(PG_FUNCTION_ARGS)
22972302
usrfctx->values[4] = Int64GetDatum(lag);
22982303
usrfctx->nulls[4] = lag < 0;
22992304
usrfctx->values[5] = Int64GetDatum(Mtm->transCount ? Mtm->nodes[usrfctx->nodeId-1].transDelay/Mtm->transCount : 0);
2300-
usrfctx->values[6] = TimestampTzGetDatum(time_t_to_timestamptz(Mtm->nodes[usrfctx->nodeId-1].lastStatusChangeTime));
2305+
usrfctx->values[6] = TimestampTzGetDatum(time_t_to_timestamptz(Mtm->nodes[usrfctx->nodeId-1].lastStatusChangeTime/USEC));
23012306
usrfctx->values[7] = CStringGetTextDatum(Mtm->nodes[usrfctx->nodeId-1].con.connStr);
23022307
usrfctx->nodeId += 1;
23032308

@@ -3058,6 +3063,18 @@ MtmDetectGlobalDeadLock(PGPROC* proc)
30583063
MtmGetGtid(pgxact->xid, &gtid);
30593064
hasDeadlock = MtmGraphFindLoop(&graph, &gtid);
30603065
elog(WARNING, "Distributed deadlock check for %u:%u = %d", gtid.node, gtid.xid, hasDeadlock);
3066+
if (!hasDeadlock) {
3067+
/* There is no deadlock loop in graph, but deadlock can be caused by lack of apply workers: if all of them are busy, then some transactions
3068+
* cannot be applied just because there are no vacant workers, and this causes an additional dependency between transactions which is not
3069+
* reflected in the lock graph
3070+
*/
3071+
timestamp_t lastPeekTime = BgwGetLastPeekTime(&Mtm->pool);
3072+
if (lastPeekTime != 0 && MtmGetSystemTime() - lastPeekTime >= MSEC_TO_USEC(DeadlockTimeout)) {
3073+
hasDeadlock = true;
3074+
elog(WARNING, "Apply workers were blocked more than %d msec",
3075+
(int)USEC_TO_MSEC(MtmGetSystemTime() - lastPeekTime));
3076+
}
3077+
}
30613078
}
30623079
return hasDeadlock;
30633080
}

multimaster.h

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,9 @@
4848

4949
#define USEC 1000000
5050

51+
#define USEC_TO_MSEC(t) ((t)/1000)
52+
#define MSEC_TO_USEC(t) ((t)*1000)
53+
5154
#define Natts_mtm_ddl_log 2
5255
#define Anum_mtm_ddl_log_issued 1
5356
#define Anum_mtm_ddl_log_query 2
@@ -72,8 +75,6 @@ typedef uint64 csn_t; /* commit serial number */
7275
#define PGLOGICAL_CAUGHT_UP 0x04
7376

7477

75-
typedef uint64 timestamp_t;
76-
7778
/* Identifier of global transaction */
7879
typedef struct
7980
{
@@ -122,9 +123,9 @@ typedef struct
122123
typedef struct
123124
{
124125
MtmConnectionInfo con;
125-
time_t transDelay;
126-
time_t lastStatusChangeTime;
127-
XLogRecPtr flushPos;
126+
timestamp_t transDelay;
127+
timestamp_t lastStatusChangeTime;
128+
XLogRecPtr flushPos;
128129
csn_t oldestSnapshot; /* Oldest snapshot used by active transactions at this node */
129130
} MtmNodeInfo;
130131

@@ -232,8 +233,9 @@ extern void MtmRecoverNode(int nodeId);
232233
extern void MtmOnNodeDisconnect(int nodeId);
233234
extern void MtmOnNodeConnect(int nodeId);
234235
extern void MtmWakeUpBackend(MtmTransState* ts);
235-
extern timestamp_t MtmGetCurrentTime(void);
236-
extern void MtmSleep(timestamp_t interval);
236+
extern timestamp_t MtmGetSystemTime(void); /* non-adjusted current system time */
237+
extern timestamp_t MtmGetCurrentTime(void); /* adjusted current system time */
238+
extern void MtmSleep(timestamp_t interval);
237239
extern void MtmAbortTransaction(MtmTransState* ts);
238240
extern void MtmSetCurrentTransactionGID(char const* gid);
239241
extern csn_t MtmGetTransactionCSN(TransactionId xid);

0 commit comments

Comments
 (0)