
Commit b908eaf

Use separate worker pools in each receiver.

relid_map assumes that a backend works with only one remote server, which did not hold with the shared pool. It would have been possible to add node info to relid_map, but with separate per-receiver pools we can also simplify several other things (no need to switch the replication session, etc.).

1 parent ff11e56, commit b908eaf
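The core of the change, condensed from the diffs below: apply work used to be pushed into the single shared pool, and now goes into the pool owned by the slot of the node whose stream the receiver consumes. A minimal sketch of the new dispatch path (the old call appears in the comment; declarations are elided):

/* Before (multimaster.c): every receiver shared one pool, so an apply
 * worker could interleave transactions from different origin nodes and
 * its relid_map cache could refer to the wrong remote server:
 *
 *     BgwPoolExecute(&Mtm->pool, work, size);
 *
 * After (pglogical_receiver.c): each receiver dispatches into its own
 * node slot's pool, so a worker serves exactly one origin and never has
 * to switch its replication session. */
static void
MtmExecute(void* work, int size)
{
    if (Mtm->status == MTM_RECOVERY)
        MtmExecutor(work, size);    /* apply sequentially to preserve commit order */
    else
        BgwPoolExecute(&Mtm->nodes[MtmReplicationNodeId-1].pool, work, size);
}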

File tree: 6 files changed, +72 −65

  contrib/mmts/bgwpool.c
  contrib/mmts/bgwpool.h
  contrib/mmts/multimaster.c
  contrib/mmts/multimaster.h
  contrib/mmts/pglogical_receiver.c
  src/include/storage/proc.h

contrib/mmts/bgwpool.c
Lines changed: 11 additions & 9 deletions

@@ -26,7 +26,7 @@ void BgwPoolDynamicWorkerMainLoop(Datum arg);
 
 static void BgwShutdownWorker(int sig)
 {
-    MTM_LOG1("Background worker %d receive shutdown request", MyProcPid);
+    MTM_LOG1("Background worker %d received shutdown request", MyProcPid);
     if (MtmPool) {
         BgwPoolStop(MtmPool);
     }
@@ -137,16 +137,15 @@ timestamp_t BgwGetLastPeekTime(BgwPool* pool)
 
 void BgwPoolStaticWorkerMainLoop(Datum arg)
 {
-    BgwPoolConstructor constructor = (BgwPoolConstructor)DatumGetPointer(arg);
-    BgwPoolMainLoop(constructor());
+    BgwPoolMainLoop((BgwPool*)DatumGetPointer(arg));
 }
 
 void BgwPoolDynamicWorkerMainLoop(Datum arg)
 {
-    BgwPoolMainLoop((BgwPool*)DatumGetPointer(arg));
+    BgwPoolMainLoop((BgwPool*)DatumGetPointer(arg));
 }
 
-void BgwPoolStart(int nWorkers, BgwPoolConstructor constructor)
+void BgwPoolStart(BgwPool* pool, char *poolName)
 {
     int i;
     BackgroundWorker worker;
@@ -158,9 +157,12 @@ void BgwPoolStart(int nWorkers, BgwPoolConstructor constructor)
     sprintf(worker.bgw_function_name, "BgwPoolStaticWorkerMainLoop");
     worker.bgw_restart_time = MULTIMASTER_BGW_RESTART_TIMEOUT;
 
-    for (i = 0; i < nWorkers; i++) {
-        snprintf(worker.bgw_name, BGW_MAXLEN, "bgw_pool_worker_%d", i+1);
-        worker.bgw_main_arg = PointerGetDatum(constructor);
+    strncpy(pool->poolName, poolName, MAX_NAME_LEN);
+
+    for (i = 0; i < pool->nWorkers; i++)
+    {
+        snprintf(worker.bgw_name, BGW_MAXLEN, "%s_worker_%d", pool->poolName, i+1);
+        worker.bgw_main_arg = PointerGetDatum(pool);
         RegisterBackgroundWorker(&worker);
     }
 }
@@ -189,7 +191,7 @@ static void BgwStartExtraWorker(BgwPool* pool)
     sprintf(worker.bgw_library_name, "multimaster");
     sprintf(worker.bgw_function_name, "BgwPoolDynamicWorkerMainLoop");
     worker.bgw_restart_time = MULTIMASTER_BGW_RESTART_TIMEOUT;
-    snprintf(worker.bgw_name, BGW_MAXLEN, "bgw_pool_dynworker_%d", (int)++pool->nWorkers);
+    snprintf(worker.bgw_name, BGW_MAXLEN, "%s-dynworker-%d", pool->poolName, (int)++pool->nWorkers);
     worker.bgw_main_arg = PointerGetDatum(pool);
     pool->lastDynamicWorkerStartTime = now;
     if (!RegisterDynamicBackgroundWorker(&worker, &handle)) {
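Under the new signature, a pool is sized at init time and then started by its owner. A hedged usage sketch (the node index and the name string are illustrative, not taken from this diff):

/* Sketch: bringing up one receiver's pool under the new API (illustrative
 * values). BgwPoolInit fills in the queue and database settings;
 * BgwPoolStart stamps the pool name and registers pool->nWorkers static
 * workers, each getting the pool pointer itself as bgw_main_arg. With
 * nWorkers = 0, as in MtmInitialize below, the pool relies entirely on
 * dynamically started workers (BgwStartExtraWorker). */
BgwPool *pool = &Mtm->nodes[nodeId - 1].pool;

BgwPoolInit(pool, MtmExecutor, MtmDatabaseName, MtmDatabaseUser, MtmQueueSize, 0);
BgwPoolStart(pool, "mtm-logrep-receiver-1-2");

/* Caveat: strncpy copies at most MAX_NAME_LEN bytes and does not
 * NUL-terminate longer names, so pool names should stay short. */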

contrib/mmts/bgwpool.h
Lines changed: 3 additions & 1 deletion

@@ -13,6 +13,7 @@ typedef ulong64 timestamp_t;
 
 #define MAX_DBNAME_LEN 30
 #define MAX_DBUSER_LEN 30
+#define MAX_NAME_LEN 30
 #define MULTIMASTER_BGW_RESTART_TIMEOUT BGW_NEVER_RESTART /* seconds */
 
 extern timestamp_t MtmGetSystemTime(void); /* non-adjusted current system time */
@@ -37,14 +38,15 @@ typedef struct
     timestamp_t lastDynamicWorkerStartTime;
     bool producerBlocked;
     bool shutdown;
+    char poolName[MAX_NAME_LEN];
     char dbname[MAX_DBNAME_LEN];
     char dbuser[MAX_DBUSER_LEN];
     char* queue;
 } BgwPool;
 
 typedef BgwPool*(*BgwPoolConstructor)(void);
 
-extern void BgwPoolStart(int nWorkers, BgwPoolConstructor constructor);
+extern void BgwPoolStart(BgwPool* pool, char *poolName);
 
 extern void BgwPoolInit(BgwPool* pool, BgwPoolExecutor executor, char const* dbname, char const* dbuser, size_t queueSize, size_t nWorkers);
contrib/mmts/multimaster.c
Lines changed: 36 additions & 48 deletions

@@ -176,7 +176,6 @@ static void MtmAddSubtransactions(MtmTransState* ts, TransactionId *subxids, int
 
 static void MtmShmemStartup(void);
 
-static BgwPool* MtmPoolConstructor(void);
 static bool MtmRunUtilityStmt(PGconn* conn, char const* sql, char **errmsg);
 static void MtmBroadcastUtilityStmt(char const* sql, bool ignoreError, int forceOnNode);
 static void MtmProcessDDLCommand(char const* queryString, bool transactional);
@@ -266,12 +265,12 @@ bool MtmVolksWagenMode; /* Pretend to be normal postgres. This means skip some
 bool MtmMajorNode;
 char* MtmRefereeConnStr;
 bool MtmEnforceLocalTx;
+int MtmWorkers;
 
 static char* MtmConnStrs;
 static char* MtmRemoteFunctionsList;
 static char* MtmClusterName;
 static int MtmQueueSize;
-static int MtmWorkers;
 static int MtmVacuumDelay;
 static int MtmMinRecoveryLag;
 static int MtmMaxRecoveryLag;
@@ -2558,7 +2557,8 @@ static void MtmInitialize()
     Mtm->inject2PCError = 0;
     Mtm->sendQueue = NULL;
     Mtm->freeQueue = NULL;
-    for (i = 0; i < MtmNodes; i++) {
+    for (i = 0; i < MtmMaxNodes; i++)
+    {
         Mtm->nodes[i].oldestSnapshot = 0;
         Mtm->nodes[i].disabledNodeMask = 0;
         Mtm->nodes[i].connectivityMask = (((nodemask_t)1 << MtmNodes) - 1);
@@ -2576,14 +2576,14 @@ static void MtmInitialize()
         Mtm->nodes[i].nHeartbeats = 0;
         Mtm->nodes[i].manualRecovery = false;
         Mtm->nodes[i].slotDeleted = false;
+        BgwPoolInit(&Mtm->nodes[i].pool, MtmExecutor, MtmDatabaseName, MtmDatabaseUser, MtmQueueSize, 0);
     }
     Mtm->nodes[MtmNodeId-1].originId = DoNotReplicateId;
     /* All transaction originated from the current node should be ignored during recovery */
     Mtm->nodes[MtmNodeId-1].restartLSN = (lsn_t)PG_UINT64_MAX;
     Mtm->sendSemaphore = PGSemaphoreCreate();
     PGSemaphoreReset(Mtm->sendSemaphore);
     SpinLockInit(&Mtm->queueSpinlock);
-    BgwPoolInit(&Mtm->pool, MtmExecutor, MtmDatabaseName, MtmDatabaseUser, MtmQueueSize, MtmWorkers);
     RegisterXactCallback(MtmXactCallback, NULL);
     MtmTx.snapshot = INVALID_CSN;
     MtmTx.xid = InvalidTransactionId;
@@ -2754,7 +2754,7 @@ static void MtmSplitConnStrs(void)
         MTM_ELOG(ERROR, "More than %d nodes are specified", MtmMaxNodes);
     }
     MtmNodes = i;
-    MtmConnections = (MtmConnectionInfo*)palloc(MtmMaxNodes*sizeof(MtmConnectionInfo));
+    MtmConnections = (MtmConnectionInfo*)palloc0(MtmMaxNodes*sizeof(MtmConnectionInfo));
 
     if (f != NULL) {
         fseek(f, SEEK_SET, 0);
@@ -3369,11 +3369,9 @@ _PG_init(void)
      * the postmaster process.) We'll allocate or attach to the shared
      * resources in mtm_shmem_startup().
      */
-    RequestAddinShmemSpace(MTM_SHMEM_SIZE + MtmQueueSize);
+    RequestAddinShmemSpace(MTM_SHMEM_SIZE + MtmMaxNodes*MtmQueueSize);
     RequestNamedLWLockTranche(MULTIMASTER_NAME, 1 + MtmMaxNodes*2);
 
-    BgwPoolStart(MtmWorkers, MtmPoolConstructor);
-
     MtmArbiterInitialize();
 
     /*
@@ -4313,18 +4311,30 @@ mtm_get_cluster_state(PG_FUNCTION_ARGS)
     TupleDesc desc;
     Datum values[Natts_mtm_cluster_state];
     bool nulls[Natts_mtm_cluster_state] = {false};
+    int i,
+        pool_active = 0,
+        pool_pending = 0,
+        pool_queue_size = 0;
+
     get_call_result_type(fcinfo, NULL, &desc);
 
+    for (i = 0; i < Mtm->nAllNodes; i++)
+    {
+        pool_active += (int) Mtm->nodes[i].pool.active;
+        pool_pending += (int) Mtm->nodes[i].pool.pending;
+        pool_queue_size += (int) BgwPoolGetQueueSize(&Mtm->nodes[i].pool);
+    }
+
     values[0] = Int32GetDatum(MtmNodeId);
     values[1] = CStringGetTextDatum(MtmNodeStatusMnem[Mtm->status]);
     values[2] = Int64GetDatum(Mtm->disabledNodeMask);
     values[3] = Int64GetDatum(SELF_CONNECTIVITY_MASK);
     values[4] = Int64GetDatum(Mtm->originLockNodeMask);
     values[5] = Int32GetDatum(Mtm->nLiveNodes);
     values[6] = Int32GetDatum(Mtm->nAllNodes);
-    values[7] = Int32GetDatum((int)Mtm->pool.active);
-    values[8] = Int32GetDatum((int)Mtm->pool.pending);
-    values[9] = Int64GetDatum(BgwPoolGetQueueSize(&Mtm->pool));
+    values[7] = Int32GetDatum(pool_active);
+    values[8] = Int32GetDatum(pool_pending);
+    values[9] = Int64GetDatum(pool_queue_size);
     values[10] = Int64GetDatum(Mtm->transCount);
     values[11] = Int64GetDatum(Mtm->timeShift);
     values[12] = Int32GetDatum(Mtm->recoverySlot);
@@ -5608,28 +5618,6 @@ static void MtmSeqNextvalHook(Oid seqid, int64 next)
     }
 }
 
-/*
- * -------------------------------------------
- * Executor pool interface
- * -------------------------------------------
- */
-
-void MtmExecute(void* work, int size)
-{
-    if (Mtm->status == MTM_RECOVERY) {
-        /* During recovery apply changes sequentially to preserve commit order */
-        MtmExecutor(work, size);
-    } else {
-        BgwPoolExecute(&Mtm->pool, work, size);
-    }
-}
-
-static BgwPool*
-MtmPoolConstructor(void)
-{
-    return &Mtm->pool;
-}
-
 /*
  * -------------------------------------------
  * Deadlock detection
@@ -5743,21 +5731,21 @@ MtmDetectGlobalDeadLockForXid(TransactionId xid)
     MtmGetGtid(xid, &gtid);
     hasDeadlock = MtmGraphFindLoop(&graph, &gtid);
     MTM_ELOG(LOG, "Distributed deadlock check by backend %d for %u:%llu = %d", MyProcPid, gtid.node, (long64)gtid.xid, hasDeadlock);
-    if (!hasDeadlock) {
-        /* There is no deadlock loop in graph, but deadlock can be caused by lack of apply workers: if all of them are busy, then some transactions
-         * can not be appied just because there are no vacant workers and it cause additional dependency between transactions which is not
-         * refelected in lock graph
-         */
-        timestamp_t lastPeekTime = BgwGetLastPeekTime(&Mtm->pool);
-        if (lastPeekTime != 0 && MtmGetSystemTime() - lastPeekTime >= MSEC_TO_USEC(DeadlockTimeout)) {
-            hasDeadlock = true;
-            MTM_ELOG(WARNING, "Apply workers were blocked more than %d msec",
-                     (int)USEC_TO_MSEC(MtmGetSystemTime() - lastPeekTime));
-        } else {
-            MTM_LOG1("Enable deadlock timeout in backend %d for transaction %llu", MyProcPid, (long64)xid);
-            enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
-        }
-    }
+    // if (!hasDeadlock) {
+    //     /* There is no deadlock loop in graph, but deadlock can be caused by lack of apply workers: if all of them are busy, then some transactions
+    //      * can not be appied just because there are no vacant workers and it cause additional dependency between transactions which is not
+    //      * refelected in lock graph
+    //      */
+    //     timestamp_t lastPeekTime = minBgwGetLastPeekTime(&Mtm->pool);
+    //     if (lastPeekTime != 0 && MtmGetSystemTime() - lastPeekTime >= MSEC_TO_USEC(DeadlockTimeout)) {
+    //         hasDeadlock = true;
+    //         MTM_ELOG(WARNING, "Apply workers were blocked more than %d msec",
+    //                  (int)USEC_TO_MSEC(MtmGetSystemTime() - lastPeekTime));
+    //     } else {
+    //         MTM_LOG1("Enable deadlock timeout in backend %d for transaction %llu", MyProcPid, (long64)xid);
+    //         enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
+    //     }
+    // }
     }
     return hasDeadlock;
 }
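One consequence worth noting: every node slot now carries its own queue, so `_PG_init` must reserve queue space per potential node rather than once. A sketch of the arithmetic, with assumed (not default) values:

/* Illustrative accounting, assuming MtmMaxNodes = 8 and MtmQueueSize = 256 kB:
 *   old request: MTM_SHMEM_SIZE +     256 kB   (one shared queue)
 *   new request: MTM_SHMEM_SIZE + 8 * 256 kB   (one queue per node slot)
 * add_size/mul_size are PostgreSQL's overflow-checked helpers for
 * shared-memory sizing; the diff uses plain arithmetic instead. */
RequestAddinShmemSpace(add_size(MTM_SHMEM_SIZE,
                                mul_size((Size) MtmMaxNodes, (Size) MtmQueueSize)));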

contrib/mmts/multimaster.h
Lines changed: 3 additions & 2 deletions

@@ -223,6 +223,8 @@ typedef struct
 typedef struct
 {
     MtmConnectionInfo con;
+    /* Pool of background workers for applying logical replication */
+    BgwPool pool;
     timestamp_t transDelay;
     timestamp_t lastStatusChangeTime;
     timestamp_t receiverStartTime;
@@ -338,7 +340,6 @@ typedef struct
     MtmMessageQueue* sendQueue; /* Messages to be sent by arbiter sender */
     MtmMessageQueue* freeQueue; /* Free messages */
     lsn_t recoveredLSN; /* LSN at the moment of recovery completion */
-    BgwPool pool; /* Pool of background workers for applying logical replication patches */
     MtmNodeInfo nodes[1]; /* [Mtm->nAllNodes]: per-node data */
 } MtmState;
 
@@ -394,6 +395,7 @@ extern bool MtmBackgroundWorker;
 extern char* MtmRefereeConnStr;
 extern bool MtmEnforceLocalTx;
 extern bool MtmIsRecoverySession;
+extern int MtmWorkers;
 
 
 extern void MtmArbiterInitialize(void);
@@ -404,7 +406,6 @@ extern csn_t MtmAssignCSN(void);
 extern csn_t MtmSyncClock(csn_t csn);
 extern void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t snapshot, nodemask_t participantsMask);
 extern MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shutdown);
-extern void MtmExecute(void* work, int size);
 extern void MtmExecutor(void* work, size_t size);
 extern void MtmSend2PCMessage(MtmTransState* ts, MtmMessageCode cmd);
 extern void MtmSendMessage(MtmArbiterMessage* msg);

contrib/mmts/pglogical_receiver.c
Lines changed: 16 additions & 3 deletions

@@ -212,6 +212,17 @@ static char const* const MtmReplicationModeName[] =
     "open_existed" /* normal mode: use existed slot or create new one and start receiving data from it from the rememered position */
 };
 
+static void
+MtmExecute(void* work, int size)
+{
+    /* During recovery apply changes sequentially to preserve commit order */
+    if (Mtm->status == MTM_RECOVERY)
+        MtmExecutor(work, size);
+    else
+        BgwPoolExecute(&Mtm->nodes[MtmReplicationNodeId-1].pool, work, size);
+}
+
+
 void
 pglogical_receiver_main(Datum main_arg)
 {
@@ -252,7 +263,7 @@ pglogical_receiver_main(Datum main_arg)
     Mtm->nodes[nodeId-1].receiverStartTime = MtmGetSystemTime();
     MtmReplicationNodeId = nodeId;
 
-    sprintf(worker_proc, "mtm_pglogical_receiver_%d_%d", MtmNodeId, nodeId);
+    snprintf(worker_proc, BGW_MAXLEN, "mtm-logrep-receiver-%d-%d", MtmNodeId, nodeId);
 
     /* We're now ready to receive signals */
     BackgroundWorkerUnblockSignals();
@@ -263,6 +274,8 @@ pglogical_receiver_main(Datum main_arg)
     ActivePortal->status = PORTAL_ACTIVE;
     ActivePortal->sourceText = "";
 
+    BgwPoolStart(&Mtm->nodes[nodeId-1].pool, worker_proc);
+
     /*
      * Set proper restartLsn for all origins
      */
@@ -427,7 +440,7 @@ pglogical_receiver_main(Datum main_arg)
         /* Emergency bailout if postmaster has died */
         if (rc & WL_POSTMASTER_DEATH)
         {
-            BgwPoolStop(&Mtm->pool);
+            BgwPoolStop(&Mtm->nodes[nodeId-1].pool);
             proc_exit(1);
         }
 
@@ -729,7 +742,7 @@ void MtmStartReceiver(int nodeId, bool dynamic)
     worker.bgw_restart_time = MULTIMASTER_BGW_RESTART_TIMEOUT;
 
     /* Worker parameter and registration */
-    snprintf(worker.bgw_name, BGW_MAXLEN, "mtm_pglogical_receiver_%d_%d", MtmNodeId, nodeId);
+    snprintf(worker.bgw_name, BGW_MAXLEN, "mtm-logrep-receiver-%d-%d", MtmNodeId, nodeId);
 
     worker.bgw_main_arg = Int32GetDatum(nodeId);
     if (dynamic) {
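Putting the receiver-side pieces together, the start-up order matters: the receiver sets its node id and name first, then starts its pool, so the pool and its workers carry a per-origin identity. A condensed sketch of the flow from this file (error handling and the receive loop elided):

/* Condensed from pglogical_receiver_main above. */
MtmReplicationNodeId = nodeId;
snprintf(worker_proc, BGW_MAXLEN, "mtm-logrep-receiver-%d-%d", MtmNodeId, nodeId);

BackgroundWorkerUnblockSignals();

/* Start this receiver's private pool; its workers are named after worker_proc. */
BgwPoolStart(&Mtm->nodes[nodeId - 1].pool, worker_proc);

/* ... receive loop ... */
if (rc & WL_POSTMASTER_DEATH)
{
    /* Stop only this receiver's workers; other receivers are unaffected. */
    BgwPoolStop(&Mtm->nodes[nodeId - 1].pool);
    proc_exit(1);
}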

src/include/storage/proc.h
Lines changed: 3 additions & 2 deletions

@@ -279,9 +279,10 @@ extern PGPROC *PreparedXactProcs;
 #define NUM_AUXILIARY_PROCS 4
 
 /*
- * Number of extra semaphores used by Postgres (right now 3 semaphores are used by multimaster)
+ * Number of extra semaphores used by Postgres (right now multimaster uses
+ * 1 semaphore for arbiter and 2 semas per each logrep receiver).
  */
-#define NUM_EXTRA_SEMAPHORES 4
+#define NUM_EXTRA_SEMAPHORES 16
 
 /* configurable options */
 extern PGDLLIMPORT int DeadlockTimeout;
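The new constant can be sanity-checked against the updated comment: one arbiter semaphore plus two per logical-replication receiver, where a node of an N-node cluster runs N-1 receivers. The formula below is an inference from that comment, not something stated in the source:

/* Assumed requirement, per the comment above: 1 arbiter semaphore plus
 * 2 per logrep receiver; an N-node cluster runs N-1 receivers per node. */
#define MTM_SEMAS_NEEDED(nodes) (1 + 2 * ((nodes) - 1))

/* MTM_SEMAS_NEEDED(8) == 15 <= 16, so NUM_EXTRA_SEMAPHORES = 16 covers
 * clusters of up to 8 nodes under this reading. */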
