some fixes in recovery

knizhnik · kelvich · commit d29666d66edd · 2017-11-13T00:15:23.000+03:00
diff --git a/multimaster.c b/multimaster.c
@@ -635,9 +635,11 @@ MtmBeginTransaction(MtmCurrentTrans* x)
         x->isDistributed = MtmIsUserTransaction();
 		x->isPrepared = false;
 		x->isTransactionBlock = IsTransactionBlock();
-		/* Application name can be cahnged usnig PGAPPNAME environment variable */
+		/* Application name can be changed using PGAPPNAME environment variable */
 		if (!IsBackgroundWorker && x->isDistributed && Mtm->status != MTM_ONLINE && strcmp(application_name, MULTIMASTER_ADMIN) != 0) { 
-			/* reject all user's transactions at offline cluster */
+			/* Reject all user's transactions at offline cluster. 
+			 * Allow execution of transaction by bg-workers to make it possible to perform recovery.
+			 */
 			MtmUnlock();			
 			elog(ERROR, "Multimaster node is not online: current status %s", MtmNodeStatusMnem[Mtm->status]);
 		}
@@ -673,14 +675,17 @@ MtmPrePrepareTransaction(MtmCurrentTrans* x)
 	if (Mtm->disabledNodeMask != 0) { 
 		MtmRefreshClusterStatus(true);
 		if (!IsBackgroundWorker && Mtm->status != MTM_ONLINE) { 
-			elog(ERROR, "Abort current transaction because this cluster node is not online");			
+			/* Do not take into account bg-workers which are performing recovery */
+			elog(ERROR, "Abort current transaction because this cluster node is in %s status", MtmNodeStatusMnem[Mtm->status]);			
 		}
 	}
 
 	MtmLock(LW_EXCLUSIVE);
 
 	/*
-	 * Check if there is global multimaster lock preventing new transaction from commit to make a chance to wal-senders to catch-up
+	 * Check if there is global multimaster lock preventing new transaction from commit to make a chance to wal-senders to catch-up.
+	 * Only "own" transactions are blocked. Transactions replicated from other nodes (including recovered transactions) should proceed
+	 * and should not cause cluster status change.
 	 */
 	if (!x->isReplicated) { 
 		MtmCheckClusterLock();
@@ -716,7 +721,8 @@ MtmPrePrepareTransaction(MtmCurrentTrans* x)
 	}
 	MtmTransactionListAppend(ts);
 	MtmAddSubtransactions(ts, subxids, ts->nSubxids);
-	MTM_TRACE("%d: MtmPrePrepareTransaction prepare commit of %d CSN=%ld\n", MyProcPid, x->xid, ts->csn);
+	MTM_TRACE("%d: MtmPrePrepareTransaction prepare commit of %d (gtid.xid=%d, gtid.node=%d, CSN=%ld)\n", 
+			  MyProcPid, x->xid, ts->gtid.xid, ts->gtid.node, ts->csn);
 	MtmUnlock();
 
 }
@@ -842,14 +848,6 @@ void MtmSendNotificationMessage(MtmTransState* ts, MtmMessageCode cmd)
 	}
 }
 
-void MtmRecoveryCompleted(void)
-{
-	elog(WARNING, "Recovery of node %d is completed", MtmNodeId);
-	Mtm->recoverySlot = 0;
-	BIT_CLEAR(Mtm->disabledNodeMask, MtmNodeId-1);
-	MtmSwitchClusterMode(MTM_ONLINE);
-}
-
 void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t globalSnapshot)
 {
 	MtmLock(LW_EXCLUSIVE);
@@ -933,6 +931,18 @@ csn_t MtmGetTransactionCSN(TransactionId xid)
  * -------------------------------------------
  */
 
+void MtmRecoveryCompleted(void)
+{
+	elog(WARNING, "Recovery of node %d is completed", MtmNodeId);
+	MtmLock(LW_EXCLUSIVE);
+	Mtm->recoverySlot = 0;
+	BIT_CLEAR(Mtm->disabledNodeMask, MtmNodeId-1);
+	/* Mode will be changed to online once all logical receivers are connected */
+	MtmSwitchClusterMode(MTM_CONNECTED);
+	MtmUnlock();
+}
+
+
 
 /**
  * Check state of replication slots. If some of them are too much lag behind wal, then drop this slots to avoid 
@@ -993,10 +1003,10 @@ bool MtmIsRecoveredNode(int nodeId)
 bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
 {
 	bool caughtUp = false;
+	MtmLock(LW_EXCLUSIVE);
 	if (MtmIsRecoveredNode(nodeId)) { 
 		XLogRecPtr walLSN = GetXLogInsertRecPtr();
-		MtmLock(LW_EXCLUSIVE);
-		if (slotLSN == walLSN) {
+		if (slotLSN == walLSN && Mtm->nActiveTransactions == 0) {
 			if (BIT_CHECK(Mtm->nodeLockerMask, nodeId-1)) { 
 				elog(WARNING,"Node %d is caught-up", nodeId);	
 				BIT_CLEAR(Mtm->walSenderLockerMask, MyWalSnd - WalSndCtl->walsnds);
@@ -1018,18 +1028,17 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
 			 * We have to maintain two bitmasks: one is marking wal sender, another - correspondent nodes. 
 			 * Is there some better way to establish mapping between nodes ad WAL-seconder?
 			 */
-			elog(WARNING,"Node %d is almost caught-up: lock cluster", nodeId);
+			elog(WARNING,"Node %d is almost caught-up: slot position %lx, WAL position %lx, active transactions %d", 
+				 nodeId, slotLSN, walLSN, Mtm->nActiveTransactions);
 			Assert(MyWalSnd != NULL); /* This function is called by WAL-sender, so it should not be NULL */
 			BIT_SET(Mtm->nodeLockerMask, nodeId-1);
 			BIT_SET(Mtm->walSenderLockerMask, MyWalSnd - WalSndCtl->walsnds);
 			Mtm->nLockers += 1;
 		} else { 
 			MTM_INFO("Continue recovery of node %d, slot position %lx, WAL position %lx, WAL sender position %lx, lockers %d, active transactions %d\n", nodeId, slotLSN, walLSN, MyWalSnd->sentPtr, Mtm->nLockers, Mtm->nActiveTransactions);
 		}
-		MtmUnlock();
-	} else { 
-		MTM_INFO("Node %d is not in recovery mode\n", nodeId);
 	}
+	MtmUnlock();
 	return caughtUp;
 }
 
@@ -1044,7 +1053,7 @@ void MtmSwitchClusterMode(MtmNodeStatus mode)
 /*
  * If there are recovering nodes which are catching-up WAL, check the status and prevent new transaction from commit to give
  * WAL-sender a chance to catch-up WAL, completely synchronize replica and switch it to normal mode.
- * This function is called at transaction start with multimaster lock set
+ * This function is called before transaction prepare with multimaster lock set.
  */
 static void 
 MtmCheckClusterLock()
@@ -1071,8 +1080,8 @@ MtmCheckClusterLock()
 				}
 			}
 			if (mask != 0) { 
-				/* some "almost catch-up" wal-senders are still working */
-				/* Do not start new transactions until them complete */
+				/* some "almost catch-up" wal-senders are still working. */
+				/* Do not start new transactions until they are completed. */
 				MtmUnlock();
 				MtmSleep(delay);
 				if (delay*2 <= MAX_WAIT_TIMEOUT) { 
@@ -1215,6 +1224,7 @@ void MtmOnNodeDisconnect(int nodeId)
 void MtmOnNodeConnect(int nodeId)
 {
 	BIT_CLEAR(Mtm->connectivityMask, nodeId-1);
+	elog(NOTICE, "Reconnect node %d", nodeId);
 	RaftableSet(psprintf("node-mask-%d", MtmNodeId), &Mtm->connectivityMask, sizeof Mtm->connectivityMask, false);
 }
 
@@ -1645,19 +1655,23 @@ _PG_fini(void)
 }
 
 
- 
+/*
+ * This function is called by pglogical receiver main function when receiver background worker is started.
+ * We switch to ONLINE mode when all receivers are connected.
+ * Since a background worker can be restarted multiple times, use a node bitmask.
+ */
 void MtmReceiverStarted(int nodeId)
 {
-	SpinLockAcquire(&Mtm->spinlock);	
+	MtmLock(LW_EXCLUSIVE);
 	if (!BIT_CHECK(Mtm->pglogicalNodeMask, nodeId-1)) { 
 		BIT_SET(Mtm->pglogicalNodeMask, nodeId-1);
 		if (++Mtm->nReceivers == Mtm->nNodes-1) {
 			if (Mtm->status == MTM_CONNECTED) { 
 				MtmSwitchClusterMode(MTM_ONLINE);
 			}
 		}
-     }
-	SpinLockRelease(&Mtm->spinlock);	
+	}
+	MtmUnlock();
 }
 
 /* 
diff --git a/multimaster.h b/multimaster.h
@@ -45,6 +45,9 @@ typedef uint64 csn_t; /* commit serial number */
 
 #define PGLOGICAL_XACT_EVENT(flags)	(flags & 0x03)
 
+#define PGLOGICAL_CAUGHT_UP	        0x04
+
+
 typedef uint64 timestamp_t;
 
 /* Identifier of global transaction */
diff --git a/pglogical_apply.c b/pglogical_apply.c
@@ -497,12 +497,10 @@ process_remote_commit(StringInfo in)
 	uint8 		flags;
 	csn_t       csn;
 	const char *gid = NULL;	
-	bool        caughtUp;
 
 	/* read flags */
 	flags = pq_getmsgbyte(in);
 	MtmReplicationNode = pq_getmsgbyte(in);
-	caughtUp = pq_getmsgbyte(in) != 0;
 
 	/* read fields */
 	replorigin_session_origin_lsn = pq_getmsgint64(in); /* commit_lsn */
@@ -571,7 +569,7 @@ process_remote_commit(StringInfo in)
 			Assert(false);
 	}
 	MtmEndSession(true);
-	if (caughtUp) {
+	if (flags & PGLOGICAL_CAUGHT_UP) {
 		MtmRecoveryCompleted();
 	}
 }
diff --git a/pglogical_proto.c b/pglogical_proto.c
@@ -103,7 +103,7 @@ pglogical_write_begin(StringInfo out, PGLogicalOutputData *data,
 {
 	bool isRecovery = MtmIsRecoveredNode(MtmReplicationNodeId);
 	csn_t csn = MtmTransactionSnapshot(txn->xid);
-	MTM_INFO("%d: pglogical_write_begin %d CSN=%ld\n", MyProcPid, txn->xid, csn);
+	MTM_INFO("%d: pglogical_write_begin XID=%d node=%d CSN=%ld recovery=%d\n", MyProcPid, txn->xid, MtmReplicationNodeId, csn, isRecovery);
 	
 	if (csn == INVALID_CSN && !isRecovery) { 
 		MtmIsFilteredTxn = true;
@@ -124,7 +124,7 @@ pglogical_write_commit(StringInfo out, PGLogicalOutputData *data,
 					   ReorderBufferTXN *txn, XLogRecPtr commit_lsn)
 {
     uint8 flags = 0;
-
+	
     if (txn->xact_action == XLOG_XACT_COMMIT) 
     	flags = PGLOGICAL_COMMIT;
 	else if (txn->xact_action == XLOG_XACT_PREPARE)
@@ -146,6 +146,9 @@ pglogical_write_commit(StringInfo out, PGLogicalOutputData *data,
 		if (csn == INVALID_CSN && !isRecovery) {
 			return;
 		}
+		if (MtmRecoveryCaughtUp(MtmReplicationNodeId, txn->end_lsn)) { 
+			flags |= PGLOGICAL_CAUGHT_UP;
+		}
 	}
     pq_sendbyte(out, 'C');		/* sending COMMIT */
 
@@ -154,7 +157,6 @@ pglogical_write_commit(StringInfo out, PGLogicalOutputData *data,
     /* send the flags field */
     pq_sendbyte(out, flags);
     pq_sendbyte(out, MtmNodeId);
-    pq_sendbyte(out, MtmRecoveryCaughtUp(MtmReplicationNodeId, txn->end_lsn));
 
     /* send fixed fields */
     pq_sendint64(out, commit_lsn);

Original file line number	Diff line number	Diff line change
`@@ -497,12 +497,10 @@ process_remote_commit(StringInfo in)`
`497`	`497`	`uint8 flags;`
`498`	`498`	`csn_t csn;`
`499`	`499`	`const char *gid = NULL;`
`500`		`- bool caughtUp;`
`501`	`500`
`502`	`501`	`/* read flags */`
`503`	`502`	`flags = pq_getmsgbyte(in);`
`504`	`503`	`MtmReplicationNode = pq_getmsgbyte(in);`
`505`		`- caughtUp = pq_getmsgbyte(in) != 0;`
`506`	`504`
`507`	`505`	`/* read fields */`
`508`	`506`	`replorigin_session_origin_lsn = pq_getmsgint64(in); /* commit_lsn */`
`@@ -571,7 +569,7 @@ process_remote_commit(StringInfo in)`
`571`	`569`	`Assert(false);`
`572`	`570`	`}`
`573`	`571`	`MtmEndSession(true);`
`574`		`- if (caughtUp) {`
	`572`	`+ if (flags & PGLOGICAL_CAUGHT_UP) {`
`575`	`573`	`MtmRecoveryCompleted();`
`576`	`574`	`}`
`577`	`575`	`}`