@@ -1317,13 +1317,15 @@ static void MtmPollStatusOfPreparedTransactions(int disabledNodeId)
1317
1317
1318
1318
static void MtmDisableNode (int nodeId )
1319
1319
{
1320
+ timestamp_t now = MtmGetSystemTime ();
1321
+ elog (WARNING , "Disable node %d at xlog position %lx, last status change time %d msec ago" , nodeId , GetXLogInsertRecPtr (),
1322
+ (int )USEC_TO_MSEC (now - Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime ));
1320
1323
BIT_SET (Mtm -> disabledNodeMask , nodeId - 1 );
1321
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime () ;
1324
+ Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = now ;
1322
1325
Mtm -> nodes [nodeId - 1 ].lastHeartbeat = 0 ; /* defuse watchdog until first heartbeat is received */
1323
1326
if (nodeId != MtmNodeId ) {
1324
1327
Mtm -> nLiveNodes -= 1 ;
1325
1328
}
1326
- elog (WARNING , "Disable node %d at xlog position %lx" , nodeId , GetXLogInsertRecPtr ());
1327
1329
MtmPollStatusOfPreparedTransactions (nodeId );
1328
1330
}
1329
1331
@@ -1346,8 +1348,8 @@ void MtmRecoveryCompleted(void)
1346
1348
MtmNodeId , Mtm -> disabledNodeMask , Mtm -> reconnectMask , Mtm -> nLiveNodes );
1347
1349
MtmLock (LW_EXCLUSIVE );
1348
1350
Mtm -> recoverySlot = 0 ;
1349
- Mtm -> nodes [MtmNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
1350
1351
BIT_CLEAR (Mtm -> disabledNodeMask , MtmNodeId - 1 );
1352
+ Mtm -> nodes [MtmNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
1351
1353
for (i = 0 ; i < Mtm -> nAllNodes ; i ++ ) {
1352
1354
Mtm -> nodes [i ].lastHeartbeat = 0 ; /* defuse watchdog until first heartbeat is received */
1353
1355
}
@@ -1601,10 +1603,11 @@ bool MtmRefreshClusterStatus(bool nowait, int testNodeId)
1601
1603
if (disabled ) {
1602
1604
timestamp_t now = MtmGetSystemTime ();
1603
1605
for (i = 0 , mask = disabled ; mask != 0 ; i ++ , mask >>= 1 ) {
1604
- if (mask & 1 ) {
1605
- if (Mtm -> nodes [i ].lastStatusChangeTime + MSEC_TO_USEC (MtmNodeDisableDelay ) < now ) {
1606
- MtmDisableNode (i + 1 );
1607
- }
1606
+ if (i + 1 != MtmNodeId
1607
+ && (mask & 1 ) != 0
1608
+ && Mtm -> nodes [i ].lastStatusChangeTime + MSEC_TO_USEC (MtmNodeDisableDelay ) < now )
1609
+ {
1610
+ MtmDisableNode (i + 1 );
1608
1611
}
1609
1612
}
1610
1613
}
@@ -1616,15 +1619,16 @@ bool MtmRefreshClusterStatus(bool nowait, int testNodeId)
1616
1619
1617
1620
if (disabled |enabled ) {
1618
1621
MtmCheckQuorum ();
1619
- }
1620
- /* Interrupt voting for active transaction and abort them */
1621
- for (ts = Mtm -> transListHead ; ts != NULL ; ts = ts -> next ) {
1622
- MTM_LOG3 ("Active transaction gid='%s', coordinator=%d, xid=%d, status=%d, gtid.xid=%d" ,
1623
- ts -> gid , ts -> gtid .nхode , ts -> xid , ts -> status , ts -> gtid .xid );
1624
- if (MtmIsCoordinator (ts )) {
1625
- if (!ts -> votingCompleted && disabled != 0 && ts -> status != TRANSACTION_STATUS_ABORTED ) {
1626
- MtmAbortTransaction (ts );
1627
- MtmWakeUpBackend (ts );
1622
+
1623
+ /* Interrupt voting for active transaction and abort them */
1624
+ for (ts = Mtm -> transListHead ; ts != NULL ; ts = ts -> next ) {
1625
+ MTM_LOG3 ("Active transaction gid='%s', coordinator=%d, xid=%d, status=%d, gtid.xid=%d" ,
1626
+ ts -> gid , ts -> gtid .node , ts -> xid , ts -> status , ts -> gtid .xid );
1627
+ if (MtmIsCoordinator (ts )) {
1628
+ if (!ts -> votingCompleted && disabled != 0 && ts -> status != TRANSACTION_STATUS_ABORTED ) {
1629
+ MtmAbortTransaction (ts );
1630
+ MtmWakeUpBackend (ts );
1631
+ }
1628
1632
}
1629
1633
}
1630
1634
}
@@ -2243,7 +2247,7 @@ _PG_init(void)
2243
2247
"Minimal amount of time (msec) between node status change" ,
2244
2248
"This delay is used to avoid false detection of node failure and to prevent blinking of node status node" ,
2245
2249
& MtmNodeDisableDelay ,
2246
- 1000 ,
2250
+ 2000 ,
2247
2251
1 ,
2248
2252
INT_MAX ,
2249
2253
PGC_BACKEND ,
0 commit comments