@@ -191,6 +191,8 @@ int MtmReconnectAttempts;
191
191
int MtmNodeDisableDelay ;
192
192
int MtmTransSpillThreshold ;
193
193
int MtmMaxNodes ;
194
+ int MtmHeartbeatSendTimeout ;
195
+ int MtmHeartbeatRecvTimeout ;
194
196
bool MtmUseRaftable ;
195
197
bool MtmUseDtm ;
196
198
@@ -741,6 +743,27 @@ MtmPrePrepareTransaction(MtmCurrentTrans* x)
741
743
742
744
}
743
745
746
+ /*
747
+ * Check heartbeats
748
+ */
749
+ static void MtmWatchdog ()
750
+ {
751
+ int i , n = Mtm -> nAllNodes ;
752
+ timestamp_t now = MtmGetSystemTime ();
753
+ for (i = 0 ; i < n ; i ++ ) {
754
+ if (i + 1 != MtmNodeId && !BIT_CHECK (Mtm -> disabledNodeMask , i )) {
755
+ if (Mtm -> nodes [i ].lastHeartbeat != 0
756
+ && now > Mtm -> nodes [i ].lastHeartbeat + MSEC_TO_USEC (MtmHeartbeatRecvTimeout ))
757
+ {
758
+ elog (WARNING , "Disable node %d because last heartbeat was received %d msec ago" ,
759
+ i + 1 , (int )USEC_TO_MSEC (now - Mtm -> nodes [i ].lastHeartbeat ));
760
+ MtmOnNodeDisconnect (i + 1 );
761
+ }
762
+ }
763
+ }
764
+ }
765
+
766
+
744
767
static void
745
768
MtmPostPrepareTransaction (MtmCurrentTrans * x )
746
769
{
@@ -770,14 +793,24 @@ MtmPostPrepareTransaction(MtmCurrentTrans* x)
770
793
MtmUnlock ();
771
794
MtmResetTransaction (x );
772
795
} else {
773
- time_t timeout = Max (Mtm2PCMinTimeout , (ts -> csn - ts -> snapshot )* Mtm2PCPrepareRatio /100000 ); /* usec->msec and percents */
796
+ time_t transTimeout = Max (Mtm2PCMinTimeout , (ts -> csn - ts -> snapshot )* Mtm2PCPrepareRatio /100000 ); /* usec->msec and percents */
797
+ time_t timeout = transTimeout < MtmHeartbeatRecvTimeout ? transTimeout : MtmHeartbeatRecvTimeout ;
798
+ timestamp_t deadline = MtmGetSystemTime () + MSEC_TO_USEC (transTimeout );
774
799
int result = 0 ;
775
800
int nConfigChanges = Mtm -> nConfigChanges ;
776
801
/* wait votes from all nodes */
777
- while (!ts -> votingCompleted && !( result & WL_TIMEOUT ) ) {
802
+ while (!ts -> votingCompleted ) {
778
803
MtmUnlock ();
804
+ MtmWatchdog ();
779
805
result = WaitLatch (& MyProc -> procLatch , WL_LATCH_SET |WL_TIMEOUT , timeout );
780
- ResetLatch (& MyProc -> procLatch );
806
+ if (result & WL_TIMEOUT ) {
807
+ if (MtmGetSystemTime () > deadline ) {
808
+ MtmLock (LW_SHARED );
809
+ break ;
810
+ }
811
+ } else {
812
+ ResetLatch (& MyProc -> procLatch );
813
+ }
781
814
MtmLock (LW_SHARED );
782
815
}
783
816
if (!ts -> votingCompleted ) {
@@ -1022,6 +1055,22 @@ void MtmHandleApplyError(void)
1022
1055
}
1023
1056
1024
1057
1058
+ static void MtmDisableNode (int nodeId )
1059
+ {
1060
+ BIT_SET (Mtm -> disabledNodeMask , nodeId - 1 );
1061
+ Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
1062
+ Mtm -> nodes [nodeId - 1 ].lastHeartbeat = 0 ; /* defuse watchdog until first heartbeat is received */
1063
+ Mtm -> nLiveNodes -= 1 ;
1064
+ }
1065
+
1066
+ static void MtmEnableNode (int nodeId )
1067
+ {
1068
+ BIT_CLEAR (Mtm -> disabledNodeMask , nodeId - 1 );
1069
+ Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
1070
+ Mtm -> nodes [nodeId - 1 ].lastHeartbeat = 0 ; /* defuse watchdog until first heartbeat is received */
1071
+ Mtm -> nLiveNodes += 1 ;
1072
+ }
1073
+
1025
1074
void MtmRecoveryCompleted (void )
1026
1075
{
1027
1076
MTM_LOG1 ("Recovery of node %d is completed" , MtmNodeId );
@@ -1116,9 +1165,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
1116
1165
MTM_LOG1 ("%d: node %d is caugth-up without locking cluster" , MyProcPid , nodeId );
1117
1166
/* We are lucky: caugth-up without locking cluster! */
1118
1167
}
1119
- BIT_CLEAR (Mtm -> disabledNodeMask , nodeId - 1 );
1120
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
1121
- Mtm -> nLiveNodes += 1 ;
1168
+ MtmEnableNode (nodeId );
1122
1169
Mtm -> nConfigChanges += 1 ;
1123
1170
caughtUp = true;
1124
1171
} else if (!BIT_CHECK (Mtm -> nodeLockerMask , nodeId - 1 )
@@ -1261,17 +1308,13 @@ bool MtmRefreshClusterStatus(bool nowait)
1261
1308
mask = ~clique & (((nodemask_t )1 << Mtm -> nAllNodes )- 1 ) & ~Mtm -> disabledNodeMask ; /* new disabled nodes mask */
1262
1309
for (i = 0 ; mask != 0 ; i ++ , mask >>= 1 ) {
1263
1310
if (mask & 1 ) {
1264
- Mtm -> nLiveNodes -= 1 ;
1265
- BIT_SET (Mtm -> disabledNodeMask , i );
1266
- Mtm -> nodes [i ].lastStatusChangeTime = MtmGetSystemTime ();
1311
+ MtmDisableNode (i + 1 );
1267
1312
}
1268
1313
}
1269
1314
mask = clique & Mtm -> disabledNodeMask ; /* new enabled nodes mask */
1270
1315
for (i = 0 ; mask != 0 ; i ++ , mask >>= 1 ) {
1271
1316
if (mask & 1 ) {
1272
- Mtm -> nLiveNodes += 1 ;
1273
- BIT_CLEAR (Mtm -> disabledNodeMask , i );
1274
- Mtm -> nodes [i ].lastStatusChangeTime = MtmGetSystemTime ();
1317
+ MtmEnableNode (i + 1 );
1275
1318
}
1276
1319
}
1277
1320
MtmCheckQuorum ();
@@ -1316,7 +1359,6 @@ void MtmOnNodeDisconnect(int nodeId)
1316
1359
/* Avoid false detection of node failure and prevent node status blinking */
1317
1360
return ;
1318
1361
}
1319
-
1320
1362
BIT_SET (Mtm -> connectivityMask , nodeId - 1 );
1321
1363
BIT_SET (Mtm -> reconnectMask , nodeId - 1 );
1322
1364
RaftableSet (psprintf ("node-mask-%d" , MtmNodeId ), & Mtm -> connectivityMask , sizeof Mtm -> connectivityMask , false);
@@ -1327,9 +1369,7 @@ void MtmOnNodeDisconnect(int nodeId)
1327
1369
if (!MtmRefreshClusterStatus (false)) {
1328
1370
MtmLock (LW_EXCLUSIVE );
1329
1371
if (!BIT_CHECK (Mtm -> disabledNodeMask , nodeId - 1 )) {
1330
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
1331
- BIT_SET (Mtm -> disabledNodeMask , nodeId - 1 );
1332
- Mtm -> nLiveNodes -= 1 ;
1372
+ MtmDisableNode (nodeId );
1333
1373
MtmCheckQuorum ();
1334
1374
/* Interrupt voting for active transaction and abort them */
1335
1375
for (ts = Mtm -> transListHead ; ts != NULL ; ts = ts -> next ) {
@@ -1503,6 +1543,7 @@ static void MtmInitialize()
1503
1543
Mtm -> nodes [i ].lastStatusChangeTime = MtmGetSystemTime ();
1504
1544
Mtm -> nodes [i ].con = MtmConnections [i ];
1505
1545
Mtm -> nodes [i ].flushPos = 0 ;
1546
+ Mtm -> nodes [i ].lastHeartbeat = 0 ;
1506
1547
}
1507
1548
PGSemaphoreCreate (& Mtm -> votingSemaphore );
1508
1549
PGSemaphoreReset (& Mtm -> votingSemaphore );
@@ -1627,6 +1668,36 @@ _PG_init(void)
1627
1668
if (!process_shared_preload_libraries_in_progress )
1628
1669
return ;
1629
1670
1671
+ DefineCustomIntVariable (
1672
+ "multimaster.heartbeat_send_timeout" ,
1673
+ "Timeout in milliseconds of sending heartbeat messages" ,
1674
+ "Period of broadcasting heartbeat messages by abiter to all nodes" ,
1675
+ & MtmHeartbeatSendTimeout ,
1676
+ 1000 ,
1677
+ 1 ,
1678
+ INT_MAX ,
1679
+ PGC_BACKEND ,
1680
+ 0 ,
1681
+ NULL ,
1682
+ NULL ,
1683
+ NULL
1684
+ );
1685
+
1686
+ DefineCustomIntVariable (
1687
+ "multimaster.heartbeat_recv_timeout" ,
1688
+ "Timeout in milliseconds of receiving heartbeat messages" ,
1689
+ "If no heartbeat message is received from node within this period, it assumed to be dead" ,
1690
+ & MtmHeartbeatRecvTimeout ,
1691
+ 2000 ,
1692
+ 1 ,
1693
+ INT_MAX ,
1694
+ PGC_BACKEND ,
1695
+ 0 ,
1696
+ NULL ,
1697
+ NULL ,
1698
+ NULL
1699
+ );
1700
+
1630
1701
DefineCustomIntVariable (
1631
1702
"multimaster.gc_period" ,
1632
1703
"Number of distributed transactions after which garbage collection is started" ,
@@ -2056,9 +2127,7 @@ void MtmDropNode(int nodeId, bool dropSlot)
2056
2127
{
2057
2128
elog (ERROR , "NodeID %d is out of range [1,%d]" , nodeId , Mtm -> nLiveNodes );
2058
2129
}
2059
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
2060
- BIT_SET (Mtm -> disabledNodeMask , nodeId - 1 );
2061
- Mtm -> nLiveNodes -= 1 ;
2130
+ MtmDisableNode (nodeId );
2062
2131
MtmCheckQuorum ();
2063
2132
if (!MtmIsBroadcast ())
2064
2133
{
@@ -2110,17 +2179,13 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
2110
2179
if (MtmIsRecoverySession ) {
2111
2180
MTM_LOG1 ("%d: Node %d start recovery of node %d" , MyProcPid , MtmNodeId , MtmReplicationNodeId );
2112
2181
if (!BIT_CHECK (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 )) {
2113
- Mtm -> nodes [MtmReplicationNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
2114
- BIT_SET (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 );
2115
- Mtm -> nLiveNodes -= 1 ;
2182
+ MtmDisableNode (MtmReplicationNodeId );
2116
2183
MtmCheckQuorum ();
2117
2184
}
2118
2185
} else if (BIT_CHECK (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 )) {
2119
2186
if (recoveryCompleted ) {
2120
2187
MTM_LOG1 ("Node %d consider that recovery of node %d is completed: start normal replication" , MtmNodeId , MtmReplicationNodeId );
2121
- Mtm -> nodes [MtmReplicationNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ();
2122
- BIT_CLEAR (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 );
2123
- Mtm -> nLiveNodes += 1 ;
2188
+ MtmEnableNode (MtmReplicationNodeId );
2124
2189
MtmCheckQuorum ();
2125
2190
} else {
2126
2191
elog (ERROR , "Disabled node %d tries to reconnect without recovery" , MtmReplicationNodeId );
0 commit comments