@@ -255,13 +255,18 @@ void MtmUnlockNode(int nodeId)
255
255
*/
256
256
257
257
258
- timestamp_t MtmGetCurrentTime (void )
258
+ timestamp_t MtmGetSystemTime (void )
259
259
{
260
260
struct timeval tv ;
261
261
gettimeofday (& tv , NULL );
262
262
return (timestamp_t )tv .tv_sec * USEC + tv .tv_usec + Mtm -> timeShift ;
263
263
}
264
264
265
+ timestamp_t MtmGetCurrentTime (void )
266
+ {
267
+ return MtmGetSystemTime () + Mtm -> timeShift ;
268
+ }
269
+
265
270
void MtmSleep (timestamp_t interval )
266
271
{
267
272
struct timespec ts ;
@@ -1045,7 +1050,7 @@ void MtmRecoveryCompleted(void)
1045
1050
MtmLock (LW_EXCLUSIVE );
1046
1051
Mtm -> recoverySlot = 0 ;
1047
1052
BIT_CLEAR (Mtm -> disabledNodeMask , MtmNodeId - 1 );
1048
- Mtm -> nodes [MtmNodeId - 1 ].lastStatusChangeTime = time ( NULL );
1053
+ Mtm -> nodes [MtmNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ( );
1049
1054
/* Mode will be changed to online once all logical receivers are connected */
1050
1055
MtmSwitchClusterMode (MTM_CONNECTED );
1051
1056
MtmUnlock ();
@@ -1134,7 +1139,7 @@ bool MtmRecoveryCaughtUp(int nodeId, XLogRecPtr slotLSN)
1134
1139
/* We are lucky: caught up without locking cluster! */
1135
1140
}
1136
1141
BIT_CLEAR (Mtm -> disabledNodeMask , nodeId - 1 );
1137
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = time ( NULL );
1142
+ Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ( );
1138
1143
Mtm -> nNodes += 1 ;
1139
1144
caughtUp = true;
1140
1145
} else if (!BIT_CHECK (Mtm -> nodeLockerMask , nodeId - 1 )
@@ -1279,15 +1284,15 @@ bool MtmRefreshClusterStatus(bool nowait)
1279
1284
if (mask & 1 ) {
1280
1285
Mtm -> nNodes -= 1 ;
1281
1286
BIT_SET (Mtm -> disabledNodeMask , i );
1282
- Mtm -> nodes [i ].lastStatusChangeTime = time ( NULL );
1287
+ Mtm -> nodes [i ].lastStatusChangeTime = MtmGetSystemTime ( );
1283
1288
}
1284
1289
}
1285
1290
mask = clique & Mtm -> disabledNodeMask ; /* new enabled nodes mask */
1286
1291
for (i = 0 ; mask != 0 ; i ++ , mask >>= 1 ) {
1287
1292
if (mask & 1 ) {
1288
1293
Mtm -> nNodes += 1 ;
1289
1294
BIT_CLEAR (Mtm -> disabledNodeMask , i );
1290
- Mtm -> nodes [i ].lastStatusChangeTime = time ( NULL );
1295
+ Mtm -> nodes [i ].lastStatusChangeTime = MtmGetSystemTime ( );
1291
1296
}
1292
1297
}
1293
1298
MtmCheckQuorum ();
@@ -1327,7 +1332,7 @@ void MtmOnNodeDisconnect(int nodeId)
1327
1332
{
1328
1333
MtmTransState * ts ;
1329
1334
1330
- if (Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime + MtmNodeDisableDelay > time ( NULL )) {
1335
+ if (Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime + MSEC_TO_USEC ( MtmNodeDisableDelay ) > MtmGetSystemTime ( )) {
1331
1336
/* Avoid false detection of node failure and prevent node status blinking */
1332
1337
return ;
1333
1338
}
@@ -1342,7 +1347,7 @@ void MtmOnNodeDisconnect(int nodeId)
1342
1347
if (!MtmRefreshClusterStatus (false)) {
1343
1348
MtmLock (LW_EXCLUSIVE );
1344
1349
if (!BIT_CHECK (Mtm -> disabledNodeMask , nodeId - 1 )) {
1345
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = time ( NULL );
1350
+ Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ( );
1346
1351
BIT_SET (Mtm -> disabledNodeMask , nodeId - 1 );
1347
1352
Mtm -> nNodes -= 1 ;
1348
1353
MtmCheckQuorum ();
@@ -1510,14 +1515,14 @@ static void MtmInitialize()
1510
1515
for (i = 0 ; i < MtmNodes ; i ++ ) {
1511
1516
Mtm -> nodes [i ].oldestSnapshot = 0 ;
1512
1517
Mtm -> nodes [i ].transDelay = 0 ;
1513
- Mtm -> nodes [i ].lastStatusChangeTime = time ( NULL );
1518
+ Mtm -> nodes [i ].lastStatusChangeTime = MtmGetSystemTime ( );
1514
1519
Mtm -> nodes [i ].con = MtmConnections [i ];
1515
1520
Mtm -> nodes [i ].flushPos = 0 ;
1516
1521
}
1517
1522
PGSemaphoreCreate (& Mtm -> votingSemaphore );
1518
1523
PGSemaphoreReset (& Mtm -> votingSemaphore );
1519
1524
SpinLockInit (& Mtm -> spinlock );
1520
- BgwPoolInit (& Mtm -> pool , MtmExecutor , MtmDatabaseName , MtmQueueSize );
1525
+ BgwPoolInit (& Mtm -> pool , MtmExecutor , MtmDatabaseName , MtmQueueSize , MtmWorkers );
1521
1526
RegisterXactCallback (MtmXactCallback , NULL );
1522
1527
MtmTx .snapshot = INVALID_CSN ;
1523
1528
MtmTx .xid = InvalidTransactionId ;
@@ -1681,10 +1686,10 @@ _PG_init(void)
1681
1686
1682
1687
DefineCustomIntVariable (
1683
1688
"multimaster.node_disable_delay" ,
1684
- "Minamal amount of time (sec ) between node status change" ,
1689
+ "Minamal amount of time (msec ) between node status change" ,
1685
1690
"This delay is used to avoid false detection of node failure and to prevent blinking of node status node" ,
1686
1691
& MtmNodeDisableDelay ,
1687
- 1 ,
1692
+ 1000 ,
1688
1693
1 ,
1689
1694
INT_MAX ,
1690
1695
PGC_BACKEND ,
@@ -2032,7 +2037,7 @@ void MtmDropNode(int nodeId, bool dropSlot)
2032
2037
{
2033
2038
elog (ERROR , "NodeID %d is out of range [1,%d]" , nodeId , Mtm -> nNodes );
2034
2039
}
2035
- Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = time ( NULL );
2040
+ Mtm -> nodes [nodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ( );
2036
2041
BIT_SET (Mtm -> disabledNodeMask , nodeId - 1 );
2037
2042
Mtm -> nNodes -= 1 ;
2038
2043
MtmCheckQuorum ();
@@ -2083,15 +2088,15 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
2083
2088
if (MtmIsRecoverySession ) {
2084
2089
MTM_LOG1 ("%d: Node %d start recovery of node %d" , MyProcPid , MtmNodeId , MtmReplicationNodeId );
2085
2090
if (!BIT_CHECK (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 )) {
2086
- Mtm -> nodes [MtmReplicationNodeId - 1 ].lastStatusChangeTime = time ( NULL );
2091
+ Mtm -> nodes [MtmReplicationNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ( );
2087
2092
BIT_SET (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 );
2088
2093
Mtm -> nNodes -= 1 ;
2089
2094
MtmCheckQuorum ();
2090
2095
}
2091
2096
} else if (BIT_CHECK (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 )) {
2092
2097
if (recoveryCompleted ) {
2093
2098
MTM_LOG1 ("Node %d consider that recovery of node %d is completed: start normal replication" , MtmNodeId , MtmReplicationNodeId );
2094
- Mtm -> nodes [MtmReplicationNodeId - 1 ].lastStatusChangeTime = time ( NULL );
2099
+ Mtm -> nodes [MtmReplicationNodeId - 1 ].lastStatusChangeTime = MtmGetSystemTime ( );
2095
2100
BIT_CLEAR (Mtm -> disabledNodeMask , MtmReplicationNodeId - 1 );
2096
2101
Mtm -> nNodes += 1 ;
2097
2102
MtmCheckQuorum ();
@@ -2238,7 +2243,7 @@ mtm_poll_node(PG_FUNCTION_ARGS)
2238
2243
}
2239
2244
if (!nowait ) {
2240
2245
/* Just wait some time until logical replication channels are reestablished */
2241
- MtmSleep (MtmNodeDisableDelay );
2246
+ MtmSleep (MSEC_TO_USEC ( MtmNodeDisableDelay ) );
2242
2247
}
2243
2248
PG_RETURN_BOOL (online );
2244
2249
}
@@ -2297,7 +2302,7 @@ mtm_get_nodes_state(PG_FUNCTION_ARGS)
2297
2302
usrfctx -> values [4 ] = Int64GetDatum (lag );
2298
2303
usrfctx -> nulls [4 ] = lag < 0 ;
2299
2304
usrfctx -> values [5 ] = Int64GetDatum (Mtm -> transCount ? Mtm -> nodes [usrfctx -> nodeId - 1 ].transDelay /Mtm -> transCount : 0 );
2300
- usrfctx -> values [6 ] = TimestampTzGetDatum (time_t_to_timestamptz (Mtm -> nodes [usrfctx -> nodeId - 1 ].lastStatusChangeTime ));
2305
+ usrfctx -> values [6 ] = TimestampTzGetDatum (time_t_to_timestamptz (Mtm -> nodes [usrfctx -> nodeId - 1 ].lastStatusChangeTime / USEC ));
2301
2306
usrfctx -> values [7 ] = CStringGetTextDatum (Mtm -> nodes [usrfctx -> nodeId - 1 ].con .connStr );
2302
2307
usrfctx -> nodeId += 1 ;
2303
2308
@@ -3058,6 +3063,18 @@ MtmDetectGlobalDeadLock(PGPROC* proc)
3058
3063
MtmGetGtid (pgxact -> xid , & gtid );
3059
3064
hasDeadlock = MtmGraphFindLoop (& graph , & gtid );
3060
3065
elog (WARNING , "Distributed deadlock check for %u:%u = %d" , gtid .node , gtid .xid , hasDeadlock );
3066
+ if (!hasDeadlock ) {
3067
+ /* There is no deadlock loop in graph, but deadlock can be caused by lack of apply workers: if all of them are busy, then some transactions
3068
+ * can not be applied just because there are no vacant workers, and this causes an additional dependency between transactions which is not
3069
+ * reflected in the lock graph
3070
+ */
3071
+ timestamp_t lastPeekTime = BgwGetLastPeekTime (& Mtm -> pool );
3072
+ if (lastPeekTime != 0 && MtmGetSystemTime () - lastPeekTime >= MSEC_TO_USEC (DeadlockTimeout )) {
3073
+ hasDeadlock = true;
3074
+ elog (WARNING , "Apply workers were blocked more than %d msec" ,
3075
+ (int )USEC_TO_MSEC (MtmGetSystemTime () - lastPeekTime ));
3076
+ }
3077
+ }
3061
3078
}
3062
3079
return hasDeadlock ;
3063
3080
}
0 commit comments