@@ -159,8 +159,8 @@ static bool MtmRunUtilityStmt(PGconn* conn, char const* sql, char **errmsg);
159
159
static void MtmBroadcastUtilityStmt (char const * sql , bool ignoreError );
160
160
static void MtmProcessDDLCommand (char const * queryString , bool transactional );
161
161
162
- static void MtmSuspendNode (void );
163
- static void MtmResumeNode (void );
162
+ static void MtmLockCluster (void );
163
+ static void MtmUnlockCluster (void );
164
164
165
165
MtmState * Mtm ;
166
166
@@ -254,7 +254,7 @@ static bool MtmIgnoreTablesWithoutPk;
254
254
static int MtmLockCount ;
255
255
static bool MtmMajorNode ;
256
256
static bool MtmBreakConnection ;
257
- static bool MtmSuspended ;
257
+ static bool MtmClusterLocked ;
258
258
static bool MtmInsideTransaction ;
259
259
260
260
static ExecutorStart_hook_type PreviousExecutorStartHook ;
@@ -288,8 +288,8 @@ void MtmReleaseLocks(void)
288
288
MtmInsideTransaction = false;
289
289
MtmUnlock ();
290
290
}
291
- if (MtmSuspended ) {
292
- MtmResumeNode ();
291
+ if (MtmClusterLocked ) {
292
+ MtmUnlockCluster ();
293
293
}
294
294
if (MtmLockCount != 0 ) {
295
295
Assert (Mtm -> lastLockHolder == MyProcPid );
@@ -876,8 +876,7 @@ MtmBeginTransaction(MtmCurrentTrans* x)
876
876
* Also allow user to complete explicit 2PC transactions.
877
877
*/
878
878
if (x -> isDistributed
879
- && (Mtm -> exclusiveLock || (!x -> isReplicated && !x -> isTwoPhase ))
880
- && !MtmSuspended
879
+ && !MtmClusterLocked /* do not lock myself */
881
880
&& strcmp (application_name , MULTIMASTER_ADMIN ) != 0 )
882
881
{
883
882
MtmCheckClusterLock ();
@@ -886,7 +885,7 @@ MtmBeginTransaction(MtmCurrentTrans* x)
886
885
Mtm -> nRunningTransactions += 1 ;
887
886
888
887
x -> snapshot = MtmAssignCSN ();
889
- MTM_LOG1 ("Start transaction %lld with snapshot %lld" , (long64 )x -> xid , x -> snapshot );
888
+ MTM_LOG2 ("Start transaction %lld with snapshot %lld" , (long64 )x -> xid , x -> snapshot );
890
889
891
890
MtmUnlock ();
892
891
@@ -1448,11 +1447,25 @@ MtmEndTransaction(MtmCurrentTrans* x, bool commit)
1448
1447
if (!MyReplicationSlot ) {
1449
1448
MtmCheckSlots ();
1450
1449
}
1451
- if (MtmSuspended ) {
1452
- MtmResumeNode ();
1450
+ if (MtmClusterLocked ) {
1451
+ MtmUnlockCluster ();
1453
1452
}
1454
1453
}
1455
1454
1455
+ /*
1456
+ * Initialize the common fields of an arbiter message: the message code plus values copied from the shared Mtm state; callers fill in the remaining fields (such as sxid, csn and gid) afterwards
1457
+ */
1458
+ void MtmInitMessage (MtmArbiterMessage * msg , MtmMessageCode code )
1459
+ {
1460
+ msg -> code = code ;
1461
+ msg -> disabledNodeMask = Mtm -> disabledNodeMask ;
1462
+ msg -> connectivityMask = SELF_CONNECTIVITY_MASK ;
1463
+ msg -> oldestSnapshot = Mtm -> nodes [MtmNodeId - 1 ].oldestSnapshot ; /* taken from the nodes[] entry for MtmNodeId */
1464
+ msg -> lockReq = Mtm -> originLockNodeMask != 0 ; /* set when any bit is present in the origin lock mask */
1465
+ msg -> locked = (Mtm -> originLockNodeMask |Mtm -> inducedLockNodeMask ) != 0 ; /* set when either lock mask is non-zero */
1466
+ }
1467
+
1468
+
1456
1469
/*
1457
1470
* Send arbiter's message
1458
1471
*/
@@ -1489,13 +1502,9 @@ void MtmSendMessage(MtmArbiterMessage* msg)
1489
1502
void MtmSend2PCMessage (MtmTransState * ts , MtmMessageCode cmd )
1490
1503
{
1491
1504
MtmArbiterMessage msg ;
1492
- msg . code = cmd ;
1505
+ MtmInitMessage ( & msg , cmd ) ;
1493
1506
msg .sxid = ts -> xid ;
1494
1507
msg .csn = ts -> csn ;
1495
- msg .disabledNodeMask = Mtm -> disabledNodeMask ;
1496
- msg .connectivityMask = SELF_CONNECTIVITY_MASK ;
1497
- msg .oldestSnapshot = Mtm -> nodes [MtmNodeId - 1 ].oldestSnapshot ;
1498
- msg .lockReq = Mtm -> nodeLockerMask != 0 ;
1499
1508
memcpy (msg .gid , ts -> gid , MULTIMASTER_MAX_GID_SIZE );
1500
1509
1501
1510
Assert (!MtmIsCoordinator (ts )); /* All broadcasts are now done through logical decoding */
@@ -1516,11 +1525,7 @@ static void MtmBroadcastPollMessage(MtmTransState* ts)
1516
1525
{
1517
1526
int i ;
1518
1527
MtmArbiterMessage msg ;
1519
- msg .code = MSG_POLL_REQUEST ;
1520
- msg .disabledNodeMask = Mtm -> disabledNodeMask ;
1521
- msg .connectivityMask = SELF_CONNECTIVITY_MASK ;
1522
- msg .oldestSnapshot = Mtm -> nodes [MtmNodeId - 1 ].oldestSnapshot ;
1523
- msg .lockReq = Mtm -> nodeLockerMask != 0 ;
1528
+ MtmInitMessage (& msg , MSG_POLL_REQUEST );
1524
1529
memcpy (msg .gid , ts -> gid , MULTIMASTER_MAX_GID_SIZE );
1525
1530
ts -> votedMask = 0 ;
1526
1531
@@ -1928,7 +1933,7 @@ void MtmRecoveryCompleted(void)
1928
1933
* logical replication connections with this node.
1929
1934
* Under the intensive workload start of logical replication can be delayed for unpredictable amount of time
1930
1935
*/
1931
- BIT_SET (Mtm -> nodeLockerMask , MtmNodeId - 1 ); /* it is trick: this mask was originally used by WAL senders performing recovery, but here we are in opposite (recovered) side:
1936
+ BIT_SET (Mtm -> originLockNodeMask , MtmNodeId - 1 ); /* it is trick: this mask was originally used by WAL senders performing recovery, but here we are in opposite (recovered) side:
1932
1937
* if this mask is not zero lockReq will be broadcast to all other nodes by heartbeat, suspending their activity
1933
1938
*/
1934
1939
MtmSwitchClusterMode (MTM_RECOVERED );
@@ -2017,7 +2022,7 @@ void MtmCheckRecoveryCaughtUp(int nodeId, lsn_t slotLSN)
2017
2022
MtmLock (LW_EXCLUSIVE );
2018
2023
if (MtmIsRecoveredNode (nodeId )) {
2019
2024
lsn_t walLSN = GetXLogInsertRecPtr ();
2020
- if (!BIT_CHECK (Mtm -> nodeLockerMask , nodeId - 1 )
2025
+ if (!BIT_CHECK (Mtm -> originLockNodeMask , nodeId - 1 )
2021
2026
&& slotLSN + MtmMinRecoveryLag > walLSN )
2022
2027
{
2023
2028
/*
@@ -2028,14 +2033,11 @@ void MtmCheckRecoveryCaughtUp(int nodeId, lsn_t slotLSN)
2028
2033
*/
2029
2034
MTM_LOG1 ("Node %d is almost caught-up: slot position %llx, WAL position %llx, active transactions %d" ,
2030
2035
nodeId , slotLSN , walLSN , Mtm -> nActiveTransactions );
2031
- Assert (MyWalSnd != NULL ); /* This function is called by WAL-sender, so it should not be NULL */
2032
- BIT_SET (Mtm -> nodeLockerMask , nodeId - 1 );
2033
- BIT_SET (Mtm -> walSenderLockerMask , MyWalSnd - WalSndCtl -> walsnds );
2034
- Mtm -> nLockers += 1 ;
2036
+ BIT_SET (Mtm -> originLockNodeMask , nodeId - 1 );
2035
2037
} else {
2036
2038
MTM_LOG2 ("Continue recovery of node %d, slot position %llx, WAL position %llx,"
2037
- " WAL sender position %llx, lockers %d , active transactions %d" , nodeId , slotLSN ,
2038
- walLSN , MyWalSnd -> sentPtr , Mtm -> nLockers , Mtm -> nActiveTransactions );
2039
+ " WAL sender position %llx, lockers %llx , active transactions %d" , nodeId , slotLSN ,
2040
+ walLSN , MyWalSnd -> sentPtr , Mtm -> originLockNodeMask , Mtm -> nActiveTransactions ); /* "lockers" is printed as the origin lock node mask */
2039
2041
}
2040
2042
}
2041
2043
MtmUnlock ();
@@ -2051,11 +2053,13 @@ bool MtmRecoveryCaughtUp(int nodeId, lsn_t walEndPtr)
2051
2053
bool caughtUp = false;
2052
2054
MtmLock (LW_EXCLUSIVE );
2053
2055
if (MtmIsRecoveredNode (nodeId ) && Mtm -> nActiveTransactions == 0 ) {
2054
- if (BIT_CHECK (Mtm -> nodeLockerMask , nodeId - 1 )) {
2056
+ if (BIT_CHECK (Mtm -> originLockNodeMask , nodeId - 1 )) {
2055
2057
MTM_LOG1 ("Node %d is caught-up at WAL position %llx" , nodeId , walEndPtr );
2056
- BIT_CLEAR (Mtm -> walSenderLockerMask , MyWalSnd - WalSndCtl -> walsnds );
2057
- BIT_CLEAR (Mtm -> nodeLockerMask , nodeId - 1 );
2058
- Mtm -> nLockers -= 1 ;
2058
+ Assert (BIT_CHECK (Mtm -> disabledNodeMask , nodeId - 1 ));
2059
+ BIT_CLEAR (Mtm -> originLockNodeMask , nodeId - 1 );
2060
+ BIT_CLEAR (Mtm -> disabledNodeMask , nodeId - 1 );
2061
+ Mtm -> nLiveNodes += 1 ;
2062
+ MtmCheckQuorum ();
2059
2063
} else {
2060
2064
MTM_LOG1 ("Node %d is caught-up at WAL position %llx without locking cluster" , nodeId , walEndPtr );
2061
2065
/* We are lucky: caught-up without locking cluster! */
@@ -2082,40 +2086,44 @@ void MtmSwitchClusterMode(MtmNodeStatus mode)
2082
2086
* Prevent start of any new transactions at this node
2083
2087
*/
2084
2088
static void
2085
- MtmSuspendNode (void )
2089
+ MtmLockCluster (void )
2086
2090
{
2087
2091
timestamp_t delay = MIN_WAIT_TIMEOUT ;
2088
- Assert (!MtmSuspended );
2092
+ Assert (!MtmClusterLocked );
2089
2093
MtmLock (LW_EXCLUSIVE );
2090
- if (Mtm -> exclusiveLock ) {
2094
+ if (BIT_CHECK ( Mtm -> originLockNodeMask , MtmNodeId - 1 ) ) { /* the bit for MtmNodeId is already set in the origin lock mask */
2091
2095
elog (ERROR , "There is already pending exclusive lock" );
2092
2096
}
2093
- Mtm -> exclusiveLock = true;
2094
- MtmSuspended = true;
2095
- MTM_LOG2 ("Transaction %lld tries to suspend node at %lld insideTransaction=%d, active transactions=%lld" ,
2096
- (long64 )MtmTx .xid , MtmGetCurrentTime (), insideTransaction , (long64 )Mtm -> nRunningTransactions );
2097
- while (Mtm -> nRunningTransactions != 1 ) { /* I am one */
2097
+ BIT_SET (Mtm -> originLockNodeMask , MtmNodeId - 1 ); /* MtmInitMessage reports a non-zero origin lock mask as lockReq */
2098
+ MtmClusterLocked = true;
2099
+ MTM_LOG1 ("Transaction %lld tries to lock cluster at %lld, running transactions=%lld" ,
2100
+ (long64 )MtmTx .xid , MtmGetCurrentTime (), (long64 )Mtm -> nRunningTransactions );
2101
+ /* Wait until this is the only running transaction and every node that is not disabled has its bit set in currentLockNodeMask or originLockNodeMask */
2102
+ while (Mtm -> nRunningTransactions != 1 /* the caller's own transaction is the one that remains */
2103
+ || ((((nodemask_t )1 << Mtm -> nAllNodes )- 1 ) & ~(Mtm -> currentLockNodeMask |Mtm -> originLockNodeMask ) & ~Mtm -> disabledNodeMask ) != 0 )
2104
+ {
2098
2105
MtmUnlock ();
2099
2106
MtmSleep (delay );
2100
2107
if (delay * 2 <= MAX_WAIT_TIMEOUT ) { /* double the sleep interval, never exceeding MAX_WAIT_TIMEOUT */
2101
2108
delay *= 2 ;
2102
2109
}
2103
2110
MtmLock (LW_EXCLUSIVE );
2104
2111
}
2105
- MTM_LOG2 ("Transaction %lld suspended node at %lld, LSN %lld, active transactions=%lld" , (long64 )MtmTx .xid , MtmGetCurrentTime (), (long64 )GetXLogInsertRecPtr (), (long64 )Mtm -> nRunningTransactions );
2112
+ MTM_LOG1 ("Transaction %lld locked cluster at %lld, LSN %lld, active transactions=%lld" ,
2113
+ (long64 )MtmTx .xid , MtmGetCurrentTime (), (long64 )GetXLogInsertRecPtr (), (long64 )Mtm -> nRunningTransactions );
2106
2114
MtmUnlock ();
2107
2115
}
2108
2116
2109
2117
/*
2110
- * Resume transaction processing at node (blocked by MtmSuspendNode)
2118
+ * Release the cluster lock taken by MtmLockCluster: under MtmLock, clear the MtmNodeId bit in originLockNodeMask and reset the file-static MtmClusterLocked flag
2111
2119
*/
2112
2120
static void
2113
- MtmResumeNode (void )
2121
+ MtmUnlockCluster (void )
2114
2122
{
2115
2123
MtmLock (LW_EXCLUSIVE );
2116
- MTM_LOG2 ("Transaction %lld resume node at %lld status %s LSN %lld" , (long64 )MtmTx .xid , MtmGetCurrentTime (), MtmTxnStatusMnem [MtmTx .status ], (long64 )GetXLogInsertRecPtr ());
2117
- Mtm -> exclusiveLock = false ;
2118
- MtmSuspended = false;
2124
+ MTM_LOG1 ("Transaction %lld unlock cluster at %lld status %s LSN %lld" , (long64 )MtmTx .xid , MtmGetCurrentTime (), MtmTxnStatusMnem [MtmTx .status ], (long64 )GetXLogInsertRecPtr ());
2125
+ BIT_CLEAR ( Mtm -> originLockNodeMask , MtmNodeId - 1 ) ;
2126
+ MtmClusterLocked = false;
2119
2127
MtmUnlock ();
2120
2128
}
2121
2129
@@ -2128,33 +2136,15 @@ static void
2128
2136
MtmCheckClusterLock ()
2129
2137
{
2130
2138
timestamp_t delay = MIN_WAIT_TIMEOUT ;
2131
- while (true)
2132
- {
2133
- if (Mtm -> exclusiveLock || (Mtm -> globalLockerMask | Mtm -> walSenderLockerMask )) {
2134
- /* some "almost cautch-up" wal-senders are still working. */
2135
- /* Do not start new transactions until them are completed. */
2136
- MtmUnlock ();
2137
- MtmSleep (delay );
2138
- if (delay * 2 <= MAX_WAIT_TIMEOUT ) {
2139
- delay *= 2 ;
2140
- }
2141
- MtmLock (LW_EXCLUSIVE );
2142
- } else {
2143
- if (Mtm -> nodeLockerMask != 0 ) {
2144
- /* All lockers have synchronized their logs */
2145
- /* Remove lock and mark them as recovered */
2146
- MTM_LOG1 ("Complete recovery of %d nodes (node mask %llx)" , Mtm -> nLockers , Mtm -> nodeLockerMask );
2147
- Assert (Mtm -> walSenderLockerMask == 0 );
2148
- Assert ((Mtm -> nodeLockerMask & Mtm -> disabledNodeMask ) == Mtm -> nodeLockerMask );
2149
- Mtm -> disabledNodeMask &= ~Mtm -> nodeLockerMask ;
2150
- Mtm -> nConfigChanges += 1 ;
2151
- Mtm -> nLiveNodes += Mtm -> nLockers ;
2152
- Mtm -> nLockers = 0 ;
2153
- Mtm -> nodeLockerMask = 0 ;
2154
- MtmCheckQuorum ();
2155
- }
2156
- break ;
2139
+ while (Mtm -> originLockNodeMask | Mtm -> inducedLockNodeMask ) {
2140
+ /* A cluster lock is still held: the origin or induced lock mask is non-zero (set e.g. for an "almost caught-up" node or by MtmLockCluster). */
2141
+ /* Do not start new transactions until both masks are clear: release MtmLock, sleep with a doubling delay capped by MAX_WAIT_TIMEOUT, then re-acquire and re-check. */
2142
+ MtmUnlock ();
2143
+ MtmSleep (delay );
2144
+ if (delay * 2 <= MAX_WAIT_TIMEOUT ) {
2145
+ delay *= 2 ;
2157
2146
}
2147
+ MtmLock (LW_EXCLUSIVE );
2158
2148
}
2159
2149
}
2160
2150
@@ -2548,13 +2538,11 @@ static void MtmInitialize()
2548
2538
Mtm -> stoppedNodeMask = 0 ;
2549
2539
Mtm -> pglogicalReceiverMask = 0 ;
2550
2540
Mtm -> pglogicalSenderMask = 0 ;
2551
- Mtm -> walSenderLockerMask = 0 ;
2552
- Mtm -> globalLockerMask = 0 ;
2553
- Mtm -> nodeLockerMask = 0 ;
2541
+ Mtm -> inducedLockNodeMask = 0 ;
2542
+ Mtm -> currentLockNodeMask = 0 ;
2543
+ Mtm -> originLockNodeMask = 0 ;
2554
2544
Mtm -> reconnectMask = 0 ;
2555
2545
Mtm -> recoveredLSN = INVALID_LSN ;
2556
- Mtm -> nLockers = 0 ;
2557
- Mtm -> exclusiveLock = false;
2558
2546
Mtm -> nActiveTransactions = 0 ;
2559
2547
Mtm -> nRunningTransactions = 0 ;
2560
2548
Mtm -> votingTransactions = NULL ;
@@ -3326,7 +3314,7 @@ void MtmReceiverStarted(int nodeId)
3326
3314
if (++ Mtm -> nReceivers == Mtm -> nLiveNodes - 1 && Mtm -> nSenders == Mtm -> nLiveNodes - 1
3327
3315
&& (Mtm -> status == MTM_RECOVERED || Mtm -> status == MTM_CONNECTED ))
3328
3316
{
3329
- BIT_CLEAR (Mtm -> nodeLockerMask , MtmNodeId - 1 ); /* recovery is completed: release cluster lock */
3317
+ BIT_CLEAR (Mtm -> originLockNodeMask , MtmNodeId - 1 ); /* recovery is completed: release cluster lock */
3330
3318
MtmSwitchClusterMode (MTM_ONLINE );
3331
3319
}
3332
3320
}
@@ -3656,7 +3644,7 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
3656
3644
&& (Mtm -> status == MTM_RECOVERED || Mtm -> status == MTM_CONNECTED ))
3657
3645
{
3658
3646
/* All logical replication connections from and to this node are established, so we can switch cluster to online mode */
3659
- BIT_CLEAR (Mtm -> nodeLockerMask , MtmNodeId - 1 ); /* recovery is completed: release cluster lock */
3647
+ BIT_CLEAR (Mtm -> originLockNodeMask , MtmNodeId - 1 ); /* recovery is completed: release cluster lock */
3660
3648
MtmSwitchClusterMode (MTM_ONLINE );
3661
3649
}
3662
3650
}
@@ -4070,7 +4058,7 @@ mtm_get_nodes_state(PG_FUNCTION_ARGS)
4070
4058
usrfctx -> values [3 ] = BoolGetDatum (BIT_CHECK (Mtm -> stalledNodeMask , usrfctx -> nodeId - 1 ));
4071
4059
usrfctx -> values [4 ] = BoolGetDatum (BIT_CHECK (Mtm -> stoppedNodeMask , usrfctx -> nodeId - 1 ));
4072
4060
4073
- usrfctx -> values [5 ] = BoolGetDatum (BIT_CHECK (Mtm -> nodeLockerMask , usrfctx -> nodeId - 1 ));
4061
+ usrfctx -> values [5 ] = BoolGetDatum (BIT_CHECK (Mtm -> originLockNodeMask , usrfctx -> nodeId - 1 ));
4074
4062
lag = MtmGetSlotLag (usrfctx -> nodeId );
4075
4063
usrfctx -> values [6 ] = Int64GetDatum (lag );
4076
4064
usrfctx -> nulls [6 ] = lag < 0 ;
@@ -4196,7 +4184,7 @@ mtm_get_cluster_state(PG_FUNCTION_ARGS)
4196
4184
values [1 ] = CStringGetTextDatum (MtmNodeStatusMnem [Mtm -> status ]);
4197
4185
values [2 ] = Int64GetDatum (Mtm -> disabledNodeMask );
4198
4186
values [3 ] = Int64GetDatum (SELF_CONNECTIVITY_MASK );
4199
- values [4 ] = Int64GetDatum (Mtm -> nodeLockerMask );
4187
+ values [4 ] = Int64GetDatum (Mtm -> originLockNodeMask );
4200
4188
values [5 ] = Int32GetDatum (Mtm -> nLiveNodes );
4201
4189
values [6 ] = Int32GetDatum (Mtm -> nAllNodes );
4202
4190
values [7 ] = Int32GetDatum ((int )Mtm -> pool .active );
@@ -5032,7 +5020,7 @@ static void MtmProcessUtility(Node *parsetree, const char *queryString,
5032
5020
5033
5021
case T_TruncateStmt :
5034
5022
skipCommand = false;
5035
- MtmSuspendNode ();
5023
+ MtmLockCluster ();
5036
5024
break ;
5037
5025
5038
5026
case T_DropStmt :
0 commit comments