@@ -1531,8 +1531,8 @@ static void MtmEnableNode(int nodeId)
1531
1531
void MtmRecoveryCompleted (void )
1532
1532
{
1533
1533
int i ;
1534
- MTM_LOG1 ("Recovery of node %d is completed, disabled mask=%llx, connectivity mask=%llx, live nodes=%d" ,
1535
- MtmNodeId , (long long ) Mtm -> disabledNodeMask , (long long ) Mtm -> connectivityMask , Mtm -> nLiveNodes );
1534
+ MTM_LOG1 ("Recovery of node %d is completed, disabled mask=%llx, connectivity mask=%llx, endLSN=%lx, live nodes=%d" ,
1535
+ MtmNodeId , (long long ) Mtm -> disabledNodeMask , (long long ) Mtm -> connectivityMask , GetXLogInsertRecPtr (), Mtm -> nLiveNodes );
1536
1536
MtmLock (LW_EXCLUSIVE );
1537
1537
Mtm -> recoverySlot = 0 ;
1538
1538
Mtm -> recoveredLSN = GetXLogInsertRecPtr ();
@@ -1542,7 +1542,7 @@ void MtmRecoveryCompleted(void)
1542
1542
for (i = 0 ; i < Mtm -> nAllNodes ; i ++ ) {
1543
1543
Mtm -> nodes [i ].lastHeartbeat = 0 ; /* defuse watchdog until first heartbeat is received */
1544
1544
}
1545
- /* Mode will be changed to online once all logical reciever are connected */
1545
+ /* Mode will be changed to online once all logical receivers are connected */
1546
1546
MtmSwitchClusterMode (MTM_CONNECTED );
1547
1547
MtmUnlock ();
1548
1548
}
@@ -2131,7 +2131,6 @@ static void MtmInitialize()
2131
2131
Mtm -> nodes [i ].restartLSN = InvalidXLogRecPtr ;
2132
2132
Mtm -> nodes [i ].originId = InvalidRepOriginId ;
2133
2133
Mtm -> nodes [i ].timeline = 0 ;
2134
- Mtm -> nodes [i ].recoveredLSN = InvalidXLogRecPtr ;
2135
2134
}
2136
2135
Mtm -> nodes [MtmNodeId - 1 ].originId = DoNotReplicateId ;
2137
2136
/* All transaction originated from the current node should be ignored during recovery */
@@ -2884,13 +2883,14 @@ MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shut
2884
2883
{
2885
2884
MtmReplicationMode mode = REPLMODE_OPEN_EXISTED ;
2886
2885
2886
+ MtmLock (LW_EXCLUSIVE );
2887
2887
while ((Mtm -> status != MTM_CONNECTED && Mtm -> status != MTM_ONLINE ) || BIT_CHECK (Mtm -> disabledNodeMask , nodeId - 1 ))
2888
2888
{
2889
2889
if (* shutdown )
2890
2890
{
2891
+ MtmUnlock ();
2891
2892
return REPLMODE_EXIT ;
2892
2893
}
2893
- MtmLock (LW_EXCLUSIVE );
2894
2894
if (BIT_CHECK (Mtm -> disabledNodeMask , nodeId - 1 )) {
2895
2895
mode = REPLMODE_CREATE_NEW ;
2896
2896
}
@@ -2913,6 +2913,7 @@ MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shut
2913
2913
MtmUnlock ();
2914
2914
/* delay opening of other slots until recovery is completed */
2915
2915
MtmSleep (STATUS_POLL_DELAY );
2916
+ MtmLock (LW_EXCLUSIVE );
2916
2917
}
2917
2918
if (mode == REPLMODE_RECOVERED ) {
2918
2919
MTM_LOG1 ("%d: Restart replication from node %d after end of recovery" , MyProcPid , nodeId );
@@ -2921,6 +2922,7 @@ MtmReplicationMode MtmGetReplicationMode(int nodeId, sig_atomic_t volatile* shut
2921
2922
} else {
2922
2923
MTM_LOG1 ("%d: Continue replication from node %d" , MyProcPid , nodeId );
2923
2924
}
2925
+ MtmUnlock ();
2924
2926
return mode ;
2925
2927
}
2926
2928
@@ -3014,7 +3016,12 @@ MtmReplicationStartupHook(struct PGLogicalStartupHookArgs* args)
3014
3016
}
3015
3017
} else if (strcmp ("mtm_recovered_pos" , elem -> defname ) == 0 ) {
3016
3018
if (elem -> arg != NULL && strVal (elem -> arg ) != NULL ) {
3017
- sscanf (strVal (elem -> arg ), "%lx" , & Mtm -> nodes [MtmReplicationNodeId - 1 ].recoveredLSN );
3019
+ XLogRecPtr recoveredLSN ;
3020
+ sscanf (strVal (elem -> arg ), "%lx" , & recoveredLSN );
3021
+ MTM_LOG1 ("Recovered position of node %d is %lx" , MtmReplicationNodeId , recoveredLSN );
3022
+ if (Mtm -> nodes [MtmReplicationNodeId - 1 ].restartLSN < recoveredLSN ) {
3023
+ Mtm -> nodes [MtmReplicationNodeId - 1 ].restartLSN = recoveredLSN ;
3024
+ }
3018
3025
} else {
3019
3026
elog (ERROR , "Recovered position is not specified" );
3020
3027
}
@@ -3129,16 +3136,21 @@ MtmReplicationRowFilterHook(struct PGLogicalRowFilterArgs* args)
3129
3136
return isDistributed ;
3130
3137
}
3131
3138
3139
+ /*
3140
+ * Filter received transactions at destination side.
3141
+ * This function is executed by the receiver, so there are no race conditions and it is possible to update nodes[i].restartLSN without a lock
3142
+ */
3132
3143
bool MtmFilterTransaction (char * record , int size )
3133
3144
{
3134
3145
StringInfoData s ;
3135
3146
uint8 flags ;
3136
3147
XLogRecPtr origin_lsn ;
3137
3148
XLogRecPtr end_lsn ;
3149
+ XLogRecPtr restart_lsn ;
3138
3150
int replication_node ;
3139
3151
int origin_node ;
3140
3152
char const * gid = "" ;
3141
- bool duplicate ;
3153
+ bool duplicate = false ;
3142
3154
3143
3155
s .data = record ;
3144
3156
s .len = size ;
@@ -3174,11 +3186,17 @@ bool MtmFilterTransaction(char* record, int size)
3174
3186
default :
3175
3187
break ;
3176
3188
}
3189
+ restart_lsn = origin_node == MtmReplicationNodeId ? end_lsn : origin_lsn ;
3190
+ if (Mtm -> nodes [origin_node - 1 ].restartLSN < restart_lsn ) {
3191
+ Mtm -> nodes [origin_node - 1 ].restartLSN = restart_lsn ;
3192
+ } else {
3193
+ duplicate = true;
3194
+ }
3195
+
3177
3196
//duplicate = Mtm->status == MTM_RECOVERY && origin_lsn != InvalidXLogRecPtr && origin_lsn <= Mtm->nodes[origin_node-1].restartLSN;
3178
- duplicate = origin_lsn != InvalidXLogRecPtr && origin_lsn <= Mtm -> nodes [origin_node - 1 ].restartLSN ;
3179
3197
3180
3198
MTM_LOG1 ("%s transaction %s from node %d lsn %lx, flags=%x, origin node %d, original lsn=%lx, current lsn=%lx" ,
3181
- duplicate ? "Ignore" : "Apply" , gid , replication_node , end_lsn , flags , origin_node , origin_lsn , Mtm -> nodes [ origin_node - 1 ]. restartLSN );
3199
+ duplicate ? "Ignore" : "Apply" , gid , replication_node , end_lsn , flags , origin_node , origin_lsn , restart_lsn );
3182
3200
return duplicate ;
3183
3201
}
3184
3202
0 commit comments