@@ -196,7 +196,8 @@ static void MtmSetReceiveMode(uint32 mode);
196
196
197
197
static void AcquirePBByHolder (void );
198
198
199
- static nodemask_t MtmGetConnectivityClique (nodemask_t * connected_mask_with_me );
199
+ static bool MtmIsConnectivityClique (nodemask_t mask );
200
+ static nodemask_t MtmGetConnectivityClique (bool locked );
200
201
201
202
/* serialization functions */
202
203
static void MtmStateSave (void );
@@ -781,7 +782,7 @@ MtmGetCurrentStatus(bool gen_locked, bool vote_locked)
781
782
* the shop to the client.
782
783
*/
783
784
if (!is_submask (mtm_state -> current_gen_members ,
784
- MtmGetConnectivityClique (NULL )) ||
785
+ MtmGetConnectivityClique (false )) ||
785
786
mtm_state -> campaigner_on_tour )
786
787
res = MTM_ISOLATED ;
787
788
else if (status_in_gen == MTM_GEN_RECOVERY )
@@ -989,6 +990,7 @@ CampaignMyself(MtmConfig *mtm_cfg, MtmGeneration *candidate_gen,
989
990
{
990
991
nodemask_t connected_mask_with_me ;
991
992
nodemask_t clique ;
993
+ bool is_curr_gen_connected ;
992
994
993
995
/*
994
996
* Basebackup'ed node must recover from donor until it obtains syncpoints
@@ -1010,17 +1012,45 @@ CampaignMyself(MtmConfig *mtm_cfg, MtmGeneration *candidate_gen,
1010
1012
LWLockAcquire (mtm_state -> gen_lock , LW_SHARED );
1011
1013
LWLockAcquire (mtm_state -> vote_lock , LW_EXCLUSIVE );
1012
1014
1013
- clique = MtmGetConnectivityClique (& connected_mask_with_me );
1015
+ LWLockAcquire (mtm_state -> connectivity_lock , LW_SHARED );
1016
+ connected_mask_with_me = MtmGetConnectedMaskWithMe (true);
1017
+ clique = MtmGetConnectivityClique (true);
1018
+ is_curr_gen_connected = MtmIsConnectivityClique (mtm_state -> current_gen_members );
1019
+ LWLockRelease (mtm_state -> connectivity_lock );
1014
1020
1015
- mtm_log (MtmStateDebug , "CampaignMyself: current_gen.num=" UINT64_FORMAT ", current_gen.members=%s, current_gen.configured=%s, StatusInGen=%s, last_online_in=" UINT64_FORMAT ", last_vote.num=" UINT64_FORMAT ", clique=%s, connected_mask_with_me=%s" ,
1021
+ mtm_log (MtmStateDebug , "CampaignMyself: current_gen.num=" UINT64_FORMAT ", current_gen.members=%s, current_gen.configured=%s, StatusInGen=%s, last_online_in=" UINT64_FORMAT ", last_vote.num=" UINT64_FORMAT ", clique=%s, connected_mask_with_me=%s, is_curr_gen_connected=%d " ,
1016
1022
pg_atomic_read_u64 (& mtm_state -> current_gen_num ),
1017
1023
maskToString (mtm_state -> current_gen_members ),
1018
1024
maskToString (mtm_state -> current_gen_configured ),
1019
1025
MtmStatusInGenMnem [MtmGetCurrentStatusInGen ()],
1020
1026
mtm_state -> last_online_in ,
1021
1027
mtm_state -> last_vote .num ,
1022
1028
maskToString (clique ),
1023
- maskToString (connected_mask_with_me ));
1029
+ maskToString (connected_mask_with_me ),
1030
+ is_curr_gen_connected );
1031
+
1032
+ /*
1033
+ * If I am online in curr gen (definitely its member) and all members are
1034
+ * interconnected, the situation is fine for me: don't ballot. This
1035
+ * shorthack is caused by 'majority clique always elects its gen' rule
1036
+ * which is necessary for xact resolution liveness (see below). Without
1037
+ * it, there is a small chance offline node turn on might temporarily
1038
+ * throw out live node(s) from the cluster because there could be several
1039
+ * cliques. For example,
1040
+ * - nodes 23 online in its gen, 1 is down
1041
+ * - 1 gets up. For a short period of time, 2 sees two cliques
1042
+ * 12 and 23 as 1<->3 connection is not yet established/gossiped.
1043
+ * Clique must be chosen deterministically to avoid flip-flopping in
1044
+ * sausage-like topologies, 2 picks 12 and ballots 12 & 23,
1045
+ * exluding 3 from the cluster.
1046
+ * The only reason for 2 balloting at all here, while 1 is not
1047
+ * recovered yet is xact resolution liveness, c.f. handle_1a.
1048
+ */
1049
+ if (MtmGetCurrentStatusInGen () == MTM_GEN_ONLINE && is_curr_gen_connected )
1050
+ {
1051
+ mtm_log (MtmStateDebug , "not campaigning as curr gen is connected and I am online" );
1052
+ goto no_interesting_candidates ;
1053
+ }
1024
1054
1025
1055
/*
1026
1056
* No point to campaign if there is no quorum clique with me at all. But
@@ -1145,6 +1175,11 @@ CampaignMyself(MtmConfig *mtm_cfg, MtmGeneration *candidate_gen,
1145
1175
* stable live connecitivity clique forming majority eventually elects
1146
1176
* generation with its members (unless previous was already the same)
1147
1177
* regardless of recovery progress; c.f. handle_1a for details.
1178
+ * (note that we need this weirdness only for >4 nodes, as with less
1179
+ * nodes xact resolution cannot deadlock (at least one of
1180
+ * coordinators of two conflicting xacts is member of live majority, and
1181
+ * coordinator may unconditionally abort his prepares before PC), so we
1182
+ * may avoid balloting for <=3 nodes.)
1148
1183
*
1149
1184
* This might seem to have a downside though: in case of short flip
1150
1185
* flopping like
@@ -1666,7 +1701,7 @@ HandleGenVoteRequest(MtmConfig *mtm_cfg, MtmGenVoteRequest *req,
1666
1701
{
1667
1702
StringInfo packed_msg ;
1668
1703
MtmGenVoteResponse resp ;
1669
- nodemask_t clique = MtmGetConnectivityClique (NULL );
1704
+ nodemask_t clique = MtmGetConnectivityClique (false );
1670
1705
1671
1706
mtm_log (MtmStateDebug , "HandleGenVoteRequest: got '%s' from %d" ,
1672
1707
MtmMesageToString ((MtmMessage * ) req ), sender_node_id );
@@ -2038,13 +2073,46 @@ MtmOnDmqSenderDisconnect(char *node_name)
2038
2073
mtm_log (MtmStateMessage , "[STATE] dmq sender to node %d disconnected" , node_id );
2039
2074
}
2040
2075
2076
+ /*
2077
+ * Do all nodes from mask see each other? The clique is not neccesarily
2078
+ * maximal.
2079
+ */
2080
+ static bool
2081
+ MtmIsConnectivityClique (nodemask_t mask )
2082
+ {
2083
+ int i , j ;
2084
+ nodemask_t connected_mask ;
2085
+
2086
+ Assert (LWLockHeldByMe (mtm_state -> connectivity_lock ));
2087
+ connected_mask = MtmGetConnectedMask (true);
2088
+ for (i = 0 ; i < MTM_MAX_NODES ; i ++ )
2089
+ {
2090
+ if (!BIT_CHECK (mask , i ))
2091
+ continue ;
2092
+ for (j = 0 ; j < MTM_MAX_NODES ; j ++ )
2093
+ {
2094
+ if (i == j )
2095
+ continue ;
2096
+ if (i + 1 == Mtm -> my_node_id )
2097
+ {
2098
+ if (!BIT_CHECK (connected_mask , j ))
2099
+ return false;
2100
+ }
2101
+ else
2102
+ {
2103
+ if (!BIT_CHECK (mtm_state -> connectivity_matrix [i ], j ))
2104
+ return false;
2105
+ }
2106
+ }
2107
+ }
2108
+ return true;
2109
+ }
2110
+
2041
2111
/*
2042
2112
* The largest subset of nodes where each member sees each other.
2043
- * If connected_mask_with_me is not NULL, additionally fills it with
2044
- * MtmGetConnectedMaskWithMe.
2045
2113
*/
2046
2114
static nodemask_t
2047
- MtmGetConnectivityClique (nodemask_t * connected_mask_with_me )
2115
+ MtmGetConnectivityClique (bool locked )
2048
2116
{
2049
2117
nodemask_t matrix [MTM_MAX_NODES ];
2050
2118
nodemask_t clique ;
@@ -2061,8 +2129,6 @@ MtmGetConnectivityClique(nodemask_t *connected_mask_with_me)
2061
2129
memcpy (matrix , mtm_state -> connectivity_matrix , sizeof (nodemask_t ) * MTM_MAX_NODES );
2062
2130
matrix [me - 1 ] = MtmGetConnectedMaskWithMe (true);
2063
2131
LWLockRelease (mtm_state -> connectivity_lock );
2064
- if (connected_mask_with_me != NULL )
2065
- * connected_mask_with_me = matrix [me - 1 ];
2066
2132
2067
2133
/* make matrix symmetric, required by Bron–Kerbosch algorithm */
2068
2134
for (i = 0 ; i < MTM_MAX_NODES ; i ++ )
@@ -2804,7 +2870,6 @@ static void handle_1a(txset_entry *txse, HTAB *txset, bool *wal_scanned,
2804
2870
* said above, all other <n xacts are aborted 2) xacts of n can't be
2805
2871
* prepared before getting all prepares of n. After that, resolving
2806
2872
* proceeds normally as majority has PREPAREs.
2807
- *
2808
2873
*/
2809
2874
2810
2875
/* xact obviously could appear after fast path check */
0 commit comments