Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 8e6a32a

Browse files
committed
Avoid healthy node going into recovery when previously offline one enters.
Usage of generations to avoid xact resolution deadlocks on >3 nodes forced campaigner to always ballot for interconnected majority regardless of recovery progress. This created the following unpleasant schedule: - nodes 23 online in its gen, 1 is down - 1 gets up. For a short period of time, 2 sees two cliques 12 and 23 as 1<->3 connection is not yet established/gossiped. Clique must be chosen deterministically to avoid flip-flopping in sausage-like topologies, 2 picks 12 and ballots 12 & 23, exluding 3 from the cluster. Mitigate this by not voting if everything is definitely well: node is online in its gen and all gen members are connected. Brought up by notorious add-stop node test.
1 parent 882d2d0 commit 8e6a32a

File tree

2 files changed

+79
-14
lines changed

2 files changed

+79
-14
lines changed

src/bkb.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -72,8 +72,8 @@ _list_to_nodemask(NodeList *list)
7272
* 4
7373
*
7474
* 2 and 4 must calculate the same clique, or we won't converge.
75-
* To this end, we compare biggest max cliques by nodemask and pick the
76-
* largest one.
75+
* To this end, we compare max cliques by nodemask and pick the
76+
* smallest one.
7777
*/
7878
static void
7979
extend(NodeList* cur, NodeList* result, nodemask_t* graph, int* oldSet, int ne, int ce)

src/state.c

+77-12
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ static void MtmSetReceiveMode(uint32 mode);
196196

197197
static void AcquirePBByHolder(void);
198198

199-
static nodemask_t MtmGetConnectivityClique(nodemask_t *connected_mask_with_me);
199+
static bool MtmIsConnectivityClique(nodemask_t mask);
200+
static nodemask_t MtmGetConnectivityClique(bool locked);
200201

201202
/* serialization functions */
202203
static void MtmStateSave(void);
@@ -781,7 +782,7 @@ MtmGetCurrentStatus(bool gen_locked, bool vote_locked)
781782
* the shop to the client.
782783
*/
783784
if (!is_submask(mtm_state->current_gen_members,
784-
MtmGetConnectivityClique(NULL)) ||
785+
MtmGetConnectivityClique(false)) ||
785786
mtm_state->campaigner_on_tour)
786787
res = MTM_ISOLATED;
787788
else if (status_in_gen == MTM_GEN_RECOVERY)
@@ -989,6 +990,7 @@ CampaignMyself(MtmConfig *mtm_cfg, MtmGeneration *candidate_gen,
989990
{
990991
nodemask_t connected_mask_with_me;
991992
nodemask_t clique;
993+
bool is_curr_gen_connected;
992994

993995
/*
994996
* Basebackup'ed node must recover from donor until it obtains syncpoints
@@ -1010,17 +1012,45 @@ CampaignMyself(MtmConfig *mtm_cfg, MtmGeneration *candidate_gen,
10101012
LWLockAcquire(mtm_state->gen_lock, LW_SHARED);
10111013
LWLockAcquire(mtm_state->vote_lock, LW_EXCLUSIVE);
10121014

1013-
clique = MtmGetConnectivityClique(&connected_mask_with_me);
1015+
LWLockAcquire(mtm_state->connectivity_lock, LW_SHARED);
1016+
connected_mask_with_me = MtmGetConnectedMaskWithMe(true);
1017+
clique = MtmGetConnectivityClique(true);
1018+
is_curr_gen_connected = MtmIsConnectivityClique(mtm_state->current_gen_members);
1019+
LWLockRelease(mtm_state->connectivity_lock);
10141020

1015-
mtm_log(MtmStateDebug, "CampaignMyself: current_gen.num=" UINT64_FORMAT ", current_gen.members=%s, current_gen.configured=%s, StatusInGen=%s, last_online_in=" UINT64_FORMAT ", last_vote.num=" UINT64_FORMAT ", clique=%s, connected_mask_with_me=%s",
1021+
mtm_log(MtmStateDebug, "CampaignMyself: current_gen.num=" UINT64_FORMAT ", current_gen.members=%s, current_gen.configured=%s, StatusInGen=%s, last_online_in=" UINT64_FORMAT ", last_vote.num=" UINT64_FORMAT ", clique=%s, connected_mask_with_me=%s, is_curr_gen_connected=%d",
10161022
pg_atomic_read_u64(&mtm_state->current_gen_num),
10171023
maskToString(mtm_state->current_gen_members),
10181024
maskToString(mtm_state->current_gen_configured),
10191025
MtmStatusInGenMnem[MtmGetCurrentStatusInGen()],
10201026
mtm_state->last_online_in,
10211027
mtm_state->last_vote.num,
10221028
maskToString(clique),
1023-
maskToString(connected_mask_with_me));
1029+
maskToString(connected_mask_with_me),
1030+
is_curr_gen_connected);
1031+
1032+
/*
1033+
* If I am online in curr gen (definitely its member) and all members are
1034+
* interconnected, the situation is fine for me: don't ballot. This
1035+
* shorthack is caused by 'majority clique always elects its gen' rule
1036+
* which is necessary for xact resolution liveness (see below). Without
1037+
* it, there is a small chance offline node turn on might temporarily
1038+
* throw out live node(s) from the cluster because there could be several
1039+
* cliques. For example,
1040+
* - nodes 23 online in its gen, 1 is down
1041+
* - 1 gets up. For a short period of time, 2 sees two cliques
1042+
* 12 and 23 as 1<->3 connection is not yet established/gossiped.
1043+
* Clique must be chosen deterministically to avoid flip-flopping in
1044+
* sausage-like topologies, 2 picks 12 and ballots 12 & 23,
1045+
* exluding 3 from the cluster.
1046+
* The only reason for 2 balloting at all here, while 1 is not
1047+
* recovered yet is xact resolution liveness, c.f. handle_1a.
1048+
*/
1049+
if (MtmGetCurrentStatusInGen() == MTM_GEN_ONLINE && is_curr_gen_connected)
1050+
{
1051+
mtm_log(MtmStateDebug, "not campaigning as curr gen is connected and I am online");
1052+
goto no_interesting_candidates;
1053+
}
10241054

10251055
/*
10261056
* No point to campaign if there is no quorum clique with me at all. But
@@ -1145,6 +1175,11 @@ CampaignMyself(MtmConfig *mtm_cfg, MtmGeneration *candidate_gen,
11451175
* stable live connecitivity clique forming majority eventually elects
11461176
* generation with its members (unless previous was already the same)
11471177
* regardless of recovery progress; c.f. handle_1a for details.
1178+
* (note that we need this weirdness only for >4 nodes, as with less
1179+
* nodes xact resolution cannot deadlock (at least one of
1180+
* coordinators of two conflicting xacts is member of live majority, and
1181+
* coordinator may unconditionally abort his prepares before PC), so we
1182+
* may avoid balloting for <=3 nodes.)
11481183
*
11491184
* This might seem to have a downside though: in case of short flip
11501185
* flopping like
@@ -1666,7 +1701,7 @@ HandleGenVoteRequest(MtmConfig *mtm_cfg, MtmGenVoteRequest *req,
16661701
{
16671702
StringInfo packed_msg;
16681703
MtmGenVoteResponse resp;
1669-
nodemask_t clique = MtmGetConnectivityClique(NULL);
1704+
nodemask_t clique = MtmGetConnectivityClique(false);
16701705

16711706
mtm_log(MtmStateDebug, "HandleGenVoteRequest: got '%s' from %d",
16721707
MtmMesageToString((MtmMessage *) req), sender_node_id);
@@ -2038,13 +2073,46 @@ MtmOnDmqSenderDisconnect(char *node_name)
20382073
mtm_log(MtmStateMessage, "[STATE] dmq sender to node %d disconnected", node_id);
20392074
}
20402075

2076+
/*
2077+
* Do all nodes from mask see each other? The clique is not neccesarily
2078+
* maximal.
2079+
*/
2080+
static bool
2081+
MtmIsConnectivityClique(nodemask_t mask)
2082+
{
2083+
int i, j;
2084+
nodemask_t connected_mask;
2085+
2086+
Assert(LWLockHeldByMe(mtm_state->connectivity_lock));
2087+
connected_mask = MtmGetConnectedMask(true);
2088+
for (i = 0; i < MTM_MAX_NODES; i++)
2089+
{
2090+
if (!BIT_CHECK(mask, i))
2091+
continue;
2092+
for (j = 0; j < MTM_MAX_NODES; j++)
2093+
{
2094+
if (i == j)
2095+
continue;
2096+
if (i + 1 == Mtm->my_node_id)
2097+
{
2098+
if (!BIT_CHECK(connected_mask, j))
2099+
return false;
2100+
}
2101+
else
2102+
{
2103+
if (!BIT_CHECK(mtm_state->connectivity_matrix[i], j))
2104+
return false;
2105+
}
2106+
}
2107+
}
2108+
return true;
2109+
}
2110+
20412111
/*
20422112
* The largest subset of nodes where each member sees each other.
2043-
* If connected_mask_with_me is not NULL, additionally fills it with
2044-
* MtmGetConnectedMaskWithMe.
20452113
*/
20462114
static nodemask_t
2047-
MtmGetConnectivityClique(nodemask_t *connected_mask_with_me)
2115+
MtmGetConnectivityClique(bool locked)
20482116
{
20492117
nodemask_t matrix[MTM_MAX_NODES];
20502118
nodemask_t clique;
@@ -2061,8 +2129,6 @@ MtmGetConnectivityClique(nodemask_t *connected_mask_with_me)
20612129
memcpy(matrix, mtm_state->connectivity_matrix, sizeof(nodemask_t) * MTM_MAX_NODES);
20622130
matrix[me - 1] = MtmGetConnectedMaskWithMe(true);
20632131
LWLockRelease(mtm_state->connectivity_lock);
2064-
if (connected_mask_with_me != NULL)
2065-
*connected_mask_with_me = matrix[me - 1];
20662132

20672133
/* make matrix symmetric, required by Bron–Kerbosch algorithm */
20682134
for (i = 0; i < MTM_MAX_NODES; i++)
@@ -2804,7 +2870,6 @@ static void handle_1a(txset_entry *txse, HTAB *txset, bool *wal_scanned,
28042870
* said above, all other <n xacts are aborted 2) xacts of n can't be
28052871
* prepared before getting all prepares of n. After that, resolving
28062872
* proceeds normally as majority has PREPAREs.
2807-
*
28082873
*/
28092874

28102875
/* xact obviously could appear after fast path check */

0 commit comments

Comments
 (0)