Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit cecae53

Browse files
committed
Revert "do not try to wait for a stable clique"
This reverts commit 4624b32.
1 parent 4624b32 commit cecae53

File tree

1 file changed

+26
-13
lines changed

1 file changed

+26
-13
lines changed

contrib/mmts/state.c

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -451,7 +451,7 @@ MtmBuildConnectivityMatrix(nodemask_t* matrix)
451451
void
452452
MtmRefreshClusterStatus()
453453
{
454-
nodemask_t newClique;
454+
nodemask_t newClique, oldClique;
455455
nodemask_t matrix[MAX_NODES];
456456
nodemask_t trivialClique = ~SELF_CONNECTIVITY_MASK & (((nodemask_t)1 << Mtm->nAllNodes)-1);
457457
int cliqueSize;
@@ -529,25 +529,38 @@ MtmRefreshClusterStatus()
529529

530530
/*
531531
* Check for clique.
532-
*
533-
* Sleep is added to make sure that will detect all failures that we can.
534-
* Otherwise if we will receive information about dead node from our peer
535-
* before we detect that ourself we can disable innocent node.
536532
*/
537-
MtmSleep(2*MSEC_TO_USEC(MtmHeartbeatRecvTimeout));
538533
MtmBuildConnectivityMatrix(matrix);
539534
newClique = MtmFindMaxClique(matrix, Mtm->nAllNodes, &cliqueSize);
540535

541536
if (newClique == Mtm->clique)
542537
return;
543538

544-
MTM_LOG1("[STATE] Changed clique: %s -> %s ({%s, %s, %s}, %s)",
545-
maskToString(Mtm->clique, Mtm->nAllNodes),
546-
maskToString(newClique, Mtm->nAllNodes),
547-
maskToString(~Mtm->nodes[0].connectivityMask, Mtm->nAllNodes),
548-
maskToString(~Mtm->nodes[1].connectivityMask, Mtm->nAllNodes),
549-
maskToString(~Mtm->nodes[2].connectivityMask, Mtm->nAllNodes),
550-
newClique == trivialClique ? "trivial" : "non-trivial");
539+
MTM_LOG1("[STATE] Old clique: %s", maskToString(Mtm->clique, Mtm->nAllNodes));
540+
541+
/*
542+
* Otherwise make sure that all nodes have a chance to replicate their connectivity
543+
* mask and we have a "consistent" picture. Obviously we cannot get a truly consistent
544+
* snapshot, but at least try to wait until the heartbeat send timeout has expired and
545+
* the connectivity graph has stabilized.
546+
*/
547+
do {
548+
oldClique = newClique;
549+
/*
550+
* Double the timeout to cover the worst case, where the heartbeat receive interval is added
551+
* to the refresh cluster status interval.
552+
*/
553+
MtmSleep(MSEC_TO_USEC(MtmHeartbeatRecvTimeout)*2);
554+
MtmBuildConnectivityMatrix(matrix);
555+
newClique = MtmFindMaxClique(matrix, Mtm->nAllNodes, &cliqueSize);
556+
} while (newClique != oldClique);
557+
558+
MTM_LOG1("[STATE] New clique: %s", maskToString(oldClique, Mtm->nAllNodes));
559+
560+
if (newClique != trivialClique)
561+
{
562+
MTM_LOG1("[STATE] NONTRIVIAL CLIQUE! (trivial: %s)", maskToString(trivialClique, Mtm->nAllNodes)); // XXXX some false-positives, fixme
563+
}
551564

552565
/*
553566
* We are using clique only to disable nodes.

0 commit comments

Comments
 (0)