@@ -451,7 +451,7 @@ MtmBuildConnectivityMatrix(nodemask_t* matrix)
451
451
void
452
452
MtmRefreshClusterStatus ()
453
453
{
454
- nodemask_t newClique , oldClique ;
454
+ nodemask_t newClique ;
455
455
nodemask_t matrix [MAX_NODES ];
456
456
nodemask_t trivialClique = ~SELF_CONNECTIVITY_MASK & (((nodemask_t )1 << Mtm -> nAllNodes )- 1 );
457
457
int cliqueSize ;
@@ -529,38 +529,25 @@ MtmRefreshClusterStatus()
529
529
530
530
/*
531
531
* Check for clique.
532
+ *
533
+ * Sleep is added to make sure that we detect all the failures that we can.
534
+ * Otherwise, if we receive information about a dead node from a peer
535
+ * before we detect it ourselves, we may disable an innocent node.
532
536
*/
537
+ MtmSleep (2 * MSEC_TO_USEC (MtmHeartbeatRecvTimeout ));
533
538
MtmBuildConnectivityMatrix (matrix );
534
539
newClique = MtmFindMaxClique (matrix , Mtm -> nAllNodes , & cliqueSize );
535
540
536
541
if (newClique == Mtm -> clique )
537
542
return ;
538
543
539
- MTM_LOG1 ("[STATE] Old clique: %s" , maskToString (Mtm -> clique , Mtm -> nAllNodes ));
540
-
541
- /*
542
- * Otherwise make sure that all nodes have a chance to replicate their connectivity
543
- * mask and we have the "consistent" picture. Obviously we can not get true consistent
544
- * snapshot, but at least try to wait heartbeat send timeout is expired and
545
- * connectivity graph is stabilized.
546
- */
547
- do {
548
- oldClique = newClique ;
549
- /*
550
- * Double timeout to consider the worst case when heartbeat receive interval is added
551
- * with refresh cluster status interval.
552
- */
553
- MtmSleep (MSEC_TO_USEC (MtmHeartbeatRecvTimeout )* 2 );
554
- MtmBuildConnectivityMatrix (matrix );
555
- newClique = MtmFindMaxClique (matrix , Mtm -> nAllNodes , & cliqueSize );
556
- } while (newClique != oldClique );
557
-
558
- MTM_LOG1 ("[STATE] New clique: %s" , maskToString (oldClique , Mtm -> nAllNodes ));
559
-
560
- if (newClique != trivialClique )
561
- {
562
- MTM_LOG1 ("[STATE] NONTRIVIAL CLIQUE! (trivial: %s)" , maskToString (trivialClique , Mtm -> nAllNodes )); // XXXX some false-positives, fixme
563
- }
544
+ MTM_LOG1 ("[STATE] Changed clique: %s -> %s ({%s, %s, %s}, %s)" ,
545
+ maskToString (Mtm -> clique , Mtm -> nAllNodes ),
546
+ maskToString (newClique , Mtm -> nAllNodes ),
547
+ maskToString (~Mtm -> nodes [0 ].connectivityMask , Mtm -> nAllNodes ),
548
+ maskToString (~Mtm -> nodes [1 ].connectivityMask , Mtm -> nAllNodes ),
549
+ maskToString (~Mtm -> nodes [2 ].connectivityMask , Mtm -> nAllNodes ),
550
+ newClique == trivialClique ? "trivial" : "non-trivial" );
564
551
565
552
/*
566
553
* We are using clique only to disable nodes.
0 commit comments