@@ -451,7 +451,7 @@ MtmBuildConnectivityMatrix(nodemask_t* matrix)
451
451
void
452
452
MtmRefreshClusterStatus ()
453
453
{
454
- nodemask_t newClique ;
454
+ nodemask_t newClique , oldClique ;
455
455
nodemask_t matrix [MAX_NODES ];
456
456
nodemask_t trivialClique = ~SELF_CONNECTIVITY_MASK & (((nodemask_t )1 << Mtm -> nAllNodes )- 1 );
457
457
int cliqueSize ;
@@ -529,25 +529,38 @@ MtmRefreshClusterStatus()
529
529
530
530
/*
531
531
* Check for clique.
532
- *
533
- * Sleep is added to make sure that will detect all failures that we can.
534
- * Otherwise if we will receive information about dead node from our peer
535
- * before we detect that ourself we can disable innocent node.
536
532
*/
537
- MtmSleep (2 * MSEC_TO_USEC (MtmHeartbeatRecvTimeout ));
538
533
MtmBuildConnectivityMatrix (matrix );
539
534
newClique = MtmFindMaxClique (matrix , Mtm -> nAllNodes , & cliqueSize );
540
535
541
536
if (newClique == Mtm -> clique )
542
537
return ;
543
538
544
- MTM_LOG1 ("[STATE] Changed clique: %s -> %s ({%s, %s, %s}, %s)" ,
545
- maskToString (Mtm -> clique , Mtm -> nAllNodes ),
546
- maskToString (newClique , Mtm -> nAllNodes ),
547
- maskToString (~Mtm -> nodes [0 ].connectivityMask , Mtm -> nAllNodes ),
548
- maskToString (~Mtm -> nodes [1 ].connectivityMask , Mtm -> nAllNodes ),
549
- maskToString (~Mtm -> nodes [2 ].connectivityMask , Mtm -> nAllNodes ),
550
- newClique == trivialClique ? "trivial" : "non-trivial" );
539
+ MTM_LOG1 ("[STATE] Old clique: %s" , maskToString (Mtm -> clique , Mtm -> nAllNodes ));
540
+
541
+ /*
542
+ * Otherwise make sure that all nodes have a chance to replicate their connectivity
543
+ * mask and that we have a "consistent" picture. Obviously we cannot get a truly consistent
544
+ * snapshot, but at least try to wait until the heartbeat send timeout has expired and
545
+ * the connectivity graph has stabilized.
546
+ */
547
+ do {
548
+ oldClique = newClique ;
549
+ /*
550
+ * Double the timeout to cover the worst case, when the heartbeat receive interval is added
551
+ * to the refresh cluster status interval.
552
+ */
553
+ MtmSleep (MSEC_TO_USEC (MtmHeartbeatRecvTimeout )* 2 );
554
+ MtmBuildConnectivityMatrix (matrix );
555
+ newClique = MtmFindMaxClique (matrix , Mtm -> nAllNodes , & cliqueSize );
556
+ } while (newClique != oldClique );
557
+
558
+ MTM_LOG1 ("[STATE] New clique: %s" , maskToString (oldClique , Mtm -> nAllNodes ));
559
+
560
+ if (newClique != trivialClique )
561
+ {
562
+ MTM_LOG1 ("[STATE] NONTRIVIAL CLIQUE! (trivial: %s)" , maskToString (trivialClique , Mtm -> nAllNodes )); // XXXX some false-positives, fixme
563
+ }
551
564
552
565
/*
553
566
* We are using clique only to disable nodes.
0 commit comments