@@ -1084,13 +1084,20 @@ _PG_fini(void)
1084
1084
*/
1085
1085
1086
1086
1087
- static void MtmSwitchFromRecoveryToNormalMode ()
1087
+ void MtmSwitchToNormalMode ()
1088
1088
{
1089
1089
dtm -> status = MTM_ONLINE ;
1090
1090
elog (WARNING , "Switch to normal mode" );
1091
1091
/* ??? Something else to do here? */
1092
1092
}
1093
1093
1094
+ void MtmSwitchToRecoveryMode ()
1095
+ {
1096
+ dtm -> status = MTM_RECOVERY ;
1097
+ /* ??? Something else to do here? */
1098
+ elog (ERROR , "Switch to normal mode" );
1099
+ }
1100
+
1094
1101
1095
1102
void MtmJoinTransaction (GlobalTransactionId * gtid , csn_t globalSnapshot )
1096
1103
{
@@ -1110,7 +1117,7 @@ void MtmJoinTransaction(GlobalTransactionId* gtid, csn_t globalSnapshot)
1110
1117
Assert (dtm -> status == MTM_RECOVERY );
1111
1118
} else if (dtm -> status == MTM_RECOVERY ) {
1112
1119
/* When recovery is completed we get normal transaction ID and switch to normal mode */
1113
- MtmSwitchFromRecoveryToNormalMode ();
1120
+ MtmSwitchToNormalMode ();
1114
1121
}
1115
1122
dtmTx .gtid = * gtid ;
1116
1123
dtmTx .xid = GetCurrentTransactionId ();
@@ -1646,41 +1653,51 @@ MtmDetectGlobalDeadLock(PGPROC* proc)
1646
1653
static void
1647
1654
MtmBuildConnectivityMatrix (nodemask_t * matrix )
1648
1655
{
1649
- int i ;
1650
- for (i = 0 ; i < MtmNodes ; i ++ ) {
1656
+ int i , j , n = MtmNodes ;
1657
+ for (i = 0 ; i < n ; i ++ ) {
1651
1658
if (i + 1 != MtmNodeId ) {
1652
1659
void * data = PaxosGet (psprintf ("node-mask-%d" , i + 1 ), NULL , NULL );
1653
1660
matrix [i ] = * (nodemask_t * )data ;
1654
1661
} else {
1655
1662
matrix [i ] = dtm -> connectivityMask ;
1656
1663
}
1657
1664
}
1665
+ /* make matrix symetric: required for Bron–Kerbosch algorithm */
1666
+ for (i = 0 ; i < n ; i ++ ) {
1667
+ for (j = 0 ; j < i ; j ++ ) {
1668
+ matrix [i ] |= ((matrix [j ] >> i ) & 1 ) << j ;
1669
+ }
1670
+ }
1658
1671
}
1659
1672
1660
1673
1661
1674
void MtmUpdateClusterStatus (void )
1662
1675
{
1663
- nodemask_t mask , clique , disconnectedMask ;
1676
+ nodemask_t mask , clique ;
1664
1677
nodemask_t matrix [MAX_NODES ];
1678
+ int clique_size ;
1665
1679
int i ;
1666
1680
1667
1681
MtmBuildConnectivityMatrix (matrix );
1668
1682
1669
- clique = MtmFindMaxClique (matrix , MtmNodes );
1670
- disconnectedMask = ~clique & (((nodemask_t )1 << MtmNodes )- 1 );
1671
- MtmLock (LW_EXCLUSIVE );
1672
- mask = disconnectedMask & ~dtm -> disabledNodeMask ;
1673
- for (i = 0 ; mask != 0 ; i ++ , mask >>= 1 ) {
1674
- if (mask & 1 ) {
1675
- dtm -> nNodes -= 1 ;
1676
- BIT_SET (dtm -> disabledNodeMask , i );
1683
+ clique = MtmFindMaxClique (matrix , MtmNodes , & clique_size );
1684
+ if (clique_size >= MtmNodes /2 + 1 ) { /* have quorum */
1685
+ MtmLock (LW_EXCLUSIVE );
1686
+ mask = ~clique & (((nodemask_t )1 << MtmNodes )- 1 ) & ~dtm -> disabledNodeMask ;
1687
+ for (i = 0 ; mask != 0 ; i ++ , mask >>= 1 ) {
1688
+ if (mask & 1 ) {
1689
+ dtm -> nNodes -= 1 ;
1690
+ BIT_SET (dtm -> disabledNodeMask , i );
1691
+ }
1677
1692
}
1693
+ MtmUnlock ();
1694
+ if (BIT_CHECK (dtm -> disabledNodeMask , MtmNodeId - 1 )) {
1695
+ /* I was excluded from cluster:( */
1696
+ MtmSwitchToRecoveryMode ();
1697
+ }
1698
+ } else {
1699
+ elog (WARNING , "Clique %lx has no quorum" , clique );
1678
1700
}
1679
- if (dtm -> disabledNodeMask != disconnectedMask ) {
1680
- dtm -> disabledNodeMask |= disconnectedMask ;
1681
- PaxosSet (psprintf ("node-mask-%d" , MtmNodeId ), & dtm -> disabledNodeMask , sizeof dtm -> disabledNodeMask );
1682
- }
1683
- MtmUnlock ();
1684
1701
}
1685
1702
1686
1703
void MtmOnNodeDisconnect (int nodeId )
0 commit comments