Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit f639b4c

Browse files
committed
referee version 2
1 parent 426b00d commit f639b4c

8 files changed

+218
-27
lines changed

multimaster--1.0.sql

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,27 @@ $$
151151
LANGUAGE plpgsql;
152152

153153
-- select mtm.alter_sequences();
154+
155+
-- referee stuff
156+
CREATE TABLE IF NOT EXISTS mtm.referee_decision(key text primary key not null, node_id int);
157+
158+
CREATE OR REPLACE FUNCTION mtm.referee_get_winner(applicant_id int) RETURNS int AS
159+
$$
160+
DECLARE
161+
winner_id int;
162+
BEGIN
163+
insert into mtm.referee_decision values ('winner', applicant_id);
164+
select node_id into winner_id from mtm.referee_decision where key = 'winner';
165+
return winner_id;
166+
EXCEPTION WHEN others THEN
167+
select node_id into winner_id from mtm.referee_decision where key = 'winner';
168+
return winner_id;
169+
END
170+
$$
171+
LANGUAGE plpgsql;
172+
173+
CREATE OR REPLACE FUNCTION mtm.referee_clean() RETURNS void AS
174+
$$
175+
delete from mtm.referee_decision where key = 'winner';
176+
$$
177+
LANGUAGE sql;

multimaster.c

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ bool MtmUseRDMA;
254254
bool MtmPreserveCommitOrder;
255255
bool MtmVolksWagenMode; /* Pretend to be normal postgres. This means skip some NOTICE's and use local sequences */
256256
bool MtmMajorNode;
257+
char* MtmRefereeConnStr;
257258

258259
static char* MtmConnStrs;
259260
static char* MtmRemoteFunctionsList;
@@ -2372,6 +2373,7 @@ static void MtmInitialize()
23722373
Mtm->nAllNodes = MtmNodes;
23732374
Mtm->disabledNodeMask = (((nodemask_t)1 << MtmNodes) - 1);
23742375
Mtm->clique = 0;
2376+
Mtm->refereeGrant = false;
23752377
Mtm->stalledNodeMask = 0;
23762378
Mtm->stoppedNodeMask = 0;
23772379
Mtm->deadNodeMask = 0;
@@ -2618,8 +2620,7 @@ static void MtmSplitConnStrs(void)
26182620
}
26192621
pfree(copy);
26202622
}
2621-
if (!MtmReferee)
2622-
{
2623+
26232624
if (MtmNodeId == INT_MAX) {
26242625
if (gethostname(buf, sizeof buf) != 0) {
26252626
MTM_ELOG(ERROR, "Failed to get host name: %m");
@@ -2679,7 +2680,6 @@ static void MtmSplitConnStrs(void)
26792680
len = end - dbName;
26802681
MtmDatabaseName = pnstrdup(dbName, len);
26812682
}
2682-
}
26832683
MemoryContextSwitchTo(old_context);
26842684
}
26852685

@@ -2976,6 +2976,19 @@ _PG_init(void)
29762976
NULL
29772977
);
29782978

2979+
DefineCustomStringVariable(
2980+
"multimaster.referee_connstring",
2981+
"Referee connection string",
2982+
NULL,
2983+
&MtmRefereeConnStr,
2984+
"",
2985+
PGC_POSTMASTER,
2986+
0,
2987+
NULL,
2988+
NULL,
2989+
NULL
2990+
);
2991+
29792992
DefineCustomBoolVariable(
29802993
"multimaster.use_rdma",
29812994
"Use RDMA sockets",
@@ -3177,8 +3190,6 @@ _PG_init(void)
31773190

31783191
if (MtmReferee)
31793192
{
3180-
MtmSplitConnStrs();
3181-
MtmRefereeInitialize();
31823193
return;
31833194
}
31843195

multimaster.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,7 @@ typedef struct
287287
TransactionId oldestXid; /* XID of oldest transaction visible by any active transaction (local or global) */
288288
nodemask_t disabledNodeMask; /* Bitmask of disabled nodes */
289289
nodemask_t clique; /* Bitmask of nodes that are connected and we allowed to connect/send wal/receive wal with them */
290+
bool refereeGrant; /* Referee allowed us to work with half of the nodes */
290291
nodemask_t deadNodeMask; /* Bitmask of nodes considered as dead by referee */
291292
nodemask_t recoveredNodeMask; /* Bitmask of nodes recoverd after been reported as dead by referee */
292293
nodemask_t stalledNodeMask; /* Bitmask of stalled nodes (node with dropped replication slot which makes it not possible automatic recovery of such node) */
@@ -379,6 +380,7 @@ extern timestamp_t MtmRefreshClusterStatusSchedule;
379380
extern MtmConnectionInfo* MtmConnections;
380381
extern bool MtmMajorNode;
381382
extern bool MtmBackgroundWorker;
383+
extern char* MtmRefereeConnStr;
382384

383385

384386
extern void MtmArbiterInitialize(void);

state.c

Lines changed: 108 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ char const* const MtmEventMnem[] =
2525
"MTM_NONRECOVERABLE_ERROR"
2626
};
2727

28+
static int MtmGetRefereeWinner(void);
2829

2930
// XXXX: allocate in context and clean it
3031
static char *
@@ -91,12 +92,14 @@ MtmCheckState(void)
9192
maskToString(Mtm->pglogicalReceiverMask, Mtm->nAllNodes),
9293
maskToString(Mtm->pglogicalSenderMask, Mtm->nAllNodes),
9394
Mtm->nAllNodes,
94-
MtmMajorNode);
95+
(MtmMajorNode || Mtm->refereeGrant) );
9596

9697
isEnabledState =
97-
( (nConnected >= Mtm->nAllNodes/2+1) /* majority */
98-
|| (nConnected == Mtm->nAllNodes/2 && MtmMajorNode) ) /* or half + major node */
99-
&& BIT_CHECK(Mtm->clique, MtmNodeId-1); /* in clique */
98+
( (nConnected >= Mtm->nAllNodes/2+1) /* majority */
99+
// XXXX: should we restrict major with two nodes setup?
100+
|| (nConnected == Mtm->nAllNodes/2 && MtmMajorNode) /* or half + major node */
101+
|| (nConnected == Mtm->nAllNodes/2 && Mtm->refereeGrant) ) /* or half + referee */
102+
&& BIT_CHECK(Mtm->clique, MtmNodeId-1); /* in clique */
100103

101104
/* ANY -> MTM_DISABLED */
102105
if (!isEnabledState)
@@ -135,6 +138,12 @@ MtmCheckState(void)
135138
case MTM_RECOVERED:
136139
if (nReceivers == nEnabled-1 && nSenders == nEnabled-1 && nEnabled == nConnected)
137140
{
141+
/*
142+
* It should be already cleaned by RECOVERY_CAUGHTUP, but
143+
* in major mode or with referee we can be working alone
144+
* so nobody will clean it.
145+
*/
146+
BIT_CLEAR(Mtm->originLockNodeMask, MtmNodeId-1);
138147
MtmSetClusterStatus(MTM_ONLINE);
139148
return;
140149
}
@@ -376,9 +385,35 @@ MtmRefreshClusterStatus()
376385
* Periodical check that we are still in RECOVERED state.
377386
* See comment to MTM_RECOVERED -> MTM_ONLINE transition in MtmCheckState()
378387
*/
379-
if (Mtm->status == MTM_RECOVERED)
380-
MtmCheckState();
388+
MtmCheckState();
389+
390+
/*
391+
* Check for referee decidion when pnly half of nodes are visible.
392+
*/
393+
if (MtmRefereeConnStr && *MtmRefereeConnStr && !Mtm->refereeGrant &&
394+
// XXXX visibility & ~clique?
395+
countZeroBits(SELF_CONNECTIVITY_MASK, Mtm->nAllNodes) == Mtm->nAllNodes/2)
396+
{
397+
int winner_node_id = MtmGetRefereeWinner();
398+
if (winner_node_id != -1 &&
399+
// XXXX visibility & ~clique?
400+
!BIT_CHECK(SELF_CONNECTIVITY_MASK, winner_node_id - 1))
401+
{
402+
MTM_LOG1("[STATE] Referee allowed to proceed with half of the nodes (winner_id = %d)",
403+
winner_node_id);
404+
Mtm->refereeGrant = true;
405+
406+
MtmLock(LW_EXCLUSIVE);
407+
MtmEnableNode(MtmNodeId);
408+
MtmCheckState();
409+
MtmUnlock();
410+
}
411+
}
381412

413+
414+
/*
415+
* Check for clique.
416+
*/
382417
MtmBuildConnectivityMatrix(matrix);
383418
newClique = MtmFindMaxClique(matrix, Mtm->nAllNodes, &cliqueSize);
384419

@@ -436,3 +471,70 @@ MtmRefreshClusterStatus()
436471
MtmCheckState();
437472
MtmUnlock();
438473
}
474+
475+
static int
476+
MtmGetRefereeWinner(void)
477+
{
478+
int socket_fd;
479+
PGconn* conn;
480+
PGresult *res;
481+
struct timeval timeout = { 5, 0 };
482+
char sql[128];
483+
int winner_node_id;
484+
485+
conn = PQconnectdb_safe(MtmRefereeConnStr);
486+
if (PQstatus(conn) != CONNECTION_OK)
487+
{
488+
MTM_ELOG(WARNING, "Could not connect to referee (%s): %s",
489+
MtmRefereeConnStr, PQerrorMessage(conn));
490+
PQfinish(conn);
491+
return -1;
492+
}
493+
494+
socket_fd = PQsocket(conn);
495+
if (socket_fd < 0)
496+
{
497+
MTM_ELOG(WARNING, "Referee socket is invalid");
498+
PQfinish(conn);
499+
return -1;
500+
}
501+
502+
if (setsockopt(socket_fd, SOL_SOCKET, SO_RCVTIMEO,
503+
(char *)&timeout, sizeof(timeout)) < 0)
504+
{
505+
MTM_ELOG(WARNING, "Could not set referee socket timeout: %s",
506+
strerror(errno));
507+
PQfinish(conn);
508+
return -1;
509+
}
510+
511+
sprintf(sql, "select mtm.referee_get_winner(%d)", MtmNodeId);
512+
res = PQexec(conn, sql);
513+
if (PQresultStatus(res) != PGRES_TUPLES_OK ||
514+
PQntuples(res) != 1 ||
515+
PQnfields(res) != 1)
516+
{
517+
MTM_ELOG(WARNING, "Refusing unexpected result (r=%d, n=%d, w=%d, k=%s) from referee.",
518+
PQresultStatus(res), PQntuples(res), PQnfields(res), PQgetvalue(res, 0, 0));
519+
PQclear(res);
520+
PQfinish(conn);
521+
return -1;
522+
}
523+
524+
winner_node_id = atoi(PQgetvalue(res, 0, 0));
525+
526+
if (winner_node_id < 1 || winner_node_id > Mtm->nAllNodes)
527+
{
528+
MTM_ELOG(WARNING,
529+
"Referee responded with node_id=%d, it's out of our node range",
530+
winner_node_id);
531+
PQclear(res);
532+
PQfinish(conn);
533+
return -1;
534+
}
535+
536+
MTM_LOG1("Got referee response, winner node_id=%d.", winner_node_id);
537+
/* Ok, we finally got it! */
538+
return winner_node_id;
539+
}
540+

tests2/docker-entrypoint.sh

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -58,21 +58,39 @@ if [ "$1" = 'postgres' ]; then
5858
max_wal_senders = 10
5959
shared_preload_libraries = 'multimaster'
6060
default_transaction_isolation = 'repeatable read'
61-
log_line_prefix = '%m: '
61+
log_line_prefix = '%m: '
62+
# log_statement = all
6263
6364
multimaster.workers = 4
6465
multimaster.max_workers = 16
6566
multimaster.max_nodes = 3
6667
multimaster.volkswagen_mode = 1
6768
multimaster.queue_size=52857600
6869
multimaster.ignore_tables_without_pk = 1
69-
multimaster.node_id = $NODE_ID
70-
multimaster.conn_strings = '$CONNSTRS'
71-
multimaster.major_node = $MAJOR
7270
multimaster.heartbeat_recv_timeout = 1100
7371
multimaster.heartbeat_send_timeout = 250
7472
EOF
7573

74+
if [ -n "$NODE_ID" ]; then
75+
echo "multimaster.node_id = $NODE_ID" >> $PGDATA/postgresql.conf
76+
fi
77+
78+
if [ -n "$CONNSTRS" ]; then
79+
echo "multimaster.conn_strings = '$CONNSTRS'" >> $PGDATA/postgresql.conf
80+
fi
81+
82+
if [ -n "$MAJOR" ]; then
83+
echo 'multimaster.major_node = on' >> $PGDATA/postgresql.conf
84+
fi
85+
86+
if [ -n "$REFEREE" ]; then
87+
echo 'multimaster.referee = on' >> $PGDATA/postgresql.conf
88+
fi
89+
90+
if [ -n "$REFEREE_CONNSTR" ]; then
91+
echo "multimaster.referee_connstring = '$REFEREE_CONNSTR'" >> $PGDATA/postgresql.conf
92+
fi
93+
7694
cat $PGDATA/postgresql.conf
7795

7896
pg_ctl -D "$PGDATA" -m fast -w stop

tests2/lib/test_helper.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import unittest
22
import time
33
import datetime
4+
import psycopg2
45

56
TEST_WARMING_TIME = 5
67
TEST_DURATION = 10
@@ -50,6 +51,7 @@ def performFailure(self, failure):
5051
aggs_failure = self.client.get_aggregates()
5152

5253

54+
# time.sleep(10000)
5355
failure.stop()
5456

5557
print('Eliminate failure at ',datetime.datetime.utcnow())
@@ -59,3 +61,12 @@ def performFailure(self, failure):
5961
aggs = self.client.get_aggregates()
6062

6163
return (aggs_failure, aggs)
64+
65+
def nodeExecute(dsn, statements):
66+
con = psycopg2.connect(dsn)
67+
con.autocommit = True
68+
cur = con.cursor()
69+
for statement in statements:
70+
cur.execute(statement)
71+
cur.close()
72+
con.close()

tests2/support/two_nodes.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ services:
1212
POSTGRES_USER: 'pg'
1313
POSTGRES_DB: 'regression'
1414
NODE_ID: 1
15-
MAJOR: 'off'
1615
CONNSTRS: >-
1716
dbname=regression user=pg host=node1,
1817
dbname=regression user=pg host=node2
18+
REFEREE_CONNSTR: 'dbname=regression user=pg host=referee'
1919
ports:
2020
- "15432:5432"
2121

@@ -29,9 +29,23 @@ services:
2929
POSTGRES_USER: 'pg'
3030
POSTGRES_DB: 'regression'
3131
NODE_ID: 2
32-
MAJOR: 'off'
3332
CONNSTRS: >-
3433
dbname=regression user=pg host=node1,
3534
dbname=regression user=pg host=node2
35+
REFEREE_CONNSTR: 'dbname=regression user=pg host=referee'
3636
ports:
3737
- "15433:5432"
38+
39+
referee:
40+
container_name: referee
41+
build: ../..
42+
privileged: true
43+
ulimits:
44+
core: 14294967296
45+
environment:
46+
POSTGRES_USER: 'pg'
47+
POSTGRES_DB: 'regression'
48+
NODE_ID: 1
49+
REFEREE: 'on'
50+
ports:
51+
- "15435:5432"

0 commit comments

Comments
 (0)