Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 2332ab5

Browse files
knizhnikkelvich
authored and committed
Correctly handle connect timeouts
1 parent 4ac3eec commit 2332ab5

File tree

3 files changed

+29
-38
lines changed

3 files changed

+29
-38
lines changed

arbiter.c

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -366,14 +366,16 @@ static void MtmCheckHeartbeat()
366366
}
367367

368368

369-
static int MtmConnectSocket(char const* host, int port, int max_attempts)
369+
static int MtmConnectSocket(char const* host, int port, int timeout)
370370
{
371371
struct sockaddr_in sock_inet;
372372
unsigned addrs[MAX_ROUTES];
373373
unsigned i, n_addrs = sizeof(addrs) / sizeof(addrs[0]);
374374
MtmHandshakeMessage req;
375375
MtmArbiterMessage resp;
376376
int sd;
377+
timestamp_t start = MtmGetSystemTime();
378+
377379

378380
sock_inet.sin_family = AF_INET;
379381
sock_inet.sin_port = htons(port);
@@ -390,7 +392,10 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
390392
if (sd < 0) {
391393
elog(ERROR, "Arbiter failed to create socket: %d", errno);
392394
}
393-
fcntl(sd, F_SETFL, O_NONBLOCK);
395+
rc = fcntl(sd, F_SETFL, O_NONBLOCK);
396+
if (rc < 0) {
397+
elog(ERROR, "Arbiter failed to switch socket to non-blocking mode: %d", errno);
398+
}
394399
busy_socket = sd;
395400
for (i = 0; i < n_addrs; ++i) {
396401
memcpy(&sock_inet.sin_addr, &addrs[i], sizeof sock_inet.sin_addr);
@@ -405,17 +410,19 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
405410
if (rc == 0) {
406411
break;
407412
}
408-
if (errno != EINPROGRESS || max_attempts == 0) {
413+
if (errno != EINPROGRESS || start + MSEC_TO_USEC(timeout) < MtmGetSystemTime()) {
409414
elog(WARNING, "Arbiter failed to connect to %s:%d: error=%d", host, port, errno);
410415
busy_socket = -1;
416+
close(sd);
411417
return -1;
412418
} else {
413-
rc = MtmWaitSocket(sd, true, MtmConnectTimeout);
419+
rc = MtmWaitSocket(sd, true, MtmHeartbeatSendTimeout);
414420
if (rc == 1) {
415421
socklen_t optlen = sizeof(int);
416422
if (getsockopt(sd, SOL_SOCKET, SO_ERROR, (void*)&rc, &optlen) < 0) {
417423
elog(WARNING, "Arbiter failed to getsockopt for %s:%d: error=%d", host, port, errno);
418424
busy_socket = -1;
425+
close(sd);
419426
return -1;
420427
}
421428
if (rc == 0) {
@@ -426,8 +433,8 @@ static int MtmConnectSocket(char const* host, int port, int max_attempts)
426433
} else {
427434
elog(WARNING, "Arbiter waiting socket to %s:%d: rc=%d, error=%d", host, port, rc, errno);
428435
}
429-
max_attempts -= 1;
430-
MtmSleep(MSEC_TO_USEC(MtmConnectTimeout));
436+
close(sd);
437+
MtmSleep(MSEC_TO_USEC(MtmHeartbeatSendTimeout));
431438
}
432439
}
433440
MtmSetSocketOptions(sd);
@@ -479,7 +486,7 @@ static void MtmOpenConnections()
479486
}
480487
for (i = 0; i < nNodes; i++) {
481488
if (i+1 != MtmNodeId && i < Mtm->nAllNodes) {
482-
sockets[i] = MtmConnectSocket(Mtm->nodes[i].con.hostName, MtmArbiterPort + i + 1, MtmConnectAttempts);
489+
sockets[i] = MtmConnectSocket(Mtm->nodes[i].con.hostName, MtmArbiterPort + i + 1, MtmConnectTimeout);
483490
if (sockets[i] < 0) {
484491
MtmOnNodeDisconnect(i+1);
485492
}
@@ -511,7 +518,7 @@ static bool MtmSendToNode(int node, void const* buf, int size)
511518
close(sockets[node]);
512519
sockets[node] = -1;
513520
}
514-
sockets[node] = MtmConnectSocket(Mtm->nodes[node].con.hostName, MtmArbiterPort + node + 1, MtmReconnectAttempts);
521+
sockets[node] = MtmConnectSocket(Mtm->nodes[node].con.hostName, MtmArbiterPort + node + 1, MtmReconnectTimeout);
515522
if (sockets[node] < 0) {
516523
MtmOnNodeDisconnect(node+1);
517524
return false;

multimaster.c

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -193,10 +193,9 @@ int MtmNodes;
193193
int MtmNodeId;
194194
int MtmReplicationNodeId;
195195
int MtmArbiterPort;
196-
int MtmConnectAttempts;
197196
int MtmConnectTimeout;
197+
int MtmReconnectTimeout;
198198
int MtmRaftPollDelay;
199-
int MtmReconnectAttempts;
200199
int MtmNodeDisableDelay;
201200
int MtmTransSpillThreshold;
202201
int MtmMaxNodes;
@@ -2030,9 +2029,9 @@ _PG_init(void)
20302029
DefineCustomIntVariable(
20312030
"multimaster.connect_timeout",
20322031
"Multimaster nodes connect timeout",
2033-
"Interval in milliseconds between connection attempts",
2032+
"Interval in milliseconds for establishing connection with cluster node",
20342033
&MtmConnectTimeout,
2035-
1000,
2034+
10000, /* 10 seconds */
20362035
1,
20372036
INT_MAX,
20382037
PGC_BACKEND,
@@ -2043,11 +2042,11 @@ _PG_init(void)
20432042
);
20442043

20452044
DefineCustomIntVariable(
2046-
"multimaster.raft_poll_delay",
2047-
"Multimaster delay of polling cluster state from Raftable after updating local node status",
2048-
"Timeout in milliseconds before polling state of nodes",
2049-
&MtmRaftPollDelay,
2050-
1000,
2045+
"multimaster.reconnect_timeout",
2046+
"Multimaster nodes reconnect timeout",
2047+
"Interval in milliseconds for establishing connection with cluster node",
2048+
&MtmReconnectTimeout,
2049+
5000, /* 5 seconds */
20512050
1,
20522051
INT_MAX,
20532052
PGC_BACKEND,
@@ -2058,11 +2057,11 @@ _PG_init(void)
20582057
);
20592058

20602059
DefineCustomIntVariable(
2061-
"multimaster.connect_attempts",
2062-
"Multimaster number of connect attemts",
2063-
"Maximal number of attempt to establish connection with other node after which multimaster is give up",
2064-
&MtmConnectAttempts,
2065-
10,
2060+
"multimaster.raft_poll_delay",
2061+
"Multimaster delay of polling cluster state from Raftable after updating local node status",
2062+
"Timeout in milliseconds before polling state of nodes",
2063+
&MtmRaftPollDelay,
2064+
1000,
20662065
1,
20672066
INT_MAX,
20682067
PGC_BACKEND,
@@ -2072,20 +2071,6 @@ _PG_init(void)
20722071
NULL
20732072
);
20742073

2075-
DefineCustomIntVariable(
2076-
"multimaster.reconnect_attempts",
2077-
"Multimaster number of reconnect attemts",
2078-
"Maximal number of attempt to reestablish connection with other node after which node is considered to be offline",
2079-
&MtmReconnectAttempts,
2080-
10,
2081-
1,
2082-
INT_MAX,
2083-
PGC_BACKEND,
2084-
0,
2085-
NULL,
2086-
NULL,
2087-
NULL
2088-
);
20892074

20902075
MtmSplitConnStrs();
20912076
MtmStartReceivers();

multimaster.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -215,9 +215,8 @@ extern int MtmReplicationNodeId;
215215
extern int MtmNodes;
216216
extern int MtmArbiterPort;
217217
extern char* MtmDatabaseName;
218-
extern int MtmConnectAttempts;
219218
extern int MtmConnectTimeout;
220-
extern int MtmReconnectAttempts;
219+
extern int MtmReconnectTimeout;
221220
extern int MtmRaftPollDelay;
222221
extern int MtmNodeDisableDelay;
223222
extern int MtmTransSpillThreshold;

0 commit comments

Comments
 (0)