18
18
#include <time.h>
19
19
#include <fcntl.h>
20
20
21
+ #ifdef WITH_RSOCKET
22
+ #include <rdma/rsocket.h>
23
+ #endif
24
+
21
25
#include "postgres.h"
22
26
#include "fmgr.h"
23
27
#include "miscadmin.h"
28
+ #include "pg_socket.h"
24
29
#include "postmaster/postmaster.h"
25
30
#include "postmaster/bgworker.h"
26
31
#include "storage/s_lock.h"
58
63
#include "tcop/utility.h"
59
64
#include "libpq/ip.h"
60
65
66
+
61
67
#ifndef USE_EPOLL
62
68
#ifdef __linux__
63
69
#define USE_EPOLL 0
@@ -185,7 +191,7 @@ static void MtmUnregisterSocket(int fd)
185
191
/*
 * Tear down the arbiter connection stored in sockets[node].
 *
 * `node` is used directly as an index into sockets[]; the descriptor is
 * first passed to MtmUnregisterSocket, then closed, and the slot is reset
 * to -1. Finally MtmOnNodeDisconnect is invoked with node + 1.
 */
static void MtmDisconnect (int node )
186
192
{
187
193
MtmUnregisterSocket (sockets [node ]);
188
- close (sockets [node ]);
194
+ pg_closesocket (sockets [node ], MtmUseRDMA );
189
195
sockets [node ] = -1 ;
190
196
MtmOnNodeDisconnect (node + 1 );
191
197
}
@@ -208,7 +214,7 @@ static int MtmWaitSocket(int sd, bool forWrite, timestamp_t timeoutMsec)
208
214
FD_SET (sd , & set );
209
215
tv .tv_sec = (deadline - now )/USECS_PER_SEC ;
210
216
tv .tv_usec = (deadline - now )%USECS_PER_SEC ;
211
- } while ((rc = select ( sd + 1 , forWrite ? NULL : & set , forWrite ? & set : NULL , NULL , & tv )) < 0 && errno == EINTR );
217
+ } while ((rc = pg_select ( sd + 1 , forWrite ? NULL : & set , forWrite ? & set : NULL , NULL , & tv , MtmUseRDMA )) < 0 && errno == EINTR );
212
218
213
219
return rc ;
214
220
}
@@ -219,7 +225,7 @@ static bool MtmWriteSocket(int sd, void const* buf, int size)
219
225
while (size != 0 ) {
220
226
int rc = MtmWaitSocket (sd , true, MtmHeartbeatSendTimeout );
221
227
if (rc == 1 ) {
222
- while ((rc = send (sd , src , size , 0 )) < 0 && errno == EINTR );
228
+ while ((rc = pg_send (sd , src , size , 0 , MtmUseRDMA )) < 0 && errno == EINTR );
223
229
if (rc < 0 ) {
224
230
if (errno == EINPROGRESS ) {
225
231
continue ;
@@ -238,11 +244,11 @@ static bool MtmWriteSocket(int sd, void const* buf, int size)
238
244
static int MtmReadSocket (int sd , void * buf , int buf_size )
239
245
{
240
246
int rc ;
241
- while ((rc = recv (sd , buf , buf_size , 0 )) < 0 && errno == EINTR );
247
+ while ((rc = pg_recv (sd , buf , buf_size , 0 , MtmUseRDMA )) < 0 && errno == EINTR );
242
248
if (rc <= 0 && (errno == EAGAIN || errno == EINPROGRESS )) {
243
249
rc = MtmWaitSocket (sd , false, MtmHeartbeatSendTimeout );
244
250
if (rc == 1 ) {
245
- while ((rc = recv (sd , buf , buf_size , 0 )) < 0 && errno == EINTR );
251
+ while ((rc = pg_recv (sd , buf , buf_size , 0 , MtmUseRDMA )) < 0 && errno == EINTR );
246
252
}
247
253
}
248
254
return rc ;
@@ -254,25 +260,25 @@ static void MtmSetSocketOptions(int sd)
254
260
{
255
261
#ifdef TCP_NODELAY
256
262
int on = 1 ;
257
- if (setsockopt (sd , IPPROTO_TCP , TCP_NODELAY , (char const * )& on , sizeof (on )) < 0 ) {
263
+ if (pg_setsockopt (sd , IPPROTO_TCP , TCP_NODELAY , (char const * )& on , sizeof (on ), MtmUseRDMA ) < 0 ) {
258
264
MTM_ELOG (WARNING , "Failed to set TCP_NODELAY: %m" );
259
265
}
260
266
#endif
261
- if (setsockopt (sd , SOL_SOCKET , SO_KEEPALIVE , (char const * )& on , sizeof (on )) < 0 ) {
267
+ if (pg_setsockopt (sd , SOL_SOCKET , SO_KEEPALIVE , (char const * )& on , sizeof (on ), MtmUseRDMA ) < 0 ) {
262
268
MTM_ELOG (WARNING , "Failed to set SO_KEEPALIVE: %m" );
263
269
}
264
270
265
271
if (tcp_keepalives_idle ) {
266
272
#ifdef TCP_KEEPIDLE
267
- if (setsockopt (sd , IPPROTO_TCP , TCP_KEEPIDLE ,
268
- (char * ) & tcp_keepalives_idle , sizeof (tcp_keepalives_idle )) < 0 )
273
+ if (pg_setsockopt (sd , IPPROTO_TCP , TCP_KEEPIDLE ,
274
+ (char * ) & tcp_keepalives_idle , sizeof (tcp_keepalives_idle ), MtmUseRDMA ) < 0 )
269
275
{
270
276
MTM_ELOG (WARNING , "Failed to set TCP_KEEPIDLE: %m" );
271
277
}
272
278
#else
273
279
#ifdef TCP_KEEPALIVE
274
- if (setsockopt (sd , IPPROTO_TCP , TCP_KEEPALIVE ,
275
- (char * ) & tcp_keepalives_idle , sizeof (tcp_keepalives_idle )) < 0 )
280
+ if (pg_setsockopt (sd , IPPROTO_TCP , TCP_KEEPALIVE ,
281
+ (char * ) & tcp_keepalives_idle , sizeof (tcp_keepalives_idle ), MtmUseRDMA ) < 0 )
276
282
{
277
283
MTM_ELOG (WARNING , "Failed to set TCP_KEEPALIVE: %m" );
278
284
}
@@ -281,17 +287,17 @@ static void MtmSetSocketOptions(int sd)
281
287
}
282
288
#ifdef TCP_KEEPINTVL
283
289
if (tcp_keepalives_interval ) {
284
- if (setsockopt (sd , IPPROTO_TCP , TCP_KEEPINTVL ,
285
- (char * ) & tcp_keepalives_interval , sizeof (tcp_keepalives_interval )) < 0 )
290
+ if (pg_setsockopt (sd , IPPROTO_TCP , TCP_KEEPINTVL ,
291
+ (char * ) & tcp_keepalives_interval , sizeof (tcp_keepalives_interval ), MtmUseRDMA ) < 0 )
286
292
{
287
293
MTM_ELOG (WARNING , "Failed to set TCP_KEEPINTVL: %m" );
288
294
}
289
295
}
290
296
#endif
291
297
#ifdef TCP_KEEPCNT
292
298
if (tcp_keepalives_count ) {
293
- if (setsockopt (sd , IPPROTO_TCP , TCP_KEEPCNT ,
294
- (char * ) & tcp_keepalives_count , sizeof (tcp_keepalives_count )) < 0 )
299
+ if (pg_setsockopt (sd , IPPROTO_TCP , TCP_KEEPCNT ,
300
+ (char * ) & tcp_keepalives_count , sizeof (tcp_keepalives_count ), MtmUseRDMA ) < 0 )
295
301
{
296
302
MTM_ELOG (WARNING , "Failed to set TCP_KEEPCNT: %m" );
297
303
}
@@ -375,7 +381,7 @@ static void MtmSendHeartbeat()
375
381
/* Connectivity mask can be cleared by MtmWatchdog: in this case sockets[i] >= 0 */
376
382
if (BIT_CHECK (SELF_CONNECTIVITY_MASK , i )) {
377
383
MTM_LOG1 ("Force reconnect to node %d" , i + 1 );
378
- close (sockets [i ]);
384
+ pg_closesocket (sockets [i ], MtmUseRDMA );
379
385
sockets [i ] = -1 ;
380
386
MtmReconnectNode (i + 1 ); /* set reconnect mask to force node reconnect */
381
387
}
@@ -436,20 +442,20 @@ static int MtmConnectSocket(int node, int port, time_t timeout)
436
442
Retry :
437
443
while (1 ) {
438
444
int rc = -1 ;
439
- sd = socket (AF_INET , SOCK_STREAM , 0 );
445
+ sd = pg_socket (AF_INET , SOCK_STREAM , 0 , MtmUseRDMA );
440
446
if (sd < 0 ) {
441
447
MTM_ELOG (LOG , "Arbiter failed to create socket: %d" , errno );
442
448
goto Error ;
443
449
}
444
- rc = fcntl (sd , F_SETFL , O_NONBLOCK );
450
+ rc = pg_fcntl (sd , F_SETFL , O_NONBLOCK , MtmUseRDMA );
445
451
if (rc < 0 ) {
446
452
MTM_ELOG (LOG , "Arbiter failed to switch socket to non-blocking mode: %d" , errno );
447
453
goto Error ;
448
454
}
449
455
for (addr = addrs ; addr != NULL ; addr = addr -> ai_next )
450
456
{
451
457
do {
452
- rc = connect (sd , addr -> ai_addr , addr -> ai_addrlen );
458
+ rc = pg_connect (sd , addr -> ai_addr , addr -> ai_addrlen , MtmUseRDMA );
453
459
} while (rc < 0 && errno == EINTR );
454
460
455
461
if (rc >= 0 || errno == EINPROGRESS ) {
@@ -479,7 +485,7 @@ static int MtmConnectSocket(int node, int port, time_t timeout)
479
485
} else {
480
486
MTM_ELOG (WARNING , "Arbiter waiting socket to %s:%d: rc=%d, error=%d" , host , port , rc , errno );
481
487
}
482
- close (sd );
488
+ pg_closesocket (sd , MtmUseRDMA );
483
489
afterWait = MtmGetSystemTime ();
484
490
if (afterWait < beforeWait + MSEC_TO_USEC (MtmHeartbeatSendTimeout )) {
485
491
MtmSleep (beforeWait + MSEC_TO_USEC (MtmHeartbeatSendTimeout ) - afterWait );
@@ -495,17 +501,17 @@ static int MtmConnectSocket(int node, int port, time_t timeout)
495
501
strcpy (req .connStr , Mtm -> nodes [MtmNodeId - 1 ].con .connStr );
496
502
if (!MtmWriteSocket (sd , & req , sizeof req )) {
497
503
MTM_ELOG (WARNING , "Arbiter failed to send handshake message to %s:%d: %d" , host , port , errno );
498
- close (sd );
504
+ pg_closesocket (sd , MtmUseRDMA );
499
505
goto Retry ;
500
506
}
501
507
if (MtmReadSocket (sd , & resp , sizeof resp ) != sizeof (resp )) {
502
508
MTM_ELOG (WARNING , "Arbiter failed to receive response for handshake message from %s:%d: errno=%d" , host , port , errno );
503
- close (sd );
509
+ pg_closesocket (sd , MtmUseRDMA );
504
510
goto Retry ;
505
511
}
506
512
if (resp .code != MSG_STATUS || resp .dxid != HANDSHAKE_MAGIC ) {
507
513
MTM_ELOG (WARNING , "Arbiter get unexpected response %d for handshake message from %s:%d" , resp .code , host , port );
508
- close (sd );
514
+ pg_closesocket (sd , MtmUseRDMA );
509
515
goto Retry ;
510
516
}
511
517
if (addrs )
@@ -524,7 +530,7 @@ static int MtmConnectSocket(int node, int port, time_t timeout)
524
530
Error :
525
531
busy_mask = save_mask ;
526
532
if (sd >= 0 ) {
527
- close (sd );
533
+ pg_closesocket (sd , MtmUseRDMA );
528
534
}
529
535
if (addrs ) {
530
536
pg_freeaddrinfo_all (hint .ai_family , addrs );
@@ -572,7 +578,7 @@ static bool MtmSendToNode(int node, void const* buf, int size, time_t reconnectT
572
578
*/
573
579
if (sockets [node ] >= 0 && BIT_CHECK (Mtm -> reconnectMask , node )) {
574
580
MTM_ELOG (WARNING , "Arbiter is forced to reconnect to node %d" , node + 1 );
575
- close (sockets [node ]);
581
+ pg_closesocket (sockets [node ], MtmUseRDMA );
576
582
sockets [node ] = -1 ;
577
583
}
578
584
#endif
@@ -584,7 +590,7 @@ static bool MtmSendToNode(int node, void const* buf, int size, time_t reconnectT
584
590
if (sockets [node ] < 0 || !MtmWriteSocket (sockets [node ], buf , size )) {
585
591
if (sockets [node ] >= 0 ) {
586
592
MTM_ELOG (WARNING , "Arbiter fail to write to node %d: %d" , node + 1 , errno );
587
- close (sockets [node ]);
593
+ pg_closesocket (sockets [node ], MtmUseRDMA );
588
594
sockets [node ] = -1 ;
589
595
}
590
596
sockets [node ] = MtmConnectSocket (node , Mtm -> nodes [node ].con .arbiterPort , reconnectTimeout );
@@ -615,23 +621,23 @@ static int MtmReadFromNode(int node, void* buf, int buf_size)
615
621
616
622
static void MtmAcceptOneConnection ()
617
623
{
618
- int fd = accept (gateway , NULL , NULL );
624
+ int fd = pg_accept (gateway , NULL , NULL , MtmUseRDMA );
619
625
if (fd < 0 ) {
620
626
MTM_ELOG (WARNING , "Arbiter failed to accept socket: %d" , errno );
621
627
} else {
622
628
MtmHandshakeMessage req ;
623
629
MtmArbiterMessage resp ;
624
- int rc = fcntl (fd , F_SETFL , O_NONBLOCK );
630
+ int rc = pg_fcntl (fd , F_SETFL , O_NONBLOCK , MtmUseRDMA );
625
631
if (rc < 0 ) {
626
632
MTM_ELOG (ERROR , "Arbiter failed to switch socket to non-blocking mode: %d" , errno );
627
633
}
628
634
rc = MtmReadSocket (fd , & req , sizeof req );
629
635
if (rc < sizeof (req )) {
630
636
MTM_ELOG (WARNING , "Arbiter failed to handshake socket: %d, errno=%d" , rc , errno );
631
- close (fd );
637
+ pg_closesocket (fd , MtmUseRDMA );
632
638
} else if (req .hdr .code != MSG_HANDSHAKE && req .hdr .dxid != HANDSHAKE_MAGIC ) {
633
639
MTM_ELOG (WARNING , "Arbiter get unexpected handshake message %d" , req .hdr .code );
634
- close (fd );
640
+ pg_closesocket (fd , MtmUseRDMA );
635
641
} else {
636
642
int node = req .hdr .node - 1 ;
637
643
Assert (node >= 0 && node < Mtm -> nAllNodes && node + 1 != MtmNodeId );
@@ -648,7 +654,7 @@ static void MtmAcceptOneConnection()
648
654
MtmUpdateNodeConnectionInfo (& Mtm -> nodes [node ].con , req .connStr );
649
655
if (!MtmWriteSocket (fd , & resp , sizeof resp )) {
650
656
MTM_ELOG (WARNING , "Arbiter failed to write response for handshake message to node %d" , node + 1 );
651
- close (fd );
657
+ pg_closesocket (fd , MtmUseRDMA );
652
658
} else {
653
659
MTM_LOG1 ("Arbiter established connection with node %d" , node + 1 );
654
660
if (sockets [node ] >= 0 ) {
@@ -678,18 +684,18 @@ static void MtmAcceptIncomingConnections()
678
684
sock_inet .sin_addr .s_addr = htonl (INADDR_ANY );
679
685
sock_inet .sin_port = htons (MtmArbiterPort );
680
686
681
- gateway = socket (sock_inet .sin_family , SOCK_STREAM , 0 );
687
+ gateway = pg_socket (sock_inet .sin_family , SOCK_STREAM , 0 , MtmUseRDMA );
682
688
if (gateway < 0 ) {
683
689
MTM_ELOG (ERROR , "Arbiter failed to create socket: %s" , strerror (errno ));
684
690
}
685
- if (setsockopt (gateway , SOL_SOCKET , SO_REUSEADDR , (char * )& on , sizeof on ) < 0 ) {
691
+ if (pg_setsockopt (gateway , SOL_SOCKET , SO_REUSEADDR , (char * )& on , sizeof on , MtmUseRDMA ) < 0 ) {
686
692
MTM_ELOG (ERROR , "Arbiter failed to set options for socket: %s" , strerror (errno ));
687
693
}
688
694
689
- if (bind (gateway , (struct sockaddr * )& sock_inet , sizeof (sock_inet )) < 0 ) {
695
+ if (pg_bind (gateway , (struct sockaddr * )& sock_inet , sizeof (sock_inet ), MtmUseRDMA ) < 0 ) {
690
696
MTM_ELOG (ERROR , "Arbiter failed to bind socket: %s" , strerror (errno ));
691
697
}
692
- if (listen (gateway , nNodes ) < 0 ) {
698
+ if (pg_listen (gateway , nNodes , MtmUseRDMA ) < 0 ) {
693
699
MTM_ELOG (ERROR , "Arbiter failed to listen socket: %s" , strerror (errno ));
694
700
}
695
701
@@ -790,7 +796,7 @@ static bool MtmRecovery()
790
796
fd_set tryset ;
791
797
FD_ZERO (& tryset );
792
798
FD_SET (sd , & tryset );
793
- if (select (sd + 1 , & tryset , NULL , NULL , & tm ) < 0 ) {
799
+ if (pg_select (sd + 1 , & tryset , NULL , NULL , & tm , MtmUseRDMA ) < 0 ) {
794
800
MTM_ELOG (WARNING , "Arbiter lost connection with node %d" , i + 1 );
795
801
MtmDisconnect (i );
796
802
recovered = true;
@@ -883,7 +889,7 @@ static void MtmReceiver(Datum arg)
883
889
tv .tv_sec = selectTimeout /1000 ;
884
890
tv .tv_usec = selectTimeout %1000 * 1000 ;
885
891
do {
886
- n = select (max_fd + 1 , & events , NULL , NULL , & tv );
892
+ n = pg_select (max_fd + 1 , & events , NULL , NULL , & tv , MtmUseRDMA );
887
893
} while (n < 0 && errno == EINTR );
888
894
} while (n < 0 && MtmRecovery ());
889
895
0 commit comments