Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d73d14c

Browse files
committed
Fix incorrect order of lock file removal and failure to close() sockets.
Commit c9b0cbe accidentally broke the order of operations during postmaster shutdown: it resulted in removing the per-socket lockfiles after, not before, postmaster.pid. This creates a race-condition hazard for a new postmaster that's started immediately after observing that postmaster.pid has disappeared; if it sees the socket lockfile still present, it will quite properly refuse to start. This error appears to be the explanation for at least some of the intermittent buildfarm failures we've seen in the pg_upgrade test. Another problem, which has been there all along, is that the postmaster has never bothered to close() its listen sockets, but has just allowed them to close at process death. This creates a different race condition for an incoming postmaster: it might be unable to bind to the desired listen address because the old postmaster is still incumbent. This might explain some odd failures we've seen in the past, too. (Note: this is not related to the fact that individual backends don't close their client communication sockets. That behavior is intentional and is not changed by this patch.) Fix by adding an on_proc_exit function that closes the postmaster's ports explicitly, and (in 9.3 and up) reshuffling the responsibility for where to unlink the Unix socket files. Lock file unlinking can stay where it is, but teach it to unlink the lock files in reverse order of creation.
1 parent 358cde3 commit d73d14c

File tree

4 files changed

+78
-30
lines changed

4 files changed

+78
-30
lines changed

src/backend/libpq/pqcomm.c

+25-29
Original file line numberDiff line numberDiff line change
@@ -174,12 +174,15 @@ PQcommMethods *PqCommMethods = &PqCommSocketMethods;
174174
void
175175
pq_init(void)
176176
{
177+
/* initialize state variables */
177178
PqSendBufferSize = PQ_SEND_BUFFER_SIZE;
178179
PqSendBuffer = MemoryContextAlloc(TopMemoryContext, PqSendBufferSize);
179180
PqSendPointer = PqSendStart = PqRecvPointer = PqRecvLength = 0;
180181
PqCommBusy = false;
181182
PqCommReadingMsg = false;
182183
DoingCopyOut = false;
184+
185+
/* set up process-exit hook to close the socket */
183186
on_proc_exit(socket_close, 0);
184187

185188
/*
@@ -285,28 +288,6 @@ socket_close(int code, Datum arg)
285288
*/
286289

287290

288-
/* StreamDoUnlink()
289-
* Shutdown routine for backend connection
290-
* If any Unix sockets are used for communication, explicitly close them.
291-
*/
292-
#ifdef HAVE_UNIX_SOCKETS
293-
static void
294-
StreamDoUnlink(int code, Datum arg)
295-
{
296-
ListCell *l;
297-
298-
/* Loop through all created sockets... */
299-
foreach(l, sock_paths)
300-
{
301-
char *sock_path = (char *) lfirst(l);
302-
303-
unlink(sock_path);
304-
}
305-
/* Since we're about to exit, no need to reclaim storage */
306-
sock_paths = NIL;
307-
}
308-
#endif /* HAVE_UNIX_SOCKETS */
309-
310291
/*
311292
* StreamServerPort -- open a "listening" port to accept connections.
312293
*
@@ -588,16 +569,11 @@ Lock_AF_UNIX(char *unixSocketDir, char *unixSocketPath)
588569
* Once we have the interlock, we can safely delete any pre-existing
589570
* socket file to avoid failure at bind() time.
590571
*/
591-
unlink(unixSocketPath);
572+
(void) unlink(unixSocketPath);
592573

593574
/*
594-
* Arrange to unlink the socket file(s) at proc_exit. If this is the
595-
* first one, set up the on_proc_exit function to do it; then add this
596-
* socket file to the list of files to unlink.
575+
* Remember socket file pathnames for later maintenance.
597576
*/
598-
if (sock_paths == NIL)
599-
on_proc_exit(StreamDoUnlink, 0);
600-
601577
sock_paths = lappend(sock_paths, pstrdup(unixSocketPath));
602578

603579
return STATUS_OK;
@@ -858,6 +834,26 @@ TouchSocketFiles(void)
858834
}
859835
}
860836

837+
/*
838+
* RemoveSocketFiles -- unlink socket files at postmaster shutdown
839+
*/
840+
void
841+
RemoveSocketFiles(void)
842+
{
843+
ListCell *l;
844+
845+
/* Loop through all created sockets... */
846+
foreach(l, sock_paths)
847+
{
848+
char *sock_path = (char *) lfirst(l);
849+
850+
/* Ignore any error. */
851+
(void) unlink(sock_path);
852+
}
853+
/* Since we're about to exit, no need to reclaim storage */
854+
sock_paths = NIL;
855+
}
856+
861857

862858
/* --------------------------------
863859
* Low-level I/O routines begin here.

src/backend/postmaster/postmaster.c

+47
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ static DNSServiceRef bonjour_sdref = NULL;
370370
/*
371371
* postmaster.c - function prototypes
372372
*/
373+
static void CloseServerPorts(int status, Datum arg);
373374
static void unlink_external_pid_file(int status, Datum arg);
374375
static void getInstallationPaths(const char *argv0);
375376
static void checkDataDir(void);
@@ -900,6 +901,11 @@ PostmasterMain(int argc, char *argv[])
900901
* interlock (thanks to whoever decided to put socket files in /tmp :-().
901902
* For the same reason, it's best to grab the TCP socket(s) before the
902903
* Unix socket(s).
904+
*
905+
* Also note that this internally sets up the on_proc_exit function that
906+
* is responsible for removing both data directory and socket lockfiles;
907+
* so it must happen before opening sockets so that at exit, the socket
908+
* lockfiles go away after CloseServerPorts runs.
903909
*/
904910
CreateDataDirLockFile(true);
905911

@@ -924,10 +930,15 @@ PostmasterMain(int argc, char *argv[])
924930

925931
/*
926932
* Establish input sockets.
933+
*
934+
* First, mark them all closed, and set up an on_proc_exit function that's
935+
* charged with closing the sockets again at postmaster shutdown.
927936
*/
928937
for (i = 0; i < MAXLISTEN; i++)
929938
ListenSocket[i] = PGINVALID_SOCKET;
930939

940+
on_proc_exit(CloseServerPorts, 0);
941+
931942
if (ListenAddresses)
932943
{
933944
char *rawstring;
@@ -1271,6 +1282,42 @@ PostmasterMain(int argc, char *argv[])
12711282
}
12721283

12731284

1285+
/*
1286+
* on_proc_exit callback to close server's listen sockets
1287+
*/
1288+
static void
1289+
CloseServerPorts(int status, Datum arg)
1290+
{
1291+
int i;
1292+
1293+
/*
1294+
* First, explicitly close all the socket FDs. We used to just let this
1295+
* happen implicitly at postmaster exit, but it's better to close them
1296+
* before we remove the postmaster.pid lockfile; otherwise there's a race
1297+
* condition if a new postmaster wants to re-use the TCP port number.
1298+
*/
1299+
for (i = 0; i < MAXLISTEN; i++)
1300+
{
1301+
if (ListenSocket[i] != PGINVALID_SOCKET)
1302+
{
1303+
StreamClose(ListenSocket[i]);
1304+
ListenSocket[i] = PGINVALID_SOCKET;
1305+
}
1306+
}
1307+
1308+
/*
1309+
* Next, remove any filesystem entries for Unix sockets. To avoid race
1310+
* conditions against incoming postmasters, this must happen after closing
1311+
* the sockets and before removing lock files.
1312+
*/
1313+
RemoveSocketFiles();
1314+
1315+
/*
1316+
* We don't do anything about socket lock files here; those will be
1317+
* removed in a later on_proc_exit callback.
1318+
*/
1319+
}
1320+
12741321
/*
12751322
* on_proc_exit callback to delete external_pid_file
12761323
*/

src/backend/utils/init/miscinit.c

+5-1
Original file line numberDiff line numberDiff line change
@@ -1018,7 +1018,11 @@ CreateLockFile(const char *filename, bool amPostmaster,
10181018
if (lock_files == NIL)
10191019
on_proc_exit(UnlinkLockFiles, 0);
10201020

1021-
lock_files = lappend(lock_files, pstrdup(filename));
1021+
/*
1022+
* Use lcons so that the lock files are unlinked in reverse order of
1023+
* creation; this is critical!
1024+
*/
1025+
lock_files = lcons(pstrdup(filename), lock_files);
10221026
}
10231027

10241028
/*

src/include/libpq/libpq.h

+1
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ extern int StreamServerPort(int family, char *hostName,
5959
extern int StreamConnection(pgsocket server_fd, Port *port);
6060
extern void StreamClose(pgsocket sock);
6161
extern void TouchSocketFiles(void);
62+
extern void RemoveSocketFiles(void);
6263
extern void pq_init(void);
6364
extern int pq_getbytes(char *s, size_t len);
6465
extern int pq_getstring(StringInfo s);

0 commit comments

Comments
 (0)