Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d41a178

Browse files
committed
Fix waitpid() emulation on Windows.
Our waitpid() emulation didn't prevent a PID from being recycled by the OS before the call to waitpid(). The postmaster could finish up tracking more than one child process with the same PID, and confuse them. Fix, by moving the guts of pgwin32_deadchild_callback() into waitpid(), so that resources are released synchronously. The process and PID continue to exist until we close the process handle, which only happens once we're ready to adjust our book-keeping of running children. This seems to explain a couple of failures on CI. It had never been reported before, despite the code being as old as the Windows port. Perhaps Windows started recycling PIDs more rapidly, or perhaps timing changes due to commit 7389aad made it more likely to break. Thanks to Alexander Lakhin for analysis and Andres Freund for tracking down the root cause. Back-patch to all supported branches. Reported-by: Andres Freund <andres@anarazel.de> Discussion: https://postgr.es/m/20230208012852.bvkn2am4h4iqjogq%40awork3.anarazel.de
1 parent b081fe4 commit d41a178

File tree

1 file changed

+40
-30
lines changed

1 file changed

+40
-30
lines changed

src/backend/postmaster/postmaster.c

+40-30
Original file line numberDiff line numberDiff line change
@@ -4811,7 +4811,7 @@ internal_forkexec(int argc, char *argv[], Port *port)
48114811
(errmsg_internal("could not register process for wait: error code %lu",
48124812
GetLastError())));
48134813

4814-
/* Don't close pi.hProcess here - the wait thread needs access to it */
4814+
/* Don't close pi.hProcess here - waitpid() needs access to it */
48154815

48164816
CloseHandle(pi.hThread);
48174817

@@ -6421,36 +6421,21 @@ ShmemBackendArrayRemove(Backend *bn)
64216421
static pid_t
64226422
waitpid(pid_t pid, int *exitstatus, int options)
64236423
{
6424+
win32_deadchild_waitinfo *childinfo;
6425+
DWORD exitcode;
64246426
DWORD dwd;
64256427
ULONG_PTR key;
64266428
OVERLAPPED *ovl;
64276429

6428-
/*
6429-
* Check if there are any dead children. If there are, return the pid of
6430-
* the first one that died.
6431-
*/
6432-
if (GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0))
6430+
/* Try to consume one win32_deadchild_waitinfo from the queue. */
6431+
if (!GetQueuedCompletionStatus(win32ChildQueue, &dwd, &key, &ovl, 0))
64336432
{
6434-
*exitstatus = (int) key;
6435-
return dwd;
6433+
errno = EAGAIN;
6434+
return -1;
64366435
}
64376436

6438-
return -1;
6439-
}
6440-
6441-
/*
6442-
* Note! Code below executes on a thread pool! All operations must
6443-
* be thread safe! Note that elog() and friends must *not* be used.
6444-
*/
6445-
static void WINAPI
6446-
pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired)
6447-
{
6448-
win32_deadchild_waitinfo *childinfo = (win32_deadchild_waitinfo *) lpParameter;
6449-
DWORD exitcode;
6450-
6451-
if (TimerOrWaitFired)
6452-
return; /* timeout. Should never happen, since we use
6453-
* INFINITE as timeout value. */
6437+
childinfo = (win32_deadchild_waitinfo *) key;
6438+
pid = childinfo->procId;
64546439

64556440
/*
64566441
* Remove handle from wait - required even though it's set to wait only
@@ -6466,13 +6451,11 @@ pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired)
64666451
write_stderr("could not read exit code for process\n");
64676452
exitcode = 255;
64686453
}
6469-
6470-
if (!PostQueuedCompletionStatus(win32ChildQueue, childinfo->procId, (ULONG_PTR) exitcode, NULL))
6471-
write_stderr("could not post child completion status\n");
6454+
*exitstatus = exitcode;
64726455

64736456
/*
6474-
* Handle is per-process, so we close it here instead of in the
6475-
* originating thread
6457+
* Close the process handle. Only after this point can the PID can be
6458+
* recycled by the kernel.
64766459
*/
64776460
CloseHandle(childinfo->procHandle);
64786461

@@ -6482,7 +6465,34 @@ pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired)
64826465
*/
64836466
free(childinfo);
64846467

6485-
/* Queue SIGCHLD signal */
6468+
return pid;
6469+
}
6470+
6471+
/*
6472+
* Note! Code below executes on a thread pool! All operations must
6473+
* be thread safe! Note that elog() and friends must *not* be used.
6474+
*/
6475+
static void WINAPI
6476+
pgwin32_deadchild_callback(PVOID lpParameter, BOOLEAN TimerOrWaitFired)
6477+
{
6478+
/* Should never happen, since we use INFINITE as timeout value. */
6479+
if (TimerOrWaitFired)
6480+
return;
6481+
6482+
/*
6483+
* Post the win32_deadchild_waitinfo object for waitpid() to deal with. If
6484+
* that fails, we leak the object, but we also leak a whole process and
6485+
* get into an unrecoverable state, so there's not much point in worrying
6486+
* about that. We'd like to panic, but we can't use that infrastructure
6487+
* from this thread.
6488+
*/
6489+
if (!PostQueuedCompletionStatus(win32ChildQueue,
6490+
0,
6491+
(ULONG_PTR) lpParameter,
6492+
NULL))
6493+
write_stderr("could not post child completion status\n");
6494+
6495+
/* Queue SIGCHLD signal. */
64866496
pg_queue_signal(SIGCHLD);
64876497
}
64886498
#endif /* WIN32 */

0 commit comments

Comments
 (0)