Fix race condition between shutdown and unstarted background workers.

author Tom Lane <tgl@sss.pgh.pa.us>

Thu, 24 Dec 2020 22:00:43 +0000 (17:00 -0500)

committer Tom Lane <tgl@sss.pgh.pa.us>

Thu, 24 Dec 2020 22:00:43 +0000 (17:00 -0500)
author Tom Lane <tgl@sss.pgh.pa.us>
Thu, 24 Dec 2020 22:00:43 +0000 (17:00 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Thu, 24 Dec 2020 22:00:43 +0000 (17:00 -0500)
diff --git a/contrib/pg_prewarm/autoprewarm.c b/contrib/pg_prewarm/autoprewarm.c

index 255e8998d567df5a21069a040b68132e76636323..13d4e3b4a36e1e2df219153b905b0358f5756251 100644 (file)
--- a/contrib/pg_prewarm/autoprewarm.c
+++ b/contrib/pg_prewarm/autoprewarm.c
@@ -400,12 +400,7 @@ apw_load_buffers(void)
  
         /*
          * Likewise, don't launch if we've already been told to shut down.
-        *
-        * There is a race condition here: if the postmaster has received a
-        * fast-shutdown signal, but we've not heard about it yet, then the
-        * postmaster will ignore our worker start request and we'll wait
-        * forever.  However, that's a bug in the general background-worker
-        * logic, not the fault of this module.
+        * (The launch would fail anyway, but we might as well skip it.)
          */
         if (got_sigterm)
             break;
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c

index 2db83d666f2cee7c14fc987cd2a0189483024846..850b652f9f9d44c860757b651a9e3233d514c13c 100644 (file)
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -232,13 +232,15 @@ FindRegisteredWorkerBySlotNumber(int slotno)
  }
  
  /*
- * Notice changes to shared memory made by other backends.  This code
- * runs in the postmaster, so we must be very careful not to assume that
- * shared memory contents are sane.  Otherwise, a rogue backend could take
- * out the postmaster.
+ * Notice changes to shared memory made by other backends.
+ * Accept new worker requests only if allow_new_workers is true.
+ *
+ * This code runs in the postmaster, so we must be very careful not to assume
+ * that shared memory contents are sane.  Otherwise, a rogue backend could
+ * take out the postmaster.
   */
  void
-BackgroundWorkerStateChange(void)
+BackgroundWorkerStateChange(bool allow_new_workers)
  {
     int         slotno;
  
@@ -298,6 +300,15 @@ BackgroundWorkerStateChange(void)
             continue;
         }
  
+       /*
+        * If we aren't allowing new workers, then immediately mark it for
+        * termination; the next stanza will take care of cleaning it up.
+        * Doing this ensures that any process waiting for the worker will get
+        * awoken, even though the worker will never be allowed to run.
+        */
+       if (!allow_new_workers)
+           slot->terminate = true;
+
         /*
          * If the worker is marked for termination, we don't need to add it to
          * the registered workers list; we can just free the slot. However, if
@@ -504,12 +515,55 @@ BackgroundWorkerStopNotifications(pid_t pid)
     }
  }
  
+/*
+ * Cancel any not-yet-started worker requests that have waiting processes.
+ *
+ * This is called during a normal ("smart" or "fast") database shutdown.
+ * After this point, no new background workers will be started, so anything
+ * that might be waiting for them needs to be kicked off its wait.  We do
+ * that by cancelling the bgworker registration entirely, which is perhaps
+ * overkill, but since we're shutting down it does not matter whether the
+ * registration record sticks around.
+ *
+ * This function should only be called from the postmaster.
+ */
+void
+ForgetUnstartedBackgroundWorkers(void)
+{
+   slist_mutable_iter iter;
+
+   slist_foreach_modify(iter, &BackgroundWorkerList)
+   {
+       RegisteredBgWorker *rw;
+       BackgroundWorkerSlot *slot;
+
+       rw = slist_container(RegisteredBgWorker, rw_lnode, iter.cur);
+       Assert(rw->rw_shmem_slot < max_worker_processes);
+       slot = &BackgroundWorkerData->slot[rw->rw_shmem_slot];
+
+       /* If it's not yet started, and there's someone waiting ... */
+       if (slot->pid == InvalidPid &&
+           rw->rw_worker.bgw_notify_pid != 0)
+       {
+           /* ... then zap it, and notify the waiter */
+           int         notify_pid = rw->rw_worker.bgw_notify_pid;
+
+           ForgetBackgroundWorker(&iter);
+           if (notify_pid != 0)
+               kill(notify_pid, SIGUSR1);
+       }
+   }
+}
+
  /*
   * Reset background worker crash state.
   *
   * We assume that, after a crash-and-restart cycle, background workers without
   * the never-restart flag should be restarted immediately, instead of waiting
- * for bgw_restart_time to elapse.
+ * for bgw_restart_time to elapse.  On the other hand, workers with that flag
+ * should be forgotten immediately, since we won't ever restart them.
+ *
+ * This function should only be called from the postmaster.
   */
  void
  ResetBackgroundWorkerCrashTimes(void)
@@ -549,6 +603,11 @@ ResetBackgroundWorkerCrashTimes(void)
              * resetting.
              */
             rw->rw_crashed_at = 0;
+
+           /*
+            * If there was anyone waiting for it, they're history.
+            */
+           rw->rw_worker.bgw_notify_pid = 0;
         }
     }
  }
@@ -1098,6 +1157,9 @@ GetBackgroundWorkerPid(BackgroundWorkerHandle *handle, pid_t *pidp)
   * returned.  However, if the postmaster has died, we give up and return
   * BGWH_POSTMASTER_DIED, since it that case we know that startup will not
   * take place.
+ *
+ * The caller *must* have set our PID as the worker's bgw_notify_pid,
+ * else we will not be awoken promptly when the worker's state changes.
   */
  BgwHandleStatus
  WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp)
@@ -1140,6 +1202,9 @@ WaitForBackgroundWorkerStartup(BackgroundWorkerHandle *handle, pid_t *pidp)
   * and then return BGWH_STOPPED.  However, if the postmaster has died, we give
   * up and return BGWH_POSTMASTER_DIED, because it's the postmaster that
   * notifies us when a worker's state changes.
+ *
+ * The caller *must* have set our PID as the worker's bgw_notify_pid,
+ * else we will not be awoken promptly when the worker's state changes.
   */
  BgwHandleStatus
  WaitForBackgroundWorkerShutdown(BackgroundWorkerHandle *handle)
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c

index e1822c792302f2e2167ca5c0e0f4adb665a7548a..257301ed8af76df96640bfef613e69a9dbd086a2 100644 (file)
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -3761,6 +3761,13 @@ PostmasterStateMachine(void)
      */
     if (pmState == PM_STOP_BACKENDS)
     {
+       /*
+        * Forget any pending requests for background workers, since we're no
+        * longer willing to launch any new workers.  (If additional requests
+        * arrive, BackgroundWorkerStateChange will reject them.)
+        */
+       ForgetUnstartedBackgroundWorkers();
+
         /* Signal all backend children except walsenders */
         SignalSomeChildren(SIGTERM,
                            BACKEND_TYPE_ALL - BACKEND_TYPE_WALSND);
@@ -5151,13 +5158,6 @@ sigusr1_handler(SIGNAL_ARGS)
     PG_SETMASK(&BlockSig);
  #endif
  
-   /* Process background worker state change. */
-   if (CheckPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE))
-   {
-       BackgroundWorkerStateChange();
-       StartWorkerNeeded = true;
-   }
-
     /*
      * RECOVERY_STARTED and BEGIN_HOT_STANDBY signals are ignored in
      * unexpected states. If the startup process quickly starts up, completes
@@ -5203,6 +5203,7 @@ sigusr1_handler(SIGNAL_ARGS)
  
         pmState = PM_RECOVERY;
     }
+
     if (CheckPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY) &&
         pmState == PM_RECOVERY && Shutdown == NoShutdown)
     {
@@ -5228,6 +5229,14 @@ sigusr1_handler(SIGNAL_ARGS)
         StartWorkerNeeded = true;
     }
  
+   /* Process background worker state changes. */
+   if (CheckPostmasterSignal(PMSIGNAL_BACKGROUND_WORKER_CHANGE))
+   {
+       /* Accept new worker requests only if not stopping. */
+       BackgroundWorkerStateChange(pmState < PM_STOP_BACKENDS);
+       StartWorkerNeeded = true;
+   }
+
     if (StartWorkerNeeded || HaveCrashedWorker)
         maybe_start_bgworkers();
  
diff --git a/src/include/postmaster/bgworker_internals.h b/src/include/postmaster/bgworker_internals.h

index 0809e0af538f42cc3aab5d5b20f9915b2d0b1106..e864df978a6ed28cb240fab15c706638196fa472 100644 (file)
--- a/src/include/postmaster/bgworker_internals.h
+++ b/src/include/postmaster/bgworker_internals.h
@@ -46,11 +46,12 @@ extern slist_head BackgroundWorkerList;
  
  extern Size BackgroundWorkerShmemSize(void);
  extern void BackgroundWorkerShmemInit(void);
-extern void BackgroundWorkerStateChange(void);
+extern void BackgroundWorkerStateChange(bool allow_new_workers);
  extern void ForgetBackgroundWorker(slist_mutable_iter *cur);
  extern void ReportBackgroundWorkerPID(RegisteredBgWorker *);
  extern void ReportBackgroundWorkerExit(slist_mutable_iter *cur);
  extern void BackgroundWorkerStopNotifications(pid_t pid);
+extern void ForgetUnstartedBackgroundWorkers(void);
  extern void ResetBackgroundWorkerCrashTimes(void);
  
  /* Function to start a background worker, called from postmaster.c */
author	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 24 Dec 2020 22:00:43 +0000 (17:00 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Thu, 24 Dec 2020 22:00:43 +0000 (17:00 -0500)
contrib/pg_prewarm/autoprewarm.c		patch \| blob \| blame \| history
src/backend/postmaster/bgworker.c		patch \| blob \| blame \| history
src/backend/postmaster/postmaster.c		patch \| blob \| blame \| history
src/include/postmaster/bgworker_internals.h		patch \| blob \| blame \| history