@@ -412,6 +412,7 @@ static void TerminateChildren(int signal);
412
412
/*
 * Shorthand for SignalSomeChildren() with BACKEND_TYPE_ALL as the target.
 * The parameter list must follow the macro name with no intervening space;
 * otherwise the preprocessor defines an object-like macro instead.
 */
#define SignalChildren(sig) SignalSomeChildren(sig, BACKEND_TYPE_ALL)
413
413
414
414
static int CountChildren (int target );
415
+ static bool assign_backendlist_entry (RegisteredBgWorker * rw );
415
416
static void maybe_start_bgworker (void );
416
417
static bool CreateOptsFile (int argc , char * argv [], char * fullprogname );
417
418
static pid_t StartChildProcess (AuxProcType type );
@@ -5491,13 +5492,33 @@ bgworker_forkexec(int shmem_slot)
5491
5492
* Start a new bgworker.
5492
5493
* Starting time conditions must have been checked already.
5493
5494
*
5495
+ * Returns true on success, false on failure.
5496
+ * In either case, update the RegisteredBgWorker's state appropriately.
5497
+ *
5494
5498
* This code is heavily based on autovacuum.c, q.v.
5495
5499
*/
5496
- static void
5500
+ static bool
5497
5501
do_start_bgworker (RegisteredBgWorker * rw )
5498
5502
{
5499
5503
pid_t worker_pid ;
5500
5504
5505
+ Assert (rw -> rw_pid == 0 );
5506
+
5507
+ /*
5508
+ * Allocate and assign the Backend element. Note we must do this before
5509
+ * forking, so that we can handle out of memory properly.
5510
+ *
5511
+ * Treat failure as though the worker had crashed. That way, the
5512
+ * postmaster will wait a bit before attempting to start it again; if it
5513
+ * tried again right away, most likely it'd find itself repeating the
5514
+ * out-of-memory or fork failure condition.
5515
+ */
5516
+ if (!assign_backendlist_entry (rw ))
5517
+ {
5518
+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5519
+ return false;
5520
+ }
5521
+
5501
5522
ereport (DEBUG1 ,
5502
5523
(errmsg ("starting background worker process \"%s\"" ,
5503
5524
rw -> rw_worker .bgw_name )));
@@ -5509,9 +5530,17 @@ do_start_bgworker(RegisteredBgWorker *rw)
5509
5530
#endif
5510
5531
{
5511
5532
case -1 :
5533
+ /* in postmaster, fork failed ... */
5512
5534
ereport (LOG ,
5513
5535
(errmsg ("could not fork worker process: %m" )));
5514
- return ;
5536
+ /* undo what assign_backendlist_entry did */
5537
+ ReleasePostmasterChildSlot (rw -> rw_child_slot );
5538
+ rw -> rw_child_slot = 0 ;
5539
+ free (rw -> rw_backend );
5540
+ rw -> rw_backend = NULL ;
5541
+ /* mark entry as crashed, so we'll try again later */
5542
+ rw -> rw_crashed_at = GetCurrentTimestamp ();
5543
+ break ;
5515
5544
5516
5545
#ifndef EXEC_BACKEND
5517
5546
case 0 :
@@ -5535,14 +5564,24 @@ do_start_bgworker(RegisteredBgWorker *rw)
5535
5564
PostmasterContext = NULL ;
5536
5565
5537
5566
StartBackgroundWorker ();
5567
+
5568
+ exit (1 ); /* should not get here */
5538
5569
break ;
5539
5570
#endif
5540
5571
default :
5572
+ /* in postmaster, fork successful ... */
5541
5573
rw -> rw_pid = worker_pid ;
5542
5574
rw -> rw_backend -> pid = rw -> rw_pid ;
5543
5575
ReportBackgroundWorkerPID (rw );
5544
- break ;
5576
+ /* add new worker to lists of backends */
5577
+ dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5578
+ #ifdef EXEC_BACKEND
5579
+ ShmemBackendArrayAdd (rw -> rw_backend );
5580
+ #endif
5581
+ return true;
5545
5582
}
5583
+
5584
+ return false;
5546
5585
}
5547
5586
5548
5587
/*
@@ -5589,6 +5628,8 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
5589
5628
* Allocate the Backend struct for a connected background worker, but don't
5590
5629
* add it to the list of backends just yet.
5591
5630
*
5631
+ * On failure, return false without changing any worker state.
5632
+ *
5592
5633
* Some info from the Backend is copied into the passed rw.
5593
5634
*/
5594
5635
static bool
@@ -5601,14 +5642,6 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5601
5642
ereport (LOG ,
5602
5643
(errcode (ERRCODE_OUT_OF_MEMORY ),
5603
5644
errmsg ("out of memory" )));
5604
-
5605
- /*
5606
- * The worker didn't really crash, but setting this nonzero makes
5607
- * postmaster wait a bit before attempting to start it again; if it
5608
- * tried again right away, most likely it'd find itself under the same
5609
- * memory pressure.
5610
- */
5611
- rw -> rw_crashed_at = GetCurrentTimestamp ();
5612
5645
return false;
5613
5646
}
5614
5647
@@ -5638,20 +5671,31 @@ assign_backendlist_entry(RegisteredBgWorker *rw)
5638
5671
* As a side effect, the bgworker control variables are set or reset whenever
5639
5672
* there are more workers to start after this one, and whenever the overall
5640
5673
* system state requires it.
5674
+ *
5675
+ * The reason we start at most one worker per call is to avoid consuming the
5676
+ * postmaster's attention for too long when many such requests are pending.
5677
+ * As long as StartWorkerNeeded is true, ServerLoop will not block and will
5678
+ * call this function again after dealing with any other issues.
5641
5679
*/
5642
5680
static void
5643
5681
maybe_start_bgworker (void )
5644
5682
{
5645
5683
slist_mutable_iter iter ;
5646
5684
TimestampTz now = 0 ;
5647
5685
5686
+ /*
5687
+ * During crash recovery, we have no need to be called until the state
5688
+ * transition out of recovery.
5689
+ */
5648
5690
if (FatalError )
5649
5691
{
5650
5692
StartWorkerNeeded = false;
5651
5693
HaveCrashedWorker = false;
5652
- return ; /* not yet */
5694
+ return ;
5653
5695
}
5654
5696
5697
+ /* Don't need to be called again unless we find a reason for it below */
5698
+ StartWorkerNeeded = false;
5655
5699
HaveCrashedWorker = false;
5656
5700
5657
5701
slist_foreach_modify (iter , & BackgroundWorkerList )
@@ -5660,11 +5704,11 @@ maybe_start_bgworker(void)
5660
5704
5661
5705
rw = slist_container (RegisteredBgWorker , rw_lnode , iter .cur );
5662
5706
5663
- /* already running? */
5707
+ /* ignore if already running */
5664
5708
if (rw -> rw_pid != 0 )
5665
5709
continue ;
5666
5710
5667
- /* marked for death? */
5711
+ /* if marked for death, clean up and remove from list */
5668
5712
if (rw -> rw_terminate )
5669
5713
{
5670
5714
ForgetBackgroundWorker (& iter );
@@ -5686,48 +5730,50 @@ maybe_start_bgworker(void)
5686
5730
continue ;
5687
5731
}
5688
5732
5733
+ /* read system time only when needed */
5689
5734
if (now == 0 )
5690
5735
now = GetCurrentTimestamp ();
5691
5736
5692
5737
if (!TimestampDifferenceExceeds (rw -> rw_crashed_at , now ,
5693
5738
rw -> rw_worker .bgw_restart_time * 1000 ))
5694
5739
{
5740
+ /* Set flag to remember that we have workers to start later */
5695
5741
HaveCrashedWorker = true;
5696
5742
continue ;
5697
5743
}
5698
5744
}
5699
5745
5700
5746
if (bgworker_should_start_now (rw -> rw_worker .bgw_start_time ))
5701
5747
{
5702
- /* reset crash time before calling assign_backendlist_entry */
5748
+ /* reset crash time before trying to start worker */
5703
5749
rw -> rw_crashed_at = 0 ;
5704
5750
5705
5751
/*
5706
- * Allocate and assign the Backend element. Note we must do this
5707
- * before forking, so that we can handle out of memory properly.
5752
+ * Try to start the worker.
5753
+ *
5754
+ * On failure, give up processing workers for now, but set
5755
+ * StartWorkerNeeded so we'll come back here on the next iteration
5756
+ * of ServerLoop to try again. (We don't want to wait, because
5757
+ * there might be additional ready-to-run workers.) We could set
5758
+ * HaveCrashedWorker as well, since this worker is now marked
5759
+ * crashed, but there's no need because the next run of this
5760
+ * function will do that.
5708
5761
*/
5709
- if (!assign_backendlist_entry (rw ))
5762
+ if (!do_start_bgworker (rw ))
5763
+ {
5764
+ StartWorkerNeeded = true;
5710
5765
return ;
5711
-
5712
- do_start_bgworker (rw ); /* sets rw->rw_pid */
5713
-
5714
- dlist_push_head (& BackendList , & rw -> rw_backend -> elem );
5715
- #ifdef EXEC_BACKEND
5716
- ShmemBackendArrayAdd (rw -> rw_backend );
5717
- #endif
5766
+ }
5718
5767
5719
5768
/*
5720
- * Have ServerLoop call us again. Note that there might not
5721
- * actually *be* another runnable worker , but we don't care all
5722
- * that much; we will find out the next time we run.
5769
+ * Quit, but have ServerLoop call us again to look for additional
5770
+ * ready-to-run workers. There might not be any, but we'll find
5771
+ * out the next time we run.
5723
5772
*/
5724
5773
StartWorkerNeeded = true;
5725
5774
return ;
5726
5775
}
5727
5776
}
5728
-
5729
- /* no runnable worker found */
5730
- StartWorkerNeeded = false;
5731
5777
}
5732
5778
5733
5779
/*
0 commit comments