@@ -366,6 +366,10 @@ start_postmaster(void)
366
366
/*
367
367
* Since there might be quotes to handle here, it is easier simply to pass
368
368
* everything to a shell to process them.
369
+ *
370
+ * XXX it would be better to fork and exec so that we would know the
371
+ * child postmaster's PID directly; then test_postmaster_connection could
372
+ * use the PID without having to rely on reading it back from the pidfile.
369
373
*/
370
374
if (log_file != NULL )
371
375
snprintf (cmd , MAXPGPATH , SYSTEMQUOTE "\"%s\" %s%s < \"%s\" >> \"%s\" 2>&1 &" SYSTEMQUOTE ,
@@ -412,6 +416,8 @@ static PGPing
412
416
test_postmaster_connection (bool do_checkpoint )
413
417
{
414
418
PGPing ret = PQPING_NO_RESPONSE ;
419
+ bool found_stale_pidfile = false;
420
+ pgpid_t pm_pid = 0 ;
415
421
char connstr [MAXPGPATH * 2 + 256 ];
416
422
int i ;
417
423
@@ -458,78 +464,94 @@ test_postmaster_connection(bool do_checkpoint)
458
464
if (optlines [3 ] == NULL )
459
465
{
460
466
/* File is exactly three lines, must be pre-9.1 */
461
- write_stderr (_ ("%s: -w option is not supported when starting a pre-9.1 server\n" ),
467
+ write_stderr (_ ("\n %s: -w option is not supported when starting a pre-9.1 server\n" ),
462
468
progname );
463
469
return PQPING_NO_ATTEMPT ;
464
470
}
465
471
else if (optlines [4 ] != NULL &&
466
472
optlines [5 ] != NULL )
467
473
{
468
474
/* File is complete enough for us, parse it */
475
+ long pmpid ;
469
476
time_t pmstart ;
470
- int portnum ;
471
- char * sockdir ;
472
- char * hostaddr ;
473
- char host_str [MAXPGPATH ];
474
477
475
478
/*
476
- * Easy cross-check that we are looking at the right data
477
- * directory: is the postmaster older than this execution
478
- * of pg_ctl? Subtract 2 seconds to allow for possible
479
- * clock skew between pg_ctl and the postmaster.
479
+ * Make sanity checks. If it's for a standalone backend
480
+ * (negative PID), or the recorded start time is before
481
+ * pg_ctl started, then either we are looking at the wrong
482
+ * data directory, or this is a pre-existing pidfile that
483
+ * hasn't (yet?) been overwritten by our child postmaster.
484
+ * Allow 2 seconds slop for possible cross-process clock
485
+ * skew.
480
486
*/
487
+ pmpid = atol (optlines [LOCK_FILE_LINE_PID - 1 ]);
481
488
pmstart = atol (optlines [LOCK_FILE_LINE_START_TIME - 1 ]);
482
- if (pmstart < start_time - 2 )
489
+ if (pmpid <= 0 || pmstart < start_time - 2 )
483
490
{
484
- write_stderr (_ ("%s: this data directory is running a pre-existing postmaster\n" ),
485
- progname );
486
- return PQPING_NO_ATTEMPT ;
491
+ /*
492
+ * Set flag to report stale pidfile if it doesn't
493
+ * get overwritten before we give up waiting.
494
+ */
495
+ found_stale_pidfile = true;
487
496
}
497
+ else
498
+ {
499
+ /*
500
+ * OK, seems to be a valid pidfile from our child.
501
+ */
502
+ int portnum ;
503
+ char * sockdir ;
504
+ char * hostaddr ;
505
+ char host_str [MAXPGPATH ];
488
506
489
- /*
490
- * OK, extract port number and host string to use. Prefer
491
- * using Unix socket if available.
492
- */
493
- portnum = atoi (optlines [LOCK_FILE_LINE_PORT - 1 ]);
507
+ found_stale_pidfile = false;
508
+ pm_pid = (pgpid_t ) pmpid ;
494
509
495
- sockdir = optlines [LOCK_FILE_LINE_SOCKET_DIR - 1 ];
496
- hostaddr = optlines [LOCK_FILE_LINE_LISTEN_ADDR - 1 ];
510
+ /*
511
+ * Extract port number and host string to use. Prefer
512
+ * using Unix socket if available.
513
+ */
514
+ portnum = atoi (optlines [LOCK_FILE_LINE_PORT - 1 ]);
515
+ sockdir = optlines [LOCK_FILE_LINE_SOCKET_DIR - 1 ];
516
+ hostaddr = optlines [LOCK_FILE_LINE_LISTEN_ADDR - 1 ];
497
517
498
- /*
499
- * While unix_socket_directory can accept relative
500
- * directories, libpq's host parameter must have a leading
501
- * slash to indicate a socket directory. So, ignore
502
- * sockdir if it's relative, and try to use TCP instead.
503
- */
504
- if (sockdir [0 ] == '/' )
505
- strlcpy (host_str , sockdir , sizeof (host_str ));
506
- else
507
- strlcpy (host_str , hostaddr , sizeof (host_str ));
518
+ /*
519
+ * While unix_socket_directory can accept relative
520
+ * directories, libpq's host parameter must have a
521
+ * leading slash to indicate a socket directory. So,
522
+ * ignore sockdir if it's relative, and try to use TCP
523
+ * instead.
524
+ */
525
+ if (sockdir [0 ] == '/' )
526
+ strlcpy (host_str , sockdir , sizeof (host_str ));
527
+ else
528
+ strlcpy (host_str , hostaddr , sizeof (host_str ));
508
529
509
- /* remove trailing newline */
510
- if (strchr (host_str , '\n' ) != NULL )
511
- * strchr (host_str , '\n' ) = '\0' ;
530
+ /* remove trailing newline */
531
+ if (strchr (host_str , '\n' ) != NULL )
532
+ * strchr (host_str , '\n' ) = '\0' ;
512
533
513
- /* Fail if we couldn't get either sockdir or host addr */
514
- if (host_str [0 ] == '\0' )
515
- {
516
- write_stderr (_ ("%s: -w option cannot use a relative socket directory specification\n" ),
517
- progname );
518
- return PQPING_NO_ATTEMPT ;
519
- }
534
+ /* Fail if couldn't get either sockdir or host addr */
535
+ if (host_str [0 ] == '\0' )
536
+ {
537
+ write_stderr (_ ("\n %s: -w option cannot use a relative socket directory specification\n" ),
538
+ progname );
539
+ return PQPING_NO_ATTEMPT ;
540
+ }
520
541
521
- /* If postmaster is listening on "*", use " localhost" */
522
- if (strcmp (host_str , "*" ) == 0 )
523
- strcpy (host_str , "localhost" );
542
+ /* If postmaster is listening on "*", use localhost */
543
+ if (strcmp (host_str , "*" ) == 0 )
544
+ strcpy (host_str , "localhost" );
524
545
525
- /*
526
- * We need to set connect_timeout otherwise on Windows the
527
- * Service Control Manager (SCM) will probably timeout
528
- * first.
529
- */
530
- snprintf (connstr , sizeof (connstr ),
531
- "dbname=postgres port=%d host='%s' connect_timeout=5" ,
532
- portnum , host_str );
546
+ /*
547
+ * We need to set connect_timeout otherwise on Windows
548
+ * the Service Control Manager (SCM) will probably
549
+ * timeout first.
550
+ */
551
+ snprintf (connstr , sizeof (connstr ),
552
+ "dbname=postgres port=%d host='%s' connect_timeout=5" ,
553
+ portnum , host_str );
554
+ }
533
555
}
534
556
}
535
557
}
@@ -545,16 +567,36 @@ test_postmaster_connection(bool do_checkpoint)
545
567
/*
546
568
* The postmaster should create postmaster.pid very soon after being
547
569
* started. If it's not there after we've waited 5 or more seconds,
548
- * assume startup failed and give up waiting.
570
+ * assume startup failed and give up waiting. (Note this covers
571
+ * both cases where the pidfile was never created, and where it was
572
+ * created and then removed during postmaster exit.) Also, if there
573
+ * *is* a file there but it appears stale, issue a suitable warning
574
+ * and give up waiting.
549
575
*/
550
576
if (i >= 5 )
551
577
{
552
578
struct stat statbuf ;
553
579
554
580
if (stat (pid_file , & statbuf ) != 0 )
555
581
return PQPING_NO_RESPONSE ;
582
+
583
+ if (found_stale_pidfile )
584
+ {
585
+ write_stderr (_ ("\n%s: this data directory appears to be running a pre-existing postmaster\n" ),
586
+ progname );
587
+ return PQPING_NO_RESPONSE ;
588
+ }
556
589
}
557
590
591
+ /*
592
+ * If we've been able to identify the child postmaster's PID, check
593
+ * the process is still alive. This covers cases where the postmaster
594
+ * successfully created the pidfile but then crashed without removing
595
+ * it.
596
+ */
597
+ if (pm_pid > 0 && !postmaster_is_alive ((pid_t ) pm_pid ))
598
+ return PQPING_NO_RESPONSE ;
599
+
558
600
/* No response, or startup still in process; wait */
559
601
#if defined(WIN32 )
560
602
if (do_checkpoint )
@@ -715,7 +757,6 @@ do_init(void)
715
757
static void
716
758
do_start (void )
717
759
{
718
- pgpid_t pid ;
719
760
pgpid_t old_pid = 0 ;
720
761
int exitcode ;
721
762
@@ -765,19 +806,6 @@ do_start(void)
765
806
exit (1 );
766
807
}
767
808
768
- if (old_pid != 0 )
769
- {
770
- pg_usleep (1000000 );
771
- pid = get_pgpid ();
772
- if (pid == old_pid )
773
- {
774
- write_stderr (_ ("%s: could not start server\n"
775
- "Examine the log output.\n" ),
776
- progname );
777
- exit (1 );
778
- }
779
- }
780
-
781
809
if (do_wait )
782
810
{
783
811
print_msg (_ ("waiting for server to start..." ));
0 commit comments