Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 9ce346e

Browse files
committed
Report progress of startup operations that take a long time.
Users sometimes get concerned whe they start the server and it emits a few messages and then doesn't emit any more messages for a long time. Generally, what's happening is either that the system is taking a long time to apply WAL, or it's taking a long time to reset unlogged relations, or it's taking a long time to fsync the data directory, but it's not easy to tell which is the case. To fix that, add a new 'log_startup_progress_interval' setting, by default 10s. When an operation that is known to be potentially long-running takes more than this amount of time, we'll log a status update each time this interval elapses. To avoid undesirable log chatter, don't log anything about WAL replay when in standby mode. Nitin Jadhav and Robert Haas, reviewed by Amul Sul, Bharath Rupireddy, Justin Pryzby, Michael Paquier, and Álvaro Herrera. Discussion: https://postgr.es/m/CA+TgmoaHQrgDFOBwgY16XCoMtXxsrVGFB2jNCvb7-ubuEe1MGg@mail.gmail.com Discussion: https://postgr.es/m/CAMm1aWaHF7VE69572_OLQ+MgpT5RUiUDgF1x5RrtkJBLdpRj3Q@mail.gmail.com
1 parent 732e667 commit 9ce346e

File tree

10 files changed

+181
-3
lines changed

10 files changed

+181
-3
lines changed

doc/src/sgml/config.sgml

+24
Original file line numberDiff line numberDiff line change
@@ -6549,6 +6549,30 @@ local0.* /var/log/postgresql
65496549
</listitem>
65506550
</varlistentry>
65516551

6552+
<varlistentry id="guc-log-startup-progress-interval" xreflabel="log_startup_progress_interval">
6553+
<term><varname>log_startup_progress_interval</varname> (<type>integer</type>)
6554+
<indexterm>
6555+
<primary><varname>log_startup_progress_interval</varname> configuration parameter</primary>
6556+
</indexterm>
6557+
</term>
6558+
<listitem>
6559+
<para>
6560+
Sets the amount of time after which the startup process will log
6561+
a message about a long-running operation that is still in progress,
6562+
as well as the interval between further progress messages for that
6563+
operation. This setting is applied separately to each operation.
6564+
For example, if syncing the data directory takes 25 seconds and
6565+
thereafter resetting unlogged relations takes 8 seconds, and if this
6566+
setting has the default value of 10 seconds, then a messages will be
6567+
logged for syncing the data directory after it has been in progress
6568+
for 10 seconds and again after it has been in progress for 20 seconds,
6569+
but nothing will be logged for resetting unlogged operations.
6570+
A setting of <literal>0</literal> disables the feature. If this value
6571+
is specified without units, it is taken as milliseconds.
6572+
</para>
6573+
</listitem>
6574+
</varlistentry>
6575+
65526576
</variablelist>
65536577

65546578
<para>

src/backend/access/transam/xlog.c

+14
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@
7979
#include "utils/relmapper.h"
8080
#include "utils/pg_rusage.h"
8181
#include "utils/snapmgr.h"
82+
#include "utils/timeout.h"
8283
#include "utils/timestamp.h"
8384

8485
extern uint32 bootstrap_data_checksum_version;
@@ -6718,6 +6719,11 @@ StartupXLOG(void)
67186719
*/
67196720
ValidateXLOGDirectoryStructure();
67206721

6722+
/* Set up timeout handler needed to report startup progress. */
6723+
if (!IsBootstrapProcessingMode())
6724+
RegisterTimeout(STARTUP_PROGRESS_TIMEOUT,
6725+
startup_progress_timeout_handler);
6726+
67216727
/*----------
67226728
* If we previously crashed, perform a couple of actions:
67236729
*
@@ -7491,13 +7497,21 @@ StartupXLOG(void)
74917497
(errmsg("redo starts at %X/%X",
74927498
LSN_FORMAT_ARGS(ReadRecPtr))));
74937499

7500+
/* Prepare to report progress of the redo phase. */
7501+
if (!StandbyMode)
7502+
begin_startup_progress_phase();
7503+
74947504
/*
74957505
* main redo apply loop
74967506
*/
74977507
do
74987508
{
74997509
bool switchedTLI = false;
75007510

7511+
if (!StandbyMode)
7512+
ereport_startup_progress("redo in progress, elapsed time: %ld.%02d s, current LSN: %X/%X",
7513+
LSN_FORMAT_ARGS(ReadRecPtr));
7514+
75017515
#ifdef WAL_DEBUG
75027516
if (XLOG_DEBUG ||
75037517
(rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||

src/backend/nls.mk

+4-2
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@ GETTEXT_TRIGGERS = $(BACKEND_COMMON_GETTEXT_TRIGGERS) \
1010
yyerror \
1111
parser_yyerror \
1212
scanner_yyerror \
13-
report_invalid_record:2
13+
report_invalid_record:2 \
14+
ereport_startup_progress
1415
GETTEXT_FLAGS = $(BACKEND_COMMON_GETTEXT_FLAGS) \
1516
GUC_check_errmsg:1:c-format \
1617
GUC_check_errdetail:1:c-format \
1718
GUC_check_errhint:1:c-format \
1819
write_stderr:1:c-format \
19-
report_invalid_record:2:c-format
20+
report_invalid_record:2:c-format \
21+
ereport_startup_progress:1:c-format
2022

2123
gettext-files: distprep
2224
find $(srcdir) $(srcdir)/../common $(srcdir)/../port -name '*.c' -print | LC_ALL=C sort >$@

src/backend/postmaster/startup.c

+71
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,22 @@ static volatile sig_atomic_t promote_signaled = false;
5959
*/
6060
static volatile sig_atomic_t in_restore_command = false;
6161

62+
/*
63+
* Time at which the most recent startup operation started.
64+
*/
65+
static TimestampTz startup_progress_phase_start_time;
66+
67+
/*
68+
* Indicates whether the startup progress interval mentioned by the user is
69+
* elapsed or not. TRUE if timeout occurred, FALSE otherwise.
70+
*/
71+
static volatile sig_atomic_t startup_progress_timer_expired = false;
72+
73+
/*
74+
* Time between progress updates for long-running startup operations.
75+
*/
76+
int log_startup_progress_interval = 10000; /* 10 sec */
77+
6278
/* Signal handlers */
6379
static void StartupProcTriggerHandler(SIGNAL_ARGS);
6480
static void StartupProcSigHupHandler(SIGNAL_ARGS);
@@ -282,3 +298,58 @@ ResetPromoteSignaled(void)
282298
{
283299
promote_signaled = false;
284300
}
301+
302+
/*
303+
* Set a flag indicating that it's time to log a progress report.
304+
*/
305+
void
306+
startup_progress_timeout_handler(void)
307+
{
308+
startup_progress_timer_expired = true;
309+
}
310+
311+
/*
312+
* Set the start timestamp of the current operation and enable the timeout.
313+
*/
314+
void
315+
begin_startup_progress_phase(void)
316+
{
317+
TimestampTz fin_time;
318+
319+
/* Feature is disabled. */
320+
if (log_startup_progress_interval == 0)
321+
return;
322+
323+
startup_progress_phase_start_time = GetCurrentTimestamp();
324+
fin_time = TimestampTzPlusMilliseconds(startup_progress_phase_start_time,
325+
log_startup_progress_interval);
326+
enable_timeout_every(STARTUP_PROGRESS_TIMEOUT, fin_time,
327+
log_startup_progress_interval);
328+
}
329+
330+
/*
331+
* Report whether startup progress timeout has occurred. Reset the timer flag
332+
* if it did, set the elapsed time to the out parameters and return true,
333+
* otherwise return false.
334+
*/
335+
bool
336+
has_startup_progress_timeout_expired(long *secs, int *usecs)
337+
{
338+
long seconds;
339+
int useconds;
340+
TimestampTz now;
341+
342+
/* No timeout has occurred. */
343+
if (!startup_progress_timer_expired)
344+
return false;
345+
346+
/* Calculate the elapsed time. */
347+
now = GetCurrentTimestamp();
348+
TimestampDifference(startup_progress_phase_start_time, now, &seconds, &useconds);
349+
350+
*secs = seconds;
351+
*usecs = useconds;
352+
startup_progress_timer_expired = false;
353+
354+
return true;
355+
}

src/backend/storage/file/fd.c

+20-1
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
#include "pgstat.h"
9797
#include "port/pg_iovec.h"
9898
#include "portability/mem.h"
99+
#include "postmaster/startup.h"
99100
#include "storage/fd.h"
100101
#include "storage/ipc.h"
101102
#include "utils/guc.h"
@@ -3381,6 +3382,9 @@ do_syncfs(const char *path)
33813382
{
33823383
int fd;
33833384

3385+
ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
3386+
path);
3387+
33843388
fd = OpenTransientFile(path, O_RDONLY);
33853389
if (fd < 0)
33863390
{
@@ -3465,6 +3469,9 @@ SyncDataDirectory(void)
34653469
* directories.
34663470
*/
34673471

3472+
/* Prepare to report progress syncing the data directory via syncfs. */
3473+
begin_startup_progress_phase();
3474+
34683475
/* Sync the top level pgdata directory. */
34693476
do_syncfs(".");
34703477
/* If any tablespaces are configured, sync each of those. */
@@ -3487,18 +3494,24 @@ SyncDataDirectory(void)
34873494
}
34883495
#endif /* !HAVE_SYNCFS */
34893496

3497+
#ifdef PG_FLUSH_DATA_WORKS
3498+
/* Prepare to report progress of the pre-fsync phase. */
3499+
begin_startup_progress_phase();
3500+
34903501
/*
34913502
* If possible, hint to the kernel that we're soon going to fsync the data
34923503
* directory and its contents. Errors in this step are even less
34933504
* interesting than normal, so log them only at DEBUG1.
34943505
*/
3495-
#ifdef PG_FLUSH_DATA_WORKS
34963506
walkdir(".", pre_sync_fname, false, DEBUG1);
34973507
if (xlog_is_symlink)
34983508
walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
34993509
walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
35003510
#endif
35013511

3512+
/* Prepare to report progress syncing the data directory via fsync. */
3513+
begin_startup_progress_phase();
3514+
35023515
/*
35033516
* Now we do the fsync()s in the same order.
35043517
*
@@ -3601,6 +3614,9 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
36013614
if (isdir)
36023615
return;
36033616

3617+
ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
3618+
fname);
3619+
36043620
fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
36053621

36063622
if (fd < 0)
@@ -3630,6 +3646,9 @@ pre_sync_fname(const char *fname, bool isdir, int elevel)
36303646
static void
36313647
datadir_fsync_fname(const char *fname, bool isdir, int elevel)
36323648
{
3649+
ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
3650+
fname);
3651+
36333652
/*
36343653
* We want to silently ignoring errors about unreadable files. Pass that
36353654
* desire on to fsync_fname_ext().

src/backend/storage/file/reinit.c

+12
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <unistd.h>
1818

1919
#include "common/relpath.h"
20+
#include "postmaster/startup.h"
2021
#include "storage/copydir.h"
2122
#include "storage/fd.h"
2223
#include "storage/reinit.h"
@@ -65,6 +66,9 @@ ResetUnloggedRelations(int op)
6566
ALLOCSET_DEFAULT_SIZES);
6667
oldctx = MemoryContextSwitchTo(tmpctx);
6768

69+
/* Prepare to report progress resetting unlogged relations. */
70+
begin_startup_progress_phase();
71+
6872
/*
6973
* First process unlogged files in pg_default ($PGDATA/base)
7074
*/
@@ -136,6 +140,14 @@ ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
136140

137141
snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
138142
tsdirname, de->d_name);
143+
144+
if (op & UNLOGGED_RELATION_INIT)
145+
ereport_startup_progress("resetting unlogged relations (init), elapsed time: %ld.%02d s, current path: %s",
146+
dbspace_path);
147+
else if (op & UNLOGGED_RELATION_CLEANUP)
148+
ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s",
149+
dbspace_path);
150+
139151
ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
140152
}
141153

src/backend/utils/misc/guc.c

+13
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@
7171
#include "postmaster/bgworker_internals.h"
7272
#include "postmaster/bgwriter.h"
7373
#include "postmaster/postmaster.h"
74+
#include "postmaster/startup.h"
7475
#include "postmaster/syslogger.h"
7576
#include "postmaster/walwriter.h"
7677
#include "replication/logicallauncher.h"
@@ -3570,6 +3571,18 @@ static struct config_int ConfigureNamesInt[] =
35703571
check_client_connection_check_interval, NULL, NULL
35713572
},
35723573

3574+
{
3575+
{"log_startup_progress_interval", PGC_SIGHUP, LOGGING_WHEN,
3576+
gettext_noop("Time between progress updates for "
3577+
"long-running startup operations."),
3578+
gettext_noop("0 turns this feature off."),
3579+
GUC_UNIT_MS,
3580+
},
3581+
&log_startup_progress_interval,
3582+
10000, 0, INT_MAX,
3583+
NULL, NULL, NULL
3584+
},
3585+
35733586
/* End-of-list marker */
35743587
{
35753588
{NULL, 0, 0, NULL, NULL}, NULL, 0, 0, 0, NULL, NULL, NULL

src/backend/utils/misc/postgresql.conf.sample

+5
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,11 @@
523523
# are logged regardless of their duration; 1.0 logs all
524524
# statements from all transactions, 0.0 never logs
525525

526+
#log_startup_progress_interval = 10s # Time between progress updates for
527+
# long-running startup operations.
528+
# 0 disables the feature, > 0 indicates
529+
# the interval in milliseconds.
530+
526531
# - What to Log -
527532

528533
#debug_print_parse = off

src/include/postmaster/startup.h

+17
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,28 @@
1212
#ifndef _STARTUP_H
1313
#define _STARTUP_H
1414

15+
/*
16+
* Log the startup progress message if a timer has expired.
17+
*/
18+
#define ereport_startup_progress(msg, ...) \
19+
do { \
20+
long secs; \
21+
int usecs; \
22+
if (has_startup_progress_timeout_expired(&secs, &usecs)) \
23+
ereport(LOG, errmsg(msg, secs, (usecs / 10000), __VA_ARGS__ )); \
24+
} while(0)
25+
26+
extern int log_startup_progress_interval;
27+
1528
extern void HandleStartupProcInterrupts(void);
1629
extern void StartupProcessMain(void) pg_attribute_noreturn();
1730
extern void PreRestoreCommand(void);
1831
extern void PostRestoreCommand(void);
1932
extern bool IsPromoteSignaled(void);
2033
extern void ResetPromoteSignaled(void);
2134

35+
extern void begin_startup_progress_phase(void);
36+
extern void startup_progress_timeout_handler(void);
37+
extern bool has_startup_progress_timeout_expired(long *secs, int *usecs);
38+
2239
#endif /* _STARTUP_H */

src/include/utils/timeout.h

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ typedef enum TimeoutId
3333
IDLE_IN_TRANSACTION_SESSION_TIMEOUT,
3434
IDLE_SESSION_TIMEOUT,
3535
CLIENT_CONNECTION_CHECK_TIMEOUT,
36+
STARTUP_PROGRESS_TIMEOUT,
3637
/* First user-definable timeout reason */
3738
USER_TIMEOUT,
3839
/* Maximum number of timeout reasons */

0 commit comments

Comments
 (0)