Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit d4e71df

Browse files
committed
Add io_direct setting (developer-only).
Provide a way to ask the kernel to use O_DIRECT (or the local equivalent) where available for data and WAL files, to avoid or minimize kernel caching. This hurts performance currently and is not intended for end users yet. Later proposed work would introduce our own I/O clustering, read-ahead, etc., to replace the facilities the kernel disables with this option. The only user-visible change, if the developer-only GUC is not used, is that this commit also removes the obscure logic that would activate O_DIRECT for the WAL when wal_sync_method=open_[data]sync and wal_level=minimal (which also requires max_wal_senders=0). Those are non-default and unlikely settings, and this behavior wasn't (correctly) documented. The same effect can be achieved with io_direct=wal. Author: Thomas Munro <thomas.munro@gmail.com> Author: Andres Freund <andres@anarazel.de> Author: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Reviewed-by: Justin Pryzby <pryzby@telsasoft.com> Reviewed-by: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Discussion: https://postgr.es/m/CA%2BhUKGK1X532hYqJ_MzFWt0n1zt8trz980D79WbjwnT-yYLZpg%40mail.gmail.com
1 parent faeedbc commit d4e71df

File tree

14 files changed

+263
-35
lines changed

14 files changed

+263
-35
lines changed

doc/src/sgml/config.sgml

+32-1
Original file line numberDiff line numberDiff line change
@@ -3172,7 +3172,6 @@ include_dir 'conf.d'
31723172
</listitem>
31733173
</itemizedlist>
31743174
<para>
3175-
The <literal>open_</literal>* options also use <literal>O_DIRECT</literal> if available.
31763175
Not all of these choices are available on all platforms.
31773176
The default is the first method in the above list that is supported
31783177
by the platform, except that <literal>fdatasync</literal> is the default on
@@ -11256,6 +11255,38 @@ dynamic_library_path = 'C:\tools\postgresql;H:\my_project\lib;$libdir'
1125611255
</listitem>
1125711256
</varlistentry>
1125811257

11258+
<varlistentry id="guc-io-direct" xreflabel="io_direct">
11259+
<term><varname>io_direct</varname> (<type>string</type>)
11260+
<indexterm>
11261+
<primary><varname>io_direct</varname> configuration parameter</primary>
11262+
</indexterm>
11263+
</term>
11264+
<listitem>
11265+
<para>
11266+
Ask the kernel to minimize caching effects for relation data and WAL
11267+
files using <literal>O_DIRECT</literal> (most Unix-like systems),
11268+
<literal>F_NOCACHE</literal> (macOS) or
11269+
<literal>FILE_FLAG_NO_BUFFERING</literal> (Windows).
11270+
</para>
11271+
<para>
11272+
May be set to an empty string (the default) to disable use of direct
11273+
I/O, or a comma-separated list of operations that should use direct I/O.
11274+
The valid options are <literal>data</literal> for
11275+
main data files, <literal>wal</literal> for WAL files, and
11276+
<literal>wal_init</literal> for WAL files when being initially
11277+
allocated.
11278+
</para>
11279+
<para>
11280+
Some operating systems and file systems do not support direct I/O, so
11281+
non-default settings may be rejected at startup or cause errors.
11282+
</para>
11283+
<para>
11284+
Currently this feature reduces performance, and is intended for
11285+
developer testing only.
11286+
</para>
11287+
</listitem>
11288+
</varlistentry>
11289+
1125911290
<varlistentry id="guc-post-auth-delay" xreflabel="post_auth_delay">
1126011291
<term><varname>post_auth_delay</varname> (<type>integer</type>)
1126111292
<indexterm>

src/backend/access/transam/xlog.c

+16-21
Original file line numberDiff line numberDiff line change
@@ -2926,6 +2926,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
29262926
XLogSegNo max_segno;
29272927
int fd;
29282928
int save_errno;
2929+
int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
29292930

29302931
Assert(logtli != 0);
29312932

@@ -2959,8 +2960,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
29592960

29602961
unlink(tmppath);
29612962

2963+
if (io_direct_flags & IO_DIRECT_WAL_INIT)
2964+
open_flags |= PG_O_DIRECT;
2965+
29622966
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
2963-
fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
2967+
fd = BasicOpenFile(tmppath, open_flags);
29642968
if (fd < 0)
29652969
ereport(ERROR,
29662970
(errcode_for_file_access(),
@@ -3354,7 +3358,7 @@ XLogFileClose(void)
33543358
* use the cache to read the WAL segment.
33553359
*/
33563360
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3357-
if (!XLogIsNeeded())
3361+
if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
33583362
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
33593363
#endif
33603364

@@ -4445,7 +4449,6 @@ show_in_hot_standby(void)
44454449
return RecoveryInProgress() ? "on" : "off";
44464450
}
44474451

4448-
44494452
/*
44504453
* Read the control file, set respective GUCs.
44514454
*
@@ -8029,35 +8032,27 @@ xlog_redo(XLogReaderState *record)
80298032
}
80308033

80318034
/*
8032-
* Return the (possible) sync flag used for opening a file, depending on the
8033-
* value of the GUC wal_sync_method.
8035+
* Return the extra open flags used for opening a file, depending on the
8036+
* value of the GUCs wal_sync_method, fsync and io_direct.
80348037
*/
80358038
static int
80368039
get_sync_bit(int method)
80378040
{
80388041
int o_direct_flag = 0;
80398042

8040-
/* If fsync is disabled, never open in sync mode */
8041-
if (!enableFsync)
8042-
return 0;
8043-
80448043
/*
8045-
* Optimize writes by bypassing kernel cache with O_DIRECT when using
8046-
* O_SYNC and O_DSYNC. But only if archiving and streaming are disabled,
8047-
* otherwise the archive command or walsender process will read the WAL
8048-
* soon after writing it, which is guaranteed to cause a physical read if
8049-
* we bypassed the kernel cache. We also skip the
8050-
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8051-
* reason.
8052-
*
8053-
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8044+
* Use O_DIRECT if requested, except in walreceiver process. The WAL
80548045
* written by walreceiver is normally read by the startup process soon
8055-
* after it's written. Also, walreceiver performs unaligned writes, which
8046+
* after it's written. Also, walreceiver performs unaligned writes, which
80568047
* don't work with O_DIRECT, so it is required for correctness too.
80578048
*/
8058-
if (!XLogIsNeeded() && !AmWalReceiverProcess())
8049+
if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
80598050
o_direct_flag = PG_O_DIRECT;
80608051

8052+
/* If fsync is disabled, never open in sync mode */
8053+
if (!enableFsync)
8054+
return o_direct_flag;
8055+
80618056
switch (method)
80628057
{
80638058
/*
@@ -8069,7 +8064,7 @@ get_sync_bit(int method)
80698064
case SYNC_METHOD_FSYNC:
80708065
case SYNC_METHOD_FSYNC_WRITETHROUGH:
80718066
case SYNC_METHOD_FDATASYNC:
8072-
return 0;
8067+
return o_direct_flag;
80738068
#ifdef O_SYNC
80748069
case SYNC_METHOD_OPEN:
80758070
return O_SYNC | o_direct_flag;

src/backend/access/transam/xlogprefetcher.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
785785
block->prefetch_buffer = InvalidBuffer;
786786
return LRQ_NEXT_IO;
787787
}
788-
else
788+
else if ((io_direct_flags & IO_DIRECT_DATA) == 0)
789789
{
790790
/*
791791
* This shouldn't be possible, because we already determined

src/backend/storage/buffer/bufmgr.c

+11-5
Original file line numberDiff line numberDiff line change
@@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
541541
* Try to initiate an asynchronous read. This returns false in
542542
* recovery if the relation file doesn't exist.
543543
*/
544-
if (smgrprefetch(smgr_reln, forkNum, blockNum))
544+
if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
545+
smgrprefetch(smgr_reln, forkNum, blockNum))
546+
{
545547
result.initiated_io = true;
548+
}
546549
#endif /* USE_PREFETCH */
547550
}
548551
else
@@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
588591
* the kernel and therefore didn't really initiate I/O, and no way to know when
589592
* the I/O completes other than using synchronous ReadBuffer().
590593
*
591-
* 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
594+
* 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
592595
* USE_PREFETCH is not defined (this build doesn't support prefetching due to
593-
* lack of a kernel facility), or the underlying relation file wasn't found and
594-
* we are in recovery. (If the relation file wasn't found and we are not in
595-
* recovery, an error is raised).
596+
* lack of a kernel facility), direct I/O is enabled, or the underlying
597+
* relation file wasn't found and we are in recovery. (If the relation file
598+
* wasn't found and we are not in recovery, an error is raised).
596599
*/
597600
PrefetchBufferResult
598601
PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
@@ -5440,6 +5443,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
54405443
{
54415444
PendingWriteback *pending;
54425445

5446+
if (io_direct_flags & IO_DIRECT_DATA)
5447+
return;
5448+
54435449
/*
54445450
* Add buffer to the pending writeback array, unless writeback control is
54455451
* disabled.

src/backend/storage/buffer/localbuf.c

+5-2
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
9292
{
9393
#ifdef USE_PREFETCH
9494
/* Not in buffers, so initiate prefetch */
95-
smgrprefetch(smgr, forkNum, blockNum);
96-
result.initiated_io = true;
95+
if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
96+
smgrprefetch(smgr, forkNum, blockNum))
97+
{
98+
result.initiated_io = true;
99+
}
97100
#endif /* USE_PREFETCH */
98101
}
99102

src/backend/storage/file/fd.c

+98
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,9 @@
9898
#include "storage/fd.h"
9999
#include "storage/ipc.h"
100100
#include "utils/guc.h"
101+
#include "utils/guc_hooks.h"
101102
#include "utils/resowner_private.h"
103+
#include "utils/varlena.h"
102104

103105
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
104106
#if defined(HAVE_SYNC_FILE_RANGE)
@@ -162,6 +164,9 @@ bool data_sync_retry = false;
162164
/* How SyncDataDirectory() should do its job. */
163165
int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
164166

167+
/* Which kinds of files should be opened with PG_O_DIRECT. */
168+
int io_direct_flags;
169+
165170
/* Debugging.... */
166171

167172
#ifdef FDDEBUG
@@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
20222027
if (nbytes <= 0)
20232028
return;
20242029

2030+
if (VfdCache[file].fileFlags & PG_O_DIRECT)
2031+
return;
2032+
20252033
returnCode = FileAccess(file);
20262034
if (returnCode < 0)
20272035
return;
@@ -3826,3 +3834,93 @@ data_sync_elevel(int elevel)
38263834
{
38273835
return data_sync_retry ? elevel : PANIC;
38283836
}
3837+
3838+
bool
3839+
check_io_direct(char **newval, void **extra, GucSource source)
3840+
{
3841+
bool result = true;
3842+
int flags;
3843+
3844+
#if PG_O_DIRECT == 0
3845+
if (strcmp(*newval, "") != 0)
3846+
{
3847+
GUC_check_errdetail("io_direct is not supported on this platform.");
3848+
result = false;
3849+
}
3850+
flags = 0;
3851+
#else
3852+
List *elemlist;
3853+
ListCell *l;
3854+
char *rawstring;
3855+
3856+
/* Need a modifiable copy of string */
3857+
rawstring = pstrdup(*newval);
3858+
3859+
if (!SplitGUCList(rawstring, ',', &elemlist))
3860+
{
3861+
GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
3862+
"io_direct");
3863+
pfree(rawstring);
3864+
list_free(elemlist);
3865+
return false;
3866+
}
3867+
3868+
flags = 0;
3869+
foreach(l, elemlist)
3870+
{
3871+
char *item = (char *) lfirst(l);
3872+
3873+
if (pg_strcasecmp(item, "data") == 0)
3874+
flags |= IO_DIRECT_DATA;
3875+
else if (pg_strcasecmp(item, "wal") == 0)
3876+
flags |= IO_DIRECT_WAL;
3877+
else if (pg_strcasecmp(item, "wal_init") == 0)
3878+
flags |= IO_DIRECT_WAL_INIT;
3879+
else
3880+
{
3881+
GUC_check_errdetail("invalid option \"%s\"", item);
3882+
result = false;
3883+
break;
3884+
}
3885+
}
3886+
3887+
/*
3888+
* It's possible to configure block sizes smaller than our assumed I/O
3889+
* alignment size, which could result in invalid I/O requests.
3890+
*/
3891+
#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
3892+
if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
3893+
{
3894+
GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
3895+
result = false;
3896+
}
3897+
#endif
3898+
#if BLCKSZ < PG_IO_ALIGN_SIZE
3899+
if (result && (flags & IO_DIRECT_DATA))
3900+
{
3901+
GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small");
3902+
result = false;
3903+
}
3904+
#endif
3905+
3906+
pfree(rawstring);
3907+
list_free(elemlist);
3908+
#endif
3909+
3910+
if (!result)
3911+
return result;
3912+
3913+
/* Save the flags in *extra, for use by assign_io_direct */
3914+
*extra = guc_malloc(ERROR, sizeof(int));
3915+
*((int *) *extra) = flags;
3916+
3917+
return result;
3918+
}
3919+
3920+
extern void
3921+
assign_io_direct(const char *newval, void *extra)
3922+
{
3923+
int *flags = (int *) extra;
3924+
3925+
io_direct_flags = *flags;
3926+
}

0 commit comments

Comments
 (0)