Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit ad458cf

Browse files
committed
Don't use O_DIRECT when writing WAL files if archiving or streaming is
enabled. Bypassing the kernel cache is counter-productive in that case, because the archiver/walsender process will read from the WAL file soon after it's written, and if it's not cached the read will cause a physical read, eating I/O bandwidth available on the WAL drive. Also, the walreceiver process does unaligned writes, so disable O_DIRECT in the walreceiver process for that reason too.
1 parent 94f610b commit ad458cf

File tree

4 files changed

+47
-26
lines changed

4 files changed

+47
-26
lines changed

src/backend/access/transam/xlog.c

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.376 2010/02/19 01:04:03 itagaki Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.377 2010/02/19 10:51:03 heikki Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -2686,13 +2686,10 @@ XLogFileClose(void)
26862686
* WAL segment files will not be re-read in normal operation, so we advise
26872687
* the OS to release any cached pages. But do not do so if WAL archiving
26882688
* or streaming is active, because archiver and walsender process could use
2689-
* the cache to read the WAL segment. Also, don't bother with it if we
2690-
* are using O_DIRECT, since the kernel is presumably not caching in that
2691-
* case.
2689+
* the cache to read the WAL segment.
26922690
*/
26932691
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2694-
if (!XLogIsNeeded() &&
2695-
(get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
2692+
if (!XLogIsNeeded())
26962693
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
26972694
#endif
26982695

@@ -7652,10 +7649,29 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
76527649
static int
76537650
get_sync_bit(int method)
76547651
{
7652+
int o_direct_flag = 0;
7653+
76557654
/* If fsync is disabled, never open in sync mode */
76567655
if (!enableFsync)
76577656
return 0;
76587657

7658+
/*
7659+
* Optimize writes by bypassing kernel cache with O_DIRECT when using
7660+
* O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
7661+
* disabled, otherwise the archive command or walsender process will
7662+
* read the WAL soon after writing it, which is guaranteed to cause a
7663+
* physical read if we bypassed the kernel cache. We also skip the
7664+
* posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the
7665+
* same reason.
7666+
*
7667+
* Never use O_DIRECT in walreceiver process for similar reasons; the WAL
7668+
* written by walreceiver is normally read by the startup process soon
7669+
* after it's written. Also, walreceiver performs unaligned writes, which
7670+
* don't work with O_DIRECT, so it is required for correctness too.
7671+
*/
7672+
if (!XLogIsNeeded() && !am_walreceiver)
7673+
o_direct_flag = PG_O_DIRECT;
7674+
76597675
switch (method)
76607676
{
76617677
/*
@@ -7670,11 +7686,11 @@ get_sync_bit(int method)
76707686
return 0;
76717687
#ifdef OPEN_SYNC_FLAG
76727688
case SYNC_METHOD_OPEN:
7673-
return OPEN_SYNC_FLAG;
7689+
return OPEN_SYNC_FLAG | o_direct_flag;
76747690
#endif
76757691
#ifdef OPEN_DATASYNC_FLAG
76767692
case SYNC_METHOD_OPEN_DSYNC:
7677-
return OPEN_DATASYNC_FLAG;
7693+
return OPEN_DATASYNC_FLAG | o_direct_flag;
76787694
#endif
76797695
default:
76807696
/* can't happen (unless we are out of sync with option array) */

src/backend/replication/walreceiver.c

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
*
3030
*
3131
* IDENTIFICATION
32-
* $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.4 2010/02/17 04:19:39 tgl Exp $
32+
* $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.5 2010/02/19 10:51:04 heikki Exp $
3333
*
3434
*-------------------------------------------------------------------------
3535
*/
@@ -50,6 +50,9 @@
5050
#include "utils/ps_status.h"
5151
#include "utils/resowner.h"
5252

53+
/* Global variable to indicate if this process is a walreceiver process */
54+
bool am_walreceiver;
55+
5356
/* libpqreceiver hooks to these when loaded */
5457
walrcv_connect_type walrcv_connect = NULL;
5558
walrcv_receive_type walrcv_receive = NULL;
@@ -158,6 +161,8 @@ WalReceiverMain(void)
158161
/* use volatile pointer to prevent code rearrangement */
159162
volatile WalRcvData *walrcv = WalRcv;
160163

164+
am_walreceiver = true;
165+
161166
/*
162167
* WalRcv should be set up already (if we are a backend, we inherit
163168
* this by fork() or EXEC_BACKEND mechanism from the postmaster).
@@ -424,16 +429,18 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
424429
bool use_existent;
425430

426431
/*
427-
* XLOG segment files will be re-read in recovery operation soon,
428-
* so we don't need to advise the OS to release any cache page.
432+
* fsync() and close current file before we switch to next one.
433+
* We would otherwise have to reopen this file to fsync it later.
429434
*/
430435
if (recvFile >= 0)
431436
{
437+
XLogWalRcvFlush();
438+
432439
/*
433-
* fsync() before we switch to next file. We would otherwise
434-
* have to reopen this file to fsync it later
440+
* XLOG segment files will be re-read by recovery in startup
441+
* process soon, so we don't advise the OS to release cache
442+
* pages associated with the file like XLogFileClose() does.
435443
*/
436-
XLogWalRcvFlush();
437444
if (close(recvFile) != 0)
438445
ereport(PANIC,
439446
(errcode_for_file_access(),
@@ -445,8 +452,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr)
445452
/* Create/use new log file */
446453
XLByteToSeg(recptr, recvId, recvSeg);
447454
use_existent = true;
448-
recvFile = XLogFileInit(recvId, recvSeg,
449-
&use_existent, true);
455+
recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true);
450456
recvOff = 0;
451457
}
452458

src/include/access/xlogdefs.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.25 2010/01/15 09:19:06 heikki Exp $
10+
* $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.26 2010/02/19 10:51:04 heikki Exp $
1111
*/
1212
#ifndef XLOG_DEFS_H
1313
#define XLOG_DEFS_H
@@ -106,23 +106,20 @@ typedef uint32 TimeLineID;
106106
* configure determined whether fdatasync() is.
107107
*/
108108
#if defined(O_SYNC)
109-
#define BARE_OPEN_SYNC_FLAG O_SYNC
109+
#define OPEN_SYNC_FLAG O_SYNC
110110
#elif defined(O_FSYNC)
111-
#define BARE_OPEN_SYNC_FLAG O_FSYNC
112-
#endif
113-
#ifdef BARE_OPEN_SYNC_FLAG
114-
#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
111+
#define OPEN_SYNC_FLAG O_FSYNC
115112
#endif
116113

117114
#if defined(O_DSYNC)
118115
#if defined(OPEN_SYNC_FLAG)
119116
/* O_DSYNC is distinct? */
120-
#if O_DSYNC != BARE_OPEN_SYNC_FLAG
121-
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
117+
#if O_DSYNC != OPEN_SYNC_FLAG
118+
#define OPEN_DATASYNC_FLAG O_DSYNC
122119
#endif
123120
#else /* !defined(OPEN_SYNC_FLAG) */
124121
/* Win32 only has O_DSYNC */
125-
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
122+
#define OPEN_DATASYNC_FLAG O_DSYNC
126123
#endif
127124
#endif
128125

src/include/replication/walreceiver.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
*
66
* Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group
77
*
8-
* $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.6 2010/02/03 09:47:19 heikki Exp $
8+
* $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.7 2010/02/19 10:51:04 heikki Exp $
99
*
1010
*-------------------------------------------------------------------------
1111
*/
@@ -15,6 +15,8 @@
1515
#include "access/xlogdefs.h"
1616
#include "storage/spin.h"
1717

18+
extern bool am_walreceiver;
19+
1820
/*
1921
* MAXCONNINFO: maximum size of a connection string.
2022
*

0 commit comments

Comments
 (0)