Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit c34bb00

Browse files
committed
Use O_DIRECT if available when using O_SYNC for wal_sync_method. Also, write multiple WAL buffers out in one write() operation. ITAGAKI Takahiro --------------------------------------------------------------------------- > If we disable writeback-cache and use open_sync, the per-page writing > behavior in WAL module will show up as bad result. O_DIRECT is similar > to O_DSYNC (at least on linux), so that the benefit of it will disappear > behind the slow disk revolution. > > In the current source, WAL is written as: > for (i = 0; i < N; i++) { write(&buffers[i], BLCKSZ); } > Is this intentional? Can we rewrite it as follows? > write(&buffers[0], N * BLCKSZ); > > In order to achieve it, I wrote a 'gather-write' patch (xlog.gw.diff). > Aside from this, I'll also send the fixed direct io patch (xlog.dio.diff). > These two patches are independent, so they can be applied either or both. > > > I tested them on my machine and the results as follows. It shows that > direct-io and gather-write is the best choice when writeback-cache is off. > Are these two patches worth trying if they are used together? > > > | writeback | fsync= | fdata | open_ | fsync_ | open_ > patch | cache | false | sync | sync | direct | direct > ------------+-----------+--------+-------+-------+--------+--------- > direct io | off | 124.2 | 105.7 | 48.3 | 48.3 | 48.2 > direct io | on | 129.1 | 112.3 | 114.1 | 142.9 | 144.5 > gather-write| off | 124.3 | 108.7 | 105.4 | (N/A) | (N/A) > both | off | 131.5 | 115.5 | 114.4 | 145.4 | 145.2 > > - 20runs * pgbench -s 100 -c 50 -t 200 > - with tuning (wal_buffers=64, commit_delay=500, checkpoint_segments=8) > - using 2 ATA disks: > - hda(reiserfs) includes system and wal. > - hdc(jfs) includes database files. writeback-cache is always on. > > --- > ITAGAKI Takahiro
1 parent 722f31f commit c34bb00

File tree

1 file changed

+149
-48
lines changed
  • src/backend/access/transam

1 file changed

+149
-48
lines changed

src/backend/access/transam/xlog.c

+149-48
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
10+
* $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -47,31 +47,71 @@
4747
#include "utils/relcache.h"
4848

4949

50+
/*
51+
* Because O_DIRECT bypasses the kernel buffers, and because we never
52+
* read those buffers except during crash recovery, it is a win to use
53+
* it in all cases where we sync on each write(). We could allow O_DIRECT
54+
* with fsync(), but because skipping the kernel buffer forces writes out
55+
* quickly, it seems best just to use it for O_SYNC. It is hard to imagine
56+
* how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
57+
*/
58+
#ifdef O_DIRECT
59+
#define PG_O_DIRECT O_DIRECT
60+
#else
61+
#define PG_O_DIRECT 0
62+
#endif
63+
5064
/*
5165
* This chunk of hackery attempts to determine which file sync methods
5266
* are available on the current platform, and to choose an appropriate
5367
* default method. We assume that fsync() is always available, and that
5468
* configure determined whether fdatasync() is.
5569
*/
5670
#if defined(O_SYNC)
57-
#define OPEN_SYNC_FLAG O_SYNC
71+
#define CMP_OPEN_SYNC_FLAG O_SYNC
5872
#else
5973
#if defined(O_FSYNC)
60-
#define OPEN_SYNC_FLAG O_FSYNC
74+
#define CMP_OPEN_SYNC_FLAG O_FSYNC
6175
#endif
6276
#endif
77+
/*
 * Only define OPEN_SYNC_FLAG when the platform supplied a base flag above
 * (CMP_OPEN_SYNC_FLAG).  Defining it unconditionally would make later
 * "defined(OPEN_SYNC_FLAG)" tests true on platforms lacking both O_SYNC and
 * O_FSYNC, and would expand to an undeclared identifier wherever it is used.
 */
#ifdef CMP_OPEN_SYNC_FLAG
#define OPEN_SYNC_FLAG (CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
#endif
6378

6479
#if defined(O_DSYNC)
6580
#if defined(OPEN_SYNC_FLAG)
66-
#if O_DSYNC != OPEN_SYNC_FLAG
67-
#define OPEN_DATASYNC_FLAG O_DSYNC
81+
#if O_DSYNC != CMP_OPEN_SYNC_FLAG
82+
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
6883
#endif
6984
#else /* !defined(OPEN_SYNC_FLAG) */
7085
/* Win32 only has O_DSYNC */
71-
#define OPEN_DATASYNC_FLAG O_DSYNC
86+
#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT)
7287
#endif
7388
#endif
7489

90+
/*
91+
* Buffer-alignment requirements for direct I/O depend on the OS and filesystem,
92+
* but BLCKSZ is assumed to be enough for it.
93+
*/
94+
#ifdef O_DIRECT
95+
#define ALIGNOF_XLOG_BUFFER BLCKSZ
96+
#else
97+
#define ALIGNOF_XLOG_BUFFER MAXIMUM_ALIGNOF
98+
#endif
99+
100+
/*
101+
* Switch the alignment routine because ShmemAlloc() returns a max-aligned
102+
* buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
103+
*/
104+
#if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF
105+
#define XLOG_BUFFER_ALIGN(LEN) MAXALIGN((LEN))
106+
#else
107+
#define XLOG_BUFFER_ALIGN(LEN) ((LEN) + (ALIGNOF_XLOG_BUFFER))
108+
#endif
109+
/* assume sizeof(ptrdiff_t) == sizeof(void*) */
110+
/*
 * Round PTR up to the next multiple of ALIGNVAL and yield it as a char *.
 * The mask arithmetic requires ALIGNVAL to be a power of two.  Both macro
 * arguments are parenthesized so that expression arguments (for example
 * "1 << n") expand with the intended precedence.
 */
#define POINTERALIGN(ALIGNVAL,PTR) \
	((char *)(((ptrdiff_t) (PTR) + ((ALIGNVAL) - 1)) & ~((ptrdiff_t) ((ALIGNVAL) - 1))))
112+
#define XLOG_BUFFER_POINTERALIGN(PTR) \
113+
POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
114+
75115
#if defined(OPEN_DATASYNC_FLAG)
76116
#define DEFAULT_SYNC_METHOD_STR "open_datasync"
77117
#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
@@ -469,6 +509,17 @@ static void ReadControlFile(void);
469509
static char *str_time(time_t tnow);
470510
static void issue_xlog_fsync(void);
471511

512+
/* XLog gather-write stuff */
513+
/*
 * Accumulator for a run of WAL buffer pages that will be written out with a
 * single write() call.  XLogPageWrite() extends the run while pages are
 * adjacent both in memory and in the segment file; XLogPageFlush() writes
 * the run and resets the accumulator.
 */
typedef struct XLogPages
514+
{
515+
char *head; /* First page of the pending run; NULL when nothing is pending */
516+
int size; /* Total bytes of pages == count(pages) * BLCKSZ */
517+
int offset; /* Offset in xlog segment file where the pending run starts */
518+
} XLogPages;
519+
static void XLogPageReset(XLogPages *pages);
520+
static void XLogPageWrite(XLogPages *pages, int index);
521+
static void XLogPageFlush(XLogPages *pages, int index);
522+
472523
#ifdef WAL_DEBUG
473524
static void xlog_outrec(char *buf, XLogRecord *record);
474525
#endif
@@ -1245,9 +1296,10 @@ static void
12451296
XLogWrite(XLogwrtRqst WriteRqst)
12461297
{
12471298
XLogCtlWrite *Write = &XLogCtl->Write;
1248-
char *from;
12491299
bool ispartialpage;
12501300
bool use_existent;
1301+
int currentIndex = Write->curridx;
1302+
XLogPages pages;
12511303

12521304
/* We should always be inside a critical section here */
12531305
Assert(CritSectionCount > 0);
@@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
12581310
*/
12591311
LogwrtResult = Write->LogwrtResult;
12601312

1313+
XLogPageReset(&pages);
1314+
12611315
while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
12621316
{
12631317
/*
@@ -1266,21 +1320,22 @@ XLogWrite(XLogwrtRqst WriteRqst)
12661320
* end of the last page that's been initialized by
12671321
* AdvanceXLInsertBuffer.
12681322
*/
1269-
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[Write->curridx]))
1323+
if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[currentIndex]))
12701324
elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
12711325
LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1272-
XLogCtl->xlblocks[Write->curridx].xlogid,
1273-
XLogCtl->xlblocks[Write->curridx].xrecoff);
1326+
XLogCtl->xlblocks[currentIndex].xlogid,
1327+
XLogCtl->xlblocks[currentIndex].xrecoff);
12741328

12751329
/* Advance LogwrtResult.Write to end of current buffer page */
1276-
LogwrtResult.Write = XLogCtl->xlblocks[Write->curridx];
1330+
LogwrtResult.Write = XLogCtl->xlblocks[currentIndex];
12771331
ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
12781332

12791333
if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
12801334
{
12811335
/*
12821336
* Switch to new logfile segment.
12831337
*/
1338+
XLogPageFlush(&pages, currentIndex);
12841339
if (openLogFile >= 0)
12851340
{
12861341
if (close(openLogFile))
@@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
13541409
openLogOff = 0;
13551410
}
13561411

1357-
/* Need to seek in the file? */
1358-
if (openLogOff != (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize)
1359-
{
1360-
openLogOff = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
1361-
if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
1362-
ereport(PANIC,
1363-
(errcode_for_file_access(),
1364-
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
1365-
openLogId, openLogSeg, openLogOff)));
1366-
}
1367-
1368-
/* OK to write the page */
1369-
from = XLogCtl->pages + Write->curridx * BLCKSZ;
1370-
errno = 0;
1371-
if (write(openLogFile, from, BLCKSZ) != BLCKSZ)
1372-
{
1373-
/* if write didn't set errno, assume problem is no disk space */
1374-
if (errno == 0)
1375-
errno = ENOSPC;
1376-
ereport(PANIC,
1377-
(errcode_for_file_access(),
1378-
errmsg("could not write to log file %u, segment %u at offset %u: %m",
1379-
openLogId, openLogSeg, openLogOff)));
1380-
}
1381-
openLogOff += BLCKSZ;
1412+
/* Add a page to buffer */
1413+
XLogPageWrite(&pages, currentIndex);
13821414

13831415
/*
13841416
* If we just wrote the whole last page of a logfile segment,
@@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
13901422
* This is also the right place to notify the Archiver that the
13911423
* segment is ready to copy to archival storage.
13921424
*/
1393-
if (openLogOff >= XLogSegSize && !ispartialpage)
1425+
if (openLogOff + pages.size >= XLogSegSize && !ispartialpage)
13941426
{
1427+
XLogPageFlush(&pages, currentIndex);
13951428
issue_xlog_fsync();
13961429
LogwrtResult.Flush = LogwrtResult.Write; /* end of current page */
13971430

@@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
14051438
LogwrtResult.Write = WriteRqst.Write;
14061439
break;
14071440
}
1408-
Write->curridx = NextBufIdx(Write->curridx);
1441+
currentIndex = NextBufIdx(currentIndex);
14091442
}
1443+
XLogPageFlush(&pages, currentIndex);
14101444

14111445
/*
14121446
* If asked to flush, do so
@@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
35843618
if (XLOGbuffers < MinXLOGbuffers)
35853619
XLOGbuffers = MinXLOGbuffers;
35863620

3587-
return MAXALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
3621+
return XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers)
35883622
+ BLCKSZ * XLOGbuffers +
35893623
MAXALIGN(sizeof(ControlFileData));
35903624
}
@@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
36013635

36023636
XLogCtl = (XLogCtlData *)
36033637
ShmemInitStruct("XLOG Ctl",
3604-
MAXALIGN(sizeof(XLogCtlData) +
3638+
XLOG_BUFFER_ALIGN(sizeof(XLogCtlData) +
36053639
sizeof(XLogRecPtr) * XLOGbuffers)
36063640
+ BLCKSZ * XLOGbuffers,
36073641
&foundXLog);
@@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
36303664
* Here, on the other hand, we must MAXALIGN to ensure the page
36313665
* buffers have worst-case alignment.
36323666
*/
3633-
XLogCtl->pages =
3634-
((char *) XLogCtl) + MAXALIGN(sizeof(XLogCtlData) +
3635-
sizeof(XLogRecPtr) * XLOGbuffers);
3667+
XLogCtl->pages = XLOG_BUFFER_POINTERALIGN(
3668+
((char *) XLogCtl)
3669+
+ sizeof(XLogCtlData) + sizeof(XLogRecPtr) * XLOGbuffers);
36363670
memset(XLogCtl->pages, 0, BLCKSZ * XLOGbuffers);
36373671

36383672
/*
@@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
36903724
/* First timeline ID is always 1 */
36913725
ThisTimeLineID = 1;
36923726

3693-
/* Use malloc() to ensure buffer is MAXALIGNED */
3694-
buffer = (char *) malloc(BLCKSZ);
3695-
page = (XLogPageHeader) buffer;
3696-
memset(buffer, 0, BLCKSZ);
3727+
buffer = (char *) malloc(BLCKSZ + ALIGNOF_XLOG_BUFFER);
3728+
page = (XLogPageHeader) XLOG_BUFFER_POINTERALIGN(buffer);
3729+
memset(page, 0, BLCKSZ);
36973730

36983731
/* Set up information for the initial checkpoint record */
36993732
checkPoint.redo.xlogid = 0;
@@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
37453778

37463779
/* Write the first page with the initial record */
37473780
errno = 0;
3748-
if (write(openLogFile, buffer, BLCKSZ) != BLCKSZ)
3781+
if (write(openLogFile, page, BLCKSZ) != BLCKSZ)
37493782
{
37503783
/* if write didn't set errno, assume problem is no disk space */
37513784
if (errno == 0)
@@ -5837,3 +5870,71 @@ remove_backup_label(void)
58375870
errmsg("could not remove file \"%s\": %m",
58385871
BACKUP_LABEL_FILE)));
58395872
}
5873+
5874+
5875+
/* XLog gather-write stuff */
5876+
5877+
static void
5878+
XLogPageReset(XLogPages *pages)
5879+
{
5880+
memset(pages, 0, sizeof(*pages));
5881+
}
5882+
5883+
static void
5884+
XLogPageWrite(XLogPages *pages, int index)
5885+
{
5886+
char *page = XLogCtl->pages + index * BLCKSZ;
5887+
int size = BLCKSZ;
5888+
int offset = (LogwrtResult.Write.xrecoff - BLCKSZ) % XLogSegSize;
5889+
5890+
if (pages->head + pages->size == page
5891+
&& pages->offset + pages->size == offset)
5892+
{ /* Pages are continuous. Append new page. */
5893+
pages->size += size;
5894+
}
5895+
else
5896+
{ /* Pages are not continuous. Flush and clear. */
5897+
XLogPageFlush(pages, PrevBufIdx(index));
5898+
pages->head = page;
5899+
pages->size = size;
5900+
pages->offset = offset;
5901+
}
5902+
}
5903+
5904+
static void
5905+
XLogPageFlush(XLogPages *pages, int index)
5906+
{
5907+
if (!pages->head)
5908+
{ /* No needs to write pages. */
5909+
XLogCtl->Write.curridx = index;
5910+
return;
5911+
}
5912+
5913+
/* Need to seek in the file? */
5914+
if (openLogOff != pages->offset)
5915+
{
5916+
openLogOff = pages->offset;
5917+
if (lseek(openLogFile, (off_t) openLogOff, SEEK_SET) < 0)
5918+
ereport(PANIC,
5919+
(errcode_for_file_access(),
5920+
errmsg("could not seek in log file %u, segment %u to offset %u: %m",
5921+
openLogId, openLogSeg, openLogOff)));
5922+
}
5923+
5924+
/* OK to write the page */
5925+
errno = 0;
5926+
if (write(openLogFile, pages->head, pages->size) != pages->size)
5927+
{
5928+
/* if write didn't set errno, assume problem is no disk space */
5929+
if (errno == 0)
5930+
errno = ENOSPC;
5931+
ereport(PANIC,
5932+
(errcode_for_file_access(),
5933+
errmsg("could not write to log file %u, segment %u at offset %u: %m",
5934+
openLogId, openLogSeg, openLogOff)));
5935+
}
5936+
5937+
openLogOff += pages->size;
5938+
XLogCtl->Write.curridx = index;
5939+
XLogPageReset(pages);
5940+
}

0 commit comments

Comments
 (0)