7
7
* Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
8
8
* Portions Copyright (c) 1994, Regents of the University of California
9
9
*
10
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.210 2005/07/23 15:31:16 momjian Exp $
10
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.211 2005/07/29 03:22:33 momjian Exp $
11
11
*
12
12
*-------------------------------------------------------------------------
13
13
*/
47
47
#include "utils/relcache.h"
48
48
49
49
50
+ /*
51
+ * Because O_DIRECT bypasses the kernel buffers, and because we never
52
+ * read those buffers except during crash recovery, it is a win to use
53
+ * it in all cases where we sync on each write(). We could allow O_DIRECT
54
+ * with fsync(), but because skipping the kernel buffer forces writes out
55
+ * quickly, it seems best just to use it for O_SYNC. It is hard to imagine
56
+ * how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
57
+ */
58
+ #ifdef O_DIRECT
59
+ #define PG_O_DIRECT O_DIRECT
60
+ #else
61
+ #define PG_O_DIRECT 0
62
+ #endif
63
+
50
64
/*
51
65
* This chunk of hackery attempts to determine which file sync methods
52
66
* are available on the current platform, and to choose an appropriate
53
67
* default method. We assume that fsync() is always available, and that
54
68
* configure determined whether fdatasync() is.
55
69
*/
56
70
#if defined(O_SYNC )
57
- #define OPEN_SYNC_FLAG O_SYNC
71
+ #define CMP_OPEN_SYNC_FLAG O_SYNC
58
72
#else
59
73
#if defined(O_FSYNC )
60
- #define OPEN_SYNC_FLAG O_FSYNC
74
+ #define CMP_OPEN_SYNC_FLAG O_FSYNC
61
75
#endif
62
76
#endif
77
+ #define OPEN_SYNC_FLAG (CMP_OPEN_SYNC_FLAG | PG_O_DIRECT)
63
78
64
79
#if defined(O_DSYNC )
65
80
#if defined(OPEN_SYNC_FLAG )
66
- #if O_DSYNC != OPEN_SYNC_FLAG
67
- #define OPEN_DATASYNC_FLAG O_DSYNC
81
+ #if O_DSYNC != CMP_OPEN_SYNC_FLAG
82
+ #define OPEN_DATASYNC_FLAG ( O_DSYNC | PG_O_DIRECT)
68
83
#endif
69
84
#else /* !defined(OPEN_SYNC_FLAG) */
70
85
/* Win32 only has O_DSYNC */
71
- #define OPEN_DATASYNC_FLAG O_DSYNC
86
+ #define OPEN_DATASYNC_FLAG ( O_DSYNC | PG_O_DIRECT)
72
87
#endif
73
88
#endif
74
89
90
+ /*
91
+ * Buffer-alignment requirements for direct I/O depend on the OS and filesystem,
92
+ * but BLCKSZ is assumed to be enough for it.
93
+ */
94
+ #ifdef O_DIRECT
95
+ #define ALIGNOF_XLOG_BUFFER BLCKSZ
96
+ #else
97
+ #define ALIGNOF_XLOG_BUFFER MAXIMUM_ALIGNOF
98
+ #endif
99
+
100
+ /*
101
+ * Switch the alignment routine because ShmemAlloc() returns a max-aligned
102
+ * buffer and ALIGNOF_XLOG_BUFFER may be greater than MAXIMUM_ALIGNOF.
103
+ */
104
+ #if ALIGNOF_XLOG_BUFFER <= MAXIMUM_ALIGNOF
105
+ #define XLOG_BUFFER_ALIGN (LEN ) MAXALIGN((LEN))
106
+ #else
107
+ #define XLOG_BUFFER_ALIGN (LEN ) ((LEN) + (ALIGNOF_XLOG_BUFFER))
108
+ #endif
109
+ /* assume sizeof(ptrdiff_t) == sizeof(void*) */
110
+ #define POINTERALIGN (ALIGNVAL ,PTR ) \
111
+ ((char *)(((ptrdiff_t) (PTR) + (ALIGNVAL-1)) & ~((ptrdiff_t) (ALIGNVAL-1))))
112
+ #define XLOG_BUFFER_POINTERALIGN (PTR ) \
113
+ POINTERALIGN((ALIGNOF_XLOG_BUFFER), (PTR))
114
+
75
115
#if defined(OPEN_DATASYNC_FLAG )
76
116
#define DEFAULT_SYNC_METHOD_STR "open_datasync"
77
117
#define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN
@@ -469,6 +509,17 @@ static void ReadControlFile(void);
469
509
static char * str_time (time_t tnow );
470
510
static void issue_xlog_fsync (void );
471
511
512
+ /* XLog gather-write support */
513
+ typedef struct XLogPages
514
+ {
515
+ char * head ; /* Start of the first buffered page; NULL when nothing is buffered */
516
+ int size ; /* Total bytes buffered == count(pages) * BLCKSZ; grows by BLCKSZ per appended page */
517
+ int offset ; /* Offset in the xlog segment file at which head is to be written */
518
+ } XLogPages ;
519
+ static void XLogPageReset (XLogPages * pages );
520
+ static void XLogPageWrite (XLogPages * pages , int index );
521
+ static void XLogPageFlush (XLogPages * pages , int index );
522
+
472
523
#ifdef WAL_DEBUG
473
524
static void xlog_outrec (char * buf , XLogRecord * record );
474
525
#endif
@@ -1245,9 +1296,10 @@ static void
1245
1296
XLogWrite (XLogwrtRqst WriteRqst )
1246
1297
{
1247
1298
XLogCtlWrite * Write = & XLogCtl -> Write ;
1248
- char * from ;
1249
1299
bool ispartialpage ;
1250
1300
bool use_existent ;
1301
+ int currentIndex = Write -> curridx ;
1302
+ XLogPages pages ;
1251
1303
1252
1304
/* We should always be inside a critical section here */
1253
1305
Assert (CritSectionCount > 0 );
@@ -1258,6 +1310,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
1258
1310
*/
1259
1311
LogwrtResult = Write -> LogwrtResult ;
1260
1312
1313
+ XLogPageReset (& pages );
1314
+
1261
1315
while (XLByteLT (LogwrtResult .Write , WriteRqst .Write ))
1262
1316
{
1263
1317
/*
@@ -1266,21 +1320,22 @@ XLogWrite(XLogwrtRqst WriteRqst)
1266
1320
* end of the last page that's been initialized by
1267
1321
* AdvanceXLInsertBuffer.
1268
1322
*/
1269
- if (!XLByteLT (LogwrtResult .Write , XLogCtl -> xlblocks [Write -> curridx ]))
1323
+ if (!XLByteLT (LogwrtResult .Write , XLogCtl -> xlblocks [currentIndex ]))
1270
1324
elog (PANIC , "xlog write request %X/%X is past end of log %X/%X" ,
1271
1325
LogwrtResult .Write .xlogid , LogwrtResult .Write .xrecoff ,
1272
- XLogCtl -> xlblocks [Write -> curridx ].xlogid ,
1273
- XLogCtl -> xlblocks [Write -> curridx ].xrecoff );
1326
+ XLogCtl -> xlblocks [currentIndex ].xlogid ,
1327
+ XLogCtl -> xlblocks [currentIndex ].xrecoff );
1274
1328
1275
1329
/* Advance LogwrtResult.Write to end of current buffer page */
1276
- LogwrtResult .Write = XLogCtl -> xlblocks [Write -> curridx ];
1330
+ LogwrtResult .Write = XLogCtl -> xlblocks [currentIndex ];
1277
1331
ispartialpage = XLByteLT (WriteRqst .Write , LogwrtResult .Write );
1278
1332
1279
1333
if (!XLByteInPrevSeg (LogwrtResult .Write , openLogId , openLogSeg ))
1280
1334
{
1281
1335
/*
1282
1336
* Switch to new logfile segment.
1283
1337
*/
1338
+ XLogPageFlush (& pages , currentIndex );
1284
1339
if (openLogFile >= 0 )
1285
1340
{
1286
1341
if (close (openLogFile ))
@@ -1354,31 +1409,8 @@ XLogWrite(XLogwrtRqst WriteRqst)
1354
1409
openLogOff = 0 ;
1355
1410
}
1356
1411
1357
- /* Need to seek in the file? */
1358
- if (openLogOff != (LogwrtResult .Write .xrecoff - BLCKSZ ) % XLogSegSize )
1359
- {
1360
- openLogOff = (LogwrtResult .Write .xrecoff - BLCKSZ ) % XLogSegSize ;
1361
- if (lseek (openLogFile , (off_t ) openLogOff , SEEK_SET ) < 0 )
1362
- ereport (PANIC ,
1363
- (errcode_for_file_access (),
1364
- errmsg ("could not seek in log file %u, segment %u to offset %u: %m" ,
1365
- openLogId , openLogSeg , openLogOff )));
1366
- }
1367
-
1368
- /* OK to write the page */
1369
- from = XLogCtl -> pages + Write -> curridx * BLCKSZ ;
1370
- errno = 0 ;
1371
- if (write (openLogFile , from , BLCKSZ ) != BLCKSZ )
1372
- {
1373
- /* if write didn't set errno, assume problem is no disk space */
1374
- if (errno == 0 )
1375
- errno = ENOSPC ;
1376
- ereport (PANIC ,
1377
- (errcode_for_file_access (),
1378
- errmsg ("could not write to log file %u, segment %u at offset %u: %m" ,
1379
- openLogId , openLogSeg , openLogOff )));
1380
- }
1381
- openLogOff += BLCKSZ ;
1412
+ /* Add a page to buffer */
1413
+ XLogPageWrite (& pages , currentIndex );
1382
1414
1383
1415
/*
1384
1416
* If we just wrote the whole last page of a logfile segment,
@@ -1390,8 +1422,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
1390
1422
* This is also the right place to notify the Archiver that the
1391
1423
* segment is ready to copy to archival storage.
1392
1424
*/
1393
- if (openLogOff >= XLogSegSize && !ispartialpage )
1425
+ if (openLogOff + pages . size >= XLogSegSize && !ispartialpage )
1394
1426
{
1427
+ XLogPageFlush (& pages , currentIndex );
1395
1428
issue_xlog_fsync ();
1396
1429
LogwrtResult .Flush = LogwrtResult .Write ; /* end of current page */
1397
1430
@@ -1405,8 +1438,9 @@ XLogWrite(XLogwrtRqst WriteRqst)
1405
1438
LogwrtResult .Write = WriteRqst .Write ;
1406
1439
break ;
1407
1440
}
1408
- Write -> curridx = NextBufIdx (Write -> curridx );
1441
+ currentIndex = NextBufIdx (currentIndex );
1409
1442
}
1443
+ XLogPageFlush (& pages , currentIndex );
1410
1444
1411
1445
/*
1412
1446
* If asked to flush, do so
@@ -3584,7 +3618,7 @@ XLOGShmemSize(void)
3584
3618
if (XLOGbuffers < MinXLOGbuffers )
3585
3619
XLOGbuffers = MinXLOGbuffers ;
3586
3620
3587
- return MAXALIGN (sizeof (XLogCtlData ) + sizeof (XLogRecPtr ) * XLOGbuffers )
3621
+ return XLOG_BUFFER_ALIGN (sizeof (XLogCtlData ) + sizeof (XLogRecPtr ) * XLOGbuffers )
3588
3622
+ BLCKSZ * XLOGbuffers +
3589
3623
MAXALIGN (sizeof (ControlFileData ));
3590
3624
}
@@ -3601,7 +3635,7 @@ XLOGShmemInit(void)
3601
3635
3602
3636
XLogCtl = (XLogCtlData * )
3603
3637
ShmemInitStruct ("XLOG Ctl" ,
3604
- MAXALIGN (sizeof (XLogCtlData ) +
3638
+ XLOG_BUFFER_ALIGN (sizeof (XLogCtlData ) +
3605
3639
sizeof (XLogRecPtr ) * XLOGbuffers )
3606
3640
+ BLCKSZ * XLOGbuffers ,
3607
3641
& foundXLog );
@@ -3630,9 +3664,9 @@ XLOGShmemInit(void)
3630
3664
* Here, on the other hand, we must align to ALIGNOF_XLOG_BUFFER to ensure
3631
3665
* the page buffers have worst-case alignment.
3632
3666
*/
3633
- XLogCtl -> pages =
3634
- ((char * ) XLogCtl ) + MAXALIGN ( sizeof ( XLogCtlData ) +
3635
- sizeof (XLogRecPtr ) * XLOGbuffers );
3667
+ XLogCtl -> pages = XLOG_BUFFER_POINTERALIGN (
3668
+ ((char * ) XLogCtl )
3669
+ + sizeof ( XLogCtlData ) + sizeof (XLogRecPtr ) * XLOGbuffers );
3636
3670
memset (XLogCtl -> pages , 0 , BLCKSZ * XLOGbuffers );
3637
3671
3638
3672
/*
@@ -3690,10 +3724,9 @@ BootStrapXLOG(void)
3690
3724
/* First timeline ID is always 1 */
3691
3725
ThisTimeLineID = 1 ;
3692
3726
3693
- /* Use malloc() to ensure buffer is MAXALIGNED */
3694
- buffer = (char * ) malloc (BLCKSZ );
3695
- page = (XLogPageHeader ) buffer ;
3696
- memset (buffer , 0 , BLCKSZ );
3727
+ buffer = (char * ) malloc (BLCKSZ + ALIGNOF_XLOG_BUFFER );
3728
+ page = (XLogPageHeader ) XLOG_BUFFER_POINTERALIGN (buffer );
3729
+ memset (page , 0 , BLCKSZ );
3697
3730
3698
3731
/* Set up information for the initial checkpoint record */
3699
3732
checkPoint .redo .xlogid = 0 ;
@@ -3745,7 +3778,7 @@ BootStrapXLOG(void)
3745
3778
3746
3779
/* Write the first page with the initial record */
3747
3780
errno = 0 ;
3748
- if (write (openLogFile , buffer , BLCKSZ ) != BLCKSZ )
3781
+ if (write (openLogFile , page , BLCKSZ ) != BLCKSZ )
3749
3782
{
3750
3783
/* if write didn't set errno, assume problem is no disk space */
3751
3784
if (errno == 0 )
@@ -5837,3 +5870,71 @@ remove_backup_label(void)
5837
5870
errmsg ("could not remove file \"%s\": %m" ,
5838
5871
BACKUP_LABEL_FILE )));
5839
5872
}
5873
+
5874
+
5875
+ /* XLog gather-write support */
5876
+
5877
+ static void
5878
+ XLogPageReset (XLogPages * pages )
5879
+ {
5880
+ memset (pages , 0 , sizeof (* pages )); /* zero head, size and offset; XLogPageFlush treats a NULL head as "nothing to write" */
5881
+ }
5882
+
5883
+ static void
5884
+ XLogPageWrite (XLogPages * pages , int index ) /* Queue xlog buffer page `index`: extend the pending run if adjacent, else flush and start a new run */
5885
+ {
5886
+ char * page = XLogCtl -> pages + index * BLCKSZ ; /* address of the buffer page being queued */
5887
+ int size = BLCKSZ ;
5888
+ int offset = (LogwrtResult .Write .xrecoff - BLCKSZ ) % XLogSegSize ; /* segment-file offset of the page ending at LogwrtResult.Write */
5889
+
5890
+ if (pages -> head + pages -> size == page
5891
+ && pages -> offset + pages -> size == offset )
5892
+ { /* New page directly follows the pending run, both in memory and in the file. Append it. */
5893
+ pages -> size += size ;
5894
+ }
5895
+ else
5896
+ { /* Pages are not contiguous. Flush the pending run (if any), then start a new run with this page. */
5897
+ XLogPageFlush (pages , PrevBufIdx (index ));
5898
+ pages -> head = page ;
5899
+ pages -> size = size ;
5900
+ pages -> offset = offset ;
5901
+ }
5902
+ }
5903
+
5904
+ static void
5905
+ XLogPageFlush (XLogPages * pages , int index ) /* Write the pending run with one write() call, then set XLogCtl->Write.curridx to `index` and reset `pages` */
5906
+ {
5907
+ if (!pages -> head )
5908
+ { /* Nothing is buffered, so there is nothing to write; just record the index. */
5909
+ XLogCtl -> Write .curridx = index ;
5910
+ return ;
5911
+ }
5912
+
5913
+ /* Need to seek in the file? Only when the file position differs from the run's start offset. */
5914
+ if (openLogOff != pages -> offset )
5915
+ {
5916
+ openLogOff = pages -> offset ;
5917
+ if (lseek (openLogFile , (off_t ) openLogOff , SEEK_SET ) < 0 )
5918
+ ereport (PANIC ,
5919
+ (errcode_for_file_access (),
5920
+ errmsg ("could not seek in log file %u, segment %u to offset %u: %m" ,
5921
+ openLogId , openLogSeg , openLogOff )));
5922
+ }
5923
+
5924
+ /* OK to write all buffered pages at once; a short write is treated as failure */
5925
+ errno = 0 ;
5926
+ if (write (openLogFile , pages -> head , pages -> size ) != pages -> size )
5927
+ {
5928
+ /* if write didn't set errno, assume problem is no disk space */
5929
+ if (errno == 0 )
5930
+ errno = ENOSPC ;
5931
+ ereport (PANIC ,
5932
+ (errcode_for_file_access (),
5933
+ errmsg ("could not write to log file %u, segment %u at offset %u: %m" ,
5934
+ openLogId , openLogSeg , openLogOff )));
5935
+ }
5936
+
5937
+ openLogOff += pages -> size ; /* track the new file position past the bytes just written */
5938
+ XLogCtl -> Write .curridx = index ;
5939
+ XLogPageReset (pages );
5940
+ }
0 commit comments