Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 33cc5d8

Browse files
committed
Change s_lock to not use any zero-delay select() calls; these are just a
waste of cycles on single-CPU machines, and of dubious utility on multi-CPU machines too. Tweak the stuck-spinlock check in s_lock_sleep so that the caller can specify the timeout interval, and increase the interval before declaring a stuck spinlock for buffer locks and XLOG locks. On systems that have fdatasync(), use that rather than fsync() to sync WAL writes. Ensure that the WAL file is entirely allocated during XLogFileInit.
1 parent 58c4ab9 commit 33cc5d8

File tree

9 files changed

+284
-188
lines changed

9 files changed

+284
-188
lines changed

configure

Lines changed: 140 additions & 124 deletions
Large diffs are not rendered by default.

configure.in

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -772,7 +772,10 @@ PGAC_VAR_INT_TIMEZONE
772772
AC_FUNC_ACCEPT_ARGTYPES
773773
PGAC_FUNC_GETTIMEOFDAY_1ARG
774774

775-
AC_CHECK_FUNCS([fcvt getopt_long memmove pstat setproctitle setsid sigprocmask sysconf waitpid dlopen])
775+
AC_CHECK_FUNCS([fcvt getopt_long memmove pstat setproctitle setsid sigprocmask sysconf waitpid dlopen fdatasync])
776+
777+
dnl Check whether <unistd.h> declares fdatasync().
778+
AC_EGREP_HEADER(fdatasync, unistd.h, AC_DEFINE(HAVE_FDATASYNC_DECL))
776779

777780
AC_CACHE_CHECK([for PS_STRINGS], [pgac_cv_var_PS_STRINGS],
778781
[AC_TRY_LINK(

src/backend/access/transam/xlog.c

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
9-
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.53 2001/02/13 20:40:25 vadim Exp $
9+
* $Header: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v 1.54 2001/02/18 04:39:42 tgl Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -39,6 +39,13 @@
3939

4040
#include "miscadmin.h"
4141

42+
43+
/* Max time to wait to acquire XLog activity locks */
44+
#define XLOG_LOCK_TIMEOUT (5*60*1000000) /* 5 minutes */
45+
/* Max time to wait to acquire checkpoint lock */
46+
#define CHECKPOINT_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
47+
48+
4249
int XLOGbuffers = 8;
4350
int XLOGfiles = 0; /* how many files to pre-allocate */
4451
XLogRecPtr MyLastRecPtr = {0, 0};
@@ -178,8 +185,8 @@ typedef struct BkpBlock
178185
/*
179186
* We break each log file in 16Mb segments
180187
*/
181-
#define XLogSegSize (16*1024*1024)
182-
#define XLogLastSeg (0xffffffff / XLogSegSize)
188+
#define XLogSegSize ((uint32) (16*1024*1024))
189+
#define XLogLastSeg (((uint32) 0xffffffff) / XLogSegSize)
183190
#define XLogFileSize (XLogLastSeg * XLogSegSize)
184191

185192
#define NextLogSeg(_logId, _logSeg) \
@@ -423,7 +430,7 @@ begin:;
423430
}
424431
}
425432
}
426-
S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++);
433+
S_LOCK_SLEEP(&(XLogCtl->insert_lck), i++, XLOG_LOCK_TIMEOUT);
427434
if (!TAS(&(XLogCtl->insert_lck)))
428435
break;
429436
}
@@ -721,7 +728,7 @@ XLogFlush(XLogRecPtr record)
721728
break;
722729
}
723730
}
724-
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++);
731+
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT);
725732
}
726733

727734
if (logFile >= 0 && (LgwrResult.Write.xlogid != logId ||
@@ -741,7 +748,7 @@ XLogFlush(XLogRecPtr record)
741748
logFile = XLogFileOpen(logId, logSeg, false);
742749
}
743750

744-
if (pg_fsync(logFile) != 0)
751+
if (pg_fdatasync(logFile) != 0)
745752
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
746753
logId, logSeg);
747754
LgwrResult.Flush = LgwrResult.Write;
@@ -826,7 +833,7 @@ GetFreeXLBuffer()
826833
InitXLBuffer(curridx);
827834
return;
828835
}
829-
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++);
836+
S_LOCK_SLEEP(&(XLogCtl->lgwr_lck), spins++, XLOG_LOCK_TIMEOUT);
830837
}
831838
}
832839

@@ -846,7 +853,7 @@ XLogWrite(char *buffer)
846853
{
847854
if (wcnt > 0)
848855
{
849-
if (pg_fsync(logFile) != 0)
856+
if (pg_fdatasync(logFile) != 0)
850857
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
851858
logId, logSeg);
852859
if (LgwrResult.Write.xlogid != logId)
@@ -928,7 +935,7 @@ XLogWrite(char *buffer)
928935
if (XLByteLT(LgwrResult.Flush, LgwrRqst.Flush) &&
929936
XLByteLE(LgwrRqst.Flush, LgwrResult.Write))
930937
{
931-
if (pg_fsync(logFile) != 0)
938+
if (pg_fdatasync(logFile) != 0)
932939
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
933940
logId, logSeg);
934941
LgwrResult.Flush = LgwrResult.Write;
@@ -948,13 +955,14 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
948955
{
949956
char path[MAXPGPATH];
950957
char tpath[MAXPGPATH];
958+
char zbuffer[BLCKSZ];
951959
int fd;
960+
int nbytes;
952961

953962
XLogFileName(path, log, seg);
954963

955964
/*
956-
* Try to use existent file (checkpoint maker
957-
* creates it sometime).
965+
* Try to use existent file (checkpoint maker creates it sometimes).
958966
*/
959967
if (*usexistent)
960968
{
@@ -963,7 +971,7 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
963971
{
964972
if (errno != ENOENT)
965973
elog(STOP, "InitOpen(logfile %u seg %u) failed: %m",
966-
logId, logSeg);
974+
logId, logSeg);
967975
}
968976
else
969977
return(fd);
@@ -979,33 +987,44 @@ XLogFileInit(uint32 log, uint32 seg, bool *usexistent)
979987
elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
980988
logId, logSeg);
981989

982-
if (lseek(fd, XLogSegSize - 1, SEEK_SET) != (off_t) (XLogSegSize - 1))
983-
elog(STOP, "lseek(logfile %u seg %u) failed: %m",
984-
logId, logSeg);
985-
986-
if (write(fd, "", 1) != 1)
987-
elog(STOP, "write(logfile %u seg %u) failed: %m",
988-
logId, logSeg);
990+
/*
991+
* Zero-fill the file. We have to do this the hard way to ensure that
992+
* all the file space has really been allocated --- on platforms that
993+
* allow "holes" in files, just seeking to the end doesn't allocate
994+
* intermediate space. This way, we know that we have all the space
995+
* and (after the fsync below) that all the indirect blocks are down
996+
* on disk. Therefore, fdatasync(2) will be sufficient to sync future
997+
* writes to the log file.
998+
*/
999+
MemSet(zbuffer, 0, sizeof(zbuffer));
1000+
for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
1001+
{
1002+
if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
1003+
elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
1004+
logId, logSeg);
1005+
}
9891006

9901007
if (pg_fsync(fd) != 0)
9911008
elog(STOP, "fsync(logfile %u seg %u) failed: %m",
9921009
logId, logSeg);
9931010

994-
if (lseek(fd, 0, SEEK_SET) < 0)
995-
elog(STOP, "lseek(logfile %u seg %u off %u) failed: %m",
996-
log, seg, 0);
997-
9981011
close(fd);
9991012

1013+
/*
1014+
* Prefer link() to rename() here just to be sure that we don't overwrite
1015+
* an existing logfile. However, there shouldn't be one, so rename()
1016+
* is an acceptable substitute except for the truly paranoid.
1017+
*/
10001018
#ifndef __BEOS__
10011019
if (link(tpath, path) < 0)
1020+
elog(STOP, "InitRelink(logfile %u seg %u) failed: %m",
1021+
logId, logSeg);
1022+
unlink(tpath);
10021023
#else
10031024
if (rename(tpath, path) < 0)
1004-
#endif
10051025
elog(STOP, "InitRelink(logfile %u seg %u) failed: %m",
10061026
logId, logSeg);
1007-
1008-
unlink(tpath);
1027+
#endif
10091028

10101029
fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
10111030
if (fd < 0)
@@ -2101,7 +2120,8 @@ CreateCheckPoint(bool shutdown)
21012120
/* Grab lock, using larger than normal sleep between tries (1 sec) */
21022121
while (TAS(&(XLogCtl->chkp_lck)))
21032122
{
2104-
S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++, 1000000);
2123+
S_LOCK_SLEEP_INTERVAL(&(XLogCtl->chkp_lck), spins++,
2124+
CHECKPOINT_LOCK_TIMEOUT, 1000000);
21052125
}
21062126

21072127
memset(&checkPoint, 0, sizeof(checkPoint));

src/backend/storage/buffer/bufmgr.c

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.106 2001/01/24 19:43:05 momjian Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.107 2001/02/18 04:39:42 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -1990,6 +1990,9 @@ UnlockBuffers(void)
19901990
}
19911991
}
19921992

1993+
/* Max time to wait to acquire a buffer read or write lock */
1994+
#define BUFFER_LOCK_TIMEOUT (10*60*1000000) /* 10 minutes */
1995+
19931996
void
19941997
LockBuffer(Buffer buffer, int mode)
19951998
{
@@ -2041,7 +2044,7 @@ LockBuffer(Buffer buffer, int mode)
20412044
{
20422045
S_UNLOCK(&(buf->cntx_lock));
20432046
RESUME_INTERRUPTS();
2044-
S_LOCK_SLEEP(&(buf->cntx_lock), i++);
2047+
S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
20452048
HOLD_INTERRUPTS();
20462049
S_LOCK(&(buf->cntx_lock));
20472050
}
@@ -2069,7 +2072,7 @@ LockBuffer(Buffer buffer, int mode)
20692072
}
20702073
S_UNLOCK(&(buf->cntx_lock));
20712074
RESUME_INTERRUPTS();
2072-
S_LOCK_SLEEP(&(buf->cntx_lock), i++);
2075+
S_LOCK_SLEEP(&(buf->cntx_lock), i++, BUFFER_LOCK_TIMEOUT);
20732076
HOLD_INTERRUPTS();
20742077
S_LOCK(&(buf->cntx_lock));
20752078
}

src/backend/storage/buffer/s_lock.c

Lines changed: 40 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.32 2001/01/24 19:43:06 momjian Exp $
11+
* $Header: /cvsroot/pgsql/src/backend/storage/buffer/Attic/s_lock.c,v 1.33 2001/02/18 04:39:42 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -21,23 +21,39 @@
2121
#include "storage/s_lock.h"
2222

2323

24-
/*
24+
/*----------
2525
* Each time we busy spin we select the next element of this array as the
2626
* number of microseconds to wait. This accomplishes pseudo random back-off.
27-
* Values are not critical but 10 milliseconds is a common platform
28-
* granularity.
2927
*
30-
* Total time to cycle through all 20 entries might be about .07 sec,
31-
* so the given value of S_MAX_BUSY results in timeout after ~70 sec.
28+
* Note that on most platforms, specified values will be rounded up to the
29+
* next multiple of a clock tick, which is often ten milliseconds (10000).
30+
* So, we are being way overoptimistic to assume that these different values
31+
* are really different, other than the last. But there are a few platforms
32+
* with better-than-usual timekeeping, and on these we will get pretty good
33+
* pseudo-random behavior.
34+
*
35+
* Total time to cycle through all 20 entries will be at least 100 msec,
36+
* more commonly (10 msec resolution) 220 msec, and on some platforms
37+
* as much as 420 msec (when the remainder of the current tick cycle is
38+
* ignored in deciding when to time out, as on FreeBSD and older Linuxen).
39+
* We use the 100msec figure to figure max_spins, so actual timeouts may
40+
* be as much as four times the nominal value, but will never be less.
41+
*----------
3242
*/
3343
#define S_NSPINCYCLE 20
34-
#define S_MAX_BUSY 1000 * S_NSPINCYCLE
3544

3645
int s_spincycle[S_NSPINCYCLE] =
37-
{ 0, 0, 0, 0, 10000, 0, 0, 0, 10000, 0,
38-
0, 10000, 0, 0, 10000, 0, 10000, 0, 10000, 10000
46+
{ 1, 10, 100, 1000,
47+
10000, 1000, 1000, 1000,
48+
10000, 1000, 1000, 10000,
49+
1000, 1000, 10000, 1000,
50+
10000, 1000, 10000, 30000
3951
};
4052

53+
#define AVG_SPINCYCLE 5000 /* average entry in microsec: 100ms / 20 */
54+
55+
#define DEFAULT_TIMEOUT (100*1000000) /* default timeout: 100 sec */
56+
4157

4258
/*
4359
* s_lock_stuck() - complain about a stuck spinlock
@@ -58,34 +74,40 @@ s_lock_stuck(volatile slock_t *lock, const char *file, const int line)
5874
/*
5975
* s_lock_sleep() - sleep a pseudo-random amount of time, check for timeout
6076
*
61-
* Normally 'microsec' is 0, specifying to use the next s_spincycle[] value.
77+
* The 'timeout' is given in microsec, or may be 0 for "infinity". Note that
78+
* this will be a lower bound (a fairly loose lower bound, on most platforms).
79+
*
80+
* 'microsec' is the number of microsec to delay per loop. Normally
81+
* 'microsec' is 0, specifying to use the next s_spincycle[] value.
6282
* Some callers may pass a nonzero interval, specifying to use exactly that
6383
* delay value rather than a pseudo-random delay.
6484
*/
6585
void
66-
s_lock_sleep(unsigned spins, int microsec,
86+
s_lock_sleep(unsigned spins, int timeout, int microsec,
6787
volatile slock_t *lock,
6888
const char *file, const int line)
6989
{
7090
struct timeval delay;
71-
unsigned max_spins;
7291

7392
if (microsec > 0)
7493
{
7594
delay.tv_sec = 0;
7695
delay.tv_usec = microsec;
77-
/* two-minute timeout in this case */
78-
max_spins = 120000000 / microsec;
7996
}
8097
else
8198
{
8299
delay.tv_sec = 0;
83100
delay.tv_usec = s_spincycle[spins % S_NSPINCYCLE];
84-
max_spins = S_MAX_BUSY;
101+
microsec = AVG_SPINCYCLE; /* use average to figure timeout */
85102
}
86103

87-
if (spins > max_spins)
88-
s_lock_stuck(lock, file, line);
104+
if (timeout > 0)
105+
{
106+
unsigned max_spins = timeout / microsec;
107+
108+
if (spins > max_spins)
109+
s_lock_stuck(lock, file, line);
110+
}
89111

90112
(void) select(0, NULL, NULL, NULL, &delay);
91113
}
@@ -110,7 +132,7 @@ s_lock(volatile slock_t *lock, const char *file, const int line)
110132
*/
111133
while (TAS(lock))
112134
{
113-
s_lock_sleep(spins++, 0, lock, file, line);
135+
s_lock_sleep(spins++, DEFAULT_TIMEOUT, 0, lock, file, line);
114136
CHECK_FOR_INTERRUPTS();
115137
}
116138
}

src/backend/storage/file/fd.c

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.72 2001/02/17 01:00:04 tgl Exp $
10+
* $Header: /cvsroot/pgsql/src/backend/storage/file/fd.c,v 1.73 2001/02/18 04:39:42 tgl Exp $
1111
*
1212
* NOTES:
1313
*
@@ -193,7 +193,7 @@ static char *filepath(char *filename);
193193
static long pg_nofile(void);
194194

195195
/*
196-
* pg_fsync --- same as fsync except does nothing if -F switch was given
196+
* pg_fsync --- same as fsync except does nothing if enableFsync is off
197197
*/
198198
int
199199
pg_fsync(int fd)
@@ -204,6 +204,26 @@ pg_fsync(int fd)
204204
return 0;
205205
}
206206

207+
/*
208+
* pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
209+
*
210+
* Not all platforms have fdatasync; treat as fsync if not available.
211+
*/
212+
int
213+
pg_fdatasync(int fd)
214+
{
215+
if (enableFsync)
216+
{
217+
#ifdef HAVE_FDATASYNC
218+
return fdatasync(fd);
219+
#else
220+
return fsync(fd);
221+
#endif
222+
}
223+
else
224+
return 0;
225+
}
226+
207227
/*
208228
* BasicOpenFile --- same as open(2) except can free other FDs if needed
209229
*

0 commit comments

Comments
 (0)