Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit f0828b2

Browse files
committed
Provide a build-time option to store large relations as single files, rather than dividing them into 1GB segments as has been our longtime practice. This requires working support for large files in the operating system; at least for the time being, it won't be the default.

Zdenek Kotala
1 parent b6912af commit f0828b2

File tree

12 files changed: 595 additions (+595) and 100 deletions (-100).

configure

+446
Large diffs are not rendered by default.

configure.in

+14-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
dnl Process this file with autoconf to produce a configure script.
2-
dnl $PostgreSQL: pgsql/configure.in,v 1.552 2008/02/24 05:21:54 tgl Exp $
2+
dnl $PostgreSQL: pgsql/configure.in,v 1.553 2008/03/10 20:06:27 tgl Exp $
33
dnl
44
dnl Developers, please strive to achieve this order:
55
dnl
@@ -217,6 +217,12 @@ fi
217217
AC_SUBST(DTRACEFLAGS)])
218218
AC_SUBST(enable_dtrace)
219219

220+
#
221+
# Data file segmentation
222+
#
223+
PGAC_ARG_BOOL(enable, segmented-files, yes,
224+
[ --disable-segmented-files disable data file segmentation (requires largefile support)])
225+
220226
#
221227
# C compiler
222228
#
@@ -1411,6 +1417,13 @@ if test $ac_cv_func_fseeko = yes; then
14111417
AC_SYS_LARGEFILE
14121418
fi
14131419

1420+
# Check for largefile support (must be after AC_SYS_LARGEFILE)
1421+
AC_CHECK_SIZEOF([off_t])
1422+
1423+
if test "$ac_cv_sizeof_off_t" -lt 8 -o "$enable_segmented_files" = "yes"; then
1424+
AC_DEFINE([USE_SEGMENTED_FILES], 1, [Define to split data files into 1GB segments.])
1425+
fi
1426+
14141427
# SunOS doesn't handle negative byte comparisons properly with +/- return
14151428
AC_FUNC_MEMCMP
14161429

doc/src/sgml/installation.sgml

+15-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/installation.sgml,v 1.303 2008/03/06 21:37:33 momjian Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/installation.sgml,v 1.304 2008/03/10 20:06:27 tgl Exp $ -->
22

33
<chapter id="installation">
44
<title><![%standalone-include[<productname>PostgreSQL</>]]>
@@ -1025,6 +1025,20 @@ su - postgres
10251025
</listitem>
10261026
</varlistentry>
10271027

1028+
<varlistentry>
1029+
<term><option>--disable-segmented-files</option></term>
1030+
<listitem>
1031+
<para>
1032+
Store large tables as single operating-system files, rather than
1033+
dividing them into 1GB segments as is the default. This option
1034+
is ignored unless the operating system has <quote>largefile</>
1035+
support (which most do, nowadays). It can be helpful to reduce
1036+
the number of file descriptors consumed when working with very
1037+
large tables.
1038+
</para>
1039+
</listitem>
1040+
</varlistentry>
1041+
10281042
<varlistentry>
10291043
<term><option>--disable-spinlocks</option></term>
10301044
<listitem>

doc/src/sgml/storage.sgml

+6-2
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.21 2007/11/23 00:24:12 ishii Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.22 2008/03/10 20:06:27 tgl Exp $ -->
22

33
<chapter id="storage">
44

@@ -138,10 +138,14 @@ Avoid assuming that filenode and table OID are the same.
138138
</caution>
139139

140140
<para>
141-
When a table or index exceeds 1 GB, it is divided into gigabyte-sized
141+
When a table or index exceeds 1 GB, it is normally divided into gigabyte-sized
142142
<firstterm>segments</>. The first segment's file name is the same as the
143143
filenode; subsequent segments are named filenode.1, filenode.2, etc.
144144
This arrangement avoids problems on platforms that have file size limitations.
145+
(But if the platform does not have such a limitation, and
146+
<option>--disable-segmented-files</option> was specified when
147+
<productname>PostgreSQL</> was built, then each table or index is stored
148+
as a single file, without segmentation.)
145149
The contents of tables and indexes are discussed further in
146150
<xref linkend="storage-page-layout">.
147151
</para>

src/backend/storage/file/buffile.c

+20-21
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.29 2008/01/01 19:45:51 momjian Exp $
10+
* $PostgreSQL: pgsql/src/backend/storage/file/buffile.c,v 1.30 2008/03/10 20:06:27 tgl Exp $
1111
*
1212
* NOTES:
1313
*
@@ -38,13 +38,12 @@
3838
#include "storage/buffile.h"
3939

4040
/*
41-
* The maximum safe file size is presumed to be RELSEG_SIZE * BLCKSZ.
42-
* Note we adhere to this limit whether or not LET_OS_MANAGE_FILESIZE
43-
* is defined, although md.c ignores it when that symbol is defined.
44-
* The reason for doing this is that we'd like large temporary BufFiles
45-
* to be spread across multiple tablespaces when available.
41+
* We break BufFiles into gigabyte-sized segments, whether or not
42+
* USE_SEGMENTED_FILES is defined. The reason is that we'd like large
43+
* temporary BufFiles to be spread across multiple tablespaces when available.
4644
*/
47-
#define MAX_PHYSICAL_FILESIZE (RELSEG_SIZE * BLCKSZ)
45+
#define MAX_PHYSICAL_FILESIZE 0x40000000
46+
#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ)
4847

4948
/*
5049
* This data structure represents a buffered file that consists of one or
@@ -56,7 +55,7 @@ struct BufFile
5655
int numFiles; /* number of physical files in set */
5756
/* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
5857
File *files; /* palloc'd array with numFiles entries */
59-
long *offsets; /* palloc'd array with numFiles entries */
58+
off_t *offsets; /* palloc'd array with numFiles entries */
6059

6160
/*
6261
* offsets[i] is the current seek position of files[i]. We use this to
@@ -72,7 +71,7 @@ struct BufFile
7271
* Position as seen by user of BufFile is (curFile, curOffset + pos).
7372
*/
7473
int curFile; /* file index (0..n) part of current pos */
75-
int curOffset; /* offset part of current pos */
74+
off_t curOffset; /* offset part of current pos */
7675
int pos; /* next read/write position in buffer */
7776
int nbytes; /* total # of valid bytes in buffer */
7877
char buffer[BLCKSZ];
@@ -97,7 +96,7 @@ makeBufFile(File firstfile)
9796
file->numFiles = 1;
9897
file->files = (File *) palloc(sizeof(File));
9998
file->files[0] = firstfile;
100-
file->offsets = (long *) palloc(sizeof(long));
99+
file->offsets = (off_t *) palloc(sizeof(off_t));
101100
file->offsets[0] = 0L;
102101
file->isTemp = false;
103102
file->isInterXact = false;
@@ -124,8 +123,8 @@ extendBufFile(BufFile *file)
124123

125124
file->files = (File *) repalloc(file->files,
126125
(file->numFiles + 1) * sizeof(File));
127-
file->offsets = (long *) repalloc(file->offsets,
128-
(file->numFiles + 1) * sizeof(long));
126+
file->offsets = (off_t *) repalloc(file->offsets,
127+
(file->numFiles + 1) * sizeof(off_t));
129128
file->files[file->numFiles] = pfile;
130129
file->offsets[file->numFiles] = 0L;
131130
file->numFiles++;
@@ -279,9 +278,9 @@ BufFileDumpBuffer(BufFile *file)
279278
bytestowrite = file->nbytes - wpos;
280279
if (file->isTemp)
281280
{
282-
long availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
281+
off_t availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
283282

284-
if ((long) bytestowrite > availbytes)
283+
if ((off_t) bytestowrite > availbytes)
285284
bytestowrite = (int) availbytes;
286285
}
287286

@@ -451,10 +450,10 @@ BufFileFlush(BufFile *file)
451450
* impossible seek is attempted.
452451
*/
453452
int
454-
BufFileSeek(BufFile *file, int fileno, long offset, int whence)
453+
BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
455454
{
456455
int newFile;
457-
long newOffset;
456+
off_t newOffset;
458457

459458
switch (whence)
460459
{
@@ -469,7 +468,7 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence)
469468
/*
470469
* Relative seek considers only the signed offset, ignoring
471470
* fileno. Note that large offsets (> 1 gig) risk overflow in this
472-
* add...
471+
* add, unless we have 64-bit off_t.
473472
*/
474473
newFile = file->curFile;
475474
newOffset = (file->curOffset + file->pos) + offset;
@@ -537,7 +536,7 @@ BufFileSeek(BufFile *file, int fileno, long offset, int whence)
537536
}
538537

539538
void
540-
BufFileTell(BufFile *file, int *fileno, long *offset)
539+
BufFileTell(BufFile *file, int *fileno, off_t *offset)
541540
{
542541
*fileno = file->curFile;
543542
*offset = file->curOffset + file->pos;
@@ -558,8 +557,8 @@ int
558557
BufFileSeekBlock(BufFile *file, long blknum)
559558
{
560559
return BufFileSeek(file,
561-
(int) (blknum / RELSEG_SIZE),
562-
(blknum % RELSEG_SIZE) * BLCKSZ,
560+
(int) (blknum / BUFFILE_SEG_SIZE),
561+
(off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
563562
SEEK_SET);
564563
}
565564

@@ -575,7 +574,7 @@ BufFileTellBlock(BufFile *file)
575574
long blknum;
576575

577576
blknum = (file->curOffset + file->pos) / BLCKSZ;
578-
blknum += file->curFile * RELSEG_SIZE;
577+
blknum += file->curFile * BUFFILE_SEG_SIZE;
579578
return blknum;
580579
}
581580

src/backend/storage/file/fd.c

+29-24
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.143 2008/01/01 19:45:51 momjian Exp $
10+
* $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.144 2008/03/10 20:06:27 tgl Exp $
1111
*
1212
* NOTES:
1313
*
@@ -115,21 +115,21 @@ static int max_safe_fds = 32; /* default if not changed */
115115

116116
#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
117117

118-
#define FileUnknownPos (-1L)
118+
#define FileUnknownPos ((off_t) -1)
119119

120120
/* these are the assigned bits in fdstate below: */
121121
#define FD_TEMPORARY (1 << 0) /* T = delete when closed */
122122
#define FD_XACT_TEMPORARY (1 << 1) /* T = delete at eoXact */
123123

124124
typedef struct vfd
125125
{
126-
signed short fd; /* current FD, or VFD_CLOSED if none */
126+
int fd; /* current FD, or VFD_CLOSED if none */
127127
unsigned short fdstate; /* bitflags for VFD's state */
128-
SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
128+
SubTransactionId create_subid; /* for TEMPORARY fds, creating subxact */
129129
File nextFree; /* link to next free VFD, if in freelist */
130130
File lruMoreRecently; /* doubly linked recency-of-use list */
131131
File lruLessRecently;
132-
long seekPos; /* current logical file position */
132+
off_t seekPos; /* current logical file position */
133133
char *fileName; /* name of file, or NULL for unused VFD */
134134
/* NB: fileName is malloc'd, and must be free'd when closing the VFD */
135135
int fileFlags; /* open(2) flags for (re)opening the file */
@@ -544,8 +544,8 @@ LruDelete(File file)
544544
Delete(file);
545545

546546
/* save the seek position */
547-
vfdP->seekPos = (long) lseek(vfdP->fd, 0L, SEEK_CUR);
548-
Assert(vfdP->seekPos != -1L);
547+
vfdP->seekPos = lseek(vfdP->fd, (off_t) 0, SEEK_CUR);
548+
Assert(vfdP->seekPos != (off_t) -1);
549549

550550
/* close the file */
551551
if (close(vfdP->fd))
@@ -616,12 +616,12 @@ LruInsert(File file)
616616
}
617617

618618
/* seek to the right position */
619-
if (vfdP->seekPos != 0L)
619+
if (vfdP->seekPos != (off_t) 0)
620620
{
621-
long returnValue;
621+
off_t returnValue;
622622

623-
returnValue = (long) lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
624-
Assert(returnValue != -1L);
623+
returnValue = lseek(vfdP->fd, vfdP->seekPos, SEEK_SET);
624+
Assert(returnValue != (off_t) -1);
625625
}
626626
}
627627

@@ -1027,9 +1027,10 @@ FileRead(File file, char *buffer, int amount)
10271027

10281028
Assert(FileIsValid(file));
10291029

1030-
DO_DB(elog(LOG, "FileRead: %d (%s) %ld %d %p",
1030+
DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %d %p",
10311031
file, VfdCache[file].fileName,
1032-
VfdCache[file].seekPos, amount, buffer));
1032+
(int64) VfdCache[file].seekPos,
1033+
amount, buffer));
10331034

10341035
returnCode = FileAccess(file);
10351036
if (returnCode < 0)
@@ -1081,9 +1082,10 @@ FileWrite(File file, char *buffer, int amount)
10811082

10821083
Assert(FileIsValid(file));
10831084

1084-
DO_DB(elog(LOG, "FileWrite: %d (%s) %ld %d %p",
1085+
DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %d %p",
10851086
file, VfdCache[file].fileName,
1086-
VfdCache[file].seekPos, amount, buffer));
1087+
(int64) VfdCache[file].seekPos,
1088+
amount, buffer));
10871089

10881090
returnCode = FileAccess(file);
10891091
if (returnCode < 0)
@@ -1146,24 +1148,26 @@ FileSync(File file)
11461148
return pg_fsync(VfdCache[file].fd);
11471149
}
11481150

1149-
long
1150-
FileSeek(File file, long offset, int whence)
1151+
off_t
1152+
FileSeek(File file, off_t offset, int whence)
11511153
{
11521154
int returnCode;
11531155

11541156
Assert(FileIsValid(file));
11551157

1156-
DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
1158+
DO_DB(elog(LOG, "FileSeek: %d (%s) " INT64_FORMAT " " INT64_FORMAT " %d",
11571159
file, VfdCache[file].fileName,
1158-
VfdCache[file].seekPos, offset, whence));
1160+
(int64) VfdCache[file].seekPos,
1161+
(int64) offset, whence));
11591162

11601163
if (FileIsNotOpen(file))
11611164
{
11621165
switch (whence)
11631166
{
11641167
case SEEK_SET:
11651168
if (offset < 0)
1166-
elog(ERROR, "invalid seek offset: %ld", offset);
1169+
elog(ERROR, "invalid seek offset: " INT64_FORMAT,
1170+
(int64) offset);
11671171
VfdCache[file].seekPos = offset;
11681172
break;
11691173
case SEEK_CUR:
@@ -1187,7 +1191,8 @@ FileSeek(File file, long offset, int whence)
11871191
{
11881192
case SEEK_SET:
11891193
if (offset < 0)
1190-
elog(ERROR, "invalid seek offset: %ld", offset);
1194+
elog(ERROR, "invalid seek offset: " INT64_FORMAT,
1195+
(int64) offset);
11911196
if (VfdCache[file].seekPos != offset)
11921197
VfdCache[file].seekPos = lseek(VfdCache[file].fd,
11931198
offset, whence);
@@ -1213,7 +1218,7 @@ FileSeek(File file, long offset, int whence)
12131218
* XXX not actually used but here for completeness
12141219
*/
12151220
#ifdef NOT_USED
1216-
long
1221+
off_t
12171222
FileTell(File file)
12181223
{
12191224
Assert(FileIsValid(file));
@@ -1224,7 +1229,7 @@ FileTell(File file)
12241229
#endif
12251230

12261231
int
1227-
FileTruncate(File file, long offset)
1232+
FileTruncate(File file, off_t offset)
12281233
{
12291234
int returnCode;
12301235

@@ -1237,7 +1242,7 @@ FileTruncate(File file, long offset)
12371242
if (returnCode < 0)
12381243
return returnCode;
12391244

1240-
returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
1245+
returnCode = ftruncate(VfdCache[file].fd, offset);
12411246
return returnCode;
12421247
}
12431248

0 commit comments

Comments
 (0)