
Commit 1aba62e

Allow per-tablespace effective_io_concurrency
Per discussion, nowadays it is possible to have tablespaces that have
wildly different I/O characteristics from others.  Setting different
effective_io_concurrency parameters for those has been measured to
improve performance.

Author: Julien Rouhaud
Reviewed by: Andres Freund
1 parent 665a00c commit 1aba62e
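
For context, a minimal usage sketch of what this commit enables; the tablespace name fast_ssd and the value 32 are hypothetical, only the parameter name comes from this commit:

    -- set a per-tablespace override for a tablespace on faster storage
    ALTER TABLESPACE fast_ssd SET (effective_io_concurrency = 32);

    -- drop the override and fall back to the server-wide GUC
    ALTER TABLESPACE fast_ssd RESET (effective_io_concurrency);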

12 files changed, +145 -63 lines

doc/src/sgml/config.sgml  (+4 -1)

@@ -1901,7 +1901,10 @@ include_dir 'conf.d'
         </para>
 
         <para>
-         The default is 1 on supported systems, otherwise 0.
+         The default is 1 on supported systems, otherwise 0. This value can
+         be overriden for tables in a particular tablespace by setting the
+         tablespace parameter of the same name (see
+         <xref linkend="sql-altertablespace">).
         </para>
        </listitem>
       </varlistentry>

doc/src/sgml/ref/create_tablespace.sgml  (+9 -8)

@@ -104,14 +104,15 @@ CREATE TABLESPACE <replaceable class="parameter">tablespace_name</replaceable>
     <listitem>
      <para>
       A tablespace parameter to be set or reset.  Currently, the only
-      available parameters are <varname>seq_page_cost</> and
-      <varname>random_page_cost</>.  Setting either value for a particular
-      tablespace will override the planner's usual estimate of the cost of
-      reading pages from tables in that tablespace, as established by
-      the configuration parameters of the same name (see
-      <xref linkend="guc-seq-page-cost">,
-      <xref linkend="guc-random-page-cost">).  This may be useful if one
-      tablespace is located on a disk which is faster or slower than the
+      available parameters are <varname>seq_page_cost</>,
+      <varname>random_page_cost</> and <varname>effective_io_concurrency</>.
+      Setting either value for a particular tablespace will override the
+      planner's usual estimate of the cost of reading pages from tables in
+      that tablespace, as established by the configuration parameters of the
+      same name (see <xref linkend="guc-seq-page-cost">,
+      <xref linkend="guc-random-page-cost">,
+      <xref linkend="guc-effective-io-concurrency">).  This may be useful if
+      one tablespace is located on a disk which is faster or slower than the
       remainder of the I/O subsystem.
      </para>
     </listitem>

src/backend/access/common/reloptions.c  (+15 -1)

@@ -254,6 +254,19 @@ static relopt_int intRelOpts[] =
         },
         -1, 64, MAX_KILOBYTES
     },
+    {
+        {
+            "effective_io_concurrency",
+            "Number of simultaneous requests that can be handled efficiently by the disk subsystem.",
+            RELOPT_KIND_TABLESPACE,
+            AccessExclusiveLock
+        },
+#ifdef USE_PREFETCH
+        -1, 0, MAX_IO_CONCURRENCY
+#else
+        0, 0, 0
+#endif
+    },
 
     /* list terminator */
     {{NULL}}
@@ -1438,7 +1451,8 @@ tablespace_reloptions(Datum reloptions, bool validate)
     int         numoptions;
     static const relopt_parse_elt tab[] = {
         {"random_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, random_page_cost)},
-        {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)}
+        {"seq_page_cost", RELOPT_TYPE_REAL, offsetof(TableSpaceOpts, seq_page_cost)},
+        {"effective_io_concurrency", RELOPT_TYPE_INT, offsetof(TableSpaceOpts, effective_io_concurrency)}
     };
 
     options = parseRelOptions(reloptions, validate, RELOPT_KIND_TABLESPACE,

src/backend/executor/nodeBitmapHeapscan.c  (+27 -8)

@@ -44,6 +44,7 @@
 #include "storage/predicate.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
+#include "utils/spccache.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"
 
@@ -95,9 +96,8 @@ BitmapHeapNext(BitmapHeapScanState *node)
      * prefetching. node->prefetch_pages tracks exactly how many pages ahead
      * the prefetch iterator is.  Also, node->prefetch_target tracks the
      * desired prefetch distance, which starts small and increases up to the
-     * GUC-controlled maximum, target_prefetch_pages.  This is to avoid doing
-     * a lot of prefetching in a scan that stops after a few tuples because of
-     * a LIMIT.
+     * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
+     * a scan that stops after a few tuples because of a LIMIT.
      */
     if (tbm == NULL)
     {
@@ -111,7 +111,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
         node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
-        if (target_prefetch_pages > 0)
+        if (node->prefetch_maximum > 0)
         {
             node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
             node->prefetch_pages = 0;
@@ -188,10 +188,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
              * page/tuple, then to one after the second tuple is fetched, then
              * it doubles as later pages are fetched.
              */
-            if (node->prefetch_target >= target_prefetch_pages)
+            if (node->prefetch_target >= node->prefetch_maximum)
                  /* don't increase any further */ ;
-            else if (node->prefetch_target >= target_prefetch_pages / 2)
-                node->prefetch_target = target_prefetch_pages;
+            else if (node->prefetch_target >= node->prefetch_maximum / 2)
+                node->prefetch_target = node->prefetch_maximum;
             else if (node->prefetch_target > 0)
                 node->prefetch_target *= 2;
             else
@@ -211,7 +211,7 @@ BitmapHeapNext(BitmapHeapScanState *node)
              * Try to prefetch at least a few pages even before we get to the
              * second page if we don't stop reading after the first tuple.
              */
-            if (node->prefetch_target < target_prefetch_pages)
+            if (node->prefetch_target < node->prefetch_maximum)
                 node->prefetch_target++;
 #endif   /* USE_PREFETCH */
         }
@@ -539,6 +539,7 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 {
     BitmapHeapScanState *scanstate;
     Relation    currentRelation;
+    int         io_concurrency;
 
     /* check for unsupported flags */
     Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
@@ -564,6 +565,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
     scanstate->prefetch_iterator = NULL;
     scanstate->prefetch_pages = 0;
     scanstate->prefetch_target = 0;
+    /* may be updated below */
+    scanstate->prefetch_maximum = target_prefetch_pages;
 
     /*
      * Miscellaneous initialization
@@ -598,6 +601,22 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
      */
     currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags);
 
+    /*
+     * Determine the maximum for prefetch_target.  If the tablespace has a
+     * specific IO concurrency set, use that to compute the corresponding
+     * maximum value; otherwise, we already initialized to the value computed
+     * by the GUC machinery.
+     */
+    io_concurrency =
+        get_tablespace_io_concurrency(currentRelation->rd_rel->reltablespace);
+    if (io_concurrency != effective_io_concurrency)
+    {
+        double      maximum;
+
+        if (ComputeIoConcurrency(io_concurrency, &maximum))
+            scanstate->prefetch_maximum = rint(maximum);
+    }
+
     scanstate->ss.ss_currentRelation = currentRelation;
 
     /*

src/backend/storage/buffer/bufmgr.c  (+62 -1)

@@ -80,11 +80,14 @@ bool zero_damaged_pages = false;
 int         bgwriter_lru_maxpages = 100;
 double      bgwriter_lru_multiplier = 2.0;
 bool        track_io_timing = false;
+int         effective_io_concurrency = 0;
 
 /*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
  * ReadBuffer calls by.  This is maintained by the assign hook for
- * effective_io_concurrency.  Zero means "never prefetch".
+ * effective_io_concurrency.  Zero means "never prefetch".  This value is
+ * only used for buffers not belonging to tablespaces that have their
+ * effective_io_concurrency parameter set.
  */
 int         target_prefetch_pages = 0;
 
@@ -415,6 +418,64 @@ static void CheckForBufferLeaks(void);
 static int  rnode_comparator(const void *p1, const void *p2);
 
 
+/*
+ * ComputeIoConcurrency -- get the number of pages to prefetch for a given
+ *      number of spindles.
+ */
+bool
+ComputeIoConcurrency(int io_concurrency, double *target)
+{
+    double      new_prefetch_pages = 0.0;
+    int         i;
+
+    /*
+     * Make sure the io_concurrency value is within valid range; it may have
+     * been forced with a manual pg_tablespace update.
+     */
+    io_concurrency = Min(Max(io_concurrency, 0), MAX_IO_CONCURRENCY);
+
+    /*----------
+     * The user-visible GUC parameter is the number of drives (spindles),
+     * which we need to translate to a number-of-pages-to-prefetch target.
+     * The target value is stashed in *extra and then assigned to the actual
+     * variable by assign_effective_io_concurrency.
+     *
+     * The expected number of prefetch pages needed to keep N drives busy is:
+     *
+     * drives |   I/O requests
+     * -------+----------------
+     *      1 |   1
+     *      2 |   2/1 + 2/2 = 3
+     *      3 |   3/1 + 3/2 + 3/3 = 5 1/2
+     *      4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
+     *      n |   n * H(n)
+     *
+     * This is called the "coupon collector problem" and H(n) is called the
+     * harmonic series.  This could be approximated by n * ln(n), but for
+     * reasonable numbers of drives we might as well just compute the series.
+     *
+     * Alternatively we could set the target to the number of pages necessary
+     * so that the expected number of active spindles is some arbitrary
+     * percentage of the total.  This sounds the same but is actually slightly
+     * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
+     * that desired fraction.
+     *
+     * Experimental results show that both of these formulas aren't aggressive
+     * enough, but we don't really have any better proposals.
+     *
+     * Note that if io_concurrency = 0 (disabled), we must set target = 0.
+     *----------
+     */
+
+    for (i = 1; i <= io_concurrency; i++)
+        new_prefetch_pages += (double) io_concurrency / (double) i;
+
+    *target = new_prefetch_pages;
+
+    /* This range check shouldn't fail, but let's be paranoid */
+    return (new_prefetch_pages > 0.0 && new_prefetch_pages < (double) INT_MAX);
+}
+
 /*
  * PrefetchBuffer -- initiate asynchronous read of a block of a relation
  *
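
As a quick sanity check of the table in the new comment, the prefetch target that ComputeIoConcurrency accumulates is the harmonic-series expression below (a worked instance, not part of the commit):

    target(n) = n \cdot H(n) = n \sum_{i=1}^{n} \frac{1}{i}, \qquad
    target(3) = 3 \left(1 + \tfrac{1}{2} + \tfrac{1}{3}\right) = 5\tfrac{1}{2}

which matches the "3 drives" row of the table above.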

src/backend/utils/cache/spccache.c  (+12)

@@ -23,6 +23,7 @@
 #include "commands/tablespace.h"
 #include "miscadmin.h"
 #include "optimizer/cost.h"
+#include "storage/bufmgr.h"
 #include "utils/catcache.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
@@ -198,3 +199,14 @@ get_tablespace_page_costs(Oid spcid,
             *spc_seq_page_cost = spc->opts->seq_page_cost;
     }
 }
+
+int
+get_tablespace_io_concurrency(Oid spcid)
+{
+    TableSpaceCacheEntry *spc = get_tablespace(spcid);
+
+    if (!spc->opts || spc->opts->effective_io_concurrency < 0)
+        return effective_io_concurrency;
+    else
+        return spc->opts->effective_io_concurrency;
+}
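
For reference, the per-tablespace override that this lookup falls back from is stored in pg_tablespace.spcoptions; a quick way to inspect it (assuming ordinary catalog access):

    -- lists each tablespace with any reloptions set on it,
    -- e.g. {effective_io_concurrency=32} for an overridden one
    SELECT spcname, spcoptions FROM pg_tablespace;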

src/backend/utils/misc/guc.c  (+3 -42)

@@ -490,7 +490,6 @@ static int wal_block_size;
 static bool data_checksums;
 static int  wal_segment_size;
 static bool integer_datetimes;
-static int  effective_io_concurrency;
 static bool assert_enabled;
 
 /* should be static, but commands/variable.c needs to get at this */
@@ -2352,7 +2351,7 @@ static struct config_int ConfigureNamesInt[] =
         },
         &effective_io_concurrency,
 #ifdef USE_PREFETCH
-        1, 0, 1000,
+        1, 0, MAX_IO_CONCURRENCY,
 #else
         0, 0, 0,
 #endif
@@ -9986,47 +9985,9 @@ static bool
 check_effective_io_concurrency(int *newval, void **extra, GucSource source)
 {
 #ifdef USE_PREFETCH
-    double      new_prefetch_pages = 0.0;
-    int         i;
-
-    /*----------
-     * The user-visible GUC parameter is the number of drives (spindles),
-     * which we need to translate to a number-of-pages-to-prefetch target.
-     * The target value is stashed in *extra and then assigned to the actual
-     * variable by assign_effective_io_concurrency.
-     *
-     * The expected number of prefetch pages needed to keep N drives busy is:
-     *
-     * drives |   I/O requests
-     * -------+----------------
-     *      1 |   1
-     *      2 |   2/1 + 2/2 = 3
-     *      3 |   3/1 + 3/2 + 3/3 = 5 1/2
-     *      4 |   4/1 + 4/2 + 4/3 + 4/4 = 8 1/3
-     *      n |   n * H(n)
-     *
-     * This is called the "coupon collector problem" and H(n) is called the
-     * harmonic series.  This could be approximated by n * ln(n), but for
-     * reasonable numbers of drives we might as well just compute the series.
-     *
-     * Alternatively we could set the target to the number of pages necessary
-     * so that the expected number of active spindles is some arbitrary
-     * percentage of the total.  This sounds the same but is actually slightly
-     * different.  The result ends up being ln(1-P)/ln((n-1)/n) where P is
-     * that desired fraction.
-     *
-     * Experimental results show that both of these formulas aren't aggressive
-     * enough, but we don't really have any better proposals.
-     *
-     * Note that if *newval = 0 (disabled), we must set target = 0.
-     *----------
-     */
-
-    for (i = 1; i <= *newval; i++)
-        new_prefetch_pages += (double) *newval / (double) i;
+    double      new_prefetch_pages;
 
-    /* This range check shouldn't fail, but let's be paranoid */
-    if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX)
+    if (ComputeIoConcurrency(*newval, &new_prefetch_pages))
     {
         int        *myextra = (int *) guc_malloc(ERROR, sizeof(int));

src/bin/psql/tab-complete.c  (+1 -1)

@@ -1885,7 +1885,7 @@ psql_completion(const char *text, int start, int end)
              pg_strcasecmp(prev_wd, "(") == 0)
     {
         static const char *const list_TABLESPACEOPTIONS[] =
-        {"seq_page_cost", "random_page_cost", NULL};
+        {"seq_page_cost", "random_page_cost", "effective_io_concurrency", NULL};
 
         COMPLETE_WITH_LIST(list_TABLESPACEOPTIONS);
     }

src/include/commands/tablespace.h  (+1)

@@ -39,6 +39,7 @@ typedef struct TableSpaceOpts
     int32       vl_len_;        /* varlena header (do not touch directly!) */
     float8      random_page_cost;
     float8      seq_page_cost;
+    int         effective_io_concurrency;
 } TableSpaceOpts;
 
 extern Oid  CreateTableSpace(CreateTableSpaceStmt *stmt);

src/include/nodes/execnodes.h  (+3 -1)

@@ -1424,7 +1424,8 @@
  *      lossy_pages        total number of lossy pages retrieved
  *      prefetch_iterator  iterator for prefetching ahead of current page
  *      prefetch_pages     # pages prefetch iterator is ahead of current
- *      prefetch_target    target prefetch distance
+ *      prefetch_target    current target prefetch distance
+ *      prefetch_maximum   maximum value for prefetch_target
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1439,6 +1440,7 @@
     TBMIterator *prefetch_iterator;
     int         prefetch_pages;
     int         prefetch_target;
+    int         prefetch_maximum;
 } BitmapHeapScanState;
 
 /* ----------------

src/include/storage/bufmgr.h  (+7)

@@ -58,11 +58,17 @@ extern int target_prefetch_pages;
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
 
+/* in guc.c */
+extern int  effective_io_concurrency;
+
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
 extern PGDLLIMPORT Block *LocalBufferBlockPointers;
 extern PGDLLIMPORT int32 *LocalRefCount;
 
+/* upper limit for effective_io_concurrency */
+#define MAX_IO_CONCURRENCY  1000
+
 /* special block number for ReadBuffer() */
 #define P_NEW   InvalidBlockNumber      /* grow the file to get a new page */
 
@@ -144,6 +150,7 @@ extern PGDLLIMPORT int32 *LocalRefCount;
 /*
  * prototypes for functions in bufmgr.c
  */
+extern bool ComputeIoConcurrency(int io_concurrency, double *target);
 extern void PrefetchBuffer(Relation reln, ForkNumber forkNum,
                BlockNumber blockNum);
 extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum);

src/include/utils/spccache.h  (+1)

@@ -15,5 +15,6 @@
 
 void        get_tablespace_page_costs(Oid spcid, float8 *spc_random_page_cost,
                               float8 *spc_seq_page_cost);
+int         get_tablespace_io_concurrency(Oid spcid);
 
 #endif   /* SPCCACHE_H */
