Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7e534ad

Browse files
committed
Fix BRIN cost estimation
The original code was overly optimistic about the cost of scanning a BRIN index, leading to BRIN indexes being selected when they'd be a worse choice than some other index. This complete rewrite should be more accurate. Author: David Rowley, based on an earlier patch by Emre Hasegeli Reviewed-by: Emre Hasegeli Discussion: https://postgr.es/m/CAKJS1f9n-Wapop5Xz1dtGdpdqmzeGqQK4sV2MK-zZugfC14Xtw@mail.gmail.com
1 parent b2ff37d commit 7e534ad

File tree

5 files changed

+248
-26
lines changed

5 files changed

+248
-26
lines changed

src/backend/access/brin/brin.c

+21
Original file line numberDiff line numberDiff line change
@@ -1049,6 +1049,27 @@ brin_free_desc(BrinDesc *bdesc)
10491049
MemoryContextDelete(bdesc->bd_context);
10501050
}
10511051

1052+
/*
1053+
* Fetch index's statistical data into *stats
1054+
*/
1055+
void
1056+
brinGetStats(Relation index, BrinStatsData *stats)
1057+
{
1058+
Buffer metabuffer;
1059+
Page metapage;
1060+
BrinMetaPageData *metadata;
1061+
1062+
metabuffer = ReadBuffer(index, BRIN_METAPAGE_BLKNO);
1063+
LockBuffer(metabuffer, BUFFER_LOCK_SHARE);
1064+
metapage = BufferGetPage(metabuffer);
1065+
metadata = (BrinMetaPageData *) PageGetContents(metapage);
1066+
1067+
stats->pagesPerRange = metadata->pagesPerRange;
1068+
stats->revmapNumPages = metadata->lastRevmapPage - 1;
1069+
1070+
UnlockReleaseBuffer(metabuffer);
1071+
}
1072+
10521073
/*
10531074
* Initialize a BrinBuildState appropriate to create tuples on the given index.
10541075
*/

src/backend/utils/adt/selfuncs.c

+171-26
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101
#include <float.h>
102102
#include <math.h>
103103

104+
#include "access/brin.h"
104105
#include "access/gin.h"
105106
#include "access/htup_details.h"
106107
#include "access/sysattr.h"
@@ -7721,56 +7722,200 @@ brincostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
77217722
{
77227723
IndexOptInfo *index = path->indexinfo;
77237724
List *indexQuals = path->indexquals;
7724-
List *indexOrderBys = path->indexorderbys;
77257725
double numPages = index->pages;
7726-
double numTuples = index->tuples;
7726+
RelOptInfo *baserel = index->rel;
7727+
RangeTblEntry *rte = planner_rt_fetch(baserel->relid, root);
77277728
List *qinfos;
77287729
Cost spc_seq_page_cost;
77297730
Cost spc_random_page_cost;
7730-
double qual_op_cost;
77317731
double qual_arg_cost;
7732+
double qualSelectivity;
7733+
BrinStatsData statsData;
7734+
double indexRanges;
7735+
double minimalRanges;
7736+
double estimatedRanges;
7737+
double selec;
7738+
Relation indexRel;
7739+
ListCell *l;
7740+
VariableStatData vardata;
77327741

7733-
/* Do preliminary analysis of indexquals */
7734-
qinfos = deconstruct_indexquals(path);
7742+
Assert(rte->rtekind == RTE_RELATION);
77357743

7736-
/* fetch estimated page cost for tablespace containing index */
7744+
/* fetch estimated page cost for the tablespace containing the index */
77377745
get_tablespace_page_costs(index->reltablespace,
77387746
&spc_random_page_cost,
77397747
&spc_seq_page_cost);
77407748

77417749
/*
7742-
* BRIN indexes are always read in full; use that as startup cost.
7750+
* Obtain some data from the index itself.
7751+
*/
7752+
indexRel = index_open(index->indexoid, AccessShareLock);
7753+
brinGetStats(indexRel, &statsData);
7754+
index_close(indexRel, AccessShareLock);
7755+
7756+
/*
7757+
* Compute index correlation
77437758
*
7744-
* XXX maybe only include revmap pages here?
7759+
* Because we can use all index quals equally when scanning, we can use
7760+
* the largest correlation (in absolute value) among columns used by the
7761+
* query. Start at zero, the worst possible case. If we cannot find
7762+
* any correlation statistics, we will keep it as 0.
7763+
*/
7764+
*indexCorrelation = 0;
7765+
7766+
qinfos = deconstruct_indexquals(path);
7767+
foreach(l, qinfos)
7768+
{
7769+
IndexQualInfo *qinfo = (IndexQualInfo *) lfirst(l);
7770+
AttrNumber attnum = index->indexkeys[qinfo->indexcol];
7771+
7772+
/* attempt to lookup stats in relation for this index column */
7773+
if (attnum != 0)
7774+
{
7775+
/* Simple variable -- look to stats for the underlying table */
7776+
if (get_relation_stats_hook &&
7777+
(*get_relation_stats_hook) (root, rte, attnum, &vardata))
7778+
{
7779+
/*
7780+
* The hook took control of acquiring a stats tuple. If it
7781+
* did supply a tuple, it'd better have supplied a freefunc.
7782+
*/
7783+
if (HeapTupleIsValid(vardata.statsTuple) && !vardata.freefunc)
7784+
elog(ERROR,
7785+
"no function provided to release variable stats with");
7786+
}
7787+
else
7788+
{
7789+
vardata.statsTuple =
7790+
SearchSysCache3(STATRELATTINH,
7791+
ObjectIdGetDatum(rte->relid),
7792+
Int16GetDatum(attnum),
7793+
BoolGetDatum(false));
7794+
vardata.freefunc = ReleaseSysCache;
7795+
}
7796+
}
7797+
else
7798+
{
7799+
/*
7800+
* Looks like we've found an expression column in the index. Let's
7801+
* see if there's any stats for it.
7802+
*/
7803+
7804+
/* get the attnum from the 0-based index. */
7805+
attnum = qinfo->indexcol + 1;
7806+
7807+
if (get_index_stats_hook &&
7808+
(*get_index_stats_hook) (root, index->indexoid, attnum, &vardata))
7809+
{
7810+
/*
7811+
* The hook took control of acquiring a stats tuple. If it did
7812+
* supply a tuple, it'd better have supplied a freefunc.
7813+
*/
7814+
if (HeapTupleIsValid(vardata.statsTuple) &&
7815+
!vardata.freefunc)
7816+
elog(ERROR, "no function provided to release variable stats with");
7817+
}
7818+
else
7819+
{
7820+
vardata.statsTuple = SearchSysCache3(STATRELATTINH,
7821+
ObjectIdGetDatum(index->indexoid),
7822+
Int16GetDatum(attnum),
7823+
BoolGetDatum(false));
7824+
vardata.freefunc = ReleaseSysCache;
7825+
}
7826+
}
7827+
7828+
if (HeapTupleIsValid(vardata.statsTuple))
7829+
{
7830+
float4 *numbers;
7831+
int nnumbers;
7832+
7833+
if (get_attstatsslot(vardata.statsTuple, InvalidOid, 0,
7834+
STATISTIC_KIND_CORRELATION,
7835+
InvalidOid,
7836+
NULL,
7837+
NULL, NULL,
7838+
&numbers, &nnumbers))
7839+
{
7840+
double varCorrelation = 0.0;
7841+
7842+
if (nnumbers > 0)
7843+
varCorrelation = Abs(numbers[0]);
7844+
7845+
if (varCorrelation > *indexCorrelation)
7846+
*indexCorrelation = varCorrelation;
7847+
7848+
free_attstatsslot(InvalidOid, NULL, 0, numbers, nnumbers);
7849+
}
7850+
}
7851+
7852+
ReleaseVariableStats(vardata);
7853+
}
7854+
7855+
qualSelectivity = clauselist_selectivity(root, indexQuals,
7856+
baserel->relid,
7857+
JOIN_INNER, NULL,
7858+
baserel);
7859+
7860+
/* work out the actual number of ranges in the index */
7861+
indexRanges = Max(ceil((double) baserel->pages / statsData.pagesPerRange),
7862+
1.0);
7863+
7864+
/*
7865+
* Now calculate the minimum possible ranges we could match with if all of
7866+
* the rows were in the perfect order in the table's heap.
77457867
*/
7746-
*indexStartupCost = spc_seq_page_cost * numPages * loop_count;
7868+
minimalRanges = ceil(indexRanges * qualSelectivity);
77477869

77487870
/*
7749-
* To read a BRIN index there might be a bit of back and forth over
7750-
* regular pages, as revmap might point to them out of sequential order;
7751-
* calculate this as reading the whole index in random order.
7871+
* Now estimate the number of ranges that we'll touch by using the
7872+
* indexCorrelation from the stats. Careful not to divide by zero
7873+
* (note we're using the absolute value of the correlation).
77527874
*/
7753-
*indexTotalCost = spc_random_page_cost * numPages * loop_count;
7875+
if (*indexCorrelation < 1.0e-10)
7876+
estimatedRanges = indexRanges;
7877+
else
7878+
estimatedRanges = Min(minimalRanges / *indexCorrelation, indexRanges);
77547879

7755-
*indexSelectivity =
7756-
clauselist_selectivity(root, indexQuals,
7757-
path->indexinfo->rel->relid,
7758-
JOIN_INNER, NULL,
7759-
path->indexinfo->rel);
7760-
*indexCorrelation = 1;
7880+
/* we expect to visit this portion of the table */
7881+
selec = estimatedRanges / indexRanges;
7882+
7883+
CLAMP_PROBABILITY(selec);
7884+
7885+
*indexSelectivity = selec;
77617886

77627887
/*
7763-
* Add on index qual eval costs, much as in genericcostestimate.
7888+
* Compute the index qual costs, much as in genericcostestimate, to add
7889+
* to the index costs.
77647890
*/
77657891
qual_arg_cost = other_operands_eval_cost(root, qinfos) +
77667892
orderby_operands_eval_cost(root, path);
7767-
qual_op_cost = cpu_operator_cost *
7768-
(list_length(indexQuals) + list_length(indexOrderBys));
77697893

7894+
/*
7895+
* Compute the startup cost as the cost to read the whole revmap
7896+
* sequentially, including the cost to execute the index quals.
7897+
*/
7898+
*indexStartupCost =
7899+
spc_seq_page_cost * statsData.revmapNumPages * loop_count;
77707900
*indexStartupCost += qual_arg_cost;
7771-
*indexTotalCost += qual_arg_cost;
7772-
*indexTotalCost += (numTuples * *indexSelectivity) * (cpu_index_tuple_cost + qual_op_cost);
7773-
*indexPages = index->pages;
77747901

7775-
/* XXX what about pages_per_range? */
7902+
/*
7903+
* To read a BRIN index there might be a bit of back and forth over
7904+
* regular pages, as revmap might point to them out of sequential order;
7905+
* calculate the total cost as reading the whole index in random order.
7906+
*/
7907+
*indexTotalCost = *indexStartupCost +
7908+
spc_random_page_cost * (numPages - statsData.revmapNumPages) * loop_count;
7909+
7910+
/*
7911+
* Charge a small amount per range tuple which we expect to match to. This
7912+
* is meant to reflect the costs of manipulating the bitmap. The BRIN scan
7913+
* will set a bit for each page in the range when we find a matching
7914+
* range, so we must multiply the charge by the number of pages in the
7915+
* range.
7916+
*/
7917+
*indexTotalCost += 0.1 * cpu_operator_cost * estimatedRanges *
7918+
statsData.pagesPerRange;
7919+
7920+
*indexPages = index->pages;
77767921
}

src/include/access/brin.h

+14
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,17 @@ typedef struct BrinOptions
2525
bool autosummarize;
2626
} BrinOptions;
2727

28+
29+
/*
30+
* BrinStatsData represents stats data for planner use
31+
*/
32+
typedef struct BrinStatsData
33+
{
34+
BlockNumber pagesPerRange;
35+
BlockNumber revmapNumPages;
36+
} BrinStatsData;
37+
38+
2839
#define BRIN_DEFAULT_PAGES_PER_RANGE 128
2940
#define BrinGetPagesPerRange(relation) \
3041
((relation)->rd_options ? \
@@ -35,4 +46,7 @@ typedef struct BrinOptions
3546
((BrinOptions *) (relation)->rd_options)->autosummarize : \
3647
false)
3748

49+
50+
extern void brinGetStats(Relation index, BrinStatsData *stats);
51+
3852
#endif /* BRIN_H */

src/test/regress/expected/brin.out

+26
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,8 @@ BEGIN
363363
END LOOP;
364364
END;
365365
$x$;
366+
RESET enable_seqscan;
367+
RESET enable_bitmapscan;
366368
INSERT INTO brintest SELECT
367369
repeat(stringu1, 42)::bytea,
368370
substr(stringu1, 1, 1)::"char",
@@ -481,3 +483,27 @@ SELECT brin_summarize_range('brin_summarize_idx', -1);
481483
ERROR: block number out of range: -1
482484
SELECT brin_summarize_range('brin_summarize_idx', 4294967296);
483485
ERROR: block number out of range: 4294967296
486+
-- test brin cost estimates behave sanely based on correlation of values
487+
CREATE TABLE brin_test (a INT, b INT);
488+
INSERT INTO brin_test SELECT x/100,x%100 FROM generate_series(1,10000) x(x);
489+
CREATE INDEX brin_test_a_idx ON brin_test USING brin (a) WITH (pages_per_range = 2);
490+
CREATE INDEX brin_test_b_idx ON brin_test USING brin (b) WITH (pages_per_range = 2);
491+
VACUUM ANALYZE brin_test;
492+
-- Ensure brin index is used when columns are perfectly correlated
493+
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
494+
QUERY PLAN
495+
--------------------------------------------
496+
Bitmap Heap Scan on brin_test
497+
Recheck Cond: (a = 1)
498+
-> Bitmap Index Scan on brin_test_a_idx
499+
Index Cond: (a = 1)
500+
(4 rows)
501+
502+
-- Ensure brin index is not used when values are not correlated
503+
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;
504+
QUERY PLAN
505+
-----------------------
506+
Seq Scan on brin_test
507+
Filter: (b = 1)
508+
(2 rows)
509+

src/test/regress/sql/brin.sql

+16
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,9 @@ BEGIN
370370
END;
371371
$x$;
372372

373+
RESET enable_seqscan;
374+
RESET enable_bitmapscan;
375+
373376
INSERT INTO brintest SELECT
374377
repeat(stringu1, 42)::bytea,
375378
substr(stringu1, 1, 1)::"char",
@@ -444,3 +447,16 @@ SELECT brin_summarize_range('brin_summarize_idx', 4294967295);
444447
-- invalid block number values
445448
SELECT brin_summarize_range('brin_summarize_idx', -1);
446449
SELECT brin_summarize_range('brin_summarize_idx', 4294967296);
450+
451+
452+
-- test brin cost estimates behave sanely based on correlation of values
453+
CREATE TABLE brin_test (a INT, b INT);
454+
INSERT INTO brin_test SELECT x/100,x%100 FROM generate_series(1,10000) x(x);
455+
CREATE INDEX brin_test_a_idx ON brin_test USING brin (a) WITH (pages_per_range = 2);
456+
CREATE INDEX brin_test_b_idx ON brin_test USING brin (b) WITH (pages_per_range = 2);
457+
VACUUM ANALYZE brin_test;
458+
459+
-- Ensure brin index is used when columns are perfectly correlated
460+
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE a = 1;
461+
-- Ensure brin index is not used when values are not correlated
462+
EXPLAIN (COSTS OFF) SELECT * FROM brin_test WHERE b = 1;

0 commit comments

Comments
 (0)