Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit f4230d2

Browse files
committed
Change patternsel() so that instead of switching from a pure pattern-examination heuristic method to purely histogram-driven selectivity at histogram size 100, we compute both estimates and use a weighted average. The weight put on the heuristic estimate decreases linearly with histogram size, dropping to zero for 100 or more histogram entries. Likewise in ltreeparentsel(), where the histogram estimate is averaged with the default selectivity in the same way. After a patch by Greg Stark, though I reorganized the logic a bit to give the caller of histogram_selectivity() more control.
1 parent 422495d commit f4230d2

File tree

3 files changed

+75
-35
lines changed

3 files changed

+75
-35
lines changed

contrib/ltree/ltree_op.c

+19-8
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/*
22
* op function for ltree
33
* Teodor Sigaev <teodor@stack.net>
4-
* $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.16 2007/02/28 22:44:38 tgl Exp $
4+
* $PostgreSQL: pgsql/contrib/ltree/ltree_op.c,v 1.17 2008/03/09 00:32:09 tgl Exp $
55
*/
66

77
#include "ltree.h"
@@ -609,6 +609,7 @@ ltreeparentsel(PG_FUNCTION_ARGS)
609609
double mcvsum;
610610
double mcvsel;
611611
double nullfrac;
612+
int hist_size;
612613

613614
fmgr_info(get_opcode(operator), &contproc);
614615

@@ -626,21 +627,31 @@ ltreeparentsel(PG_FUNCTION_ARGS)
626627
*/
627628
selec = histogram_selectivity(&vardata, &contproc,
628629
constval, varonleft,
629-
100, 1);
630+
10, 1, &hist_size);
630631
if (selec < 0)
631632
{
632633
/* Nope, fall back on default */
633634
selec = DEFAULT_PARENT_SEL;
634635
}
635-
else
636+
else if (hist_size < 100)
636637
{
637-
/* Yes, but don't believe extremely small or large estimates. */
638-
if (selec < 0.0001)
639-
selec = 0.0001;
640-
else if (selec > 0.9999)
641-
selec = 0.9999;
638+
/*
639+
* For histogram sizes from 10 to 100, we combine the
640+
* histogram and default selectivities, putting increasingly
641+
* more trust in the histogram for larger sizes.
642+
*/
643+
double hist_weight = hist_size / 100.0;
644+
645+
selec = selec * hist_weight +
646+
DEFAULT_PARENT_SEL * (1.0 - hist_weight);
642647
}
643648

649+
/* In any case, don't believe extremely small or large estimates. */
650+
if (selec < 0.0001)
651+
selec = 0.0001;
652+
else if (selec > 0.9999)
653+
selec = 0.9999;
654+
644655
if (HeapTupleIsValid(vardata.statsTuple))
645656
nullfrac = ((Form_pg_statistic) GETSTRUCT(vardata.statsTuple))->stanullfrac;
646657
else

src/backend/utils/adt/selfuncs.c

+53-25
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
18+
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.245 2008/03/09 00:32:09 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -567,17 +567,23 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
567567
* or not it has anything to do with the histogram sort operator. We are
568568
* essentially using the histogram just as a representative sample. However,
569569
* small histograms are unlikely to be all that representative, so the caller
570-
* should specify a minimum histogram size to use, and fall back on some
571-
* other approach if this routine fails.
570+
* should be prepared to fall back on some other estimation approach when the
571+
* histogram is missing or very small. It may also be prudent to combine this
572+
* approach with another one when the histogram is small.
572573
*
573-
* The caller also specifies n_skip, which causes us to ignore the first and
574-
* last n_skip histogram elements, on the grounds that they are outliers and
575-
* hence not very representative. If in doubt, min_hist_size = 100 and
576-
* n_skip = 1 are reasonable values.
574+
* If the actual histogram size is not at least min_hist_size, we won't bother
575+
* to do the calculation at all. Also, if the n_skip parameter is > 0, we
576+
* ignore the first and last n_skip histogram elements, on the grounds that
577+
* they are outliers and hence not very representative. Typical values for
578+
* these parameters are 10 and 1.
577579
*
578580
* The function result is the selectivity, or -1 if there is no histogram
579581
* or it's smaller than min_hist_size.
580582
*
583+
* The output parameter *hist_size receives the actual histogram size,
584+
* or zero if no histogram. Callers may use this number to decide how
585+
* much faith to put in the function result.
586+
*
581587
* Note that the result disregards both the most-common-values (if any) and
582588
* null entries. The caller is expected to combine this result with
583589
* statistics for those portions of the column population. It may also be
@@ -586,7 +592,8 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
586592
double
587593
histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
588594
Datum constval, bool varonleft,
589-
int min_hist_size, int n_skip)
595+
int min_hist_size, int n_skip,
596+
int *hist_size)
590597
{
591598
double result;
592599
Datum *values;
@@ -603,6 +610,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
603610
&values, &nvalues,
604611
NULL, NULL))
605612
{
613+
*hist_size = nvalues;
606614
if (nvalues >= min_hist_size)
607615
{
608616
int nmatch = 0;
@@ -626,7 +634,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
626634
free_attstatsslot(vardata->atttype, values, nvalues, NULL, 0);
627635
}
628636
else
637+
{
638+
*hist_size = 0;
629639
result = -1;
640+
}
630641

631642
return result;
632643
}
@@ -1117,13 +1128,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11171128
* selectivity of the fixed prefix and remainder of pattern
11181129
* separately, then combine the two to get an estimate of the
11191130
* selectivity for the part of the column population represented by
1120-
* the histogram. We then add up data for any most-common-values
1121-
* values; these are not in the histogram population, and we can get
1122-
* exact answers for them by applying the pattern operator, so there's
1123-
* no reason to approximate. (If the MCVs cover a significant part of
1124-
* the total population, this gives us a big leg up in accuracy.)
1131+
* the histogram. (For small histograms, we combine these approaches.)
1132+
*
1133+
* We then add up data for any most-common values (MCVs); these are
1134+
* not in the histogram population, and we can get exact answers for
1135+
* them by applying the pattern operator, so there's no reason to
1136+
* approximate. (If the MCVs cover a significant part of the total
1137+
* population, this gives us a big leg up in accuracy.)
11251138
*/
11261139
Selectivity selec;
1140+
int hist_size;
11271141
FmgrInfo opproc;
11281142
double nullfrac,
11291143
mcv_selec,
@@ -1133,10 +1147,12 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11331147
fmgr_info(get_opcode(operator), &opproc);
11341148

11351149
selec = histogram_selectivity(&vardata, &opproc, constval, true,
1136-
100, 1);
1137-
if (selec < 0)
1150+
10, 1, &hist_size);
1151+
1152+
/* If fewer than 100 entries, compute the heuristic estimate as well */
1153+
if (hist_size < 100)
11381154
{
1139-
/* Nope, so fake it with the heuristic method */
1155+
Selectivity heursel;
11401156
Selectivity prefixsel;
11411157
Selectivity restsel;
11421158

@@ -1146,17 +1162,29 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
11461162
else
11471163
prefixsel = 1.0;
11481164
restsel = pattern_selectivity(rest, ptype);
1149-
selec = prefixsel * restsel;
1150-
}
1151-
else
1152-
{
1153-
/* Yes, but don't believe extremely small or large estimates. */
1154-
if (selec < 0.0001)
1155-
selec = 0.0001;
1156-
else if (selec > 0.9999)
1157-
selec = 0.9999;
1165+
heursel = prefixsel * restsel;
1166+
1167+
if (selec < 0) /* fewer than 10 histogram entries? */
1168+
selec = heursel;
1169+
else
1170+
{
1171+
/*
1172+
* For histogram sizes from 10 to 100, we combine the
1173+
* histogram and heuristic selectivities, putting increasingly
1174+
* more trust in the histogram for larger sizes.
1175+
*/
1176+
double hist_weight = hist_size / 100.0;
1177+
1178+
selec = selec * hist_weight + heursel * (1.0 - hist_weight);
1179+
}
11581180
}
11591181

1182+
/* In any case, don't believe extremely small or large estimates. */
1183+
if (selec < 0.0001)
1184+
selec = 0.0001;
1185+
else if (selec > 0.9999)
1186+
selec = 0.9999;
1187+
11601188
/*
11611189
* If we have most-common-values info, add up the fractions of the MCV
11621190
* entries that satisfy MCV OP PATTERN. These fractions contribute

src/include/utils/selfuncs.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
99
* Portions Copyright (c) 1994, Regents of the University of California
1010
*
11-
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.43 2008/01/01 19:45:59 momjian Exp $
11+
* $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.44 2008/03/09 00:32:09 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -112,7 +112,8 @@ extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
112112
double *sumcommonp);
113113
extern double histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
114114
Datum constval, bool varonleft,
115-
int min_hist_size, int n_skip);
115+
int min_hist_size, int n_skip,
116+
int *hist_size);
116117

117118
extern Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
118119
Pattern_Type ptype,

0 commit comments

Comments
 (0)