15
15
*
16
16
*
17
17
* IDENTIFICATION
18
- * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.244 2008/03/08 22:41:38 tgl Exp $
18
+ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.245 2008/03/09 00:32:09 tgl Exp $
19
19
*
20
20
*-------------------------------------------------------------------------
21
21
*/
@@ -567,17 +567,23 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
567
567
* or not it has anything to do with the histogram sort operator. We are
568
568
* essentially using the histogram just as a representative sample. However,
569
569
* small histograms are unlikely to be all that representative, so the caller
570
- * should specify a minimum histogram size to use, and fall back on some
571
- * other approach if this routine fails.
570
+ * should be prepared to fall back on some other estimation approach when the
571
+ * histogram is missing or very small. It may also be prudent to combine this
572
+ * approach with another one when the histogram is small.
572
573
*
573
- * The caller also specifies n_skip, which causes us to ignore the first and
574
- * last n_skip histogram elements, on the grounds that they are outliers and
575
- * hence not very representative. If in doubt, min_hist_size = 100 and
576
- * n_skip = 1 are reasonable values.
574
+ * If the actual histogram size is not at least min_hist_size, we won't bother
575
+ * to do the calculation at all. Also, if the n_skip parameter is > 0, we
576
+ * ignore the first and last n_skip histogram elements, on the grounds that
577
+ * they are outliers and hence not very representative. Typical values for
578
+ * these parameters are 10 and 1.
577
579
*
578
580
* The function result is the selectivity, or -1 if there is no histogram
579
581
* or it's smaller than min_hist_size.
580
582
*
583
+ * The output parameter *hist_size receives the actual histogram size,
584
+ * or zero if no histogram. Callers may use this number to decide how
585
+ * much faith to put in the function result.
586
+ *
581
587
* Note that the result disregards both the most-common-values (if any) and
582
588
* null entries. The caller is expected to combine this result with
583
589
* statistics for those portions of the column population. It may also be
@@ -586,7 +592,8 @@ mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
586
592
double
587
593
histogram_selectivity (VariableStatData * vardata , FmgrInfo * opproc ,
588
594
Datum constval , bool varonleft ,
589
- int min_hist_size , int n_skip )
595
+ int min_hist_size , int n_skip ,
596
+ int * hist_size )
590
597
{
591
598
double result ;
592
599
Datum * values ;
@@ -603,6 +610,7 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
603
610
& values , & nvalues ,
604
611
NULL , NULL ))
605
612
{
613
+ * hist_size = nvalues ;
606
614
if (nvalues >= min_hist_size )
607
615
{
608
616
int nmatch = 0 ;
@@ -626,7 +634,10 @@ histogram_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
626
634
free_attstatsslot (vardata -> atttype , values , nvalues , NULL , 0 );
627
635
}
628
636
else
637
+ {
638
+ * hist_size = 0 ;
629
639
result = -1 ;
640
+ }
630
641
631
642
return result ;
632
643
}
@@ -1117,13 +1128,16 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
1117
1128
* selectivity of the fixed prefix and remainder of pattern
1118
1129
* separately, then combine the two to get an estimate of the
1119
1130
* selectivity for the part of the column population represented by
1120
- * the histogram. We then add up data for any most-common-values
1121
- * values; these are not in the histogram population, and we can get
1122
- * exact answers for them by applying the pattern operator, so there's
1123
- * no reason to approximate. (If the MCVs cover a significant part of
1124
- * the total population, this gives us a big leg up in accuracy.)
1131
+ * the histogram. (For small histograms, we combine these approaches.)
1132
+ *
1133
+ * We then add up data for any most-common values (MCVs); these are
1134
+ * not in the histogram population, and we can get exact answers for
1135
+ * them by applying the pattern operator, so there's no reason to
1136
+ * approximate. (If the MCVs cover a significant part of the total
1137
+ * population, this gives us a big leg up in accuracy.)
1125
1138
*/
1126
1139
Selectivity selec ;
1140
+ int hist_size ;
1127
1141
FmgrInfo opproc ;
1128
1142
double nullfrac ,
1129
1143
mcv_selec ,
@@ -1133,10 +1147,12 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
1133
1147
fmgr_info (get_opcode (operator ), & opproc );
1134
1148
1135
1149
selec = histogram_selectivity (& vardata , & opproc , constval , true,
1136
- 100 , 1 );
1137
- if (selec < 0 )
1150
+ 10 , 1 , & hist_size );
1151
+
1152
+ /* If not at least 100 entries, use the heuristic method */
1153
+ if (hist_size < 100 )
1138
1154
{
1139
- /* Nope, so fake it with the heuristic method */
1155
+ Selectivity heursel ;
1140
1156
Selectivity prefixsel ;
1141
1157
Selectivity restsel ;
1142
1158
@@ -1146,17 +1162,29 @@ patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
1146
1162
else
1147
1163
prefixsel = 1.0 ;
1148
1164
restsel = pattern_selectivity (rest , ptype );
1149
- selec = prefixsel * restsel ;
1150
- }
1151
- else
1152
- {
1153
- /* Yes, but don't believe extremely small or large estimates. */
1154
- if (selec < 0.0001 )
1155
- selec = 0.0001 ;
1156
- else if (selec > 0.9999 )
1157
- selec = 0.9999 ;
1165
+ heursel = prefixsel * restsel ;
1166
+
1167
+ if (selec < 0 ) /* fewer than 10 histogram entries? */
1168
+ selec = heursel ;
1169
+ else
1170
+ {
1171
+ /*
1172
+ * For histogram sizes from 10 to 99, we combine the
1173
+ * histogram and heuristic selectivities, putting increasingly
1174
+ * more trust in the histogram for larger sizes.
1175
+ */
1176
+ double hist_weight = hist_size / 100.0 ;
1177
+
1178
+ selec = selec * hist_weight + heursel * (1.0 - hist_weight );
1179
+ }
1158
1180
}
1159
1181
1182
+ /* In any case, don't believe extremely small or large estimates. */
1183
+ if (selec < 0.0001 )
1184
+ selec = 0.0001 ;
1185
+ else if (selec > 0.9999 )
1186
+ selec = 0.9999 ;
1187
+
1160
1188
/*
1161
1189
* If we have most-common-values info, add up the fractions of the MCV
1162
1190
* entries that satisfy MCV OP PATTERN. These fractions contribute
0 commit comments