@@ -155,15 +155,17 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
155
155
numattrs ,
156
156
ngroups ,
157
157
nitems ;
158
-
159
- AttrNumber * attnums = build_attnums_array (attrs , & numattrs );
160
-
158
+ AttrNumber * attnums ;
159
+ double mincount ;
161
160
SortItem * items ;
162
161
SortItem * groups ;
163
162
MCVList * mcvlist = NULL ;
163
+ MultiSortSupport mss ;
164
+
165
+ attnums = build_attnums_array (attrs , & numattrs );
164
166
165
167
/* comparator for all the columns */
166
- MultiSortSupport mss = build_mss (stats , numattrs );
168
+ mss = build_mss (stats , numattrs );
167
169
168
170
/* sort the rows */
169
171
items = build_sorted_items (numrows , & nitems , rows , stats [0 ]-> tupDesc ,
@@ -196,33 +198,28 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
196
198
* per-column frequencies, as if the columns were independent).
197
199
*
198
200
* Using the same algorithm might exclude items that are close to the
199
- * "average" frequency. But it does not say whether the frequency is
200
- * close to base frequency or not. We also need to consider unexpectedly
201
- * uncommon items (compared to base frequency), and the single-column
202
- * algorithm ignores that entirely .
201
+ * "average" frequency of the sample. But that does not say whether the
202
+ * observed frequency is close to the base frequency or not. We also
203
+ * need to consider unexpectedly uncommon items (again, compared to the
204
+ * base frequency), and the single-column algorithm does not have to.
203
205
*
204
- * If we can fit all the items onto the MCV list, do that. Otherwise
205
- * use get_mincount_for_mcv_list to decide which items to keep in the
206
- * MCV list, based on the number of occurrences in the sample .
206
+ * We simply decide how many items to keep by computing the minimum count
207
+ * using get_mincount_for_mcv_list() and then keep all items that seem
208
+ * to be more common than that.
207
209
*/
208
- if (ngroups > nitems )
209
- {
210
- double mincount ;
210
+ mincount = get_mincount_for_mcv_list (numrows , totalrows );
211
211
212
- mincount = get_mincount_for_mcv_list ( numrows , totalrows );
213
-
214
- /*
215
- * Walk the groups until we find the first group with a count below
216
- * the mincount threshold (the index of that group is the number of
217
- * groups we want to keep).
218
- */
219
- for ( i = 0 ; i < nitems ; i ++ )
212
+ /*
213
+ * Walk the groups until we find the first group with a count below
214
+ * the mincount threshold (the index of that group is the number of
215
+ * groups we want to keep).
216
+ */
217
+ for ( i = 0 ; i < nitems ; i ++ )
218
+ {
219
+ if ( groups [ i ]. count < mincount )
220
220
{
221
- if (groups [i ].count < mincount )
222
- {
223
- nitems = i ;
224
- break ;
225
- }
221
+ nitems = i ;
222
+ break ;
226
223
}
227
224
}
228
225
@@ -469,11 +466,12 @@ statext_mcv_load(Oid mvoid)
469
466
* Each attribute has to be processed separately, as we may be mixing different
470
467
* datatypes, with different sort operators, etc.
471
468
*
472
- * We use uint16 values for the indexes in step (3), as we currently don't allow
473
- * more than 8k MCV items anyway, although that's mostly arbitrary limit. We might
474
- * increase this to 65k and still fit into uint16. Furthermore, this limit is on
475
- * the number of distinct values per column, and we usually have few of those
476
- * (and various combinations of them for the those MCV list). So uint16 seems fine.
469
+ * We use uint16 values for the indexes in step (3), as the number of MCV items
470
+ * is limited by the statistics target (which is capped to 10k at the moment).
471
+ * We might increase this to 65k and still fit into uint16, so there's a bit of
472
+ * slack. Furthermore, this limit is on the number of distinct values per column,
473
+ * and we usually have few of those (and various combinations of them for the
474
+ * MCV list). So uint16 seems fine for now.
477
475
*
478
476
* We don't really expect the serialization to save as much space as for
479
477
* histograms, as we are not doing any bucket splits (which is the source
@@ -1322,7 +1320,7 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
1322
1320
* somewhat wasteful as we could do with just a single bit, thus reducing
1323
1321
* the size to ~1/8. It would also allow us to combine bitmaps simply using
1324
1322
* & and |, which should be faster than min/max. The bitmaps are fairly
1325
- * small, though (as we cap the MCV list size to 8k items ).
1323
+ * small, though (thanks to the cap on the MCV list size).
1326
1324
*/
1327
1325
static bool *
1328
1326
mcv_get_match_bitmap (PlannerInfo * root , List * clauses ,
0 commit comments