Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit e365a58

Browse files
committed
Speed-up build of MCV lists with many distinct values
When building multi-column MCV lists, we compute base frequency for each item, i.e. a product of per-column frequencies for values from the item. As a value may be in multiple groups, the code was scanning the whole array of groups while adding items to the MCV list. This works fine as long as the number of distinct groups is small, but it's easy to trigger trigger O(N^2) behavior, especially after increasing statistics target. This commit precomputes frequencies for values in all columns, so that when computing the base frequency it's enough to make a simple bsearch lookup in the array. Backpatch to 12, where multi-column MCV lists were introduced. Discussion: https://postgr.es/m/20190618205920.qtlzcu73whfpfqne@development
1 parent d5ab9df commit e365a58

File tree

1 file changed

+122
-8
lines changed
  • src/backend/statistics

1 file changed

+122
-8
lines changed

src/backend/statistics/mcv.c

+122-8
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs);
7878
static SortItem *build_distinct_groups(int numrows, SortItem *items,
7979
MultiSortSupport mss, int *ndistinct);
8080

81+
static SortItem **build_column_frequencies(SortItem *groups, int ngroups,
82+
MultiSortSupport mss, int *ncounts);
83+
8184
static int count_distinct_groups(int numrows, SortItem *items,
8285
MultiSortSupport mss);
8386

@@ -242,6 +245,20 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
242245
if (nitems > 0)
243246
{
244247
int j;
248+
SortItem key;
249+
MultiSortSupport tmp;
250+
251+
/* frequencies for values in each attribute */
252+
SortItem **freqs;
253+
int *nfreqs;
254+
255+
/* used to search values */
256+
tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup)
257+
+ sizeof(SortSupportData));
258+
259+
/* compute frequencies for values in each column */
260+
nfreqs = (int *) palloc0(sizeof(int) * numattrs);
261+
freqs = build_column_frequencies(groups, ngroups, mss, nfreqs);
245262

246263
/*
247264
* Allocate the MCV list structure, set the global parameters.
@@ -281,18 +298,26 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs,
281298
item->base_frequency = 1.0;
282299
for (j = 0; j < numattrs; j++)
283300
{
284-
int count = 0;
285-
int k;
301+
SortItem *freq;
286302

287-
for (k = 0; k < ngroups; k++)
288-
{
289-
if (multi_sort_compare_dim(j, &groups[i], &groups[k], mss) == 0)
290-
count += groups[k].count;
291-
}
303+
/* single dimension */
304+
tmp->ndims = 1;
305+
tmp->ssup[0] = mss->ssup[j];
292306

293-
item->base_frequency *= (double) count / numrows;
307+
/* fill search key */
308+
key.values = &groups[i].values[j];
309+
key.isnull = &groups[i].isnull[j];
310+
311+
freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j],
312+
sizeof(SortItem),
313+
multi_sort_compare, tmp);
314+
315+
item->base_frequency *= ((double) freq->count) / numrows;
294316
}
295317
}
318+
319+
pfree(nfreqs);
320+
pfree(freqs);
296321
}
297322

298323
pfree(items);
@@ -419,6 +444,95 @@ build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss,
419444
return groups;
420445
}
421446

447+
/* compare sort items (single dimension) */
448+
static int
449+
sort_item_compare(const void *a, const void *b, void *arg)
450+
{
451+
SortSupport ssup = (SortSupport) arg;
452+
SortItem *ia = (SortItem *) a;
453+
SortItem *ib = (SortItem *) b;
454+
455+
return ApplySortComparator(ia->values[0], ia->isnull[0],
456+
ib->values[0], ib->isnull[0],
457+
ssup);
458+
}
459+
460+
/*
461+
* build_column_frequencies
462+
* compute frequencies of values in each column
463+
*
464+
* This returns an array of SortItems for each attibute the MCV is built
465+
* on, with a frequency (number of occurrences) for each value. This is
466+
* then used to compute "base" frequency of MCV items.
467+
*
468+
* All the memory is allocated in a single chunk, so that a single pfree
469+
* is enough to release it. We do not allocate space for values/isnull
470+
* arrays in the SortItems, because we can simply point into the input
471+
* groups directly.
472+
*/
473+
static SortItem **
474+
build_column_frequencies(SortItem *groups, int ngroups,
475+
MultiSortSupport mss, int *ncounts)
476+
{
477+
int i,
478+
dim;
479+
SortItem **result;
480+
char *ptr;
481+
482+
Assert(groups);
483+
Assert(ncounts);
484+
485+
/* allocate arrays for all columns as a single chunk */
486+
ptr = palloc(MAXALIGN(sizeof(SortItem *) * mss->ndims) +
487+
mss->ndims * MAXALIGN(sizeof(SortItem) * ngroups));
488+
489+
/* initial array of pointers */
490+
result = (SortItem **) ptr;
491+
ptr += MAXALIGN(sizeof(SortItem *) * mss->ndims);
492+
493+
for (dim = 0; dim < mss->ndims; dim++)
494+
{
495+
SortSupport ssup = &mss->ssup[dim];
496+
497+
/* array of values for a single column */
498+
result[dim] = (SortItem *) ptr;
499+
ptr += MAXALIGN(sizeof(SortItem) * ngroups);
500+
501+
/* extract data for the dimension */
502+
for (i = 0; i < ngroups; i++)
503+
{
504+
/* point into the input groups */
505+
result[dim][i].values = &groups[i].values[dim];
506+
result[dim][i].isnull = &groups[i].isnull[dim];
507+
result[dim][i].count = groups[i].count;
508+
}
509+
510+
/* sort the values, deduplicate */
511+
qsort_arg((void *) result[dim], ngroups, sizeof(SortItem),
512+
sort_item_compare, ssup);
513+
514+
/*
515+
* Identify distinct values, compute frequency (there might be
516+
* multiple MCV items containing this value, so we need to sum
517+
* counts from all of them.
518+
*/
519+
ncounts[dim] = 1;
520+
for (i = 1; i < ngroups; i++)
521+
{
522+
if (sort_item_compare(&result[dim][i-1], &result[dim][i], ssup) == 0)
523+
{
524+
result[dim][ncounts[dim]-1].count += result[dim][i].count;
525+
continue;
526+
}
527+
528+
result[dim][ncounts[dim]] = result[dim][i];
529+
530+
ncounts[dim]++;
531+
}
532+
}
533+
534+
return result;
535+
}
422536

423537
/*
424538
* statext_mcv_load

0 commit comments

Comments
 (0)