Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 4fb694a

Browse files
committed
Improve histogram-filling loop in new compute_array_stats() code.
Do "frac" arithmetic in int64 to prevent overflow with large statistics targets, and improve the comments so people have some chance of understanding how it works. Alexander Korotkov and Tom Lane
1 parent 141b898 commit 4fb694a

File tree

1 file changed

+44
-19
lines changed

1 file changed

+44
-19
lines changed

src/backend/utils/adt/array_typanalyze.c

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -579,9 +579,9 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
579579
{
580580
int num_hist = stats->attr->attstattarget;
581581
DECountItem **sorted_count_items;
582-
int count_item_index;
582+
int j;
583583
int delta;
584-
int frac;
584+
int64 frac;
585585
float4 *hist;
586586

587587
/* num_hist must be at least 2 for the loop below to work */
@@ -594,45 +594,70 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
594594
sorted_count_items = (DECountItem **)
595595
palloc(sizeof(DECountItem *) * count_items_count);
596596
hash_seq_init(&scan_status, count_tab);
597-
count_item_index = 0;
597+
j = 0;
598598
while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
599599
{
600-
sorted_count_items[count_item_index++] = count_item;
600+
sorted_count_items[j++] = count_item;
601601
}
602602
qsort(sorted_count_items, count_items_count,
603603
sizeof(DECountItem *), countitem_compare_count);
604604

605605
/*
606-
* Fill stanumbers with the histogram, followed by the average
607-
* count. This array must be stored in anl_context.
606+
* Prepare to fill stanumbers with the histogram, followed by the
607+
* average count. This array must be stored in anl_context.
608608
*/
609609
hist = (float4 *)
610610
MemoryContextAlloc(stats->anl_context,
611611
sizeof(float4) * (num_hist + 1));
612612
hist[num_hist] = (double) element_no / (double) nonnull_cnt;
613613

614-
/*
615-
* Construct the histogram.
614+
/*----------
615+
* Construct the histogram of distinct-element counts (DECs).
616+
*
617+
* The object of this loop is to copy the min and max DECs to
618+
* hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
619+
* in between (where "evenly-spaced" is with reference to the
620+
* whole input population of arrays). If we had a complete sorted
621+
* array of DECs, one per analyzed row, the i'th hist value would
622+
* come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
623+
* (compare the histogram-making loop in compute_scalar_stats()).
624+
* But instead of that we have the sorted_count_items[] array,
625+
* which holds unique DEC values with their frequencies (that is,
626+
* a run-length-compressed version of the full array). So we
627+
* control advancing through sorted_count_items[] with the
628+
* variable "frac", which is defined as (x - y) * (num_hist - 1),
629+
* where x is the index in the notional DECs array corresponding
630+
* to the start of the next sorted_count_items[] element's run,
631+
* and y is the index in DECs from which we should take the next
632+
* histogram value. We have to advance whenever x <= y, that is
633+
* frac <= 0. The x component is the sum of the frequencies seen
634+
* so far (up through the current sorted_count_items[] element),
635+
* and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
636+
* per the subscript calculation above. (The subscript calculation
637+
* implies dropping any fractional part of y; in this formulation
638+
* that's handled by not advancing until frac reaches 1.)
616639
*
617-
* XXX this needs work: frac could overflow, and it's not clear
618-
* how or why the code works. Even if it does work, it needs
619-
* documented.
640+
* Even though frac has a bounded range, it could overflow int32
641+
* when working with very large statistics targets, so we do that
642+
* math in int64.
643+
*----------
620644
*/
621645
delta = analyzed_rows - 1;
622-
count_item_index = 0;
623-
frac = sorted_count_items[0]->frequency * (num_hist - 1);
646+
j = 0; /* current index in sorted_count_items */
647+
/* Initialize frac for sorted_count_items[0]; y is initially 0 */
648+
frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
624649
for (i = 0; i < num_hist; i++)
625650
{
626651
while (frac <= 0)
627652
{
628-
count_item_index++;
629-
Assert(count_item_index < count_items_count);
630-
frac += sorted_count_items[count_item_index]->frequency * (num_hist - 1);
653+
/* Advance, and update x component of frac */
654+
j++;
655+
frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
631656
}
632-
hist[i] = sorted_count_items[count_item_index]->count;
633-
frac -= delta;
657+
hist[i] = sorted_count_items[j]->count;
658+
frac -= delta; /* update y for upcoming i increment */
634659
}
635-
Assert(count_item_index == count_items_count - 1);
660+
Assert(j == count_items_count - 1);
636661

637662
stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
638663
stats->staop[slot_idx] = extra_data->eq_opr;

0 commit comments

Comments
 (0)