@@ -579,9 +579,9 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
579
579
{
580
580
int num_hist = stats -> attr -> attstattarget ;
581
581
DECountItem * * sorted_count_items ;
582
- int count_item_index ;
582
+ int j ;
583
583
int delta ;
584
- int frac ;
584
+ int64 frac ;
585
585
float4 * hist ;
586
586
587
587
/* num_hist must be at least 2 for the loop below to work */
@@ -594,45 +594,70 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
594
594
sorted_count_items = (DECountItem * * )
595
595
palloc (sizeof (DECountItem * ) * count_items_count );
596
596
hash_seq_init (& scan_status , count_tab );
597
- count_item_index = 0 ;
597
+ j = 0 ;
598
598
while ((count_item = (DECountItem * ) hash_seq_search (& scan_status )) != NULL )
599
599
{
600
- sorted_count_items [count_item_index ++ ] = count_item ;
600
+ sorted_count_items [j ++ ] = count_item ;
601
601
}
602
602
qsort (sorted_count_items , count_items_count ,
603
603
sizeof (DECountItem * ), countitem_compare_count );
604
604
605
605
/*
606
- * Fill stanumbers with the histogram, followed by the average
607
- * count. This array must be stored in anl_context.
606
+ * Prepare to fill stanumbers with the histogram, followed by the
607
+ * average count. This array must be stored in anl_context.
608
608
*/
609
609
hist = (float4 * )
610
610
MemoryContextAlloc (stats -> anl_context ,
611
611
sizeof (float4 ) * (num_hist + 1 ));
612
612
hist [num_hist ] = (double ) element_no / (double ) nonnull_cnt ;
613
613
614
- /*
615
- * Construct the histogram.
614
+ /*----------
615
+ * Construct the histogram of distinct-element counts (DECs).
616
+ *
617
+ * The object of this loop is to copy the min and max DECs to
618
+ * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
619
+ * in between (where "evenly-spaced" is with reference to the
620
+ * whole input population of arrays). If we had a complete sorted
621
+ * array of DECs, one per analyzed row, the i'th hist value would
622
+ * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
623
+ * (compare the histogram-making loop in compute_scalar_stats()).
624
+ * But instead of that we have the sorted_count_items[] array,
625
+ * which holds unique DEC values with their frequencies (that is,
626
+ * a run-length-compressed version of the full array). So we
627
+ * control advancing through sorted_count_items[] with the
628
+ * variable "frac", which is defined as (x - y) * (num_hist - 1),
629
+ * where x is the index in the notional DECs array corresponding
630
+ * to the start of the next sorted_count_items[] element's run,
631
+ * and y is the index in DECs from which we should take the next
632
+ * histogram value. We have to advance whenever x <= y, that is
633
+ * frac <= 0. The x component is the sum of the frequencies seen
634
+ * so far (up through the current sorted_count_items[] element),
635
+ * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
636
+ * per the subscript calculation above. (The subscript calculation
637
+ * implies dropping any fractional part of y; in this formulation
638
+ * that's handled by not advancing until frac reaches 1.)
616
639
*
617
- * XXX this needs work: frac could overflow, and it's not clear
618
- * how or why the code works. Even if it does work, it needs
619
- * documented.
640
+ * Even though frac has a bounded range, it could overflow int32
641
+ * when working with very large statistics targets, so we do that
642
+ * math in int64.
643
+ *----------
620
644
*/
621
645
delta = analyzed_rows - 1 ;
622
- count_item_index = 0 ;
623
- frac = sorted_count_items [0 ]-> frequency * (num_hist - 1 );
646
+ j = 0 ; /* current index in sorted_count_items */
647
+ /* Initialize frac for sorted_count_items[0]; y is initially 0 */
648
+ frac = (int64 ) sorted_count_items [0 ]-> frequency * (num_hist - 1 );
624
649
for (i = 0 ; i < num_hist ; i ++ )
625
650
{
626
651
while (frac <= 0 )
627
652
{
628
- count_item_index ++ ;
629
- Assert ( count_item_index < count_items_count ) ;
630
- frac += sorted_count_items [count_item_index ]-> frequency * (num_hist - 1 );
653
+ /* Advance, and update x component of frac */
654
+ j ++ ;
655
+ frac += ( int64 ) sorted_count_items [j ]-> frequency * (num_hist - 1 );
631
656
}
632
- hist [i ] = sorted_count_items [count_item_index ]-> count ;
633
- frac -= delta ;
657
+ hist [i ] = sorted_count_items [j ]-> count ;
658
+ frac -= delta ; /* update y for upcoming i increment */
634
659
}
635
- Assert (count_item_index == count_items_count - 1 );
660
+ Assert (j == count_items_count - 1 );
636
661
637
662
stats -> stakind [slot_idx ] = STATISTIC_KIND_DECHIST ;
638
663
stats -> staop [slot_idx ] = extra_data -> eq_opr ;
0 commit comments