Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 18f5233

Browse files
danolivoCommitfest Bot
authored and
Commitfest Bot
committed
Employ EquivalenceClass to adjust ndistinct estimation.
Operations like grouping, incremental sort, hash join, memoize, etc., estimate the number of groups in a source using the statistics of the relevant column or expression. An equivalence clause like 'x=y' obviously reduces the maximum number of distinct values above the node that evaluates the clause to the smaller of the distinct counts of 'x' and 'y'. Therefore, when estimating the number of groups, the planner can use this information from the EC. Identifying the proper EC for an arbitrary expression seems too expensive. However, having a pathkey gives the planner immediate access to the EC. This commit provides a routine that identifies the proper EC member; the ndistinct estimate is cached inside the EM.
1 parent 232d8ca commit 18f5233

File tree

16 files changed

+249
-75
lines changed

16 files changed

+249
-75
lines changed

contrib/postgres_fdw/postgres_fdw.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3388,7 +3388,7 @@ estimate_path_cost_size(PlannerInfo *root,
33883388
numGroups = estimate_num_groups(root,
33893389
get_sortgrouplist_exprs(root->processed_groupClause,
33903390
fpinfo->grouped_tlist),
3391-
input_rows, NULL, NULL);
3391+
input_rows, NULL, NULL, NULL);
33923392

33933393
/*
33943394
* Get the retrieved_rows and rows estimates. If there are HAVING

src/backend/optimizer/path/costsize.c

Lines changed: 72 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1999,7 +1999,7 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
19991999
void
20002000
cost_incremental_sort(Path *path,
20012001
PlannerInfo *root, List *pathkeys, int presorted_keys,
2002-
int input_disabled_nodes,
2002+
int input_disabled_nodes, Relids relids,
20032003
Cost input_startup_cost, Cost input_total_cost,
20042004
double input_tuples, int width, Cost comparison_cost, int sort_mem,
20052005
double limit_tuples)
@@ -2012,9 +2012,6 @@ cost_incremental_sort(Path *path,
20122012
Cost group_startup_cost,
20132013
group_run_cost,
20142014
group_input_run_cost;
2015-
List *presortedExprs = NIL;
2016-
ListCell *l;
2017-
bool unknown_varno = false;
20182015

20192016
Assert(presorted_keys > 0 && presorted_keys < list_length(pathkeys));
20202017

@@ -2025,58 +2022,10 @@ cost_incremental_sort(Path *path,
20252022
if (input_tuples < 2.0)
20262023
input_tuples = 2.0;
20272024

2028-
/* Default estimate of number of groups, capped to one group per row. */
2029-
input_groups = Min(input_tuples, DEFAULT_NUM_DISTINCT);
2030-
2031-
/*
2032-
* Extract presorted keys as list of expressions.
2033-
*
2034-
* We need to be careful about Vars containing "varno 0" which might have
2035-
* been introduced by generate_append_tlist, which would confuse
2036-
* estimate_num_groups (in fact it'd fail for such expressions). See
2037-
* recurse_set_operations which has to deal with the same issue.
2038-
*
2039-
* Unlike recurse_set_operations we can't access the original target list
2040-
* here, and even if we could it's not very clear how useful would that be
2041-
* for a set operation combining multiple tables. So we simply detect if
2042-
* there are any expressions with "varno 0" and use the default
2043-
* DEFAULT_NUM_DISTINCT in that case.
2044-
*
2045-
* We might also use either 1.0 (a single group) or input_tuples (each row
2046-
* being a separate group), pretty much the worst and best case for
2047-
* incremental sort. But those are extreme cases and using something in
2048-
* between seems reasonable. Furthermore, generate_append_tlist is used
2049-
* for set operations, which are likely to produce mostly unique output
2050-
* anyway - from that standpoint the DEFAULT_NUM_DISTINCT is defensive
2051-
* while maintaining lower startup cost.
2052-
*/
2053-
foreach(l, pathkeys)
2054-
{
2055-
PathKey *key = (PathKey *) lfirst(l);
2056-
EquivalenceMember *member = (EquivalenceMember *)
2057-
linitial(key->pk_eclass->ec_members);
2058-
2059-
/*
2060-
* Check if the expression contains Var with "varno 0" so that we
2061-
* don't call estimate_num_groups in that case.
2062-
*/
2063-
if (bms_is_member(0, pull_varnos(root, (Node *) member->em_expr)))
2064-
{
2065-
unknown_varno = true;
2066-
break;
2067-
}
2068-
2069-
/* expression not containing any Vars with "varno 0" */
2070-
presortedExprs = lappend(presortedExprs, member->em_expr);
2071-
2072-
if (foreach_current_index(l) + 1 >= presorted_keys)
2073-
break;
2074-
}
2075-
20762025
/* Estimate the number of groups with equal presorted keys. */
2077-
if (!unknown_varno)
2078-
input_groups = estimate_num_groups(root, presortedExprs, input_tuples,
2079-
NULL, NULL);
2026+
input_groups = estimate_num_groups(root,
2027+
list_copy_head(pathkeys, presorted_keys),
2028+
input_tuples, NULL, NULL, relids);
20802029

20812030
group_tuples = input_tuples / input_groups;
20822031
group_input_run_cost = input_run_cost / input_groups;
@@ -2579,7 +2528,7 @@ cost_memoize_rescan(PlannerInfo *root, MemoizePath *mpath,
25792528

25802529
/* estimate on the distinct number of parameter values */
25812530
ndistinct = estimate_num_groups(root, mpath->param_exprs, calls, NULL,
2582-
&estinfo);
2531+
&estinfo, NULL);
25832532

25842533
/*
25852534
* When the estimation fell back on using a default value, it's a bit too
@@ -2900,7 +2849,7 @@ get_windowclause_startup_tuples(PlannerInfo *root, WindowClause *wc,
29002849
root->parse->targetList);
29012850

29022851
num_partitions = estimate_num_groups(root, partexprs, input_tuples,
2903-
NULL, NULL);
2852+
NULL, NULL, NULL);
29042853
list_free(partexprs);
29052854

29062855
partition_tuples = input_tuples / num_partitions;
@@ -2923,7 +2872,7 @@ get_windowclause_startup_tuples(PlannerInfo *root, WindowClause *wc,
29232872
/* estimate out how many peer groups there are in the partition */
29242873
num_groups = estimate_num_groups(root, orderexprs,
29252874
partition_tuples, NULL,
2926-
NULL);
2875+
NULL, NULL);
29272876
list_free(orderexprs);
29282877
peer_tuples = partition_tuples / num_groups;
29292878
}
@@ -3703,6 +3652,7 @@ initial_cost_mergejoin(PlannerInfo *root, JoinCostWorkspace *workspace,
37033652
outersortkeys,
37043653
outer_presorted_keys,
37053654
outer_path->disabled_nodes,
3655+
outer_path->parent->relids,
37063656
outer_path->startup_cost,
37073657
outer_path->total_cost,
37083658
outer_path_rows,
@@ -6614,3 +6564,67 @@ compute_gather_rows(Path *path)
66146564

66156565
return clamp_row_est(path->rows * get_parallel_divisor(path));
66166566
}
6567+
6568+
/*
6569+
* Find a suitable member of the equivalence class.
6570+
* Scan the list of EC members and find the member with the minimum number of
6571+
* distinct values. Cache the estimated number of distinct values in the
6572+
* em_ndistinct field of each examined member.
6573+
*
6574+
* Return NULL if no proper member is found (each member contains relid 0).
6575+
*/
6576+
EquivalenceMember *
6577+
identify_proper_ecmember(PlannerInfo *root, EquivalenceClass *ec, Relids relids)
6578+
{
6579+
EquivalenceMember *candidate = NULL; /* best member seen so far; stays NULL if every member is skipped */
6580+
EquivalenceMember *em;
6581+
EquivalenceMemberIterator it;
6582+
6583+
setup_eclass_member_iterator(&it, ec, relids); /* NOTE(review): presumably limits iteration to members relevant to relids - confirm */
6584+
while ((em = eclass_member_iterator_next(&it)) != NULL)
6585+
{
6586+
VariableStatData vardata;
6587+
6588+
if (bms_is_member(0, em->em_relids)) /* members referencing relid 0 are never considered */
6589+
continue;
6590+
6591+
if (em->em_is_const || bms_is_empty(em->em_relids))
6592+
{
6593+
/* Trivial case. Set up cache values and go further */
6594+
em->em_default_nd = false;
6595+
em->em_ndistinct = 1.0; /* treated as exactly one distinct value */
6596+
}
6597+
else if (relids && !bms_is_subset(em->em_relids, relids)) /* a non-NULL relids rejects members not covered by it */
6598+
continue;
6599+
6600+
if (em->em_ndistinct < 0.) /* negative means not computed yet (make_eq_member sets -1.0) */
6601+
{
6602+
/* Let's check candidate's ndistinct value */
6603+
examine_variable(root, (Node *) em->em_expr, 0, &vardata);
6604+
if (HeapTupleIsValid(vardata.statsTuple))
6605+
em->em_ndistinct =
6606+
get_variable_numdistinct(&vardata, &em->em_default_nd);
6607+
else
6608+
{
6609+
em->em_ndistinct = 0.0; /* no statistics: 0.0 is cached as the "unknown" marker */
6610+
em->em_default_nd = true;
6611+
}
6612+
ReleaseVariableStats(vardata);
6613+
}
6614+
6615+
if (candidate == NULL) /* first acceptable member becomes the fallback result */
6616+
candidate = em;
6617+
6618+
if (em->em_default_nd)
6619+
/* Nothing helpful */
6620+
continue;
6621+
6622+
Assert(em->em_ndistinct > 0.); /* the 0.0 marker is always paired with em_default_nd, handled above */
6623+
6624+
if (candidate->em_ndistinct == 0. || /* replace a candidate whose ndistinct is unknown ... */
6625+
em->em_ndistinct < candidate->em_ndistinct) /* ... or one with more distinct values */
6626+
candidate = em;
6627+
}
6628+
6629+
return candidate;
6630+
}

src/backend/optimizer/path/equivclass.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,7 @@ make_eq_member(EquivalenceClass *ec, Expr *expr, Relids relids,
601601
em->em_datatype = datatype;
602602
em->em_jdomain = jdomain;
603603
em->em_parent = parent;
604+
em->em_ndistinct = -1.0;
604605

605606
if (bms_is_empty(relids))
606607
{

src/backend/optimizer/path/indxpath.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2400,6 +2400,7 @@ adjust_rowcount_for_semijoins(PlannerInfo *root,
24002400
sjinfo->semi_rhs_exprs,
24012401
nraw,
24022402
NULL,
2403+
NULL,
24032404
NULL);
24042405
if (rowcount > nunique)
24052406
rowcount = nunique;

src/backend/optimizer/plan/createplan.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5523,6 +5523,7 @@ label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan,
55235523
cost_incremental_sort(&sort_path, root, pathkeys,
55245524
plan->nPresortedCols,
55255525
plan->sort.plan.disabled_nodes,
5526+
NULL,
55265527
lefttree->startup_cost,
55275528
lefttree->total_cost,
55285529
lefttree->plan_rows,

src/backend/optimizer/plan/planner.c

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ static List *extract_rollup_sets(List *groupingSets);
144144
static List *reorder_grouping_sets(List *groupingSets, List *sortclause);
145145
static void standard_qp_callback(PlannerInfo *root, void *extra);
146146
static double get_number_of_groups(PlannerInfo *root,
147+
Relids relids,
147148
double path_rows,
148149
grouping_sets_data *gd,
149150
List *target_list);
@@ -3602,7 +3603,7 @@ standard_qp_callback(PlannerInfo *root, void *extra)
36023603
* determining whether some combination of them could be hashed instead.
36033604
*/
36043605
static double
3605-
get_number_of_groups(PlannerInfo *root,
3606+
get_number_of_groups(PlannerInfo *root, Relids relids,
36063607
double path_rows,
36073608
grouping_sets_data *gd,
36083609
List *target_list)
@@ -3642,7 +3643,8 @@ get_number_of_groups(PlannerInfo *root,
36423643
groupExprs,
36433644
path_rows,
36443645
&gset,
3645-
NULL);
3646+
NULL,
3647+
relids);
36463648

36473649
gs->numGroups = numGroups;
36483650
rollup->numGroups += numGroups;
@@ -3668,7 +3670,8 @@ get_number_of_groups(PlannerInfo *root,
36683670
groupExprs,
36693671
path_rows,
36703672
&gset,
3671-
NULL);
3673+
NULL,
3674+
relids);
36723675

36733676
gs->numGroups = numGroups;
36743677
gd->dNumHashGroups += numGroups;
@@ -3679,12 +3682,12 @@ get_number_of_groups(PlannerInfo *root,
36793682
}
36803683
else
36813684
{
3682-
/* Plain GROUP BY -- estimate based on optimized groupClause */
3683-
groupExprs = get_sortgrouplist_exprs(root->processed_groupClause,
3684-
target_list);
3685+
List *pathkeys = list_copy_head(root->group_pathkeys,
3686+
root->num_groupby_pathkeys);
36853687

3686-
dNumGroups = estimate_num_groups(root, groupExprs, path_rows,
3687-
NULL, NULL);
3688+
/* Plain GROUP BY -- estimate based on grouping pathkeys */
3689+
dNumGroups = estimate_num_groups(root, pathkeys, path_rows,
3690+
NULL, NULL, relids);
36883691
}
36893692
}
36903693
else if (parse->groupingSets)
@@ -4074,7 +4077,7 @@ create_ordinary_grouping_paths(PlannerInfo *root, RelOptInfo *input_rel,
40744077
/*
40754078
* Estimate number of groups.
40764079
*/
4077-
dNumGroups = get_number_of_groups(root,
4080+
dNumGroups = get_number_of_groups(root, cheapest_path->parent->relids,
40784081
cheapest_path->rows,
40794082
gd,
40804083
extra->targetList);
@@ -4846,7 +4849,7 @@ create_partial_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
48464849
/* estimate how many distinct rows we'll get from each worker */
48474850
numDistinctRows = estimate_num_groups(root, distinctExprs,
48484851
cheapest_partial_path->rows,
4849-
NULL, NULL);
4852+
NULL, NULL, NULL);
48504853

48514854
/*
48524855
* Try sorting the cheapest path and incrementally sorting any paths with
@@ -5017,7 +5020,7 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
50175020
parse->targetList);
50185021
numDistinctRows = estimate_num_groups(root, distinctExprs,
50195022
cheapest_input_path->rows,
5020-
NULL, NULL);
5023+
NULL, NULL, NULL);
50215024
}
50225025

50235026
/*
@@ -7358,13 +7361,13 @@ create_partial_grouping_paths(PlannerInfo *root,
73587361
/* Estimate number of partial groups. */
73597362
if (cheapest_total_path != NULL)
73607363
dNumPartialGroups =
7361-
get_number_of_groups(root,
7364+
get_number_of_groups(root, input_rel->relids,
73627365
cheapest_total_path->rows,
73637366
gd,
73647367
extra->targetList);
73657368
if (cheapest_partial_path != NULL)
73667369
dNumPartialPartialGroups =
7367-
get_number_of_groups(root,
7370+
get_number_of_groups(root, input_rel->relids,
73687371
cheapest_partial_path->rows,
73697372
gd,
73707373
extra->targetList);

src/backend/optimizer/prep/prepunion.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,7 @@ build_setop_child_paths(PlannerInfo *root, RelOptInfo *rel,
665665
get_tlist_exprs(subroot->parse->targetList, false),
666666
rel->cheapest_total_path->rows,
667667
NULL,
668+
NULL,
668669
NULL);
669670
}
670671
}

src/backend/optimizer/util/pathnode.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1858,6 +1858,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
18581858
sjinfo->semi_rhs_exprs,
18591859
rel->rows,
18601860
NULL,
1861+
NULL,
18611862
NULL);
18621863
numCols = list_length(sjinfo->semi_rhs_exprs);
18631864

@@ -3059,6 +3060,7 @@ create_incremental_sort_path(PlannerInfo *root,
30593060
cost_incremental_sort(&pathnode->path,
30603061
root, pathkeys, presorted_keys,
30613062
subpath->disabled_nodes,
3063+
subpath->parent->relids,
30623064
subpath->startup_cost,
30633065
subpath->total_cost,
30643066
subpath->rows,

0 commit comments

Comments
 (0)