Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit f759ef1

Browse files
committed
Further repair of eqjoinsel ndistinct-clamping logic.
Examination of examples provided by Mark Kirkwood and others has convinced me that actually commit 7f3eba3 was quite a few bricks shy of a load. The useful part of that patch was clamping ndistinct for the inner side of a semi or anti join, and the reason why that's needed is that it's the only way that restriction clauses eliminating rows from the inner relation can affect the estimated size of the join result. I had not clearly understood why the clamping was appropriate, and so mis-extrapolated to conclude that we should clamp ndistinct for the outer side too, as well as for both sides of regular joins. These latter actions were all wrong, and are reverted with this patch. In addition, the clamping logic is now made to affect the behavior of both paths in eqjoinsel_semi, with or without MCV lists to compare. When we have MCVs, we suppose that the most common values are the ones that are most likely to survive the decimation resulting from a lower restriction clause, so we think of the clamping as eliminating non-MCV values, or potentially even the least-common MCVs for the inner relation. Back-patch to 8.4, same as previous fixes in this area.
1 parent dbb8d47 commit f759ef1

File tree

1 file changed

+50
-58
lines changed

1 file changed

+50
-58
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 50 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -141,11 +141,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
141141
FmgrInfo *opproc, bool isgt,
142142
Datum constval, Oid consttype);
143143
static double eqjoinsel_inner(Oid operator,
144-
VariableStatData *vardata1, VariableStatData *vardata2,
145-
RelOptInfo *rel1, RelOptInfo *rel2);
144+
VariableStatData *vardata1, VariableStatData *vardata2);
146145
static double eqjoinsel_semi(Oid operator,
147146
VariableStatData *vardata1, VariableStatData *vardata2,
148-
RelOptInfo *rel1, RelOptInfo *rel2);
147+
RelOptInfo *inner_rel);
149148
static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
150149
Datum lobound, Datum hibound, Oid boundstypid,
151150
double *scaledlobound, double *scaledhibound);
@@ -2010,47 +2009,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
20102009
VariableStatData vardata1;
20112010
VariableStatData vardata2;
20122011
bool join_is_reversed;
2013-
RelOptInfo *rel1;
2014-
RelOptInfo *rel2;
2012+
RelOptInfo *inner_rel;
20152013

20162014
get_join_variables(root, args, sjinfo,
20172015
&vardata1, &vardata2, &join_is_reversed);
20182016

2019-
/*
2020-
* Identify the join's direct input relations. We use the min lefthand
2021-
* and min righthand as the inputs, even though the join might actually
2022-
* get done with larger input relations. The min inputs are guaranteed to
2023-
* have been formed by now, though, and always using them ensures
2024-
* consistency of estimates.
2025-
*/
2026-
if (!join_is_reversed)
2027-
{
2028-
rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
2029-
rel2 = find_join_input_rel(root, sjinfo->min_righthand);
2030-
}
2031-
else
2032-
{
2033-
rel1 = find_join_input_rel(root, sjinfo->min_righthand);
2034-
rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
2035-
}
2036-
20372017
switch (sjinfo->jointype)
20382018
{
20392019
case JOIN_INNER:
20402020
case JOIN_LEFT:
20412021
case JOIN_FULL:
2042-
selec = eqjoinsel_inner(operator, &vardata1, &vardata2,
2043-
rel1, rel2);
2022+
selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
20442023
break;
20452024
case JOIN_SEMI:
20462025
case JOIN_ANTI:
2026+
/*
2027+
* Look up the join's inner relation. min_righthand is sufficient
2028+
* information because neither SEMI nor ANTI joins permit any
2029+
* reassociation into or out of their RHS, so the righthand will
2030+
* always be exactly that set of rels.
2031+
*/
2032+
inner_rel = find_join_input_rel(root, sjinfo->min_righthand);
2033+
20472034
if (!join_is_reversed)
20482035
selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
2049-
rel1, rel2);
2036+
inner_rel);
20502037
else
20512038
selec = eqjoinsel_semi(get_commutator(operator),
20522039
&vardata2, &vardata1,
2053-
rel2, rel1);
2040+
inner_rel);
20542041
break;
20552042
default:
20562043
/* other values not expected here */
@@ -2076,8 +2063,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
20762063
*/
20772064
static double
20782065
eqjoinsel_inner(Oid operator,
2079-
VariableStatData *vardata1, VariableStatData *vardata2,
2080-
RelOptInfo *rel1, RelOptInfo *rel2)
2066+
VariableStatData *vardata1, VariableStatData *vardata2)
20812067
{
20822068
double selec;
20832069
double nd1;
@@ -2272,26 +2258,10 @@ eqjoinsel_inner(Oid operator,
22722258
* XXX Can we be smarter if we have an MCV list for just one side? It
22732259
* seems that if we assume equal distribution for the other side, we
22742260
* end up with the same answer anyway.
2275-
*
2276-
* An additional hack we use here is to clamp the nd1 and nd2 values
2277-
* to not more than what we are estimating the input relation sizes to
2278-
* be, providing a crude correction for the selectivity of restriction
2279-
* clauses on those relations. (We don't do that in the other path
2280-
* since there we are comparing the nd values to stats for the whole
2281-
* relations.) We can apply this clamp both with respect to the base
2282-
* relations from which the join variables come, and to the immediate
2283-
* input relations of the current join.
22842261
*/
22852262
double nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
22862263
double nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
22872264

2288-
if (vardata1->rel)
2289-
nd1 = Min(nd1, vardata1->rel->rows);
2290-
nd1 = Min(nd1, rel1->rows);
2291-
if (vardata2->rel)
2292-
nd2 = Min(nd2, vardata2->rel->rows);
2293-
nd2 = Min(nd2, rel2->rows);
2294-
22952265
selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
22962266
if (nd1 > nd2)
22972267
selec /= nd1;
@@ -2318,7 +2288,7 @@ eqjoinsel_inner(Oid operator,
23182288
static double
23192289
eqjoinsel_semi(Oid operator,
23202290
VariableStatData *vardata1, VariableStatData *vardata2,
2321-
RelOptInfo *rel1, RelOptInfo *rel2)
2291+
RelOptInfo *inner_rel)
23222292
{
23232293
double selec;
23242294
double nd1;
@@ -2338,6 +2308,25 @@ eqjoinsel_semi(Oid operator,
23382308
nd1 = get_variable_numdistinct(vardata1);
23392309
nd2 = get_variable_numdistinct(vardata2);
23402310

2311+
/*
2312+
* We clamp nd2 to be not more than what we estimate the inner relation's
2313+
* size to be. This is intuitively somewhat reasonable since obviously
2314+
* there can't be more than that many distinct values coming from the
2315+
* inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
2316+
* likewise) is that this is the only pathway by which restriction clauses
2317+
* applied to the inner rel will affect the join result size estimate,
2318+
* since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2319+
* only the outer rel's size. If we clamped nd1 we'd be double-counting
2320+
* the selectivity of outer-rel restrictions.
2321+
*
2322+
* We can apply this clamping both with respect to the base relation from
2323+
* which the join variable comes (if there is just one), and to the
2324+
* immediate inner input relation of the current join.
2325+
*/
2326+
if (vardata2->rel)
2327+
nd2 = Min(nd2, vardata2->rel->rows);
2328+
nd2 = Min(nd2, inner_rel->rows);
2329+
23412330
if (HeapTupleIsValid(vardata1->statsTuple))
23422331
{
23432332
stats1 = (Form_pg_statistic) GETSTRUCT(vardata1->statsTuple);
@@ -2381,11 +2370,21 @@ eqjoinsel_semi(Oid operator,
23812370
uncertainfrac,
23822371
uncertain;
23832372
int i,
2384-
nmatches;
2373+
nmatches,
2374+
clamped_nvalues2;
2375+
2376+
/*
2377+
* The clamping above could have resulted in nd2 being less than
2378+
* nvalues2; in which case, we assume that precisely the nd2 most
2379+
* common values in the relation will appear in the join input, and so
2380+
* compare to only the first nd2 members of the MCV list. Of course
2381+
* this is frequently wrong, but it's the best bet we can make.
2382+
*/
2383+
clamped_nvalues2 = Min(nvalues2, nd2);
23852384

23862385
fmgr_info(get_opcode(operator), &eqproc);
23872386
hasmatch1 = (bool *) palloc0(nvalues1 * sizeof(bool));
2388-
hasmatch2 = (bool *) palloc0(nvalues2 * sizeof(bool));
2387+
hasmatch2 = (bool *) palloc0(clamped_nvalues2 * sizeof(bool));
23892388

23902389
/*
23912390
* Note we assume that each MCV will match at most one member of the
@@ -2398,7 +2397,7 @@ eqjoinsel_semi(Oid operator,
23982397
{
23992398
int j;
24002399

2401-
for (j = 0; j < nvalues2; j++)
2400+
for (j = 0; j < clamped_nvalues2; j++)
24022401
{
24032402
if (hasmatch2[j])
24042403
continue;
@@ -2443,7 +2442,7 @@ eqjoinsel_semi(Oid operator,
24432442
{
24442443
nd1 -= nmatches;
24452444
nd2 -= nmatches;
2446-
if (nd1 <= nd2 || nd2 <= 0)
2445+
if (nd1 <= nd2 || nd2 < 0)
24472446
uncertainfrac = 1.0;
24482447
else
24492448
uncertainfrac = nd2 / nd1;
@@ -2464,14 +2463,7 @@ eqjoinsel_semi(Oid operator,
24642463

24652464
if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
24662465
{
2467-
if (vardata1->rel)
2468-
nd1 = Min(nd1, vardata1->rel->rows);
2469-
nd1 = Min(nd1, rel1->rows);
2470-
if (vardata2->rel)
2471-
nd2 = Min(nd2, vardata2->rel->rows);
2472-
nd2 = Min(nd2, rel2->rows);
2473-
2474-
if (nd1 <= nd2 || nd2 <= 0)
2466+
if (nd1 <= nd2 || nd2 < 0)
24752467
selec = 1.0 - nullfrac1;
24762468
else
24772469
selec = (nd2 / nd1) * (1.0 - nullfrac1);

0 commit comments

Comments
 (0)