@@ -141,11 +141,10 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
141
141
FmgrInfo * opproc , bool isgt ,
142
142
Datum constval , Oid consttype );
143
143
static double eqjoinsel_inner (Oid operator ,
144
- VariableStatData * vardata1 , VariableStatData * vardata2 ,
145
- RelOptInfo * rel1 , RelOptInfo * rel2 );
144
+ VariableStatData * vardata1 , VariableStatData * vardata2 );
146
145
static double eqjoinsel_semi (Oid operator ,
147
146
VariableStatData * vardata1 , VariableStatData * vardata2 ,
148
- RelOptInfo * rel1 , RelOptInfo * rel2 );
147
+ RelOptInfo * inner_rel );
149
148
static bool convert_to_scalar (Datum value , Oid valuetypid , double * scaledvalue ,
150
149
Datum lobound , Datum hibound , Oid boundstypid ,
151
150
double * scaledlobound , double * scaledhibound );
@@ -2010,47 +2009,35 @@ eqjoinsel(PG_FUNCTION_ARGS)
2010
2009
VariableStatData vardata1 ;
2011
2010
VariableStatData vardata2 ;
2012
2011
bool join_is_reversed ;
2013
- RelOptInfo * rel1 ;
2014
- RelOptInfo * rel2 ;
2012
+ RelOptInfo * inner_rel ;
2015
2013
2016
2014
get_join_variables (root , args , sjinfo ,
2017
2015
& vardata1 , & vardata2 , & join_is_reversed );
2018
2016
2019
- /*
2020
- * Identify the join's direct input relations. We use the min lefthand
2021
- * and min righthand as the inputs, even though the join might actually
2022
- * get done with larger input relations. The min inputs are guaranteed to
2023
- * have been formed by now, though, and always using them ensures
2024
- * consistency of estimates.
2025
- */
2026
- if (!join_is_reversed )
2027
- {
2028
- rel1 = find_join_input_rel (root , sjinfo -> min_lefthand );
2029
- rel2 = find_join_input_rel (root , sjinfo -> min_righthand );
2030
- }
2031
- else
2032
- {
2033
- rel1 = find_join_input_rel (root , sjinfo -> min_righthand );
2034
- rel2 = find_join_input_rel (root , sjinfo -> min_lefthand );
2035
- }
2036
-
2037
2017
switch (sjinfo -> jointype )
2038
2018
{
2039
2019
case JOIN_INNER :
2040
2020
case JOIN_LEFT :
2041
2021
case JOIN_FULL :
2042
- selec = eqjoinsel_inner (operator , & vardata1 , & vardata2 ,
2043
- rel1 , rel2 );
2022
+ selec = eqjoinsel_inner (operator , & vardata1 , & vardata2 );
2044
2023
break ;
2045
2024
case JOIN_SEMI :
2046
2025
case JOIN_ANTI :
2026
+ /*
2027
+ * Look up the join's inner relation. min_righthand is sufficient
2028
+ * information because neither SEMI nor ANTI joins permit any
2029
+ * reassociation into or out of their RHS, so the righthand will
2030
+ * always be exactly that set of rels.
2031
+ */
2032
+ inner_rel = find_join_input_rel (root , sjinfo -> min_righthand );
2033
+
2047
2034
if (!join_is_reversed )
2048
2035
selec = eqjoinsel_semi (operator , & vardata1 , & vardata2 ,
2049
- rel1 , rel2 );
2036
+ inner_rel );
2050
2037
else
2051
2038
selec = eqjoinsel_semi (get_commutator (operator ),
2052
2039
& vardata2 , & vardata1 ,
2053
- rel2 , rel1 );
2040
+ inner_rel );
2054
2041
break ;
2055
2042
default :
2056
2043
/* other values not expected here */
@@ -2076,8 +2063,7 @@ eqjoinsel(PG_FUNCTION_ARGS)
2076
2063
*/
2077
2064
static double
2078
2065
eqjoinsel_inner (Oid operator ,
2079
- VariableStatData * vardata1 , VariableStatData * vardata2 ,
2080
- RelOptInfo * rel1 , RelOptInfo * rel2 )
2066
+ VariableStatData * vardata1 , VariableStatData * vardata2 )
2081
2067
{
2082
2068
double selec ;
2083
2069
double nd1 ;
@@ -2272,26 +2258,10 @@ eqjoinsel_inner(Oid operator,
2272
2258
* XXX Can we be smarter if we have an MCV list for just one side? It
2273
2259
* seems that if we assume equal distribution for the other side, we
2274
2260
* end up with the same answer anyway.
2275
- *
2276
- * An additional hack we use here is to clamp the nd1 and nd2 values
2277
- * to not more than what we are estimating the input relation sizes to
2278
- * be, providing a crude correction for the selectivity of restriction
2279
- * clauses on those relations. (We don't do that in the other path
2280
- * since there we are comparing the nd values to stats for the whole
2281
- * relations.) We can apply this clamp both with respect to the base
2282
- * relations from which the join variables come, and to the immediate
2283
- * input relations of the current join.
2284
2261
*/
2285
2262
double nullfrac1 = stats1 ? stats1 -> stanullfrac : 0.0 ;
2286
2263
double nullfrac2 = stats2 ? stats2 -> stanullfrac : 0.0 ;
2287
2264
2288
- if (vardata1 -> rel )
2289
- nd1 = Min (nd1 , vardata1 -> rel -> rows );
2290
- nd1 = Min (nd1 , rel1 -> rows );
2291
- if (vardata2 -> rel )
2292
- nd2 = Min (nd2 , vardata2 -> rel -> rows );
2293
- nd2 = Min (nd2 , rel2 -> rows );
2294
-
2295
2265
selec = (1.0 - nullfrac1 ) * (1.0 - nullfrac2 );
2296
2266
if (nd1 > nd2 )
2297
2267
selec /= nd1 ;
@@ -2318,7 +2288,7 @@ eqjoinsel_inner(Oid operator,
2318
2288
static double
2319
2289
eqjoinsel_semi (Oid operator ,
2320
2290
VariableStatData * vardata1 , VariableStatData * vardata2 ,
2321
- RelOptInfo * rel1 , RelOptInfo * rel2 )
2291
+ RelOptInfo * inner_rel )
2322
2292
{
2323
2293
double selec ;
2324
2294
double nd1 ;
@@ -2338,6 +2308,25 @@ eqjoinsel_semi(Oid operator,
2338
2308
nd1 = get_variable_numdistinct (vardata1 );
2339
2309
nd2 = get_variable_numdistinct (vardata2 );
2340
2310
2311
+ /*
2312
+ * We clamp nd2 to be not more than what we estimate the inner relation's
2313
+ * size to be. This is intuitively somewhat reasonable since obviously
2314
+ * there can't be more than that many distinct values coming from the
2315
+ * inner rel. The reason for the asymmetry (ie, that we don't clamp nd1
2316
+ * likewise) is that this is the only pathway by which restriction clauses
2317
+ * applied to the inner rel will affect the join result size estimate,
2318
+ * since set_joinrel_size_estimates will multiply SEMI/ANTI selectivity by
2319
+ * only the outer rel's size. If we clamped nd1 we'd be double-counting
2320
+ * the selectivity of outer-rel restrictions.
2321
+ *
2322
+ * We can apply this clamping both with respect to the base relation from
2323
+ * which the join variable comes (if there is just one), and to the
2324
+ * immediate inner input relation of the current join.
2325
+ */
2326
+ if (vardata2 -> rel )
2327
+ nd2 = Min (nd2 , vardata2 -> rel -> rows );
2328
+ nd2 = Min (nd2 , inner_rel -> rows );
2329
+
2341
2330
if (HeapTupleIsValid (vardata1 -> statsTuple ))
2342
2331
{
2343
2332
stats1 = (Form_pg_statistic ) GETSTRUCT (vardata1 -> statsTuple );
@@ -2381,11 +2370,21 @@ eqjoinsel_semi(Oid operator,
2381
2370
uncertainfrac ,
2382
2371
uncertain ;
2383
2372
int i ,
2384
- nmatches ;
2373
+ nmatches ,
2374
+ clamped_nvalues2 ;
2375
+
2376
+ /*
2377
+ * The clamping above could have resulted in nd2 being less than
2378
+ * nvalues2; in which case, we assume that precisely the nd2 most
2379
+ * common values in the relation will appear in the join input, and so
2380
+ * compare to only the first nd2 members of the MCV list. Of course
2381
+ * this is frequently wrong, but it's the best bet we can make.
2382
+ */
2383
+ clamped_nvalues2 = Min (nvalues2 , nd2 );
2385
2384
2386
2385
fmgr_info (get_opcode (operator ), & eqproc );
2387
2386
hasmatch1 = (bool * ) palloc0 (nvalues1 * sizeof (bool ));
2388
- hasmatch2 = (bool * ) palloc0 (nvalues2 * sizeof (bool ));
2387
+ hasmatch2 = (bool * ) palloc0 (clamped_nvalues2 * sizeof (bool ));
2389
2388
2390
2389
/*
2391
2390
* Note we assume that each MCV will match at most one member of the
@@ -2398,7 +2397,7 @@ eqjoinsel_semi(Oid operator,
2398
2397
{
2399
2398
int j ;
2400
2399
2401
- for (j = 0 ; j < nvalues2 ; j ++ )
2400
+ for (j = 0 ; j < clamped_nvalues2 ; j ++ )
2402
2401
{
2403
2402
if (hasmatch2 [j ])
2404
2403
continue ;
@@ -2443,7 +2442,7 @@ eqjoinsel_semi(Oid operator,
2443
2442
{
2444
2443
nd1 -= nmatches ;
2445
2444
nd2 -= nmatches ;
2446
- if (nd1 <= nd2 || nd2 <= 0 )
2445
+ if (nd1 <= nd2 || nd2 < 0 )
2447
2446
uncertainfrac = 1.0 ;
2448
2447
else
2449
2448
uncertainfrac = nd2 / nd1 ;
@@ -2464,14 +2463,7 @@ eqjoinsel_semi(Oid operator,
2464
2463
2465
2464
if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT )
2466
2465
{
2467
- if (vardata1 -> rel )
2468
- nd1 = Min (nd1 , vardata1 -> rel -> rows );
2469
- nd1 = Min (nd1 , rel1 -> rows );
2470
- if (vardata2 -> rel )
2471
- nd2 = Min (nd2 , vardata2 -> rel -> rows );
2472
- nd2 = Min (nd2 , rel2 -> rows );
2473
-
2474
- if (nd1 <= nd2 || nd2 <= 0 )
2466
+ if (nd1 <= nd2 || nd2 < 0 )
2475
2467
selec = 1.0 - nullfrac1 ;
2476
2468
else
2477
2469
selec = (nd2 / nd1 ) * (1.0 - nullfrac1 );
0 commit comments