Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 73d1040

Browse files
committed
Fix eqjoinsel() to make use of new statistics.
1 parent a001f13 commit 73d1040

File tree

1 file changed

+201
-41
lines changed

1 file changed

+201
-41
lines changed

src/backend/utils/adt/selfuncs.c

Lines changed: 201 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
18+
* $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -940,9 +940,7 @@ Datum
940940
eqjoinsel(PG_FUNCTION_ARGS)
941941
{
942942
Query *root = (Query *) PG_GETARG_POINTER(0);
943-
#ifdef NOT_USED /* see neqjoinsel() before removing me! */
944943
Oid operator = PG_GETARG_OID(1);
945-
#endif
946944
List *args = (List *) PG_GETARG_POINTER(2);
947945
Var *var1;
948946
Var *var2;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
958956
HeapTuple statsTuple2 = NULL;
959957
Form_pg_statistic stats1 = NULL;
960958
Form_pg_statistic stats2 = NULL;
961-
double nd1,
962-
nd2;
963-
964-
if (var1 == NULL)
965-
{
966-
nd1 = DEFAULT_NUM_DISTINCT;
967-
}
968-
else
959+
double nd1 = DEFAULT_NUM_DISTINCT;
960+
double nd2 = DEFAULT_NUM_DISTINCT;
961+
bool have_mcvs1 = false;
962+
Datum *values1 = NULL;
963+
int nvalues1 = 0;
964+
float4 *numbers1 = NULL;
965+
int nnumbers1 = 0;
966+
bool have_mcvs2 = false;
967+
Datum *values2 = NULL;
968+
int nvalues2 = 0;
969+
float4 *numbers2 = NULL;
970+
int nnumbers2 = 0;
971+
972+
if (var1 != NULL)
969973
{
970974
/* get stats for the attribute, if available */
971975
Oid relid1 = getrelid(var1->varno, root->rtable);
972976

973-
if (relid1 == InvalidOid)
974-
nd1 = DEFAULT_NUM_DISTINCT;
975-
else
977+
if (relid1 != InvalidOid)
976978
{
977979
statsTuple1 = SearchSysCache(STATRELATT,
978980
ObjectIdGetDatum(relid1),
979981
Int16GetDatum(var1->varattno),
980982
0, 0);
981983
if (HeapTupleIsValid(statsTuple1))
984+
{
982985
stats1 = (Form_pg_statistic) GETSTRUCT(statsTuple1);
986+
have_mcvs1 = get_attstatsslot(statsTuple1,
987+
var1->vartype,
988+
var1->vartypmod,
989+
STATISTIC_KIND_MCV,
990+
InvalidOid,
991+
&values1, &nvalues1,
992+
&numbers1, &nnumbers1);
993+
}
983994

984995
nd1 = get_att_numdistinct(root, var1, stats1);
985996
}
986997
}
987998

988-
if (var2 == NULL)
989-
{
990-
nd2 = DEFAULT_NUM_DISTINCT;
991-
}
992-
else
999+
if (var2 != NULL)
9931000
{
9941001
/* get stats for the attribute, if available */
9951002
Oid relid2 = getrelid(var2->varno, root->rtable);
9961003

997-
if (relid2 == InvalidOid)
998-
nd2 = DEFAULT_NUM_DISTINCT;
999-
else
1004+
if (relid2 != InvalidOid)
10001005
{
10011006
statsTuple2 = SearchSysCache(STATRELATT,
10021007
ObjectIdGetDatum(relid2),
10031008
Int16GetDatum(var2->varattno),
10041009
0, 0);
10051010
if (HeapTupleIsValid(statsTuple2))
1011+
{
10061012
stats2 = (Form_pg_statistic) GETSTRUCT(statsTuple2);
1013+
have_mcvs2 = get_attstatsslot(statsTuple2,
1014+
var2->vartype,
1015+
var2->vartypmod,
1016+
STATISTIC_KIND_MCV,
1017+
InvalidOid,
1018+
&values2, &nvalues2,
1019+
&numbers2, &nnumbers2);
1020+
}
10071021

10081022
nd2 = get_att_numdistinct(root, var2, stats2);
10091023
}
10101024
}
10111025

1012-
/*
1013-
* Estimate the join selectivity as 1 / sqrt(nd1*nd2)
1014-
* (can we produce any theory for this)?
1015-
*
1016-
* XXX possibility to do better: if both attributes have histograms
1017-
* then we could determine the exact join selectivity between the
1018-
* MCV sets, and only have to assume the join behavior of the non-MCV
1019-
* values. This could be a big win when the MCVs cover a large part
1020-
* of the population.
1021-
*
1022-
* XXX what about nulls?
1023-
*/
1024-
selec = 1.0 / sqrt(nd1 * nd2);
1025-
if (selec > 1.0)
1026-
selec = 1.0;
1026+
if (have_mcvs1 && have_mcvs2)
1027+
{
1028+
/*
1029+
* We have most-common-value lists for both relations. Run
1030+
* through the lists to see which MCVs actually join to each
1031+
* other with the given operator. This allows us to determine
1032+
* the exact join selectivity for the portion of the relations
1033+
* represented by the MCV lists. We still have to estimate for
1034+
* the remaining population, but in a skewed distribution this
1035+
* gives us a big leg up in accuracy. For motivation see the
1036+
* analysis in Y. Ioannidis and S. Christodoulakis, "On the
1037+
* propagation of errors in the size of join results", Technical
1038+
* Report 1018, Computer Science Dept., University of Wisconsin,
1039+
* Madison, March 1991 (available from ftp.cs.wisc.edu).
1040+
*/
1041+
FmgrInfo eqproc;
1042+
bool *hasmatch1;
1043+
bool *hasmatch2;
1044+
double matchprodfreq,
1045+
matchfreq1,
1046+
matchfreq2,
1047+
unmatchfreq1,
1048+
unmatchfreq2,
1049+
otherfreq1,
1050+
otherfreq2,
1051+
totalsel1,
1052+
totalsel2;
1053+
int i,
1054+
nmatches;
1055+
1056+
fmgr_info(get_opcode(operator), &eqproc);
1057+
hasmatch1 = (bool *) palloc(nvalues1 * sizeof(bool));
1058+
memset(hasmatch1, 0, nvalues1 * sizeof(bool));
1059+
hasmatch2 = (bool *) palloc(nvalues2 * sizeof(bool));
1060+
memset(hasmatch2, 0, nvalues2 * sizeof(bool));
1061+
/*
1062+
* Note we assume that each MCV will match at most one member of
1063+
* the other MCV list. If the operator isn't really equality,
1064+
* there could be multiple matches --- but we don't look for them,
1065+
* both for speed and because the math wouldn't add up...
1066+
*/
1067+
matchprodfreq = 0.0;
1068+
nmatches = 0;
1069+
for (i = 0; i < nvalues1; i++)
1070+
{
1071+
int j;
10271072

1073+
for (j = 0; j < nvalues2; j++)
1074+
{
1075+
if (hasmatch2[j])
1076+
continue;
1077+
if (DatumGetBool(FunctionCall2(&eqproc,
1078+
values1[i],
1079+
values2[j])))
1080+
{
1081+
hasmatch1[i] = hasmatch2[j] = true;
1082+
matchprodfreq += numbers1[i] * numbers2[j];
1083+
nmatches++;
1084+
break;
1085+
}
1086+
}
1087+
}
1088+
/* Sum up frequencies of matched and unmatched MCVs */
1089+
matchfreq1 = unmatchfreq1 = 0.0;
1090+
for (i = 0; i < nvalues1; i++)
1091+
{
1092+
if (hasmatch1[i])
1093+
matchfreq1 += numbers1[i];
1094+
else
1095+
unmatchfreq1 += numbers1[i];
1096+
}
1097+
matchfreq2 = unmatchfreq2 = 0.0;
1098+
for (i = 0; i < nvalues2; i++)
1099+
{
1100+
if (hasmatch2[i])
1101+
matchfreq2 += numbers2[i];
1102+
else
1103+
unmatchfreq2 += numbers2[i];
1104+
}
1105+
pfree(hasmatch1);
1106+
pfree(hasmatch2);
1107+
/*
1108+
* Compute total frequency of non-null values that are not in
1109+
* the MCV lists.
1110+
*/
1111+
otherfreq1 = 1.0 - stats1->stanullfrac - matchfreq1 - unmatchfreq1;
1112+
otherfreq2 = 1.0 - stats2->stanullfrac - matchfreq2 - unmatchfreq2;
1113+
/*
1114+
* We can estimate the total selectivity from the point of view
1115+
* of relation 1 as: the known selectivity for matched MCVs, plus
1116+
* unmatched MCVs that are assumed to match against random members
1117+
* of relation 2's non-MCV population, plus non-MCV values that
1118+
* are assumed to match against random members of relation 2's
1119+
* unmatched MCVs plus non-MCV values.
1120+
*/
1121+
totalsel1 = matchprodfreq;
1122+
if (nd2 > nvalues2)
1123+
totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2);
1124+
if (nd2 > nmatches)
1125+
totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) /
1126+
(nd2 - nmatches);
1127+
/* Same estimate from the point of view of relation 2. */
1128+
totalsel2 = matchprodfreq;
1129+
if (nd1 > nvalues1)
1130+
totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1);
1131+
if (nd1 > nmatches)
1132+
totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) /
1133+
(nd1 - nmatches);
1134+
/*
1135+
* For robustness, we average the two estimates. (Can a case
1136+
* be made for taking the min or max instead?)
1137+
*/
1138+
selec = (totalsel1 + totalsel2) * 0.5;
1139+
}
1140+
else
1141+
{
1142+
/*
1143+
* We do not have MCV lists for both sides. Estimate the
1144+
* join selectivity as MIN(1/nd1, 1/nd2). This is plausible
1145+
* if we assume that the values are about equally distributed:
1146+
* a given tuple of rel1 will join to either 0 or N2/nd2 rows
1147+
* of rel2, so total join rows are at most N1*N2/nd2 giving
1148+
* a join selectivity of not more than 1/nd2. By the same logic
1149+
* it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
1150+
* bound. Using the MIN() means we estimate from the point of
1151+
* view of the relation with smaller nd (since the larger nd is
1152+
* determining the MIN). It is reasonable to assume that most
1153+
* tuples in this rel will have join partners, so the bound is
1154+
* probably reasonably tight and should be taken as-is.
1155+
*
1156+
* XXX Can we be smarter if we have an MCV list for just one side?
1157+
* It seems that if we assume equal distribution for the other
1158+
* side, we end up with the same answer anyway.
1159+
*/
1160+
if (nd1 > nd2)
1161+
selec = 1.0 / nd1;
1162+
else
1163+
selec = 1.0 / nd2;
1164+
}
1165+
1166+
if (have_mcvs1)
1167+
free_attstatsslot(var1->vartype, values1, nvalues1,
1168+
numbers1, nnumbers1);
1169+
if (have_mcvs2)
1170+
free_attstatsslot(var2->vartype, values2, nvalues2,
1171+
numbers2, nnumbers2);
10281172
if (HeapTupleIsValid(statsTuple1))
10291173
ReleaseSysCache(statsTuple1);
10301174
if (HeapTupleIsValid(statsTuple2))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
10391183
Datum
10401184
neqjoinsel(PG_FUNCTION_ARGS)
10411185
{
1186+
Query *root = (Query *) PG_GETARG_POINTER(0);
1187+
Oid operator = PG_GETARG_OID(1);
1188+
List *args = (List *) PG_GETARG_POINTER(2);
1189+
Oid eqop;
10421190
float8 result;
10431191

10441192
/*
1045-
* XXX we skip looking up the negator operator here because we know
1046-
* eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
1047-
* look, this routine will need to look more like neqsel() does.
1193+
* We want 1 - eqjoinsel() where the equality operator is the one
1194+
* associated with this != operator, that is, its negator.
10481195
*/
1049-
result = DatumGetFloat8(eqjoinsel(fcinfo));
1196+
eqop = get_negator(operator);
1197+
if (eqop)
1198+
{
1199+
result = DatumGetFloat8(DirectFunctionCall3(eqjoinsel,
1200+
PointerGetDatum(root),
1201+
ObjectIdGetDatum(eqop),
1202+
PointerGetDatum(args)));
1203+
1204+
}
1205+
else
1206+
{
1207+
/* Use default selectivity (should we raise an error instead?) */
1208+
result = DEFAULT_EQ_SEL;
1209+
}
10501210
result = 1.0 - result;
10511211
PG_RETURN_FLOAT8(result);
10521212
}

0 commit comments

Comments
 (0)