15
15
*
16
16
*
17
17
* IDENTIFICATION
18
- * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.90 2001/05/20 20:28:19 tgl Exp $
18
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/selfuncs.c,v 1.91 2001/05/27 17:37:48 tgl Exp $
19
19
*
20
20
*-------------------------------------------------------------------------
21
21
*/
@@ -940,9 +940,7 @@ Datum
940
940
eqjoinsel (PG_FUNCTION_ARGS )
941
941
{
942
942
Query * root = (Query * ) PG_GETARG_POINTER (0 );
943
- #ifdef NOT_USED /* see neqjoinsel() before removing me! */
944
943
Oid operator = PG_GETARG_OID (1 );
945
- #endif
946
944
List * args = (List * ) PG_GETARG_POINTER (2 );
947
945
Var * var1 ;
948
946
Var * var2 ;
@@ -958,73 +956,219 @@ eqjoinsel(PG_FUNCTION_ARGS)
958
956
HeapTuple statsTuple2 = NULL ;
959
957
Form_pg_statistic stats1 = NULL ;
960
958
Form_pg_statistic stats2 = NULL ;
961
- double nd1 ,
962
- nd2 ;
963
-
964
- if (var1 == NULL )
965
- {
966
- nd1 = DEFAULT_NUM_DISTINCT ;
967
- }
968
- else
959
+ double nd1 = DEFAULT_NUM_DISTINCT ;
960
+ double nd2 = DEFAULT_NUM_DISTINCT ;
961
+ bool have_mcvs1 = false;
962
+ Datum * values1 = NULL ;
963
+ int nvalues1 = 0 ;
964
+ float4 * numbers1 = NULL ;
965
+ int nnumbers1 = 0 ;
966
+ bool have_mcvs2 = false;
967
+ Datum * values2 = NULL ;
968
+ int nvalues2 = 0 ;
969
+ float4 * numbers2 = NULL ;
970
+ int nnumbers2 = 0 ;
971
+
972
+ if (var1 != NULL )
969
973
{
970
974
/* get stats for the attribute, if available */
971
975
Oid relid1 = getrelid (var1 -> varno , root -> rtable );
972
976
973
- if (relid1 == InvalidOid )
974
- nd1 = DEFAULT_NUM_DISTINCT ;
975
- else
977
+ if (relid1 != InvalidOid )
976
978
{
977
979
statsTuple1 = SearchSysCache (STATRELATT ,
978
980
ObjectIdGetDatum (relid1 ),
979
981
Int16GetDatum (var1 -> varattno ),
980
982
0 , 0 );
981
983
if (HeapTupleIsValid (statsTuple1 ))
984
+ {
982
985
stats1 = (Form_pg_statistic ) GETSTRUCT (statsTuple1 );
986
+ have_mcvs1 = get_attstatsslot (statsTuple1 ,
987
+ var1 -> vartype ,
988
+ var1 -> vartypmod ,
989
+ STATISTIC_KIND_MCV ,
990
+ InvalidOid ,
991
+ & values1 , & nvalues1 ,
992
+ & numbers1 , & nnumbers1 );
993
+ }
983
994
984
995
nd1 = get_att_numdistinct (root , var1 , stats1 );
985
996
}
986
997
}
987
998
988
- if (var2 == NULL )
989
- {
990
- nd2 = DEFAULT_NUM_DISTINCT ;
991
- }
992
- else
999
+ if (var2 != NULL )
993
1000
{
994
1001
/* get stats for the attribute, if available */
995
1002
Oid relid2 = getrelid (var2 -> varno , root -> rtable );
996
1003
997
- if (relid2 == InvalidOid )
998
- nd2 = DEFAULT_NUM_DISTINCT ;
999
- else
1004
+ if (relid2 != InvalidOid )
1000
1005
{
1001
1006
statsTuple2 = SearchSysCache (STATRELATT ,
1002
1007
ObjectIdGetDatum (relid2 ),
1003
1008
Int16GetDatum (var2 -> varattno ),
1004
1009
0 , 0 );
1005
1010
if (HeapTupleIsValid (statsTuple2 ))
1011
+ {
1006
1012
stats2 = (Form_pg_statistic ) GETSTRUCT (statsTuple2 );
1013
+ have_mcvs2 = get_attstatsslot (statsTuple2 ,
1014
+ var2 -> vartype ,
1015
+ var2 -> vartypmod ,
1016
+ STATISTIC_KIND_MCV ,
1017
+ InvalidOid ,
1018
+ & values2 , & nvalues2 ,
1019
+ & numbers2 , & nnumbers2 );
1020
+ }
1007
1021
1008
1022
nd2 = get_att_numdistinct (root , var2 , stats2 );
1009
1023
}
1010
1024
}
1011
1025
1012
- /*
1013
- * Estimate the join selectivity as 1 / sqrt(nd1*nd2)
1014
- * (can we produce any theory for this)?
1015
- *
1016
- * XXX possibility to do better: if both attributes have histograms
1017
- * then we could determine the exact join selectivity between the
1018
- * MCV sets, and only have to assume the join behavior of the non-MCV
1019
- * values. This could be a big win when the MCVs cover a large part
1020
- * of the population.
1021
- *
1022
- * XXX what about nulls?
1023
- */
1024
- selec = 1.0 / sqrt (nd1 * nd2 );
1025
- if (selec > 1.0 )
1026
- selec = 1.0 ;
1026
+ if (have_mcvs1 && have_mcvs2 )
1027
+ {
1028
+ /*
1029
+ * We have most-common-value lists for both relations. Run
1030
+ * through the lists to see which MCVs actually join to each
1031
+ * other with the given operator. This allows us to determine
1032
+ * the exact join selectivity for the portion of the relations
1033
+ * represented by the MCV lists. We still have to estimate for
1034
+ * the remaining population, but in a skewed distribution this
1035
+ * gives us a big leg up in accuracy. For motivation see the
1036
+ * analysis in Y. Ioannidis and S. Christodoulakis, "On the
1037
+ * propagation of errors in the size of join results", Technical
1038
+ * Report 1018, Computer Science Dept., University of Wisconsin,
1039
+ * Madison, March 1991 (available from ftp.cs.wisc.edu).
1040
+ */
1041
+ FmgrInfo eqproc ;
1042
+ bool * hasmatch1 ;
1043
+ bool * hasmatch2 ;
1044
+ double matchprodfreq ,
1045
+ matchfreq1 ,
1046
+ matchfreq2 ,
1047
+ unmatchfreq1 ,
1048
+ unmatchfreq2 ,
1049
+ otherfreq1 ,
1050
+ otherfreq2 ,
1051
+ totalsel1 ,
1052
+ totalsel2 ;
1053
+ int i ,
1054
+ nmatches ;
1055
+
1056
+ fmgr_info (get_opcode (operator ), & eqproc );
1057
+ hasmatch1 = (bool * ) palloc (nvalues1 * sizeof (bool ));
1058
+ memset (hasmatch1 , 0 , nvalues1 * sizeof (bool ));
1059
+ hasmatch2 = (bool * ) palloc (nvalues2 * sizeof (bool ));
1060
+ memset (hasmatch2 , 0 , nvalues2 * sizeof (bool ));
1061
+ /*
1062
+ * Note we assume that each MCV will match at most one member of
1063
+ * the other MCV list. If the operator isn't really equality,
1064
+ * there could be multiple matches --- but we don't look for them,
1065
+ * both for speed and because the math wouldn't add up...
1066
+ */
1067
+ matchprodfreq = 0.0 ;
1068
+ nmatches = 0 ;
1069
+ for (i = 0 ; i < nvalues1 ; i ++ )
1070
+ {
1071
+ int j ;
1027
1072
1073
+ for (j = 0 ; j < nvalues2 ; j ++ )
1074
+ {
1075
+ if (hasmatch2 [j ])
1076
+ continue ;
1077
+ if (DatumGetBool (FunctionCall2 (& eqproc ,
1078
+ values1 [i ],
1079
+ values2 [j ])))
1080
+ {
1081
+ hasmatch1 [i ] = hasmatch2 [j ] = true;
1082
+ matchprodfreq += numbers1 [i ] * numbers2 [j ];
1083
+ nmatches ++ ;
1084
+ break ;
1085
+ }
1086
+ }
1087
+ }
1088
+ /* Sum up frequencies of matched and unmatched MCVs */
1089
+ matchfreq1 = unmatchfreq1 = 0.0 ;
1090
+ for (i = 0 ; i < nvalues1 ; i ++ )
1091
+ {
1092
+ if (hasmatch1 [i ])
1093
+ matchfreq1 += numbers1 [i ];
1094
+ else
1095
+ unmatchfreq1 += numbers1 [i ];
1096
+ }
1097
+ matchfreq2 = unmatchfreq2 = 0.0 ;
1098
+ for (i = 0 ; i < nvalues2 ; i ++ )
1099
+ {
1100
+ if (hasmatch2 [i ])
1101
+ matchfreq2 += numbers2 [i ];
1102
+ else
1103
+ unmatchfreq2 += numbers2 [i ];
1104
+ }
1105
+ pfree (hasmatch1 );
1106
+ pfree (hasmatch2 );
1107
+ /*
1108
+ * Compute total frequency of non-null values that are not in
1109
+ * the MCV lists.
1110
+ */
1111
+ otherfreq1 = 1.0 - stats1 -> stanullfrac - matchfreq1 - unmatchfreq1 ;
1112
+ otherfreq2 = 1.0 - stats2 -> stanullfrac - matchfreq2 - unmatchfreq2 ;
1113
+ /*
1114
+ * We can estimate the total selectivity from the point of view
1115
+ * of relation 1 as: the known selectivity for matched MCVs, plus
1116
+ * unmatched MCVs that are assumed to match against random members
1117
+ * of relation 2's non-MCV population, plus non-MCV values that
1118
+ * are assumed to match against random members of relation 2's
1119
+ * unmatched MCVs plus non-MCV values.
1120
+ */
1121
+ totalsel1 = matchprodfreq ;
1122
+ if (nd2 > nvalues2 )
1123
+ totalsel1 += unmatchfreq1 * otherfreq2 / (nd2 - nvalues2 );
1124
+ if (nd2 > nmatches )
1125
+ totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2 ) /
1126
+ (nd2 - nmatches );
1127
+ /* Same estimate from the point of view of relation 2. */
1128
+ totalsel2 = matchprodfreq ;
1129
+ if (nd1 > nvalues1 )
1130
+ totalsel2 += unmatchfreq2 * otherfreq1 / (nd1 - nvalues1 );
1131
+ if (nd1 > nmatches )
1132
+ totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1 ) /
1133
+ (nd1 - nmatches );
1134
+ /*
1135
+ * For robustness, we average the two estimates. (Can a case
1136
+ * be made for taking the min or max instead?)
1137
+ */
1138
+ selec = (totalsel1 + totalsel2 ) * 0.5 ;
1139
+ }
1140
+ else
1141
+ {
1142
+ /*
1143
+ * We do not have MCV lists for both sides. Estimate the
1144
+ * join selectivity as MIN(1/nd1, 1/nd2). This is plausible
1145
+ * if we assume that the values are about equally distributed:
1146
+ * a given tuple of rel1 will join to either 0 or N2/nd2 rows
1147
+ * of rel2, so total join rows are at most N1*N2/nd2 giving
1148
+ * a join selectivity of not more than 1/nd2. By the same logic
1149
+ * it is not more than 1/nd1, so MIN(1/nd1, 1/nd2) is an upper
1150
+ * bound. Using the MIN() means we estimate from the point of
1151
+ * view of the relation with smaller nd (since the larger nd is
1152
+ * determining the MIN). It is reasonable to assume that most
1153
+ * tuples in this rel will have join partners, so the bound is
1154
+ * probably reasonably tight and should be taken as-is.
1155
+ *
1156
+ * XXX Can we be smarter if we have an MCV list for just one side?
1157
+ * It seems that if we assume equal distribution for the other
1158
+ * side, we end up with the same answer anyway.
1159
+ */
1160
+ if (nd1 > nd2 )
1161
+ selec = 1.0 / nd1 ;
1162
+ else
1163
+ selec = 1.0 / nd2 ;
1164
+ }
1165
+
1166
+ if (have_mcvs1 )
1167
+ free_attstatsslot (var1 -> vartype , values1 , nvalues1 ,
1168
+ numbers1 , nnumbers1 );
1169
+ if (have_mcvs2 )
1170
+ free_attstatsslot (var2 -> vartype , values2 , nvalues2 ,
1171
+ numbers2 , nnumbers2 );
1028
1172
if (HeapTupleIsValid (statsTuple1 ))
1029
1173
ReleaseSysCache (statsTuple1 );
1030
1174
if (HeapTupleIsValid (statsTuple2 ))
@@ -1039,14 +1183,30 @@ eqjoinsel(PG_FUNCTION_ARGS)
1039
1183
Datum
1040
1184
neqjoinsel (PG_FUNCTION_ARGS )
1041
1185
{
1186
+ Query * root = (Query * ) PG_GETARG_POINTER (0 );
1187
+ Oid operator = PG_GETARG_OID (1 );
1188
+ List * args = (List * ) PG_GETARG_POINTER (2 );
1189
+ Oid eqop ;
1042
1190
float8 result ;
1043
1191
1044
1192
/*
1045
- * XXX we skip looking up the negator operator here because we know
1046
- * eqjoinsel() won't look at it anyway. If eqjoinsel() ever does
1047
- * look, this routine will need to look more like neqsel() does.
1193
+ * We want 1 - eqjoinsel() where the equality operator is the one
1194
+ * associated with this != operator, that is, its negator.
1048
1195
*/
1049
- result = DatumGetFloat8 (eqjoinsel (fcinfo ));
1196
+ eqop = get_negator (operator );
1197
+ if (eqop )
1198
+ {
1199
+ result = DatumGetFloat8 (DirectFunctionCall3 (eqjoinsel ,
1200
+ PointerGetDatum (root ),
1201
+ ObjectIdGetDatum (eqop ),
1202
+ PointerGetDatum (args )));
1203
+
1204
+ }
1205
+ else
1206
+ {
1207
+ /* Use default selectivity (should we raise an error instead?) */
1208
+ result = DEFAULT_EQ_SEL ;
1209
+ }
1050
1210
result = 1.0 - result ;
1051
1211
PG_RETURN_FLOAT8 (result );
1052
1212
}
0 commit comments