Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 0182951

Browse files
committed
Fix overenthusiastic optimization of 'x IN (SELECT DISTINCT ...)' and related
cases: we can't just consider whether the subquery's output is unique on its own terms, we have to check whether the set of output columns we are going to use will be unique. Per complaint from Luca Pireddu and test case from Michael Fuhr.
1 parent aa11106 commit 0182951

File tree

3 files changed

+210
-45
lines changed

3 files changed

+210
-45
lines changed

src/backend/optimizer/util/pathnode.c

Lines changed: 120 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.122 2005/06/05 22:32:56 tgl Exp $
11+
* $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.123 2005/07/15 17:09:25 tgl Exp $
1212
*
1313
*-------------------------------------------------------------------------
1414
*/
@@ -34,7 +34,8 @@
3434
#include "utils/syscache.h"
3535

3636

37-
static bool is_distinct_query(Query *query);
37+
static List *translate_sub_tlist(List *tlist, int relid);
38+
static bool query_is_distinct_for(Query *query, List *colnos);
3839
static bool hash_safe_tlist(List *tlist);
3940

4041

@@ -800,14 +801,41 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
800801
pathnode->subpath = subpath;
801802

802803
/*
803-
* If the input is a subquery whose output must be unique already, we
804-
* don't need to do anything.
804+
* Try to identify the targetlist that will actually be unique-ified.
805+
* In current usage, this routine is only used for sub-selects of IN
806+
* clauses, so we should be able to find the tlist in in_info_list.
807+
*/
808+
sub_targetlist = NIL;
809+
foreach(l, root->in_info_list)
810+
{
811+
InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
812+
813+
if (bms_equal(ininfo->righthand, rel->relids))
814+
{
815+
sub_targetlist = ininfo->sub_targetlist;
816+
break;
817+
}
818+
}
819+
820+
/*
821+
* If the input is a subquery whose output must be unique already,
822+
* then we don't need to do anything. The test for uniqueness has
823+
* to consider exactly which columns we are extracting; for example
824+
* "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
825+
* So we cannot check for this optimization unless we found our own
826+
* targetlist above, and it consists only of simple Vars referencing
827+
* subquery outputs. (Possibly we could do something with expressions
828+
* in the subquery outputs, too, but for now keep it simple.)
805829
*/
806-
if (rel->rtekind == RTE_SUBQUERY)
830+
if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
807831
{
808832
RangeTblEntry *rte = rt_fetch(rel->relid, root->parse->rtable);
833+
List *sub_tlist_colnos;
834+
835+
sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
809836

810-
if (is_distinct_query(rte->subquery))
837+
if (sub_tlist_colnos &&
838+
query_is_distinct_for(rte->subquery, sub_tlist_colnos))
811839
{
812840
pathnode->umethod = UNIQUE_PATH_NOOP;
813841
pathnode->rows = rel->rows;
@@ -821,23 +849,6 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
821849
}
822850
}
823851

824-
/*
825-
* Try to identify the targetlist that will actually be unique-ified.
826-
* In current usage, this routine is only used for sub-selects of IN
827-
* clauses, so we should be able to find the tlist in in_info_list.
828-
*/
829-
sub_targetlist = NIL;
830-
foreach(l, root->in_info_list)
831-
{
832-
InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
833-
834-
if (bms_equal(ininfo->righthand, rel->relids))
835-
{
836-
sub_targetlist = ininfo->sub_targetlist;
837-
break;
838-
}
839-
}
840-
841852
/*
842853
* If we know the targetlist, try to estimate number of result rows;
843854
* otherwise punt.
@@ -913,46 +924,83 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
913924
}
914925

915926
/*
916-
* is_distinct_query - does query never return duplicate rows?
927+
* translate_sub_tlist - get subquery column numbers represented by tlist
928+
*
929+
* The given targetlist should contain only Vars referencing the given relid.
930+
* Extract their varattnos (ie, the column numbers of the subquery) and return
931+
* as an integer List.
932+
*
933+
* If any of the tlist items is not a simple Var, we cannot determine whether
934+
* the subquery's uniqueness condition (if any) matches ours, so punt and
935+
* return NIL.
917936
*/
918-
static bool
919-
is_distinct_query(Query *query)
937+
static List *
938+
translate_sub_tlist(List *tlist, int relid)
920939
{
921-
/* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
922-
if (has_distinct_clause(query))
923-
return true;
940+
List *result = NIL;
941+
ListCell *l;
924942

925-
/* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
926-
if (query->setOperations)
943+
foreach(l, tlist)
927944
{
928-
SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
945+
Var *var = (Var *) lfirst(l);
929946

930-
Assert(IsA(topop, SetOperationStmt));
931-
Assert(topop->op != SETOP_NONE);
947+
if (!var || !IsA(var, Var) ||
948+
var->varno != relid)
949+
return NIL; /* punt */
932950

933-
if (!topop->all)
951+
result = lappend_int(result, var->varattno);
952+
}
953+
return result;
954+
}
955+
956+
/*
957+
* query_is_distinct_for - does query never return duplicates of the
958+
* specified columns?
959+
*
960+
* colnos is an integer list of output column numbers (resno's). We are
961+
* interested in whether rows consisting of just these columns are certain
962+
* to be distinct.
963+
*/
964+
static bool
965+
query_is_distinct_for(Query *query, List *colnos)
966+
{
967+
ListCell *l;
968+
969+
/*
970+
* DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
971+
* columns in the DISTINCT clause appear in colnos.
972+
*/
973+
if (query->distinctClause)
974+
{
975+
foreach(l, query->distinctClause)
976+
{
977+
SortClause *scl = (SortClause *) lfirst(l);
978+
TargetEntry *tle = get_sortgroupclause_tle(scl,
979+
query->targetList);
980+
981+
if (!list_member_int(colnos, tle->resno))
982+
break; /* exit early if no match */
983+
}
984+
if (l == NULL) /* had matches for all? */
934985
return true;
935986
}
936987

937988
/*
938-
* GROUP BY guarantees uniqueness if all the grouped columns appear in
939-
* the output. In our implementation this means checking they are non
940-
* resjunk columns.
989+
* Similarly, GROUP BY guarantees uniqueness if all the grouped columns
990+
* appear in colnos.
941991
*/
942992
if (query->groupClause)
943993
{
944-
ListCell *gl;
945-
946-
foreach(gl, query->groupClause)
994+
foreach(l, query->groupClause)
947995
{
948-
GroupClause *grpcl = (GroupClause *) lfirst(gl);
996+
GroupClause *grpcl = (GroupClause *) lfirst(l);
949997
TargetEntry *tle = get_sortgroupclause_tle(grpcl,
950998
query->targetList);
951999

952-
if (tle->resjunk)
953-
break;
1000+
if (!list_member_int(colnos, tle->resno))
1001+
break; /* exit early if no match */
9541002
}
955-
if (!gl) /* got to the end? */
1003+
if (l == NULL) /* had matches for all? */
9561004
return true;
9571005
}
9581006
else
@@ -965,6 +1013,33 @@ is_distinct_query(Query *query)
9651013
return true;
9661014
}
9671015

1016+
/*
1017+
* UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
1018+
* except with ALL
1019+
*/
1020+
if (query->setOperations)
1021+
{
1022+
SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
1023+
1024+
Assert(IsA(topop, SetOperationStmt));
1025+
Assert(topop->op != SETOP_NONE);
1026+
1027+
if (!topop->all)
1028+
{
1029+
/* We're good if all the nonjunk output columns are in colnos */
1030+
foreach(l, query->targetList)
1031+
{
1032+
TargetEntry *tle = (TargetEntry *) lfirst(l);
1033+
1034+
if (!tle->resjunk &&
1035+
!list_member_int(colnos, tle->resno))
1036+
break; /* exit early if no match */
1037+
}
1038+
if (l == NULL) /* had matches for all? */
1039+
return true;
1040+
}
1041+
}
1042+
9681043
/*
9691044
* XXX Are there any other cases in which we can easily see the result
9701045
* must be distinct?

src/test/regress/expected/subselect.out

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,63 @@ select count(distinct ss.ten) from
202202
10
203203
(1 row)
204204

205+
--
206+
-- Test cases to check for overenthusiastic optimization of
207+
-- "IN (SELECT DISTINCT ...)" and related cases. Per example from
208+
-- Luca Pireddu and Michael Fuhr.
209+
--
210+
CREATE TEMP TABLE foo (id integer);
211+
CREATE TEMP TABLE bar (id1 integer, id2 integer);
212+
INSERT INTO foo VALUES (1);
213+
INSERT INTO bar VALUES (1, 1);
214+
INSERT INTO bar VALUES (2, 2);
215+
INSERT INTO bar VALUES (3, 1);
216+
-- These cases require an extra level of distinct-ing above subquery s
217+
SELECT * FROM foo WHERE id IN
218+
(SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
219+
id
220+
----
221+
1
222+
(1 row)
223+
224+
SELECT * FROM foo WHERE id IN
225+
(SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
226+
id
227+
----
228+
1
229+
(1 row)
230+
231+
SELECT * FROM foo WHERE id IN
232+
(SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
233+
SELECT id1, id2 FROM bar) AS s);
234+
id
235+
----
236+
1
237+
(1 row)
238+
239+
-- These cases do not
240+
SELECT * FROM foo WHERE id IN
241+
(SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
242+
id
243+
----
244+
1
245+
(1 row)
246+
247+
SELECT * FROM foo WHERE id IN
248+
(SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
249+
id
250+
----
251+
1
252+
(1 row)
253+
254+
SELECT * FROM foo WHERE id IN
255+
(SELECT id2 FROM (SELECT id2 FROM bar UNION
256+
SELECT id2 FROM bar) AS s);
257+
id
258+
----
259+
1
260+
(1 row)
261+
205262
--
206263
-- Test case to catch problems with multiply nested sub-SELECTs not getting
207264
-- recalculated properly. Per bug report from Didier Moens.

src/test/regress/sql/subselect.sql

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,39 @@ select count(distinct ss.ten) from
9595
(select ten from tenk1 a
9696
where unique1 IN (select distinct hundred from tenk1 b)) ss;
9797

98+
--
99+
-- Test cases to check for overenthusiastic optimization of
100+
-- "IN (SELECT DISTINCT ...)" and related cases. Per example from
101+
-- Luca Pireddu and Michael Fuhr.
102+
--
103+
104+
CREATE TEMP TABLE foo (id integer);
105+
CREATE TEMP TABLE bar (id1 integer, id2 integer);
106+
107+
INSERT INTO foo VALUES (1);
108+
109+
INSERT INTO bar VALUES (1, 1);
110+
INSERT INTO bar VALUES (2, 2);
111+
INSERT INTO bar VALUES (3, 1);
112+
113+
-- These cases require an extra level of distinct-ing above subquery s
114+
SELECT * FROM foo WHERE id IN
115+
(SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
116+
SELECT * FROM foo WHERE id IN
117+
(SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
118+
SELECT * FROM foo WHERE id IN
119+
(SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
120+
SELECT id1, id2 FROM bar) AS s);
121+
122+
-- These cases do not
123+
SELECT * FROM foo WHERE id IN
124+
(SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
125+
SELECT * FROM foo WHERE id IN
126+
(SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
127+
SELECT * FROM foo WHERE id IN
128+
(SELECT id2 FROM (SELECT id2 FROM bar UNION
129+
SELECT id2 FROM bar) AS s);
130+
98131
--
99132
-- Test case to catch problems with multiply nested sub-SELECTs not getting
100133
-- recalculated properly. Per bug report from Didier Moens.

0 commit comments

Comments
 (0)