Fix overenthusiastic optimization of 'x IN (SELECT DISTINCT ...)' and related

tglsfdc · tglsfdc · commit 0182951bc86e · 2005-07-15T17:09:26.000Z
cases: we can't just consider whether the subquery's output is unique on its
own terms, we have to check whether the set of output columns we are going to
use will be unique.  Per complaint from Luca Pireddu and test case from
Michael Fuhr.
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.122 2005/06/05 22:32:56 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.123 2005/07/15 17:09:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -34,7 +34,8 @@
 #include "utils/syscache.h"
 
 
-static bool is_distinct_query(Query *query);
+static List *translate_sub_tlist(List *tlist, int relid);
+static bool query_is_distinct_for(Query *query, List *colnos);
 static bool hash_safe_tlist(List *tlist);
 
 
@@ -800,14 +801,41 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
 	pathnode->subpath = subpath;
 
 	/*
-	 * If the input is a subquery whose output must be unique already, we
-	 * don't need to do anything.
+	 * Try to identify the targetlist that will actually be unique-ified.
+	 * In current usage, this routine is only used for sub-selects of IN
+	 * clauses, so we should be able to find the tlist in in_info_list.
+	 */
+	sub_targetlist = NIL;
+	foreach(l, root->in_info_list)
+	{
+		InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
+
+		if (bms_equal(ininfo->righthand, rel->relids))
+		{
+			sub_targetlist = ininfo->sub_targetlist;
+			break;
+		}
+	}
+
+	/*
+	 * If the input is a subquery whose output must be unique already,
+	 * then we don't need to do anything.  The test for uniqueness has
+	 * to consider exactly which columns we are extracting; for example
+	 * "SELECT DISTINCT x,y" doesn't guarantee that x alone is distinct.
+	 * So we cannot check for this optimization unless we found our own
+	 * targetlist above, and it consists only of simple Vars referencing
+	 * subquery outputs.  (Possibly we could do something with expressions
+	 * in the subquery outputs, too, but for now keep it simple.)
 	 */
-	if (rel->rtekind == RTE_SUBQUERY)
+	if (sub_targetlist && rel->rtekind == RTE_SUBQUERY)
 	{
 		RangeTblEntry *rte = rt_fetch(rel->relid, root->parse->rtable);
+		List   *sub_tlist_colnos;
+
+		sub_tlist_colnos = translate_sub_tlist(sub_targetlist, rel->relid);
 
-		if (is_distinct_query(rte->subquery))
+		if (sub_tlist_colnos &&
+			query_is_distinct_for(rte->subquery, sub_tlist_colnos))
 		{
 			pathnode->umethod = UNIQUE_PATH_NOOP;
 			pathnode->rows = rel->rows;
@@ -821,23 +849,6 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
 		}
 	}
 
-	/*
-	 * Try to identify the targetlist that will actually be unique-ified.
-	 * In current usage, this routine is only used for sub-selects of IN
-	 * clauses, so we should be able to find the tlist in in_info_list.
-	 */
-	sub_targetlist = NIL;
-	foreach(l, root->in_info_list)
-	{
-		InClauseInfo *ininfo = (InClauseInfo *) lfirst(l);
-
-		if (bms_equal(ininfo->righthand, rel->relids))
-		{
-			sub_targetlist = ininfo->sub_targetlist;
-			break;
-		}
-	}
-
 	/*
 	 * If we know the targetlist, try to estimate number of result rows;
 	 * otherwise punt.
@@ -913,46 +924,83 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath)
 }
 
 /*
- * is_distinct_query - does query never return duplicate rows?
+ * translate_sub_tlist - get subquery column numbers represented by tlist
+ *
+ * The given targetlist should contain only Vars referencing the given relid.
+ * Extract their varattnos (ie, the column numbers of the subquery) and return
+ * as an integer List.
+ *
+ * If any of the tlist items is not a simple Var, we cannot determine whether
+ * the subquery's uniqueness condition (if any) matches ours, so punt and
+ * return NIL.
  */
-static bool
-is_distinct_query(Query *query)
+static List *
+translate_sub_tlist(List *tlist, int relid)
 {
-	/* DISTINCT (but not DISTINCT ON) guarantees uniqueness */
-	if (has_distinct_clause(query))
-		return true;
+	List	   *result = NIL;
+	ListCell   *l;
 
-	/* UNION, INTERSECT, EXCEPT guarantee uniqueness, except with ALL */
-	if (query->setOperations)
+	foreach(l, tlist)
 	{
-		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+		Var	   *var = (Var *) lfirst(l);
 
-		Assert(IsA(topop, SetOperationStmt));
-		Assert(topop->op != SETOP_NONE);
+		if (!var || !IsA(var, Var) ||
+			var->varno != relid)
+			return NIL;			/* punt */
 
-		if (!topop->all)
+		result = lappend_int(result, var->varattno);
+	}
+	return result;
+}
+
+/*
+ * query_is_distinct_for - does query never return duplicates of the
+ *		specified columns?
+ *
+ * colnos is an integer list of output column numbers (resno's).  We are
+ * interested in whether rows consisting of just these columns are certain
+ * to be distinct.
+ */
+static bool
+query_is_distinct_for(Query *query, List *colnos)
+{
+	ListCell   *l;
+
+	/*
+	 * DISTINCT (including DISTINCT ON) guarantees uniqueness if all the
+	 * columns in the DISTINCT clause appear in colnos.
+	 */
+	if (query->distinctClause)
+	{
+		foreach(l, query->distinctClause)
+		{
+			SortClause *scl = (SortClause *) lfirst(l);
+			TargetEntry *tle = get_sortgroupclause_tle(scl,
+													   query->targetList);
+
+			if (!list_member_int(colnos, tle->resno))
+				break;			/* exit early if no match */
+		}
+		if (l == NULL)			/* had matches for all? */
 			return true;
 	}
 
 	/*
-	 * GROUP BY guarantees uniqueness if all the grouped columns appear in
-	 * the output.	In our implementation this means checking they are non
-	 * resjunk columns.
+	 * Similarly, GROUP BY guarantees uniqueness if all the grouped columns
+	 * appear in colnos.
 	 */
 	if (query->groupClause)
 	{
-		ListCell   *gl;
-
-		foreach(gl, query->groupClause)
+		foreach(l, query->groupClause)
 		{
-			GroupClause *grpcl = (GroupClause *) lfirst(gl);
+			GroupClause *grpcl = (GroupClause *) lfirst(l);
 			TargetEntry *tle = get_sortgroupclause_tle(grpcl,
 													   query->targetList);
 
-			if (tle->resjunk)
-				break;
+			if (!list_member_int(colnos, tle->resno))
+				break;			/* exit early if no match */
 		}
-		if (!gl)				/* got to the end? */
+		if (l == NULL)			/* had matches for all? */
 			return true;
 	}
 	else
@@ -965,6 +1013,33 @@ is_distinct_query(Query *query)
 			return true;
 	}
 
+	/*
+	 * UNION, INTERSECT, EXCEPT guarantee uniqueness of the whole output row,
+	 * except with ALL
+	 */
+	if (query->setOperations)
+	{
+		SetOperationStmt *topop = (SetOperationStmt *) query->setOperations;
+
+		Assert(IsA(topop, SetOperationStmt));
+		Assert(topop->op != SETOP_NONE);
+
+		if (!topop->all)
+		{
+			/* We're good if all the nonjunk output columns are in colnos */
+			foreach(l, query->targetList)
+			{
+				TargetEntry *tle = (TargetEntry *) lfirst(l);
+
+				if (!tle->resjunk &&
+					!list_member_int(colnos, tle->resno))
+					break;		/* exit early if no match */
+			}
+			if (l == NULL)		/* had matches for all? */
+				return true;
+		}
+	}
+
 	/*
 	 * XXX Are there any other cases in which we can easily see the result
 	 * must be distinct?
diff --git a/src/test/regress/expected/subselect.out b/src/test/regress/expected/subselect.out
@@ -202,6 +202,63 @@ select count(distinct ss.ten) from
     10
 (1 row)
 
+--
+-- Test cases to check for overenthusiastic optimization of
+-- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+-- Luca Pireddu and Michael Fuhr.
+--
+CREATE TEMP TABLE foo (id integer);
+CREATE TEMP TABLE bar (id1 integer, id2 integer);
+INSERT INTO foo VALUES (1);
+INSERT INTO bar VALUES (1, 1);
+INSERT INTO bar VALUES (2, 2);
+INSERT INTO bar VALUES (3, 1);
+-- These cases require an extra level of distinct-ing above subquery s
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                      SELECT id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+-- These cases do not
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+ id 
+----
+  1
+(1 row)
+
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                      SELECT id2 FROM bar) AS s);
+ id 
+----
+  1
+(1 row)
+
 --
 -- Test case to catch problems with multiply nested sub-SELECTs not getting
 -- recalculated properly.  Per bug report from Didier Moens.
diff --git a/src/test/regress/sql/subselect.sql b/src/test/regress/sql/subselect.sql
@@ -95,6 +95,39 @@ select count(distinct ss.ten) from
   (select ten from tenk1 a
    where unique1 IN (select distinct hundred from tenk1 b)) ss;
 
+--
+-- Test cases to check for overenthusiastic optimization of
+-- "IN (SELECT DISTINCT ...)" and related cases.  Per example from
+-- Luca Pireddu and Michael Fuhr.
+--
+
+CREATE TEMP TABLE foo (id integer);
+CREATE TEMP TABLE bar (id1 integer, id2 integer);
+
+INSERT INTO foo VALUES (1);
+
+INSERT INTO bar VALUES (1, 1);
+INSERT INTO bar VALUES (2, 2);
+INSERT INTO bar VALUES (3, 1);
+
+-- These cases require an extra level of distinct-ing above subquery s
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT id1, id2 FROM bar) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1,id2 FROM bar GROUP BY id1,id2) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id1, id2 FROM bar UNION
+                      SELECT id1, id2 FROM bar) AS s);
+
+-- These cases do not
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT DISTINCT ON (id2) id1, id2 FROM bar) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar GROUP BY id2) AS s);
+SELECT * FROM foo WHERE id IN
+    (SELECT id2 FROM (SELECT id2 FROM bar UNION
+                      SELECT id2 FROM bar) AS s);
+
 --
 -- Test case to catch problems with multiply nested sub-SELECTs not getting
 -- recalculated properly.  Per bug report from Didier Moens.