Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 5543677

Browse files
committed
Use Limit instead of Unique to implement DISTINCT, when possible
When all of the query's DISTINCT pathkeys have been marked as redundant due to EquivalenceClasses existing which contain constants, we can just implement the DISTINCT operation on a query by just limiting the number of returned rows to 1 instead of performing a Unique on all of the matching (duplicate) rows. This applies in cases such as: SELECT DISTINCT col,col2 FROM tab WHERE col = 1 AND col2 = 10; If there are any matching rows, then they must all be {1,10}. There's no point in fetching all of those and running a Unique operator on them to leave only a single row. Here we effectively just find the first row and then stop. We are obviously unable to apply this optimization if either the col = 1 or col2 = 10 were missing from the WHERE clause or if there were any additional columns in the SELECT clause. Such queries are probably not all that common, but detecting when we can apply this optimization amounts to checking if the distinct_pathkeys are NULL, which is very cheap indeed. Nothing is done here to check if the query already has a LIMIT clause. If it does then the plan may end up with 2 Limits nodes. There's no harm in that and it's probably not worth the complexity to unify them into a single Limit node. Author: David Rowley Reviewed-by: Richard Guo Discussion: https://postgr.es/m/CAApHDvqS0j8RUWRUSgCAXxOqnYjHUXmKwspRj4GzVfOO25ByHA@mail.gmail.com Discussion: https://postgr.es/m/MEYPR01MB7101CD5DA0A07C9DE2B74850A4239@MEYPR01MB7101.ausprd01.prod.outlook.com
1 parent b1099ec commit 5543677

File tree

5 files changed

+221
-11
lines changed

5 files changed

+221
-11
lines changed

src/backend/optimizer/plan/planner.c

+66-11
Original file line numberDiff line numberDiff line change
@@ -4780,11 +4780,46 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
47804780

47814781
if (pathkeys_contained_in(needed_pathkeys, path->pathkeys))
47824782
{
4783-
add_path(distinct_rel, (Path *)
4784-
create_upper_unique_path(root, distinct_rel,
4785-
path,
4786-
list_length(root->distinct_pathkeys),
4787-
numDistinctRows));
4783+
/*
4784+
* distinct_pathkeys may have become empty if all of the
4785+
* pathkeys were determined to be redundant. If all of the
4786+
* pathkeys are redundant then each DISTINCT target must only
4787+
* allow a single value, therefore all resulting tuples must
4788+
* be identical (or at least indistinguishable by an equality
4789+
* check). We can uniquify these tuples simply by just taking
4790+
* the first tuple. All we do here is add a path to do "LIMIT
4791+
* 1" atop of 'path'. When doing a DISTINCT ON we may still
4792+
* have a non-NIL sort_pathkeys list, so we must still only do
4793+
* this with paths which are correctly sorted by
4794+
* sort_pathkeys.
4795+
*/
4796+
if (root->distinct_pathkeys == NIL)
4797+
{
4798+
Node *limitCount;
4799+
4800+
limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
4801+
sizeof(int64),
4802+
Int64GetDatum(1), false,
4803+
FLOAT8PASSBYVAL);
4804+
4805+
/*
4806+
* If the query already has a LIMIT clause, then we could
4807+
* end up with a duplicate LimitPath in the final plan.
4808+
* That does not seem worth troubling over too much.
4809+
*/
4810+
add_path(distinct_rel, (Path *)
4811+
create_limit_path(root, distinct_rel, path, NULL,
4812+
limitCount, LIMIT_OPTION_COUNT,
4813+
0, 1));
4814+
}
4815+
else
4816+
{
4817+
add_path(distinct_rel, (Path *)
4818+
create_upper_unique_path(root, distinct_rel,
4819+
path,
4820+
list_length(root->distinct_pathkeys),
4821+
numDistinctRows));
4822+
}
47884823
}
47894824
}
47904825

@@ -4805,13 +4840,33 @@ create_final_distinct_paths(PlannerInfo *root, RelOptInfo *input_rel,
48054840
path = (Path *) create_sort_path(root, distinct_rel,
48064841
path,
48074842
needed_pathkeys,
4808-
-1.0);
4843+
root->distinct_pathkeys == NIL ?
4844+
1.0 : -1.0);
48094845

4810-
add_path(distinct_rel, (Path *)
4811-
create_upper_unique_path(root, distinct_rel,
4812-
path,
4813-
list_length(root->distinct_pathkeys),
4814-
numDistinctRows));
4846+
/*
4847+
* As above, use a LimitPath instead of a UniquePath when all of the
4848+
* distinct_pathkeys are redundant and we're only going to get a
4849+
* series of tuples all with the same values anyway.
4850+
*/
4851+
if (root->distinct_pathkeys == NIL)
4852+
{
4853+
Node *limitCount = (Node *) makeConst(INT8OID, -1, InvalidOid,
4854+
sizeof(int64),
4855+
Int64GetDatum(1), false,
4856+
FLOAT8PASSBYVAL);
4857+
4858+
add_path(distinct_rel, (Path *)
4859+
create_limit_path(root, distinct_rel, path, NULL,
4860+
limitCount, LIMIT_OPTION_COUNT, 0, 1));
4861+
}
4862+
else
4863+
{
4864+
add_path(distinct_rel, (Path *)
4865+
create_upper_unique_path(root, distinct_rel,
4866+
path,
4867+
list_length(root->distinct_pathkeys),
4868+
numDistinctRows));
4869+
}
48154870
}
48164871

48174872
/*

src/test/regress/expected/select_distinct.out

+54
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,60 @@ RESET max_parallel_workers_per_gather;
279279
RESET min_parallel_table_scan_size;
280280
RESET parallel_setup_cost;
281281
RESET parallel_tuple_cost;
282+
--
283+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
284+
-- all of the distinct_pathkeys have been marked as redundant
285+
--
286+
-- Ensure we get a plan with a Limit 1
287+
EXPLAIN (COSTS OFF)
288+
SELECT DISTINCT four FROM tenk1 WHERE four = 0;
289+
QUERY PLAN
290+
----------------------------
291+
Limit
292+
-> Seq Scan on tenk1
293+
Filter: (four = 0)
294+
(3 rows)
295+
296+
-- Ensure the above gives us the correct result
297+
SELECT DISTINCT four FROM tenk1 WHERE four = 0;
298+
four
299+
------
300+
0
301+
(1 row)
302+
303+
-- Ensure we get a plan with a Limit 1
304+
EXPLAIN (COSTS OFF)
305+
SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
306+
QUERY PLAN
307+
---------------------------------------------
308+
Limit
309+
-> Seq Scan on tenk1
310+
Filter: ((two <> 0) AND (four = 0))
311+
(3 rows)
312+
313+
-- Ensure no rows are returned
314+
SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
315+
four
316+
------
317+
(0 rows)
318+
319+
-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
320+
EXPLAIN (COSTS OFF)
321+
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
322+
QUERY PLAN
323+
----------------------------
324+
Limit
325+
-> Seq Scan on tenk1
326+
Filter: (four = 0)
327+
(3 rows)
328+
329+
-- Ensure we only get 1 row
330+
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
331+
four | ?column? | ?column? | ?column?
332+
------+----------+----------+----------
333+
0 | 1 | 2 | 3
334+
(1 row)
335+
282336
--
283337
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
284338
-- very own regression file.

src/test/regress/expected/select_distinct_on.out

+50
Original file line numberDiff line numberDiff line change
@@ -73,3 +73,53 @@ select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
7373
0 | -2147483647
7474
(1 row)
7575

76+
--
77+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
78+
-- all of the distinct_pathkeys have been marked as redundant
79+
--
80+
-- Ensure we also get a LIMIT plan with DISTINCT ON
81+
EXPLAIN (COSTS OFF)
82+
SELECT DISTINCT ON (four) four,two
83+
FROM tenk1 WHERE four = 0 ORDER BY 1;
84+
QUERY PLAN
85+
----------------------------------
86+
Result
87+
-> Limit
88+
-> Seq Scan on tenk1
89+
Filter: (four = 0)
90+
(4 rows)
91+
92+
-- and check the result of the above query is correct
93+
SELECT DISTINCT ON (four) four,two
94+
FROM tenk1 WHERE four = 0 ORDER BY 1;
95+
four | two
96+
------+-----
97+
0 | 0
98+
(1 row)
99+
100+
-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
101+
EXPLAIN (COSTS OFF)
102+
SELECT DISTINCT ON (four) four,two
103+
FROM tenk1 WHERE four = 0 ORDER BY 1,2;
104+
QUERY PLAN
105+
----------------------------------
106+
Limit
107+
-> Sort
108+
Sort Key: two
109+
-> Seq Scan on tenk1
110+
Filter: (four = 0)
111+
(5 rows)
112+
113+
-- Same again but use a column that is indexed so that we get an index scan
114+
-- then a limit
115+
EXPLAIN (COSTS OFF)
116+
SELECT DISTINCT ON (four) four,hundred
117+
FROM tenk1 WHERE four = 0 ORDER BY 1,2;
118+
QUERY PLAN
119+
-----------------------------------------------------
120+
Result
121+
-> Limit
122+
-> Index Scan using tenk1_hundred on tenk1
123+
Filter: (four = 0)
124+
(4 rows)
125+

src/test/regress/sql/select_distinct.sql

+26
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,32 @@ RESET min_parallel_table_scan_size;
146146
RESET parallel_setup_cost;
147147
RESET parallel_tuple_cost;
148148

149+
--
150+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
151+
-- all of the distinct_pathkeys have been marked as redundant
152+
--
153+
154+
-- Ensure we get a plan with a Limit 1
155+
EXPLAIN (COSTS OFF)
156+
SELECT DISTINCT four FROM tenk1 WHERE four = 0;
157+
158+
-- Ensure the above gives us the correct result
159+
SELECT DISTINCT four FROM tenk1 WHERE four = 0;
160+
161+
-- Ensure we get a plan with a Limit 1
162+
EXPLAIN (COSTS OFF)
163+
SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
164+
165+
-- Ensure no rows are returned
166+
SELECT DISTINCT four FROM tenk1 WHERE four = 0 AND two <> 0;
167+
168+
-- Ensure we get a plan with a Limit 1 when the SELECT list contains constants
169+
EXPLAIN (COSTS OFF)
170+
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
171+
172+
-- Ensure we only get 1 row
173+
SELECT DISTINCT four,1,2,3 FROM tenk1 WHERE four = 0;
174+
149175
--
150176
-- Also, some tests of IS DISTINCT FROM, which doesn't quite deserve its
151177
-- very own regression file.

src/test/regress/sql/select_distinct_on.sql

+25
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,28 @@ SELECT DISTINCT ON (string4, ten) string4, ten, two
1717

1818
-- bug #5049: early 8.4.x chokes on volatile DISTINCT ON clauses
1919
select distinct on (1) floor(random()) as r, f1 from int4_tbl order by 1,2;
20+
21+
--
22+
-- Test the planner's ability to use a LIMIT 1 instead of a Unique node when
23+
-- all of the distinct_pathkeys have been marked as redundant
24+
--
25+
26+
-- Ensure we also get a LIMIT plan with DISTINCT ON
27+
EXPLAIN (COSTS OFF)
28+
SELECT DISTINCT ON (four) four,two
29+
FROM tenk1 WHERE four = 0 ORDER BY 1;
30+
31+
-- and check the result of the above query is correct
32+
SELECT DISTINCT ON (four) four,two
33+
FROM tenk1 WHERE four = 0 ORDER BY 1;
34+
35+
-- Ensure a Sort -> Limit is used when the ORDER BY contains additional cols
36+
EXPLAIN (COSTS OFF)
37+
SELECT DISTINCT ON (four) four,two
38+
FROM tenk1 WHERE four = 0 ORDER BY 1,2;
39+
40+
-- Same again but use a column that is indexed so that we get an index scan
41+
-- then a limit
42+
EXPLAIN (COSTS OFF)
43+
SELECT DISTINCT ON (four) four,hundred
44+
FROM tenk1 WHERE four = 0 ORDER BY 1,2;

0 commit comments

Comments
 (0)