Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 036bdce

Browse files
committed
Teach planner how to estimate rows for timestamp generate_series
This provides the planner with row estimates for generate_series(TIMESTAMP, TIMESTAMP, INTERVAL), generate_series(TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL) and generate_series(TIMESTAMPTZ, TIMESTAMPTZ, INTERVAL, TEXT) when the input parameter values can be estimated during planning. Author: David Rowley Reviewed-by: jian he <jian.universality@gmail.com> Discussion: https://postgr.es/m/CAApHDvrBE%3D%2BASo_sGYmQJ3GvO8GPvX5yxXhRS%3Dt_ybd4odFkhQ%40mail.gmail.com
1 parent 5193ca8 commit 036bdce

File tree

4 files changed

+291
-3
lines changed

4 files changed

+291
-3
lines changed

src/backend/utils/adt/timestamp.c

+88
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
#include "funcapi.h"
2828
#include "libpq/pqformat.h"
2929
#include "miscadmin.h"
30+
#include "optimizer/optimizer.h"
3031
#include "nodes/nodeFuncs.h"
3132
#include "nodes/supportnodes.h"
3233
#include "parser/scansup.h"
@@ -6680,6 +6681,93 @@ generate_series_timestamptz_at_zone(PG_FUNCTION_ARGS)
66806681
return generate_series_timestamptz_internal(fcinfo);
66816682
}
66826683

6684+
/*
6685+
* Planner support function for generate_series(timestamp, timestamp, interval)
6686+
*/
6687+
Datum
6688+
generate_series_timestamp_support(PG_FUNCTION_ARGS)
6689+
{
6690+
Node *rawreq = (Node *) PG_GETARG_POINTER(0);
6691+
Node *ret = NULL;
6692+
6693+
if (IsA(rawreq, SupportRequestRows))
6694+
{
6695+
/* Try to estimate the number of rows returned */
6696+
SupportRequestRows *req = (SupportRequestRows *) rawreq;
6697+
6698+
if (is_funcclause(req->node)) /* be paranoid */
6699+
{
6700+
List *args = ((FuncExpr *) req->node)->args;
6701+
Node *arg1,
6702+
*arg2,
6703+
*arg3;
6704+
6705+
/* We can use estimated argument values here */
6706+
arg1 = estimate_expression_value(req->root, linitial(args));
6707+
arg2 = estimate_expression_value(req->root, lsecond(args));
6708+
arg3 = estimate_expression_value(req->root, lthird(args));
6709+
6710+
/*
6711+
* If any argument is constant NULL, we can safely assume that
6712+
* zero rows are returned. Otherwise, if they're all non-NULL
6713+
* constants, we can calculate the number of rows that will be
6714+
* returned.
6715+
*/
6716+
if ((IsA(arg1, Const) && ((Const *) arg1)->constisnull) ||
6717+
(IsA(arg2, Const) && ((Const *) arg2)->constisnull) ||
6718+
(IsA(arg3, Const) && ((Const *) arg3)->constisnull))
6719+
{
6720+
req->rows = 0;
6721+
ret = (Node *) req;
6722+
}
6723+
else if (IsA(arg1, Const) && IsA(arg2, Const) && IsA(arg3, Const))
6724+
{
6725+
Timestamp start,
6726+
finish;
6727+
Interval *step;
6728+
Datum diff;
6729+
double dstep;
6730+
int64 dummy;
6731+
6732+
start = DatumGetTimestamp(((Const *) arg1)->constvalue);
6733+
finish = DatumGetTimestamp(((Const *) arg2)->constvalue);
6734+
step = DatumGetIntervalP(((Const *) arg3)->constvalue);
6735+
6736+
/*
6737+
* Perform some prechecks which could cause timestamp_mi to
6738+
* raise an ERROR. It's much better to just return some
6739+
* default estimate than error out in a support function.
6740+
*/
6741+
if (!TIMESTAMP_NOT_FINITE(start) && !TIMESTAMP_NOT_FINITE(finish) &&
6742+
!pg_sub_s64_overflow(finish, start, &dummy))
6743+
{
6744+
diff = DirectFunctionCall2(timestamp_mi,
6745+
TimestampGetDatum(finish),
6746+
TimestampGetDatum(start));
6747+
6748+
#define INTERVAL_TO_MICROSECONDS(i) ((((double) (i)->month * DAYS_PER_MONTH + (i)->day)) * USECS_PER_DAY + (i)->time)
6749+
6750+
dstep = INTERVAL_TO_MICROSECONDS(step);
6751+
6752+
/* This equation works for either sign of step */
6753+
if (dstep != 0.0)
6754+
{
6755+
Interval *idiff = DatumGetIntervalP(diff);
6756+
double ddiff = INTERVAL_TO_MICROSECONDS(idiff);
6757+
6758+
req->rows = floor(ddiff / dstep + 1.0);
6759+
ret = (Node *) req;
6760+
}
6761+
#undef INTERVAL_TO_MICROSECONDS
6762+
}
6763+
}
6764+
}
6765+
}
6766+
6767+
PG_RETURN_POINTER(ret);
6768+
}
6769+
6770+
66836771
/* timestamp_at_local()
66846772
* timestamptz_at_local()
66856773
*

src/include/catalog/pg_proc.dat

+9-3
Original file line numberDiff line numberDiff line change
@@ -8331,19 +8331,25 @@
83318331
prorettype => 'numeric', proargtypes => 'numeric numeric',
83328332
prosrc => 'generate_series_numeric' },
83338333
{ oid => '938', descr => 'non-persistent series generator',
8334-
proname => 'generate_series', prorows => '1000', proretset => 't',
8334+
proname => 'generate_series', prorows => '1000',
8335+
prosupport => 'generate_series_timestamp_support', proretset => 't',
83358336
prorettype => 'timestamp', proargtypes => 'timestamp timestamp interval',
83368337
prosrc => 'generate_series_timestamp' },
83378338
{ oid => '939', descr => 'non-persistent series generator',
8338-
proname => 'generate_series', prorows => '1000', proretset => 't',
8339+
proname => 'generate_series', prorows => '1000',
8340+
prosupport => 'generate_series_timestamp_support', proretset => 't',
83398341
provolatile => 's', prorettype => 'timestamptz',
83408342
proargtypes => 'timestamptz timestamptz interval',
83418343
prosrc => 'generate_series_timestamptz' },
83428344
{ oid => '6274', descr => 'non-persistent series generator',
8343-
proname => 'generate_series', prorows => '1000', proretset => 't',
8345+
proname => 'generate_series', prorows => '1000',
8346+
prosupport => 'generate_series_timestamp_support', proretset => 't',
83448347
prorettype => 'timestamptz',
83458348
proargtypes => 'timestamptz timestamptz interval text',
83468349
prosrc => 'generate_series_timestamptz_at_zone' },
8350+
{ oid => '8402', descr => 'planner support for generate_series',
8351+
proname => 'generate_series_timestamp_support', prorettype => 'internal',
8352+
proargtypes => 'internal', prosrc => 'generate_series_timestamp_support' },
83478353

83488354
# boolean aggregates
83498355
{ oid => '2515', descr => 'aggregate transition function',

src/test/regress/expected/misc_functions.out

+110
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,43 @@
22
\getenv libdir PG_LIBDIR
33
\getenv dlsuffix PG_DLSUFFIX
44
\set regresslib :libdir '/regress' :dlsuffix
5+
-- Function to assist with verifying EXPLAIN which includes costs. A series
6+
-- of bool flags allows control over which portions are masked out
7+
CREATE FUNCTION explain_mask_costs(query text, do_analyze bool,
8+
hide_costs bool, hide_row_est bool, hide_width bool) RETURNS setof text
9+
LANGUAGE plpgsql AS
10+
$$
11+
DECLARE
12+
ln text;
13+
analyze_str text;
14+
BEGIN
15+
IF do_analyze = true THEN
16+
analyze_str := 'on';
17+
ELSE
18+
analyze_str := 'off';
19+
END IF;
20+
21+
FOR ln IN
22+
EXECUTE format('explain (analyze %s, costs on, summary off, timing off) %s',
23+
analyze_str, query)
24+
LOOP
25+
IF hide_costs = true THEN
26+
ln := regexp_replace(ln, 'cost=\d+\.\d\d\.\.\d+\.\d\d', 'cost=N..N');
27+
END IF;
28+
29+
IF hide_row_est = true THEN
30+
-- don't use 'g' so that we leave the actual rows intact
31+
ln := regexp_replace(ln, 'rows=\d+', 'rows=N');
32+
END IF;
33+
34+
IF hide_width = true THEN
35+
ln := regexp_replace(ln, 'width=\d+', 'width=N');
36+
END IF;
37+
38+
RETURN NEXT ln;
39+
END LOOP;
40+
END;
41+
$$;
542
--
643
-- num_nulls()
744
--
@@ -594,6 +631,78 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g;
594631
Index Cond: (unique1 = g.g)
595632
(4 rows)
596633

634+
--
635+
-- Test the SupportRequestRows support function for generate_series_timestamp()
636+
--
637+
-- Ensure the row estimate matches the actual rows
638+
SELECT explain_mask_costs($$
639+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day') g(s);$$,
640+
true, true, false, true);
641+
explain_mask_costs
642+
------------------------------------------------------------------------------------------
643+
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
644+
(1 row)
645+
646+
-- As above but with generate_series_timestamp
647+
SELECT explain_mask_costs($$
648+
SELECT * FROM generate_series(TIMESTAMP '2024-02-01', TIMESTAMP '2024-03-01', INTERVAL '1 day') g(s);$$,
649+
true, true, false, true);
650+
explain_mask_costs
651+
------------------------------------------------------------------------------------------
652+
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
653+
(1 row)
654+
655+
-- As above but with generate_series_timestamptz_at_zone()
656+
SELECT explain_mask_costs($$
657+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day', 'UTC') g(s);$$,
658+
true, true, false, true);
659+
explain_mask_costs
660+
------------------------------------------------------------------------------------------
661+
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
662+
(1 row)
663+
664+
-- Ensure the estimated and actual row counts match when the range isn't
665+
-- evenly divisible by the step
666+
SELECT explain_mask_costs($$
667+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '7 day') g(s);$$,
668+
true, true, false, true);
669+
explain_mask_costs
670+
----------------------------------------------------------------------------------------
671+
Function Scan on generate_series g (cost=N..N rows=5 width=N) (actual rows=5 loops=1)
672+
(1 row)
673+
674+
-- Ensure the estimates match when step is decreasing
675+
SELECT explain_mask_costs($$
676+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '-1 day') g(s);$$,
677+
true, true, false, true);
678+
explain_mask_costs
679+
------------------------------------------------------------------------------------------
680+
Function Scan on generate_series g (cost=N..N rows=30 width=N) (actual rows=30 loops=1)
681+
(1 row)
682+
683+
-- Ensure an empty range estimates 1 row
684+
SELECT explain_mask_costs($$
685+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '1 day') g(s);$$,
686+
true, true, false, true);
687+
explain_mask_costs
688+
----------------------------------------------------------------------------------------
689+
Function Scan on generate_series g (cost=N..N rows=1 width=N) (actual rows=0 loops=1)
690+
(1 row)
691+
692+
-- Ensure we get the default row estimate for infinity values
693+
SELECT explain_mask_costs($$
694+
SELECT * FROM generate_series(TIMESTAMPTZ '-infinity', TIMESTAMPTZ 'infinity', INTERVAL '1 day') g(s);$$,
695+
false, true, false, true);
696+
explain_mask_costs
697+
-------------------------------------------------------------------
698+
Function Scan on generate_series g (cost=N..N rows=1000 width=N)
699+
(1 row)
700+
701+
-- Ensure the row estimate behaves correctly when step size is zero.
702+
-- We expect generate_series_timestamp() to throw the error rather than in
703+
-- the support function.
704+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '0 day') g(s);
705+
ERROR: step size cannot equal zero
597706
-- Test functions for control data
598707
SELECT count(*) > 0 AS ok FROM pg_control_checkpoint();
599708
ok
@@ -706,3 +815,4 @@ SELECT pg_column_toast_chunk_id(a) IS NULL,
706815
(1 row)
707816

708817
DROP TABLE test_chunk_id;
818+
DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool);

src/test/regress/sql/misc_functions.sql

+84
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,44 @@
44

55
\set regresslib :libdir '/regress' :dlsuffix
66

7+
-- Function to assist with verifying EXPLAIN which includes costs. A series
8+
-- of bool flags allows control over which portions are masked out
9+
CREATE FUNCTION explain_mask_costs(query text, do_analyze bool,
10+
hide_costs bool, hide_row_est bool, hide_width bool) RETURNS setof text
11+
LANGUAGE plpgsql AS
12+
$$
13+
DECLARE
14+
ln text;
15+
analyze_str text;
16+
BEGIN
17+
IF do_analyze = true THEN
18+
analyze_str := 'on';
19+
ELSE
20+
analyze_str := 'off';
21+
END IF;
22+
23+
FOR ln IN
24+
EXECUTE format('explain (analyze %s, costs on, summary off, timing off) %s',
25+
analyze_str, query)
26+
LOOP
27+
IF hide_costs = true THEN
28+
ln := regexp_replace(ln, 'cost=\d+\.\d\d\.\.\d+\.\d\d', 'cost=N..N');
29+
END IF;
30+
31+
IF hide_row_est = true THEN
32+
-- don't use 'g' so that we leave the actual rows intact
33+
ln := regexp_replace(ln, 'rows=\d+', 'rows=N');
34+
END IF;
35+
36+
IF hide_width = true THEN
37+
ln := regexp_replace(ln, 'width=\d+', 'width=N');
38+
END IF;
39+
40+
RETURN NEXT ln;
41+
END LOOP;
42+
END;
43+
$$;
44+
745
--
846
-- num_nulls()
947
--
@@ -224,6 +262,51 @@ SELECT * FROM tenk1 a JOIN my_gen_series(1,1000) g ON a.unique1 = g;
224262
EXPLAIN (COSTS OFF)
225263
SELECT * FROM tenk1 a JOIN my_gen_series(1,10) g ON a.unique1 = g;
226264

265+
--
266+
-- Test the SupportRequestRows support function for generate_series_timestamp()
267+
--
268+
269+
-- Ensure the row estimate matches the actual rows
270+
SELECT explain_mask_costs($$
271+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day') g(s);$$,
272+
true, true, false, true);
273+
274+
-- As above but with generate_series_timestamp
275+
SELECT explain_mask_costs($$
276+
SELECT * FROM generate_series(TIMESTAMP '2024-02-01', TIMESTAMP '2024-03-01', INTERVAL '1 day') g(s);$$,
277+
true, true, false, true);
278+
279+
-- As above but with generate_series_timestamptz_at_zone()
280+
SELECT explain_mask_costs($$
281+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '1 day', 'UTC') g(s);$$,
282+
true, true, false, true);
283+
284+
-- Ensure the estimated and actual row counts match when the range isn't
285+
-- evenly divisible by the step
286+
SELECT explain_mask_costs($$
287+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '7 day') g(s);$$,
288+
true, true, false, true);
289+
290+
-- Ensure the estimates match when step is decreasing
291+
SELECT explain_mask_costs($$
292+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '-1 day') g(s);$$,
293+
true, true, false, true);
294+
295+
-- Ensure an empty range estimates 1 row
296+
SELECT explain_mask_costs($$
297+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-03-01', TIMESTAMPTZ '2024-02-01', INTERVAL '1 day') g(s);$$,
298+
true, true, false, true);
299+
300+
-- Ensure we get the default row estimate for infinity values
301+
SELECT explain_mask_costs($$
302+
SELECT * FROM generate_series(TIMESTAMPTZ '-infinity', TIMESTAMPTZ 'infinity', INTERVAL '1 day') g(s);$$,
303+
false, true, false, true);
304+
305+
-- Ensure the row estimate behaves correctly when step size is zero.
306+
-- We expect generate_series_timestamp() to throw the error rather than in
307+
-- the support function.
308+
SELECT * FROM generate_series(TIMESTAMPTZ '2024-02-01', TIMESTAMPTZ '2024-03-01', INTERVAL '0 day') g(s);
309+
227310
-- Test functions for control data
228311
SELECT count(*) > 0 AS ok FROM pg_control_checkpoint();
229312
SELECT count(*) > 0 AS ok FROM pg_control_init();
@@ -273,3 +356,4 @@ SELECT pg_column_toast_chunk_id(a) IS NULL,
273356
pg_column_toast_chunk_id(b) IN (SELECT chunk_id FROM pg_toast.:toastrel)
274357
FROM test_chunk_id;
275358
DROP TABLE test_chunk_id;
359+
DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool);

0 commit comments

Comments
 (0)