Consider the number of columns when estimating the Sort's cost.

danolivo · danolivo · commit 4058c72a6e34 · 2024-07-25T09:28:01.000+07:00
During a multicolumn sort, the comparison operator is applied to each column
in turn. So, even setting aside the cost of any specific operator, we should
take into account the number of comparison calls made per tuple comparison.

This change affects the cost models of GatherMerge, IncrementalSort, Sort, and
MergeAppend. It also shifts the balance between sort-based (ordered) and
hash-based operations, which should be benchmarked and tuned carefully.
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -9984,13 +9984,16 @@ SELECT t1.a,t2.b,t3.c FROM fprt1 t1 INNER JOIN fprt2 t2 ON (t1.a = t2.b) INNER J
 -- left outer join + nullable clause
 EXPLAIN (VERBOSE, COSTS OFF)
 SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3;
-                                                                                                                     QUERY PLAN                                                                                                                     
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- Foreign Scan
+                                                                                    QUERY PLAN                                                                                     
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ Sort
    Output: t1.a, fprt2.b, fprt2.c
-   Relations: (public.ftprt1_p1 t1) LEFT JOIN (public.ftprt2_p1 fprt2)
-   Remote SQL: SELECT r5.a, r6.b, r6.c FROM (public.fprt1_p1 r5 LEFT JOIN public.fprt2_p1 r6 ON (((r5.a = r6.b)) AND ((r5.b = r6.a)) AND ((r6.a < 10)))) WHERE ((r5.a < 10)) ORDER BY r5.a ASC NULLS LAST, r6.b ASC NULLS LAST, r6.c ASC NULLS LAST
-(4 rows)
+   Sort Key: t1.a, fprt2.b, fprt2.c
+   ->  Foreign Scan
+         Output: t1.a, fprt2.b, fprt2.c
+         Relations: (public.ftprt1_p1 t1) LEFT JOIN (public.ftprt2_p1 fprt2)
+         Remote SQL: SELECT r5.a, r6.b, r6.c FROM (public.fprt1_p1 r5 LEFT JOIN public.fprt2_p1 r6 ON (((r5.a = r6.b)) AND ((r5.b = r6.a)) AND ((r6.a < 10)))) WHERE ((r5.a < 10))
+(7 rows)
 
 SELECT t1.a,t2.b,t2.c FROM fprt1 t1 LEFT JOIN (SELECT * FROM fprt2 WHERE a < 10) t2 ON (t1.a = t2.b and t1.b = t2.a) WHERE t1.a < 10 ORDER BY 1,2,3;
  a | b |  c   
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
@@ -481,6 +481,8 @@ cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
 	Cost		comparison_cost;
 	double		N;
 	double		logN;
+	int			npathkeys = list_length(((Path *) path)->pathkeys);
+	int			cmpMultiplier = (npathkeys == 0) ? 2.0 : npathkeys + 1.0;
 
 	/* Mark the path with the correct row estimate */
 	if (rows)
@@ -503,7 +505,7 @@ cost_gather_merge(GatherMergePath *path, PlannerInfo *root,
 	logN = LOG2(N);
 
 	/* Assumed cost per tuple comparison */
-	comparison_cost = 2.0 * cpu_operator_cost;
+	comparison_cost = cmpMultiplier * cpu_operator_cost;
 
 	/* Heap creation cost */
 	startup_cost += comparison_cost * N * logN;
@@ -1861,7 +1863,7 @@ cost_recursive_union(Path *runion, Path *nrterm, Path *rterm)
  */
 static void
 cost_tuplesort(Cost *startup_cost, Cost *run_cost,
-			   double tuples, int width,
+			   double tuples, int width, int cmpMultiplier,
 			   Cost comparison_cost, int sort_mem,
 			   double limit_tuples)
 {
@@ -1878,7 +1880,7 @@ cost_tuplesort(Cost *startup_cost, Cost *run_cost,
 		tuples = 2.0;
 
 	/* Include the default cost-per-comparison */
-	comparison_cost += 2.0 * cpu_operator_cost;
+	comparison_cost += cmpMultiplier * cpu_operator_cost;
 
 	/* Do we have a useful LIMIT? */
 	if (limit_tuples > 0 && limit_tuples < tuples)
@@ -2050,7 +2052,9 @@ cost_incremental_sort(Path *path,
 	 * are equal.
 	 */
 	cost_tuplesort(&group_startup_cost, &group_run_cost,
-				   group_tuples, width, comparison_cost, sort_mem,
+				   group_tuples, width,
+				   list_length(pathkeys) + 1,
+				   comparison_cost, sort_mem,
 				   limit_tuples);
 
 	/*
@@ -2074,7 +2078,7 @@ cost_incremental_sort(Path *path,
 	 * detect the sort groups. This is roughly equal to one extra copy and
 	 * comparison per tuple.
 	 */
-	run_cost += (cpu_tuple_cost + comparison_cost) * input_tuples;
+	run_cost += (cpu_tuple_cost + (presorted_keys + 1) * comparison_cost) * input_tuples;
 
 	/*
 	 * Additionally, we charge double cpu_tuple_cost for each input group to
@@ -2108,9 +2112,11 @@ cost_sort(Path *path, PlannerInfo *root,
 {
 	Cost		startup_cost;
 	Cost		run_cost;
+	int			cmpMultiplier =
+						(pathkeys == NIL) ? 2.0 : list_length(pathkeys) + 1.0;
 
 	cost_tuplesort(&startup_cost, &run_cost,
-				   tuples, width,
+				   tuples, width, cmpMultiplier,
 				   comparison_cost, sort_mem,
 				   limit_tuples);
 
@@ -2390,6 +2396,8 @@ cost_merge_append(Path *path, PlannerInfo *root,
 	Cost		comparison_cost;
 	double		N;
 	double		logN;
+	int			cmpMultiplier =
+						(pathkeys == NIL) ? 2.0 : list_length(pathkeys) + 1.0;
 
 	/*
 	 * Avoid log(0)...
@@ -2398,7 +2406,7 @@ cost_merge_append(Path *path, PlannerInfo *root,
 	logN = LOG2(N);
 
 	/* Assumed cost per tuple comparison */
-	comparison_cost = 2.0 * cpu_operator_cost;
+	comparison_cost = cmpMultiplier * cpu_operator_cost;
 
 	/* Heap creation cost */
 	startup_cost += comparison_cost * N * logN;
diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out
@@ -3001,17 +3001,18 @@ ANALYZE agg_sort_order;
 EXPLAIN (COSTS OFF)
 SELECT array_agg(c1 ORDER BY c2),c2
 FROM agg_sort_order WHERE c2 < 100 GROUP BY c1 ORDER BY 2;
-                                 QUERY PLAN                                 
-----------------------------------------------------------------------------
+                                QUERY PLAN                                
+--------------------------------------------------------------------------
  Sort
    Sort Key: c2
    ->  GroupAggregate
          Group Key: c1
-         ->  Sort
+         ->  Incremental Sort
                Sort Key: c1, c2
-               ->  Index Scan using agg_sort_order_c2_idx on agg_sort_order
-                     Index Cond: (c2 < 100)
-(8 rows)
+               Presorted Key: c1
+               ->  Index Scan using agg_sort_order_pkey on agg_sort_order
+                     Filter: (c2 < 100)
+(9 rows)
 
 DROP TABLE agg_sort_order CASCADE;
 DROP TABLE btg;
diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out
@@ -5726,18 +5726,20 @@ select d.* from d left join (select * from b group by b.id, b.c_id) s
 explain (costs off)
 select d.* from d left join (select distinct * from b) s
   on d.a = s.id;
-              QUERY PLAN              
---------------------------------------
- Merge Right Join
-   Merge Cond: (b.id = d.a)
-   ->  Unique
-         ->  Sort
-               Sort Key: b.id, b.c_id
-               ->  Seq Scan on b
+                 QUERY PLAN                  
+---------------------------------------------
+ Merge Left Join
+   Merge Cond: (d.a = s.id)
    ->  Sort
          Sort Key: d.a
          ->  Seq Scan on d
-(9 rows)
+   ->  Sort
+         Sort Key: s.id
+         ->  Subquery Scan on s
+               ->  HashAggregate
+                     Group Key: b.id, b.c_id
+                     ->  Seq Scan on b
+(11 rows)
 
 -- join removal is not possible here
 explain (costs off)
diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out
@@ -1235,9 +1235,11 @@ EXPLAIN (COSTS OFF)
 SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b;
                                  QUERY PLAN                                 
 ----------------------------------------------------------------------------
- Sort
+ Incremental Sort
    Sort Key: t1.a, t2.b, ((t3.a + t3.b))
-   ->  Append
+   Presorted Key: t1.a
+   ->  Merge Append
+         Sort Key: t1.a
          ->  Merge Left Join
                Merge Cond: (t1_1.a = t2_1.b)
                ->  Sort
@@ -1286,7 +1288,7 @@ SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2
                ->  Sort
                      Sort Key: t2_3.b
                      ->  Seq Scan on prt2_p3 t2_3
-(51 rows)
+(53 rows)
 
 SELECT t1.a, t1.c, t2.b, t2.c, t3.a + t3.b, t3.c FROM (prt1 t1 LEFT JOIN prt2 t2 ON t1.a = t2.b) RIGHT JOIN prt1_e t3 ON (t1.a = (t3.a + t3.b)/2) WHERE t3.c = 0 ORDER BY t1.a, t2.b, t3.a + t3.b;
   a  |  c   |  b  |  c   | ?column? | c 
diff --git a/src/test/regress/expected/union.out b/src/test/regress/expected/union.out
@@ -1224,18 +1224,17 @@ SELECT * FROM
    SELECT 2 AS t, 4 AS x) ss
 WHERE x < 4
 ORDER BY x;
-                    QUERY PLAN                    
---------------------------------------------------
+                 QUERY PLAN                 
+--------------------------------------------
  Sort
    Sort Key: (2)
-   ->  Unique
-         ->  Sort
-               Sort Key: (1), (2)
-               ->  Append
-                     ->  Result
-                     ->  Result
-                           One-Time Filter: false
-(9 rows)
+   ->  HashAggregate
+         Group Key: (1), (2)
+         ->  Append
+               ->  Result
+               ->  Result
+                     One-Time Filter: false
+(8 rows)
 
 SELECT * FROM
   (SELECT 1 AS t, 2 AS x
@@ -1289,19 +1288,18 @@ SELECT * FROM
    SELECT 2 AS t, 4 AS x) ss
 WHERE x > 3
 ORDER BY x;
-                                     QUERY PLAN                                     
-------------------------------------------------------------------------------------
+                                  QUERY PLAN                                   
+-------------------------------------------------------------------------------
  Sort
    Sort Key: ss.x
    ->  Subquery Scan on ss
          Filter: (ss.x > 3)
-         ->  Unique
-               ->  Sort
-                     Sort Key: (1), (((random() * '3'::double precision))::integer)
-                     ->  Append
-                           ->  Result
-                           ->  Result
-(10 rows)
+         ->  HashAggregate
+               Group Key: (1), (((random() * '3'::double precision))::integer)
+               ->  Append
+                     ->  Result
+                     ->  Result
+(9 rows)
 
 SELECT * FROM
   (SELECT 1 AS t, (random()*3)::int AS x
@@ -1322,24 +1320,22 @@ select distinct q1 from
    union all
    select distinct * from int8_tbl i82) ss
 where q2 = q2;
-                        QUERY PLAN                        
-----------------------------------------------------------
- Unique
-   ->  Merge Append
-         Sort Key: "*SELECT* 1".q1
+                     QUERY PLAN                     
+----------------------------------------------------
+ HashAggregate
+   Group Key: "*SELECT* 1".q1
+   ->  Append
          ->  Subquery Scan on "*SELECT* 1"
-               ->  Unique
-                     ->  Sort
-                           Sort Key: i81.q1, i81.q2
-                           ->  Seq Scan on int8_tbl i81
-                                 Filter: (q2 IS NOT NULL)
+               ->  HashAggregate
+                     Group Key: i81.q1, i81.q2
+                     ->  Seq Scan on int8_tbl i81
+                           Filter: (q2 IS NOT NULL)
          ->  Subquery Scan on "*SELECT* 2"
-               ->  Unique
-                     ->  Sort
-                           Sort Key: i82.q1, i82.q2
-                           ->  Seq Scan on int8_tbl i82
-                                 Filter: (q2 IS NOT NULL)
-(15 rows)
+               ->  HashAggregate
+                     Group Key: i82.q1, i82.q2
+                     ->  Seq Scan on int8_tbl i82
+                           Filter: (q2 IS NOT NULL)
+(13 rows)
 
 select distinct q1 from
   (select distinct * from int8_tbl i81