Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 9bb3428

Browse files
committed
Rewrite the planner's handling of materialized plan types so that there is
an explicit model of rescan costs being different from first-time costs. The costing of Material nodes in particular now has some visible relationship to the actual runtime behavior, where before it was essentially fantasy. This also fixes up a couple of places where different materialized plan types were treated differently for no very good reason (probably just oversights). A couple of the regression tests are affected, because the planner now chooses to put the other relation on the inside of a nestloop-with-materialize. So far as I can see both changes are sane, and the planner is now more consistently following the expectation that it should prefer to materialize the smaller of two relations. Per a recent discussion with Robert Haas.
1 parent 5f1b32d commit 9bb3428

File tree

12 files changed

+1034
-921
lines changed

12 files changed

+1034
-921
lines changed

src/backend/executor/execAmi.c

+28-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
77
* Portions Copyright (c) 1994, Regents of the University of California
88
*
9-
* $PostgreSQL: pgsql/src/backend/executor/execAmi.c,v 1.103 2009/01/01 17:23:41 momjian Exp $
9+
* $PostgreSQL: pgsql/src/backend/executor/execAmi.c,v 1.104 2009/09/12 22:12:03 tgl Exp $
1010
*
1111
*-------------------------------------------------------------------------
1212
*/
@@ -496,3 +496,30 @@ IndexSupportsBackwardScan(Oid indexid)
496496

497497
return result;
498498
}
499+
500+
/*
501+
* ExecMaterializesOutput - does a plan type materialize its output?
502+
*
503+
* Returns true if the plan node type is one that automatically materializes
504+
* its output (typically by keeping it in a tuplestore). For such plans,
505+
* a rescan without any parameter change will have zero startup cost and
506+
* very low per-tuple cost.
507+
*/
508+
bool
509+
ExecMaterializesOutput(NodeTag plantype)
510+
{
511+
switch (plantype)
512+
{
513+
case T_Material:
514+
case T_FunctionScan:
515+
case T_CteScan:
516+
case T_WorkTableScan:
517+
case T_Sort:
518+
return true;
519+
520+
default:
521+
break;
522+
}
523+
524+
return false;
525+
}

src/backend/optimizer/path/costsize.c

+158-54
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
* Portions Copyright (c) 1994, Regents of the University of California
5555
*
5656
* IDENTIFICATION
57-
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.210 2009/07/11 04:09:33 tgl Exp $
57+
* $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.211 2009/09/12 22:12:03 tgl Exp $
5858
*
5959
*-------------------------------------------------------------------------
6060
*/
@@ -63,6 +63,7 @@
6363

6464
#include <math.h>
6565

66+
#include "executor/executor.h"
6667
#include "executor/nodeHash.h"
6768
#include "miscadmin.h"
6869
#include "nodes/nodeFuncs.h"
@@ -119,6 +120,8 @@ typedef struct
119120
static MergeScanSelCache *cached_scansel(PlannerInfo *root,
120121
RestrictInfo *rinfo,
121122
PathKey *pathkey);
123+
static void cost_rescan(PlannerInfo *root, Path *path,
124+
Cost *rescan_startup_cost, Cost *rescan_total_cost);
122125
static bool cost_qual_eval_walker(Node *node, cost_qual_eval_context *context);
123126
static bool adjust_semi_join(PlannerInfo *root, JoinPath *path,
124127
SpecialJoinInfo *sjinfo,
@@ -895,15 +898,26 @@ cost_functionscan(Path *path, PlannerInfo *root, RelOptInfo *baserel)
895898
rte = planner_rt_fetch(baserel->relid, root);
896899
Assert(rte->rtekind == RTE_FUNCTION);
897900

898-
/* Estimate costs of executing the function expression */
901+
/*
902+
* Estimate costs of executing the function expression.
903+
*
904+
* Currently, nodeFunctionscan.c always executes the function to
905+
* completion before returning any rows, and caches the results in a
906+
* tuplestore. So the function eval cost is all startup cost, and
907+
* per-row costs are minimal.
908+
*
909+
* XXX in principle we ought to charge tuplestore spill costs if the
910+
* number of rows is large. However, given how phony our rowcount
911+
* estimates for functions tend to be, there's not a lot of point
912+
* in that refinement right now.
913+
*/
899914
cost_qual_eval_node(&exprcost, rte->funcexpr, root);
900915

901-
startup_cost += exprcost.startup;
902-
cpu_per_tuple = exprcost.per_tuple;
916+
startup_cost += exprcost.startup + exprcost.per_tuple;
903917

904918
/* Add scanning CPU costs */
905919
startup_cost += baserel->baserestrictcost.startup;
906-
cpu_per_tuple += cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
920+
cpu_per_tuple = cpu_tuple_cost + baserel->baserestrictcost.per_tuple;
907921
run_cost += cpu_per_tuple * baserel->tuples;
908922

909923
path->startup_cost = startup_cost;
@@ -1176,41 +1190,44 @@ sort_exceeds_work_mem(Sort *sort)
11761190
*
11771191
* If the total volume of data to materialize exceeds work_mem, we will need
11781192
* to write it to disk, so the cost is much higher in that case.
1193+
*
1194+
* Note that here we are estimating the costs for the first scan of the
1195+
* relation, so the materialization is all overhead --- any savings will
1196+
* occur only on rescan, which is estimated in cost_rescan.
11791197
*/
11801198
void
11811199
cost_material(Path *path,
1182-
Cost input_cost, double tuples, int width)
1200+
Cost input_startup_cost, Cost input_total_cost,
1201+
double tuples, int width)
11831202
{
1184-
Cost startup_cost = input_cost;
1185-
Cost run_cost = 0;
1203+
Cost startup_cost = input_startup_cost;
1204+
Cost run_cost = input_total_cost - input_startup_cost;
11861205
double nbytes = relation_byte_size(tuples, width);
11871206
long work_mem_bytes = work_mem * 1024L;
11881207

1189-
/* disk costs */
1208+
/*
1209+
* Whether spilling or not, charge 2x cpu_tuple_cost per tuple to reflect
1210+
* bookkeeping overhead. (This rate must be more than cpu_tuple_cost;
1211+
* if it is exactly the same then there will be a cost tie between
1212+
* nestloop with A outer, materialized B inner and nestloop with B outer,
1213+
* materialized A inner. The extra cost ensures we'll prefer
1214+
* materializing the smaller rel.)
1215+
*/
1216+
run_cost += 2 * cpu_tuple_cost * tuples;
1217+
1218+
/*
1219+
* If we will spill to disk, charge at the rate of seq_page_cost per page.
1220+
* This cost is assumed to be evenly spread through the plan run phase,
1221+
* which isn't exactly accurate but our cost model doesn't allow for
1222+
* nonuniform costs within the run phase.
1223+
*/
11901224
if (nbytes > work_mem_bytes)
11911225
{
11921226
double npages = ceil(nbytes / BLCKSZ);
11931227

1194-
/* We'll write during startup and read during retrieval */
1195-
startup_cost += seq_page_cost * npages;
11961228
run_cost += seq_page_cost * npages;
11971229
}
11981230

1199-
/*
1200-
* Charge a very small amount per inserted tuple, to reflect bookkeeping
1201-
* costs. We use cpu_tuple_cost/10 for this. This is needed to break the
1202-
* tie that would otherwise exist between nestloop with A outer,
1203-
* materialized B inner and nestloop with B outer, materialized A inner.
1204-
* The extra cost ensures we'll prefer materializing the smaller rel.
1205-
*/
1206-
startup_cost += cpu_tuple_cost * 0.1 * tuples;
1207-
1208-
/*
1209-
* Also charge a small amount per extracted tuple. We use cpu_tuple_cost
1210-
* so that it doesn't appear worthwhile to materialize a bare seqscan.
1211-
*/
1212-
run_cost += cpu_tuple_cost * tuples;
1213-
12141231
path->startup_cost = startup_cost;
12151232
path->total_cost = startup_cost + run_cost;
12161233
}
@@ -1400,7 +1417,10 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14001417
Path *inner_path = path->innerjoinpath;
14011418
Cost startup_cost = 0;
14021419
Cost run_cost = 0;
1420+
Cost inner_rescan_start_cost;
1421+
Cost inner_rescan_total_cost;
14031422
Cost inner_run_cost;
1423+
Cost inner_rescan_run_cost;
14041424
Cost cpu_per_tuple;
14051425
QualCost restrict_qual_cost;
14061426
double outer_path_rows = PATH_ROWS(outer_path);
@@ -1413,32 +1433,26 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14131433
if (!enable_nestloop)
14141434
startup_cost += disable_cost;
14151435

1436+
/* estimate costs to rescan the inner relation */
1437+
cost_rescan(root, inner_path,
1438+
&inner_rescan_start_cost,
1439+
&inner_rescan_total_cost);
1440+
14161441
/* cost of source data */
14171442

14181443
/*
14191444
* NOTE: clearly, we must pay both outer and inner paths' startup_cost
14201445
* before we can start returning tuples, so the join's startup cost is
1421-
* their sum. What's not so clear is whether the inner path's
1422-
* startup_cost must be paid again on each rescan of the inner path. This
1423-
* is not true if the inner path is materialized or is a hashjoin, but
1424-
* probably is true otherwise.
1446+
* their sum. We'll also pay the inner path's rescan startup cost
1447+
* multiple times.
14251448
*/
14261449
startup_cost += outer_path->startup_cost + inner_path->startup_cost;
14271450
run_cost += outer_path->total_cost - outer_path->startup_cost;
1428-
if (IsA(inner_path, MaterialPath) ||
1429-
IsA(inner_path, HashPath))
1430-
{
1431-
/* charge only run cost for each iteration of inner path */
1432-
}
1433-
else
1434-
{
1435-
/*
1436-
* charge startup cost for each iteration of inner path, except we
1437-
* already charged the first startup_cost in our own startup
1438-
*/
1439-
run_cost += (outer_path_rows - 1) * inner_path->startup_cost;
1440-
}
1451+
if (outer_path_rows > 1)
1452+
run_cost += (outer_path_rows - 1) * inner_rescan_start_cost;
1453+
14411454
inner_run_cost = inner_path->total_cost - inner_path->startup_cost;
1455+
inner_rescan_run_cost = inner_rescan_total_cost - inner_rescan_start_cost;
14421456

14431457
if (adjust_semi_join(root, path, sjinfo,
14441458
&outer_match_frac,
@@ -1458,12 +1472,22 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14581472
* that fraction. (If we used a larger fuzz factor, we'd have to
14591473
* clamp inner_scan_frac to at most 1.0; but since match_count is at
14601474
* least 1, no such clamp is needed now.)
1475+
*
1476+
* A complicating factor is that rescans may be cheaper than first
1477+
* scans. If we never scan all the way to the end of the inner rel,
1478+
* it might be (depending on the plan type) that we'd never pay the
1479+
* whole inner first-scan run cost. However it is difficult to
1480+
* estimate whether that will happen, so be conservative and always
1481+
* charge the whole first-scan cost once.
14611482
*/
1483+
run_cost += inner_run_cost;
1484+
14621485
outer_matched_rows = rint(outer_path_rows * outer_match_frac);
14631486
inner_scan_frac = 2.0 / (match_count + 1.0);
14641487

1465-
/* Add inner run cost for outer tuples having matches */
1466-
run_cost += outer_matched_rows * inner_run_cost * inner_scan_frac;
1488+
/* Add inner run cost for additional outer tuples having matches */
1489+
if (outer_matched_rows > 1)
1490+
run_cost += (outer_matched_rows - 1) * inner_rescan_run_cost * inner_scan_frac;
14671491

14681492
/* Compute number of tuples processed (not number emitted!) */
14691493
ntuples = outer_matched_rows * inner_path_rows * inner_scan_frac;
@@ -1479,21 +1503,26 @@ cost_nestloop(NestPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
14791503
if (indexed_join_quals)
14801504
{
14811505
run_cost += (outer_path_rows - outer_matched_rows) *
1482-
inner_run_cost / inner_path_rows;
1483-
/* We won't be evaluating any quals at all for these rows */
1506+
inner_rescan_run_cost / inner_path_rows;
1507+
/*
1508+
* We won't be evaluating any quals at all for these rows,
1509+
* so don't add them to ntuples.
1510+
*/
14841511
}
14851512
else
14861513
{
14871514
run_cost += (outer_path_rows - outer_matched_rows) *
1488-
inner_run_cost;
1515+
inner_rescan_run_cost;
14891516
ntuples += (outer_path_rows - outer_matched_rows) *
14901517
inner_path_rows;
14911518
}
14921519
}
14931520
else
14941521
{
14951522
/* Normal case; we'll scan whole input rel for each outer row */
1496-
run_cost += outer_path_rows * inner_run_cost;
1523+
run_cost += inner_run_cost;
1524+
if (outer_path_rows > 1)
1525+
run_cost += (outer_path_rows - 1) * inner_rescan_run_cost;
14971526

14981527
/* Compute number of tuples processed (not number emitted!) */
14991528
ntuples = outer_path_rows * inner_path_rows;
@@ -2190,13 +2219,13 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
21902219

21912220
/*
21922221
* Also account for subplan's startup cost. If the subplan is
2193-
* uncorrelated or undirect correlated, AND its topmost node is a Sort
2194-
* or Material node, assume that we'll only need to pay its startup
2195-
* cost once; otherwise assume we pay the startup cost every time.
2222+
* uncorrelated or undirect correlated, AND its topmost node is one
2223+
* that materializes its output, assume that we'll only need to pay
2224+
* its startup cost once; otherwise assume we pay the startup cost
2225+
* every time.
21962226
*/
21972227
if (subplan->parParam == NIL &&
2198-
(IsA(plan, Sort) ||
2199-
IsA(plan, Material)))
2228+
ExecMaterializesOutput(nodeTag(plan)))
22002229
sp_cost.startup += plan->startup_cost;
22012230
else
22022231
sp_cost.per_tuple += plan->startup_cost;
@@ -2207,6 +2236,81 @@ cost_subplan(PlannerInfo *root, SubPlan *subplan, Plan *plan)
22072236
}
22082237

22092238

2239+
/*
2240+
* cost_rescan
2241+
* Given a finished Path, estimate the costs of rescanning it after
2242+
* having done so the first time. For some Path types a rescan is
2243+
* cheaper than an original scan (if no parameters change), and this
2244+
* function embodies knowledge about that. The default is to return
2245+
* the same costs stored in the Path. (Note that the cost estimates
2246+
* actually stored in Paths are always for first scans.)
2247+
*
2248+
* This function is not currently intended to model effects such as rescans
2249+
* being cheaper due to disk block caching; what we are concerned with is
2250+
* plan types wherein the executor caches results explicitly, or doesn't
2251+
* redo startup calculations, etc.
2252+
*/
2253+
static void
2254+
cost_rescan(PlannerInfo *root, Path *path,
2255+
Cost *rescan_startup_cost, /* output parameters */
2256+
Cost *rescan_total_cost)
2257+
{
2258+
switch (path->pathtype)
2259+
{
2260+
case T_FunctionScan:
2261+
/*
2262+
* Currently, nodeFunctionscan.c always executes the function
2263+
* to completion before returning any rows, and caches the
2264+
* results in a tuplestore. So the function eval cost is
2265+
* all startup cost and isn't paid over again on rescans.
2266+
* However, all run costs will be paid over again.
2267+
*/
2268+
*rescan_startup_cost = 0;
2269+
*rescan_total_cost = path->total_cost - path->startup_cost;
2270+
break;
2271+
case T_HashJoin:
2272+
/*
2273+
* Assume that all of the startup cost represents hash table
2274+
* building, which we won't have to do over.
2275+
*/
2276+
*rescan_startup_cost = 0;
2277+
*rescan_total_cost = path->total_cost - path->startup_cost;
2278+
break;
2279+
case T_Material:
2280+
case T_CteScan:
2281+
case T_WorkTableScan:
2282+
case T_Sort:
2283+
{
2284+
/*
2285+
* These plan types materialize their final result in a
2286+
* tuplestore or tuplesort object. So the rescan cost is only
2287+
* cpu_tuple_cost per tuple, unless the result is large enough
2288+
* to spill to disk.
2289+
*/
2290+
Cost run_cost = cpu_tuple_cost * path->parent->rows;
2291+
double nbytes = relation_byte_size(path->parent->rows,
2292+
path->parent->width);
2293+
long work_mem_bytes = work_mem * 1024L;
2294+
2295+
if (nbytes > work_mem_bytes)
2296+
{
2297+
/* It will spill, so account for re-read cost */
2298+
double npages = ceil(nbytes / BLCKSZ);
2299+
2300+
run_cost += seq_page_cost * npages;
2301+
}
2302+
*rescan_startup_cost = 0;
2303+
*rescan_total_cost = run_cost;
2304+
}
2305+
break;
2306+
default:
2307+
*rescan_startup_cost = path->startup_cost;
2308+
*rescan_total_cost = path->total_cost;
2309+
break;
2310+
}
2311+
}
2312+
2313+
22102314
/*
22112315
* cost_qual_eval
22122316
* Estimate the CPU costs of evaluating a WHERE clause.

0 commit comments

Comments
 (0)