
Commit 0c2070c

Fix cardinality estimates for parallel joins.

For a partial path, the cardinality estimate needs to reflect the number of rows we think each worker will see, rather than the total number of rows; otherwise, costing will go wrong.  The previous coding got this completely wrong for parallel joins.

Unfortunately, this change may destabilize plans for users of 9.6 who have enabled parallel query, but since 9.6 is still fairly new I'm hoping expectations won't be too settled yet.  Also, this is really a brown-paper-bag bug, so leaving it unfixed for the entire lifetime of 9.6 seems unwise.

Related reports (whose import I initially failed to recognize) by Tomas Vondra and Tom Lane.

Discussion: http://postgr.es/m/CA+TgmoaDxZ5z5Kw_oCQoymNxNoVaTCXzPaODcOuao=CzK8dMZw@mail.gmail.com
1 parent e2117e4 commit 0c2070c
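To make the effect of the fix concrete, here is a small standalone sketch (not part of the patch; the row count, worker count, and the harness itself are hypothetical) of the per-worker scaling the commit applies to partial join paths. The divisor mirrors the leader-contribution model used by the new get_parallel_divisor() below: each worker counts as 1, and the leader adds max(0, 1 - 0.3 * workers).

#include <stdio.h>

/* Hypothetical illustration: compute the parallel divisor the same way the
 * patch does, then scale a join's total row estimate to a per-worker figure. */
static double
parallel_divisor(int parallel_workers)
{
	double		divisor = parallel_workers;
	double		leader_contribution = 1.0 - 0.3 * parallel_workers;

	if (leader_contribution > 0)
		divisor += leader_contribution;
	return divisor;
}

int
main(void)
{
	double		total_join_rows = 1000000.0;	/* hypothetical join output */
	int			workers = 2;
	double		rows_per_worker = total_join_rows / parallel_divisor(workers);

	/* Before the fix, a partial join path kept the full 1,000,000-row
	 * estimate; with divisor 2.4, each worker is now costed on ~416,667. */
	printf("divisor = %.1f, rows per worker = %.0f\n",
		   parallel_divisor(workers), rows_per_worker);
	return 0;
}

The point of the scaling is exactly what the commit message says: nodes above a partial join but below the Gather should be costed on the rows one worker will actually handle, not on the join's total output.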

File tree

1 file changed: +48 −26 lines


src/backend/optimizer/path/costsize.c

+48 −26
@@ -161,6 +161,7 @@ static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
 static double relation_byte_size(double tuples, int width);
 static double page_size(double tuples, int width);
+static double get_parallel_divisor(Path *path);
 
 
 /*
@@ -238,32 +239,7 @@ cost_seqscan(Path *path, PlannerInfo *root,
 	/* Adjust costing for parallelism, if used. */
 	if (path->parallel_workers > 0)
 	{
-		double		parallel_divisor = path->parallel_workers;
-		double		leader_contribution;
-
-		/*
-		 * Early experience with parallel query suggests that when there is
-		 * only one worker, the leader often makes a very substantial
-		 * contribution to executing the parallel portion of the plan, but as
-		 * more workers are added, it does less and less, because it's busy
-		 * reading tuples from the workers and doing whatever non-parallel
-		 * post-processing is needed.  By the time we reach 4 workers, the
-		 * leader no longer makes a meaningful contribution.  Thus, for now,
-		 * estimate that the leader spends 30% of its time servicing each
-		 * worker, and the remainder executing the parallel plan.
-		 */
-		leader_contribution = 1.0 - (0.3 * path->parallel_workers);
-		if (leader_contribution > 0)
-			parallel_divisor += leader_contribution;
-
-		/*
-		 * In the case of a parallel plan, the row count needs to represent
-		 * the number of tuples processed per worker.  Otherwise, higher-level
-		 * plan nodes that appear below the gather will be costed incorrectly,
-		 * because they'll anticipate receiving more rows than any given copy
-		 * will actually get.
-		 */
-		path->rows = clamp_row_est(path->rows / parallel_divisor);
+		double		parallel_divisor = get_parallel_divisor(path);
 
 		/* The CPU cost is divided among all the workers. */
 		cpu_run_cost /= parallel_divisor;
@@ -274,6 +250,12 @@ cost_seqscan(Path *path, PlannerInfo *root,
 		 * prefetching.  For now, we assume that the disk run cost can't be
 		 * amortized at all.
 		 */
+
+		/*
+		 * In the case of a parallel plan, the row count needs to represent
+		 * the number of tuples processed per worker.
+		 */
+		path->rows = clamp_row_est(path->rows / parallel_divisor);
 	}
 
 	path->startup_cost = startup_cost;
@@ -2013,6 +1995,10 @@ final_cost_nestloop(PlannerInfo *root, NestPath *path,
 	else
 		path->path.rows = path->path.parent->rows;
 
+	/* For partial paths, scale row estimate. */
+	if (path->path.parallel_workers > 0)
+		path->path.rows /= get_parallel_divisor(&path->path);
+
 	/*
 	 * We could include disable_cost in the preliminary estimate, but that
 	 * would amount to optimizing for the case where the join method is
@@ -2431,6 +2417,10 @@ final_cost_mergejoin(PlannerInfo *root, MergePath *path,
 	else
 		path->jpath.path.rows = path->jpath.path.parent->rows;
 
+	/* For partial paths, scale row estimate. */
+	if (path->jpath.path.parallel_workers > 0)
+		path->jpath.path.rows /= get_parallel_divisor(&path->jpath.path);
+
 	/*
 	 * We could include disable_cost in the preliminary estimate, but that
 	 * would amount to optimizing for the case where the join method is
@@ -2810,6 +2800,10 @@ final_cost_hashjoin(PlannerInfo *root, HashPath *path,
 	else
 		path->jpath.path.rows = path->jpath.path.parent->rows;
 
+	/* For partial paths, scale row estimate. */
+	if (path->jpath.path.parallel_workers > 0)
+		path->jpath.path.rows /= get_parallel_divisor(&path->jpath.path);
+
 	/*
 	 * We could include disable_cost in the preliminary estimate, but that
 	 * would amount to optimizing for the case where the join method is
@@ -4798,3 +4792,31 @@ page_size(double tuples, int width)
 {
 	return ceil(relation_byte_size(tuples, width) / BLCKSZ);
 }
+
+/*
+ * Estimate the fraction of the work that each worker will do given the
+ * number of workers budgeted for the path.
+ */
+static double
+get_parallel_divisor(Path *path)
+{
+	double		parallel_divisor = path->parallel_workers;
+	double		leader_contribution;
+
+	/*
+	 * Early experience with parallel query suggests that when there is only
+	 * one worker, the leader often makes a very substantial contribution to
+	 * executing the parallel portion of the plan, but as more workers are
+	 * added, it does less and less, because it's busy reading tuples from the
+	 * workers and doing whatever non-parallel post-processing is needed.  By
+	 * the time we reach 4 workers, the leader no longer makes a meaningful
+	 * contribution.  Thus, for now, estimate that the leader spends 30% of
+	 * its time servicing each worker, and the remainder executing the
+	 * parallel plan.
+	 */
+	leader_contribution = 1.0 - (0.3 * path->parallel_workers);
+	if (leader_contribution > 0)
+		parallel_divisor += leader_contribution;
+
+	return parallel_divisor;
+}
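As a quick check on the leader-contribution model documented in get_parallel_divisor(), here is a hypothetical standalone harness (not part of the patch) that prints the divisor for small worker counts.

#include <stdio.h>

/* Hypothetical check of the divisor values produced by the
 * leader-contribution formula for small worker counts. */
int
main(void)
{
	for (int workers = 1; workers <= 6; workers++)
	{
		double		divisor = workers;
		double		leader_contribution = 1.0 - 0.3 * workers;

		if (leader_contribution > 0)
			divisor += leader_contribution;
		printf("%d workers -> divisor %.1f\n", workers, divisor);
	}
	return 0;
}

This prints 1.7, 2.4, 3.1, 4.0, 5.0, 6.0 for one through six workers: once four workers are reached, the leader-contribution term bottoms out at zero and the divisor is simply the worker count, matching the comment's claim that the leader stops making a meaningful contribution by that point.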
