If we expect a hash join to be performed in multiple batches, suppress

tglsfdc · tglsfdc · commit f38fbf31f571 · 2009-03-26T17:15:35.000Z
"physical tlist" optimization on the outer relation (ie, force a projection
step to occur in its scan).  This avoids storing useless column values when
the outer relation's tuples are written to temporary batch files.

Modified version of a patch by Michael Henderson and Ramon Lawrence.
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.355 2009/03/21 00:04:39 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.356 2009/03/26 17:15:34 tgl Exp $
  *
  * NOTES
  *	  Every node type that can appear in stored rules' parsetrees *must*
@@ -1448,6 +1448,7 @@ _outHashPath(StringInfo str, HashPath *node)
 	_outJoinPathInfo(str, (JoinPath *) node);
 
 	WRITE_NODE_FIELD(path_hashclauses);
+	WRITE_INT_FIELD(num_batches);
 }
 
 static void
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
@@ -54,7 +54,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.205 2009/03/21 00:04:39 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.206 2009/03/26 17:15:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1880,6 +1880,8 @@ cost_hashjoin(HashPath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo)
 							&numbatches,
 							&num_skew_mcvs);
 	virtualbuckets = (double) numbuckets *(double) numbatches;
+	/* mark the path with estimated # of batches */
+	path->num_batches = numbatches;
 
 	/*
 	 * Determine bucketsize fraction for inner relation.  We use the smallest
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.256 2009/03/21 00:04:39 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.257 2009/03/26 17:15:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1910,6 +1910,10 @@ create_hashjoin_plan(PlannerInfo *root,
 	/* We don't want any excess columns in the hashed tuples */
 	disuse_physical_tlist(inner_plan, best_path->jpath.innerjoinpath);
 
+	/* If we expect batching, suppress excess columns in outer tuples too */
+	if (best_path->num_batches > 1)
+		disuse_physical_tlist(outer_plan, best_path->jpath.outerjoinpath);
+
 	/*
 	 * If there is a single join clause and we can identify the outer
 	 * variable as a simple column reference, supply its identity for
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.150 2009/02/27 00:06:27 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.151 2009/03/26 17:15:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1480,9 +1480,20 @@ create_hashjoin_path(PlannerInfo *root,
 	pathnode->jpath.outerjoinpath = outer_path;
 	pathnode->jpath.innerjoinpath = inner_path;
 	pathnode->jpath.joinrestrictinfo = restrict_clauses;
-	/* A hashjoin never has pathkeys, since its ordering is unpredictable */
+	/*
+	 * A hashjoin never has pathkeys, since its output ordering is
+	 * unpredictable due to possible batching.  XXX If the inner relation is
+	 * small enough, we could instruct the executor that it must not batch,
+	 * and then we could assume that the output inherits the outer relation's
+	 * ordering, which might save a sort step.  However there is considerable
+	 * downside if our estimate of the inner relation size is badly off.
+	 * For the moment we don't risk it.  (Note also that if we wanted to take
+	 * this seriously, joinpath.c would have to consider many more paths for
+	 * the outer rel than it does now.)
+	 */
 	pathnode->jpath.path.pathkeys = NIL;
 	pathnode->path_hashclauses = hashclauses;
+	/* cost_hashjoin will fill in pathnode->num_batches */
 
 	cost_hashjoin(pathnode, root, sjinfo);
 
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.170 2009/03/05 23:06:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.171 2009/03/26 17:15:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -845,6 +845,7 @@ typedef struct HashPath
 {
 	JoinPath	jpath;
 	List	   *path_hashclauses;		/* join clauses used for hashing */
+	int			num_batches;			/* number of batches expected */
 } HashPath;
 
 /*

Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@`
`8`	`8`	`*`
`9`	`9`	`*`
`10`	`10`	`* IDENTIFICATION`
`11`		`- * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.355 2009/03/21 00:04:39 tgl Exp $`
	`11`	`+ * $PostgreSQL: pgsql/src/backend/nodes/outfuncs.c,v 1.356 2009/03/26 17:15:34 tgl Exp $`
`12`	`12`	`*`
`13`	`13`	`* NOTES`
`14`	`14`	`* Every node type that can appear in stored rules' parsetrees *must*`
`@@ -1448,6 +1448,7 @@ _outHashPath(StringInfo str, HashPath *node)`
`1448`	`1448`	`_outJoinPathInfo(str, (JoinPath *) node);`
`1449`	`1449`
`1450`	`1450`	`WRITE_NODE_FIELD(path_hashclauses);`
	`1451`	`+ WRITE_INT_FIELD(num_batches);`
`1451`	`1452`	`}`
`1452`	`1453`
`1453`	`1454`	`static void`
Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,7 @@`
`10`	`10`	`*`
`11`	`11`	`*`
`12`	`12`	`* IDENTIFICATION`
`13`		`- * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.256 2009/03/21 00:04:39 tgl Exp $`
	`13`	`+ * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.257 2009/03/26 17:15:35 tgl Exp $`
`14`	`14`	`*`
`15`	`15`	`*-------------------------------------------------------------------------`
`16`	`16`	`*/`
`@@ -1910,6 +1910,10 @@ create_hashjoin_plan(PlannerInfo *root,`
`1910`	`1910`	`/* We don't want any excess columns in the hashed tuples */`
`1911`	`1911`	`disuse_physical_tlist(inner_plan, best_path->jpath.innerjoinpath);`
`1912`	`1912`
	`1913`	`+ /* If we expect batching, suppress excess columns in outer tuples too */`
	`1914`	`+ if (best_path->num_batches > 1)`
	`1915`	`+ disuse_physical_tlist(outer_plan, best_path->jpath.outerjoinpath);`
	`1916`	`+`
`1913`	`1917`	`/*`
`1914`	`1918`	`* If there is a single join clause and we can identify the outer`
`1915`	`1919`	`* variable as a simple column reference, supply its identity for`
Original file line number	Diff line number	Diff line change
`@@ -7,7 +7,7 @@`
`7`	`7`	`* Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group`
`8`	`8`	`* Portions Copyright (c) 1994, Regents of the University of California`
`9`	`9`	`*`
`10`		`- * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.170 2009/03/05 23:06:45 tgl Exp $`
	`10`	`+ * $PostgreSQL: pgsql/src/include/nodes/relation.h,v 1.171 2009/03/26 17:15:35 tgl Exp $`
`11`	`11`	`*`
`12`	`12`	`*-------------------------------------------------------------------------`
`13`	`13`	`*/`
`@@ -845,6 +845,7 @@ typedef struct HashPath`
`845`	`845`	`{`
`846`	`846`	`JoinPath jpath;`
`847`	`847`	`List *path_hashclauses; /* join clauses used for hashing */`
	`848`	`+ int num_batches; /* number of batches expected */`
`848`	`849`	`} HashPath;`
`849`	`850`
`850`	`851`	`/*`