
Commit 78530c8

Add hash_mem_multiplier GUC.
Add a GUC that acts as a multiplier on work_mem.  It gets applied when sizing executor node hash tables that were previously size constrained using work_mem alone.

The new GUC can be used to preferentially give hash-based nodes more memory than the generic work_mem limit.  It is intended to enable admin tuning of the executor's memory usage.  Overall system throughput and system responsiveness can be improved by giving hash-based executor nodes more memory (especially over sort-based alternatives, which are often much less sensitive to being memory constrained).

The default value for hash_mem_multiplier is 1.0, which is also the minimum valid value.  This means that hash-based nodes continue to apply work_mem in the traditional way by default.

hash_mem_multiplier is generally useful.  However, it is being added now due to concerns about hash aggregate performance stability for users that upgrade to Postgres 13 (which added disk-based hash aggregation in commit 1f39bce).  While the old hash aggregate behavior risked out-of-memory errors, it is nevertheless likely that many users actually benefited.  Hash agg's previous indifference to work_mem during query execution was not just faster; it also accidentally made aggregation resilient to grouping estimate problems (at least in cases where this didn't create destabilizing memory pressure).

hash_mem_multiplier can provide a certain kind of continuity with the behavior of Postgres 12 hash aggregates in cases where the planner incorrectly estimates that all groups (plus related allocations) will fit in work_mem/hash_mem.  This seems necessary because hash-based aggregation is usually much slower when only a small fraction of all groups can fit.  Even when it isn't possible to totally avoid hash aggregates that spill, giving hash aggregation more memory will reliably improve performance (the same cannot be said for external sort operations, which appear to be almost unaffected by memory availability provided it's at least possible to get a single merge pass).

The PostgreSQL 13 release notes should advise users that increasing hash_mem_multiplier can help with performance regressions associated with hash aggregation.  That can be taken care of by a later commit.

Author: Peter Geoghegan
Reviewed-By: Álvaro Herrera, Jeff Davis
Discussion: https://postgr.es/m/20200625203629.7m6yvut7eqblgmfo@alap3.anarazel.de
Discussion: https://postgr.es/m/CAH2-WzmD%2Bi1pG6rc1%2BCjc4V6EaFJ_qSuKCCHVnH%3DoruqD-zqow%40mail.gmail.com
Backpatch: 13-, where disk-based hash aggregation was introduced.
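To make the mechanics concrete: the effective budget for hash-based nodes is simply work_mem scaled by hash_mem_multiplier, clamped to the same MAX_KILOBYTES cap that work_mem itself honors.  The standalone C sketch below mirrors the arithmetic of the get_hash_mem() helper added by this commit (see nodeHash.c further down); the GUC values and the MAX_KILOBYTES stand-in are hard-coded assumptions for illustration only.

#include <limits.h>
#include <stdio.h>

/* Hard-coded stand-ins for the real GUCs and the guc.h cap (assumptions) */
#define MAX_KILOBYTES INT_MAX              /* platform-dependent in the real tree */
static int    work_mem = 4096;             /* KB; the 4MB default */
static double hash_mem_multiplier = 2.0;   /* 1.0 is the default */

/* Mirrors the arithmetic of get_hash_mem() */
static int
effective_hash_mem(void)
{
    double  hash_mem = (double) work_mem * hash_mem_multiplier;

    /* Clamp rather than error out, so the two GUCs can be set independently */
    if (hash_mem < MAX_KILOBYTES)
        return (int) hash_mem;
    return MAX_KILOBYTES;
}

int
main(void)
{
    /* work_mem = 4MB and hash_mem_multiplier = 2.0 yields 8192 KB */
    printf("hash-based nodes may use up to %d KB\n", effective_hash_mem());
    return 0;
}

Sort-based nodes continue to see plain work_mem; only hash-based nodes consult the scaled value.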
1 parent 3a232a3 commit 78530c8

19 files changed: +205 -81 lines changed

doc/src/sgml/config.sgml

Lines changed: 51 additions & 9 deletions
@@ -1663,22 +1663,64 @@ include_dir 'conf.d'
      </term>
      <listitem>
       <para>
-        Sets the maximum amount of memory to be used by a query operation
+        Sets the base maximum amount of memory to be used by a query operation
        (such as a sort or hash table) before writing to temporary disk files.
        If this value is specified without units, it is taken as kilobytes.
        The default value is four megabytes (<literal>4MB</literal>).
        Note that for a complex query, several sort or hash operations might be
-        running in parallel; each operation will be allowed to use as much memory
-        as this value specifies before it starts to write data into temporary
-        files.  Also, several running sessions could be doing such operations
-        concurrently.  Therefore, the total memory used could be many
-        times the value of <varname>work_mem</varname>; it is necessary to
-        keep this fact in mind when choosing the value.  Sort operations are
-        used for <literal>ORDER BY</literal>, <literal>DISTINCT</literal>, and
-        merge joins.
+        running in parallel; each operation will generally be allowed
+        to use as much memory as this value specifies before it starts
+        to write data into temporary files.  Also, several running
+        sessions could be doing such operations concurrently.
+        Therefore, the total memory used could be many times the value
+        of <varname>work_mem</varname>; it is necessary to keep this
+        fact in mind when choosing the value.  Sort operations are used
+        for <literal>ORDER BY</literal>, <literal>DISTINCT</literal>,
+        and merge joins.
        Hash tables are used in hash joins, hash-based aggregation, and
        hash-based processing of <literal>IN</literal> subqueries.
       </para>
+       <para>
+        Hash-based operations are generally more sensitive to memory
+        availability than equivalent sort-based operations.  The
+        memory available for hash tables is computed by multiplying
+        <varname>work_mem</varname> by
+        <varname>hash_mem_multiplier</varname>.  This makes it
+        possible for hash-based operations to use an amount of memory
+        that exceeds the usual <varname>work_mem</varname> base
+        amount.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry id="guc-hash-mem-multiplier" xreflabel="hash_mem_multiplier">
+      <term><varname>hash_mem_multiplier</varname> (<type>floating point</type>)
+      <indexterm>
+       <primary><varname>hash_mem_multiplier</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Used to compute the maximum amount of memory that hash-based
+        operations can use.  The final limit is determined by
+        multiplying <varname>work_mem</varname> by
+        <varname>hash_mem_multiplier</varname>.  The default value is
+        1.0, which makes hash-based operations subject to the same
+        simple <varname>work_mem</varname> maximum as sort-based
+        operations.
+       </para>
+       <para>
+        Consider increasing <varname>hash_mem_multiplier</varname> in
+        environments where spilling by query operations is a regular
+        occurrence, especially when simply increasing
+        <varname>work_mem</varname> results in memory pressure (memory
+        pressure typically takes the form of intermittent out of
+        memory errors).  A setting of 1.5 or 2.0 may be effective with
+        mixed workloads.  Higher settings in the range of 2.0 - 8.0 or
+        more may be effective in environments where
+        <varname>work_mem</varname> has already been increased to 40MB
+        or more.
+       </para>
       </listitem>
      </varlistentry>
 

doc/src/sgml/ref/postgres-ref.sgml

Lines changed: 4 additions & 4 deletions
@@ -338,10 +338,10 @@ PostgreSQL documentation
      <term><option>-S</option> <replaceable class="parameter">work-mem</replaceable></term>
      <listitem>
       <para>
-        Specifies the amount of memory to be used by internal sorts and hashes
-        before resorting to temporary disk files.  See the description of the
-        <varname>work_mem</varname> configuration parameter in <xref
-        linkend="runtime-config-resource-memory"/>.
+        Specifies the base amount of memory to be used by sorts and
+        hash tables before resorting to temporary disk files.  See the
+        description of the <varname>work_mem</varname> configuration
+        parameter in <xref linkend="runtime-config-resource-memory"/>.
       </para>
      </listitem>
     </varlistentry>

doc/src/sgml/runtime.sgml

Lines changed: 6 additions & 4 deletions
@@ -1326,10 +1326,12 @@ Out of Memory: Killed process 12345 (postgres).
    system running out of memory, you can avoid the problem by changing
    your configuration.  In some cases, it may help to lower memory-related
    configuration parameters, particularly
-   <link linkend="guc-shared-buffers"><varname>shared_buffers</varname></link>
-   and <link linkend="guc-work-mem"><varname>work_mem</varname></link>.  In
-   other cases, the problem may be caused by allowing too many connections
-   to the database server itself.  In many cases, it may be better to reduce
+   <link linkend="guc-shared-buffers"><varname>shared_buffers</varname></link>,
+   <link linkend="guc-work-mem"><varname>work_mem</varname></link>, and
+   <link linkend="guc-hash-mem-multiplier"><varname>hash_mem_multiplier</varname></link>.
+   In other cases, the problem may be caused by allowing too many
+   connections to the database server itself.  In many cases, it may
+   be better to reduce
    <link linkend="guc-max-connections"><varname>max_connections</varname></link>
    and instead make use of external connection-pooling software.
   </para>

src/backend/executor/execGrouping.c

Lines changed: 3 additions & 2 deletions
@@ -165,13 +165,14 @@ BuildTupleHashTableExt(PlanState *parent,
 {
     TupleHashTable hashtable;
     Size        entrysize = sizeof(TupleHashEntryData) + additionalsize;
+    int         hash_mem = get_hash_mem();
     MemoryContext oldcontext;
     bool        allow_jit;
 
     Assert(nbuckets > 0);
 
-    /* Limit initial table size request to not more than work_mem */
-    nbuckets = Min(nbuckets, (long) ((work_mem * 1024L) / entrysize));
+    /* Limit initial table size request to not more than hash_mem */
+    nbuckets = Min(nbuckets, (long) ((hash_mem * 1024L) / entrysize));
 
     oldcontext = MemoryContextSwitchTo(metacxt);
 
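The behavioral change in BuildTupleHashTableExt() above is only that the initial bucket-count request is capped by the scaled hash_mem budget instead of bare work_mem.  A rough standalone sketch of that clamp, with the entry size, requested bucket count, and hash_mem value all invented for illustration:

#include <stdio.h>

#define Min(x, y) ((x) < (y) ? (x) : (y))

int
main(void)
{
    long    nbuckets = 1000000;     /* hypothetical caller estimate of group count */
    long    entrysize = 64;         /* hypothetical bytes per hash entry */
    int     hash_mem = 8192;        /* KB, e.g. work_mem=4MB with hash_mem_multiplier=2.0 */

    /* Same clamp as the patched BuildTupleHashTableExt() */
    nbuckets = Min(nbuckets, (hash_mem * 1024L) / entrysize);

    /* 8192 * 1024 / 64 = 131072, so the initial request is trimmed to 131072 */
    printf("initial nbuckets = %ld\n", nbuckets);
    return 0;
}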

src/backend/executor/nodeAgg.c

Lines changed: 16 additions & 14 deletions
@@ -203,7 +203,7 @@
  * entries (and initialize new transition states), we instead spill them to
  * disk to be processed later.  The tuples are spilled in a partitioned
  * manner, so that subsequent batches are smaller and less likely to exceed
- * work_mem (if a batch does exceed work_mem, it must be spilled
+ * hash_mem (if a batch does exceed hash_mem, it must be spilled
  * recursively).
  *
  * Spilled data is written to logical tapes.  These provide better control
@@ -212,7 +212,7 @@
  *
  * Note that it's possible for transition states to start small but then
  * grow very large; for instance in the case of ARRAY_AGG.  In such cases,
- * it's still possible to significantly exceed work_mem.  We try to avoid
+ * it's still possible to significantly exceed hash_mem.  We try to avoid
 * this situation by estimating what will fit in the available memory, and
 * imposing a limit on the number of groups separately from the amount of
 * memory consumed.
@@ -1516,7 +1516,7 @@ build_hash_table(AggState *aggstate, int setno, long nbuckets)
 
     /*
      * Used to make sure initial hash table allocation does not exceed
-     * work_mem.  Note that the estimate does not include space for
+     * hash_mem.  Note that the estimate does not include space for
      * pass-by-reference transition data values, nor for the representative
      * tuple of each group.
      */
@@ -1782,7 +1782,7 @@ hashagg_recompile_expressions(AggState *aggstate, bool minslot, bool nullcheck)
 }
 
 /*
- * Set limits that trigger spilling to avoid exceeding work_mem. Consider the
+ * Set limits that trigger spilling to avoid exceeding hash_mem. Consider the
  * number of partitions we expect to create (if we do spill).
  *
  * There are two limits: a memory limit, and also an ngroups limit. The
@@ -1796,13 +1796,14 @@ hash_agg_set_limits(double hashentrysize, double input_groups, int used_bits,
 {
     int         npartitions;
     Size        partition_mem;
+    int         hash_mem = get_hash_mem();
 
-    /* if not expected to spill, use all of work_mem */
-    if (input_groups * hashentrysize < work_mem * 1024L)
+    /* if not expected to spill, use all of hash_mem */
+    if (input_groups * hashentrysize < hash_mem * 1024L)
     {
         if (num_partitions != NULL)
             *num_partitions = 0;
-        *mem_limit = work_mem * 1024L;
+        *mem_limit = hash_mem * 1024L;
         *ngroups_limit = *mem_limit / hashentrysize;
         return;
     }
@@ -1824,14 +1825,14 @@
         HASHAGG_WRITE_BUFFER_SIZE * npartitions;
 
     /*
-     * Don't set the limit below 3/4 of work_mem. In that case, we are at the
+     * Don't set the limit below 3/4 of hash_mem. In that case, we are at the
      * minimum number of partitions, so we aren't going to dramatically exceed
      * work mem anyway.
      */
-    if (work_mem * 1024L > 4 * partition_mem)
-        *mem_limit = work_mem * 1024L - partition_mem;
+    if (hash_mem * 1024L > 4 * partition_mem)
+        *mem_limit = hash_mem * 1024L - partition_mem;
     else
-        *mem_limit = work_mem * 1024L * 0.75;
+        *mem_limit = hash_mem * 1024L * 0.75;
 
     if (*mem_limit > hashentrysize)
         *ngroups_limit = *mem_limit / hashentrysize;
@@ -1989,19 +1990,20 @@ hash_choose_num_partitions(double input_groups, double hashentrysize,
     int         partition_limit;
     int         npartitions;
     int         partition_bits;
+    int         hash_mem = get_hash_mem();
 
     /*
      * Avoid creating so many partitions that the memory requirements of the
-     * open partition files are greater than 1/4 of work_mem.
+     * open partition files are greater than 1/4 of hash_mem.
      */
     partition_limit =
-        (work_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
+        (hash_mem * 1024L * 0.25 - HASHAGG_READ_BUFFER_SIZE) /
         HASHAGG_WRITE_BUFFER_SIZE;
 
     mem_wanted = HASHAGG_PARTITION_FACTOR * input_groups * hashentrysize;
 
     /* make enough partitions so that each one is likely to fit in memory */
-    npartitions = 1 + (mem_wanted / (work_mem * 1024L));
+    npartitions = 1 + (mem_wanted / (hash_mem * 1024L));
 
     if (npartitions > partition_limit)
         npartitions = partition_limit;
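As a worked example of the hash_agg_set_limits() branches above, suppose hash_mem works out to 8192 KB, each group is estimated at roughly 200 bytes, and the planner expects 100,000 groups.  The sketch below follows the same two branches as the patched code, but treats the per-partition buffer overhead (partition_mem) as a given, hypothetical input rather than recomputing it via hash_choose_num_partitions():

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified sketch of the limit-setting logic shown above.  partition_mem is
 * treated as a given, hypothetical input instead of being derived from the
 * partition count and buffer sizes.
 */
static void
set_limits_sketch(double hashentrysize, double input_groups, int hash_mem,
                  size_t partition_mem, size_t *mem_limit, uint64_t *ngroups_limit)
{
    /* if not expected to spill, use all of hash_mem (a KB value) */
    if (input_groups * hashentrysize < hash_mem * 1024.0)
    {
        *mem_limit = (size_t) hash_mem * 1024;
        *ngroups_limit = (uint64_t) (*mem_limit / hashentrysize);
        return;
    }

    /* otherwise leave room for partition buffers, but keep at least 3/4 of hash_mem */
    if ((size_t) hash_mem * 1024 > 4 * partition_mem)
        *mem_limit = (size_t) hash_mem * 1024 - partition_mem;
    else
        *mem_limit = (size_t) (hash_mem * 1024.0 * 0.75);

    *ngroups_limit = (uint64_t) (*mem_limit / hashentrysize);
}

int
main(void)
{
    size_t      mem_limit;
    uint64_t    ngroups_limit;

    /* 100,000 expected groups at ~200 bytes each do not fit in 8MB of hash_mem */
    set_limits_sketch(200.0, 100000.0, 8192, 256 * 1024, &mem_limit, &ngroups_limit);

    /* prints mem_limit = 8126464 bytes, ngroups_limit = 40632 groups */
    printf("mem_limit = %zu bytes, ngroups_limit = %llu groups\n",
           mem_limit, (unsigned long long) ngroups_limit);
    return 0;
}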

src/backend/executor/nodeHash.c

Lines changed: 61 additions & 19 deletions
@@ -39,6 +39,7 @@
 #include "port/atomics.h"
 #include "port/pg_bitutils.h"
 #include "utils/dynahash.h"
+#include "utils/guc.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/syscache.h"
@@ -506,7 +507,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
     hashtable->spaceAllowed = space_allowed;
     hashtable->spaceUsedSkew = 0;
     hashtable->spaceAllowedSkew =
-        hashtable->spaceAllowed * SKEW_WORK_MEM_PERCENT / 100;
+        hashtable->spaceAllowed * SKEW_HASH_MEM_PERCENT / 100;
     hashtable->chunks = NULL;
     hashtable->current_chunk = NULL;
     hashtable->parallel_state = state->parallel_state;
@@ -665,7 +666,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
 
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
-                        bool try_combined_work_mem,
+                        bool try_combined_hash_mem,
                         int parallel_workers,
                         size_t *space_allowed,
                         int *numbuckets,
@@ -682,6 +683,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
     int         nbatch = 1;
     int         nbuckets;
     double      dbuckets;
+    int         hash_mem = get_hash_mem();
 
     /* Force a plausible relation size if no info */
     if (ntuples <= 0.0)
@@ -698,16 +700,16 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
     inner_rel_bytes = ntuples * tupsize;
 
     /*
-     * Target in-memory hashtable size is work_mem kilobytes.
+     * Target in-memory hashtable size is hash_mem kilobytes.
      */
-    hash_table_bytes = work_mem * 1024L;
+    hash_table_bytes = hash_mem * 1024L;
 
     /*
-     * Parallel Hash tries to use the combined work_mem of all workers to
-     * avoid the need to batch.  If that won't work, it falls back to work_mem
+     * Parallel Hash tries to use the combined hash_mem of all workers to
+     * avoid the need to batch.  If that won't work, it falls back to hash_mem
      * per worker and tries to process batches in parallel.
      */
-    if (try_combined_work_mem)
+    if (try_combined_hash_mem)
         hash_table_bytes += hash_table_bytes * parallel_workers;
 
     *space_allowed = hash_table_bytes;
@@ -728,7 +730,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
      */
     if (useskew)
     {
-        skew_table_bytes = hash_table_bytes * SKEW_WORK_MEM_PERCENT / 100;
+        skew_table_bytes = hash_table_bytes * SKEW_HASH_MEM_PERCENT / 100;
 
         /*----------
          * Divisor is:
@@ -751,7 +753,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
     /*
      * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
      * memory is filled, assuming a single batch; but limit the value so that
-     * the pointer arrays we'll try to allocate do not exceed work_mem nor
+     * the pointer arrays we'll try to allocate do not exceed hash_mem nor
      * MaxAllocSize.
      *
     * Note that both nbuckets and nbatch must be powers of 2 to make
@@ -790,10 +792,10 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         long        bucket_size;
 
         /*
-         * If Parallel Hash with combined work_mem would still need multiple
-         * batches, we'll have to fall back to regular work_mem budget.
+         * If Parallel Hash with combined hash_mem would still need multiple
+         * batches, we'll have to fall back to regular hash_mem budget.
          */
-        if (try_combined_work_mem)
+        if (try_combined_hash_mem)
         {
             ExecChooseHashTableSize(ntuples, tupwidth, useskew,
                                     false, parallel_workers,
@@ -805,7 +807,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         }
 
         /*
-         * Estimate the number of buckets we'll want to have when work_mem is
+         * Estimate the number of buckets we'll want to have when hash_mem is
          * entirely full.  Each bucket will contain a bucket pointer plus
          * NTUP_PER_BUCKET tuples, whose projected size already includes
         * overhead for the hash code, pointer to the next tuple, etc.
@@ -820,8 +822,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
         /*
          * Buckets are simple pointers to hashjoin tuples, while tupsize
          * includes the pointer, hash code, and MinimalTupleData.  So buckets
-         * should never really exceed 25% of work_mem (even for
-         * NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
+         * should never really exceed 25% of hash_mem (even for
+         * NTUP_PER_BUCKET=1); except maybe for hash_mem values that are not
          * 2^N bytes, where we might get more because of doubling.  So let's
          * look for 50% here.
          */
@@ -1095,15 +1097,17 @@ ExecParallelHashIncreaseNumBatches(HashJoinTable hashtable)
                 /* Figure out how many batches to use. */
                 if (hashtable->nbatch == 1)
                 {
+                    int         hash_mem = get_hash_mem();
+
                     /*
                      * We are going from single-batch to multi-batch.  We need
                      * to switch from one large combined memory budget to the
-                     * regular work_mem budget.
+                     * regular hash_mem budget.
                      */
-                    pstate->space_allowed = work_mem * 1024L;
+                    pstate->space_allowed = hash_mem * 1024L;
 
                     /*
-                     * The combined work_mem of all participants wasn't
+                     * The combined hash_mem of all participants wasn't
                      * enough.  Therefore one batch per participant would be
                      * approximately equivalent and would probably also be
                      * insufficient.  So try two batches per participant,
@@ -2855,7 +2859,7 @@ ExecParallelHashTupleAlloc(HashJoinTable hashtable, size_t size,
 
     /*
     * Check if our space limit would be exceeded.  To avoid choking on
-     * very large tuples or very low work_mem setting, we'll always allow
+     * very large tuples or very low hash_mem setting, we'll always allow
      * each backend to allocate at least one chunk.
      */
     if (hashtable->batches[0].at_least_one_chunk &&
@@ -3366,3 +3370,41 @@ ExecParallelHashTuplePrealloc(HashJoinTable hashtable, int batchno, size_t size)
 
     return true;
 }
+
+/*
+ * Get a hash_mem value by multiplying the work_mem GUC's value by the
+ * hash_mem_multiplier GUC's value.
+ *
+ * Returns a work_mem style KB value that hash-based nodes (including but not
+ * limited to hash join) use in place of work_mem.  This is subject to the
+ * same restrictions as work_mem itself.  (There is no such thing as the
+ * hash_mem GUC, but it's convenient for our callers to pretend that there
+ * is.)
+ *
+ * Exported for use by the planner, as well as other hash-based executor
+ * nodes.  This is a rather random place for this, but there is no better
+ * place.
+ */
+int
+get_hash_mem(void)
+{
+    double      hash_mem;
+
+    Assert(hash_mem_multiplier >= 1.0);
+
+    hash_mem = (double) work_mem * hash_mem_multiplier;
+
+    /*
+     * guc.c enforces a MAX_KILOBYTES limitation on work_mem in order to
+     * support the assumption that raw derived byte values can be stored in
+     * 'long' variables.  The returned hash_mem value must also meet this
+     * assumption.
+     *
+     * We clamp the final value rather than throw an error because it should
+     * be possible to set work_mem and hash_mem_multiplier independently.
+     */
+    if (hash_mem < MAX_KILOBYTES)
+        return (int) hash_mem;
+
+    return MAX_KILOBYTES;
+}
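One practical consequence of the ExecChooseHashTableSize() changes above is that Parallel Hash sizes its initial single-batch attempt against the combined hash_mem of the leader and all workers, falling back to a per-participant hash_mem budget only if that still is not enough.  A small sketch of the combined-budget step, with the worker count and GUC values chosen arbitrarily for illustration:

#include <stdio.h>

/* Hypothetical settings: work_mem = 4MB, hash_mem_multiplier = 2.0 */
static long
hash_mem_kb(void)
{
    return (long) (4096 * 2.0);
}

int
main(void)
{
    int     parallel_workers = 3;
    long    hash_table_bytes = hash_mem_kb() * 1024L;

    /*
     * Mirrors the try_combined_hash_mem path: each worker contributes one
     * additional hash_mem-sized share to the single shared hash table.
     */
    hash_table_bytes += hash_table_bytes * parallel_workers;

    /* 8MB * (1 + 3) = 32MB available before falling back to batching */
    printf("combined budget = %ld bytes\n", hash_table_bytes);
    return 0;
}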
