Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 30d7ae3

Browse files
committed
Increase number of hash join buckets for underestimate.
If we expect batching at the very beginning, we size nbuckets for "full work_mem" (see how many tuples we can get into work_mem, while not breaking NTUP_PER_BUCKET threshold). If we expect to be fine without batching, we start with the 'right' nbuckets and track the optimal nbuckets as we go (without actually resizing the hash table). Once we hit work_mem (considering the optimal nbuckets value), we keep the value. At the end of the first batch, we check whether (nbuckets != nbuckets_optimal) and resize the hash table if needed. Also, we keep this value for all batches (it's OK because it assumes full work_mem, and it makes the batchno evaluation trivial). So the resize happens only once. There could be cases where it would improve performance to allow the NTUP_PER_BUCKET threshold to be exceeded to keep everything in one batch rather than spilling to a second batch, but attempts to generate such a case have so far been unsuccessful; that issue may be addressed with a follow-on patch after further investigation. Tomas Vondra, with minor format and comment cleanup by me. Reviewed by Robert Haas, Heikki Linnakangas, and Kevin Grittner.
1 parent 494affb commit 30d7ae3

File tree

3 files changed

+141
-6
lines changed

3 files changed

+141
-6
lines changed

src/backend/commands/explain.c

+7-4
Original file line numberDiff line numberDiff line change
@@ -1901,18 +1901,21 @@ show_hash_info(HashState *hashstate, ExplainState *es)
19011901
if (es->format != EXPLAIN_FORMAT_TEXT)
19021902
{
19031903
ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
1904+
ExplainPropertyLong("Original Hash Buckets",
1905+
hashtable->nbuckets_original, es);
19041906
ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
19051907
ExplainPropertyLong("Original Hash Batches",
19061908
hashtable->nbatch_original, es);
19071909
ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
19081910
}
1909-
else if (hashtable->nbatch_original != hashtable->nbatch)
1911+
else if ((hashtable->nbatch_original != hashtable->nbatch) ||
1912+
(hashtable->nbuckets_original != hashtable->nbuckets))
19101913
{
19111914
appendStringInfoSpaces(es->str, es->indent * 2);
19121915
appendStringInfo(es->str,
1913-
"Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n",
1914-
hashtable->nbuckets, hashtable->nbatch,
1915-
hashtable->nbatch_original, spacePeakKb);
1916+
"Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n",
1917+
hashtable->nbuckets, hashtable->nbuckets_original,
1918+
hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
19161919
}
19171920
else
19181921
{

src/backend/executor/nodeHash.c

+129-2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939

4040

4141
static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
42+
static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
4243
static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
4344
int mcvsToUse);
4445
static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -117,6 +118,7 @@ MultiExecHash(HashState *node)
117118
/* It's a skew tuple, so put it into that hash table */
118119
ExecHashSkewTableInsert(hashtable, slot, hashvalue,
119120
bucketNumber);
121+
hashtable->skewTuples += 1;
120122
}
121123
else
122124
{
@@ -127,6 +129,25 @@ MultiExecHash(HashState *node)
127129
}
128130
}
129131

132+
/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
133+
if (hashtable->nbuckets != hashtable->nbuckets_optimal)
134+
{
135+
/* We never decrease the number of buckets. */
136+
Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
137+
138+
#ifdef HJDEBUG
139+
printf("Increasing nbuckets %d => %d\n",
140+
hashtable->nbuckets, hashtable->nbuckets_optimal);
141+
#endif
142+
143+
ExecHashIncreaseNumBuckets(hashtable);
144+
}
145+
146+
/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
147+
hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
148+
if (hashtable->spaceUsed > hashtable->spacePeak)
149+
hashtable->spacePeak = hashtable->spaceUsed;
150+
130151
/* must provide our own instrumentation support */
131152
if (node->ps.instrument)
132153
InstrStopNode(node->ps.instrument, hashtable->totalTuples);
@@ -272,7 +293,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
272293
*/
273294
hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
274295
hashtable->nbuckets = nbuckets;
296+
hashtable->nbuckets_original = nbuckets;
297+
hashtable->nbuckets_optimal = nbuckets;
275298
hashtable->log2_nbuckets = log2_nbuckets;
299+
hashtable->log2_nbuckets_optimal = log2_nbuckets;
276300
hashtable->buckets = NULL;
277301
hashtable->keepNulls = keepNulls;
278302
hashtable->skewEnabled = false;
@@ -286,6 +310,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
286310
hashtable->nbatch_outstart = nbatch;
287311
hashtable->growEnabled = true;
288312
hashtable->totalTuples = 0;
313+
hashtable->skewTuples = 0;
289314
hashtable->innerBatchFile = NULL;
290315
hashtable->outerBatchFile = NULL;
291316
hashtable->spaceUsed = 0;
@@ -620,6 +645,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
620645
*/
621646
ninmemory = nfreed = 0;
622647

648+
/* If we know we need to resize nbuckets, we can do it while rebatching. */
649+
if (hashtable->nbuckets_optimal != hashtable->nbuckets)
650+
{
651+
/* we never decrease the number of buckets */
652+
Assert(hashtable->nbuckets_optimal > hashtable->nbuckets);
653+
654+
hashtable->nbuckets = hashtable->nbuckets_optimal;
655+
hashtable->log2_nbuckets = hashtable->log2_nbuckets_optimal;
656+
657+
hashtable->buckets = repalloc(hashtable->buckets,
658+
sizeof(HashJoinTuple) * hashtable->nbuckets);
659+
}
660+
623661
/*
624662
* We will scan through the chunks directly, so that we can reset the
625663
* buckets now and not have to keep track which tuples in the buckets have
@@ -703,6 +741,78 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
703741
}
704742
}
705743

744+
/*
745+
* ExecHashIncreaseNumBuckets
746+
* increase the original number of buckets in order to reduce
747+
* number of tuples per bucket
748+
*/
749+
static void
750+
ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
751+
{
752+
HashMemoryChunk chunk;
753+
754+
/* do nothing if not an increase (it's called increase for a reason) */
755+
if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
756+
return;
757+
758+
/*
759+
* We already know the optimal number of buckets, so let's just
760+
* compute the log2_nbuckets for it.
761+
*/
762+
hashtable->nbuckets = hashtable->nbuckets_optimal;
763+
hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
764+
765+
Assert(hashtable->nbuckets > 1);
766+
Assert(hashtable->nbuckets <= (INT_MAX / 2));
767+
Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
768+
769+
#ifdef HJDEBUG
770+
printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
771+
#endif
772+
773+
/*
774+
* Just reallocate the proper number of buckets - we don't need to
775+
* walk through them - we can walk the dense-allocated chunks
776+
* (just like in ExecHashIncreaseNumBatches, but without all the
777+
* copying into new chunks)
778+
*/
779+
hashtable->buckets =
780+
(HashJoinTuple *) repalloc(hashtable->buckets,
781+
hashtable->nbuckets * sizeof(HashJoinTuple));
782+
783+
memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
784+
785+
/* scan through all tuples in all chunks to rebuild the hash table */
786+
for (chunk = hashtable->chunks; chunk != NULL; chunk = chunk->next)
787+
{
788+
/* process all tuples stored in this chunk */
789+
size_t idx = 0;
790+
while (idx < chunk->used)
791+
{
792+
HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
793+
int bucketno;
794+
int batchno;
795+
796+
ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
797+
&bucketno, &batchno);
798+
799+
/* add the tuple to the proper bucket */
800+
hashTuple->next = hashtable->buckets[bucketno];
801+
hashtable->buckets[bucketno] = hashTuple;
802+
803+
/* advance index past the tuple */
804+
idx += MAXALIGN(HJTUPLE_OVERHEAD +
805+
HJTUPLE_MINTUPLE(hashTuple)->t_len);
806+
}
807+
}
808+
809+
#ifdef HJDEBUG
810+
printf("Nbuckets increased to %d, average items per bucket %.1f\n",
811+
hashtable->nbuckets, batchTuples / hashtable->nbuckets);
812+
#endif
813+
}
814+
815+
706816
/*
707817
* ExecHashTableInsert
708818
* insert a tuple into the hash table depending on the hash value
@@ -736,6 +846,7 @@ ExecHashTableInsert(HashJoinTable hashtable,
736846
*/
737847
HashJoinTuple hashTuple;
738848
int hashTupleSize;
849+
double ntuples = (hashtable->totalTuples - hashtable->skewTuples);
739850

740851
/* Create the HashJoinTuple */
741852
hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
@@ -756,11 +867,24 @@ ExecHashTableInsert(HashJoinTable hashtable,
756867
hashTuple->next = hashtable->buckets[bucketno];
757868
hashtable->buckets[bucketno] = hashTuple;
758869

870+
/*
871+
* Increase the (optimal) number of buckets if we just exceeded the
872+
* NTUP_PER_BUCKET threshold, but only when there's still a single batch.
873+
*/
874+
if ((hashtable->nbatch == 1) &&
875+
(hashtable->nbuckets_optimal <= INT_MAX/2) && /* overflow protection */
876+
(ntuples >= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET)))
877+
{
878+
hashtable->nbuckets_optimal *= 2;
879+
hashtable->log2_nbuckets_optimal += 1;
880+
}
881+
759882
/* Account for space used, and back off if we've used too much */
760883
hashtable->spaceUsed += hashTupleSize;
761884
if (hashtable->spaceUsed > hashtable->spacePeak)
762885
hashtable->spacePeak = hashtable->spaceUsed;
763-
if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
886+
if (hashtable->spaceUsed +
887+
hashtable->nbuckets_optimal * sizeof(HashJoinTuple)
764888
> hashtable->spaceAllowed)
765889
ExecHashIncreaseNumBatches(hashtable);
766890
}
@@ -885,7 +1009,10 @@ ExecHashGetHashValue(HashJoinTable hashtable,
8851009
* functions are good about randomizing all their output bits, else we are
8861010
* likely to have very skewed bucket or batch occupancy.)
8871011
*
888-
* nbuckets doesn't change over the course of the join.
1012+
* nbuckets and log2_nbuckets may change while nbatch == 1 because of dynamic
1013+
* bucket count growth. Once we start batching, the value is fixed and does
1014+
* not change over the course of the join (making it possible to compute batch
1015+
* number the way we do here).
8891016
*
8901017
* nbatch is always a power of 2; we increase it only by doubling it. This
8911018
* effectively adds one more bit to the top of the batchno.

src/include/executor/hashjoin.h

+5
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,10 @@ typedef struct HashJoinTableData
127127
int nbuckets; /* # buckets in the in-memory hash table */
128128
int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */
129129

130+
int nbuckets_original; /* # buckets when starting the first hash */
131+
int nbuckets_optimal; /* optimal # buckets (per batch) */
132+
int log2_nbuckets_optimal; /* same as log2_nbuckets optimal */
133+
130134
/* buckets[i] is head of list of tuples in i'th in-memory bucket */
131135
struct HashJoinTupleData **buckets;
132136
/* buckets array is per-batch storage, as are all the tuples */
@@ -148,6 +152,7 @@ typedef struct HashJoinTableData
148152
bool growEnabled; /* flag to shut off nbatch increases */
149153

150154
double totalTuples; /* # tuples obtained from inner plan */
155+
double skewTuples; /* # tuples inserted into skew buckets */
151156

152157
/*
153158
* These arrays are allocated for the life of the hash join, but only if

0 commit comments

Comments
 (0)