
Commit 8cce08f

Change NTUP_PER_BUCKET to 1 to improve hash join lookup speed.
Since this makes the bucket headers use ~10x as much memory, properly account for that memory when we figure out whether everything fits in work_mem. This might result in some cases that previously used only a single batch getting split into multiple batches, but it's unclear as yet whether we need defenses against that case, and if so, what the shape of those defenses should be.

It's worth noting that even in these edge cases, users should still be no worse off than they would have been last week, because commit 45f6240 saved a big pile of memory on exactly the same workloads.

Tomas Vondra, reviewed and somewhat revised by me.
1 parent 4ad2a54 commit 8cce08f
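
To make the accounting described above concrete, here is a minimal standalone C sketch, not the committed code, of the sizing arithmetic the patched ExecChooseHashTableSize performs. The inputs (100,000 inner tuples of 40 bytes, work_mem = 4MB) are illustrative assumptions, my_log2_sketch is a stand-in for the backend's my_log2, and the max_pointers clamps, the 1024-bucket floor, and the skew-table carve-out are simplified away for brevity.

    #include <stdio.h>
    #include <math.h>

    #define NTUP_PER_BUCKET 1               /* the new value; was 10 */

    /* smallest i such that 2^i >= num, like the backend's my_log2 */
    static int
    my_log2_sketch(long num)
    {
        int     i;

        for (i = 0; (1L << i) < num; i++)
            ;
        return i;
    }

    int
    main(void)
    {
        double  ntuples = 100000.0;         /* assumed inner row count */
        int     tupsize = 40;               /* assumed bytes per stored tuple */
        long    hash_table_bytes = 4096 * 1024L;    /* assumed work_mem = 4MB */
        double  inner_rel_bytes = ntuples * tupsize;
        double  dbuckets;
        long    bucket_bytes;
        int     nbuckets;
        int     nbatch = 1;

        /* size nbuckets for a single batch, rounded up to a power of 2 */
        dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
        nbuckets = 1 << my_log2_sketch((long) dbuckets);
        bucket_bytes = sizeof(void *) * (long) nbuckets;

        /* bucket headers now count against work_mem alongside the tuples */
        if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
        {
            long    bucket_size = tupsize * NTUP_PER_BUCKET + sizeof(void *);
            double  dbatch;

            /* refit nbuckets to a full work_mem, headers included */
            nbuckets = 1 << my_log2_sketch(hash_table_bytes / bucket_size);
            bucket_bytes = (long) nbuckets * sizeof(void *);

            /* batches must cover the tuples in the space left after headers */
            dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
            nbatch = 2;
            while (nbatch < (int) dbatch)
                nbatch <<= 1;
        }

        printf("nbuckets = %d, bucket_bytes = %ld, nbatch = %d\n",
               nbuckets, bucket_bytes, nbatch);
        return 0;
    }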

File tree

1 file changed (+41, -34)


src/backend/executor/nodeHash.c

@@ -386,7 +386,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
  */
 
 /* Target bucket loading (tuples per bucket) */
-#define NTUP_PER_BUCKET			10
+#define NTUP_PER_BUCKET			1
 
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
@@ -396,12 +396,13 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 {
 	int			tupsize;
 	double		inner_rel_bytes;
+	long		bucket_bytes;
 	long		hash_table_bytes;
 	long		skew_table_bytes;
 	long		max_pointers;
-	int			nbatch;
+	int			nbatch = 1;
 	int			nbuckets;
-	int			i;
+	double		dbuckets;
 
 	/* Force a plausible relation size if no info */
 	if (ntuples <= 0.0)
@@ -460,56 +461,61 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 
 	/*
 	 * Set nbuckets to achieve an average bucket load of NTUP_PER_BUCKET when
-	 * memory is filled.  Set nbatch to the smallest power of 2 that appears
-	 * sufficient.  The Min() steps limit the results so that the pointer
-	 * arrays we'll try to allocate do not exceed work_mem.
+	 * memory is filled, assuming a single batch.  The Min() step limits the
+	 * results so that the pointer arrays we'll try to allocate do not exceed
+	 * work_mem.
 	 */
 	max_pointers = (work_mem * 1024L) / sizeof(void *);
 	/* also ensure we avoid integer overflow in nbatch and nbuckets */
 	max_pointers = Min(max_pointers, INT_MAX / 2);
+	dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
+	dbuckets = Min(dbuckets, max_pointers);
+	nbuckets = Max((int) dbuckets, 1024);
+	nbuckets = 1 << my_log2(nbuckets);
+	bucket_bytes = sizeof(HashJoinTuple) * nbuckets;
 
-	if (inner_rel_bytes > hash_table_bytes)
+	/*
+	 * If there's not enough space to store the projected number of tuples
+	 * and the required bucket headers, we will need multiple batches.
+	 */
+	if (inner_rel_bytes + bucket_bytes > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
 		long		lbuckets;
 		double		dbatch;
 		int			minbatch;
+		long		bucket_size;
 
-		lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET;
+		/*
+		 * Estimate the number of buckets we'll want to have when work_mem
+		 * is entirely full.  Each bucket will contain a bucket pointer plus
+		 * NTUP_PER_BUCKET tuples, whose projected size already includes
+		 * overhead for the hash code, pointer to the next tuple, etc.
+		 */
+		bucket_size = (tupsize * NTUP_PER_BUCKET + sizeof(HashJoinTuple));
+		lbuckets = 1 << my_log2(hash_table_bytes / bucket_size);
 		lbuckets = Min(lbuckets, max_pointers);
 		nbuckets = (int) lbuckets;
+		bucket_bytes = nbuckets * sizeof(HashJoinTuple);
+
+		/*
+		 * Buckets are simple pointers to hashjoin tuples, while tupsize
+		 * includes the pointer, hash code, and MinimalTupleData.  So buckets
+		 * should never really exceed 25% of work_mem (even for
+		 * NTUP_PER_BUCKET=1); except maybe for work_mem values that are not
+		 * 2^N bytes, where we might get more because of doubling.  So let's
+		 * look for 50% here.
+		 */
+		Assert(bucket_bytes <= hash_table_bytes / 2);
 
-		dbatch = ceil(inner_rel_bytes / hash_table_bytes);
+		/* Calculate required number of batches. */
+		dbatch = ceil(inner_rel_bytes / (hash_table_bytes - bucket_bytes));
 		dbatch = Min(dbatch, max_pointers);
 		minbatch = (int) dbatch;
 		nbatch = 2;
 		while (nbatch < minbatch)
 			nbatch <<= 1;
 	}
-	else
-	{
-		/* We expect the hashtable to fit in memory */
-		double		dbuckets;
-
-		dbuckets = ceil(ntuples / NTUP_PER_BUCKET);
-		dbuckets = Min(dbuckets, max_pointers);
-		nbuckets = (int) dbuckets;
-
-		nbatch = 1;
-	}
-
-	/*
-	 * Both nbuckets and nbatch must be powers of 2 to make
-	 * ExecHashGetBucketAndBatch fast.  We already fixed nbatch; now inflate
-	 * nbuckets to the next larger power of 2.  We also force nbuckets to not
-	 * be real small, by starting the search at 2^10.  (Note: above we made
-	 * sure that nbuckets is not more than INT_MAX / 2, so this loop cannot
-	 * overflow, nor can the final shift to recalculate nbuckets.)
-	 */
-	i = 10;
-	while ((1 << i) < nbuckets)
-		i++;
-	nbuckets = (1 << i);
 
 	*numbuckets = nbuckets;
 	*numbatches = nbatch;
@@ -754,7 +760,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
 			hashtable->spacePeak = hashtable->spaceUsed;
-		if (hashtable->spaceUsed > hashtable->spaceAllowed)
+		if (hashtable->spaceUsed + hashtable->nbuckets * sizeof(HashJoinTuple)
+			> hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
 	}
 	else

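With the assumed inputs, the sketch above prints nbuckets = 131072, bucket_bytes = 1048576, nbatch = 2: roughly 1MB of bucket headers pushes a build side that previously fit in one batch (4,000,000 bytes of tuples against a 4MB work_mem) into two batches, which is exactly the single-batch-to-multi-batch edge case the commit message flags. The last hunk applies the same headroom logic at insert time: the old test let spaceUsed alone reach spaceAllowed, while the new one charges nbuckets * sizeof(HashJoinTuple) up front, so ExecHashIncreaseNumBatches fires before the bucket array and the tuples together can overrun work_mem.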
0 commit comments
