
Commit 969f9d0

Make EXPLAIN report maximum hashtable usage across multiple rescans.

Before discarding the old hash table in ExecReScanHashJoin, capture its statistics, ensuring that we report the maximum hashtable size across repeated rescans of the hash input relation. We can repurpose the existing code for reporting hashtable size in parallel workers to help with this, making the patch pretty small. This also ensures that if rescans happen within parallel workers, we get the correct maximums across all instances.

Konstantin Knizhnik and Tom Lane, per diagnosis by Thomas Munro of a trouble report from Alvaro Herrera.

Discussion: https://postgr.es/m/20200323165059.GA24950@alvherre.pgsql
1 parent 5c27bce commit 969f9d0
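For a sense of the user-visible effect (illustrative numbers only, not taken from the commit): EXPLAIN ANALYZE prints a line like

    Buckets: 4096 (originally 1024)  Batches: 2 (originally 1)  Memory Usage: 130kB

for each hash node. Before this fix, a hash table that was rebuilt on rescan reported only the last instance's numbers; with it, each field shows the maximum seen across all rescans (and, in parallel queries, across all participants).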

File tree

5 files changed: +87 -49 lines

src/backend/commands/explain.c (+17 -29)

@@ -2964,22 +2964,25 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 	HashInstrumentation hinstrument = {0};
 
 	/*
+	 * Collect stats from the local process, even when it's a parallel query.
 	 * In a parallel query, the leader process may or may not have run the
 	 * hash join, and even if it did it may not have built a hash table due to
 	 * timing (if it started late it might have seen no tuples in the outer
 	 * relation and skipped building the hash table). Therefore we have to be
 	 * prepared to get instrumentation data from all participants.
 	 */
-	if (hashstate->hashtable)
-		ExecHashGetInstrumentation(&hinstrument, hashstate->hashtable);
+	if (hashstate->hinstrument)
+		memcpy(&hinstrument, hashstate->hinstrument,
+			   sizeof(HashInstrumentation));
 
 	/*
 	 * Merge results from workers. In the parallel-oblivious case, the
 	 * results from all participants should be identical, except where
 	 * participants didn't run the join at all so have no data. In the
 	 * parallel-aware case, we need to consider all the results. Each worker
-	 * may have seen a different subset of batches and we want to find the
-	 * highest memory usage for any one batch across all batches.
+	 * may have seen a different subset of batches and we want to report the
+	 * highest memory usage across all batches. We take the maxima of other
+	 * values too, for the same reasons as in ExecHashAccumInstrumentation.
 	 */
 	if (hashstate->shared_info)
 	{

@@ -2990,31 +2993,16 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 		{
 			HashInstrumentation *worker_hi = &shared_info->hinstrument[i];
 
-			if (worker_hi->nbatch > 0)
-			{
-				/*
-				 * Every participant should agree on the buckets, so to be
-				 * sure we have a value we'll just overwrite each time.
-				 */
-				hinstrument.nbuckets = worker_hi->nbuckets;
-				hinstrument.nbuckets_original = worker_hi->nbuckets_original;
-
-				/*
-				 * Normally every participant should agree on the number of
-				 * batches too, but it's possible for a backend that started
-				 * late and missed the whole join not to have the final nbatch
-				 * number. So we'll take the largest number.
-				 */
-				hinstrument.nbatch = Max(hinstrument.nbatch, worker_hi->nbatch);
-				hinstrument.nbatch_original = worker_hi->nbatch_original;
-
-				/*
-				 * In a parallel-aware hash join, for now we report the
-				 * maximum peak memory reported by any worker.
-				 */
-				hinstrument.space_peak =
-					Max(hinstrument.space_peak, worker_hi->space_peak);
-			}
+			hinstrument.nbuckets = Max(hinstrument.nbuckets,
+									   worker_hi->nbuckets);
+			hinstrument.nbuckets_original = Max(hinstrument.nbuckets_original,
+												worker_hi->nbuckets_original);
+			hinstrument.nbatch = Max(hinstrument.nbatch,
+									 worker_hi->nbatch);
+			hinstrument.nbatch_original = Max(hinstrument.nbatch_original,
+											  worker_hi->nbatch_original);
+			hinstrument.space_peak = Max(hinstrument.space_peak,
+										 worker_hi->space_peak);
 		}
 	}
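Note on the simplified merge loop: the old worker_hi->nbatch > 0 guard becomes unnecessary once every field is merged with Max(). A participant that never ran the join leaves its entry all zeroes (the memset in ExecHashInitializeDSM guarantees this), and an all-zero entry can never raise any of the maxima.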

src/backend/executor/nodeHash.c (+44 -14)

@@ -2597,7 +2597,10 @@ ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt)
 	size = offsetof(SharedHashInfo, hinstrument) +
 		pcxt->nworkers * sizeof(HashInstrumentation);
 	node->shared_info = (SharedHashInfo *) shm_toc_allocate(pcxt->toc, size);
+
+	/* Each per-worker area must start out as zeroes. */
 	memset(node->shared_info, 0, size);
+
 	node->shared_info->num_workers = pcxt->nworkers;
 	shm_toc_insert(pcxt->toc, node->ps.plan->plan_node_id,
 				   node->shared_info);

@@ -2616,22 +2619,33 @@ ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt)
 	if (!node->ps.instrument)
 		return;
 
+	/*
+	 * Find our entry in the shared area, and set up a pointer to it so that
+	 * we'll accumulate stats there when shutting down or rebuilding the hash
+	 * table.
+	 */
 	shared_info = (SharedHashInfo *)
 		shm_toc_lookup(pwcxt->toc, node->ps.plan->plan_node_id, false);
 	node->hinstrument = &shared_info->hinstrument[ParallelWorkerNumber];
 }
 
 /*
- * Copy instrumentation data from this worker's hash table (if it built one)
- * to DSM memory so the leader can retrieve it. This must be done in an
- * ExecShutdownHash() rather than ExecEndHash() because the latter runs after
- * we've detached from the DSM segment.
+ * Collect EXPLAIN stats if needed, saving them into DSM memory if
+ * ExecHashInitializeWorker was called, or local storage if not. In the
+ * parallel case, this must be done in ExecShutdownHash() rather than
+ * ExecEndHash() because the latter runs after we've detached from the DSM
+ * segment.
 */
 void
 ExecShutdownHash(HashState *node)
 {
+	/* Allocate save space if EXPLAIN'ing and we didn't do so already */
+	if (node->ps.instrument && !node->hinstrument)
+		node->hinstrument = (HashInstrumentation *)
+			palloc0(sizeof(HashInstrumentation));
+	/* Now accumulate data for the current (final) hash table */
 	if (node->hinstrument && node->hashtable)
-		ExecHashGetInstrumentation(node->hinstrument, node->hashtable);
+		ExecHashAccumInstrumentation(node->hinstrument, node->hashtable);
 }
 
 /*

@@ -2655,18 +2669,34 @@ ExecHashRetrieveInstrumentation(HashState *node)
 }
 
 /*
- * Copy the instrumentation data from 'hashtable' into a HashInstrumentation
- * struct.
+ * Accumulate instrumentation data from 'hashtable' into an
+ * initially-zeroed HashInstrumentation struct.
+ *
+ * This is used to merge information across successive hash table instances
+ * within a single plan node. We take the maximum values of each interesting
+ * number. The largest nbuckets and largest nbatch values might have occurred
+ * in different instances, so there's some risk of confusion from reporting
+ * unrelated numbers; but there's a bigger risk of misdiagnosing a performance
+ * issue if we don't report the largest values. Similarly, we want to report
+ * the largest spacePeak regardless of whether it happened in the same
+ * instance as the largest nbuckets or nbatch. All the instances should have
+ * the same nbuckets_original and nbatch_original; but there's little value
+ * in depending on that here, so handle them the same way.
 */
 void
-ExecHashGetInstrumentation(HashInstrumentation *instrument,
-						   HashJoinTable hashtable)
+ExecHashAccumInstrumentation(HashInstrumentation *instrument,
+							 HashJoinTable hashtable)
 {
-	instrument->nbuckets = hashtable->nbuckets;
-	instrument->nbuckets_original = hashtable->nbuckets_original;
-	instrument->nbatch = hashtable->nbatch;
-	instrument->nbatch_original = hashtable->nbatch_original;
-	instrument->space_peak = hashtable->spacePeak;
+	instrument->nbuckets = Max(instrument->nbuckets,
+							   hashtable->nbuckets);
+	instrument->nbuckets_original = Max(instrument->nbuckets_original,
+										hashtable->nbuckets_original);
+	instrument->nbatch = Max(instrument->nbatch,
+							 hashtable->nbatch);
+	instrument->nbatch_original = Max(instrument->nbatch_original,
+									  hashtable->nbatch_original);
+	instrument->space_peak = Max(instrument->space_peak,
+								 hashtable->spacePeak);
 }
 
 /*
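To see the accumulation semantics in isolation, here is a minimal standalone sketch. It is not PostgreSQL code: the Demo* structs and the accum() helper are invented stand-ins, and Max() is redefined locally since the real macro lives in c.h. It accumulates stats from two successive hash table instances, as a rescan would, and ends up reporting the per-field maxima:

/*
 * Standalone sketch of the max-merge technique used by
 * ExecHashAccumInstrumentation. Compiles on its own with C99.
 */
#include <stdio.h>
#include <stddef.h>

#define Max(x, y) ((x) > (y) ? (x) : (y))

typedef struct
{
	int			nbuckets;
	int			nbatch;
	size_t		space_peak;
} DemoInstrumentation;

typedef struct
{
	int			nbuckets;
	int			nbatch;
	size_t		spacePeak;
} DemoHashTable;

/* Accumulate maxima, as the patched function does for each field */
static void
accum(DemoInstrumentation *instrument, const DemoHashTable *hashtable)
{
	instrument->nbuckets = Max(instrument->nbuckets, hashtable->nbuckets);
	instrument->nbatch = Max(instrument->nbatch, hashtable->nbatch);
	instrument->space_peak = Max(instrument->space_peak,
								 hashtable->spacePeak);
}

int
main(void)
{
	/* Two successive hash table instances, e.g. before and after a rescan */
	DemoHashTable first = {4096, 2, 130 * 1024};
	DemoHashTable second = {1024, 1, 9 * 1024};
	DemoInstrumentation stats = {0};	/* accumulator must start zeroed */

	accum(&stats, &first);		/* instance built on the first scan */
	accum(&stats, &second);		/* smaller instance built after rescan */

	/* Prints the maxima: buckets=4096 batches=2 peak=130kB */
	printf("buckets=%d batches=%d peak=%zukB\n",
		   stats.nbuckets, stats.nbatch, stats.space_peak / 1024);
	return 0;
}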

src/backend/executor/nodeHashjoin.c (+9 -1)

@@ -1338,8 +1338,16 @@ ExecReScanHashJoin(HashJoinState *node)
 		/* must destroy and rebuild hash table */
 		HashState  *hashNode = castNode(HashState, innerPlanState(node));
 
-		/* for safety, be sure to clear child plan node's pointer too */
 		Assert(hashNode->hashtable == node->hj_HashTable);
+		/* accumulate stats from old hash table, if wanted */
+		/* (this should match ExecShutdownHash) */
+		if (hashNode->ps.instrument && !hashNode->hinstrument)
+			hashNode->hinstrument = (HashInstrumentation *)
+				palloc0(sizeof(HashInstrumentation));
+		if (hashNode->hinstrument)
+			ExecHashAccumInstrumentation(hashNode->hinstrument,
+										 hashNode->hashtable);
+		/* for safety, be sure to clear child plan node's pointer too */
 		hashNode->hashtable = NULL;
 
 		ExecHashTableDestroy(node->hj_HashTable);
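This hunk is the heart of the fix: the stats are captured here, before ExecHashTableDestroy() frees the old table, so nothing is lost when the hash input relation is rescanned. The lazy palloc0() mirrors the one in ExecShutdownHash(), so whichever of the two runs first creates the zeroed accumulation area and the other simply reuses it.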

src/include/executor/nodeHash.h (+2 -2)

@@ -73,7 +73,7 @@ extern void ExecHashInitializeDSM(HashState *node, ParallelContext *pcxt);
 extern void ExecHashInitializeWorker(HashState *node, ParallelWorkerContext *pwcxt);
 extern void ExecHashRetrieveInstrumentation(HashState *node);
 extern void ExecShutdownHash(HashState *node);
-extern void ExecHashGetInstrumentation(HashInstrumentation *instrument,
-									   HashJoinTable hashtable);
+extern void ExecHashAccumInstrumentation(HashInstrumentation *instrument,
+										 HashJoinTable hashtable);
 
 #endif							/* NODEHASH_H */

src/include/nodes/execnodes.h (+15 -3)

@@ -2358,7 +2358,7 @@ typedef struct HashInstrumentation
 	int			nbuckets_original;	/* planned number of buckets */
 	int			nbatch;			/* number of batches at end of execution */
 	int			nbatch_original;	/* planned number of batches */
-	size_t		space_peak;		/* peak memory usage in bytes */
+	Size		space_peak;		/* peak memory usage in bytes */
 } HashInstrumentation;
 
 /* ----------------

@@ -2381,8 +2381,20 @@ typedef struct HashState
 	HashJoinTable hashtable;	/* hash table for the hashjoin */
 	List	   *hashkeys;		/* list of ExprState nodes */
 
-	SharedHashInfo *shared_info;	/* one entry per worker */
-	HashInstrumentation *hinstrument;	/* this worker's entry */
+	/*
+	 * In a parallelized hash join, the leader retains a pointer to the
+	 * shared-memory stats area in its shared_info field, and then copies the
+	 * shared-memory info back to local storage before DSM shutdown. The
+	 * shared_info field remains NULL in workers, or in non-parallel joins.
+	 */
+	SharedHashInfo *shared_info;
+
+	/*
+	 * If we are collecting hash stats, this points to an initially-zeroed
+	 * collection area, which could be either local storage or in shared
+	 * memory; either way it's for just one process.
+	 */
+	HashInstrumentation *hinstrument;
 
 	/* Parallel hash state. */
 	struct ParallelHashJoinState *parallel_state;
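A side note on the type change: Size is PostgreSQL's conventional typedef for size_t (declared in c.h), so switching space_peak from size_t to Size changes nothing at the binary level; it just brings the field in line with project style.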
