Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit b9ed856

Browse files
committed
Revert "Fix race in Parallel Hash Join batch cleanup."
This reverts commit 8fa2478. Discussion: https://postgr.es/m/CA%2BhUKGJmcqAE3MZeDCLLXa62cWM0AJbKmp2JrJYaJ86bz36LFA%40mail.gmail.com
1 parent 8fa2478 commit b9ed856

File tree

3 files changed

+32
-58
lines changed

3 files changed

+32
-58
lines changed

src/backend/executor/nodeHash.c

Lines changed: 15 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -333,21 +333,14 @@ MultiExecParallelHash(HashState *node)
333333
hashtable->nbuckets = pstate->nbuckets;
334334
hashtable->log2_nbuckets = my_log2(hashtable->nbuckets);
335335
hashtable->totalTuples = pstate->total_tuples;
336-
337-
/*
338-
* Unless we're completely done and the batch state has been freed, make
339-
* sure we have accessors.
340-
*/
341-
if (BarrierPhase(build_barrier) < PHJ_BUILD_DONE)
342-
ExecParallelHashEnsureBatchAccessors(hashtable);
336+
ExecParallelHashEnsureBatchAccessors(hashtable);
343337

344338
/*
345339
* The next synchronization point is in ExecHashJoin's HJ_BUILD_HASHTABLE
346-
* case, which will bring the build phase to PHJ_BUILD_RUNNING (if it isn't
340+
* case, which will bring the build phase to PHJ_BUILD_DONE (if it isn't
347341
* there already).
348342
*/
349343
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
350-
BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
351344
BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
352345
}
353346

@@ -631,7 +624,7 @@ ExecHashTableCreate(HashState *state, List *hashOperators, List *hashCollations,
631624
/*
632625
* The next Parallel Hash synchronization point is in
633626
* MultiExecParallelHash(), which will progress it all the way to
634-
* PHJ_BUILD_RUNNING. The caller must not return control from this
627+
* PHJ_BUILD_DONE. The caller must not return control from this
635628
* executor node between now and then.
636629
*/
637630
}
@@ -3026,11 +3019,14 @@ ExecParallelHashEnsureBatchAccessors(HashJoinTable hashtable)
30263019
}
30273020

30283021
/*
3029-
* We should never see a state where the batch-tracking array is freed,
3030-
* because we should have given up sooner if we join when the build barrier
3031-
* has reached the PHJ_BUILD_DONE phase.
3022+
* It's possible for a backend to start up very late so that the whole
3023+
* join is finished and the shm state for tracking batches has already
3024+
* been freed by ExecHashTableDetach(). In that case we'll just leave
3025+
* hashtable->batches as NULL so that ExecParallelHashJoinNewBatch() gives
3026+
* up early.
30323027
*/
3033-
Assert(DsaPointerIsValid(pstate->batches));
3028+
if (!DsaPointerIsValid(pstate->batches))
3029+
return;
30343030

30353031
/* Use hash join memory context. */
30363032
oldcxt = MemoryContextSwitchTo(hashtable->hashCxt);
@@ -3150,17 +3146,9 @@ ExecHashTableDetachBatch(HashJoinTable hashtable)
31503146
void
31513147
ExecHashTableDetach(HashJoinTable hashtable)
31523148
{
3153-
ParallelHashJoinState *pstate = hashtable->parallel_state;
3154-
3155-
/*
3156-
* If we're involved in a parallel query, we must either have got all the
3157-
* way to PHJ_BUILD_RUNNING, or joined too late and be in PHJ_BUILD_DONE.
3158-
*/
3159-
Assert(!pstate ||
3160-
BarrierPhase(&pstate->build_barrier) >= PHJ_BUILD_RUNNING);
3161-
3162-
if (pstate && BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_RUNNING)
3149+
if (hashtable->parallel_state)
31633150
{
3151+
ParallelHashJoinState *pstate = hashtable->parallel_state;
31643152
int i;
31653153

31663154
/* Make sure any temporary files are closed. */
@@ -3176,22 +3164,17 @@ ExecHashTableDetach(HashJoinTable hashtable)
31763164
}
31773165

31783166
/* If we're last to detach, clean up shared memory. */
3179-
if (BarrierArriveAndDetach(&pstate->build_barrier))
3167+
if (BarrierDetach(&pstate->build_barrier))
31803168
{
3181-
/*
3182-
* Late joining processes will see this state and give up
3183-
* immediately.
3184-
*/
3185-
Assert(BarrierPhase(&pstate->build_barrier) == PHJ_BUILD_DONE);
3186-
31873169
if (DsaPointerIsValid(pstate->batches))
31883170
{
31893171
dsa_free(hashtable->area, pstate->batches);
31903172
pstate->batches = InvalidDsaPointer;
31913173
}
31923174
}
3175+
3176+
hashtable->parallel_state = NULL;
31933177
}
3194-
hashtable->parallel_state = NULL;
31953178
}
31963179

31973180
/*

src/backend/executor/nodeHashjoin.c

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,7 @@
4545
* PHJ_BUILD_ALLOCATING -- one sets up the batches and table 0
4646
* PHJ_BUILD_HASHING_INNER -- all hash the inner rel
4747
* PHJ_BUILD_HASHING_OUTER -- (multi-batch only) all hash the outer
48-
* PHJ_BUILD_RUNNING -- building done, probing can begin
49-
* PHJ_BUILD_DONE -- all work complete, one frees batches
48+
* PHJ_BUILD_DONE -- building done, probing can begin
5049
*
5150
* While in the phase PHJ_BUILD_HASHING_INNER a separate pair of barriers may
5251
* be used repeatedly as required to coordinate expansions in the number of
@@ -74,7 +73,7 @@
7473
* batches whenever it encounters them while scanning and probing, which it
7574
* can do because it processes batches in serial order.
7675
*
77-
* Once PHJ_BUILD_RUNNING is reached, backends then split up and process
76+
* Once PHJ_BUILD_DONE is reached, backends then split up and process
7877
* different batches, or gang up and work together on probing batches if there
7978
* aren't enough to go around. For each batch there is a separate barrier
8079
* with the following phases:
@@ -96,16 +95,11 @@
9695
*
9796
* To avoid deadlocks, we never wait for any barrier unless it is known that
9897
* all other backends attached to it are actively executing the node or have
99-
* finished. Practically, that means that we never emit a tuple while attached
100-
* to a barrier, unless the barrier has reached a phase that means that no
101-
* process will wait on it again. We emit tuples while attached to the build
102-
* barrier in phase PHJ_BUILD_RUNNING, and to a per-batch barrier in phase
103-
* PHJ_BATCH_PROBING. These are advanced to PHJ_BUILD_DONE and PHJ_BATCH_DONE
104-
* respectively without waiting, using BarrierArriveAndDetach(). The last to
105-
* detach receives a different return value so that it knows that it's safe to
106-
* clean up. Any straggler process that attaches after that phase is reached
107-
* will see that it's too late to participate or access the relevant shared
108-
* memory objects.
98+
* already arrived. Practically, that means that we never return a tuple
99+
* while attached to a barrier, unless the barrier has reached its final
100+
* state. In the slightly special case of the per-batch barrier, we return
101+
* tuples while in PHJ_BATCH_PROBING phase, but that's OK because we use
102+
* BarrierArriveAndDetach() to advance it to PHJ_BATCH_DONE without waiting.
109103
*
110104
*-------------------------------------------------------------------------
111105
*/
@@ -323,7 +317,6 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
323317

324318
build_barrier = &parallel_state->build_barrier;
325319
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER ||
326-
BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING ||
327320
BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
328321
if (BarrierPhase(build_barrier) == PHJ_BUILD_HASHING_OUTER)
329322
{
@@ -336,18 +329,9 @@ ExecHashJoinImpl(PlanState *pstate, bool parallel)
336329
BarrierArriveAndWait(build_barrier,
337330
WAIT_EVENT_HASH_BUILD_HASHING_OUTER);
338331
}
339-
else if (BarrierPhase(build_barrier) == PHJ_BUILD_DONE)
340-
{
341-
/*
342-
* If we attached so late that the job is finished and
343-
* the batch state has been freed, we can return
344-
* immediately.
345-
*/
346-
return NULL;
347-
}
332+
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_DONE);
348333

349334
/* Each backend should now select a batch to work on. */
350-
Assert(BarrierPhase(build_barrier) == PHJ_BUILD_RUNNING);
351335
hashtable->curbatch = -1;
352336
node->hj_JoinState = HJ_NEED_NEW_BATCH;
353337

@@ -1106,6 +1090,14 @@ ExecParallelHashJoinNewBatch(HashJoinState *hjstate)
11061090
int start_batchno;
11071091
int batchno;
11081092

1093+
/*
1094+
* If we started up so late that the batch tracking array has been freed
1095+
* already by ExecHashTableDetach(), then we are finished. See also
1096+
* ExecParallelHashEnsureBatchAccessors().
1097+
*/
1098+
if (hashtable->batches == NULL)
1099+
return false;
1100+
11091101
/*
11101102
* If we were already attached to a batch, remember not to bother checking
11111103
* it again, and detach from it (possibly freeing the hash table if we are

src/include/executor/hashjoin.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,7 @@ typedef struct ParallelHashJoinState
258258
#define PHJ_BUILD_ALLOCATING 1
259259
#define PHJ_BUILD_HASHING_INNER 2
260260
#define PHJ_BUILD_HASHING_OUTER 3
261-
#define PHJ_BUILD_RUNNING 4
262-
#define PHJ_BUILD_DONE 5
261+
#define PHJ_BUILD_DONE 4
263262

264263
/* The phases for probing each batch, used by for batch_barrier. */
265264
#define PHJ_BATCH_ELECTING 0

0 commit comments

Comments
 (0)