Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 36764b2

Browse files
michail-nikolaevCommitfest Bot
authored and
Commitfest Bot
committed
Support snapshot resets in parallel concurrent index builds
Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. The main obstacle to applying that technique to parallel builds was the requirement to wait until worker processes restore their initial snapshot from the leader. To address this, the following changes were applied: add infrastructure to track snapshot restoration in parallel workers; extend parallel scan initialization to support periodic snapshot resets; wait for parallel workers to restore their initial snapshots before proceeding with the scan; relax the limitation preventing parallel workers from calling GetLatestSnapshot.
1 parent c3fbf50 commit 36764b2

File tree

14 files changed

+225
-89
lines changed

14 files changed

+225
-89
lines changed

src/backend/access/brin/brin.c

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,6 @@ typedef struct BrinLeader
143143
*/
144144
BrinShared *brinshared;
145145
Sharedsort *sharedsort;
146-
Snapshot snapshot;
147146
WalUsage *walusage;
148147
BufferUsage *bufferusage;
149148
} BrinLeader;
@@ -231,7 +230,7 @@ static void brin_fill_empty_ranges(BrinBuildState *state,
231230
static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
232231
bool isconcurrent, int request);
233232
static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state);
234-
static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
233+
static Size _brin_parallel_estimate_shared(Relation heap);
235234
static double _brin_parallel_heapscan(BrinBuildState *state);
236235
static double _brin_parallel_merge(BrinBuildState *state);
237236
static void _brin_leader_participate_as_worker(BrinBuildState *buildstate,
@@ -1221,7 +1220,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
12211220
reltuples = _brin_parallel_merge(state);
12221221

12231222
_brin_end_parallel(state->bs_leader, state);
1224-
Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
12251223
}
12261224
else /* no parallel index build */
12271225
{
@@ -1254,7 +1252,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
12541252
brin_fill_empty_ranges(state,
12551253
state->bs_currRangeStart,
12561254
state->bs_maxRangeStart);
1257-
Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
12581255
}
12591256

12601257
/* release resources */
@@ -1269,6 +1266,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
12691266

12701267
result->heap_tuples = reltuples;
12711268
result->index_tuples = idxtuples;
1269+
Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xid));
12721270

12731271
return result;
12741272
}
@@ -2368,7 +2366,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
23682366
{
23692367
ParallelContext *pcxt;
23702368
int scantuplesortstates;
2371-
Snapshot snapshot;
23722369
Size estbrinshared;
23732370
Size estsort;
23742371
BrinShared *brinshared;
@@ -2399,25 +2396,25 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
23992396
* Prepare for scan of the base relation. In a normal index build, we use
24002397
* SnapshotAny because we must retrieve all tuples and do our own time
24012398
* qual checks (because we have to index RECENTLY_DEAD tuples). In a
2402-
* concurrent build, we take a regular MVCC snapshot and index whatever's
2403-
* live according to that.
2399+
* concurrent build, we take a regular MVCC snapshot and push it as active.
2400+
* Later we index whatever's live according to that snapshot while that
2401+
* snapshot is reset periodically.
24042402
*/
24052403
if (!isconcurrent)
24062404
{
24072405
Assert(ActiveSnapshotSet());
2408-
snapshot = SnapshotAny;
24092406
need_pop_active_snapshot = false;
24102407
}
24112408
else
24122409
{
2413-
snapshot = RegisterSnapshot(GetTransactionSnapshot());
2410+
Assert(!ActiveSnapshotSet());
24142411
PushActiveSnapshot(GetTransactionSnapshot());
24152412
}
24162413

24172414
/*
24182415
* Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace.
24192416
*/
2420-
estbrinshared = _brin_parallel_estimate_shared(heap, snapshot);
2417+
estbrinshared = _brin_parallel_estimate_shared(heap);
24212418
shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared);
24222419
estsort = tuplesort_estimate_shared(scantuplesortstates);
24232420
shm_toc_estimate_chunk(&pcxt->estimator, estsort);
@@ -2457,8 +2454,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
24572454
{
24582455
if (need_pop_active_snapshot)
24592456
PopActiveSnapshot();
2460-
if (IsMVCCSnapshot(snapshot))
2461-
UnregisterSnapshot(snapshot);
24622457
DestroyParallelContext(pcxt);
24632458
ExitParallelMode();
24642459
return;
@@ -2483,7 +2478,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
24832478

24842479
table_parallelscan_initialize(heap,
24852480
ParallelTableScanFromBrinShared(brinshared),
2486-
snapshot);
2481+
isconcurrent ? InvalidSnapshot : SnapshotAny,
2482+
isconcurrent);
24872483

24882484
/*
24892485
* Store shared tuplesort-private state, for which we reserved space.
@@ -2529,7 +2525,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
25292525
brinleader->nparticipanttuplesorts++;
25302526
brinleader->brinshared = brinshared;
25312527
brinleader->sharedsort = sharedsort;
2532-
brinleader->snapshot = snapshot;
25332528
brinleader->walusage = walusage;
25342529
brinleader->bufferusage = bufferusage;
25352530

@@ -2545,6 +2540,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
25452540
/* Save leader state now that it's clear build will be parallel */
25462541
buildstate->bs_leader = brinleader;
25472542

2543+
/*
2544+
* In case of concurrent build snapshots are going to be reset periodically.
2545+
* We need to wait until all workers imported initial snapshot.
2546+
*/
2547+
if (isconcurrent)
2548+
WaitForParallelWorkersToAttach(pcxt, true);
2549+
25482550
/* Join heap scan ourselves */
25492551
if (leaderparticipates)
25502552
_brin_leader_participate_as_worker(buildstate, heap, index);
@@ -2553,7 +2555,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
25532555
* Caller needs to wait for all launched workers when we return. Make
25542556
* sure that the failure-to-start case will not hang forever.
25552557
*/
2556-
WaitForParallelWorkersToAttach(pcxt);
2558+
if (!isconcurrent)
2559+
WaitForParallelWorkersToAttach(pcxt, false);
25572560
if (need_pop_active_snapshot)
25582561
PopActiveSnapshot();
25592562
}
@@ -2576,9 +2579,6 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state)
25762579
for (i = 0; i < brinleader->pcxt->nworkers_launched; i++)
25772580
InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]);
25782581

2579-
/* Free last reference to MVCC snapshot, if one was used */
2580-
if (IsMVCCSnapshot(brinleader->snapshot))
2581-
UnregisterSnapshot(brinleader->snapshot);
25822582
DestroyParallelContext(brinleader->pcxt);
25832583
ExitParallelMode();
25842584
}
@@ -2778,14 +2778,14 @@ _brin_parallel_merge(BrinBuildState *state)
27782778

27792779
/*
27802780
* Returns size of shared memory required to store state for a parallel
2781-
* brin index build based on the snapshot its parallel scan will use.
2781+
* brin index build.
27822782
*/
27832783
static Size
2784-
_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
2784+
_brin_parallel_estimate_shared(Relation heap)
27852785
{
27862786
/* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
27872787
return add_size(BUFFERALIGN(sizeof(BrinShared)),
2788-
table_parallelscan_estimate(heap, snapshot));
2788+
table_parallelscan_estimate(heap, InvalidSnapshot));
27892789
}
27902790

27912791
/*
@@ -2807,6 +2807,7 @@ _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Re
28072807
/* Perform work common to all participants */
28082808
_brin_parallel_scan_and_build(buildstate, brinleader->brinshared,
28092809
brinleader->sharedsort, heap, index, sortmem, true);
2810+
Assert(!brinleader->brinshared->isconcurrent || !TransactionIdIsValid(MyProc->xid));
28102811
}
28112812

28122813
/*
@@ -2947,6 +2948,13 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
29472948

29482949
_brin_parallel_scan_and_build(buildstate, brinshared, sharedsort,
29492950
heapRel, indexRel, sortmem, false);
2951+
if (brinshared->isconcurrent)
2952+
{
2953+
PopActiveSnapshot();
2954+
InvalidateCatalogSnapshot();
2955+
Assert(!TransactionIdIsValid(MyProc->xid));
2956+
PushActiveSnapshot(GetTransactionSnapshot());
2957+
}
29502958

29512959
/* Report WAL/buffer usage during parallel execution */
29522960
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);

src/backend/access/gin/gininsert.c

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,6 @@ typedef struct GinLeader
132132
*/
133133
GinBuildShared *ginshared;
134134
Sharedsort *sharedsort;
135-
Snapshot snapshot;
136135
WalUsage *walusage;
137136
BufferUsage *bufferusage;
138137
} GinLeader;
@@ -180,7 +179,7 @@ typedef struct
180179
static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
181180
bool isconcurrent, int request);
182181
static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state);
183-
static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot);
182+
static Size _gin_parallel_estimate_shared(Relation heap);
184183
static double _gin_parallel_heapscan(GinBuildState *state);
185184
static double _gin_parallel_merge(GinBuildState *state);
186185
static void _gin_leader_participate_as_worker(GinBuildState *buildstate,
@@ -717,7 +716,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
717716
reltuples = _gin_parallel_merge(state);
718717

719718
_gin_end_parallel(state->bs_leader, state);
720-
Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
721719
}
722720
else /* no parallel index build */
723721
{
@@ -741,7 +739,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
741739
list, nlist, &buildstate.buildStats);
742740
}
743741
MemoryContextSwitchTo(oldCtx);
744-
Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
745742
}
746743

747744
MemoryContextDelete(buildstate.funcCtx);
@@ -771,6 +768,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo)
771768

772769
result->heap_tuples = reltuples;
773770
result->index_tuples = buildstate.indtuples;
771+
Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
774772

775773
return result;
776774
}
@@ -905,7 +903,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
905903
{
906904
ParallelContext *pcxt;
907905
int scantuplesortstates;
908-
Snapshot snapshot;
909906
Size estginshared;
910907
Size estsort;
911908
GinBuildShared *ginshared;
@@ -935,25 +932,25 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
935932
* Prepare for scan of the base relation. In a normal index build, we use
936933
* SnapshotAny because we must retrieve all tuples and do our own time
937934
* qual checks (because we have to index RECENTLY_DEAD tuples). In a
938-
* concurrent build, we take a regular MVCC snapshot and index whatever's
939-
* live according to that.
935+
* concurrent build, we take a regular MVCC snapshot and push it as active.
936+
* Later we index whatever's live according to that snapshot while that
937+
* snapshot is reset periodically.
940938
*/
941939
if (!isconcurrent)
942940
{
943941
Assert(ActiveSnapshotSet());
944-
snapshot = SnapshotAny;
945942
need_pop_active_snapshot = false;
946943
}
947944
else
948945
{
949-
snapshot = RegisterSnapshot(GetTransactionSnapshot());
946+
Assert(!ActiveSnapshotSet());
950947
PushActiveSnapshot(GetTransactionSnapshot());
951948
}
952949

953950
/*
954951
* Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace.
955952
*/
956-
estginshared = _gin_parallel_estimate_shared(heap, snapshot);
953+
estginshared = _gin_parallel_estimate_shared(heap);
957954
shm_toc_estimate_chunk(&pcxt->estimator, estginshared);
958955
estsort = tuplesort_estimate_shared(scantuplesortstates);
959956
shm_toc_estimate_chunk(&pcxt->estimator, estsort);
@@ -993,8 +990,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
993990
{
994991
if (need_pop_active_snapshot)
995992
PopActiveSnapshot();
996-
if (IsMVCCSnapshot(snapshot))
997-
UnregisterSnapshot(snapshot);
998993
DestroyParallelContext(pcxt);
999994
ExitParallelMode();
1000995
return;
@@ -1018,7 +1013,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
10181013

10191014
table_parallelscan_initialize(heap,
10201015
ParallelTableScanFromGinBuildShared(ginshared),
1021-
snapshot);
1016+
isconcurrent ? InvalidSnapshot : SnapshotAny,
1017+
isconcurrent);
10221018

10231019
/*
10241020
* Store shared tuplesort-private state, for which we reserved space.
@@ -1060,7 +1056,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
10601056
ginleader->nparticipanttuplesorts++;
10611057
ginleader->ginshared = ginshared;
10621058
ginleader->sharedsort = sharedsort;
1063-
ginleader->snapshot = snapshot;
10641059
ginleader->walusage = walusage;
10651060
ginleader->bufferusage = bufferusage;
10661061

@@ -1076,6 +1071,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
10761071
/* Save leader state now that it's clear build will be parallel */
10771072
buildstate->bs_leader = ginleader;
10781073

1074+
/*
1075+
* In case of concurrent build snapshots are going to be reset periodically.
1076+
* We need to wait until all workers imported initial snapshot.
1077+
*/
1078+
if (isconcurrent)
1079+
WaitForParallelWorkersToAttach(pcxt, true);
1080+
10791081
/* Join heap scan ourselves */
10801082
if (leaderparticipates)
10811083
_gin_leader_participate_as_worker(buildstate, heap, index);
@@ -1084,7 +1086,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index,
10841086
* Caller needs to wait for all launched workers when we return. Make
10851087
* sure that the failure-to-start case will not hang forever.
10861088
*/
1087-
WaitForParallelWorkersToAttach(pcxt);
1089+
if (!isconcurrent)
1090+
WaitForParallelWorkersToAttach(pcxt, false);
10881091
if (need_pop_active_snapshot)
10891092
PopActiveSnapshot();
10901093
}
@@ -1107,9 +1110,6 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state)
11071110
for (i = 0; i < ginleader->pcxt->nworkers_launched; i++)
11081111
InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]);
11091112

1110-
/* Free last reference to MVCC snapshot, if one was used */
1111-
if (IsMVCCSnapshot(ginleader->snapshot))
1112-
UnregisterSnapshot(ginleader->snapshot);
11131113
DestroyParallelContext(ginleader->pcxt);
11141114
ExitParallelMode();
11151115
}
@@ -1790,14 +1790,14 @@ _gin_parallel_merge(GinBuildState *state)
17901790

17911791
/*
17921792
* Returns size of shared memory required to store state for a parallel
1793-
* gin index build based on the snapshot its parallel scan will use.
1793+
* gin index build.
17941794
*/
17951795
static Size
1796-
_gin_parallel_estimate_shared(Relation heap, Snapshot snapshot)
1796+
_gin_parallel_estimate_shared(Relation heap)
17971797
{
17981798
/* c.f. shm_toc_allocate as to why BUFFERALIGN is used */
17991799
return add_size(BUFFERALIGN(sizeof(GinBuildShared)),
1800-
table_parallelscan_estimate(heap, snapshot));
1800+
table_parallelscan_estimate(heap, InvalidSnapshot));
18011801
}
18021802

18031803
/*
@@ -1820,6 +1820,7 @@ _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Rela
18201820
_gin_parallel_scan_and_build(buildstate, ginleader->ginshared,
18211821
ginleader->sharedsort, heap, index,
18221822
sortmem, true);
1823+
Assert(!ginleader->ginshared->isconcurrent || !TransactionIdIsValid(MyProc->xid));
18231824
}
18241825

18251826
/*
@@ -2179,6 +2180,13 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
21792180

21802181
_gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort,
21812182
heapRel, indexRel, sortmem, false);
2183+
if (ginshared->isconcurrent)
2184+
{
2185+
PopActiveSnapshot();
2186+
InvalidateCatalogSnapshot();
2187+
Assert(!TransactionIdIsValid(MyProc->xid));
2188+
PushActiveSnapshot(GetTransactionSnapshot());
2189+
}
21822190

21832191
/* Report WAL/buffer usage during parallel execution */
21842192
bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false);

src/backend/access/heap/heapam_handler.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1235,14 +1235,13 @@ heapam_index_build_range_scan(Relation heapRelation,
12351235
* SnapshotAny because we must retrieve all tuples and do our own time
12361236
* qual checks (because we have to index RECENTLY_DEAD tuples). In a
12371237
* concurrent build, or during bootstrap, we take a regular MVCC snapshot
1238-
* and index whatever's live according to that.
1238+
* and index whatever's live according to that while that snapshot is reset
1239+
* every so often (in case of non-unique index).
12391240
*/
12401241
OldestXmin = InvalidTransactionId;
12411242

12421243
/*
12431244
* For unique index we need consistent snapshot for the whole scan.
1244-
* In case of parallel scan some additional infrastructure required
1245-
* to perform scan with SO_RESET_SNAPSHOT which is not yet ready.
12461245
*/
12471246
reset_snapshots = indexInfo->ii_Concurrent &&
12481247
!indexInfo->ii_Unique &&
@@ -1304,8 +1303,11 @@ heapam_index_build_range_scan(Relation heapRelation,
13041303
Assert(!IsBootstrapProcessingMode());
13051304
Assert(allow_sync);
13061305
snapshot = scan->rs_snapshot;
1307-
PushActiveSnapshot(snapshot);
1308-
need_pop_active_snapshot = true;
1306+
if (!reset_snapshots)
1307+
{
1308+
PushActiveSnapshot(snapshot);
1309+
need_pop_active_snapshot = true;
1310+
}
13091311
}
13101312

13111313
hscan = (HeapScanDesc) scan;

0 commit comments

Comments
 (0)