Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 848ef42

Browse files
committed
Add the "snapshot too old" feature
This feature is controlled by a new old_snapshot_threshold GUC. A value of -1 disables the feature, and that is the default. A value of 0 is intended only for testing. Any value above that is the age, in minutes, that a snapshot can reach before pruning and vacuum are allowed to remove dead tuples which the snapshot would otherwise protect. The xmin associated with a transaction ID does still protect dead tuples. A connection which is using an "old" snapshot does not get an error unless it accesses a page modified recently enough that it might not be able to produce accurate results. This is similar to the Oracle feature, and we use the same SQLSTATE and error message for compatibility.
1 parent 8b65cf4 commit 848ef42

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

41 files changed

+942
-85
lines changed

contrib/bloom/blscan.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,8 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
138138
blkno, RBM_NORMAL, bas);
139139

140140
LockBuffer(buffer, BUFFER_LOCK_SHARE);
141-
page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
141+
page = BufferGetPage(buffer, scan->xs_snapshot, scan->indexRelation,
142+
BGP_TEST_FOR_OLD_SNAPSHOT);
142143

143144
if (!BloomPageIsDeleted(page))
144145
{

doc/src/sgml/config.sgml

+50
Original file line numberDiff line numberDiff line change
@@ -2041,6 +2041,42 @@ include_dir 'conf.d'
20412041
</para>
20422042
</listitem>
20432043
</varlistentry>
2044+
2045+
<varlistentry id="guc-old-snapshot-threshold" xreflabel="old_snapshot_threshold">
2046+
<term><varname>old_snapshot_threshold</varname> (<type>integer</type>)
2047+
<indexterm>
2048+
<primary><varname>old_snapshot_threshold</> configuration parameter</primary>
2049+
</indexterm>
2050+
</term>
2051+
<listitem>
2052+
<para>
2053+
Sets the minimum time that a snapshot can be used without risk of a
2054+
<literal>snapshot too old</> error occurring when using the snapshot.
2055+
This parameter can only be set at server start.
2056+
</para>
2057+
2058+
<para>
2059+
Beyond the threshold, old data may be vacuumed away. This can help
2060+
prevent bloat in the face of snapshots which remain in use for a
2061+
long time. To prevent incorrect results due to cleanup of data which
2062+
would otherwise be visible to the snapshot, an error is generated
2063+
when the snapshot is older than this threshold and the snapshot is
2064+
used to read a page which has been modified since the snapshot was
2065+
built.
2066+
</para>
2067+
2068+
<para>
2069+
A value of <literal>-1</> disables this feature, and is the default.
2070+
Useful values for production work probably range from a small number
2071+
of hours to a few days. The setting will be coerced to a granularity
2072+
of minutes, and small numbers (such as <literal>0</> or
2073+
<literal>1min</>) are only allowed because they may sometimes be
2074+
useful for testing. While a setting as high as <literal>60d</> is
2075+
allowed, please note that in many workloads extreme bloat or
2076+
transaction ID wraparound may occur in much shorter time frames.
2077+
</para>
2078+
</listitem>
2079+
</varlistentry>
20442080
</variablelist>
20452081
</sect2>
20462082
</sect1>
@@ -3051,6 +3087,10 @@ include_dir 'conf.d'
30513087
You should also consider setting <varname>hot_standby_feedback</>
30523088
on standby server(s) as an alternative to using this parameter.
30533089
</para>
3090+
<para>
3091+
This parameter does not prevent cleanup of dead rows which have reached the age
3092+
specified by <varname>old_snapshot_threshold</>.
3093+
</para>
30543094
</listitem>
30553095
</varlistentry>
30563096

@@ -3198,6 +3238,16 @@ include_dir 'conf.d'
31983238
until it eventually reaches the primary. Standbys make no other use
31993239
of feedback they receive other than to pass upstream.
32003240
</para>
3241+
<para>
3242+
This setting does not override the behavior of
3243+
<varname>old_snapshot_threshold</> on the primary; a snapshot on the
3244+
standby which exceeds the primary's age threshold can become invalid,
3245+
resulting in cancellation of transactions on the standby. This is
3246+
because <varname>old_snapshot_threshold</> is intended to provide an
3247+
absolute limit on the time for which dead rows can contribute to bloat,
3248+
which would otherwise be violated because of the configuration of a
3249+
standby.
3250+
</para>
32013251
</listitem>
32023252
</varlistentry>
32033253

src/backend/access/brin/brin.c

+11-8
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
135135
MemoryContext tupcxt = NULL;
136136
MemoryContext oldcxt = NULL;
137137

138-
revmap = brinRevmapInitialize(idxRel, &pagesPerRange);
138+
revmap = brinRevmapInitialize(idxRel, &pagesPerRange, NULL);
139139

140140
for (;;)
141141
{
@@ -152,7 +152,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
152152
/* normalize the block number to be the first block in the range */
153153
heapBlk = (heapBlk / pagesPerRange) * pagesPerRange;
154154
brtup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
155-
BUFFER_LOCK_SHARE);
155+
BUFFER_LOCK_SHARE, NULL);
156156

157157
/* if range is unsummarized, there's nothing to do */
158158
if (!brtup)
@@ -285,7 +285,8 @@ brinbeginscan(Relation r, int nkeys, int norderbys)
285285
scan = RelationGetIndexScan(r, nkeys, norderbys);
286286

287287
opaque = (BrinOpaque *) palloc(sizeof(BrinOpaque));
288-
opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange);
288+
opaque->bo_rmAccess = brinRevmapInitialize(r, &opaque->bo_pagesPerRange,
289+
scan->xs_snapshot);
289290
opaque->bo_bdesc = brin_build_desc(r);
290291
scan->opaque = opaque;
291292

@@ -368,7 +369,8 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
368369
MemoryContextResetAndDeleteChildren(perRangeCxt);
369370

370371
tup = brinGetTupleForHeapBlock(opaque->bo_rmAccess, heapBlk, &buf,
371-
&off, &size, BUFFER_LOCK_SHARE);
372+
&off, &size, BUFFER_LOCK_SHARE,
373+
scan->xs_snapshot);
372374
if (tup)
373375
{
374376
tup = brin_copy_tuple(tup, size);
@@ -647,7 +649,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo)
647649
/*
648650
* Initialize our state, including the deformed tuple state.
649651
*/
650-
revmap = brinRevmapInitialize(index, &pagesPerRange);
652+
revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
651653
state = initialize_brin_buildstate(index, revmap, pagesPerRange);
652654

653655
/*
@@ -1045,7 +1047,8 @@ summarize_range(IndexInfo *indexInfo, BrinBuildState *state, Relation heapRel,
10451047
* the same.)
10461048
*/
10471049
phtup = brinGetTupleForHeapBlock(state->bs_rmAccess, heapBlk, &phbuf,
1048-
&offset, &phsz, BUFFER_LOCK_SHARE);
1050+
&offset, &phsz, BUFFER_LOCK_SHARE,
1051+
NULL);
10491052
/* the placeholder tuple must exist */
10501053
if (phtup == NULL)
10511054
elog(ERROR, "missing placeholder tuple");
@@ -1080,7 +1083,7 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized,
10801083
BlockNumber pagesPerRange;
10811084
Buffer buf;
10821085

1083-
revmap = brinRevmapInitialize(index, &pagesPerRange);
1086+
revmap = brinRevmapInitialize(index, &pagesPerRange, NULL);
10841087

10851088
/*
10861089
* Scan the revmap to find unsummarized items.
@@ -1095,7 +1098,7 @@ brinsummarize(Relation index, Relation heapRel, double *numSummarized,
10951098
CHECK_FOR_INTERRUPTS();
10961099

10971100
tup = brinGetTupleForHeapBlock(revmap, heapBlk, &buf, &off, NULL,
1098-
BUFFER_LOCK_SHARE);
1101+
BUFFER_LOCK_SHARE, NULL);
10991102
if (tup == NULL)
11001103
{
11011104
/* no revmap entry for this heap range. Summarize it. */

src/backend/access/brin/brin_revmap.c

+7-4
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,8 @@ static void revmap_physical_extend(BrinRevmap *revmap);
6868
* brinRevmapTerminate when caller is done with it.
6969
*/
7070
BrinRevmap *
71-
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
71+
brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange,
72+
Snapshot snapshot)
7273
{
7374
BrinRevmap *revmap;
7475
Buffer meta;
@@ -77,7 +78,7 @@ brinRevmapInitialize(Relation idxrel, BlockNumber *pagesPerRange)
7778

7879
meta = ReadBuffer(idxrel, BRIN_METAPAGE_BLKNO);
7980
LockBuffer(meta, BUFFER_LOCK_SHARE);
80-
page = BufferGetPage(meta, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
81+
page = BufferGetPage(meta, snapshot, idxrel, BGP_TEST_FOR_OLD_SNAPSHOT);
8182
metadata = (BrinMetaPageData *) PageGetContents(page);
8283

8384
revmap = palloc(sizeof(BrinRevmap));
@@ -187,7 +188,8 @@ brinSetHeapBlockItemptr(Buffer buf, BlockNumber pagesPerRange,
187188
*/
188189
BrinTuple *
189190
brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
190-
Buffer *buf, OffsetNumber *off, Size *size, int mode)
191+
Buffer *buf, OffsetNumber *off, Size *size, int mode,
192+
Snapshot snapshot)
191193
{
192194
Relation idxRel = revmap->rm_irel;
193195
BlockNumber mapBlk;
@@ -264,7 +266,8 @@ brinGetTupleForHeapBlock(BrinRevmap *revmap, BlockNumber heapBlk,
264266
*buf = ReadBuffer(idxRel, blk);
265267
}
266268
LockBuffer(*buf, mode);
267-
page = BufferGetPage(*buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
269+
page = BufferGetPage(*buf, snapshot, idxRel,
270+
BGP_TEST_FOR_OLD_SNAPSHOT);
268271

269272
/* If we land on a revmap page, start over */
270273
if (BRIN_IS_REGULAR_PAGE(page))

src/backend/access/gin/ginbtree.c

+5-4
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ ginTraverseLock(Buffer buffer, bool searchMode)
7171
* is share-locked, and stack->parent is NULL.
7272
*/
7373
GinBtreeStack *
74-
ginFindLeafPage(GinBtree btree, bool searchMode)
74+
ginFindLeafPage(GinBtree btree, bool searchMode, Snapshot snapshot)
7575
{
7676
GinBtreeStack *stack;
7777

@@ -89,7 +89,8 @@ ginFindLeafPage(GinBtree btree, bool searchMode)
8989

9090
stack->off = InvalidOffsetNumber;
9191

92-
page = BufferGetPage(stack->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
92+
page = BufferGetPage(stack->buffer, snapshot, btree->index,
93+
BGP_TEST_FOR_OLD_SNAPSHOT);
9394

9495
access = ginTraverseLock(stack->buffer, searchMode);
9596

@@ -115,8 +116,8 @@ ginFindLeafPage(GinBtree btree, bool searchMode)
115116

116117
stack->buffer = ginStepRight(stack->buffer, btree->index, access);
117118
stack->blkno = rightlink;
118-
page = BufferGetPage(stack->buffer, NULL, NULL,
119-
BGP_NO_SNAPSHOT_TEST);
119+
page = BufferGetPage(stack->buffer, snapshot, btree->index,
120+
BGP_TEST_FOR_OLD_SNAPSHOT);
120121

121122
if (!searchMode && GinPageIsIncompleteSplit(page))
122123
ginFinishSplit(btree, stack, false, NULL);

src/backend/access/gin/gindatapage.c

+4-3
Original file line numberDiff line numberDiff line change
@@ -1820,7 +1820,7 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
18201820
{
18211821
/* search for the leaf page where the first item should go to */
18221822
btree.itemptr = insertdata.items[insertdata.curitem];
1823-
stack = ginFindLeafPage(&btree, false);
1823+
stack = ginFindLeafPage(&btree, false, NULL);
18241824

18251825
ginInsertValue(&btree, stack, &insertdata, buildStats);
18261826
}
@@ -1830,15 +1830,16 @@ ginInsertItemPointers(Relation index, BlockNumber rootBlkno,
18301830
* Starts a new scan on a posting tree.
18311831
*/
18321832
GinBtreeStack *
1833-
ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno)
1833+
ginScanBeginPostingTree(GinBtree btree, Relation index, BlockNumber rootBlkno,
1834+
Snapshot snapshot)
18341835
{
18351836
GinBtreeStack *stack;
18361837

18371838
ginPrepareDataScan(btree, index, rootBlkno);
18381839

18391840
btree->fullScan = TRUE;
18401841

1841-
stack = ginFindLeafPage(btree, TRUE);
1842+
stack = ginFindLeafPage(btree, TRUE, snapshot);
18421843

18431844
return stack;
18441845
}

src/backend/access/gin/ginget.c

+12-10
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ scanPostingTree(Relation index, GinScanEntry scanEntry,
7373
Page page;
7474

7575
/* Descend to the leftmost leaf page */
76-
stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
76+
stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
7777
buffer = stack->buffer;
7878
IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
7979

@@ -146,7 +146,8 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
146146
if (moveRightIfItNeeded(btree, stack) == false)
147147
return true;
148148

149-
page = BufferGetPage(stack->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
149+
page = BufferGetPage(stack->buffer, snapshot, btree->index,
150+
BGP_TEST_FOR_OLD_SNAPSHOT);
150151
itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
151152

152153
/*
@@ -320,7 +321,7 @@ startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
320321
ginPrepareEntryScan(&btreeEntry, entry->attnum,
321322
entry->queryKey, entry->queryCategory,
322323
ginstate);
323-
stackEntry = ginFindLeafPage(&btreeEntry, true);
324+
stackEntry = ginFindLeafPage(&btreeEntry, true, snapshot);
324325
page = BufferGetPage(stackEntry->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
325326
needUnlock = TRUE;
326327

@@ -385,7 +386,7 @@ startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
385386
needUnlock = FALSE;
386387

387388
stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
388-
rootPostingTree);
389+
rootPostingTree, snapshot);
389390
entry->buffer = stack->buffer;
390391

391392
/*
@@ -627,7 +628,7 @@ entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
627628
entry->btree.itemptr.ip_posid++;
628629
}
629630
entry->btree.fullScan = false;
630-
stack = ginFindLeafPage(&entry->btree, true);
631+
stack = ginFindLeafPage(&entry->btree, true, snapshot);
631632

632633
/* we don't need the stack, just the buffer. */
633634
entry->buffer = stack->buffer;
@@ -1335,8 +1336,8 @@ scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
13351336
ItemPointerSetInvalid(&pos->item);
13361337
for (;;)
13371338
{
1338-
page = BufferGetPage(pos->pendingBuffer, NULL,
1339-
NULL, BGP_NO_SNAPSHOT_TEST);
1339+
page = BufferGetPage(pos->pendingBuffer, scan->xs_snapshot,
1340+
scan->indexRelation, BGP_TEST_FOR_OLD_SNAPSHOT);
13401341

13411342
maxoff = PageGetMaxOffsetNumber(page);
13421343
if (pos->firstOffset > maxoff)
@@ -1516,8 +1517,8 @@ collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
15161517
memset(datumExtracted + pos->firstOffset - 1, 0,
15171518
sizeof(bool) * (pos->lastOffset - pos->firstOffset));
15181519

1519-
page = BufferGetPage(pos->pendingBuffer, NULL,
1520-
NULL, BGP_NO_SNAPSHOT_TEST);
1520+
page = BufferGetPage(pos->pendingBuffer, scan->xs_snapshot,
1521+
scan->indexRelation, BGP_TEST_FOR_OLD_SNAPSHOT);
15211522

15221523
for (i = 0; i < so->nkeys; i++)
15231524
{
@@ -1710,7 +1711,8 @@ scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
17101711
*ntids = 0;
17111712

17121713
LockBuffer(metabuffer, GIN_SHARE);
1713-
page = BufferGetPage(metabuffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
1714+
page = BufferGetPage(metabuffer, scan->xs_snapshot, scan->indexRelation,
1715+
BGP_TEST_FOR_OLD_SNAPSHOT);
17141716
blkno = GinPageGetMeta(page)->head;
17151717

17161718
/*

src/backend/access/gin/gininsert.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ ginEntryInsert(GinState *ginstate,
192192

193193
ginPrepareEntryScan(&btree, attnum, key, category, ginstate);
194194

195-
stack = ginFindLeafPage(&btree, false);
195+
stack = ginFindLeafPage(&btree, false, NULL);
196196
page = BufferGetPage(stack->buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
197197

198198
if (btree.findItem(&btree, stack))

src/backend/access/gist/gistget.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
336336
buffer = ReadBuffer(scan->indexRelation, pageItem->blkno);
337337
LockBuffer(buffer, GIST_SHARE);
338338
gistcheckpage(scan->indexRelation, buffer);
339-
page = BufferGetPage(buffer, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
339+
page = BufferGetPage(buffer, scan->xs_snapshot, r, BGP_TEST_FOR_OLD_SNAPSHOT);
340340
opaque = GistPageGetOpaque(page);
341341

342342
/*

src/backend/access/hash/hash.c

+2-1
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,8 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
278278

279279
buf = so->hashso_curbuf;
280280
Assert(BufferIsValid(buf));
281-
page = BufferGetPage(buf, NULL, NULL, BGP_NO_SNAPSHOT_TEST);
281+
page = BufferGetPage(buf, scan->xs_snapshot, rel,
282+
BGP_TEST_FOR_OLD_SNAPSHOT);
282283
maxoffnum = PageGetMaxOffsetNumber(page);
283284
for (offnum = ItemPointerGetOffsetNumber(current);
284285
offnum <= maxoffnum;

src/backend/access/hash/hashsearch.c

+6-4
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
188188

189189
/* Read the metapage */
190190
metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
191-
page = BufferGetPage(metabuf, NULL, NULL,
192-
BGP_NO_SNAPSHOT_TEST);
191+
page = BufferGetPage(metabuf, scan->xs_snapshot, rel,
192+
BGP_TEST_FOR_OLD_SNAPSHOT);
193193
metap = HashPageGetMeta(page);
194194

195195
/*
@@ -242,8 +242,8 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
242242

243243
/* Fetch the primary bucket page for the bucket */
244244
buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
245-
page = BufferGetPage(buf, NULL, NULL,
246-
BGP_NO_SNAPSHOT_TEST);
245+
page = BufferGetPage(buf, scan->xs_snapshot, rel,
246+
BGP_TEST_FOR_OLD_SNAPSHOT);
247247
opaque = (HashPageOpaque) PageGetSpecialPointer(page);
248248
Assert(opaque->hasho_bucket == bucket);
249249

@@ -350,6 +350,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
350350
_hash_readnext(rel, &buf, &page, &opaque);
351351
if (BufferIsValid(buf))
352352
{
353+
TestForOldSnapshot(scan->xs_snapshot, rel, page);
353354
maxoff = PageGetMaxOffsetNumber(page);
354355
offnum = _hash_binsearch(page, so->hashso_sk_hash);
355356
}
@@ -391,6 +392,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir)
391392
_hash_readprev(rel, &buf, &page, &opaque);
392393
if (BufferIsValid(buf))
393394
{
395+
TestForOldSnapshot(scan->xs_snapshot, rel, page);
394396
maxoff = PageGetMaxOffsetNumber(page);
395397
offnum = _hash_binsearch_last(page, so->hashso_sk_hash);
396398
}

0 commit comments

Comments
 (0)