Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 7e0272c

Browse files
MMeent authored and Commitfest Bot committed
NBTree: Reduce Index-Only Scan pin duration
Previously, we would keep a pin on every leaf page while we were returning tuples to the scan. With this patch, we use the newly introduced table_index_vischeck_tuples API to pre-check the visibility of all TIDs, which lets us unpin the page well before we have finished returning and processing all of its index tuple results. This reduces the time VACUUM may have to wait for a pin, and can improve performance by reducing redundant VM checks.
1 parent cf116aa commit 7e0272c

File tree

3 files changed

+147
-5
lines changed

3 files changed

+147
-5
lines changed

src/backend/access/nbtree/nbtree.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
360360
so->killedItems = NULL; /* until needed */
361361
so->numKilled = 0;
362362

363+
so->vmbuf = InvalidBuffer;
364+
so->vischeckcap = 0;
365+
so->vischecksbuf = NULL;
366+
363367
/*
364368
* We don't know yet whether the scan will be index-only, so we do not
365369
* allocate the tuple workspace arrays until btrescan. However, we set up
@@ -400,6 +404,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
400404
BTScanPosUnpinIfPinned(so->markPos);
401405
BTScanPosInvalidate(so->markPos);
402406

407+
if (BufferIsValid(so->vmbuf))
408+
{
409+
ReleaseBuffer(so->vmbuf);
410+
so->vmbuf = InvalidBuffer;
411+
}
412+
403413
/*
404414
* Allocate tuple workspace arrays, if needed for an index-only scan and
405415
* not already done in a previous rescan call. To save on palloc
@@ -451,6 +461,17 @@ btendscan(IndexScanDesc scan)
451461
so->markItemIndex = -1;
452462
BTScanPosUnpinIfPinned(so->markPos);
453463

464+
if (so->vischecksbuf)
465+
pfree(so->vischecksbuf);
466+
so->vischecksbuf = NULL;
467+
so->vischeckcap = 0;
468+
469+
if (BufferIsValid(so->vmbuf))
470+
{
471+
ReleaseBuffer(so->vmbuf);
472+
so->vmbuf = InvalidBuffer;
473+
}
474+
454475
/* No need to invalidate positions, the RAM is about to be freed. */
455476

456477
/* Release storage */

src/backend/access/nbtree/nbtsearch.c

Lines changed: 120 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
#include "utils/rel.h"
2626

2727

28-
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp);
28+
static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so);
2929
static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
3030
Buffer buf, bool forupdate, BTStack stack,
3131
int access);
@@ -54,6 +54,12 @@ static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno,
5454
static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
5555

5656

57+
/*
58+
* Execute vischecks at the index level?
59+
* Enabled by default.
60+
*/
61+
#define DEBUG_IOS_VISCHECKS_ENABLED true
62+
5763
/*
5864
* _bt_drop_lock_and_maybe_pin()
5965
*
@@ -64,13 +70,109 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
6470
* See nbtree/README section on making concurrent TID recycling safe.
6571
*/
6672
static void
67-
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
73+
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so)
6874
{
6975
_bt_unlockbuf(scan->indexRelation, sp->buf);
7076

77+
/*
78+
* Do some visibility checks if this is an index-only scan, allowing us to
79+
* drop the pin on this page before we have returned all tuples from this
80+
* IOS to the executor.
81+
*/
82+
if (scan->xs_want_itup && DEBUG_IOS_VISCHECKS_ENABLED)
83+
{
84+
int initOffset = sp->firstItem;
85+
int ntids = 1 + sp->lastItem - initOffset;
86+
87+
if (ntids > 0)
88+
{
89+
TM_IndexVisibilityCheckOp visCheck;
90+
Relation heaprel = scan->heapRelation;
91+
TM_VisCheck *check;
92+
BTScanPosItem *item;
93+
94+
visCheck.checkntids = ntids;
95+
96+
if (so->vischeckcap == 0)
97+
{
98+
so->vischecksbuf = palloc_array(TM_VisCheck, ntids);
99+
so->vischeckcap = ntids;
100+
}
101+
else if (so->vischeckcap < visCheck.checkntids)
102+
{
103+
so->vischecksbuf = repalloc_array(so->vischecksbuf,
104+
TM_VisCheck, ntids);
105+
so->vischeckcap = ntids;
106+
}
107+
108+
visCheck.checktids = so->vischecksbuf;
109+
visCheck.vmbuf = &so->vmbuf;
110+
111+
check = so->vischecksbuf;
112+
item = &so->currPos.items[initOffset];
113+
114+
for (int i = 0; i < visCheck.checkntids; i++)
115+
{
116+
Assert(item->visrecheck == TMVC_Unchecked);
117+
Assert(ItemPointerIsValid(&item->heapTid));
118+
119+
PopulateTMVischeck(check, &item->heapTid, initOffset + i);
120+
121+
item++;
122+
check++;
123+
}
124+
125+
table_index_vischeck_tuples(heaprel, &visCheck);
126+
check = so->vischecksbuf;
127+
128+
for (int i = 0; i < visCheck.checkntids; i++)
129+
{
130+
item = &so->currPos.items[check->idxoffnum];
131+
/* We must have a valid visibility check result */
132+
Assert(check->vischeckresult != TMVC_Unchecked);
133+
/* The offset number should still indicate the right item */
134+
Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapTid));
135+
Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapTid));
136+
137+
/* Store the visibility check result */
138+
item->visrecheck = check->vischeckresult;
139+
check++;
140+
}
141+
}
142+
}
143+
144+
/*
145+
* We may need to hold a pin on the page for one of several reasons:
146+
*
147+
* 1.) To safely apply kill_prior_tuple, we need to know that the tuples
148+
* were not removed from the page (and subsequently re-inserted).
149+
* A page's LSN can also allow us to detect modifications on the page,
150+
* which then allows us to bail out of setting the hint bits, but that
151+
* requires the index to be WAL-logged; so unless the index is WAL-logged
152+
* we must hold a pin on the page to apply the kill_prior_tuple
153+
* optimization.
154+
*
155+
* 2.) Non-MVCC scans need pin coupling to make sure the scan covers
156+
* exactly the whole index keyspace.
157+
*
158+
* 3.) For Index-Only Scans, the scan needs to check the visibility of the
159+
* table tuple while the relevant index tuple is guaranteed to still be
160+
* contained in the index (so that vacuum hasn't yet marked any pages that
161+
* could contain the value as ALL_VISIBLE after reclaiming a dead tuple
162+
* that might be buffered in the scan). A pin must therefore be held
163+
* at least while the basic visibility of the page's tuples is being
164+
* checked.
165+
*
166+
* For cases 1 and 2, we must hold the pin after we've finished processing
167+
* the index page.
168+
*
169+
* For case 3, we can release the pin if we first do the visibility checks
170+
* of to-be-returned tuples using table_index_vischeck_tuples, which we've
171+
* done just above.
172+
*/
71173
if (IsMVCCSnapshot(scan->xs_snapshot) &&
72174
RelationNeedsWAL(scan->indexRelation) &&
73-
!scan->xs_want_itup)
175+
(!scan->xs_want_itup || DEBUG_IOS_VISCHECKS_ENABLED))
74176
{
75177
ReleaseBuffer(sp->buf);
76178
sp->buf = InvalidBuffer;
@@ -2007,6 +2109,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
20072109

20082110
currItem->heapTid = itup->t_tid;
20092111
currItem->indexOffset = offnum;
2112+
currItem->visrecheck = TMVC_Unchecked;
2113+
20102114
if (so->currTuples)
20112115
{
20122116
Size itupsz = IndexTupleSize(itup);
@@ -2037,6 +2141,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
20372141

20382142
currItem->heapTid = *heapTid;
20392143
currItem->indexOffset = offnum;
2144+
currItem->visrecheck = TMVC_Unchecked;
2145+
20402146
if (so->currTuples)
20412147
{
20422148
/* Save base IndexTuple (truncate posting list) */
@@ -2073,6 +2179,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
20732179

20742180
currItem->heapTid = *heapTid;
20752181
currItem->indexOffset = offnum;
2182+
currItem->visrecheck = TMVC_Unchecked;
20762183

20772184
/*
20782185
* Have index-only scans return the same base IndexTuple for every TID
@@ -2098,6 +2205,14 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
20982205

20992206
/* Return next item, per amgettuple contract */
21002207
scan->xs_heaptid = currItem->heapTid;
2208+
2209+
if (scan->xs_want_itup)
2210+
{
2211+
scan->xs_visrecheck = currItem->visrecheck;
2212+
Assert(currItem->visrecheck != TMVC_Unchecked ||
2213+
BufferIsValid(so->currPos.buf));
2214+
}
2215+
21012216
if (so->currTuples)
21022217
scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
21032218
}
@@ -2256,7 +2371,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
22562371
* so->currPos.buf in preparation for btgettuple returning tuples.
22572372
*/
22582373
Assert(BTScanPosIsPinned(so->currPos));
2259-
_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2374+
_bt_drop_lock_and_maybe_pin(scan, &so->currPos, so);
22602375
return true;
22612376
}
22622377

@@ -2413,7 +2528,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
24132528
*/
24142529
Assert(so->currPos.currPage == blkno);
24152530
Assert(BTScanPosIsPinned(so->currPos));
2416-
_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
2531+
_bt_drop_lock_and_maybe_pin(scan, &so->currPos, so);
24172532

24182533
return true;
24192534
}

src/include/access/nbtree.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -957,6 +957,7 @@ typedef struct BTScanPosItem /* what we remember about each match */
957957
ItemPointerData heapTid; /* TID of referenced heap item */
958958
OffsetNumber indexOffset; /* index item's location within page */
959959
LocationIndex tupleOffset; /* IndexTuple's offset in workspace, if any */
960+
uint8 visrecheck; /* visibility recheck status, if any */
960961
} BTScanPosItem;
961962

962963
typedef struct BTScanPosData
@@ -1071,6 +1072,11 @@ typedef struct BTScanOpaqueData
10711072
int *killedItems; /* currPos.items indexes of killed items */
10721073
int numKilled; /* number of currently stored items */
10731074

1075+
/* used for index-only scan visibility prechecks */
1076+
Buffer vmbuf; /* vm buffer */
1077+
int vischeckcap; /* capacity of vischecksbuf */
1078+
TM_VisCheck *vischecksbuf; /* single allocation to save on alloc overhead */
1079+
10741080
/*
10751081
* If we are doing an index-only scan, these are the tuple storage
10761082
* workspaces for the currPos and markPos respectively. Each is of size

0 commit comments

Comments
 (0)