Commit 95fd732

nkey authored and Commitfest Bot committed
Fix btree index scan concurrency issues with dirty snapshots
This patch addresses an issue where non-MVCC index scans using SnapshotDirty or SnapshotSelf could miss tuples due to concurrent modifications. The fix retains read locks on pages for these special snapshot types until the scan is done with the page's tuples, preventing concurrent modifications from causing inconsistent results. Updated README to document this special case in the btree locking mechanism.
1 parent 0131291 commit 95fd732
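
The "non-MVCC index scans" in question are typically driven by in-core callers that probe an index with a dirty snapshot, e.g. to locate a row regardless of commit status. Below is a minimal sketch of such a caller — the relation, index OID, attribute number, and key value are hypothetical, while InitDirtySnapshot(), systable_beginscan() and friends are real PostgreSQL APIs:

/*
 * Hypothetical caller of the kind this patch protects: an index probe
 * under SnapshotDirty. The relation, index OID, attribute number, and
 * key value are illustrative; the snapshot/systable APIs are real.
 */
#include "postgres.h"

#include "access/genam.h"
#include "access/htup_details.h"
#include "access/stratnum.h"
#include "utils/fmgroids.h"
#include "utils/snapmgr.h"

static HeapTuple
lookup_with_dirty_snapshot(Relation heapRel, Oid indexId, int32 keyval)
{
	SnapshotData dirty;
	SysScanDesc sscan;
	ScanKeyData skey;
	HeapTuple	tuple;

	/*
	 * snapshot_type becomes SNAPSHOT_DIRTY: with this patch, the btree
	 * scan below keeps its leaf-page read lock until it is done with
	 * the page's tuples.
	 */
	InitDirtySnapshot(dirty);

	ScanKeyInit(&skey,
				1,				/* key column number (illustrative) */
				BTEqualStrategyNumber, F_INT4EQ,
				Int32GetDatum(keyval));

	sscan = systable_beginscan(heapRel, indexId, true, &dirty, 1, &skey);
	tuple = systable_getnext(sscan);
	if (HeapTupleIsValid(tuple))
		tuple = heap_copytuple(tuple);	/* copy before ending the scan */
	systable_endscan(sscan);

	return tuple;
}

With the patch, the scan underlying systable_getnext() holds its leaf-page read lock across the cached batch, so a concurrent delete-and-reinsert on that page cannot slip in between caching the matches and rechecking them.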

5 files changed: +69 −7 lines

src/backend/access/nbtree/README
Lines changed: 12 additions & 1 deletion

@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we
 came from. (Actually, it's even harder than that; see page deletion
 discussion below.)
 
-Page read locks are held only for as long as a scan is examining a page.
+Page read locks are held only for as long as a scan is examining a page
+(with exception for SnapshotDirty and SnapshotSelf scans - see below).
 To minimize lock/unlock traffic, an index scan always searches a leaf page
 to identify all the matching items at once, copying their heap tuple IDs
 into backend-local storage. The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
 (though this requires extra handling to account for concurrent splits of
 the left sibling; see detailed move-left algorithm below).
 
+Despite the described mechanics in place, inconsistent results may still occur
+during non-MVCC scans (SnapshotDirty and SnapshotSelf). This issue can occur if a
+concurrent transaction deletes a tuple and inserts a new tuple with a new TID in the
+same page. If the scan has already visited the page and cached its content in the
+backend-local storage, it might skip the old tuple due to deletion and miss the new
+tuple because the scan does not re-read the page. To address this issue, for
+SnapshotDirty and SnapshotSelf scans, we retain the read lock on the page until
+we're completely done processing all the tuples from that page, preventing
+concurrent modifications that could lead to inconsistent results.
+
 In most cases we release our lock and pin on a page before attempting
 to acquire pin and lock on the page we are moving to. In a few places
 it is necessary to lock the next page before releasing the current one.
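
To make the failure mode concrete, here is a toy, self-contained C model of the interleaving the new README paragraph describes; the "page", TIDs, and visibility recheck are stand-ins, not PostgreSQL structures:

/*
 * Toy model of the race, NOT PostgreSQL code. It shows how caching a
 * page's matches and rechecking them later can return neither the old
 * nor the new version of a concurrently re-inserted row.
 */
#include <stdbool.h>
#include <stdio.h>

#define PAGE_SLOTS 4

typedef struct { int tid; bool live; } Slot;

static Slot page[PAGE_SLOTS] = {
	{101, true}, {102, true}, {0, false}, {0, false}
};

/* Stand-in for the per-TID heap visibility recheck. */
static bool
tid_is_visible(int tid)
{
	for (int i = 0; i < PAGE_SLOTS; i++)
		if (page[i].live && page[i].tid == tid)
			return true;
	return false;
}

int
main(void)
{
	int		cached[PAGE_SLOTS];
	int		ncached = 0;

	/* Scan, step 1: read the page once and cache the matching TIDs
	 * locally -- after this, an unpatched scan drops its read lock. */
	for (int i = 0; i < PAGE_SLOTS; i++)
		if (page[i].live)
			cached[ncached++] = page[i].tid;

	/* Concurrent writer: delete TID 102 and insert its replacement as
	 * TID 103 on the same page. Nothing blocks this once the scan's
	 * lock is released. */
	page[1].live = false;
	page[2] = (Slot) {103, true};

	/* Scan, step 2: process the cached TIDs. TID 102 now fails the
	 * visibility check, and TID 103 was never cached -- a row that
	 * logically existed the whole time is missed. */
	for (int i = 0; i < ncached; i++)
		printf("TID %d: %s\n", cached[i],
			   tid_is_visible(cached[i]) ? "returned" : "skipped (deleted)");
	return 0;
}

The patch closes exactly this window: for SnapshotDirty/SnapshotSelf scans the read lock is not released between step 1 and step 2.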

src/backend/access/nbtree/nbtree.c
Lines changed: 17 additions & 0 deletions

@@ -389,6 +389,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 		BTScanPosInvalidate(so->currPos);
 	}
@@ -445,6 +451,12 @@ btendscan(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		/* Release any extended lock held for SnapshotDirty/Self scans */
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 
@@ -525,6 +537,11 @@ btrestrpos(IndexScanDesc scan)
 		/* Before leaving current page, deal with any killed items */
 		if (so->numKilled > 0)
 			_bt_killitems(scan);
+		if (so->currPos.extra_unlock)
+		{
+			_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+			so->currPos.extra_unlock = false;
+		}
 		BTScanPosUnpinIfPinned(so->currPos);
 	}
 

src/backend/access/nbtree/nbtsearch.c
Lines changed: 30 additions & 5 deletions

@@ -61,11 +61,22 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * This will prevent vacuum from stalling in a blocked state trying to read a
  * page when a cursor is sitting on it.
  *
+ * For SnapshotDirty and SnapshotSelf scans, we don't actually unlock the buffer
+ * here, but instead set extra_unlock to indicate that the lock should be held
+ * until we're completely done with this page. This prevents concurrent
+ * modifications from causing inconsistent results during non-MVCC scans.
+ *
  * See nbtree/README section on making concurrent TID recycling safe.
  */
 static void
 _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
 {
+	if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
+		scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
+	{
+		sp->extra_unlock = true;
+		return;
+	}
 	_bt_unlockbuf(scan->indexRelation, sp->buf);
 
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
@@ -1527,7 +1538,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
  * _bt_next() -- Get the next item in a scan.
  *
  * On entry, so->currPos describes the current page, which may be pinned
- * but is not locked, and so->currPos.itemIndex identifies which item was
+ * but is not locked (except for SnapshotDirty and SnapshotSelf scans, where
+ * the page remains locked), and so->currPos.itemIndex identifies which item was
  * previously returned.
  *
  * On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2107,10 +2119,11 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
  *
  * Wrapper on _bt_readnextpage that performs final steps for the current page.
  *
- * On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
- * If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
- * the pin eagerly earlier on. The scan must have so->currPos.currPage set to
- * a valid block, in any case.
+ * On entry, if so->currPos.buf is valid the buffer is pinned but not locked,
+ * except for SnapshotDirty and SnapshotSelf scans where the buffer remains locked
+ * until we're done with all tuples from the page. If there's no pin held, it's
+ * because _bt_drop_lock_and_maybe_pin dropped the pin eagerly earlier on.
+ * The scan must have so->currPos.currPage set to a valid block, in any case.
  */
 static bool
 _bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2169,8 +2182,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
 
 		/* mark/restore not supported by parallel scans */
 		Assert(!scan->parallel_scan);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY);
+		Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF);
 	}
 
+	/*
+	 * For SnapshotDirty/Self scans, we kept the read lock after processing
+	 * the page's tuples (see _bt_drop_lock_and_maybe_pin). Now that we're
+	 * moving to another page, we need to explicitly release that lock.
+	 */
+	if (so->currPos.extra_unlock)
+	{
+		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+		so->currPos.extra_unlock = false;
+	}
 	BTScanPosUnpinIfPinned(so->currPos);
 
 	/* Walk to the next page with data */
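
Taken together, these hunks give extra_unlock a simple lifecycle: it is set instead of unlocking, then consumed when the scan leaves the page. A condensed sketch of that reading — the separate release sites (btrescan, btendscan, _bt_steppage) are collapsed into one sequence here, so this is not literal patch code:

/*
 * Condensed sketch of the extra_unlock lifecycle implied by the hunks
 * above; not literal patch code.
 */
#include "postgres.h"

#include "access/nbtree.h"
#include "access/relscan.h"
#include "utils/snapshot.h"

static void
dirty_scan_page_lifecycle(IndexScanDesc scan, BTScanOpaque so)
{
	/* After _bt_readpage() caches the page's matches,
	 * _bt_drop_lock_and_maybe_pin() either unlocks (MVCC scans) or
	 * merely records that the lock is being kept (dirty/self scans). */
	if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
		scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
		so->currPos.extra_unlock = true;	/* read lock stays held */
	else
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);

	/* ... cached tuples are returned to the caller one by one ... */

	/* When the scan leaves the page (step, rescan, or end of scan),
	 * the deferred unlock happens before the pin is dropped. */
	if (so->currPos.extra_unlock)
	{
		_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
		so->currPos.extra_unlock = false;
	}
	BTScanPosUnpinIfPinned(so->currPos);
}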

src/backend/access/nbtree/nbtutils.c
Lines changed: 7 additions & 1 deletion

@@ -3383,13 +3383,17 @@ _bt_killitems(IndexScanDesc scan)
 		 * LSN.
 		 */
 		droppedpin = false;
-		_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
+		/* For SnapshotDirty/Self scans, the buffer is already locked */
+		if (!so->currPos.extra_unlock)
+			_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
 
 		page = BufferGetPage(so->currPos.buf);
 	}
 	else
 	{
 		Buffer		buf;
+		/* extra_unlock should never be set without a valid buffer pin */
+		Assert(!so->currPos.extra_unlock);
 
 		droppedpin = true;
 		/* Attempt to re-read the buffer, getting pin and lock. */
@@ -3526,6 +3530,8 @@ _bt_killitems(IndexScanDesc scan)
 	}
 
 	_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
+	/* Reset the extra_unlock flag since we've now released the lock */
+	so->currPos.extra_unlock = false;
 }
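
The _bt_killitems() change is a small lock-adoption idiom: take the read lock only if the scan does not already hold it, then release unconditionally and clear the flag on the way out. A generic sketch of that idiom — the helper is hypothetical and not part of the patch, while _bt_lockbuf()/_bt_unlockbuf() are the real nbtree wrappers:

/*
 * Hypothetical helper distilling the lock-adoption idiom used by
 * _bt_killitems() above; not part of the patch.
 */
#include "postgres.h"

#include "access/nbtree.h"

static void
examine_page_adopting_lock(Relation rel, BTScanOpaque so,
						   void (*examine) (Page page))
{
	/* For SnapshotDirty/Self scans the read lock is already held */
	if (!so->currPos.extra_unlock)
		_bt_lockbuf(rel, so->currPos.buf, BT_READ);

	examine(BufferGetPage(so->currPos.buf));

	/* Unconditional release: the adopted lock is consumed here, which
	 * is why the flag must be cleared on the way out */
	_bt_unlockbuf(rel, so->currPos.buf);
	so->currPos.extra_unlock = false;
}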

src/include/access/nbtree.h
Lines changed: 3 additions & 0 deletions

@@ -962,6 +962,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 typedef struct BTScanPosData
 {
 	Buffer		buf;			/* currPage buf (invalid means unpinned) */
+	bool		extra_unlock;	/* for SnapshotDirty/Self, read lock is held even after _bt_drop_lock_and_maybe_pin */
 
 	/* page details as of the saved position's call to _bt_readpage */
 	BlockNumber currPage;		/* page referenced by items array */
@@ -1009,6 +1010,7 @@ typedef BTScanPosData *BTScanPos;
 	)
 #define BTScanPosUnpin(scanpos) \
 	do { \
+		Assert(!(scanpos).extra_unlock); \
 		ReleaseBuffer((scanpos).buf); \
 		(scanpos).buf = InvalidBuffer; \
 	} while (0)
@@ -1028,6 +1030,7 @@ typedef BTScanPosData *BTScanPos;
 	do { \
 		(scanpos).buf = InvalidBuffer; \
 		(scanpos).currPage = InvalidBlockNumber; \
+		(scanpos).extra_unlock = false; \
 	} while (0)
 
 /* We need one of these for each equality-type SK_SEARCHARRAY scan key */
