Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 9df4805

Browse files
author
Commitfest Bot
committed
[CF 5151] v6 - DirtySnapshot index scan skips concurrently updated tuples
This branch was automatically generated by a robot using patches from an email thread registered at: https://commitfest.postgresql.org/patch/5151 The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch. Patch(es): https://www.postgresql.org/message-id/CADzfLwW61K09Q8HnQ6zO+QHD2oHBmm5VmZbHa19=ZkMMX_Vcow@mail.gmail.com Author(s): Michail Nikolaev, Mihail Nikalayeu
2 parents 3c4d755 + 95fd732 commit 9df4805

File tree

11 files changed

+146
-8
lines changed

11 files changed

+146
-8
lines changed

src/backend/access/index/indexam.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include "utils/ruleutils.h"
5858
#include "utils/snapmgr.h"
5959
#include "utils/syscache.h"
60+
#include "utils/injection_point.h"
6061

6162

6263
/* ----------------------------------------------------------------
@@ -741,6 +742,13 @@ index_getnext_slot(IndexScanDesc scan, ScanDirection direction, TupleTableSlot *
741742
* the index.
742743
*/
743744
Assert(ItemPointerIsValid(&scan->xs_heaptid));
745+
#ifdef USE_INJECTION_POINTS
746+
if (!IsCatalogRelationOid(scan->indexRelation->rd_id))
747+
{
748+
INJECTION_POINT("index_getnext_slot_before_fetch", NULL);
749+
}
750+
#endif
751+
744752
if (index_fetch_heap(scan, slot))
745753
return true;
746754
}

src/backend/access/nbtree/README

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,8 @@ move right until we find a page whose right-link matches the page we
8585
came from. (Actually, it's even harder than that; see page deletion
8686
discussion below.)
8787

88-
Page read locks are held only for as long as a scan is examining a page.
88+
Page read locks are held only for as long as a scan is examining a page
89+
(with an exception for SnapshotDirty and SnapshotSelf scans - see below).
8990
To minimize lock/unlock traffic, an index scan always searches a leaf page
9091
to identify all the matching items at once, copying their heap tuple IDs
9192
into backend-local storage. The heap tuple IDs are then processed while
@@ -103,6 +104,16 @@ We also remember the left-link, and follow it when the scan moves backwards
103104
(though this requires extra handling to account for concurrent splits of
104105
the left sibling; see detailed move-left algorithm below).
105106

107+
Despite the mechanics described above, inconsistent results may still occur
108+
during non-MVCC scans (SnapshotDirty and SnapshotSelf). This issue can occur if a
109+
concurrent transaction deletes a tuple and inserts a new tuple with a new TID in the
110+
same page. If the scan has already visited the page and cached its content in the
111+
backend-local storage, it might skip the old tuple due to deletion and miss the new
112+
tuple because the scan does not re-read the page. To address this issue, for
113+
SnapshotDirty and SnapshotSelf scans, we retain the read lock on the page until
114+
we're completely done processing all the tuples from that page, preventing
115+
concurrent modifications that could lead to inconsistent results.
116+
106117
In most cases we release our lock and pin on a page before attempting
107118
to acquire pin and lock on the page we are moving to. In a few places
108119
it is necessary to lock the next page before releasing the current one.

src/backend/access/nbtree/nbtree.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
389389
/* Before leaving current page, deal with any killed items */
390390
if (so->numKilled > 0)
391391
_bt_killitems(scan);
392+
/* Release any extended lock held for SnapshotDirty/Self scans */
393+
if (so->currPos.extra_unlock)
394+
{
395+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
396+
so->currPos.extra_unlock = false;
397+
}
392398
BTScanPosUnpinIfPinned(so->currPos);
393399
BTScanPosInvalidate(so->currPos);
394400
}
@@ -445,6 +451,12 @@ btendscan(IndexScanDesc scan)
445451
/* Before leaving current page, deal with any killed items */
446452
if (so->numKilled > 0)
447453
_bt_killitems(scan);
454+
/* Release any extended lock held for SnapshotDirty/Self scans */
455+
if (so->currPos.extra_unlock)
456+
{
457+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
458+
so->currPos.extra_unlock = false;
459+
}
448460
BTScanPosUnpinIfPinned(so->currPos);
449461
}
450462

@@ -525,6 +537,11 @@ btrestrpos(IndexScanDesc scan)
525537
/* Before leaving current page, deal with any killed items */
526538
if (so->numKilled > 0)
527539
_bt_killitems(scan);
540+
if (so->currPos.extra_unlock)
541+
{
542+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
543+
so->currPos.extra_unlock = false;
544+
}
528545
BTScanPosUnpinIfPinned(so->currPos);
529546
}
530547

src/backend/access/nbtree/nbtsearch.c

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,22 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
6161
* This will prevent vacuum from stalling in a blocked state trying to read a
6262
* page when a cursor is sitting on it.
6363
*
64+
* For SnapshotDirty and SnapshotSelf scans, we don't actually unlock the buffer
65+
* here, but instead set extra_unlock to indicate that the lock should be held
66+
* until we're completely done with this page. This prevents concurrent
67+
* modifications from causing inconsistent results during non-MVCC scans.
68+
*
6469
* See nbtree/README section on making concurrent TID recycling safe.
6570
*/
6671
static void
6772
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
6873
{
74+
if (scan->xs_snapshot->snapshot_type == SNAPSHOT_DIRTY ||
75+
scan->xs_snapshot->snapshot_type == SNAPSHOT_SELF)
76+
{
77+
sp->extra_unlock = true;
78+
return;
79+
}
6980
_bt_unlockbuf(scan->indexRelation, sp->buf);
7081

7182
if (IsMVCCSnapshot(scan->xs_snapshot) &&
@@ -1527,7 +1538,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
15271538
* _bt_next() -- Get the next item in a scan.
15281539
*
15291540
* On entry, so->currPos describes the current page, which may be pinned
1530-
* but is not locked, and so->currPos.itemIndex identifies which item was
1541+
* but is not locked (except for SnapshotDirty and SnapshotSelf scans, where
1542+
* the page remains locked), and so->currPos.itemIndex identifies which item was
15311543
* previously returned.
15321544
*
15331545
* On success exit, so->currPos is updated as needed, and _bt_returnitem
@@ -2107,10 +2119,11 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
21072119
*
21082120
* Wrapper on _bt_readnextpage that performs final steps for the current page.
21092121
*
2110-
* On entry, if so->currPos.buf is valid the buffer is pinned but not locked.
2111-
* If there's no pin held, it's because _bt_drop_lock_and_maybe_pin dropped
2112-
* the pin eagerly earlier on. The scan must have so->currPos.currPage set to
2113-
* a valid block, in any case.
2122+
* On entry, if so->currPos.buf is valid the buffer is pinned but not locked,
2123+
* except for SnapshotDirty and SnapshotSelf scans where the buffer remains locked
2124+
* until we're done with all tuples from the page. If there's no pin held, it's
2125+
* because _bt_drop_lock_and_maybe_pin dropped the pin eagerly earlier on.
2126+
* The scan must have so->currPos.currPage set to a valid block, in any case.
21142127
*/
21152128
static bool
21162129
_bt_steppage(IndexScanDesc scan, ScanDirection dir)
@@ -2169,8 +2182,20 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir)
21692182

21702183
/* mark/restore not supported by parallel scans */
21712184
Assert(!scan->parallel_scan);
2185+
Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_DIRTY);
2186+
Assert(scan->xs_snapshot->snapshot_type != SNAPSHOT_SELF);
21722187
}
21732188

2189+
/*
2190+
* For SnapshotDirty/Self scans, we kept the read lock after processing
2191+
* the page's tuples (see _bt_drop_lock_and_maybe_pin). Now that we're
2192+
* moving to another page, we need to explicitly release that lock.
2193+
*/
2194+
if (so->currPos.extra_unlock)
2195+
{
2196+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
2197+
so->currPos.extra_unlock = false;
2198+
}
21742199
BTScanPosUnpinIfPinned(so->currPos);
21752200

21762201
/* Walk to the next page with data */

src/backend/access/nbtree/nbtutils.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3383,13 +3383,17 @@ _bt_killitems(IndexScanDesc scan)
33833383
* LSN.
33843384
*/
33853385
droppedpin = false;
3386-
_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
3386+
/* For SnapshotDirty/Self scans, the buffer is already locked */
3387+
if (!so->currPos.extra_unlock)
3388+
_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
33873389

33883390
page = BufferGetPage(so->currPos.buf);
33893391
}
33903392
else
33913393
{
33923394
Buffer buf;
3395+
/* extra_unlock should never be set without a valid buffer pin */
3396+
Assert(!so->currPos.extra_unlock);
33933397

33943398
droppedpin = true;
33953399
/* Attempt to re-read the buffer, getting pin and lock. */
@@ -3526,6 +3530,8 @@ _bt_killitems(IndexScanDesc scan)
35263530
}
35273531

35283532
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
3533+
/* Reset the extra_unlock flag since we've now released the lock */
3534+
so->currPos.extra_unlock = false;
35293535
}
35303536

35313537

src/backend/executor/execIndexing.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@
117117
#include "utils/multirangetypes.h"
118118
#include "utils/rangetypes.h"
119119
#include "utils/snapmgr.h"
120+
#include "utils/injection_point.h"
120121

121122
/* waitMode argument to check_exclusion_or_unique_constraint() */
122123
typedef enum
@@ -943,6 +944,8 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index,
943944

944945
ExecDropSingleTupleTableSlot(existing_slot);
945946

947+
if (!conflict)
948+
INJECTION_POINT("check_exclusion_or_unique_constraint_no_conflict", NULL);
946949
return !conflict;
947950
}
948951

src/include/access/nbtree.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -962,6 +962,7 @@ typedef struct BTScanPosItem /* what we remember about each match */
962962
typedef struct BTScanPosData
963963
{
964964
Buffer buf; /* currPage buf (invalid means unpinned) */
965+
bool extra_unlock; /* for SnapshotDirty/Self, read lock is held even after _bt_drop_lock_and_maybe_pin */
965966

966967
/* page details as of the saved position's call to _bt_readpage */
967968
BlockNumber currPage; /* page referenced by items array */
@@ -1009,6 +1010,7 @@ typedef BTScanPosData *BTScanPos;
10091010
)
10101011
#define BTScanPosUnpin(scanpos) \
10111012
do { \
1013+
Assert(!(scanpos).extra_unlock); \
10121014
ReleaseBuffer((scanpos).buf); \
10131015
(scanpos).buf = InvalidBuffer; \
10141016
} while (0)
@@ -1028,6 +1030,7 @@ typedef BTScanPosData *BTScanPos;
10281030
do { \
10291031
(scanpos).buf = InvalidBuffer; \
10301032
(scanpos).currPage = InvalidBlockNumber; \
1033+
(scanpos).extra_unlock = false; \
10311034
} while (0)
10321035

10331036
/* We need one of these for each equality-type SK_SEARCHARRAY scan key */

src/test/modules/injection_points/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ PGFILEDESC = "injection_points - facility for injection points"
1414
REGRESS = injection_points hashagg reindex_conc
1515
REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
1616

17-
ISOLATION = basic inplace syscache-update-pruned
17+
ISOLATION = basic inplace syscache-update-pruned dirty_index_scan
1818

1919
TAP_TESTS = 1
2020

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
Parsed test spec with 3 sessions
2+
3+
starting permutation: s1_s1 s2_s1 s3_s1
4+
injection_points_attach
5+
-----------------------
6+
7+
(1 row)
8+
9+
step s1_s1: INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; <waiting ...>
10+
step s2_s1: UPDATE test.tbl SET n = n + 1 WHERE i = 42; <waiting ...>
11+
step s3_s1:
12+
SELECT injection_points_detach('index_getnext_slot_before_fetch');
13+
SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
14+
<waiting ...>
15+
step s1_s1: <... completed>
16+
step s2_s1: <... completed>
17+
step s3_s1: <... completed>
18+
injection_points_detach
19+
-----------------------
20+
21+
(1 row)
22+
23+
injection_points_wakeup
24+
-----------------------
25+
26+
(1 row)
27+

src/test/modules/injection_points/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ tests += {
4747
'basic',
4848
'inplace',
4949
'syscache-update-pruned',
50+
'dirty_index_scan',
5051
],
5152
'runningcheck': false, # see syscache-update-pruned
5253
},
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
setup
2+
{
3+
CREATE EXTENSION injection_points;
4+
CREATE SCHEMA test;
5+
CREATE UNLOGGED TABLE test.tbl(i int primary key, n int);
6+
CREATE INDEX tbl_n_idx ON test.tbl(n);
7+
INSERT INTO test.tbl VALUES(42,1);
8+
}
9+
10+
teardown
11+
{
12+
DROP SCHEMA test CASCADE;
13+
DROP EXTENSION injection_points;
14+
}
15+
16+
session s1
17+
setup {
18+
SELECT injection_points_set_local();
19+
SELECT injection_points_attach('check_exclusion_or_unique_constraint_no_conflict', 'error');
20+
SELECT injection_points_attach('index_getnext_slot_before_fetch', 'wait');
21+
}
22+
23+
step s1_s1 { INSERT INTO test.tbl VALUES(42, 1) on conflict(i) do update set n = EXCLUDED.n + 1; }
24+
25+
session s2
26+
step s2_s1 { UPDATE test.tbl SET n = n + 1 WHERE i = 42; }
27+
28+
session s3
29+
step s3_s1 {
30+
SELECT injection_points_detach('index_getnext_slot_before_fetch');
31+
SELECT injection_points_wakeup('index_getnext_slot_before_fetch');
32+
}
33+
34+
permutation
35+
s1_s1
36+
s2_s1(*)
37+
s3_s1(s1_s1)

0 commit comments

Comments
 (0)