Location via proxy:   [ UP ]  
[Report a bug]   [Manage cookies]                
Skip to content

Commit 4a70f82

Browse files
Add nbtree Valgrind buffer lock checks.
Holding just a buffer pin (with no buffer lock) on an nbtree buffer/page provides very weak guarantees, especially compared to heapam, where it's often safe to read a page while only holding a buffer pin. This commit has Valgrind enforce the following rule: it is never okay to access an nbtree buffer without holding both a pin and a lock on the buffer. A draft version of this patch detected questionable code that was cleaned up by commits fa7ff64 and 7154aa1. The code in question used to access an nbtree buffer page's special/opaque area with no buffer lock (only a buffer pin). This practice (which isn't obviously unsafe) is hereby formally disallowed in nbtree. There doesn't seem to be any reason to allow it, and banning it keeps things simple for Valgrind. The new checks are implemented by adding custom nbtree client requests (located in LockBuffer() wrapper functions); these requests are "superimposed" on top of the generic bufmgr.c Valgrind client requests added by commit 1e0dfd1. No custom resource management cleanup code is needed to undo the effects of marking buffers as non-accessible under this scheme. Author: Peter Geoghegan Reviewed-By: Anastasia Lubennikova, Georgios Kokolatos Discussion: https://postgr.es/m/CAH2-WzkLgyN3zBvRZ1pkNJThC=xi_0gpWRUb_45eexLH1+k2_Q@mail.gmail.com
1 parent 670c0a1 commit 4a70f82

File tree

8 files changed

+170
-44
lines changed

8 files changed

+170
-44
lines changed

src/backend/access/nbtree/nbtinsert.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -303,7 +303,7 @@ _bt_search_insert(Relation rel, BTInsertState insertstate)
303303
{
304304
/* Simulate a _bt_getbuf() call with conditional locking */
305305
insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel));
306-
if (ConditionalLockBuffer(insertstate->buf))
306+
if (_bt_conditionallockbuf(rel, insertstate->buf))
307307
{
308308
Page page;
309309
BTPageOpaque lpageop;

src/backend/access/nbtree/nbtpage.c

Lines changed: 126 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
#include "storage/indexfsm.h"
3333
#include "storage/lmgr.h"
3434
#include "storage/predicate.h"
35+
#include "utils/memdebug.h"
3536
#include "utils/snapmgr.h"
3637

3738
static BTMetaPageData *_bt_getmeta(Relation rel, Buffer metabuf);
@@ -198,8 +199,8 @@ _bt_update_meta_cleanup_info(Relation rel, TransactionId oldestBtpoXact,
198199
}
199200

200201
/* trade in our read lock for a write lock */
201-
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
202-
LockBuffer(metabuf, BT_WRITE);
202+
_bt_unlockbuf(rel, metabuf);
203+
_bt_lockbuf(rel, metabuf, BT_WRITE);
203204

204205
START_CRIT_SECTION();
205206

@@ -340,8 +341,8 @@ _bt_getroot(Relation rel, int access)
340341
}
341342

342343
/* trade in our read lock for a write lock */
343-
LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
344-
LockBuffer(metabuf, BT_WRITE);
344+
_bt_unlockbuf(rel, metabuf);
345+
_bt_lockbuf(rel, metabuf, BT_WRITE);
345346

346347
/*
347348
* Race condition: if someone else initialized the metadata between
@@ -434,8 +435,8 @@ _bt_getroot(Relation rel, int access)
434435
* else accessing the new root page while it's unlocked, since no one
435436
* else knows where it is yet.
436437
*/
437-
LockBuffer(rootbuf, BUFFER_LOCK_UNLOCK);
438-
LockBuffer(rootbuf, BT_READ);
438+
_bt_unlockbuf(rel, rootbuf);
439+
_bt_lockbuf(rel, rootbuf, BT_READ);
439440

440441
/* okay, metadata is correct, release lock on it without caching */
441442
_bt_relbuf(rel, metabuf);
@@ -783,10 +784,20 @@ _bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedX
783784
* blkno == P_NEW means to get an unallocated index page. The page
784785
* will be initialized before returning it.
785786
*
787+
* The general rule in nbtree is that it's never okay to access a
788+
* page without holding both a buffer pin and a buffer lock on
789+
* the page's buffer.
790+
*
786791
* When this routine returns, the appropriate lock is set on the
787792
* requested buffer and its reference count has been incremented
788793
* (ie, the buffer is "locked and pinned"). Also, we apply
789-
* _bt_checkpage to sanity-check the page (except in P_NEW case).
794+
* _bt_checkpage to sanity-check the page (except in P_NEW case),
795+
* and perform Valgrind client requests that help Valgrind detect
796+
* unsafe page accesses.
797+
*
798+
* Note: raw LockBuffer() calls are disallowed in nbtree; all
799+
* buffer lock requests need to go through wrapper functions such
800+
* as _bt_lockbuf().
790801
*/
791802
Buffer
792803
_bt_getbuf(Relation rel, BlockNumber blkno, int access)
@@ -797,7 +808,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
797808
{
798809
/* Read an existing block of the relation */
799810
buf = ReadBuffer(rel, blkno);
800-
LockBuffer(buf, access);
811+
_bt_lockbuf(rel, buf, access);
801812
_bt_checkpage(rel, buf);
802813
}
803814
else
@@ -837,7 +848,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
837848
if (blkno == InvalidBlockNumber)
838849
break;
839850
buf = ReadBuffer(rel, blkno);
840-
if (ConditionalLockBuffer(buf))
851+
if (_bt_conditionallockbuf(rel, buf))
841852
{
842853
page = BufferGetPage(buf);
843854
if (_bt_page_recyclable(page))
@@ -890,7 +901,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
890901
buf = ReadBuffer(rel, P_NEW);
891902

892903
/* Acquire buffer lock on new page */
893-
LockBuffer(buf, BT_WRITE);
904+
_bt_lockbuf(rel, buf, BT_WRITE);
894905

895906
/*
896907
* Release the file-extension lock; it's now OK for someone else to
@@ -931,9 +942,10 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
931942

932943
Assert(blkno != P_NEW);
933944
if (BufferIsValid(obuf))
934-
LockBuffer(obuf, BUFFER_LOCK_UNLOCK);
945+
_bt_unlockbuf(rel, obuf);
935946
buf = ReleaseAndReadBuffer(obuf, rel, blkno);
936-
LockBuffer(buf, access);
947+
_bt_lockbuf(rel, buf, access);
948+
937949
_bt_checkpage(rel, buf);
938950
return buf;
939951
}
@@ -946,7 +958,102 @@ _bt_relandgetbuf(Relation rel, Buffer obuf, BlockNumber blkno, int access)
946958
void
947959
_bt_relbuf(Relation rel, Buffer buf)
948960
{
949-
UnlockReleaseBuffer(buf);
961+
_bt_unlockbuf(rel, buf);
962+
ReleaseBuffer(buf);
963+
}
964+
965+
/*
966+
* _bt_lockbuf() -- lock a pinned buffer.
967+
*
968+
* Lock is acquired without acquiring another pin. This is like a raw
969+
* LockBuffer() call, but performs extra steps needed by Valgrind.
970+
*
971+
* Note: Caller may need to call _bt_checkpage() with buf when pin on buf
972+
* wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
973+
*/
974+
void
975+
_bt_lockbuf(Relation rel, Buffer buf, int access)
976+
{
977+
/* LockBuffer() asserts that pin is held by this backend */
978+
LockBuffer(buf, access);
979+
980+
/*
981+
* It doesn't matter that _bt_unlockbuf() won't get called in the
982+
* event of an nbtree error (e.g. a unique violation error). That
983+
* won't cause Valgrind false positives.
984+
*
985+
* The nbtree client requests are superimposed on top of the
986+
* bufmgr.c buffer pin client requests. In the event of an nbtree
987+
* error the buffer will certainly get marked as defined when the
988+
* backend once again acquires its first pin on the buffer. (Of
989+
* course, if the backend never touches the buffer again then it
990+
* doesn't matter that it remains non-accessible to Valgrind.)
991+
*
992+
* Note: When an IndexTuple C pointer gets computed using an
993+
* ItemId read from a page while a lock was held, the C pointer
994+
* becomes unsafe to dereference forever as soon as the lock is
995+
* released. Valgrind can only detect cases where the pointer
996+
* gets dereferenced with no _current_ lock/pin held, though.
997+
*/
998+
if (!RelationUsesLocalBuffers(rel))
999+
VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1000+
}
1001+
1002+
/*
1003+
* _bt_unlockbuf() -- unlock a pinned buffer.
1004+
*/
1005+
void
1006+
_bt_unlockbuf(Relation rel, Buffer buf)
1007+
{
1008+
/*
1009+
* Buffer is pinned and locked, which means that it is expected to be
1010+
* defined and addressable. Check that proactively.
1011+
*/
1012+
VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1013+
1014+
/* LockBuffer() asserts that pin is held by this backend */
1015+
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1016+
1017+
if (!RelationUsesLocalBuffers(rel))
1018+
VALGRIND_MAKE_MEM_NOACCESS(BufferGetPage(buf), BLCKSZ);
1019+
}
1020+
1021+
/*
1022+
* _bt_conditionallockbuf() -- conditionally BT_WRITE lock pinned
1023+
* buffer.
1024+
*
1025+
* Note: Caller may need to call _bt_checkpage() with buf when pin on buf
1026+
* wasn't originally acquired in _bt_getbuf() or _bt_relandgetbuf().
1027+
*/
1028+
bool
1029+
_bt_conditionallockbuf(Relation rel, Buffer buf)
1030+
{
1031+
/* ConditionalLockBuffer() asserts that pin is held by this backend */
1032+
if (!ConditionalLockBuffer(buf))
1033+
return false;
1034+
1035+
if (!RelationUsesLocalBuffers(rel))
1036+
VALGRIND_MAKE_MEM_DEFINED(BufferGetPage(buf), BLCKSZ);
1037+
1038+
return true;
1039+
}
1040+
1041+
/*
1042+
* _bt_upgradelockbufcleanup() -- upgrade lock to super-exclusive/cleanup
1043+
* lock.
1044+
*/
1045+
void
1046+
_bt_upgradelockbufcleanup(Relation rel, Buffer buf)
1047+
{
1048+
/*
1049+
* Buffer is pinned and locked, which means that it is expected to be
1050+
* defined and addressable. Check that proactively.
1051+
*/
1052+
VALGRIND_CHECK_MEM_IS_DEFINED(BufferGetPage(buf), BLCKSZ);
1053+
1054+
/* LockBuffer() asserts that pin is held by this backend */
1055+
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1056+
LockBufferForCleanup(buf);
9501057
}
9511058

9521059
/*
@@ -1580,7 +1687,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
15801687
* To avoid deadlocks, we'd better drop the leaf page lock
15811688
* before going further.
15821689
*/
1583-
LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
1690+
_bt_unlockbuf(rel, leafbuf);
15841691

15851692
/*
15861693
* Check that the left sibling of leafbuf (if any) is not
@@ -1616,7 +1723,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
16161723
* (Page deletion can cope with the stack being to the left of
16171724
* leafbuf, but not to the right of leafbuf.)
16181725
*/
1619-
LockBuffer(leafbuf, BT_WRITE);
1726+
_bt_lockbuf(rel, leafbuf, BT_WRITE);
16201727
continue;
16211728
}
16221729

@@ -1970,7 +2077,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
19702077
leafleftsib = opaque->btpo_prev;
19712078
leafrightsib = opaque->btpo_next;
19722079

1973-
LockBuffer(leafbuf, BUFFER_LOCK_UNLOCK);
2080+
_bt_unlockbuf(rel, leafbuf);
19742081

19752082
/*
19762083
* Check here, as calling loops will have locks held, preventing
@@ -2005,7 +2112,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
20052112
* To avoid deadlocks, we'd better drop the target page lock before
20062113
* going further.
20072114
*/
2008-
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
2115+
_bt_unlockbuf(rel, buf);
20092116
}
20102117

20112118
/*
@@ -2022,7 +2129,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
20222129
* table.)
20232130
*/
20242131
if (target != leafblkno)
2025-
LockBuffer(leafbuf, BT_WRITE);
2132+
_bt_lockbuf(rel, leafbuf, BT_WRITE);
20262133
if (leftsib != P_NONE)
20272134
{
20282135
lbuf = _bt_getbuf(rel, leftsib, BT_WRITE);
@@ -2072,7 +2179,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
20722179
* rather than a superexclusive lock, since no scan will stop on an empty
20732180
* page.
20742181
*/
2075-
LockBuffer(buf, BT_WRITE);
2182+
_bt_lockbuf(rel, buf, BT_WRITE);
20762183
page = BufferGetPage(buf);
20772184
opaque = (BTPageOpaque) PageGetSpecialPointer(page);
20782185

src/backend/access/nbtree/nbtree.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1115,7 +1115,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
11151115
*/
11161116
buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
11171117
info->strategy);
1118-
LockBuffer(buf, BT_READ);
1118+
_bt_lockbuf(rel, buf, BT_READ);
11191119
page = BufferGetPage(buf);
11201120
opaque = NULL;
11211121
if (!PageIsNew(page))
@@ -1222,8 +1222,7 @@ btvacuumpage(BTVacState *vstate, BlockNumber scanblkno)
12221222
* course of the vacuum scan, whether or not it actually contains any
12231223
* deletable tuples --- see nbtree/README.
12241224
*/
1225-
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
1226-
LockBufferForCleanup(buf);
1225+
_bt_upgradelockbufcleanup(rel, buf);
12271226

12281227
/*
12291228
* Check whether we need to backtrack to earlier pages. What we are

src/backend/access/nbtree/nbtsearch.c

Lines changed: 11 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -64,7 +64,7 @@ static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir);
6464
static void
6565
_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp)
6666
{
67-
LockBuffer(sp->buf, BUFFER_LOCK_UNLOCK);
67+
_bt_unlockbuf(scan->indexRelation, sp->buf);
6868

6969
if (IsMVCCSnapshot(scan->xs_snapshot) &&
7070
RelationNeedsWAL(scan->indexRelation) &&
@@ -187,14 +187,13 @@ _bt_search(Relation rel, BTScanInsert key, Buffer *bufP, int access,
187187
if (access == BT_WRITE && page_access == BT_READ)
188188
{
189189
/* trade in our read lock for a write lock */
190-
LockBuffer(*bufP, BUFFER_LOCK_UNLOCK);
191-
LockBuffer(*bufP, BT_WRITE);
190+
_bt_unlockbuf(rel, *bufP);
191+
_bt_lockbuf(rel, *bufP, BT_WRITE);
192192

193193
/*
194-
* If the page was split between the time that we surrendered our read
195-
* lock and acquired our write lock, then this page may no longer be
196-
* the right place for the key we want to insert. In this case, we
197-
* need to move right in the tree.
194+
* Race -- the leaf page may have split after we dropped the read lock
195+
* but before we acquired a write lock. If it has, we may need to
196+
* move right to its new sibling. Do that.
198197
*/
199198
*bufP = _bt_moveright(rel, key, *bufP, true, stack_in, BT_WRITE,
200199
snapshot);
@@ -289,8 +288,8 @@ _bt_moveright(Relation rel,
289288
/* upgrade our lock if necessary */
290289
if (access == BT_READ)
291290
{
292-
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
293-
LockBuffer(buf, BT_WRITE);
291+
_bt_unlockbuf(rel, buf);
292+
_bt_lockbuf(rel, buf, BT_WRITE);
294293
}
295294

296295
if (P_INCOMPLETE_SPLIT(opaque))
@@ -1413,7 +1412,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir)
14131412
* There's no actually-matching data on this page. Try to advance to
14141413
* the next page. Return false if there's no matching data at all.
14151414
*/
1416-
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1415+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
14171416
if (!_bt_steppage(scan, dir))
14181417
return false;
14191418
}
@@ -2061,7 +2060,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir)
20612060
* deleted.
20622061
*/
20632062
if (BTScanPosIsPinned(so->currPos))
2064-
LockBuffer(so->currPos.buf, BT_READ);
2063+
_bt_lockbuf(rel, so->currPos.buf, BT_READ);
20652064
else
20662065
so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ);
20672066

@@ -2439,7 +2438,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir)
24392438
* There's no actually-matching data on this page. Try to advance to
24402439
* the next page. Return false if there's no matching data at all.
24412440
*/
2442-
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
2441+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
24432442
if (!_bt_steppage(scan, dir))
24442443
return false;
24452444
}

src/backend/access/nbtree/nbtutils.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1744,7 +1744,7 @@ _bt_killitems(IndexScanDesc scan)
17441744
* LSN.
17451745
*/
17461746
droppedpin = false;
1747-
LockBuffer(so->currPos.buf, BT_READ);
1747+
_bt_lockbuf(scan->indexRelation, so->currPos.buf, BT_READ);
17481748

17491749
page = BufferGetPage(so->currPos.buf);
17501750
}
@@ -1885,7 +1885,7 @@ _bt_killitems(IndexScanDesc scan)
18851885
MarkBufferDirtyHint(so->currPos.buf, true);
18861886
}
18871887

1888-
LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
1888+
_bt_unlockbuf(scan->indexRelation, so->currPos.buf);
18891889
}
18901890

18911891

0 commit comments

Comments (0)