From da189bc515cf8a7d6384cc50cb9484d06c678a2d Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 7 Mar 2025 17:39:23 +0100 Subject: [PATCH 1/5] IOS/TableAM: Support AM-specific fast visibility tests Previously, we assumed VM_ALL_VISIBLE is universal across all AMs. This is probably not the case, so we introduce a new table method called "table_index_vischeck_tuples" which allows anyone to ask the AM whether a tuple is definitely visible to everyone or might be invisible to someone. The API is intended to replace direct calls to VM_ALL_VISIBLE and as such doesn't include "definitely dead to everyone", as the Heap AM's VM doesn't support *definitely dead* as output for its lookups; and thus it would be too expensive for the Heap AM to produce such results. A future commit will use this inside GIST and SP-GIST to fix a race condition between IOS and VACUUM, which causes a bug with tuple visibility, and a further patch will add support for this to nbtree. --- src/backend/access/heap/heapam.c | 177 +++++++++++++++++++++++ src/backend/access/heap/heapam_handler.c | 1 + src/backend/access/heap/visibilitymap.c | 39 ++--- src/backend/access/index/indexam.c | 6 + src/backend/access/table/tableamapi.c | 1 + src/backend/executor/nodeIndexonlyscan.c | 83 +++++++---- src/backend/utils/adt/selfuncs.c | 76 ++++++---- src/include/access/heapam.h | 2 + src/include/access/relscan.h | 5 + src/include/access/tableam.h | 103 +++++++++++++ src/include/access/visibilitymapdefs.h | 19 +++ 11 files changed, 430 insertions(+), 82 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 2be7f817c78a..3d0277624cc0 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -101,11 +101,37 @@ static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status uint16 infomask, Relation rel, int *remaining, bool logLockFailure); static void index_delete_sort(TM_IndexDeleteOp *delstate); +static inline int heap_ivc_process_block(Relation rel, Buffer *vmbuf, + TM_VisCheck *checks, int nchecks); +static void heap_ivc_process_all(Relation rel, Buffer *vmbuf, + TM_VisCheck *checks, int nchecks); static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate); static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup); static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy); +/* sort template definitions for index */ +#define ST_SORT heap_ivc_sortby_tidheapblk +#define ST_ELEMENT_TYPE TM_VisCheck +#define ST_DECLARE +#define ST_DEFINE +#define ST_SCOPE static inline +#define ST_COMPARE(a, b) ( \ + a->tidblkno < b->tidblkno ? -1 : ( \ + a->tidblkno > b->tidblkno ? 1 : 0 \ + ) \ +) + +#include "lib/sort_template.h" + +#define ST_SORT heap_ivc_sortby_idx +#define ST_ELEMENT_TYPE TM_VisCheck +#define ST_DECLARE +#define ST_DEFINE +#define ST_SCOPE static inline +#define ST_COMPARE(a, b) (((int) a->idxoffnum) - ((int) b->idxoffnum)) +#include "lib/sort_template.h" + /* * Each tuple lock mode has a corresponding heavyweight lock, and one or two @@ -8779,6 +8805,157 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate) return nblocksfavorable; } +/* + * heapam implementation of tableam's index_vischeck_tuples interface. + * + * This helper function is called by index AMs during index-only scans, + * to do VM-based visibility checks on individual tuples, so that the AM + * can hold the tuple in memory for e.g. 
reordering, for extended periods of
+ * time without holding thousands of pins that would conflict with VACUUM.
+ *
+ * It's possible for this to generate a fair amount of I/O, since we may be
+ * checking hundreds of tuples from a single index block, but that is
+ * preferred over holding thousands of pins.
+ *
+ * We use heuristics to balance the costs of sorting TIDs with VM page
+ * lookups.
+ */
+void
+heap_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	Buffer		vmbuf = *checkop->vmbuf;
+	Buffer		storvmbuf = vmbuf;
+	TM_VisCheck *checks = checkop->checktids;
+	int			checkntids = checkop->checkntids;
+	int			upcomingvmbufchanges = 0;
+
+	/*
+	 * The first index scan will have to pin the VM buffer, and that first
+	 * change in the VM buffer shouldn't put us into the expensive VM page &
+	 * sort path; so we special-case this operation.
+	 */
+	if (!BufferIsValid(vmbuf))
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+		checkntids -= processed;
+		checks += processed;
+		storvmbuf = vmbuf;
+		Assert(processed > 0);
+	}
+
+	while (vmbuf == storvmbuf && checkntids > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, &vmbuf, checks, checkntids);
+
+		Assert(processed <= checkntids);
+
+		checkntids -= processed;
+		checks += processed;
+	}
+
+	*checkop->vmbuf = vmbuf;
+
+	if (checkntids == 0)
+	{
+		return;
+	}
+
+	upcomingvmbufchanges = 0;
+
+	for (int i = 1; i < checkntids; i++)
+	{
+		/*
+		 * Instead of storing the previous iteration's result, we only
+		 * compare the block numbers.
+		 */
+		BlockNumber lastblkno = checks[i - 1].tidblkno;
+		BlockNumber newblkno = checks[i].tidblkno;
+		/*
+		 * divide-by-constant can be faster than BufferGetBlockNumber()
+		 */
+		BlockNumber lastvmblkno = HEAPBLK_TO_VMBLOCK(lastblkno);
+		BlockNumber newvmblkno = HEAPBLK_TO_VMBLOCK(newblkno);
+
+		if (lastvmblkno != newvmblkno)
+			upcomingvmbufchanges++;
+	}
+
+	if (upcomingvmbufchanges <= pg_ceil_log2_32(checkntids))
+	{
+		/*
+		 * Only a few VM buffer changes ahead, so do all visibility checks
+		 * without sorting.
+		 */
+		heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+		return;
+	}
+
+	/*
+	 * Order the TIDs to heap order, so that we will only need to visit every
+	 * VM page at most once.
+	 */
+	heap_ivc_sortby_tidheapblk(checks, checkntids);
+
+	/* do all visibility checks */
+	heap_ivc_process_all(rel, checkop->vmbuf, checks, checkntids);
+
+	/* put the checks back in index order */
+	heap_ivc_sortby_idx(checks, checkntids);
+}
+
+
+static inline int
+heap_ivc_process_block(Relation rel, Buffer *vmbuf, TM_VisCheck *checks,
+					   int nchecks)
+{
+	BlockNumber blkno;
+	BlockNumber prevblkno = blkno = checks->tidblkno;
+	TMVC_Result result;
+	int			processed = 0;
+
+	if (VM_ALL_VISIBLE(rel, blkno, vmbuf))
+		result = TMVC_Visible;
+	else
+		result = TMVC_MaybeVisible;
+
+	do
+	{
+		checks->vischeckresult = result;
+
+		nchecks--;
+		processed++;
+		checks++;
+
+		if (nchecks <= 0)
+			return processed;
+
+		blkno = checks->tidblkno;
+	} while (blkno == prevblkno);
+
+	return processed;
+}
+
+static void
+heap_ivc_process_all(Relation rel, Buffer *vmbuf,
+					 TM_VisCheck *checks, int nchecks)
+{
+	while (nchecks > 0)
+	{
+		int			processed;
+
+		processed = heap_ivc_process_block(rel, vmbuf, checks, nchecks);
+
+		Assert(processed <= nchecks);
+
+		nchecks -= processed;
+		checks += processed;
+	}
+}
+
 /*
  * Perform XLogInsert for a heap-visible operation.  'block' is the block
'block' is the block * being marked all-visible, and vm_buffer is the buffer containing the diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index ac082fefa77a..fe4b0b39da7f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -2648,6 +2648,7 @@ static const TableAmRoutine heapam_methods = { .tuple_tid_valid = heapam_tuple_tid_valid, .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot, .index_delete_tuples = heap_index_delete_tuples, + .index_vischeck_tuples = heap_index_vischeck_tuples, .relation_set_new_filelocator = heapam_relation_set_new_filelocator, .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate, diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 745a04ef26e2..ae71c0a6d6eb 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -107,17 +107,6 @@ */ #define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData)) -/* Number of heap blocks we can represent in one byte */ -#define HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) - -/* Number of heap blocks we can represent in one visibility map page. */ -#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE) - -/* Mapping from heap block number to the right bit in the visibility map */ -#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE) -#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE) -#define HEAPBLK_TO_OFFSET(x) (((x) % HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK) - /* Masks for counting subsets of bits in the visibility map. */ #define VISIBLE_MASK8 (0x55) /* The lower bit of each bit pair */ #define FROZEN_MASK8 (0xaa) /* The upper bit of each bit pair */ @@ -137,9 +126,9 @@ static Buffer vm_extend(Relation rel, BlockNumber vm_nblocks); bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - int mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); + int mapByte = HEAPBLK_TO_VMBYTE(heapBlk); + int mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk); uint8 mask = flags << mapOffset; char *map; bool cleared = false; @@ -190,7 +179,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); /* Reuse the old pinned buffer if possible */ if (BufferIsValid(*vmbuf)) @@ -214,7 +203,7 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) bool visibilitymap_pin_ok(BlockNumber heapBlk, Buffer vmbuf) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); return BufferIsValid(vmbuf) && BufferGetBlockNumber(vmbuf) == mapBlock; } @@ -247,9 +236,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, XLogRecPtr recptr, Buffer vmBuf, TransactionId cutoff_xid, uint8 flags) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_VMBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk); Page page; uint8 *map; uint8 status; @@ -340,9 
+329,9 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, Buffer heapBuf, uint8 visibilitymap_get_status(Relation rel, BlockNumber heapBlk, Buffer *vmbuf) { - BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk); - uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk); - uint8 mapOffset = HEAPBLK_TO_OFFSET(heapBlk); + BlockNumber mapBlock = HEAPBLK_TO_VMBLOCK(heapBlk); + uint32 mapByte = HEAPBLK_TO_VMBYTE(heapBlk); + uint8 mapOffset = HEAPBLK_TO_VMOFFSET(heapBlk); char *map; uint8 result; @@ -445,9 +434,9 @@ visibilitymap_prepare_truncate(Relation rel, BlockNumber nheapblocks) BlockNumber newnblocks; /* last remaining block, byte, and bit */ - BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); - uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); - uint8 truncOffset = HEAPBLK_TO_OFFSET(nheapblocks); + BlockNumber truncBlock = HEAPBLK_TO_VMBLOCK(nheapblocks); + uint32 truncByte = HEAPBLK_TO_VMBYTE(nheapblocks); + uint8 truncOffset = HEAPBLK_TO_VMOFFSET(nheapblocks); #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 219df1971da6..61d1f08220de 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -628,6 +628,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + /* + * Reset xs_visrecheck, so we don't confuse the next tuple's visibility + * state with that of the previous. + */ + scan->xs_visrecheck = TMVC_Unchecked; + /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_heaptid. It should also set diff --git a/src/backend/access/table/tableamapi.c b/src/backend/access/table/tableamapi.c index 476663b66aad..b3ce90ceaeaa 100644 --- a/src/backend/access/table/tableamapi.c +++ b/src/backend/access/table/tableamapi.c @@ -61,6 +61,7 @@ GetTableAmRoutine(Oid amhandler) Assert(routine->tuple_get_latest_tid != NULL); Assert(routine->tuple_satisfies_snapshot != NULL); Assert(routine->index_delete_tuples != NULL); + Assert(routine->index_vischeck_tuples != NULL); Assert(routine->tuple_insert != NULL); diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index f464cca9507a..e02fc1652ffd 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -121,6 +121,7 @@ IndexOnlyNext(IndexOnlyScanState *node) while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { bool tuple_from_heap = false; + TMVC_Result vischeck = scandesc->xs_visrecheck; CHECK_FOR_INTERRUPTS(); @@ -128,6 +129,9 @@ IndexOnlyNext(IndexOnlyScanState *node) * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, * we'll use the index tuple not the heap tuple as the data source. + * The index may have already pre-checked the visibility of the tuple + * for us, and stored the result in xs_visrecheck, in which case we + * can skip the call. * * Note on Memory Ordering Effects: visibilitymap_get_status does not * lock the visibility map buffer, and therefore the result we read @@ -157,37 +161,60 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. 
+		 *
+		 * The index doing these checks for us doesn't materially change these
+		 * considerations.
 		 */
-		if (!VM_ALL_VISIBLE(scandesc->heapRelation,
-							ItemPointerGetBlockNumber(tid),
-							&node->ioss_VMBuffer))
-		{
-			/*
-			 * Rats, we have to visit the heap to check visibility.
-			 */
-			InstrCountTuples2(node, 1);
-			if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
-				continue;		/* no visible tuple, try next index entry */
+		if (vischeck == TMVC_Unchecked)
+			vischeck = table_index_vischeck_tuple(scandesc->heapRelation,
+												  &node->ioss_VMBuffer,
+												  tid);
 
-			ExecClearTuple(node->ioss_TableSlot);
-
-			/*
-			 * Only MVCC snapshots are supported here, so there should be no
-			 * need to keep following the HOT chain once a visible entry has
-			 * been found.  If we did want to allow that, we'd need to keep
-			 * more state to remember not to call index_getnext_tid next time.
-			 */
-			if (scandesc->xs_heap_continue)
-				elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+		Assert(vischeck != TMVC_Unchecked);
 
-			/*
-			 * Note: at this point we are holding a pin on the heap page, as
-			 * recorded in scandesc->xs_cbuf.  We could release that pin now,
-			 * but it's not clear whether it's a win to do so.  The next index
-			 * entry might require a visit to the same heap page.
-			 */
-
-			tuple_from_heap = true;
+		switch (vischeck)
+		{
+			case TMVC_Unchecked:
+				elog(ERROR, "failed to check visibility for tuple");
+				/*
+				 * For compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
+				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/*
+				 * Rats, we have to visit the heap to check visibility.
+				 */
+				InstrCountTuples2(node, 1);
+				if (!index_fetch_heap(scandesc, node->ioss_TableSlot))
+					continue;	/* no visible tuple, try next index entry */
+
+				ExecClearTuple(node->ioss_TableSlot);
+
+				/*
+				 * Only MVCC snapshots are supported here, so there should be
+				 * no need to keep following the HOT chain once a visible
+				 * entry has been found.  If we did want to allow that, we'd
+				 * need to keep more state to remember not to call
+				 * index_getnext_tid next time.
+				 */
+				if (scandesc->xs_heap_continue)
+					elog(ERROR, "non-MVCC snapshots are not supported in index-only scans");
+
+				/*
+				 * Note: at this point we are holding a pin on the heap page,
+				 * as recorded in scandesc->xs_cbuf.  We could release that
+				 * pin now, but it's not clear whether it's a win to do so.
+				 * The next index entry might require a visit to the same heap
+				 * page.
+				 */
+
+				tuple_from_heap = true;
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index a96b1b9c0bc6..035bd7a82be0 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6730,44 +6730,62 @@ get_actual_variable_endpoint(Relation heapRel,
 	while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL)
 	{
 		BlockNumber block = ItemPointerGetBlockNumber(tid);
+		TMVC_Result visres = index_scan->xs_visrecheck;
 
-		if (!VM_ALL_VISIBLE(heapRel,
-							block,
-							&vmbuffer))
+		if (visres == TMVC_Unchecked)
+			visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid);
+
+		Assert(visres != TMVC_Unchecked);
+
+		switch (visres)
 		{
-			/* Rats, we have to visit the heap to check visibility */
-			if (!index_fetch_heap(index_scan, tableslot))
-			{
+			case TMVC_Unchecked:
+				elog(ERROR, "failed to check visibility for tuple");
 				/*
-				 * No visible tuple for this index entry, so we need to
-				 * advance to the next entry.  Before doing so, count heap
-				 * page fetches and give up if we've done too many.
-				 *
-				 * We don't charge a page fetch if this is the same heap page
-				 * as the previous tuple.  This is on the conservative side,
-				 * since other recently-accessed pages are probably still in
-				 * buffers too; but it's good enough for this heuristic.
+				 * For compilers that don't understand that elog(ERROR)
+				 * doesn't return, and which have -Wimplicit-fallthrough:
 				 */
+				/* fallthrough */
+			case TMVC_MaybeVisible:
+			{
+				/* Rats, we have to visit the heap to check visibility */
+				if (!index_fetch_heap(index_scan, tableslot))
+				{
+					/*
+					 * No visible tuple for this index entry, so we need to
+					 * advance to the next entry.  Before doing so, count heap
+					 * page fetches and give up if we've done too many.
+					 *
+					 * We don't charge a page fetch if this is the same heap
+					 * page as the previous tuple.  This is on the
+					 * conservative side, since other recently-accessed pages
+					 * are probably still in buffers too; but it's good enough
+					 * for this heuristic.
+					 */
 #define VISITED_PAGES_LIMIT 100
-			if (block != last_heap_block)
-			{
-				last_heap_block = block;
-				n_visited_heap_pages++;
-				if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
-					break;
-			}
+					if (block != last_heap_block)
+					{
+						last_heap_block = block;
+						n_visited_heap_pages++;
+						if (n_visited_heap_pages > VISITED_PAGES_LIMIT)
+							break;
+					}
 
-				continue;		/* no visible tuple, try next index entry */
-			}
+					continue;	/* no visible tuple, try next index entry */
+				}
 
-			/* We don't actually need the heap tuple for anything */
-			ExecClearTuple(tableslot);
+				/* We don't actually need the heap tuple for anything */
+				ExecClearTuple(tableslot);
 
-			/*
-			 * We don't care whether there's more than one visible tuple in
-			 * the HOT chain; if any are visible, that's good enough.
-			 */
+				/*
+				 * We don't care whether there's more than one visible tuple
+				 * in the HOT chain; if any are visible, that's good enough.
+				 */
+				break;
+			}
+			case TMVC_Visible:
+				break;
 		}
 
 		/*
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index e48fe434cd39..1b66aa0bacc6 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -368,6 +368,8 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 
 extern TransactionId heap_index_delete_tuples(Relation rel,
 											  TM_IndexDeleteOp *delstate);
+extern void heap_index_vischeck_tuples(Relation rel,
+									   TM_IndexVisibilityCheckOp *checkop);
 
 /* in heap/pruneheap.c */
 struct GlobalVisState;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index b5e0fb386c0a..93a6f65ab0e3 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -26,6 +26,9 @@
 
 struct ParallelTableScanDescData;
 
+enum TMVC_Result;
+
+
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
  * which needs to be embedded in the scans of individual AMs.
@@ -176,6 +179,8 @@ typedef struct IndexScanDescData
 
 	bool		xs_recheck;		/* T means scan keys must be rechecked */
 
+	int			xs_visrecheck;	/* TMVC_Result from tableam.h */
+
 	/*
 	 * When fetching with an ordering operator, the values of the ORDER BY
 	 * expressions of the last returned tuple, according to the index.  If
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index 8713e12cbfb9..47666cf96eaa 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -248,6 +248,63 @@ typedef struct TM_IndexDeleteOp
 	TM_IndexStatus *status;
 } TM_IndexDeleteOp;
 
+/*
+ * State used when calling table_index_vischeck_tuples()
+ *
+ * Index-only scans need to know the visibility of the associated table tuples
+ * before they can return the index tuple.  If the index tuple is known to be
+ * visible with a cheap check, we can return it directly without requesting
+ * the visibility info from the table AM.
+ *
+ * This table AM API exposes cheap visibility checks to index AMs, allowing
+ * an index AM to check the visibility of multiple tuples at once and to
+ * store the results, improving the pinning ergonomics of index AMs by
+ * allowing a scan to cache index tuples in memory without holding pins on
+ * those index tuples' pages until the index tuples are returned.
+ *
+ * The AM is called with a list of TIDs, and its output will indicate the
+ * visibility state of each tuple: Unchecked, MaybeVisible, or Visible.
+ *
+ * HeapAM's implementation of visibility maps only allows for cheap checks of
+ * *definitely visible*; all other results are *maybe visible*.  A result for
+ * *definitely not visible* (i.e. dead) is currently not provided, for lack
+ * of table AMs which could support such visibility lookups cheaply.
+ */
+typedef enum TMVC_Result
+{
+	TMVC_Unchecked,
+	TMVC_Visible,
+	TMVC_MaybeVisible,
+} TMVC_Result;
+
+typedef struct TM_VisCheck
+{
+	/* table TID from index tuple */
+	BlockNumber tidblkno;
+	uint16		tidoffset;
+	/* identifier for the TID in this visibility check operation context */
+	OffsetNumber idxoffnum;
+	/* the result of the visibility check operation */
+	TMVC_Result vischeckresult;
+} TM_VisCheck;
+
+static inline void
+PopulateTMVischeck(TM_VisCheck *check, ItemPointer tid, OffsetNumber idxoff)
+{
+	Assert(ItemPointerIsValid(tid));
+	check->tidblkno = ItemPointerGetBlockNumberNoCheck(tid);
+	check->tidoffset = ItemPointerGetOffsetNumberNoCheck(tid);
+	check->idxoffnum = idxoff;
+	check->vischeckresult = TMVC_Unchecked;
+}
+
+typedef struct TM_IndexVisibilityCheckOp
+{
+	int			checkntids;		/* number of TIDs to check */
+	Buffer	   *vmbuf;			/* pointer to VM buffer to reuse across calls */
+	TM_VisCheck *checktids;		/* the checks to execute */
+} TM_IndexVisibilityCheckOp;
+
 /* "options" flag bits for table_tuple_insert */
 /* TABLE_INSERT_SKIP_WAL was 0x0001; RelationNeedsWAL() now governs */
 #define TABLE_INSERT_SKIP_FSM	0x0002
@@ -494,6 +551,10 @@ typedef struct TableAmRoutine
 	TransactionId (*index_delete_tuples) (Relation rel,
 										  TM_IndexDeleteOp *delstate);
 
+	/* see table_index_vischeck_tuples() */
+	void		(*index_vischeck_tuples) (Relation rel,
+										  TM_IndexVisibilityCheckOp *checkop);
+
 
 	/* ------------------------------------------------------------------------
 	 * Manipulations of physical tuples.
@@ -1318,6 +1379,48 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
 	return rel->rd_tableam->index_delete_tuples(rel, delstate);
 }
 
+/*
+ * Determine rough visibility information of index tuples based on each TID.
+ *
+ * Determines which entries from index AM caller's TM_IndexVisibilityCheckOp
+ * state point to TMVC_Visible or TMVC_MaybeVisible table tuples, at low IO
+ * overhead.  For the heap AM, the implementation is effectively a wrapper
+ * around VM_ALL_VISIBLE.
+ *
+ * On return, all TM_VisChecks indicated by checkop->checktids will have been
+ * updated with the correct visibility status.
+ *
+ * Note that there is no value for "definitely dead" tuples, as the Heap AM
+ * doesn't have an efficient method to determine that a tuple is dead to all
+ * users: that would require actually visiting the heap.  If and when AMs are
+ * built that would support VM checks with an equivalent to VM_ALL_DEAD, this
+ * decision can be reconsidered.
+ */
+static inline void
+table_index_vischeck_tuples(Relation rel, TM_IndexVisibilityCheckOp *checkop)
+{
+	rel->rd_tableam->index_vischeck_tuples(rel, checkop);
+}
+
+static inline TMVC_Result
+table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid)
+{
+	TM_IndexVisibilityCheckOp checkOp;
+	TM_VisCheck op;
+
+	PopulateTMVischeck(&op, tid, 0);
+
+	checkOp.checktids = &op;
+	checkOp.checkntids = 1;
+	checkOp.vmbuf = vmbuffer;
+
+	rel->rd_tableam->index_vischeck_tuples(rel, &checkOp);
+
+	Assert(op.vischeckresult != TMVC_Unchecked);
+
+	return op.vischeckresult;
+}
+
 
 /* ----------------------------------------------------------------------------
  *  Functions for manipulations of physical tuples.
diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h
index 5ad5c0208779..c75303f63fd4 100644
--- a/src/include/access/visibilitymapdefs.h
+++ b/src/include/access/visibilitymapdefs.h
@@ -12,6 +12,7 @@
  */
 #ifndef VISIBILITYMAPDEFS_H
 #define VISIBILITYMAPDEFS_H
+#include "storage/bufpage.h"
 
 /* Number of bits for one heap page */
 #define BITS_PER_HEAPBLOCK 2
@@ -31,4 +32,22 @@
 #define VISIBILITYMAP_XLOG_CATALOG_REL	0x04
 #define VISIBILITYMAP_XLOG_VALID_BITS	(VISIBILITYMAP_VALID_BITS | VISIBILITYMAP_XLOG_CATALOG_REL)
 
+/*
+ * Size of the bitmap on each visibility map page, in bytes.  There are no
+ * extra headers, so the whole page minus the standard page header is
+ * used for the bitmap.
+ */
+#define VM_MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))
+
+/* Number of heap blocks we can represent in one byte */
+#define VM_HEAPBLOCKS_PER_BYTE (BITS_PER_BYTE / BITS_PER_HEAPBLOCK)
+
+/* Number of heap blocks we can represent in one visibility map page. */
+#define VM_HEAPBLOCKS_PER_PAGE (VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE)
+
+/* Mapping from heap block number to the right bit in the visibility map */
+#define HEAPBLK_TO_VMBLOCK(x) ((x) / VM_HEAPBLOCKS_PER_PAGE)
+#define HEAPBLK_TO_VMBYTE(x) (((x) % VM_HEAPBLOCKS_PER_PAGE) / VM_HEAPBLOCKS_PER_BYTE)
+#define HEAPBLK_TO_VMOFFSET(x) (((x) % VM_HEAPBLOCKS_PER_BYTE) * BITS_PER_HEAPBLOCK)
+
 #endif							/* VISIBILITYMAPDEFS_H */

From 760da724a430efe019e861234e9e61b0172ebf69 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Fri, 7 Mar 2025 22:55:24 +0100
Subject: [PATCH 2/5] GIST: Fix visibility issues in IOS

Previously, GiST index-only scans could buffer tuples from index pages
while VACUUM concurrently removed an ALL_DEAD tuple and marked the
tuple's heap page ALL_VISIBLE again, making IOS mistakenly believe the
removed tuple was still visible.

With this patch, pins now conflict with GiST vacuum, and we do
preliminary visibility checks for IOS, so that the IOS infrastructure
knows to recheck the heap page even if that page has since been marked
ALL_VISIBLE.
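As a reviewer aid (not itself part of the patch), here is a minimal
sketch of how an index AM is expected to drive the visibility-check API
added by the previous commit; heaprel, tid0 and tid1 are hypothetical
stand-ins for the scan's heap relation and two TIDs buffered from an
index page:

    /* Hypothetical usage sketch, not part of this patch. */
    TM_IndexVisibilityCheckOp op;
    TM_VisCheck checks[2];
    Buffer      vmbuf = InvalidBuffer;

    /* tid0/tid1: TIDs the AM buffered while the index page was pinned */
    PopulateTMVischeck(&checks[0], &tid0, 0);
    PopulateTMVischeck(&checks[1], &tid1, 1);

    op.checkntids = 2;
    op.checktids = checks;
    op.vmbuf = &vmbuf;          /* VM buffer reused across calls */

    /* batch-check visibility before dropping the index page pin */
    table_index_vischeck_tuples(heaprel, &op);

    for (int i = 0; i < op.checkntids; i++)
    {
        if (checks[i].vischeckresult == TMVC_Visible)
            ;   /* definitely all-visible: IOS may skip the heap fetch */
        else
            ;   /* TMVC_MaybeVisible: executor must recheck the heap */
    }

    if (BufferIsValid(vmbuf))
        ReleaseBuffer(vmbuf);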
Idea from Heikki Linnakangas --- src/backend/access/gist/gistget.c | 163 ++++++++++++++++++++++----- src/backend/access/gist/gistscan.c | 11 +- src/backend/access/gist/gistvacuum.c | 6 +- src/include/access/gist_private.h | 27 ++++- 4 files changed, 168 insertions(+), 39 deletions(-) diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 387d99723453..16fda28d4ad0 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -17,6 +17,7 @@ #include "access/genam.h" #include "access/gist_private.h" #include "access/relscan.h" +#include "access/tableam.h" #include "lib/pairingheap.h" #include "miscadmin.h" #include "pgstat.h" @@ -394,10 +395,14 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, return; } - so->nPageData = so->curPageData = 0; + if (scan->numberOfOrderBys) + so->os.nsortData = 0; + else + so->nos.nPageData = so->nos.curPageData = 0; + scan->xs_hitup = NULL; /* might point into pageDataCxt */ - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + if (so->nos.pageDataCxt) + MemoryContextReset(so->nos.pageDataCxt); /* * We save the LSN of the page as we read it, so that we know whether it @@ -457,9 +462,9 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, /* * Non-ordered scan, so report tuples in so->pageData[] */ - so->pageData[so->nPageData].heapPtr = it->t_tid; - so->pageData[so->nPageData].recheck = recheck; - so->pageData[so->nPageData].offnum = i; + so->nos.pageData[so->nos.nPageData].heapPtr = it->t_tid; + so->nos.pageData[so->nos.nPageData].recheck = recheck; + so->nos.pageData[so->nos.nPageData].offnum = i; /* * In an index-only scan, also fetch the data from the tuple. The @@ -467,12 +472,12 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, */ if (scan->xs_want_itup) { - oldcxt = MemoryContextSwitchTo(so->pageDataCxt); - so->pageData[so->nPageData].recontup = + oldcxt = MemoryContextSwitchTo(so->nos.pageDataCxt); + so->nos.pageData[so->nos.nPageData].recontup = gistFetchTuple(giststate, r, it); MemoryContextSwitchTo(oldcxt); } - so->nPageData++; + so->nos.nPageData++; } else { @@ -501,7 +506,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, * In an index-only scan, also fetch the data from the tuple. */ if (scan->xs_want_itup) + { item->data.heap.recontup = gistFetchTuple(giststate, r, it); + so->os.sortData[so->os.nsortData] = &item->data.heap; + so->os.nsortData += 1; + } } else { @@ -526,7 +535,101 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } } - UnlockReleaseBuffer(buffer); + /* Allow writes to the buffer, but don't yet allow VACUUM */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* + * If we're in an index-only scan, we need to do visibility checks before + * we release the pin, so that VACUUM can't clean up dead tuples from this + * index page and mark the page ALL_VISIBLE before the tuple was returned. + * + * See also docs section "Index Locking Considerations". 
+ */ + if (scan->xs_want_itup) + { + TM_IndexVisibilityCheckOp op; + op.vmbuf = &so->vmbuf; + + if (scan->numberOfOrderBys > 0) + { + op.checkntids = so->os.nsortData; + + if (op.checkntids > 0) + { + op.checktids = palloc(op.checkntids * sizeof(TM_VisCheck)); + + for (int off = 0; off < op.checkntids; off++) + { + Assert(ItemPointerIsValid(&so->os.sortData[off]->heapPtr)); + + PopulateTMVischeck(&op.checktids[off], + &so->os.sortData[off]->heapPtr, + off); + } + } + } + else + { + op.checkntids = so->nos.nPageData; + + if (op.checkntids > 0) + { + op.checktids = palloc_array(TM_VisCheck, op.checkntids); + + for (int off = 0; off < op.checkntids; off++) + { + Assert(ItemPointerIsValid(&so->nos.pageData[off].heapPtr)); + + PopulateTMVischeck(&op.checktids[off], + &so->nos.pageData[off].heapPtr, + off); + } + } + } + + if (op.checkntids > 0) + { + table_index_vischeck_tuples(scan->heapRelation, &op); + + if (scan->numberOfOrderBys > 0) + { + for (int off = 0; off < op.checkntids; off++) + { + TM_VisCheck *check = &op.checktids[off]; + GISTSearchHeapItem *item = so->os.sortData[check->idxoffnum]; + + /* sanity checks */ + Assert(check->idxoffnum < op.checkntids); + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapPtr)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapPtr)); + + item->visrecheck = check->vischeckresult; + } + /* reset state */ + so->os.nsortData = 0; + } + else + { + for (int off = 0; off < op.checkntids; off++) + { + TM_VisCheck *check = &op.checktids[off]; + GISTSearchHeapItem *item = &so->nos.pageData[check->idxoffnum]; + + Assert(check->idxoffnum < op.checkntids); + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapPtr)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapPtr)); + + item->visrecheck = check->vischeckresult; + } + } + + /* clean up the used resources */ + pfree(op.checktids); + } + } + + /* Allow VACUUM to process the buffer again */ + ReleaseBuffer(buffer); } /* @@ -588,7 +691,10 @@ getNextNearest(IndexScanDesc scan) /* in an index-only scan, also return the reconstructed tuple. 
*/ if (scan->xs_want_itup) + { scan->xs_hitup = item->data.heap.recontup; + scan->xs_visrecheck = item->data.heap.visrecheck; + } res = true; } else @@ -629,10 +735,10 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) scan->instrument->nsearches++; so->firstCall = false; - so->curPageData = so->nPageData = 0; + so->nos.curPageData = so->nos.nPageData = 0; scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + if (so->nos.pageDataCxt) + MemoryContextReset(so->nos.pageDataCxt); fakeItem.blkno = GIST_ROOT_BLKNO; memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); @@ -649,9 +755,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) /* Fetch tuples index-page-at-a-time */ for (;;) { - if (so->curPageData < so->nPageData) + if (so->nos.curPageData < so->nos.nPageData) { - if (scan->kill_prior_tuple && so->curPageData > 0) + if (scan->kill_prior_tuple && so->nos.curPageData > 0) { if (so->killedItems == NULL) @@ -667,17 +773,20 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) } if (so->numKilled < MaxIndexTuplesPerPage) so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; + so->nos.pageData[so->nos.curPageData - 1].offnum; } /* continuing to return tuples from a leaf page */ - scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; - scan->xs_recheck = so->pageData[so->curPageData].recheck; + scan->xs_heaptid = so->nos.pageData[so->nos.curPageData].heapPtr; + scan->xs_recheck = so->nos.pageData[so->nos.curPageData].recheck; /* in an index-only scan, also return the reconstructed tuple */ if (scan->xs_want_itup) - scan->xs_hitup = so->pageData[so->curPageData].recontup; + { + scan->xs_hitup = so->nos.pageData[so->nos.curPageData].recontup; + scan->xs_visrecheck = so->nos.pageData[so->nos.curPageData].visrecheck; + } - so->curPageData++; + so->nos.curPageData++; return true; } @@ -687,8 +796,8 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) * necessary */ if (scan->kill_prior_tuple - && so->curPageData > 0 - && so->curPageData == so->nPageData) + && so->nos.curPageData > 0 + && so->nos.curPageData == so->nos.nPageData) { if (so->killedItems == NULL) @@ -704,7 +813,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) } if (so->numKilled < MaxIndexTuplesPerPage) so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; + so->nos.pageData[so->nos.curPageData - 1].offnum; } /* find and process the next index page */ do @@ -733,7 +842,7 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) gistScanPage(scan, item, item->distances, NULL, NULL); pfree(item); - } while (so->nPageData == 0); + } while (so->nos.nPageData == 0); } } } @@ -756,10 +865,10 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) scan->instrument->nsearches++; /* Begin the scan by processing the root page */ - so->curPageData = so->nPageData = 0; + so->nos.curPageData = so->nos.nPageData = 0; scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + if (so->nos.pageDataCxt) + MemoryContextReset(so->nos.pageDataCxt); fakeItem.blkno = GIST_ROOT_BLKNO; memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index 700fa959d03d..a59672ce9791 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -204,9 +204,9 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, scan->xs_hitupdesc = so->giststate->fetchTupdesc; /* Also create a memory context that will hold the returned tuples */ - 
so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, - "GiST page data context", - ALLOCSET_DEFAULT_SIZES); + so->nos.pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, + "GiST page data context", + ALLOCSET_DEFAULT_SIZES); } /* create new, empty pairing heap for search queue */ @@ -347,6 +347,11 @@ void gistendscan(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } /* * freeGISTstate is enough to clean up everything made by gistbeginscan, diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index dca236b6e573..dfe4acb64187 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -326,10 +326,10 @@ gistvacuumpage(GistVacState *vstate, Buffer buffer) recurse_to = InvalidBlockNumber; /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * We are not going to stay here for a long time, aggressively grab a + * cleanup lock. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (gistPageRecyclable(page)) diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 39404ec7cdb6..a4bc344381c7 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -22,6 +22,7 @@ #include "storage/buffile.h" #include "utils/hsearch.h" #include "access/genam.h" +#include "tableam.h" /* * Maximum number of "halves" a page can be split into in one operation. @@ -124,6 +125,8 @@ typedef struct GISTSearchHeapItem * index-only scans */ OffsetNumber offnum; /* track offset in page to mark tuple as * LP_DEAD */ + uint8 visrecheck; /* Cached visibility check result for this + * heap pointer. 
*/ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -170,12 +173,24 @@ typedef struct GISTScanOpaqueData BlockNumber curBlkno; /* current number of block */ GistNSN curPageLSN; /* pos in the WAL stream when page was read */ - /* In a non-ordered search, returnable heap items are stored here: */ - GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; - OffsetNumber nPageData; /* number of valid items in array */ - OffsetNumber curPageData; /* next item to return */ - MemoryContext pageDataCxt; /* context holding the fetched tuples, for - * index-only scans */ + /* info used by Index-Only Scans */ + Buffer vmbuf; /* reusable buffer for IOS' vm lookups */ + + union { + struct { + /* In a non-ordered search, returnable heap items are stored here: */ + GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nPageData; /* number of valid items in array */ + OffsetNumber curPageData; /* next item to return */ + MemoryContext pageDataCxt; /* context holding the fetched tuples, + * for index-only scans */ + } nos; + struct { + /* In an ordered search, we use this as scratch space */ + GISTSearchHeapItem *sortData[BLCKSZ / sizeof(IndexTupleData)]; + OffsetNumber nsortData; /* number of items in sortData */ + } os; + }; } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; From 927703d2399d817ec5189587f4ec554293068a25 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Sat, 8 Mar 2025 01:15:08 +0100 Subject: [PATCH 3/5] SP-GIST: Fix visibility issues in IOS Previously, SP-GIST IOS could buffer tuples from pages while VACUUM came along and cleaned up an ALL_DEAD tuple, marking the tuple's page ALL_VISIBLE again and making IOS mistakenly believe the tuple is indeed visible. With this patch, pins now conflict with SP-GIST vacuum, and we now do preliminary visibility checks to be used by IOS so that the IOS infrastructure knows to recheck the heap page even if that page is now ALL_VISIBLE. Idea from Heikki Linnakangas --- src/backend/access/spgist/spgscan.c | 182 ++++++++++++++++++++++++-- src/backend/access/spgist/spgvacuum.c | 2 +- src/include/access/spgist_private.h | 9 +- 3 files changed, 179 insertions(+), 14 deletions(-) diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 25893050c587..42784fdc4cac 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -30,7 +30,8 @@ typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isNull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances); + bool recheckDistances, double *distances, + TMVC_Result visrecheck); /* * Pairing heap comparison function for the SpGistSearchItem queue. 
@@ -142,6 +143,7 @@ spgAddStartItem(SpGistScanOpaque so, bool isnull)
 	startEntry->traversalValue = NULL;
 	startEntry->recheck = false;
 	startEntry->recheckDistances = false;
+	startEntry->visrecheck = TMVC_Unchecked;
 
 	spgAddSearchItemToQueue(so, startEntry);
 }
@@ -386,6 +388,19 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	if (scankey && scan->numberOfKeys > 0)
 		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
 
+	/* prepare index-only scan requirements */
+	so->nReorderThisPage = 0;
+	if (scan->xs_want_itup)
+	{
+		if (so->visrecheck == NULL)
+			so->visrecheck = palloc(MaxIndexTuplesPerPage);
+
+		if (scan->numberOfOrderBys > 0 && so->items == NULL)
+		{
+			so->items = palloc_array(SpGistSearchItem *, MaxIndexTuplesPerPage);
+		}
+	}
+
 	/* initialize order-by data if needed */
 	if (orderbys && scan->numberOfOrderBys > 0)
 	{
@@ -453,6 +468,9 @@ spgendscan(IndexScanDesc scan)
 		pfree(scan->xs_orderbynulls);
 	}
 
+	if (BufferIsValid(so->vmbuf))
+		ReleaseBuffer(so->vmbuf);
+
 	pfree(so);
 }
 
@@ -502,6 +520,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple,
 	item->isLeaf = true;
 	item->recheck = recheck;
 	item->recheckDistances = recheckDistances;
+	item->visrecheck = TMVC_Unchecked;
 
 	return item;
 }
@@ -584,6 +603,14 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
 											isnull,
 											distances);
 
+			if (so->want_itup)
+			{
+				Assert(PointerIsValid(so->items));
+
+				so->items[so->nReorderThisPage] = heapItem;
+				so->nReorderThisPage++;
+			}
+
 			spgAddSearchItemToQueue(so, heapItem);
 
 			MemoryContextSwitchTo(oldCxt);
@@ -593,7 +620,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item,
 			/* non-ordered scan, so report the item right away */
 			Assert(!recheckDistances);
 			storeRes(so, &leafTuple->heapPtr, leafValue, isnull,
-					 leafTuple, recheck, false, NULL);
+					 leafTuple, recheck, false, NULL, TMVC_Unchecked);
 			*reportedSome = true;
 		}
 	}
@@ -806,6 +833,88 @@ spgTestLeafTuple(SpGistScanOpaque so,
 	return SGLT_GET_NEXTOFFSET(leafTuple);
 }
 
+/*
+ * Populate so->visrecheck based on tuples which are cached for a currently
+ * pinned page.
+ */
+static void
+spgPopulateUnorderedVischecks(IndexScanDesc scan, SpGistScanOpaqueData *so)
+{
+	TM_IndexVisibilityCheckOp op;
+
+	Assert(scan->numberOfOrderBys == 0);
+
+	if (so->nPtrs == 0)
+		return;
+
+	op.checkntids = so->nPtrs;
+	op.checktids = palloc_array(TM_VisCheck, so->nPtrs);
+	op.vmbuf = &so->vmbuf;
+
+	for (int i = 0; i < op.checkntids; i++)
+	{
+		Assert(ItemPointerIsValid(&so->heapPtrs[i]));
+
+		PopulateTMVischeck(&op.checktids[i], &so->heapPtrs[i], i);
+	}
+
+	table_index_vischeck_tuples(scan->heapRelation, &op);
+
+	for (int i = 0; i < op.checkntids; i++)
+	{
+		TM_VisCheck *check = &op.checktids[i];
+
+		Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&so->heapPtrs[check->idxoffnum]));
+		Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&so->heapPtrs[check->idxoffnum]));
+		Assert(check->idxoffnum < op.checkntids);
+
+		so->visrecheck[check->idxoffnum] = check->vischeckresult;
+	}
+
+	pfree(op.checktids);
+}
+
+/* populate so->visrecheck based on currently cached tuples */
+static void
+spgPopulateOrderedVisChecks(IndexScanDesc scan, SpGistScanOpaqueData *so)
+{
+	TM_IndexVisibilityCheckOp op;
+
+	if (so->nReorderThisPage == 0)
+		return;
+
+	Assert(so->nReorderThisPage > 0);
+	Assert(scan->numberOfOrderBys > 0);
+	Assert(PointerIsValid(so->items));
+
+	op.checkntids = so->nReorderThisPage;
+	op.checktids = palloc_array(TM_VisCheck, so->nReorderThisPage);
+	op.vmbuf = &so->vmbuf;
+
+	for (int i = 0; i < op.checkntids; i++)
+	{
+		PopulateTMVischeck(&op.checktids[i], &so->items[i]->heapPtr, i);
+		Assert(ItemPointerIsValid(&so->items[i]->heapPtr));
+		Assert(so->items[i]->isLeaf);
+	}
+
+	table_index_vischeck_tuples(scan->heapRelation, &op);
+
+	for (int i = 0; i < op.checkntids; i++)
+	{
+		TM_VisCheck *check = &op.checktids[i];
+
+		Assert(check->idxoffnum < op.checkntids);
+		Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&so->items[check->idxoffnum]->heapPtr));
+		Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&so->items[check->idxoffnum]->heapPtr));
+
+		so->items[check->idxoffnum]->visrecheck = check->vischeckresult;
+	}
+
+	pfree(op.checktids);
+	so->nReorderThisPage = 0;
+}
+
 /*
  * Walk the tree and report all tuples passing the scan quals to the storeRes
  * subroutine.
@@ -814,8 +923,8 @@ spgTestLeafTuple(SpGistScanOpaque so,
  * next page boundary once we have reported at least one tuple.
  */
 static void
-spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex,
-		storeRes_func storeRes)
+spgWalk(IndexScanDesc scan, Relation index, SpGistScanOpaque so,
+		bool scanWholeIndex, storeRes_func storeRes)
 {
 	Buffer		buffer = InvalidBuffer;
 	bool		reportedSome = false;
@@ -835,9 +944,23 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex,
 		{
 			/* We store heap items in the queue only in case of ordered search */
 			Assert(so->numberOfNonNullOrderBys > 0);
+
+			/*
+			 * If an item we found on a page is retrieved immediately after
+			 * processing that page, we won't yet have released the page pin,
+			 * and thus won't yet have processed the visibility data of the
+			 * page's (now) ordered tuples.
+			 * Do that now, so that all tuples on the page we're about to
+			 * unpin have been checked for visibility before we return any.
+ */ + if (so->want_itup && so->nReorderThisPage) + spgPopulateOrderedVisChecks(scan, so); + + Assert(!so->want_itup || item->visrecheck != TMVC_Unchecked); storeRes(so, &item->heapPtr, item->value, item->isNull, item->leafTuple, item->recheck, - item->recheckDistances, item->distances); + item->recheckDistances, item->distances, + item->visrecheck); reportedSome = true; } else @@ -854,7 +977,15 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, } else if (blkno != BufferGetBlockNumber(buffer)) { - UnlockReleaseBuffer(buffer); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + Assert(so->numberOfOrderBys >= 0); + if (so->numberOfOrderBys == 0) + spgPopulateUnorderedVischecks(scan, so); + else + spgPopulateOrderedVisChecks(scan, so); + + ReleaseBuffer(buffer); buffer = ReadBuffer(index, blkno); LockBuffer(buffer, BUFFER_LOCK_SHARE); } @@ -922,16 +1053,36 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, } if (buffer != InvalidBuffer) - UnlockReleaseBuffer(buffer); -} + { + /* Unlock the buffer for concurrent accesses except VACUUM */ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + /* + * If we're in an index-only scan, pre-check visibility of the tuples, + * so we can drop the pin without causing visibility bugs. + */ + if (so->want_itup) + { + Assert(scan->numberOfOrderBys >= 0); + + if (scan->numberOfOrderBys == 0) + spgPopulateUnorderedVischecks(scan, so); + else + spgPopulateOrderedVisChecks(scan, so); + } + + /* Release the page */ + ReleaseBuffer(buffer); + } +} /* storeRes subroutine for getbitmap case */ static void storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances) + bool recheckDistances, double *distances, + TMVC_Result visres) { Assert(!recheckDistances && !distances); tbm_add_tuples(so->tbm, heapPtr, 1, recheck); @@ -949,7 +1100,7 @@ spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) so->tbm = tbm; so->ntids = 0; - spgWalk(scan->indexRelation, so, true, storeBitmap); + spgWalk(scan, scan->indexRelation, so, true, storeBitmap); return so->ntids; } @@ -959,12 +1110,15 @@ static void storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, Datum leafValue, bool isnull, SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *nonNullDistances) + bool recheckDistances, double *nonNullDistances, + TMVC_Result visres) { Assert(so->nPtrs < MaxIndexTuplesPerPage); so->heapPtrs[so->nPtrs] = *heapPtr; so->recheck[so->nPtrs] = recheck; so->recheckDistances[so->nPtrs] = recheckDistances; + if (so->want_itup) + so->visrecheck[so->nPtrs] = visres; if (so->numberOfOrderBys > 0) { @@ -1041,6 +1195,10 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) scan->xs_heaptid = so->heapPtrs[so->iPtr]; scan->xs_recheck = so->recheck[so->iPtr]; scan->xs_hitup = so->reconTups[so->iPtr]; + if (so->want_itup) + scan->xs_visrecheck = so->visrecheck[so->iPtr]; + + Assert(!scan->xs_want_itup || scan->xs_visrecheck != TMVC_Unchecked); if (so->numberOfOrderBys > 0) index_store_float8_orderby_distances(scan, so->orderByTypes, @@ -1070,7 +1228,7 @@ spggettuple(IndexScanDesc scan, ScanDirection dir) } so->iPtr = so->nPtrs = 0; - spgWalk(scan->indexRelation, so, false, storeGettuple); + spgWalk(scan, scan->indexRelation, so, false, storeGettuple); if (so->nPtrs == 0) break; /* must have completed scan */ diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 2678f7ab7829..a29c3d798699 100644 --- 
a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -625,7 +625,7 @@ spgvacuumpage(spgBulkDeleteState *bds, Buffer buffer) BlockNumber blkno = BufferGetBlockNumber(buffer); Page page; - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buffer); page = (Page) BufferGetPage(buffer); if (PageIsNew(page)) diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index cb43a278f466..63e970468c7d 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -21,6 +21,7 @@ #include "storage/buf.h" #include "utils/geo_decls.h" #include "utils/relcache.h" +#include "tableam.h" typedef struct SpGistOptions @@ -175,7 +176,7 @@ typedef struct SpGistSearchItem bool isLeaf; /* SearchItem is heap item */ bool recheck; /* qual recheck is needed */ bool recheckDistances; /* distance recheck is needed */ - + uint8 visrecheck; /* IOS: TMVC_Result of contained heap tuple */ /* array with numberOfOrderBys entries */ double distances[FLEXIBLE_ARRAY_MEMBER]; } SpGistSearchItem; @@ -223,6 +224,7 @@ typedef struct SpGistScanOpaqueData /* These fields are only used in amgettuple scans: */ bool want_itup; /* are we reconstructing tuples? */ + Buffer vmbuf; /* IOS: used for table_index_vischeck_tuples */ TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ int nPtrs; /* number of TIDs found on current page */ int iPtr; /* index for scanning through same */ @@ -235,6 +237,11 @@ typedef struct SpGistScanOpaqueData /* distances (for recheck) */ IndexOrderByDistance *distances[MaxIndexTuplesPerPage]; + /* support for IOS */ + int nReorderThisPage; + uint8 *visrecheck; /* IOS vis check results, counted by nPtrs */ + SpGistSearchItem **items; /* counted by nReorderThisPage */ + /* * Note: using MaxIndexTuplesPerPage above is a bit hokey since * SpGistLeafTuples aren't exactly IndexTuples; however, they are larger, From 2975e878f255d773ecab15db9e1b5bf48af70fed Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 20 Mar 2025 23:12:25 +0100 Subject: [PATCH 4/5] NBTree: Reduce Index-Only Scan pin duration Previously, we would keep a pin on every leaf page while we were returning tuples to the scan. With this patch, we utilize the newly introduced table_index_vischeck_tuples API to pre-check visibility of all TIDs, and thus unpin the page well ahead of when we'd usually be ready with returning and processing all index tuple results. This reduces the time VACUUM may have to wait for a pin, and can increase performance with reduced redundant VM checks. --- src/backend/access/nbtree/nbtree.c | 21 +++++ src/backend/access/nbtree/nbtsearch.c | 125 ++++++++++++++++++++++++-- src/include/access/nbtree.h | 6 ++ 3 files changed, 147 insertions(+), 5 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 765659887af7..96217d7baf04 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -360,6 +360,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->killedItems = NULL; /* until needed */ so->numKilled = 0; + so->vmbuf = InvalidBuffer; + so->vischeckcap = 0; + so->vischecksbuf = NULL; + /* * We don't know yet whether the scan will be index-only, so we do not * allocate the tuple workspace arrays until btrescan. 
However, we set up @@ -400,6 +404,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, BTScanPosUnpinIfPinned(so->markPos); BTScanPosInvalidate(so->markPos); + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } + /* * Allocate tuple workspace arrays, if needed for an index-only scan and * not already done in a previous rescan call. To save on palloc @@ -451,6 +461,17 @@ btendscan(IndexScanDesc scan) so->markItemIndex = -1; BTScanPosUnpinIfPinned(so->markPos); + if (so->vischecksbuf) + pfree(so->vischecksbuf); + so->vischecksbuf = NULL; + so->vischeckcap = 0; + + if (BufferIsValid(so->vmbuf)) + { + ReleaseBuffer(so->vmbuf); + so->vmbuf = InvalidBuffer; + } + /* No need to invalidate positions, the RAM is about to be freed. */ /* Release storage */ diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index fe9a3886913d..2f89cf1ea728 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,7 +25,7 @@ #include "utils/rel.h" -static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); +static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so); static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, Buffer buf, bool forupdate, BTStack stack, int access); @@ -54,6 +54,12 @@ static Buffer _bt_lock_and_validate_left(Relation rel, BlockNumber *blkno, static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); +/* + * Execute vischecks at the index level? + * Enabled by default. + */ +#define DEBUG_IOS_VISCHECKS_ENABLED true + /* * _bt_drop_lock_and_maybe_pin() * @@ -64,13 +70,109 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * See nbtree/README section on making concurrent TID recycling safe. */ static void -_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) +_bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp, BTScanOpaque so) { _bt_unlockbuf(scan->indexRelation, sp->buf); + /* + * Do some visibility checks if this is an index-only scan; allowing us to + * drop the pin on this page before we have returned all tuples from this + * IOS to the executor. 
+ */ + if (scan->xs_want_itup && DEBUG_IOS_VISCHECKS_ENABLED) + { + int initOffset = sp->firstItem; + int ntids = 1 + sp->lastItem - initOffset; + + if (ntids > 0) + { + TM_IndexVisibilityCheckOp visCheck; + Relation heaprel = scan->heapRelation; + TM_VisCheck *check; + BTScanPosItem *item; + + visCheck.checkntids = ntids; + + if (so->vischeckcap == 0) + { + so->vischecksbuf = palloc_array(TM_VisCheck, ntids); + so->vischeckcap = ntids; + } + else if (so->vischeckcap < visCheck.checkntids) + { + so->vischecksbuf = repalloc_array(so->vischecksbuf, + TM_VisCheck, ntids); + so->vischeckcap = ntids; + } + + visCheck.checktids = so->vischecksbuf; + visCheck.vmbuf = &so->vmbuf; + + check = so->vischecksbuf; + item = &so->currPos.items[initOffset]; + + for (int i = 0; i < visCheck.checkntids; i++) + { + Assert(item->visrecheck == TMVC_Unchecked); + Assert(ItemPointerIsValid(&item->heapTid)); + + PopulateTMVischeck(check, &item->heapTid, initOffset + i); + + item++; + check++; + } + + table_index_vischeck_tuples(heaprel, &visCheck); + check = so->vischecksbuf; + + for (int i = 0; i < visCheck.checkntids; i++) + { + item = &so->currPos.items[check->idxoffnum]; + /* We must have a valid visibility check result */ + Assert(check->vischeckresult != TMVC_Unchecked); + /* The offset number should still indicate the right item */ + Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapTid)); + Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapTid)); + + /* Store the visibility check result */ + item->visrecheck = check->vischeckresult; + check++; + } + } + } + + /* + * We may need to hold a pin on the page for one of several reasons: + * + * 1.) To safely apply kill_prior_tuple, we need to know that the tuples + * were not removed from the page (and subsequently re-inserted). + * A page's LSN can also allow us to detect modifications on the page, + * which then allows us to bail out of setting the hint bits, but that + * requires the index to be WAL-logged; so unless the index is WAL-logged + * we must hold a pin on the page to apply the kill_prior_tuple + * optimization. + * + * 2.) Non-MVCC scans need pin coupling to make sure the scan covers + * exactly the whole index keyspace. + * + * 3.) For Index-Only Scans, the scan needs to check the visibility of the + * table tuple while the relevant index tuple is guaranteed to still be + * contained in the index (so that vacuum hasn't yet marked any pages that + * could contain the value as ALL_VISIBLE after reclaiming a dead tuple + * that might be buffered in the scan). A pin must therefore be held + * at least while the basic visibility of the page's tuples is being + * checked. + * + * For cases 1 and 2, we must hold the pin after we've finished processing + * the index page. + * + * For case 3, we can release the pin if we first do the visibility checks + * of to-be-returned tuples using table_index_vischeck_tuples, which we've + * done just above. 
+	 */
 	if (IsMVCCSnapshot(scan->xs_snapshot) &&
 		RelationNeedsWAL(scan->indexRelation) &&
-		!scan->xs_want_itup)
+		(!scan->xs_want_itup || DEBUG_IOS_VISCHECKS_ENABLED))
 	{
 		ReleaseBuffer(sp->buf);
 		sp->buf = InvalidBuffer;
@@ -2007,6 +2109,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -2037,6 +2141,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -2073,6 +2179,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
 
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
@@ -2098,6 +2205,14 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 
 	/* Return next item, per amgettuple contract */
 	scan->xs_heaptid = currItem->heapTid;
+
+	if (scan->xs_want_itup)
+	{
+		scan->xs_visrecheck = currItem->visrecheck;
+		Assert(currItem->visrecheck != TMVC_Unchecked ||
+			   BufferIsValid(so->currPos.buf));
+	}
+
 	if (so->currTuples)
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
 }
@@ -2256,7 +2371,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 	 * so->currPos.buf in preparation for btgettuple returning tuples.
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(scan, &so->currPos, so);
 
 	return true;
 }
@@ -2413,7 +2528,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+	_bt_drop_lock_and_maybe_pin(scan, &so->currPos, so);
 
 	return true;
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index ebca02588d3e..f9ced4a1f0bc 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -957,6 +957,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
 	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	uint8		visrecheck;		/* visibility recheck status, if any */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -1071,6 +1072,11 @@ typedef struct BTScanOpaqueData
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
 
+	/* used for index-only scan visibility prechecks */
+	Buffer		vmbuf;			/* visibility map buffer */
+	int			vischeckcap;	/* capacity of vischecksbuf */
+	TM_VisCheck *vischecksbuf;	/* single allocation to save on alloc overhead */
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
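[Note for AM authors: the batching pattern nbtree adopts above generalizes to any index AM that buffers matched TIDs from one index page. The sketch below is an illustrative condensation, not part of the patch itself: MyScanItem and my_am_precheck_visibility are hypothetical stand-ins for an AM's own scan bookkeeping, while TM_VisCheck, PopulateTMVischeck, and table_index_vischeck_tuples are the API introduced by the first patch in this series.]

#include "postgres.h"

#include "access/relscan.h"
#include "access/tableam.h"

/* hypothetical per-TID bookkeeping, standing in for e.g. BTScanPosItem */
typedef struct MyScanItem
{
	ItemPointerData heapTid;	/* TID of the referenced heap tuple */
	uint8		visrecheck;		/* TMVC_* result; TMVC_Unchecked until set */
} MyScanItem;

/*
 * Batch-check visibility for all TIDs buffered from one index page.
 * "checks" must have room for nitems entries; "vmbuf" is the scan-lifetime
 * visibility map buffer (InvalidBuffer on the first call).
 */
static void
my_am_precheck_visibility(IndexScanDesc scan, MyScanItem *items, int nitems,
						  TM_VisCheck *checks, Buffer *vmbuf)
{
	TM_IndexVisibilityCheckOp op;

	/* stage one check per buffered TID, remembering its slot in items[] */
	for (int i = 0; i < nitems; i++)
		PopulateTMVischeck(&checks[i], &items[i].heapTid, i);

	op.checktids = checks;
	op.checkntids = nitems;
	op.vmbuf = vmbuf;

	/* one call resolves every TID against the VM, whatever the table AM */
	table_index_vischeck_tuples(scan->heapRelation, &op);

	/* copy results back; idxoffnum lets this survive any reordering */
	for (int i = 0; i < nitems; i++)
		items[checks[i].idxoffnum].visrecheck = checks[i].vischeckresult;
}

[Because each check carries its idxoffnum, the table AM is free to sort the array into heap order to minimize VM page switches, as heap_index_vischeck_tuples does, and the caller can still map every result back to its own item.]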
From e71b5d47ef301e658d9b8ba23d9e21de8f3e79d1 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Fri, 21 Mar 2025 16:41:31 +0100
Subject: [PATCH 5/5] Test for IOS/Vacuum race conditions in index AMs

Add regression tests that demonstrate how wrong results can occur with
index-only scans in GiST and SP-GiST indexes when the scan encounters
tuples being removed by a concurrent VACUUM operation.

The tests also verify that the index AMs do not block VACUUM, even when
the scans are held open in a cursor.

Co-authored-by: Matthias van de Meent
Co-authored-by: Peter Geoghegan
Co-authored-by: Michail Nikolaev
Discussion: https://www.postgresql.org/message-id/flat/CANtu0oi0rkR%2BFsgyLXnGZ-uW2950-urApAWLhy-%2BV1WJD%3D_ZXA%40mail.gmail.com
---
 .../expected/index-only-scan-btree-vacuum.out | 59 +++++++++
 .../expected/index-only-scan-gist-vacuum.out  | 53 ++++++++
 .../index-only-scan-spgist-vacuum.out         | 53 ++++++++
 src/test/isolation/isolation_schedule         |  3 +
 .../specs/index-only-scan-btree-vacuum.spec   | 113 ++++++++++++++++++
 .../specs/index-only-scan-gist-vacuum.spec    | 112 +++++++++++++++++
 .../specs/index-only-scan-spgist-vacuum.spec  | 112 +++++++++++++++++
 7 files changed, 505 insertions(+)
 create mode 100644 src/test/isolation/expected/index-only-scan-btree-vacuum.out
 create mode 100644 src/test/isolation/expected/index-only-scan-gist-vacuum.out
 create mode 100644 src/test/isolation/expected/index-only-scan-spgist-vacuum.out
 create mode 100644 src/test/isolation/specs/index-only-scan-btree-vacuum.spec
 create mode 100644 src/test/isolation/specs/index-only-scan-gist-vacuum.spec
 create mode 100644 src/test/isolation/specs/index-only-scan-spgist-vacuum.spec

diff --git a/src/test/isolation/expected/index-only-scan-btree-vacuum.out b/src/test/isolation/expected/index-only-scan-btree-vacuum.out
new file mode 100644
index 000000000000..9a9d94c86f6f
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-btree-vacuum.out
@@ -0,0 +1,59 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted_asc s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+    DELETE FROM ios_needs_cleanup_lock WHERE a BETWEEN 2 AND 9;
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted_asc:
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a ASC;
+
+step s1_fetch_1:
+    FETCH FROM foo;
+
+x
+-
+1
+(1 row)
+
+step s2_vacuum:
+    VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+
+step s1_fetch_all:
+    FETCH ALL FROM foo;
+
+ x
+--
+10
+(1 row)
+
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted_desc s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+    DELETE FROM ios_needs_cleanup_lock WHERE a BETWEEN 2 AND 9;
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted_desc:
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a DESC;
+
+step s1_fetch_1:
+    FETCH FROM foo;
+
+ x
+--
+10
+(1 row)
+
+step s2_vacuum:
+    VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+
+step s1_fetch_all:
+    FETCH ALL FROM foo;
+
+x
+-
+1
+(1 row)
+
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/expected/index-only-scan-gist-vacuum.out b/src/test/isolation/expected/index-only-scan-gist-vacuum.out
new file mode 100644
index 000000000000..b7c02ee95292
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-gist-vacuum.out
@@ -0,0 +1,53 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+    DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted:
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+
+step s1_fetch_1:
+    FETCH FROM foo;
+
+                 x
+------------------
+1.4142135623730951
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+step s1_fetch_all:
+    FETCH ALL FROM foo;
+
+x
+-
+(0 rows)
+
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+    DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_unsorted:
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+
+step s1_fetch_1:
+    FETCH FROM foo;
+
+a
+-----
+(1,1)
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+step s1_fetch_all:
+    FETCH ALL FROM foo;
+
+a
+-
+(0 rows)
+
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/expected/index-only-scan-spgist-vacuum.out b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out
new file mode 100644
index 000000000000..b7c02ee95292
--- /dev/null
+++ b/src/test/isolation/expected/index-only-scan-spgist-vacuum.out
@@ -0,0 +1,53 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s2_mod s1_begin s1_prepare_sorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+    DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_sorted:
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+
+step s1_fetch_1:
+    FETCH FROM foo;
+
+                 x
+------------------
+1.4142135623730951
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+step s1_fetch_all:
+    FETCH ALL FROM foo;
+
+x
+-
+(0 rows)
+
+step s1_commit: COMMIT;
+
+starting permutation: s2_mod s1_begin s1_prepare_unsorted s1_fetch_1 s2_vacuum s1_fetch_all s1_commit
+step s2_mod:
+    DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+
+step s1_begin: BEGIN;
+step s1_prepare_unsorted:
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+
+step s1_fetch_1:
+    FETCH FROM foo;
+
+a
+-----
+(1,1)
+(1 row)
+
+step s2_vacuum: VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+step s1_fetch_all:
+    FETCH ALL FROM foo;
+
+a
+-
+(0 rows)
+
+step s1_commit: COMMIT;
diff --git a/src/test/isolation/isolation_schedule b/src/test/isolation/isolation_schedule
index e3c669a29c7a..69909d4d9118 100644
--- a/src/test/isolation/isolation_schedule
+++ b/src/test/isolation/isolation_schedule
@@ -18,6 +18,9 @@ test: two-ids
 test: multiple-row-versions
 test: index-only-scan
 test: index-only-bitmapscan
+test: index-only-scan-btree-vacuum
+test: index-only-scan-gist-vacuum
+test: index-only-scan-spgist-vacuum
 test: predicate-lock-hot-tuple
 test: update-conflict-out
 test: deadlock-simple
diff --git a/src/test/isolation/specs/index-only-scan-btree-vacuum.spec b/src/test/isolation/specs/index-only-scan-btree-vacuum.spec
new file mode 100644
index 000000000000..9a00804c2c57
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-btree-vacuum.spec
@@ -0,0 +1,113 @@
+# index-only-scan test showing correct results with btree even with concurrent
+# vacuum
+
+setup
+{
+    -- by using a low fillfactor and a wide tuple we can get multiple blocks
+    -- with just a few rows
+    CREATE TABLE ios_needs_cleanup_lock (a int NOT NULL, pad char(1024) default '')
+    WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+    INSERT INTO ios_needs_cleanup_lock SELECT g.i FROM generate_series(1, 10) g(i);
+
+    CREATE INDEX ios_btree_a ON ios_needs_cleanup_lock USING btree (a);
+}
+setup
+{
+    VACUUM (ANALYZE) ios_needs_cleanup_lock;
+}
+
+teardown
+{
+    DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+    SET enable_bitmapscan = false;
+    SET enable_indexonlyscan = true;
+    SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted_asc {
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a ASC;
+}
+step s1_prepare_sorted_desc {
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a as x FROM ios_needs_cleanup_lock ORDER BY a DESC;
+}
+
+step s1_fetch_1 {
+    FETCH FROM foo;
+}
+
+step s1_fetch_all {
+    FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1 or row 10, so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+    DELETE FROM ios_needs_cleanup_lock WHERE a BETWEEN 2 AND 9;
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum
+{
+    VACUUM (TRUNCATE false) ios_needs_cleanup_lock;
+}
+
+permutation
+    # delete nearly all rows, to make the issue visible
+    s2_mod
+    # create a cursor
+    s1_begin
+    s1_prepare_sorted_asc
+
+    # fetch one row from the cursor; this ensures the index scan portion is
+    # done before the vacuum in the next step
+    s1_fetch_1
+
+    # with the bug, this vacuum will mark pages as all-visible that the scan
+    # in the next step then considers all-visible, despite all rows from
+    # those pages having been removed.
+    # Because any blocking here happens on buffer-level locks, the isolation
+    # tester will never consider this step "blocked", and so we only have a
+    # single step we can run concurrently.
+    s2_vacuum
+
+    # if this returns any rows, we're busted
+    s1_fetch_all
+
+    s1_commit
+
+permutation
+    # delete nearly all rows, to make the issue visible
+    s2_mod
+    # create a cursor
+    s1_begin
+    s1_prepare_sorted_desc
+
+    # fetch one row from the cursor; this ensures the index scan portion is
+    # done before the vacuum in the next step
+    s1_fetch_1
+
+    # If the index scan doesn't correctly interlock its visibility tests
+    # with concurrent VACUUM cleanup, then VACUUM will mark pages as
+    # all-visible that the scan in the next step may then consider
+    # all-visible, despite some of those rows having been removed.
+    s2_vacuum
+
+    # if this returns any rows, we're busted
+    s1_fetch_all
+
+    s1_commit
diff --git a/src/test/isolation/specs/index-only-scan-gist-vacuum.spec b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec
new file mode 100644
index 000000000000..9d241b259200
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-gist-vacuum.spec
@@ -0,0 +1,112 @@
+# index-only-scan test showing correct results with GiST even with concurrent
+# vacuum
+setup
+{
+    -- by using a low fillfactor and a wide tuple we can get multiple blocks
+    -- with just a few rows
+    CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '')
+    WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+    INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i);
+
+    CREATE INDEX ios_gist_a ON ios_needs_cleanup_lock USING gist(a);
+}
+setup
+{
+    VACUUM (ANALYZE) ios_needs_cleanup_lock;
+}
+
+teardown
+{
+    DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+    SET enable_bitmapscan = false;
+    SET enable_indexonlyscan = true;
+    SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted {
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+}
+
+step s1_prepare_unsorted {
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+}
+
+step s1_fetch_1 {
+    FETCH FROM foo;
+}
+
+step s1_fetch_all {
+    FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1, so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+    DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; }
+
+permutation
+    # delete nearly all rows, to make the issue visible
+    s2_mod
+    # create a cursor
+    s1_begin
+    s1_prepare_sorted
+
+    # fetch one row from the cursor; this ensures the index scan portion is
+    # done before the vacuum in the next step
+    s1_fetch_1
+
+    # with the bug, this vacuum will mark pages as all-visible that the scan
+    # in the next step then considers all-visible, despite all rows from
+    # those pages having been removed.
+    # Because any blocking here happens on buffer-level locks, the isolation
+    # tester will never consider this step "blocked", and so we only have a
+    # single step we can run concurrently.
+    s2_vacuum
+
+    # if this returns any rows, we're busted
+    s1_fetch_all
+
+    s1_commit
+
+permutation
+    # delete nearly all rows, to make the issue visible
+    s2_mod
+    # create a cursor
+    s1_begin
+    s1_prepare_unsorted
+
+    # fetch one row from the cursor; this ensures the index scan portion is
+    # done before the vacuum in the next step
+    s1_fetch_1
+
+    # with the bug, this vacuum will mark pages as all-visible that the scan
+    # in the next step then considers all-visible, despite all rows from
+    # those pages having been removed.
+    # Because any blocking here happens on buffer-level locks, the isolation
+    # tester will never consider this step "blocked", and so we only have a
+    # single step we can run concurrently.
+    s2_vacuum
+
+    # if this returns any rows, we're busted
+    s1_fetch_all
+
+    s1_commit
diff --git a/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
new file mode 100644
index 000000000000..cd621d4f7f20
--- /dev/null
+++ b/src/test/isolation/specs/index-only-scan-spgist-vacuum.spec
@@ -0,0 +1,112 @@
+# index-only-scan test showing correct results with SP-GiST even with
+# concurrent vacuum
+setup
+{
+    -- by using a low fillfactor and a wide tuple we can get multiple blocks
+    -- with just a few rows
+    CREATE TABLE ios_needs_cleanup_lock (a point NOT NULL, b int not null, pad char(1024) default '')
+    WITH (AUTOVACUUM_ENABLED = false, FILLFACTOR = 10);
+
+    INSERT INTO ios_needs_cleanup_lock SELECT point(g.i, g.i), g.i FROM generate_series(1, 10) g(i);
+
+    CREATE INDEX ios_spgist_a ON ios_needs_cleanup_lock USING spgist(a);
+}
+setup
+{
+    VACUUM (ANALYZE) ios_needs_cleanup_lock;
+}
+
+teardown
+{
+    DROP TABLE ios_needs_cleanup_lock;
+}
+
+
+session s1
+
+# Force an index-only scan, where possible:
+setup {
+    SET enable_bitmapscan = false;
+    SET enable_indexonlyscan = true;
+    SET enable_indexscan = true;
+}
+
+step s1_begin { BEGIN; }
+step s1_commit { COMMIT; }
+
+step s1_prepare_sorted {
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a <-> point '(0,0)' as x FROM ios_needs_cleanup_lock ORDER BY a <-> point '(0,0)';
+}
+
+step s1_prepare_unsorted {
+    DECLARE foo NO SCROLL CURSOR FOR SELECT a FROM ios_needs_cleanup_lock WHERE box '((-100,-100),(100,100))' @> a;
+}
+
+step s1_fetch_1 {
+    FETCH FROM foo;
+}
+
+step s1_fetch_all {
+    FETCH ALL FROM foo;
+}
+
+
+session s2
+
+# Don't delete row 1, so we have a row for the cursor to "rest" on.
+step s2_mod
+{
+    DELETE FROM ios_needs_cleanup_lock WHERE a != point '(1,1)';
+}
+
+# Disable truncation, as otherwise we'll just wait for a timeout while trying
+# to acquire the lock
+step s2_vacuum { VACUUM (TRUNCATE false) ios_needs_cleanup_lock; }
+
+permutation
+    # delete nearly all rows, to make the issue visible
+    s2_mod
+    # create a cursor
+    s1_begin
+    s1_prepare_sorted
+
+    # fetch one row from the cursor; this ensures the index scan portion is
+    # done before the vacuum in the next step
+    s1_fetch_1
+
+    # with the bug, this vacuum will mark pages as all-visible that the scan
+    # in the next step then considers all-visible, despite all rows from
+    # those pages having been removed.
+    # Because any blocking here happens on buffer-level locks, the isolation
+    # tester will never consider this step "blocked", and so we only have a
+    # single step we can run concurrently.
+    s2_vacuum
+
+    # if this returns any rows, we're busted
+    s1_fetch_all
+
+    s1_commit
+
+permutation
+    # delete nearly all rows, to make the issue visible
+    s2_mod
+    # create a cursor
+    s1_begin
+    s1_prepare_unsorted
+
+    # fetch one row from the cursor; this ensures the index scan portion is
+    # done before the vacuum in the next step
+    s1_fetch_1
+
+    # with the bug, this vacuum will mark pages as all-visible that the scan
+    # in the next step then considers all-visible, despite all rows from
+    # those pages having been removed.
+    # Because any blocking here happens on buffer-level locks, the isolation
+    # tester will never consider this step "blocked", and so we only have a
+    # single step we can run concurrently.
+    s2_vacuum
+
+    # if this returns any rows, we're busted
+    s1_fetch_all
+
+    s1_commit
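[Closing note on how these pieces meet in the executor: the index AM records its per-tuple precheck in scan->xs_visrecheck (see _bt_returnitem above), and the index-only scan node can then decide whether a heap fetch is needed. The function below is a simplified, hypothetical condensation of that decision; ios_tuple_is_all_visible is not a function in the patch series, and the actual logic lives in src/backend/executor/nodeIndexonlyscan.c.]

#include "postgres.h"

#include "access/relscan.h"
#include "access/visibilitymap.h"
#include "storage/itemptr.h"

/*
 * Simplified sketch (not the patch's literal code): decide whether the
 * tuple at scan->xs_heaptid can be returned without a heap fetch, based
 * on the visibility precheck the index AM recorded in xs_visrecheck.
 */
static bool
ios_tuple_is_all_visible(IndexScanDesc scan, Buffer *vmbuf)
{
	switch (scan->xs_visrecheck)
	{
		case TMVC_Visible:
			/* the AM's batched VM lookup already proved all-visible */
			return true;

		case TMVC_MaybeVisible:
			/* the AM looked but couldn't prove it; caller must visit the heap */
			return false;

		case TMVC_Unchecked:
		default:
			/* the AM did no precheck; probe the visibility map directly */
			return VM_ALL_VISIBLE(scan->heapRelation,
								  ItemPointerGetBlockNumber(&scan->xs_heaptid),
								  vmbuf);
	}
}

[When this returns false, the scan falls back to fetching the tuple from the heap, as index-only scans already do today for pages that are not all-visible; with the new tableam API, the TMVC_Unchecked fallback could equally be routed through table_index_vischeck_tuples for a single TID.]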