From d695cc496d44e6b57e25f8bbd278331fd7d28444 Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Tue, 31 Dec 2024 14:09:52 +0100
Subject: [PATCH 01/12] This is https://commitfest.postgresql.org/50/5160/ and
 https://commitfest.postgresql.org/patch/5438/ merged into a single commit.
 It is required for the stability of the stress tests.

---
 contrib/amcheck/meson.build                   |   1 +
 .../t/006_cic_bt_index_parent_check.pl        |  39 +++++
 contrib/amcheck/verify_nbtree.c               |  68 ++++-----
 src/backend/commands/indexcmds.c              |   4 +-
 src/backend/executor/execIndexing.c           |   3 +
 src/backend/executor/execPartition.c          | 119 +++++++++++++--
 src/backend/executor/nodeModifyTable.c        |   2 +
 src/backend/optimizer/util/plancat.c          | 135 +++++++++++++-----
 src/backend/utils/time/snapmgr.c              |   2 +
 9 files changed, 285 insertions(+), 88 deletions(-)
 create mode 100644 contrib/amcheck/t/006_cic_bt_index_parent_check.pl

diff --git a/contrib/amcheck/meson.build b/contrib/amcheck/meson.build
index b33e8c9b062f..b040000dd55f 100644
--- a/contrib/amcheck/meson.build
+++ b/contrib/amcheck/meson.build
@@ -49,6 +49,7 @@ tests += {
       't/003_cic_2pc.pl',
       't/004_verify_nbtree_unique.pl',
       't/005_pitr.pl',
+      't/006_cic_bt_index_parent_check.pl',
     ],
   },
 }
diff --git a/contrib/amcheck/t/006_cic_bt_index_parent_check.pl b/contrib/amcheck/t/006_cic_bt_index_parent_check.pl
new file mode 100644
index 000000000000..6e52c5e39ec9
--- /dev/null
+++ b/contrib/amcheck/t/006_cic_bt_index_parent_check.pl
@@ -0,0 +1,39 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test bt_index_parent_check with an index created with CREATE INDEX CONCURRENTLY
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+use Test::More;
+
+my ($node, $result);
+
+#
+# Test set-up
+#
+$node = PostgreSQL::Test::Cluster->new('CIC_bt_index_parent_check_test');
+$node->init;
+$node->start;
+$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
+$node->safe_psql('postgres', q(CREATE TABLE tbl(i int primary key)));
+# Insert two rows into the table
+$node->safe_psql('postgres', q(INSERT INTO tbl SELECT i FROM generate_series(1, 2) s(i);));
+
+# start a background transaction
+my $in_progress_h = $node->background_psql('postgres');
+$in_progress_h->query_safe(q(BEGIN; SELECT pg_current_xact_id();));
+
+# delete one row from the table while the background transaction is in progress
+$node->safe_psql('postgres', q(DELETE FROM tbl WHERE i = 1;));
+# create an index concurrently; it will skip the deleted row
+$node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i);));
+
+# check the index using bt_index_parent_check
+$result = $node->psql('postgres', q(SELECT bt_index_parent_check('idx', heapallindexed => true)));
+is($result, '0', 'bt_index_parent_check for CIC after removed row');
+
+$in_progress_h->quit;
+done_testing();
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index f11c43a0ed79..3048e044aecb 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -382,7 +382,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 	BTMetaPageData *metad;
 	uint32		previouslevel;
 	BtreeLevel	current;
-	Snapshot	snapshot = SnapshotAny;
 
 	if (!readonly)
 		elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"",
@@ -433,38 +432,35 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 		state->heaptuplespresent = 0;
 
 		/*
-		 * Register our own snapshot in !readonly case, rather than asking
+		 * Register our own snapshot for heapallindexed, rather
than asking * table_index_build_scan() to do this for us later. This needs to * happen before index fingerprinting begins, so we can later be * certain that index fingerprinting should have reached all tuples * returned by table_index_build_scan(). */ - if (!state->readonly) - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + state->snapshot = RegisterSnapshot(GetTransactionSnapshot()); - /* - * GetTransactionSnapshot() always acquires a new MVCC snapshot in - * READ COMMITTED mode. A new snapshot is guaranteed to have all - * the entries it requires in the index. - * - * We must defend against the possibility that an old xact - * snapshot was returned at higher isolation levels when that - * snapshot is not safe for index scans of the target index. This - * is possible when the snapshot sees tuples that are before the - * index's indcheckxmin horizon. Throwing an error here should be - * very rare. It doesn't seem worth using a secondary snapshot to - * avoid this. - */ - if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && - !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), - snapshot->xmin)) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("index \"%s\" cannot be verified using transaction snapshot", - RelationGetRelationName(rel)))); - } - } + /* + * GetTransactionSnapshot() always acquires a new MVCC snapshot in + * READ COMMITTED mode. A new snapshot is guaranteed to have all + * the entries it requires in the index. + * + * We must defend against the possibility that an old xact + * snapshot was returned at higher isolation levels when that + * snapshot is not safe for index scans of the target index. This + * is possible when the snapshot sees tuples that are before the + * index's indcheckxmin horizon. Throwing an error here should be + * very rare. It doesn't seem worth using a secondary snapshot to + * avoid this. + */ + if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data), + state->snapshot->xmin)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("index \"%s\" cannot be verified using transaction snapshot", + RelationGetRelationName(rel)))); +} /* * We need a snapshot to check the uniqueness of the index. For better @@ -476,9 +472,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->indexinfo = BuildIndexInfo(state->rel); if (state->indexinfo->ii_Unique) { - if (snapshot != SnapshotAny) - state->snapshot = snapshot; - else + if (state->snapshot == InvalidSnapshot) state->snapshot = RegisterSnapshot(GetTransactionSnapshot()); } } @@ -555,13 +549,12 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Create our own scan for table_index_build_scan(), rather than * getting it to do so for us. This is required so that we can - * actually use the MVCC snapshot registered earlier in !readonly - * case. + * actually use the MVCC snapshot registered earlier. * * Note that table_index_build_scan() calls heap_endscan() for us. */ scan = table_beginscan_strat(state->heaprel, /* relation */ - snapshot, /* snapshot */ + state->snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ @@ -569,7 +562,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY - * behaves in !readonly case. + * behaves. 
* * It's okay that we don't actually use the same lock strength for the * heap relation as any other ii_Concurrent caller would in !readonly @@ -578,7 +571,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, * that needs to be sure that there was no concurrent recycling of * TIDs. */ - indexinfo->ii_Concurrent = !state->readonly; + indexinfo->ii_Concurrent = true; /* * Don't wait for uncommitted tuple xact commit/abort when index is a @@ -602,14 +595,11 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, state->heaptuplespresent, RelationGetRelationName(heaprel), 100.0 * bloom_prop_bits_set(state->filter)))); - if (snapshot != SnapshotAny) - UnregisterSnapshot(snapshot); - bloom_free(state->filter); } /* Be tidy: */ - if (snapshot == SnapshotAny && state->snapshot != InvalidSnapshot) + if (state->snapshot != InvalidSnapshot) UnregisterSnapshot(state->snapshot); MemoryContextDelete(state->targetcontext); } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index c3ec2076a52e..96fa31353a0b 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1790,6 +1790,7 @@ DefineIndex(Oid tableId, * before the reference snap was taken, we have to wait out any * transactions that might have older snapshots. */ + INJECTION_POINT("define_index_before_set_valid", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_3); WaitForOlderSnapshots(limitXmin, true); @@ -4195,7 +4196,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * the same time to make sure we only get constraint violations from the * indexes with the correct names. */ - + INJECTION_POINT("reindex_relation_concurrently_before_swap", NULL); StartTransactionCommand(); /* @@ -4274,6 +4275,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * index_drop() for more details. */ + INJECTION_POINT("reindex_relation_concurrently_before_set_dead", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_4); WaitForLockersMultiple(lockTags, AccessExclusiveLock, true); diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c index bdf862b24062..499cba145dd4 100644 --- a/src/backend/executor/execIndexing.c +++ b/src/backend/executor/execIndexing.c @@ -117,6 +117,7 @@ #include "utils/multirangetypes.h" #include "utils/rangetypes.h" #include "utils/snapmgr.h" +#include "utils/injection_point.h" /* waitMode argument to check_exclusion_or_unique_constraint() */ typedef enum @@ -942,6 +943,8 @@ check_exclusion_or_unique_constraint(Relation heap, Relation index, econtext->ecxt_scantuple = save_scantuple; ExecDropSingleTupleTableSlot(existing_slot); + if (!conflict) + INJECTION_POINT("check_exclusion_or_unique_constraint_no_conflict", NULL); return !conflict; } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 514eae1037dc..8851f0fda065 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -486,6 +486,48 @@ ExecFindPartition(ModifyTableState *mtstate, return rri; } +/* + * IsIndexCompatibleAsArbiter + * Checks if the indexes are identical in terms of being used + * as arbiters for the INSERT ON CONFLICT operation by comparing + * them to the provided arbiter index. + * + * Returns the true if indexes are compatible. 
+ */ +static bool +IsIndexCompatibleAsArbiter(Relation arbiterIndexRelation, + IndexInfo *arbiterIndexInfo, + Relation indexRelation, + IndexInfo *indexInfo) +{ + int i; + + if (arbiterIndexInfo->ii_Unique != indexInfo->ii_Unique) + return false; + /* it is not supported for cases of exclusion constraints. */ + if (arbiterIndexInfo->ii_ExclusionOps != NULL || indexInfo->ii_ExclusionOps != NULL) + return false; + if (arbiterIndexRelation->rd_index->indnkeyatts != indexRelation->rd_index->indnkeyatts) + return false; + + for (i = 0; i < indexRelation->rd_index->indnkeyatts; i++) + { + int arbiterAttoNo = arbiterIndexRelation->rd_index->indkey.values[i]; + int attoNo = indexRelation->rd_index->indkey.values[i]; + if (arbiterAttoNo != attoNo) + return false; + } + + if (list_difference(RelationGetIndexExpressions(arbiterIndexRelation), + RelationGetIndexExpressions(indexRelation)) != NIL) + return false; + + if (list_difference(RelationGetIndexPredicate(arbiterIndexRelation), + RelationGetIndexPredicate(indexRelation)) != NIL) + return false; + return true; +} + /* * ExecInitPartitionInfo * Lock the partition and initialize ResultRelInfo. Also setup other @@ -696,6 +738,8 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, if (rootResultRelInfo->ri_onConflictArbiterIndexes != NIL) { List *childIdxs; + List *nonAncestorIdxs = NIL; + int i, j, additional_arbiters = 0; childIdxs = RelationGetIndexList(leaf_part_rri->ri_RelationDesc); @@ -706,23 +750,74 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, EState *estate, ListCell *lc2; ancestors = get_partition_ancestors(childIdx); - foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) + if (ancestors) { - if (list_member_oid(ancestors, lfirst_oid(lc2))) - arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); + foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) + { + if (list_member_oid(ancestors, lfirst_oid(lc2))) + arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); + } } + else /* No ancestor was found for that index. Save it for rechecking later. */ + nonAncestorIdxs = lappend_oid(nonAncestorIdxs, childIdx); list_free(ancestors); } - } - /* - * If the resulting lists are of inequal length, something is wrong. - * (This shouldn't happen, since arbiter index selection should not - * pick up an invalid index.) - */ - if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != - list_length(arbiterIndexes)) - elog(ERROR, "invalid arbiter index list"); + /* + * If any non-ancestor indexes are found, we need to compare them with other + * indexes of the relation that will be used as arbiters. This is necessary + * when a partitioned index is processed by REINDEX CONCURRENTLY. Both indexes + * must be considered as arbiters to ensure that all concurrent transactions + * use the same set of arbiters. 
+ */ + if (nonAncestorIdxs) + { + for (i = 0; i < leaf_part_rri->ri_NumIndices; i++) + { + if (list_member_oid(nonAncestorIdxs, leaf_part_rri->ri_IndexRelationDescs[i]->rd_index->indexrelid)) + { + Relation nonAncestorIndexRelation = leaf_part_rri->ri_IndexRelationDescs[i]; + IndexInfo *nonAncestorIndexInfo = leaf_part_rri->ri_IndexRelationInfo[i]; + Assert(!list_member_oid(arbiterIndexes, nonAncestorIndexRelation->rd_index->indexrelid)); + + /* It is too early to us non-ready indexes as arbiters */ + if (!nonAncestorIndexInfo->ii_ReadyForInserts) + continue; + + for (j = 0; j < leaf_part_rri->ri_NumIndices; j++) + { + if (list_member_oid(arbiterIndexes, + leaf_part_rri->ri_IndexRelationDescs[j]->rd_index->indexrelid)) + { + Relation arbiterIndexRelation = leaf_part_rri->ri_IndexRelationDescs[j]; + IndexInfo *arbiterIndexInfo = leaf_part_rri->ri_IndexRelationInfo[j]; + + /* If non-ancestor index are compatible to arbiter - use it as arbiter too. */ + if (IsIndexCompatibleAsArbiter(arbiterIndexRelation, arbiterIndexInfo, + nonAncestorIndexRelation, nonAncestorIndexInfo)) + { + arbiterIndexes = lappend_oid(arbiterIndexes, + nonAncestorIndexRelation->rd_index->indexrelid); + additional_arbiters++; + } + } + } + } + } + } + list_free(nonAncestorIdxs); + + /* + * If the resulting lists are of inequal length, something is wrong. + * (This shouldn't happen, since arbiter index selection should not + * pick up a non-ready index.) + * + * But we need to consider an additional arbiter indexes also. + */ + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != + list_length(arbiterIndexes) - additional_arbiters) + elog(ERROR, "invalid arbiter index list"); + } leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; /* diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 54da8e7995bd..86c64477eaef 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -70,6 +70,7 @@ #include "utils/datum.h" #include "utils/rel.h" #include "utils/snapmgr.h" +#include "utils/injection_point.h" typedef struct MTTargetRelLookup @@ -1179,6 +1180,7 @@ ExecInsert(ModifyTableContext *context, return NULL; } } + INJECTION_POINT("exec_insert_before_insert_speculative", NULL); /* * Before we start insertion proper, acquire our "speculative diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 59233b647302..0c720e450e99 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -716,12 +716,14 @@ infer_arbiter_indexes(PlannerInfo *root) List *indexList; ListCell *l; - /* Normalized inference attributes and inference expressions: */ - Bitmapset *inferAttrs = NULL; - List *inferElems = NIL; + /* Normalized required attributes and expressions: */ + Bitmapset *requiredArbiterAttrs = NULL; + List *requiredArbiterElems = NIL; + List *requiredIndexPredExprs = (List *) onconflict->arbiterWhere; /* Results */ List *results = NIL; + bool foundValid = false; /* * Quickly return NIL for ON CONFLICT DO NOTHING without an inference @@ -756,8 +758,8 @@ infer_arbiter_indexes(PlannerInfo *root) if (!IsA(elem->expr, Var)) { - /* If not a plain Var, just shove it in inferElems for now */ - inferElems = lappend(inferElems, elem->expr); + /* If not a plain Var, just shove it in requiredArbiterElems for now */ + requiredArbiterElems = lappend(requiredArbiterElems, elem->expr); continue; } @@ -769,30 +771,76 @@ infer_arbiter_indexes(PlannerInfo *root) 
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("whole row unique index inference specifications are not supported")));
 
-		inferAttrs = bms_add_member(inferAttrs,
+		requiredArbiterAttrs = bms_add_member(requiredArbiterAttrs,
 									attno - FirstLowInvalidHeapAttributeNumber);
 	}
 
+	indexList = RelationGetIndexList(relation);
+
 	/*
 	 * Lookup named constraint's index.  This is not immediately returned
-	 * because some additional sanity checks are required.
+	 * because some additional sanity checks are required.  Additionally, we
+	 * need to process other indexes as potential arbiters to account for
+	 * cases where REINDEX CONCURRENTLY is processing an index used as a
+	 * named constraint.
 	 */
 	if (onconflict->constraint != InvalidOid)
 	{
 		indexOidFromConstraint = get_constraint_index(onconflict->constraint);
 
 		if (indexOidFromConstraint == InvalidOid)
+		{
 			ereport(ERROR,
 					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
-					 errmsg("constraint in ON CONFLICT clause has no associated index")));
+					 errmsg("constraint in ON CONFLICT clause has no associated index")));
+		}
+
+		/*
+		 * Find the named constraint's index to extract its attributes and
+		 * predicates.  We open all indexes in the loop to avoid deadlocks
+		 * caused by acquiring the locks in a different order.
+		 */
+		foreach(l, indexList)
+		{
+			Oid			indexoid = lfirst_oid(l);
+			Relation	idxRel;
+			Form_pg_index idxForm;
+			AttrNumber	natt;
+
+			idxRel = index_open(indexoid, rte->rellockmode);
+			idxForm = idxRel->rd_index;
+
+			if (idxForm->indisready)
+			{
+				if (indexOidFromConstraint == idxForm->indexrelid)
+				{
+					/*
+					 * Prepare requirements for other indexes to be used as
+					 * arbiters together with indexOidFromConstraint.  Both
+					 * equal indexes must be involved in the case of REINDEX
+					 * CONCURRENTLY.
+					 */
+					for (natt = 0; natt < idxForm->indnkeyatts; natt++)
+					{
+						int			attno = idxRel->rd_index->indkey.values[natt];
+
+						if (attno != 0)
+							requiredArbiterAttrs = bms_add_member(requiredArbiterAttrs,
+																  attno - FirstLowInvalidHeapAttributeNumber);
+					}
+					requiredArbiterElems = RelationGetIndexExpressions(idxRel);
+					requiredIndexPredExprs = RelationGetIndexPredicate(idxRel);
+					/* We are done, so quit the loop. */
+					index_close(idxRel, NoLock);
+					break;
+				}
+			}
+			index_close(idxRel, NoLock);
+		}
 	}
 
 	/*
 	 * Using that representation, iterate through the list of indexes on the
 	 * target relation to try and find a match
 	 */
-	indexList = RelationGetIndexList(relation);
-
 	foreach(l, indexList)
 	{
 		Oid			indexoid = lfirst_oid(l);
@@ -815,7 +863,13 @@ infer_arbiter_indexes(PlannerInfo *root)
 		idxRel = index_open(indexoid, rte->rellockmode);
 		idxForm = idxRel->rd_index;
 
-		if (!idxForm->indisvalid)
+		/*
+		 * We need to consider both indisvalid and indisready indexes because
+		 * they may become indisvalid before the execution phase.  It is
+		 * required to keep the set of indexes used as arbiters the same for
+		 * all concurrent transactions.
+		 */
+		if (!idxForm->indisready)
 			goto next;
 
 		/*
@@ -835,27 +889,23 @@ infer_arbiter_indexes(PlannerInfo *root)
 			ereport(ERROR,
 					(errcode(ERRCODE_WRONG_OBJECT_TYPE),
 					 errmsg("ON CONFLICT DO UPDATE not supported with exclusion constraints")));
-
-			results = lappend_oid(results, idxForm->indexrelid);
-			list_free(indexList);
-			index_close(idxRel, NoLock);
-			table_close(relation, NoLock);
-			return results;
+			goto found;
 		}
 		else if (indexOidFromConstraint != InvalidOid)
 		{
-			/* No point in further work for index in named constraint case */
-			goto next;
+			/*
+			 * In the case of "ON constraint_name DO UPDATE" we need to skip
+			 * non-unique candidates.
+			 */
+			if (!idxForm->indisunique && onconflict->action == ONCONFLICT_UPDATE)
+				goto next;
+
 		}
 		else
 		{
+			/*
+			 * Only considering conventional inference at this point (not named
+			 * constraints), so index under consideration can be immediately
+			 * skipped if it's not unique
+			 */
+			if (!idxForm->indisunique)
+				goto next;
 		}
 
-		/*
-		 * Only considering conventional inference at this point (not named
-		 * constraints), so index under consideration can be immediately
-		 * skipped if it's not unique
-		 */
-		if (!idxForm->indisunique)
-			goto next;
-
 		/*
 		 * So-called unique constraints with WITHOUT OVERLAPS are really
 		 * exclusion constraints, so skip those too.
@@ -875,7 +925,7 @@
 		}
 
 		/* Non-expression attributes (if any) must match */
-		if (!bms_equal(indexedAttrs, inferAttrs))
+		if (!bms_equal(indexedAttrs, requiredArbiterAttrs))
 			goto next;
 
 		/* Expression attributes (if any) must match */
@@ -883,6 +933,10 @@ infer_arbiter_indexes(PlannerInfo *root)
 		if (idxExprs && varno != 1)
 			ChangeVarNodes((Node *) idxExprs, 1, varno, 0);
 
+		/*
+		 * If arbiterElems are present, check them.  If a named constraint is
+		 * present, arbiterElems == NIL.
+		 */
 		foreach(el, onconflict->arbiterElems)
 		{
 			InferenceElem *elem = (InferenceElem *) lfirst(el);
@@ -920,27 +974,35 @@ infer_arbiter_indexes(PlannerInfo *root)
 		}
 
 		/*
-		 * Now that all inference elements were matched, ensure that the
+		 * In the case of conventional inference, ensure that the
 		 * expression elements from inference clause are not missing any
 		 * cataloged expressions.  This does the right thing when unique
 		 * indexes redundantly repeat the same attribute, or if attributes
 		 * redundantly appear multiple times within an inference clause.
+		 *
+		 * In the case of a named constraint, ensure the candidate has the
+		 * same set of expressions as the named constraint's index.
 		 */
-		if (list_difference(idxExprs, inferElems) != NIL)
+		if (list_difference(idxExprs, requiredArbiterElems) != NIL)
 			goto next;
 
-		/*
-		 * If it's a partial index, its predicate must be implied by the ON
-		 * CONFLICT's WHERE clause.
-		 */
 		predExprs = RelationGetIndexPredicate(idxRel);
 
 		if (predExprs && varno != 1)
 			ChangeVarNodes((Node *) predExprs, 1, varno, 0);
 
-		if (!predicate_implied_by(predExprs, (List *) onconflict->arbiterWhere, false))
+		/*
+		 * If it's a partial index and conventional inference, its predicate
+		 * must be implied by the ON CONFLICT's WHERE clause.
+		 */
+		if (indexOidFromConstraint == InvalidOid && !predicate_implied_by(predExprs, requiredIndexPredExprs, false))
+			goto next;
+		/* If it's a partial index and a named constraint, the predicates must be equal. */
+		if (indexOidFromConstraint != InvalidOid && list_difference(predExprs, requiredIndexPredExprs) != NIL)
 			goto next;
 
+found:
 		results = lappend_oid(results, idxForm->indexrelid);
+		foundValid |= idxForm->indisvalid;
 next:
 		index_close(idxRel, NoLock);
 	}
@@ -948,7 +1010,8 @@ infer_arbiter_indexes(PlannerInfo *root)
 	list_free(indexList);
 	table_close(relation, NoLock);
 
-	if (results == NIL)
+	/* At least one indisvalid index is required at planning time.
*/ + if (results == NIL || !foundValid) ereport(ERROR, (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), errmsg("there is no unique or exclusion constraint matching the ON CONFLICT specification"))); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index ea35f30f4945..ad440ff024c6 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -123,6 +123,7 @@ #include "utils/resowner.h" #include "utils/snapmgr.h" #include "utils/syscache.h" +#include "utils/injection_point.h" /* @@ -447,6 +448,7 @@ InvalidateCatalogSnapshot(void) pairingheap_remove(&RegisteredSnapshots, &CatalogSnapshot->ph_node); CatalogSnapshot = NULL; SnapshotResetXmin(); + INJECTION_POINT("invalidate_catalog_snapshot_end", NULL); } } From 4e79c573e118ea4ce57373efae3e3d64ec045d45 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu Date: Sat, 30 Nov 2024 16:24:20 +0100 Subject: [PATCH 02/12] Add stress tests for concurrent index builds Introduce stress tests for concurrent index operations: - test concurrent inserts/updates during CREATE/REINDEX INDEX CONCURRENTLY - cover various index types (btree, gin, gist, brin, hash, spgist) - test unique and non-unique indexes - test with expressions and predicates - test both parallel and non-parallel operations These tests verify the behavior of the following commits. --- src/bin/pg_amcheck/meson.build | 1 + src/bin/pg_amcheck/t/006_cic.pl | 223 ++++++++++++++++++++++++++++++++ 2 files changed, 224 insertions(+) create mode 100644 src/bin/pg_amcheck/t/006_cic.pl diff --git a/src/bin/pg_amcheck/meson.build b/src/bin/pg_amcheck/meson.build index 316ea0d40b8c..7df15435fbb7 100644 --- a/src/bin/pg_amcheck/meson.build +++ b/src/bin/pg_amcheck/meson.build @@ -28,6 +28,7 @@ tests += { 't/003_check.pl', 't/004_verify_heapam.pl', 't/005_opclass_damage.pl', + 't/006_cic.pl', ], }, } diff --git a/src/bin/pg_amcheck/t/006_cic.pl b/src/bin/pg_amcheck/t/006_cic.pl new file mode 100644 index 000000000000..2aad0e8daa87 --- /dev/null +++ b/src/bin/pg_amcheck/t/006_cic.pl @@ -0,0 +1,223 @@ +# Copyright (c) 2024, PostgreSQL Global Development Group + +# Test REINDEX CONCURRENTLY with concurrent modifications and HOT updates +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +Test::More->builder->todo_start('filesystem bug') + if PostgreSQL::Test::Utils::has_wal_read_bug; + +my ($node, $result); + +# +# Test set-up +# +$node = PostgreSQL::Test::Cluster->new('RC_test'); +$node->init; +$node->append_conf('postgresql.conf', + 'lock_timeout = ' . 
(1000 * $PostgreSQL::Test::Utils::timeout_default)); +$node->append_conf('postgresql.conf', 'fsync = off'); +$node->start; +$node->safe_psql('postgres', q(CREATE EXTENSION amcheck)); +$node->safe_psql('postgres', q(CREATE TABLE tbl(i int primary key, + c1 money default 0, c2 money default 0, + c3 money default 0, updated_at timestamp, + ia int4[], p point))); +$node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i, updated_at);)); +# create sequence +$node->safe_psql('postgres', q(CREATE UNLOGGED SEQUENCE in_row_rebuild START 1 INCREMENT 1;)); +$node->safe_psql('postgres', q(SELECT nextval('in_row_rebuild');)); + +# Create helper functions for predicate tests +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_stable() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN true; + END; $$; +)); + +$node->safe_psql('postgres', q( + CREATE FUNCTION predicate_const(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ + BEGIN + RETURN MOD($1, 2) = 0; + END; $$; +)); + +# Run CIC/RIC in different options concurrently with upserts +$node->pgbench( + '--no-vacuum --client=30 --jobs=4 --exit-on-abort --transactions=2500', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY', + { + 'concurrent_ops' => q( + SET debug_parallel_query = off; -- this is because predicate_stable implementation + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set variant random(0, 5) + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_stable(); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE MOD(i, 2) = 0; + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, updated_at) WHERE predicate_const(i); + \elif :variant = 4 + CREATE INDEX CONCURRENTLY new_idx ON tbl(predicate_const(i)); + \elif :variant = 5 + CREATE INDEX CONCURRENTLY new_idx ON tbl(i, predicate_const(i), updated_at) WHERE predicate_const(i); + \endif + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1000, 100000) + BEGIN; + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + COMMIT; + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for unique index concurrently with upserts +$node->pgbench( + '--no-vacuum --client=30 --jobs=4 --exit-on-abort --transactions=2500', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations 
with REINDEX/CREATE INDEX CONCURRENTLY for unique BTREE', + { + 'concurrent_ops_unique_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE UNIQUE INDEX CONCURRENTLY new_idx ON tbl(i); + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT bt_index_check('new_idx', heapallindexed => true, checkunique => true); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIN with upserts +$node->pgbench( + '--no-vacuum --client=30 --jobs=4 --exit-on-abort --transactions=2500', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_gin_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIN (ia); + \sleep 10 ms + SELECT gin_index_check('new_idx'); + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + SELECT gin_index_check('new_idx'); + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->safe_psql('postgres', q(TRUNCATE TABLE tbl;)); + +# Run CIC/RIC for GIST/BRIN/HASH/SPGIST index concurrently with upserts +$node->pgbench( + '--no-vacuum --client=30 --jobs=4 --exit-on-abort --transactions=2500', + 0, + [qr{actually processed}], + [qr{^$}], + 'concurrent operations with REINDEX/CREATE INDEX CONCURRENTLY for GIN/GIST/BRIN/HASH/SPGIST', + { + 'concurrent_ops_other_idx' => q( + SELECT pg_try_advisory_lock(42)::integer AS gotlock \gset + \if :gotlock + SELECT nextval('in_row_rebuild') AS last_value \gset + \set parallels random(0, 4) + \if :last_value < 3 + ALTER TABLE tbl SET (parallel_workers=:parallels); + \set variant random(0, 3) + \if :variant = 0 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING GIST (p); + \elif :variant = 1 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING BRIN (updated_at); + \elif :variant = 2 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING HASH (updated_at); + \elif :variant = 3 + CREATE INDEX CONCURRENTLY new_idx ON tbl USING SPGIST (p); + \endif + \sleep 10 ms + REINDEX INDEX CONCURRENTLY new_idx; + \sleep 10 ms + DROP INDEX CONCURRENTLY new_idx; + \endif + SELECT pg_advisory_unlock(42); + \else + \set num random(1, power(10, random(1, 5))) + INSERT INTO tbl VALUES(floor(random()*:num),0,0,0,now()) + ON CONFLICT(i) DO UPDATE SET updated_at = now(); + SELECT setval('in_row_rebuild', 1); + \endif + ) + }); + +$node->stop; +done_testing(); \ No newline at end of file From bf1cd14dbb0531ba44311bd45f93491a5f0aa242 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu Date: 
Tue, 31 Dec 2024 21:10:23 +0100
Subject: [PATCH 03/12] Reset snapshots periodically in non-unique
 non-parallel concurrent index builds

Long-living snapshots used by CREATE INDEX CONCURRENTLY and REINDEX
CONCURRENTLY can hold back the global xmin horizon. Commit d9d076222f5b
attempted to allow VACUUM to ignore such snapshots to mitigate this
problem. However, this was reverted in commit e28bb8851969 because it
could cause indexes to miss heap tuples that were HOT-updated and
HOT-pruned during the index creation, leading to index corruption.

This patch introduces an alternative by periodically resetting the
snapshot used during the first phase. By resetting the snapshot every N
pages during the heap scan, it allows the xmin horizon to advance.

Currently, this technique is applied only to:

- the first scan of the heap: the second scan, during index validation,
  still uses a single snapshot to ensure index correctness
- non-parallel index builds: parallel index builds are not yet supported
  and will be addressed in following commits
- non-unique indexes: unique index builds still require a consistent
  snapshot to enforce uniqueness constraints; this will be addressed in
  following commits

A new scan option SO_RESET_SNAPSHOT is introduced. When set, it causes
the snapshot to be reset "between" every SO_RESET_SNAPSHOT_EACH_N_PAGE
pages during the scan. The heap scan code is adjusted to support this
option, and the index build code is modified to use it for applicable
concurrent index builds that are not on system catalogs and not using
parallel workers.
---
 contrib/amcheck/verify_nbtree.c               |   3 +-
 contrib/pgstattuple/pgstattuple.c             |   2 +-
 src/backend/access/brin/brin.c                |  19 +++-
 src/backend/access/gin/gininsert.c            |  21 ++++
 src/backend/access/gist/gistbuild.c           |   3 +
 src/backend/access/hash/hash.c                |   1 +
 src/backend/access/heap/heapam.c              |  45 ++++++++
 src/backend/access/heap/heapam_handler.c      |  57 ++++++++--
 src/backend/access/index/genam.c              |   2 +-
 src/backend/access/nbtree/nbtsort.c           |  30 ++++-
 src/backend/access/spgist/spginsert.c         |   2 +
 src/backend/catalog/index.c                   |  30 ++++-
 src/backend/commands/indexcmds.c              |  14 +--
 src/backend/optimizer/plan/planner.c          |   9 ++
 src/include/access/heapam.h                   |   2 +
 src/include/access/tableam.h                  |  28 ++++-
 src/test/modules/injection_points/Makefile    |   2 +-
 .../expected/cic_reset_snapshots.out          | 105 ++++++++++++++++++
 src/test/modules/injection_points/meson.build |   1 +
 .../sql/cic_reset_snapshots.sql               |  86 ++++++++++++++
 20 files changed, 427 insertions(+), 35 deletions(-)
 create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out
 create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql

diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 3048e044aecb..e59197bb35e9 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -558,7 +558,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
 								 0, /* number of keys */
 								 NULL,	/* scan key */
 								 true,	/* buffer access strategy OK */
-								 true); /* syncscan OK?
*/ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 0d9c2b0b6536..a6dad54ff584 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -335,7 +335,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 4204088fa0d7..a48682b8dbf2 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -1216,11 +1216,12 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) state->bs_sortstate = tuplesort_begin_index_brin(maintenance_work_mem, coordinate, TUPLESORT_NONE); - + InvalidateCatalogSnapshot(); /* scan the relation and merge per-worker results */ reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1233,6 +1234,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = table_index_build_scan(heap, index, indexInfo, false, true, brinbuildCallback, state, NULL); + InvalidateCatalogSnapshot(); /* * process the final batch * @@ -1252,6 +1254,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -2374,6 +2377,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2399,9 +2403,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. @@ -2444,6 +2455,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -2523,6 +2536,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _brin_end_parallel(brinleader, NULL); return; } @@ -2539,6 +2554,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
*/ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index a65acd891049..4cea1612ce6e 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -28,6 +28,7 @@ #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "tcop/tcopprot.h" #include "utils/datum.h" #include "utils/memutils.h" @@ -646,6 +647,8 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) buildstate.accum.ginstate = &buildstate.ginstate; ginInitBA(&buildstate.accum); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_ParallelWorkers || !TransactionIdIsValid(MyProc->xid)); + /* Report table scan phase started */ pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_GIN_PHASE_INDEXBUILD_TABLESCAN); @@ -708,11 +711,13 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) tuplesort_begin_index_gin(heap, index, maintenance_work_mem, coordinate, TUPLESORT_NONE); + InvalidateCatalogSnapshot(); /* scan the relation in parallel and merge per-worker results */ reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -722,6 +727,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) */ reltuples = table_index_build_scan(heap, index, indexInfo, false, true, ginBuildCallback, &buildstate, NULL); + InvalidateCatalogSnapshot(); /* dump remaining entries to the index */ oldCtx = MemoryContextSwitchTo(buildstate.tmpCtx); @@ -735,6 +741,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -907,6 +914,7 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -931,9 +939,16 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. @@ -976,6 +991,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1050,6 +1067,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _gin_end_parallel(ginleader, NULL); return; } @@ -1066,6 +1085,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * sure that the failure-to-start case will not hang forever. 
 */
 	WaitForParallelWorkersToAttach(pcxt);
+	if (need_pop_active_snapshot)
+		PopActiveSnapshot();
 }
 
 /*
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 9e707167d984..56981147ae15 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -43,6 +43,7 @@
 #include "optimizer/optimizer.h"
 #include "storage/bufmgr.h"
 #include "storage/bulk_write.h"
+#include "storage/proc.h"
 #include "utils/memutils.h"
 #include "utils/rel.h"
 
@@ -259,6 +260,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	buildstate.indtuples = 0;
 	buildstate.indtuplesSize = 0;
 
+	Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xid));
 	if (buildstate.buildMode == GIST_SORTED_BUILD)
 	{
 		/*
@@ -350,6 +352,7 @@ gistbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	result->heap_tuples = reltuples;
 	result->index_tuples = (double) buildstate.indtuples;
 
+	Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xid));
 	return result;
 }
 
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 53061c819fbf..3711baea0520 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -197,6 +197,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 	result->heap_tuples = reltuples;
 	result->index_tuples = buildstate.indtuples;
 
+	Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xid));
 	return result;
 }
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 0dcd6ee817e0..6d485b84d9fd 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -53,6 +53,7 @@
 #include "utils/inval.h"
 #include "utils/spccache.h"
 #include "utils/syscache.h"
+#include "utils/injection_point.h"
 
 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
@@ -633,6 +634,36 @@ heap_prepare_pagescan(TableScanDesc sscan)
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 }
 
+/*
+ * Reset the active snapshot during a scan.  This ensures the xmin horizon
+ * can advance while maintaining safe tuple visibility.
+ * Note: no other snapshot should be active during this operation.
+ */
+static inline void
+heap_reset_scan_snapshot(TableScanDesc sscan)
+{
+	/* Make sure no other snapshot was set as active. */
+	Assert(GetActiveSnapshot() == sscan->rs_snapshot);
+	/* And make sure the active snapshot is not registered. */
+	Assert(GetActiveSnapshot()->regd_count == 0);
+	PopActiveSnapshot();
+
+	sscan->rs_snapshot = InvalidSnapshot;	/* just to be tidy */
+	Assert(!HaveRegisteredOrActiveSnapshot());
+	InvalidateCatalogSnapshot();
+
+	/* The goal of the snapshot reset is to allow the horizon to advance. */
+	Assert(!TransactionIdIsValid(MyProc->xmin));
+#if USE_INJECTION_POINTS
+	/* In some cases it is still not possible due to xid assignment. */
+	if (!TransactionIdIsValid(MyProc->xid))
+		INJECTION_POINT("heap_reset_scan_snapshot_effective", NULL);
+#endif
+
+	PushActiveSnapshot(GetLatestSnapshot());
+	sscan->rs_snapshot = GetActiveSnapshot();
+}
+
 /*
 * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
 *
@@ -674,7 +705,12 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
 	scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL);
 
 	if (BufferIsValid(scan->rs_cbuf))
+	{
 		scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf);
+		if ((scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) &&
+			(scan->rs_cblock % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0))
+			heap_reset_scan_snapshot((TableScanDesc) scan);
+	}
 }
 
 /*
@@ -1325,6 +1361,15 @@ heap_endscan(TableScanDesc sscan)
 	if (scan->rs_parallelworkerdata != NULL)
 		pfree(scan->rs_parallelworkerdata);
 
+	if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT)
+	{
+		Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT));
+		/* Make sure no other snapshot was set as active. */
+		Assert(GetActiveSnapshot() == sscan->rs_snapshot);
+		/* And make sure the snapshot is not registered. */
+		Assert(GetActiveSnapshot()->regd_count == 0);
+	}
+
 	if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
 		UnregisterSnapshot(scan->rs_base.rs_snapshot);
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index cb4bc35c93ed..3b4d3c4d5819 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1194,6 +1194,8 @@ heapam_index_build_range_scan(Relation heapRelation,
 	ExprContext *econtext;
 	Snapshot	snapshot;
 	bool		need_unregister_snapshot = false;
+	bool		need_pop_active_snapshot = false;
+	bool		reset_snapshots = false;
 	TransactionId OldestXmin;
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
@@ -1228,9 +1230,6 @@ heapam_index_build_range_scan(Relation heapRelation,
 	/* Arrange for econtext's scan tuple to be the tuple under test */
 	econtext->ecxt_scantuple = slot;
 
-	/* Set up execution state for predicate, if any. */
-	predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
-
 	/*
 	 * Prepare for scan of the base relation.  In a normal index build, we use
 	 * SnapshotAny because we must retrieve all tuples and do our own time
@@ -1240,6 +1239,15 @@
 	 */
 	OldestXmin = InvalidTransactionId;
 
+	/*
+	 * For a unique index we need a consistent snapshot for the whole scan.
+	 * In the case of a parallel scan, some additional infrastructure is
+	 * required to perform the scan with SO_RESET_SNAPSHOT; it is not yet
+	 * ready.
+	 */
+	reset_snapshots = indexInfo->ii_Concurrent &&
+		!indexInfo->ii_Unique &&
+		!is_system_catalog;		/* just in case */
+
 	/* okay to ignore lazy VACUUMs here */
 	if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
 		OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
@@ -1248,24 +1256,41 @@
 	{
 		/*
 		 * Serial index build.
-		 *
-		 * Must begin our own heap scan in this case.  We may also need to
-		 * register a snapshot whose lifetime is under our direct control.
 		 */
 		if (!TransactionIdIsValid(OldestXmin))
 		{
-			snapshot = RegisterSnapshot(GetTransactionSnapshot());
-			need_unregister_snapshot = true;
+			snapshot = GetTransactionSnapshot();
+			/*
+			 * Must begin our own heap scan in this case.  We may also need
+			 * to register a snapshot whose lifetime is under our direct
+			 * control.  When the snapshot is reset during the scan,
+			 * registration is not allowed because the snapshot is going to
+			 * be changed every so often.
+ */ + if (!reset_snapshots) + { + snapshot = RegisterSnapshot(snapshot); + need_unregister_snapshot = true; + } + Assert(!ActiveSnapshotSet()); + PushActiveSnapshot(snapshot); + /* store link to snapshot because it may be copied */ + snapshot = GetActiveSnapshot(); + need_pop_active_snapshot = true; } else + { + Assert(!indexInfo->ii_Concurrent); snapshot = SnapshotAny; + } scan = table_beginscan_strat(heapRelation, /* relation */ snapshot, /* snapshot */ 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - allow_sync); /* syncscan OK? */ + allow_sync, /* syncscan OK? */ + reset_snapshots /* reset snapshots? */); } else { @@ -1279,6 +1304,8 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; } hscan = (HeapScanDesc) scan; @@ -1293,6 +1320,13 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. */ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1728,6 +1762,8 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) UnregisterSnapshot(snapshot); @@ -1800,7 +1836,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 0cb27af13109..c9c530447480 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -464,7 +464,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 9d70e89c1f3c..47340de1d328 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -258,7 +258,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index, static void _bt_spooldestroy(BTSpool *btspool); static void _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull); -static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2); +static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots); static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values, bool *isnull, bool tupleIsAlive, void *state); static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level); @@ -321,18 +321,22 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + 
!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. */ - _bt_leafbuild(buildstate.spool, buildstate.spool2); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); @@ -480,6 +484,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, else reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); + InvalidateCatalogSnapshot(); + Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || + !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -535,7 +542,7 @@ _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull) * create an entire btree. */ static void -_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) +_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots) { BTWriteState wstate; @@ -557,18 +564,21 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) PROGRESS_BTREE_PHASE_PERFORMSORT_2); tuplesort_performsort(btspool2->sortstate); } + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); wstate.heap = btspool->heap; wstate.index = btspool->index; wstate.inskey = _bt_mkscankey(wstate.index, NULL); /* _bt_mkscankey() won't set allequalimage without metapage */ wstate.inskey->allequalimage = _bt_allequalimage(wstate.index, true); + InvalidateCatalogSnapshot(); /* reserve the metapage */ wstate.btws_pages_alloced = BTREE_METAPAGE + 1; pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE, PROGRESS_BTREE_PHASE_LEAF_LOAD); + Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin)); _bt_load(&wstate, btspool, btspool2); } @@ -1409,6 +1419,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1434,9 +1445,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. 
*/ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1490,6 +1508,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1584,6 +1604,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1600,6 +1622,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. */ WaitForParallelWorkersToAttach(pcxt); + if (need_pop_active_snapshot) + PopActiveSnapshot(); } /* diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 6a61e093fa05..06c01cf3360d 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -24,6 +24,7 @@ #include "nodes/execnodes.h" #include "storage/bufmgr.h" #include "storage/bulk_write.h" +#include "storage/proc.h" #include "utils/memutils.h" #include "utils/rel.h" @@ -143,6 +144,7 @@ spgbuild(Relation heap, Relation index, IndexInfo *indexInfo) result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 739a92bdcc1c..cbd0ba9aa01d 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -80,6 +80,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* Potentially set by pg_upgrade_support functions */ Oid binary_upgrade_next_index_pg_class_oid = InvalidOid; @@ -1492,8 +1493,8 @@ index_concurrently_build(Oid heapRelationId, Relation indexRelation; IndexInfo *indexInfo; - /* This had better make sure that a snapshot is active */ - Assert(ActiveSnapshotSet()); + Assert(!TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xid)); /* Open and lock the parent heap relation */ heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock); @@ -1511,19 +1512,28 @@ index_concurrently_build(Oid heapRelationId, indexRelation = index_open(indexRelationId, RowExclusiveLock); + /* BuildIndexInfo may require as snapshot for expressions and predicates */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * We have to re-build the IndexInfo struct, since it was lost in the * commit of the transaction where this concurrent index was created at * the catalog level. 
*/ indexInfo = BuildIndexInfo(indexRelation); + /* Done with snapshot */ + PopActiveSnapshot(); Assert(!indexInfo->ii_ReadyForInserts); indexInfo->ii_Concurrent = true; indexInfo->ii_BrokenHotChain = false; + Assert(!TransactionIdIsValid(MyProc->xmin)); /* Now build the index */ index_build(heapRel, indexRelation, indexInfo, false, true); + /* Invalidate catalog snapshot just for assert */ + InvalidateCatalogSnapshot(); + Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -1534,12 +1544,19 @@ index_concurrently_build(Oid heapRelationId, table_close(heapRel, NoLock); index_close(indexRelation, NoLock); + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); /* * Update the pg_index row to mark the index as ready for inserts. Once we * commit this transaction, any new transactions that open the table must * insert new entries into the index for insertions and non-HOT updates. */ index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY); + /* we can do away with our snapshot */ + PopActiveSnapshot(); } /* @@ -3236,7 +3253,8 @@ IndexCheckExclusion(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK */ + true, /* syncscan OK */ + false); while (table_scan_getnextslot(scan, ForwardScanDirection, slot)) { @@ -3299,12 +3317,16 @@ IndexCheckExclusion(Relation heapRelation, * as of the start of the scan (see table_index_build_scan), whereas a normal * build takes care to include recently-dead tuples. This is OK because * we won't mark the index valid until all transactions that might be able - * to see those tuples are gone. The reason for doing that is to avoid + * to see those tuples are gone. One of the reasons for doing that is to avoid * bogus unique-index failures due to concurrent UPDATEs (we might see * different versions of the same row as being valid when we pass over them, * if we used HeapTupleSatisfiesVacuum). This leaves us with an index that * does not contain any tuples added to the table while we built the index. * + * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for + * the scan, which causes a new snapshot to be set as active every so often. The + * reason for that is to move the xmin horizon forward. + * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all * transactions that could have been modifying the table to terminate. Now diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 96fa31353a0b..62bdcb6e551d 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1694,23 +1694,17 @@ DefineIndex(Oid tableId, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We now take a new snapshot, and build the index using all tuples that - * are visible in this snapshot. We can be sure that any HOT updates to + * We build the index using all tuples that are visible under a single + * snapshot or multiple periodically refreshed snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back.
Thus, each visible tuple is either the end of its * HOT-chain or the extension of the chain is HOT-safe for this index. */ - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* Perform concurrent build of index */ index_concurrently_build(tableId, indexRelationId); - /* we can do away with our snapshot */ - PopActiveSnapshot(); - /* * Commit this transaction to make the indisready update visible. */ @@ -4073,9 +4067,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* Set ActiveSnapshot since functions in the indexes may need it */ - PushActiveSnapshot(GetTransactionSnapshot()); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4090,7 +4081,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* Perform concurrent build of new index */ index_concurrently_build(newidx->tableId, newidx->indexId); - PopActiveSnapshot(); CommitTransactionCommand(); } diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index ff65867eebee..0d5e54e0cc23 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -62,6 +62,7 @@ #include "utils/lsyscache.h" #include "utils/rel.h" #include "utils/selfuncs.h" +#include "utils/snapmgr.h" /* GUC parameters */ double cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION; @@ -6899,6 +6900,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) Relation heap; Relation index; RelOptInfo *rel; + bool need_pop_active_snapshot = false; int parallel_workers; BlockNumber heap_blocks; double reltuples; @@ -6954,6 +6956,11 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) heap = table_open(tableOid, NoLock); index = index_open(indexOid, NoLock); + /* Set ActiveSnapshot since functions in the indexes may need it */ + if (!ActiveSnapshotSet()) { + PushActiveSnapshot(GetTransactionSnapshot()); + need_pop_active_snapshot = true; + } /* * Determine if it's safe to proceed. * @@ -7011,6 +7018,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid) parallel_workers--; done: + if (need_pop_active_snapshot) + PopActiveSnapshot(); index_close(index, NoLock); table_close(heap, NoLock); diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index e48fe434cd39..6caad42ea4c0 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -42,6 +42,8 @@ #define HEAP_PAGE_PRUNE_MARK_UNUSED_NOW (1 << 0) #define HEAP_PAGE_PRUNE_FREEZE (1 << 1) +#define SO_RESET_SNAPSHOT_EACH_N_PAGE 4096 + typedef struct BulkInsertStateData *BulkInsertState; struct TupleTableSlot; struct VacuumCutoffs; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8713e12cbfb9..8df6ba9b89e6 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -24,6 +24,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "utils/injection_point.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -62,6 +63,17 @@ typedef enum ScanOptions /* unregister snapshot at scan end? */ SO_TEMP_SNAPSHOT = 1 << 9, + /* + * Reset the scan and catalog snapshots every so often? If so, every + * SO_RESET_SNAPSHOT_EACH_N_PAGE pages the active snapshot is popped, + * the catalog snapshot is invalidated, and the latest snapshot is pushed + * as active. + * + * At the end of the scan the snapshot is not popped. 
+ * The goal of this mode is to keep the xmin horizon moving forward. + * + * See heap_reset_scan_snapshot for details. + */ + SO_RESET_SNAPSHOT = 1 << 10, } ScanOptions; /* @@ -893,7 +905,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys, static inline TableScanDesc table_beginscan_strat(Relation rel, Snapshot snapshot, int nkeys, struct ScanKeyData *key, - bool allow_strat, bool allow_sync) + bool allow_strat, bool allow_sync, + bool reset_snapshot) { uint32 flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE; @@ -901,6 +914,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot, flags |= SO_ALLOW_STRAT; if (allow_sync) flags |= SO_ALLOW_SYNC; + if (reset_snapshot) + { + INJECTION_POINT("table_beginscan_strat_reset_snapshots", NULL); + /* Active snapshot is required on start. */ + Assert(GetActiveSnapshot() == snapshot); + /* Active snapshot should not be registered, to keep the xmin horizon advancing. */ + Assert(GetActiveSnapshot()->regd_count == 0); + flags |= (SO_RESET_SNAPSHOT); + } return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags); } @@ -1730,6 +1752,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * very hard to detect whether they're really incompatible with the chain tip. * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. + * + * In case of non-unique index and non-parallel concurrent build + * SO_RESET_SNAPSHOT is applied for the scan. That leads for changing snapshots + * on the fly to allow xmin horizon propagate. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index e680991f8d4f..19d26408c2a4 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -11,7 +11,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc +REGRESS = injection_points hashagg reindex_conc cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic inplace syscache-update-pruned diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 000000000000..948d1232aa0c --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 +1,105 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER 
TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point 
table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index d61149712fd7..8476bfe72a7f 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -37,6 +37,7 @@ tests += { 'injection_points', 'hashagg', 'reindex_conc', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 000000000000..5072535b3555 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,86 @@ +CREATE EXTENSION injection_points; + +SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX 
CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; From cdca32ed680dcf6838a31e0eedf7cb707c6921da Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu Date: Wed, 1 Jan 2025 15:25:20 +0100 Subject: [PATCH 04/12] Support snapshot resets in parallel concurrent index builds Extend periodic snapshot reset support to parallel builds, previously limited to non-parallel operations. This allows the xmin horizon to advance during parallel concurrent index builds as well. The main limitation in applying that technique to parallel builds was the requirement to wait until worker processes have restored their initial snapshot from the leader. 
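The handshake that removes this limitation is small; a condensed sketch of both sides, extracted from the parallel.c changes below (shared-memory setup and error handling omitted):

    /* Worker side, in ParallelWorkerMain(), right after restoring the leader's snapshot: */
    PushActiveSnapshot(asnapshot);
    /* the worker now holds its own xmin; let the leader know */
    snapshot_restored_space[ParallelWorkerNumber] = true;

    /* Leader side, in WaitForParallelWorkersToAttach(), for each worker i: */
    if (shm_mq_get_sender(mq) != NULL)
    {
        /* attached; with wait_for_snapshot, also require the restored flag */
        if (!wait_for_snapshot || *(pcxt->worker[i].snapshot_restored))
        {
            pcxt->known_attached_workers[i] = true;
            ++pcxt->nknown_attached_workers;
        }
    }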
To address this, the following changes were applied: - add infrastructure to track snapshot restoration in parallel workers - extend parallel scan initialization to support periodic snapshot resets - wait for parallel workers to restore their initial snapshots before proceeding with the scan - relax the limitation preventing parallel workers from calling GetLatestSnapshot --- src/backend/access/brin/brin.c | 50 +++++++++------- src/backend/access/gin/gininsert.c | 50 +++++++++------- src/backend/access/heap/heapam_handler.c | 12 ++-- src/backend/access/nbtree/nbtsort.c | 57 ++++++++++++++----- src/backend/access/table/tableam.c | 37 ++++++++++-- src/backend/access/transam/parallel.c | 50 ++++++++++++++-- src/backend/catalog/index.c | 2 +- src/backend/executor/nodeSeqscan.c | 3 +- src/backend/utils/time/snapmgr.c | 8 --- src/include/access/parallel.h | 3 +- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 9 +-- .../expected/cic_reset_snapshots.out | 25 +++++++- .../sql/cic_reset_snapshots.sql | 7 ++- 14 files changed, 225 insertions(+), 89 deletions(-) diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index a48682b8dbf2..947dc79b138d 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -143,7 +143,6 @@ typedef struct BrinLeader */ BrinShared *brinshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } BrinLeader; @@ -231,7 +230,7 @@ static void brin_fill_empty_ranges(BrinBuildState *state, static void _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state); -static Size _brin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _brin_parallel_estimate_shared(Relation heap); static double _brin_parallel_heapscan(BrinBuildState *state); static double _brin_parallel_merge(BrinBuildState *state); static void _brin_leader_participate_as_worker(BrinBuildState *buildstate, @@ -1221,7 +1220,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _brin_parallel_merge(state); _brin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -1254,7 +1252,6 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) brin_fill_empty_ranges(state, state->bs_currRangeStart, state->bs_maxRangeStart); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } /* release resources */ @@ -1269,6 +1266,7 @@ brinbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = idxtuples; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xid)); return result; } @@ -2368,7 +2366,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estbrinshared; Size estsort; BrinShared *brinshared; @@ -2399,25 +2396,25 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. 
+ * concurrent build, we take a regular MVCC snapshot and push it as active. + * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. */ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. */ - estbrinshared = _brin_parallel_estimate_shared(heap, snapshot); + estbrinshared = _brin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estbrinshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -2457,8 +2454,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -2483,7 +2478,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromBrinShared(brinshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -2529,7 +2525,6 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, brinleader->nparticipanttuplesorts++; brinleader->brinshared = brinshared; brinleader->sharedsort = sharedsort; - brinleader->snapshot = snapshot; brinleader->walusage = walusage; brinleader->bufferusage = bufferusage; @@ -2545,6 +2540,13 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = brinleader; + /* + * In the case of a concurrent build, snapshots are going to be reset + * periodically. We need to wait until all workers have imported the + * initial snapshot. + */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _brin_leader_participate_as_worker(buildstate, heap, index); @@ -2553,7 +2555,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); } @@ -2576,9 +2579,6 @@ _brin_end_parallel(BrinLeader *brinleader, BrinBuildState *state) for (i = 0; i < brinleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&brinleader->bufferusage[i], &brinleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(brinleader->snapshot)) - UnregisterSnapshot(brinleader->snapshot); DestroyParallelContext(brinleader->pcxt); ExitParallelMode(); } @@ -2778,14 +2778,14 @@ _brin_parallel_merge(BrinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * brin index build based on the snapshot its parallel scan will use. + * brin index build. */ static Size -_brin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_brin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(BrinShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -2807,6 +2807,7 @@ _brin_leader_participate_as_worker(BrinBuildState *buildstate, Relation heap, Re /* Perform work common to all participants */ _brin_parallel_scan_and_build(buildstate, brinleader->brinshared, brinleader->sharedsort, heap, index, sortmem, true); + Assert(!brinleader->brinshared->isconcurrent || !TransactionIdIsValid(MyProc->xid)); } /* @@ -2947,6 +2948,13 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _brin_parallel_scan_and_build(buildstate, brinshared, sharedsort, heapRel, indexRel, sortmem, false); + if (brinshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xid)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 4cea1612ce6e..629f6d5f2c0a 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -132,7 +132,6 @@ typedef struct GinLeader */ GinBuildShared *ginshared; Sharedsort *sharedsort; - Snapshot snapshot; WalUsage *walusage; BufferUsage *bufferusage; } GinLeader; @@ -180,7 +179,7 @@ typedef struct static void _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, bool isconcurrent, int request); static void _gin_end_parallel(GinLeader *ginleader, GinBuildState *state); -static Size _gin_parallel_estimate_shared(Relation heap, Snapshot snapshot); +static Size _gin_parallel_estimate_shared(Relation heap); static double _gin_parallel_heapscan(GinBuildState *state); static double _gin_parallel_merge(GinBuildState *state); static void _gin_leader_participate_as_worker(GinBuildState *buildstate, @@ -717,7 +716,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) reltuples = _gin_parallel_merge(state); _gin_end_parallel(state->bs_leader, state); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } else /* no parallel index build */ { @@ -741,7 +739,6 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) list, nlist, &buildstate.buildStats); } MemoryContextSwitchTo(oldCtx); - Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); } MemoryContextDelete(buildstate.funcCtx); @@ -771,6 +768,7 @@ ginbuild(Relation heap, Relation index, IndexInfo *indexInfo) result->heap_tuples = reltuples; result->index_tuples = buildstate.indtuples; + Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); return result; } @@ -905,7 +903,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { ParallelContext *pcxt; int scantuplesortstates; - Snapshot snapshot; Size estginshared; Size estsort; GinBuildShared *ginshared; @@ -935,25 +932,25 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a - * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * concurrent build, we take a regular MVCC snapshot and push it as active. 
+ * Later we index whatever's live according to that snapshot while that + * snapshot is reset periodically. */ if (!isconcurrent) { Assert(ActiveSnapshotSet()); - snapshot = SnapshotAny; need_pop_active_snapshot = false; } else { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); + Assert(!ActiveSnapshotSet()); PushActiveSnapshot(GetTransactionSnapshot()); } /* * Estimate size for our own PARALLEL_KEY_GIN_SHARED workspace. */ - estginshared = _gin_parallel_estimate_shared(heap, snapshot); + estginshared = _gin_parallel_estimate_shared(heap); shm_toc_estimate_chunk(&pcxt->estimator, estginshared); estsort = tuplesort_estimate_shared(scantuplesortstates); shm_toc_estimate_chunk(&pcxt->estimator, estsort); @@ -993,8 +990,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) - UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); return; @@ -1018,7 +1013,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, table_parallelscan_initialize(heap, ParallelTableScanFromGinBuildShared(ginshared), - snapshot); + isconcurrent ? InvalidSnapshot : SnapshotAny, + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1060,7 +1056,6 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, ginleader->nparticipanttuplesorts++; ginleader->ginshared = ginshared; ginleader->sharedsort = sharedsort; - ginleader->snapshot = snapshot; ginleader->walusage = walusage; ginleader->bufferusage = bufferusage; @@ -1076,6 +1071,13 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, /* Save leader state now that it's clear build will be parallel */ buildstate->bs_leader = ginleader; + /* + * In the case of a concurrent build, snapshots are going to be reset + * periodically. We need to wait until all workers have imported the + * initial snapshot. + */ + if (isconcurrent) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _gin_leader_participate_as_worker(buildstate, heap, index); @@ -1084,7 +1086,8 @@ _gin_begin_parallel(GinBuildState *buildstate, Relation heap, Relation index, * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!isconcurrent) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); } @@ -1107,9 +1110,6 @@ _gin_end_parallel(GinLeader *ginleader, GinBuildState *state) for (i = 0; i < ginleader->pcxt->nworkers_launched; i++) InstrAccumParallelQuery(&ginleader->bufferusage[i], &ginleader->walusage[i]); - /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(ginleader->snapshot)) - UnregisterSnapshot(ginleader->snapshot); DestroyParallelContext(ginleader->pcxt); ExitParallelMode(); } @@ -1790,14 +1790,14 @@ _gin_parallel_merge(GinBuildState *state) /* * Returns size of shared memory required to store state for a parallel - * gin index build based on the snapshot its parallel scan will use. + * gin index build. */ static Size -_gin_parallel_estimate_shared(Relation heap, Snapshot snapshot) +_gin_parallel_estimate_shared(Relation heap) { /* c.f. 
shm_toc_allocate as to why BUFFERALIGN is used */ return add_size(BUFFERALIGN(sizeof(GinBuildShared)), - table_parallelscan_estimate(heap, snapshot)); + table_parallelscan_estimate(heap, InvalidSnapshot)); } /* @@ -1820,6 +1820,7 @@ _gin_leader_participate_as_worker(GinBuildState *buildstate, Relation heap, Rela _gin_parallel_scan_and_build(buildstate, ginleader->ginshared, ginleader->sharedsort, heap, index, sortmem, true); + Assert(!ginleader->ginshared->isconcurrent || !TransactionIdIsValid(MyProc->xid)); } /* @@ -2179,6 +2180,13 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc) _gin_parallel_scan_and_build(&buildstate, ginshared, sharedsort, heapRel, indexRel, sortmem, false); + if (ginshared->isconcurrent) + { + PopActiveSnapshot(); + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xid)); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* Report WAL/buffer usage during parallel execution */ bufferusage = shm_toc_lookup(toc, PARALLEL_KEY_BUFFER_USAGE, false); diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 3b4d3c4d5819..4cbbf7f2d707 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1235,14 +1235,13 @@ heapam_index_build_range_scan(Relation heapRelation, * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot - * and index whatever's live according to that. + * and index whatever's live according to that while that snapshot is reset + * every so often (in case of non-unique index). */ OldestXmin = InvalidTransactionId; /* * For unique index we need consistent snapshot for the whole scan. - * In case of parallel scan some additional infrastructure required - * to perform scan with SO_RESET_SNAPSHOT which is not yet ready. */ reset_snapshots = indexInfo->ii_Concurrent && !indexInfo->ii_Unique && @@ -1304,8 +1303,11 @@ heapam_index_build_range_scan(Relation heapRelation, Assert(!IsBootstrapProcessingMode()); Assert(allow_sync); snapshot = scan->rs_snapshot; - PushActiveSnapshot(snapshot); - need_pop_active_snapshot = true; + if (!reset_snapshots) + { + PushActiveSnapshot(snapshot); + need_pop_active_snapshot = true; + } } hscan = (HeapScanDesc) scan; diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 47340de1d328..052ebfe6a211 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -321,22 +321,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo) RelationGetRelationName(index)); reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Finish the build by (1) completing the sort of the spool file, (2) * inserting the sorted tuples into btree pages and (3) building the upper * levels. Finally, it may also be necessary to end use of parallelism. 
*/ - _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_ParallelWorkers && indexInfo->ii_Concurrent); + _bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent); _bt_spooldestroy(buildstate.spool); if (buildstate.spool2) _bt_spooldestroy(buildstate.spool2); if (buildstate.btleader) _bt_end_parallel(buildstate.btleader); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult)); @@ -485,8 +483,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate, reltuples = _bt_parallel_heapscan(buildstate, &indexInfo->ii_BrokenHotChain); InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique || - !indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin)); + Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* * Set the progress target for the next phase. Reset the block number @@ -1420,6 +1417,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; + bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1437,12 +1435,21 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; + /* + * For concurrent non-unique index builds, we can periodically reset snapshots + * to allow the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. Unique indexes still need + * a stable snapshot to properly enforce uniqueness constraints. + */ + reset_snapshot = isconcurrent && !btspool->isunique; + /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that. + * live according to that, while that snapshot may be reset periodically in + * case of non-unique index. */ if (!isconcurrent) { @@ -1450,6 +1457,11 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } + else if (reset_snapshot) + { + snapshot = InvalidSnapshot; + PushActiveSnapshot(GetTransactionSnapshot()); + } else { snapshot = RegisterSnapshot(GetTransactionSnapshot()); @@ -1510,7 +1522,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) { if (need_pop_active_snapshot) PopActiveSnapshot(); - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); ExitParallelMode(); @@ -1537,7 +1549,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->brokenhotchain = false; table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), - snapshot); + snapshot, + reset_snapshot); /* * Store shared tuplesort-private state, for which we reserved space. 
@@ -1613,6 +1626,13 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* Save leader state now that it's clear build will be parallel */ buildstate->btleader = btleader; + /* + * In the case of a concurrent build, snapshots are going to be reset + * periodically. Wait until all workers have imported the initial snapshot. + */ + if (reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, true); + /* Join heap scan ourselves */ if (leaderparticipates) _bt_leader_participate_as_worker(buildstate); @@ -1621,7 +1641,8 @@ * Caller needs to wait for all launched workers when we return. Make * sure that the failure-to-start case will not hang forever. */ - WaitForParallelWorkersToAttach(pcxt); + if (!reset_snapshot) + WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); } @@ -1645,7 +1666,7 @@ _bt_end_parallel(BTLeader *btleader) InstrAccumParallelQuery(&btleader->bufferusage[i], &btleader->walusage[i]); /* Free last reference to MVCC snapshot, if one was used */ - if (IsMVCCSnapshot(btleader->snapshot)) + if (btleader->snapshot != InvalidSnapshot && IsMVCCSnapshot(btleader->snapshot)) UnregisterSnapshot(btleader->snapshot); DestroyParallelContext(btleader->pcxt); ExitParallelMode(); @@ -1895,6 +1916,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, SortCoordinate coordinate; BTBuildState buildstate; TableScanDesc scan; + ParallelTableScanDesc pscan; double reltuples; IndexInfo *indexInfo; @@ -1949,11 +1971,15 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, /* Join parallel scan */ indexInfo = BuildIndexInfo(btspool->index); indexInfo->ii_Concurrent = btshared->isconcurrent; - scan = table_beginscan_parallel(btspool->heap, - ParallelTableScanFromBTShared(btshared)); + pscan = ParallelTableScanFromBTShared(btshared); + scan = table_beginscan_parallel(btspool->heap, pscan); reltuples = table_index_build_scan(btspool->heap, btspool->index, indexInfo, true, progress, _bt_build_callback, &buildstate, scan); + InvalidateCatalogSnapshot(); + if (pscan->phs_reset_snapshot) + PopActiveSnapshot(); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); /* Execute this worker's part of the sort */ if (progress) @@ -1989,4 +2015,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, tuplesort_end(btspool->sortstate); if (btspool2) tuplesort_end(btspool2->sortstate); + Assert(!pscan->phs_reset_snapshot || !TransactionIdIsValid(MyProc->xmin)); + if (pscan->phs_reset_snapshot) + PushActiveSnapshot(GetTransactionSnapshot()); } diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c index a56c5eceb14a..6f04c365994e 100644 --- a/src/backend/access/table/tableam.c +++ b/src/backend/access/table/tableam.c @@ -132,10 +132,10 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) { Size sz = 0; - if (IsMVCCSnapshot(snapshot)) + if (snapshot != InvalidSnapshot && IsMVCCSnapshot(snapshot)) sz = add_size(sz, EstimateSnapshotSpace(snapshot)); else - Assert(snapshot == SnapshotAny); + Assert(snapshot == SnapshotAny || snapshot == InvalidSnapshot); sz = add_size(sz, rel->rd_tableam->parallelscan_estimate(rel)); @@ -144,21 +144,36 @@ table_parallelscan_estimate(Relation rel, Snapshot snapshot) void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot) + Snapshot snapshot, bool reset_snapshot) { Size snapshot_off = 
rel->rd_tableam->parallelscan_initialize(rel, pscan); pscan->phs_snapshot_off = snapshot_off; - if (IsMVCCSnapshot(snapshot)) + /* + * Initialize parallel scan description. For normal scans with a regular + * MVCC snapshot, serialize the snapshot info. For scans that use periodic + * snapshot resets, mark the scan accordingly. + */ + if (reset_snapshot) + { + Assert(snapshot == InvalidSnapshot); + pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = true; + INJECTION_POINT("table_parallelscan_initialize", NULL); + } + else if (IsMVCCSnapshot(snapshot)) { SerializeSnapshot(snapshot, (char *) pscan + pscan->phs_snapshot_off); pscan->phs_snapshot_any = false; + pscan->phs_reset_snapshot = false; } else { Assert(snapshot == SnapshotAny); + Assert(!reset_snapshot); pscan->phs_snapshot_any = true; + pscan->phs_reset_snapshot = false; } } @@ -171,7 +186,19 @@ table_beginscan_parallel(Relation relation, ParallelTableScanDesc pscan) Assert(RelFileLocatorEquals(relation->rd_locator, pscan->phs_locator)); - if (!pscan->phs_snapshot_any) + /* + * For scans that + * use periodic snapshot resets, mark the scan accordingly and use the active + * snapshot as the initial state. + */ + if (pscan->phs_reset_snapshot) + { + Assert(ActiveSnapshotSet()); + flags |= SO_RESET_SNAPSHOT; + /* Start with current active snapshot. */ + snapshot = GetActiveSnapshot(); + } + else if (!pscan->phs_snapshot_any) { /* Snapshot was serialized -- restore it */ snapshot = RestoreSnapshot((char *) pscan + pscan->phs_snapshot_off); diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 94db1ec30126..065ea9d26f68 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -77,6 +77,7 @@ #define PARALLEL_KEY_RELMAPPER_STATE UINT64CONST(0xFFFFFFFFFFFF000D) #define PARALLEL_KEY_UNCOMMITTEDENUMS UINT64CONST(0xFFFFFFFFFFFF000E) #define PARALLEL_KEY_CLIENTCONNINFO UINT64CONST(0xFFFFFFFFFFFF000F) +#define PARALLEL_KEY_SNAPSHOT_RESTORED UINT64CONST(0xFFFFFFFFFFFF0010) /* Fixed-size parallel state. */ typedef struct FixedParallelState @@ -305,6 +306,10 @@ InitializeParallelDSM(ParallelContext *pcxt) pcxt->nworkers)); shm_toc_estimate_keys(&pcxt->estimator, 1); + shm_toc_estimate_chunk(&pcxt->estimator, mul_size(sizeof(bool), + pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + /* Estimate how much we'll need for the entrypoint info. */ shm_toc_estimate_chunk(&pcxt->estimator, strlen(pcxt->library_name) + strlen(pcxt->function_name) + 2); @@ -376,6 +381,7 @@ InitializeParallelDSM(ParallelContext *pcxt) char *entrypointstate; char *uncommittedenumsspace; char *clientconninfospace; + bool *snapshot_set_flag_space; Size lnamelen; /* Serialize shared libraries we have loaded. */ @@ -491,6 +497,19 @@ InitializeParallelDSM(ParallelContext *pcxt) strcpy(entrypointstate, pcxt->library_name); strcpy(entrypointstate + lnamelen + 1, pcxt->function_name); shm_toc_insert(pcxt->toc, PARALLEL_KEY_ENTRYPOINT, entrypointstate); + + /* + * Establish dynamic shared memory to pass information about importing + * of snapshot. + */ + snapshot_set_flag_space = + shm_toc_allocate(pcxt->toc, mul_size(sizeof(bool), pcxt->nworkers)); + for (i = 0; i < pcxt->nworkers; ++i) + { + pcxt->worker[i].snapshot_restored = snapshot_set_flag_space + i * sizeof(bool); + *pcxt->worker[i].snapshot_restored = false; + } + shm_toc_insert(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, snapshot_set_flag_space); } /* Update nworkers_to_launch, in case we changed nworkers above. 
*/ @@ -546,6 +565,17 @@ ReinitializeParallelDSM(ParallelContext *pcxt) pcxt->worker[i].error_mqh = shm_mq_attach(mq, pcxt->seg, NULL); } } + + /* Reset the snapshot-restored flags to false. */ + if (pcxt->nworkers > 0) + { + bool *snapshot_restored_space; + int i; + snapshot_restored_space = + shm_toc_lookup(pcxt->toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + for (i = 0; i < pcxt->nworkers; ++i) + snapshot_restored_space[i] = false; + } } /* @@ -661,6 +691,10 @@ LaunchParallelWorkers(ParallelContext *pcxt) * Wait for all workers to attach to their error queues, and throw an error if * any worker fails to do this. * + * wait_for_snapshot: additionally wait until each parallel worker has + * successfully restored its snapshot. This is needed when using periodic + * snapshot resets, to ensure all workers have a valid initial snapshot before + * proceeding with the scan. + * * Callers can assume that if this function returns successfully, then the * number of workers given by pcxt->nworkers_launched have initialized and * attached to their error queues. Whether or not these workers are guaranteed @@ -690,7 +724,7 @@ LaunchParallelWorkers(ParallelContext *pcxt) * call this function at all. */ void -WaitForParallelWorkersToAttach(ParallelContext *pcxt) +WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot) { int i; @@ -734,9 +768,12 @@ WaitForParallelWorkersToAttach(ParallelContext *pcxt) mq = shm_mq_get_queue(pcxt->worker[i].error_mqh); if (shm_mq_get_sender(mq) != NULL) { - /* Yes, so it is known to be attached. */ - pcxt->known_attached_workers[i] = true; - ++pcxt->nknown_attached_workers; + if (!wait_for_snapshot || *(pcxt->worker[i].snapshot_restored)) + { + /* Yes, so it is known to be attached. */ + pcxt->known_attached_workers[i] = true; + ++pcxt->nknown_attached_workers; + } } } else if (status == BGWH_STOPPED) @@ -1295,6 +1332,7 @@ ParallelWorkerMain(Datum main_arg) shm_toc *toc; FixedParallelState *fps; char *error_queue_space; + bool *snapshot_restored_space; shm_mq *mq; shm_mq_handle *mqh; char *libraryspace; @@ -1499,6 +1537,10 @@ ParallelWorkerMain(Datum main_arg) fps->parallel_leader_pgproc); PushActiveSnapshot(asnapshot); + /* Snapshot is restored; set the flag to let the leader know about it. */ + snapshot_restored_space = shm_toc_lookup(toc, PARALLEL_KEY_SNAPSHOT_RESTORED, false); + snapshot_restored_space[ParallelWorkerNumber] = true; + /* * We've changed which tuples we can see, and must therefore invalidate * system caches. 
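For context, the reset cycle that SO_RESET_SNAPSHOT drives every SO_RESET_SNAPSHOT_EACH_N_PAGE pages (heap_reset_scan_snapshot itself is introduced earlier in this series and is not shown in this excerpt) amounts to roughly the following. This is an illustrative sketch reconstructed from the ScanOptions comment in patch 01 and the GetLatestSnapshot() relaxation in this patch, not the literal patch code:

    /*
     * Illustrative sketch only: release the previous snapshot so its xmin
     * no longer holds back the horizon, discard the catalog snapshot, and
     * continue the scan under a fresh snapshot.
     */
    static void
    heap_reset_scan_snapshot_sketch(TableScanDesc sscan)
    {
        PopActiveSnapshot();
        InvalidateCatalogSnapshot();
        /* GetLatestSnapshot() is now permitted in parallel mode (snapmgr.c below) */
        PushActiveSnapshot(GetLatestSnapshot());
        sscan->rs_snapshot = GetActiveSnapshot();
    }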
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index cbd0ba9aa01d..6432ef55cdc2 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1532,7 +1532,7 @@ index_concurrently_build(Oid heapRelationId, /* Invalidate catalog snapshot just for assert */ InvalidateCatalogSnapshot(); - Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin)); + Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c index ed35c58c2c34..8a15dd72b918 100644 --- a/src/backend/executor/nodeSeqscan.c +++ b/src/backend/executor/nodeSeqscan.c @@ -367,7 +367,8 @@ ExecSeqScanInitializeDSM(SeqScanState *node, pscan = shm_toc_allocate(pcxt->toc, node->pscan_len); table_parallelscan_initialize(node->ss.ss_currentRelation, pscan, - estate->es_snapshot); + estate->es_snapshot, + false); shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan); node->ss.ss_currentScanDesc = table_beginscan_parallel(node->ss.ss_currentRelation, pscan); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index ad440ff024c6..f251bc528951 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -342,14 +342,6 @@ GetTransactionSnapshot(void) Snapshot GetLatestSnapshot(void) { - /* - * We might be able to relax this, but nothing that could otherwise work - * needs it. - */ - if (IsInParallelMode()) - elog(ERROR, - "cannot update SecondarySnapshot during a parallel operation"); - /* * So far there are no cases requiring support for GetLatestSnapshot() * during logical decoding, but it wouldn't be hard to add if required. diff --git a/src/include/access/parallel.h b/src/include/access/parallel.h index f37be6d56909..a7362f7b43b4 100644 --- a/src/include/access/parallel.h +++ b/src/include/access/parallel.h @@ -26,6 +26,7 @@ typedef struct ParallelWorkerInfo { BackgroundWorkerHandle *bgwhandle; shm_mq_handle *error_mqh; + bool *snapshot_restored; } ParallelWorkerInfo; typedef struct ParallelContext @@ -65,7 +66,7 @@ extern void InitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelDSM(ParallelContext *pcxt); extern void ReinitializeParallelWorkers(ParallelContext *pcxt, int nworkers_to_launch); extern void LaunchParallelWorkers(ParallelContext *pcxt); -extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt); +extern void WaitForParallelWorkersToAttach(ParallelContext *pcxt, bool wait_for_snapshot); extern void WaitForParallelWorkersToFinish(ParallelContext *pcxt); extern void DestroyParallelContext(ParallelContext *pcxt); extern bool ParallelContextActive(void); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index b5e0fb386c0a..50441c58cea3 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -82,6 +82,7 @@ typedef struct ParallelTableScanDescData RelFileLocator phs_locator; /* physical relation to scan */ bool phs_syncscan; /* report location to syncscan logic? */ bool phs_snapshot_any; /* SnapshotAny, not phs_snapshot_data? */ + bool phs_reset_snapshot; /* use SO_RESET_SNAPSHOT? 
*/ Size phs_snapshot_off; /* data for snapshot */ } ParallelTableScanDescData; typedef struct ParallelTableScanDescData *ParallelTableScanDesc; diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 8df6ba9b89e6..a69f71a3ace0 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1135,7 +1135,8 @@ extern Size table_parallelscan_estimate(Relation rel, Snapshot snapshot); */ extern void table_parallelscan_initialize(Relation rel, ParallelTableScanDesc pscan, - Snapshot snapshot); + Snapshot snapshot, + bool reset_snapshot); /* * Begin a parallel scan. `pscan` needs to have been initialized with @@ -1753,9 +1754,9 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique index and non-parallel concurrent build - * SO_RESET_SNAPSHOT is applied for the scan. That leads for changing snapshots - * on the fly to allow xmin horizon propagate. + * In the case of a non-unique concurrent index build, SO_RESET_SNAPSHOT is + * applied to the scan. That leads to snapshots being replaced on the fly, + * allowing the xmin horizon to move forward. */ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index 948d1232aa0c..595a4000ce0e 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -17,6 +17,12 @@ SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice' (1 row) +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -72,30 +78,45 @@ NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + injection_points_detach +------------------------- + +(1 row) + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective REINDEX 
INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots -NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_parallelscan_initialize REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; DROP SCHEMA cic_reset_snap CASCADE; NOTICE: drop cascades to 3 other objects diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 5072535b3555..2941aa7ae389 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -3,7 +3,7 @@ CREATE EXTENSION injection_points; SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); - +SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); @@ -53,6 +53,9 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +-- Detach to keep test stable, since parallel worker may complete scan before leader +SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); + CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; DROP INDEX CONCURRENTLY cic_reset_snap.idx; @@ -83,4 +86,4 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; DROP SCHEMA cic_reset_snap CASCADE; -DROP EXTENSION injection_points; +DROP EXTENSION injection_points; \ No newline at end of file From 17fe9cc91b3600dc96e61ed96c95b5579ee8557a Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu Date: Thu, 6 Mar 2025 14:54:44 +0100 Subject: [PATCH 05/12] Support snapshot resets in concurrent builds of unique indexes Previously, concurrent builds of unique indexes used a fixed snapshot for the entire scan to ensure proper uniqueness checks. Now snapshots are reset periodically during concurrent unique index builds, while uniqueness is still maintained by: - ignoring SnapshotSelf-dead tuples during uniqueness checks in tuplesort, not as a guarantee but as a fail-fast mechanism - adding a uniqueness check in _bt_load that detects multiple alive tuples with the same key values, as the actual guarantee of correctness Tuples are SnapshotSelf-tested only in the case of equal index key values; otherwise _bt_load works as before. 
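Because tuples reach _bt_load() in sorted order, any surviving duplicates are adjacent, so the SnapshotSelf recheck only runs when consecutive tuples compare equal on all key columns. An illustrative sketch of the idea (the helper tuple_alive_under_self() is hypothetical and the real patch code differs in detail; nkeyatts stands for IndexRelationGetNumberOfKeyAttributes(wstate->index)):

    /* inside the _bt_load() merge loop, for a concurrent unique build */
    if (previtup != NULL &&
        _bt_keep_natts_fast(wstate->index, previtup, itup, NULL) > nkeyatts)
    {
        /* equal key values: at most one of the versions may be alive */
        if (tuple_alive_under_self(wstate->heap, &previtup->t_tid) &&
            tuple_alive_under_self(wstate->heap, &itup->t_tid))
            ereport(ERROR,
                    (errcode(ERRCODE_UNIQUE_VIOLATION),
                     errmsg("could not create unique index \"%s\"",
                            RelationGetRelationName(wstate->index))));
    }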
--- src/backend/access/heap/README.HOT | 12 +- src/backend/access/heap/heapam_handler.c | 6 +- src/backend/access/nbtree/nbtdedup.c | 8 +- src/backend/access/nbtree/nbtsort.c | 192 ++++++++++++++---- src/backend/access/nbtree/nbtsplitloc.c | 12 +- src/backend/access/nbtree/nbtutils.c | 31 ++- src/backend/catalog/index.c | 8 +- src/backend/commands/indexcmds.c | 4 +- src/backend/utils/sort/tuplesortvariants.c | 69 +++++-- src/include/access/nbtree.h | 4 +- src/include/access/tableam.h | 5 +- src/include/utils/tuplesort.h | 1 + .../expected/cic_reset_snapshots.out | 6 + 13 files changed, 264 insertions(+), 94 deletions(-) diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 74e407f375aa..829dad1194ef 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -386,12 +386,12 @@ have the HOT-safety property enforced before we start to build the new index. After waiting for transactions which had the table open, we build the index -for all rows that are valid in a fresh snapshot. Any tuples visible in the -snapshot will have only valid forward-growing HOT chains. (They might have -older HOT updates behind them which are broken, but this is OK for the same -reason it's OK in a regular index build.) As above, we point the index -entry at the root of the HOT-update chain but we use the key value from the -live tuple. +for all rows that are valid in a fresh snapshot, which is updated every so +often. Any tuples visible in the snapshot will have only valid forward-growing +HOT chains. (They might have older HOT updates behind them which are broken, +but this is OK for the same reason it's OK in a regular index build.) +As above, we point the index entry at the root of the HOT-update chain but we +use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then we again wait for transactions which have the table open. Then we take diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 4cbbf7f2d707..58ffa4306e27 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1236,15 +1236,15 @@ heapam_index_build_range_scan(Relation heapRelation, * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, or during bootstrap, we take a regular MVCC snapshot * and index whatever's live according to that while that snapshot is reset - * every so often (in case of non-unique index). + * every so often. */ OldestXmin = InvalidTransactionId; /* - * For unique index we need consistent snapshot for the whole scan. + * For concurrent builds of non-system indexes, we may want to periodically + * reset snapshots to allow vacuum to clean up tuples. 
 */
 	reset_snapshots = indexInfo->ii_Concurrent &&
-					  !indexInfo->ii_Unique &&
 					  !is_system_catalog;	/* just for the case */
 
 	/* okay to ignore lazy VACUUMs here */
diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c
index 08884116aecb..347b50d6e516 100644
--- a/src/backend/access/nbtree/nbtdedup.c
+++ b/src/backend/access/nbtree/nbtdedup.c
@@ -148,7 +148,7 @@ _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz,
 			_bt_dedup_start_pending(state, itup, offnum);
 		}
 		else if (state->deduplicate &&
-				 _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+				 _bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts &&
 				 _bt_dedup_save_htid(state, itup))
 		{
 			/*
@@ -374,7 +374,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel,
 			/* itup starts first pending interval */
 			_bt_dedup_start_pending(state, itup, offnum);
 		}
-		else if (_bt_keep_natts_fast(rel, state->base, itup) > nkeyatts &&
+		else if (_bt_keep_natts_fast(rel, state->base, itup, NULL) > nkeyatts &&
 				 _bt_dedup_save_htid(state, itup))
 		{
 			/* Tuple is equal; just added its TIDs to pending interval */
@@ -789,12 +789,12 @@ _bt_do_singleval(Relation rel, Page page, BTDedupState state,
 	itemid = PageGetItemId(page, minoff);
 	itup = (IndexTuple) PageGetItem(page, itemid);
 
-	if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+	if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts)
 	{
 		itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page));
 		itup = (IndexTuple) PageGetItem(page, itemid);
 
-		if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts)
+		if (_bt_keep_natts_fast(rel, newitem, itup, NULL) > nkeyatts)
 			return true;
 	}
 
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 052ebfe6a211..08a3cb283482 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -83,6 +83,7 @@ typedef struct BTSpool
 	Relation	index;
 	bool		isunique;
 	bool		nulls_not_distinct;
+	bool		unique_dead_ignored;
 } BTSpool;
 
 /*
@@ -101,6 +102,7 @@ typedef struct BTShared
 	Oid			indexrelid;
 	bool		isunique;
 	bool		nulls_not_distinct;
+	bool		unique_dead_ignored;
 	bool		isconcurrent;
 	int			scantuplesortstates;
 
@@ -203,15 +205,13 @@ typedef struct BTLeader
 */
 typedef struct BTBuildState
 {
-	bool		isunique;
-	bool		nulls_not_distinct;
 	bool		havedead;
 	Relation	heap;
 	BTSpool    *spool;
 
 	/*
-	 * spool2 is needed only when the index is a unique index. Dead tuples are
-	 * put into spool2 instead of spool in order to avoid uniqueness check.
+	 * spool2 is needed only when the index is a unique index built non-concurrently.
+	 * Dead tuples are put into spool2 instead of spool to avoid the uniqueness check.
 */
 	BTSpool    *spool2;
 	double		indtuples;
@@ -258,7 +258,7 @@ static double _bt_spools_heapscan(Relation heap, Relation index,
 static void _bt_spooldestroy(BTSpool *btspool);
 static void _bt_spool(BTSpool *btspool, ItemPointer self,
 					  Datum *values, bool *isnull);
-static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots);
+static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent);
 static void _bt_build_callback(Relation index, ItemPointer tid, Datum *values,
 							   bool *isnull, bool tupleIsAlive, void *state);
 static BulkWriteBuffer _bt_blnewpage(BTWriteState *wstate, uint32 level);
@@ -303,8 +303,6 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 		ResetUsage();
 #endif							/* BTREE_BUILD_STATS */
 
-	buildstate.isunique = indexInfo->ii_Unique;
-	buildstate.nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
 	buildstate.havedead = false;
 	buildstate.heap = heap;
 	buildstate.spool = NULL;
@@ -321,20 +319,20 @@ btbuild(Relation heap, Relation index, IndexInfo *indexInfo)
 			 RelationGetRelationName(index));
 
 	reltuples = _bt_spools_heapscan(heap, index, &buildstate, indexInfo);
-	Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin));
+	Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
 
 	/*
 	 * Finish the build by (1) completing the sort of the spool file, (2)
 	 * inserting the sorted tuples into btree pages and (3) building the upper
 	 * levels.  Finally, it may also be necessary to end use of parallelism.
 	 */
-	_bt_leafbuild(buildstate.spool, buildstate.spool2, !indexInfo->ii_Unique && indexInfo->ii_Concurrent);
+	_bt_leafbuild(buildstate.spool, buildstate.spool2, indexInfo->ii_Concurrent);
 	_bt_spooldestroy(buildstate.spool);
 	if (buildstate.spool2)
 		_bt_spooldestroy(buildstate.spool2);
 	if (buildstate.btleader)
 		_bt_end_parallel(buildstate.btleader);
-	Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin));
+	Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
 
 	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
 
@@ -381,6 +379,11 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 	btspool->index = index;
 	btspool->isunique = indexInfo->ii_Unique;
 	btspool->nulls_not_distinct = indexInfo->ii_NullsNotDistinct;
+	/*
+	 * We need to ignore dead tuples for unique checks in case of concurrent build.
+	 * It is required because of the periodic reset of snapshot.
+	 */
+	btspool->unique_dead_ignored = indexInfo->ii_Concurrent && indexInfo->ii_Unique;
 
 	/* Save as primary spool */
 	buildstate->spool = btspool;
@@ -429,8 +432,9 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 	 * the use of parallelism or any other factor.
 	 */
 	buildstate->spool->sortstate =
-		tuplesort_begin_index_btree(heap, index, buildstate->isunique,
-									buildstate->nulls_not_distinct,
+		tuplesort_begin_index_btree(heap, index, btspool->isunique,
+									btspool->nulls_not_distinct,
+									btspool->unique_dead_ignored,
 									maintenance_work_mem, coordinate,
 									TUPLESORT_NONE);
 
 	/*
 	 * If building a unique index, put dead tuples in a second spool to keep
 	 * them out of the uniqueness check.  We expect that the second spool (for
 	 * dead tuples) won't get very full, so we give it only work_mem.
+	 *
+	 * In case of a concurrent build dead tuples need not be put into the index
+	 * since we wait for all snapshots older than the reference snapshot during
+	 * the validation phase.
 	 */
-	if (indexInfo->ii_Unique)
+	if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent)
 	{
 		BTSpool    *btspool2 = (BTSpool *) palloc0(sizeof(BTSpool));
 		SortCoordinate coordinate2 = NULL;
@@ -470,7 +478,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 		 * full, so we give it only work_mem */
 		buildstate->spool2->sortstate =
-			tuplesort_begin_index_btree(heap, index, false, false, work_mem,
+			tuplesort_begin_index_btree(heap, index, false, false, false, work_mem,
 										coordinate2, TUPLESORT_NONE);
 	}
 
@@ -483,7 +491,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 		reltuples = _bt_parallel_heapscan(buildstate,
 										  &indexInfo->ii_BrokenHotChain);
 	InvalidateCatalogSnapshot();
-	Assert(!indexInfo->ii_Concurrent || indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin));
+	Assert(!indexInfo->ii_Concurrent || !TransactionIdIsValid(MyProc->xmin));
 
 	/*
 	 * Set the progress target for the next phase.  Reset the block number
@@ -539,7 +547,7 @@ _bt_spool(BTSpool *btspool, ItemPointer self, Datum *values, bool *isnull)
 * create an entire btree.
 */
 static void
-_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots)
+_bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool isconcurrent)
 {
 	BTWriteState wstate;
 
@@ -561,7 +569,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots)
 									 PROGRESS_BTREE_PHASE_PERFORMSORT_2);
 		tuplesort_performsort(btspool2->sortstate);
 	}
-	Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin));
+	Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin));
 
 	wstate.heap = btspool->heap;
 	wstate.index = btspool->index;
@@ -575,7 +583,7 @@ _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2, bool reset_snapshots)
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_SUBPHASE,
 								 PROGRESS_BTREE_PHASE_LEAF_LOAD);
 
-	Assert(!reset_snapshots || !TransactionIdIsValid(MyProc->xmin));
+	Assert(!isconcurrent || !TransactionIdIsValid(MyProc->xmin));
 	_bt_load(&wstate, btspool, btspool2);
 }
 
@@ -1154,13 +1162,117 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	SortSupport sortKeys;
 	int64		tuples_done = 0;
 	bool		deduplicate;
+	bool		fail_on_alive_duplicate;
 
 	wstate->bulkstate = smgr_bulk_start_rel(wstate->index, MAIN_FORKNUM);
 
 	deduplicate = wstate->inskey->allequalimage && !btspool->isunique &&
 		BTGetDeduplicateItems(wstate->index);
+	/*
+	 * The unique_dead_ignored flag does not guarantee the absence of multiple
+	 * alive tuples with the same values in the spool. Such a thing may happen
+	 * if alive tuples are located between a few dead tuples, like this: addda.
+	 */
+	fail_on_alive_duplicate = btspool->unique_dead_ignored;
 
-	if (merge)
+	if (fail_on_alive_duplicate)
+	{
+		bool		seen_alive = false,
+					prev_tested = false;
+		IndexTuple	prev = NULL;
+		TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(wstate->heap),
+														&TTSOpsBufferHeapTuple);
+		IndexFetchTableData *fetch = table_index_fetch_begin(wstate->heap);
+
+		Assert(btspool->isunique);
+		Assert(!btspool2);
+
+		while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL)
+		{
+			bool		tuples_equal = false;
+
+			/* When we see first tuple, create first index page */
+			if (state == NULL)
+				state = _bt_pagestate(wstate, 0);
+
+			if (prev != NULL)	/* if this is not the first tuple */
+			{
+				bool		has_nulls = false,
+							call_again,	/* just to pass something */
+							ignored,	/* just to pass something */
+							now_alive;
+				ItemPointerData tid;
+
+				/* is this tuple equal to the previous one? */
+				if (wstate->inskey->allequalimage)
+					tuples_equal = _bt_keep_natts_fast(wstate->index, prev, itup, &has_nulls) > keysz;
+				else
+					tuples_equal = _bt_keep_natts(wstate->index, prev, itup, wstate->inskey, &has_nulls) > keysz;
+
+				/* handle null values correctly */
+				if (has_nulls && !btspool->nulls_not_distinct)
+					tuples_equal = false;
+
+				if (tuples_equal)
+				{
+					/* check the previous tuple if not checked yet */
+					if (!prev_tested)
+					{
+						call_again = false;
+						tid = prev->t_tid;
+						seen_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored);
+						prev_tested = true;
+					}
+
+					call_again = false;
+					tid = itup->t_tid;
+					now_alive = table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored);
+
+					/* are multiple alive tuples detected in the equal group? */
+					if (seen_alive && now_alive)
+					{
+						char	   *key_desc;
+						TupleDesc	tupDes = RelationGetDescr(wstate->index);
+						bool		isnull[INDEX_MAX_KEYS];
+						Datum		values[INDEX_MAX_KEYS];
+
+						index_deform_tuple(itup, tupDes, values, isnull);
+
+						key_desc = BuildIndexValueDescription(wstate->index, values, isnull);
+
+						/* keep this message in sync with the same in comparetup_index_btree_tiebreak */
+						ereport(ERROR,
+								(errcode(ERRCODE_UNIQUE_VIOLATION),
+								 errmsg("could not create unique index \"%s\"",
+										RelationGetRelationName(wstate->index)),
+								 key_desc ? errdetail("Key %s is duplicated.", key_desc) :
+								 errdetail("Duplicate keys exist."),
+								 errtableconstraint(wstate->heap,
+													RelationGetRelationName(wstate->index))));
+					}
+					seen_alive |= now_alive;
+				}
+			}
+
+			if (!tuples_equal)
+			{
+				seen_alive = false;
+				prev_tested = false;
+			}
+
+			_bt_buildadd(wstate, state, itup, 0);
+			if (prev)
+				pfree(prev);
+			prev = CopyIndexTuple(itup);
+
+			/* Report progress */
+			pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+										 ++tuples_done);
+		}
+		ExecDropSingleTupleTableSlot(slot);
+		table_index_fetch_end(fetch);
+		Assert(!TransactionIdIsValid(MyProc->xmin));
+	}
+	else if (merge)
 	{
 		/*
 		 * Another BTSpool for dead tuples exists.
Now we have to merge @@ -1320,7 +1432,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) InvalidOffsetNumber); } else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && + itup, NULL) > keysz && _bt_dedup_save_htid(dstate, itup)) { /* @@ -1417,7 +1529,6 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) BufferUsage *bufferusage; bool leaderparticipates = true; bool need_pop_active_snapshot = true; - bool reset_snapshot; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1435,21 +1546,12 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) scantuplesortstates = leaderparticipates ? request + 1 : request; - /* - * For concurrent non-unique index builds, we can periodically reset snapshots - * to allow the xmin horizon to advance. This is safe since these builds don't - * require a consistent view across the entire scan. Unique indexes still need - * a stable snapshot to properly enforce uniqueness constraints. - */ - reset_snapshot = isconcurrent && !btspool->isunique; - /* * Prepare for scan of the base relation. In a normal index build, we use * SnapshotAny because we must retrieve all tuples and do our own time * qual checks (because we have to index RECENTLY_DEAD tuples). In a * concurrent build, we take a regular MVCC snapshot and index whatever's - * live according to that, while that snapshot may be reset periodically in - * case of non-unique index. + * live according to that, while that snapshot may be reset periodically. */ if (!isconcurrent) { @@ -1457,16 +1559,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) snapshot = SnapshotAny; need_pop_active_snapshot = false; } - else if (reset_snapshot) + else { + /* + * For concurrent index builds, we can periodically reset snapshots to allow + * the xmin horizon to advance. This is safe since these builds don't + * require a consistent view across the entire scan. + */ snapshot = InvalidSnapshot; PushActiveSnapshot(GetTransactionSnapshot()); } - else - { - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1536,6 +1638,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) btshared->indexrelid = RelationGetRelid(btspool->index); btshared->isunique = btspool->isunique; btshared->nulls_not_distinct = btspool->nulls_not_distinct; + btshared->unique_dead_ignored = btspool->unique_dead_ignored; btshared->isconcurrent = isconcurrent; btshared->scantuplesortstates = scantuplesortstates; btshared->queryid = pgstat_get_my_query_id(); @@ -1550,7 +1653,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) table_parallelscan_initialize(btspool->heap, ParallelTableScanFromBTShared(btshared), snapshot, - reset_snapshot); + isconcurrent); /* * Store shared tuplesort-private state, for which we reserved space. @@ -1630,7 +1733,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * In case of concurrent build snapshots are going to be reset periodically. * Wait until all workers imported initial snapshot. */ - if (reset_snapshot) + if (isconcurrent) WaitForParallelWorkersToAttach(pcxt, true); /* Join heap scan ourselves */ @@ -1641,7 +1744,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * Caller needs to wait for all launched workers when we return. 
Make * sure that the failure-to-start case will not hang forever. */ - if (!reset_snapshot) + if (!isconcurrent) WaitForParallelWorkersToAttach(pcxt, false); if (need_pop_active_snapshot) PopActiveSnapshot(); @@ -1744,6 +1847,7 @@ _bt_leader_participate_as_worker(BTBuildState *buildstate) leaderworker->index = buildstate->spool->index; leaderworker->isunique = buildstate->spool->isunique; leaderworker->nulls_not_distinct = buildstate->spool->nulls_not_distinct; + leaderworker->unique_dead_ignored = buildstate->spool->unique_dead_ignored; /* Initialize second spool, if required */ if (!btleader->btshared->isunique) @@ -1847,11 +1951,12 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc) btspool->index = indexRel; btspool->isunique = btshared->isunique; btspool->nulls_not_distinct = btshared->nulls_not_distinct; + btspool->unique_dead_ignored = btshared->unique_dead_ignored; /* Look up shared state private to tuplesort.c */ sharedsort = shm_toc_lookup(toc, PARALLEL_KEY_TUPLESORT, false); tuplesort_attach_shared(sharedsort, seg); - if (!btshared->isunique) + if (!btshared->isunique || btshared->isconcurrent) { btspool2 = NULL; sharedsort2 = NULL; @@ -1931,6 +2036,7 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, btspool->index, btspool->isunique, btspool->nulls_not_distinct, + btspool->unique_dead_ignored, sortmem, coordinate, TUPLESORT_NONE); @@ -1953,14 +2059,12 @@ _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, coordinate2->nParticipants = -1; coordinate2->sharedsort = sharedsort2; btspool2->sortstate = - tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, + tuplesort_begin_index_btree(btspool->heap, btspool->index, false, false, false, Min(sortmem, work_mem), coordinate2, false); } /* Fill in buildstate for _bt_build_callback() */ - buildstate.isunique = btshared->isunique; - buildstate.nulls_not_distinct = btshared->nulls_not_distinct; buildstate.havedead = false; buildstate.heap = btspool->heap; buildstate.spool = btspool; diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index e6c9aaa0454d..7cb1f3e1bc60 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -687,7 +687,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, { itemid = PageGetItemId(state->origpage, maxoff); tup = (IndexTuple) PageGetItem(state->origpage, itemid); - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -718,7 +718,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, !_bt_adjacenthtid(&tup->t_tid, &state->newitem->t_tid)) return false; /* Check same conditions as rightmost item case, too */ - keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem); + keepnatts = _bt_keep_natts_fast(state->rel, tup, state->newitem, NULL); if (keepnatts > 1 && keepnatts <= nkeyatts) { @@ -967,7 +967,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * avoid appending a heap TID in new high key, we're done. Finish split * with default strategy and initial split interval. 
*/ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) return perfectpenalty; @@ -988,7 +988,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, * If page is entirely full of duplicates, a single value strategy split * will be performed. */ - perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost); + perfectpenalty = _bt_keep_natts_fast(state->rel, leftmost, rightmost, NULL); if (perfectpenalty <= indnkeyatts) { *strategy = SPLIT_MANY_DUPLICATES; @@ -1027,7 +1027,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, itemid = PageGetItemId(state->origpage, P_HIKEY); hikey = (IndexTuple) PageGetItem(state->origpage, itemid); perfectpenalty = _bt_keep_natts_fast(state->rel, hikey, - state->newitem); + state->newitem, NULL); if (perfectpenalty <= indnkeyatts) *strategy = SPLIT_SINGLE_VALUE; else @@ -1149,7 +1149,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) lastleft = _bt_split_lastleft(state, split); firstright = _bt_split_firstright(state, split); - return _bt_keep_natts_fast(state->rel, lastleft, firstright); + return _bt_keep_natts_fast(state->rel, lastleft, firstright, NULL); } /* diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 1a15dfcb7d35..d07fe72713d8 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -66,8 +66,6 @@ static bool _bt_check_rowcompare(ScanKey skey, ScanDirection dir, bool forcenonrequired, bool *continuescan); static void _bt_checkkeys_look_ahead(IndexScanDesc scan, BTReadPageState *pstate, int tupnatts, TupleDesc tupdesc); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); /* @@ -2532,7 +2530,7 @@ _bt_set_startikey(IndexScanDesc scan, BTReadPageState *pstate) lasttup = (IndexTuple) PageGetItem(pstate->page, iid); /* Determine the first attribute whose values change on caller's page */ - firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup); + firstchangingattnum = _bt_keep_natts_fast(rel, firsttup, lasttup, NULL); for (; startikey < so->numberOfKeys; startikey++) { @@ -3852,7 +3850,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key, NULL); #ifdef DEBUG_NO_TRUNCATE /* Force truncation to be ineffective for testing purposes */ @@ -3970,17 +3968,24 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, /* * _bt_keep_natts - how many key attributes to keep when truncating. * + * This is exported to be used as comparison function during concurrent + * unique index build in case _bt_keep_natts_fast is not suitable because + * collation is not "allequalimage"/deduplication-safe. + * * Caller provides two tuples that enclose a split point. Caller's insertion * scankey is used to compare the tuples; the scankey's argument values are * not considered here. * + * hasnulls value set to true in case of any null column in any tuple. + * * This can return a number of attributes that is one greater than the * number of key attributes for the index relation. 
This indicates that the * caller must use a heap TID as a unique-ifier in new pivot tuple. */ -static int +int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) + BTScanInsert itup_key, + bool *hasnulls) { int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); TupleDesc itupdesc = RelationGetDescr(rel); @@ -4006,6 +4011,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + (*hasnulls) |= (isNull1 || isNull2); if (isNull1 != isNull2) break; @@ -4025,7 +4032,7 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * expected in an allequalimage index. */ Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright, NULL)); return keepnatts; } @@ -4036,7 +4043,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * This is exported so that a candidate split point can have its effect on * suffix truncation inexpensively evaluated ahead of time when finding a * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. + * save cycles. Also, it may be used as comparison function during concurrent + * build of unique index. * * The approach taken here usually provides the same answer as _bt_keep_natts * will (for the same pair of tuples from a heapkeyspace index), since the @@ -4045,6 +4053,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * "equal image" columns, routine is guaranteed to give the same result as * _bt_keep_natts would. * + * hasnulls value set to true in case of any null column in any tuple. + * * Callers can rely on the fact that attributes considered equal here are * definitely also equal according to _bt_keep_natts, even when the index uses * an opclass or collation that is not "allequalimage"/deduplication-safe. @@ -4053,7 +4063,8 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, * more balanced split point. */ int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright, + bool *hasnulls) { TupleDesc itupdesc = RelationGetDescr(rel); int keysz = IndexRelationGetNumberOfKeyAttributes(rel); @@ -4070,6 +4081,8 @@ _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + if (hasnulls) + *hasnulls |= (isNull1 | isNull2); att = TupleDescCompactAttr(itupdesc, attnum - 1); if (isNull1 != isNull2) diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 6432ef55cdc2..cca1dbb8e378 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -1532,7 +1532,7 @@ index_concurrently_build(Oid heapRelationId, /* Invalidate catalog snapshot just for assert */ InvalidateCatalogSnapshot(); - Assert(indexInfo->ii_Unique || !TransactionIdIsValid(MyProc->xmin)); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -3323,9 +3323,9 @@ IndexCheckExclusion(Relation heapRelation, * if we used HeapTupleSatisfiesVacuum). 
This leaves us with an index that * does not contain any tuples added to the table while we built the index. * - * Furthermore, in case of non-unique index we set SO_RESET_SNAPSHOT for the - * scan, which causes new snapshot to be set as active every so often. The reason - * for that is to propagate the xmin horizon forward. + * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new + * snapshot to be set as active every so often. The reason for that is to + * propagate the xmin horizon forward. * * Next, we mark the index "indisready" (but still not "indisvalid") and * commit the second transaction and start a third. Again we wait for all diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 62bdcb6e551d..991fa6ae6372 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1694,8 +1694,8 @@ DefineIndex(Oid tableId, * chains can be created where the new tuple and the old tuple in the * chain have different index keys. * - * We build the index using all tuples that are visible using single or - * multiple refreshing snapshots. We can be sure that any HOT updates to + * We build the index using all tuples that are visible using multiple + * refreshing snapshots. We can be sure that any HOT updates to * these tuples will be compatible with the index, since any updates made * by transactions that didn't know about the index are now committed or * rolled back. Thus, each visible tuple is either the end of its diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 5f70e8dddac5..71a5c21e0df6 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -32,6 +32,7 @@ #include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/tuplesort.h" +#include "storage/proc.h" /* sort-type codes for sort__start probes */ @@ -133,6 +134,7 @@ typedef struct bool enforceUnique; /* complain if we find duplicate tuples */ bool uniqueNullsNotDistinct; /* unique constraint null treatment */ + bool uniqueDeadIgnored; /* ignore dead tuples in unique check */ } TuplesortIndexBTreeArg; /* @@ -358,6 +360,7 @@ tuplesort_begin_index_btree(Relation heapRel, Relation indexRel, bool enforceUnique, bool uniqueNullsNotDistinct, + bool uniqueDeadIgnored, int workMem, SortCoordinate coordinate, int sortopt) @@ -400,6 +403,7 @@ tuplesort_begin_index_btree(Relation heapRel, arg->index.indexRel = indexRel; arg->enforceUnique = enforceUnique; arg->uniqueNullsNotDistinct = uniqueNullsNotDistinct; + arg->uniqueDeadIgnored = uniqueDeadIgnored; indexScanKey = _bt_mkscankey(indexRel, NULL); @@ -1653,6 +1657,7 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; char *key_desc; + bool uniqueCheckFail = true; /* * Some rather brain-dead implementations of qsort (such as the one in @@ -1662,18 +1667,58 @@ comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, */ Assert(tuple1 != tuple2); - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); + /* This is fail-fast check, see _bt_load for details. */ + if (arg->uniqueDeadIgnored) + { + bool any_tuple_dead, + call_again = false, + ignored; + + TupleTableSlot *slot = MakeSingleTupleTableSlot(RelationGetDescr(arg->index.heapRel), + &TTSOpsBufferHeapTuple); + ItemPointerData tid = tuple1->t_tid; + + IndexFetchTableData *fetch = table_index_fetch_begin(arg->index.heapRel); + any_tuple_dead = !table_index_fetch_tuple(fetch, &tid, SnapshotSelf, slot, &call_again, &ignored); + + if (!any_tuple_dead) + { + call_again = false; + tid = tuple2->t_tid; + any_tuple_dead = !table_index_fetch_tuple(fetch, &tuple2->t_tid, SnapshotSelf, slot, &call_again, + &ignored); + } + + if (any_tuple_dead) + { + elog(DEBUG5, "skipping duplicate values because some of them are dead: (%u,%u) vs (%u,%u)", + ItemPointerGetBlockNumber(&tuple1->t_tid), + ItemPointerGetOffsetNumber(&tuple1->t_tid), + ItemPointerGetBlockNumber(&tuple2->t_tid), + ItemPointerGetOffsetNumber(&tuple2->t_tid)); + + uniqueCheckFail = false; + } + ExecDropSingleTupleTableSlot(slot); + table_index_fetch_end(fetch); + Assert(!TransactionIdIsValid(MyProc->xmin)); + } + if (uniqueCheckFail) + { + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + /* keep this error message in sync with the same in _bt_load */ + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } } /* diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index ebca02588d3e..38471e90a0cf 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1339,8 +1339,10 @@ extern bool btproperty(Oid index_oid, int attno, extern char *btbuildphasename(int64 phasenum); extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key, bool *hasnulls); extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); + IndexTuple firstright, bool *hasnulls); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index a69f71a3ace0..acd20dbfab86 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -1754,9 +1754,8 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin, * This only really makes sense for heap AM, it might need to be generalized * for other AMs later. * - * In case of non-unique concurrent index build SO_RESET_SNAPSHOT is applied - * for the scan. That leads for changing snapshots on the fly to allow xmin - * horizon propagate. + * In case of concurrent index build SO_RESET_SNAPSHOT is applied for the scan. + * That leads for changing snapshots on the fly to allow xmin horizon propagate. 
 */
 static inline double
 table_index_build_scan(Relation table_rel,
diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h
index ef79f259f935..eb9bc30e5daf 100644
--- a/src/include/utils/tuplesort.h
+++ b/src/include/utils/tuplesort.h
@@ -429,6 +429,7 @@ extern Tuplesortstate *tuplesort_begin_index_btree(Relation heapRel,
 												   Relation indexRel,
 												   bool enforceUnique,
 												   bool uniqueNullsNotDistinct,
+												   bool uniqueDeadIgnored,
 												   int workMem, SortCoordinate coordinate,
 												   int sortopt);
 extern Tuplesortstate *tuplesort_begin_index_hash(Relation heapRel,
diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out
index 595a4000ce0e..9f03fa3033ce 100644
--- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out
+++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out
@@ -41,7 +41,11 @@ END;
 $$;
 ----------------
 ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0);
 CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i);
+NOTICE:  notice triggered for injection point table_beginscan_strat_reset_snapshots
+NOTICE:  notice triggered for injection point heap_reset_scan_snapshot_effective
 REINDEX INDEX CONCURRENTLY cic_reset_snap.idx;
+NOTICE:  notice triggered for injection point table_beginscan_strat_reset_snapshots
+NOTICE:  notice triggered for injection point heap_reset_scan_snapshot_effective
 DROP INDEX CONCURRENTLY cic_reset_snap.idx;
 CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i);
 NOTICE:  notice triggered for injection point table_beginscan_strat_reset_snapshots
@@ -86,7 +90,9 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective');
 (1 row)
 
 CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i);
+NOTICE:  notice triggered for injection point table_parallelscan_initialize
 REINDEX INDEX CONCURRENTLY cic_reset_snap.idx;
+NOTICE:  notice triggered for injection point table_parallelscan_initialize
 DROP INDEX CONCURRENTLY cic_reset_snap.idx;
 CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i);
 NOTICE:  notice triggered for injection point table_parallelscan_initialize

From 1b36f6fd4ce265499fe33fe8eda2d01fb6f37914 Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Sat, 21 Dec 2024 18:36:10 +0100
Subject: [PATCH 06/12] Add STIR access method and flags related to auxiliary
 indexes

This patch provides infrastructure for the following enhancements to
concurrent index builds:

- ii_Auxiliary in IndexInfo: indicates that an index is an auxiliary
  index used during a concurrent index build
- validate_index in IndexVacuumInfo: set if index_bulk_delete is called
  during the validation phase of a concurrent index build
- STIR (Short-Term Index Replacement) access method is introduced,
  intended solely for short-lived, auxiliary usage

STIR is designed as an ephemeral helper during concurrent index builds,
temporarily storing TIDs without providing the full features of a
typical access method. As such, it raises warnings or errors when
accessed outside its specialized usage path.

Planned to be used in the following commits.
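To sketch how these pieces are meant to fit together (names marked as
hypothetical are not part of this patch): during the validation phase
the caller drains the auxiliary STIR index through the regular
index_bulk_delete() entry point, using a callback that merely collects
TIDs and returns false, so nothing is actually deleted:

    /* Hypothetical collecting callback; tid_list_add() is a stand-in. */
    static bool
    collect_tid(ItemPointer tid, void *state)
    {
        tid_list_add((TidList *) state, tid);   /* collect, do not delete */
        return false;
    }

    ...
    ivinfo.index = stirRel;
    ivinfo.validate_index = true;   /* the new IndexVacuumInfo flag */
    index_bulk_delete(&ivinfo, NULL, collect_tid, &tids);

With validate_index set, stirbulkdelete() walks every page and hands
each stored TID to the callback; with it unset (a plain VACUUM), the
STIR index only marks itself as skipping further inserts and warns that
it should be dropped.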
--- contrib/pgstattuple/pgstattuple.c | 3 + src/backend/access/Makefile | 2 +- src/backend/access/heap/vacuumlazy.c | 2 + src/backend/access/meson.build | 1 + src/backend/access/stir/Makefile | 18 + src/backend/access/stir/meson.build | 5 + src/backend/access/stir/stir.c | 573 +++++++++++++++++++++++ src/backend/catalog/index.c | 1 + src/backend/commands/analyze.c | 1 + src/backend/commands/vacuumparallel.c | 1 + src/backend/nodes/makefuncs.c | 1 + src/include/access/genam.h | 1 + src/include/access/reloptions.h | 3 +- src/include/access/stir.h | 117 +++++ src/include/catalog/pg_am.dat | 3 + src/include/catalog/pg_opclass.dat | 4 + src/include/catalog/pg_opfamily.dat | 2 + src/include/catalog/pg_proc.dat | 4 + src/include/nodes/execnodes.h | 6 +- src/include/utils/index_selfuncs.h | 8 + src/test/regress/expected/amutils.out | 8 +- src/test/regress/expected/opr_sanity.out | 7 +- src/test/regress/expected/psql.out | 24 +- 23 files changed, 777 insertions(+), 18 deletions(-) create mode 100644 src/backend/access/stir/Makefile create mode 100644 src/backend/access/stir/meson.build create mode 100644 src/backend/access/stir/stir.c create mode 100644 src/include/access/stir.h diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index a6dad54ff584..ca5214461e6b 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -285,6 +285,9 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo) case SPGIST_AM_OID: err = "spgist index"; break; + case STIR_AM_OID: + err = "stir index"; + break; case BRIN_AM_OID: err = "brin index"; break; diff --git a/src/backend/access/Makefile b/src/backend/access/Makefile index 1932d11d154e..cd6524a54abd 100644 --- a/src/backend/access/Makefile +++ b/src/backend/access/Makefile @@ -9,6 +9,6 @@ top_builddir = ../../.. include $(top_builddir)/src/Makefile.global SUBDIRS = brin common gin gist hash heap index nbtree rmgrdesc spgist \ - sequence table tablesample transam + stir sequence table tablesample transam include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index 09416450af96..893aed0b0d96 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -3098,6 +3098,7 @@ lazy_vacuum_one_index(Relation indrel, IndexBulkDeleteResult *istat, ivinfo.message_level = DEBUG2; ivinfo.num_heap_tuples = reltuples; ivinfo.strategy = vacrel->bstrategy; + ivinfo.validate_index = false; /* * Update error traceback information. @@ -3149,6 +3150,7 @@ lazy_cleanup_one_index(Relation indrel, IndexBulkDeleteResult *istat, ivinfo.num_heap_tuples = reltuples; ivinfo.strategy = vacrel->bstrategy; + ivinfo.validate_index = false; /* * Update error traceback information. 
diff --git a/src/backend/access/meson.build b/src/backend/access/meson.build
index 7a2d0ddb6894..a156cddff359 100644
--- a/src/backend/access/meson.build
+++ b/src/backend/access/meson.build
@@ -11,6 +11,7 @@ subdir('nbtree')
 subdir('rmgrdesc')
 subdir('sequence')
 subdir('spgist')
+subdir('stir')
 subdir('table')
 subdir('tablesample')
 subdir('transam')
diff --git a/src/backend/access/stir/Makefile b/src/backend/access/stir/Makefile
new file mode 100644
index 000000000000..fae5898b8d78
--- /dev/null
+++ b/src/backend/access/stir/Makefile
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+#    Makefile for access/stir
+#
+# IDENTIFICATION
+#    src/backend/access/stir/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/access/stir
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+	stir.o
+
+include $(top_srcdir)/src/backend/common.mk
\ No newline at end of file
diff --git a/src/backend/access/stir/meson.build b/src/backend/access/stir/meson.build
new file mode 100644
index 000000000000..39c6eca848dc
--- /dev/null
+++ b/src/backend/access/stir/meson.build
@@ -0,0 +1,5 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+backend_sources += files(
+  'stir.c',
+)
\ No newline at end of file
diff --git a/src/backend/access/stir/stir.c b/src/backend/access/stir/stir.c
new file mode 100644
index 000000000000..01f3b660f4b9
--- /dev/null
+++ b/src/backend/access/stir/stir.c
@@ -0,0 +1,573 @@
+/*-------------------------------------------------------------------------
+ *
+ * stir.c
+ *		Implementation of Short-Term Index Replacement.
+ *
+ * STIR is a specialized access method type designed for temporary storage
+ * of TID values during concurrent index build operations.
+ *
+ * The typical lifecycle of a STIR index is:
+ *    1. created as an auxiliary index for CIC/RIC
+ *    2. accepts inserts for a period
+ *    3. stirbulkdelete called during index validation phase
+ *    4. gets dropped
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/access/stir/stir.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/stir.h"
+#include "miscadmin.h"
+#include "access/amvalidate.h"
+#include "access/htup_details.h"
+#include "access/tableam.h"
+#include "catalog/index.h"
+#include "catalog/pg_amop.h"
+#include "catalog/pg_opclass.h"
+#include "catalog/pg_opfamily.h"
+#include "commands/vacuum.h"
+#include "storage/bufmgr.h"
+#include "utils/catcache.h"
+#include "utils/fmgrprotos.h"
+#include "utils/index_selfuncs.h"
+#include "utils/memutils.h"
+#include "utils/regproc.h"
+#include "utils/syscache.h"
+
+/*
+ * Stir handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+stirhandler(PG_FUNCTION_ARGS)
+{
+	IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+	/* Set STIR-specific strategy and procedure numbers */
+	amroutine->amstrategies = STIR_NSTRATEGIES;
+	amroutine->amsupport = STIR_NPROC;
+	amroutine->amoptsprocnum = STIR_OPTIONS_PROC;
+
+	/* STIR doesn't support most index operations */
+	amroutine->amcanorder = false;
+	amroutine->amcanorderbyop = false;
+	amroutine->amcanbackward = false;
+	amroutine->amcanunique = false;
+	amroutine->amcanmulticol = true;
+	amroutine->amoptionalkey = true;
+	amroutine->amsearcharray = false;
+	amroutine->amsearchnulls = false;
+	amroutine->amstorage = false;
+	amroutine->amclusterable = false;
+	amroutine->ampredlocks = false;
+	amroutine->amcanparallel = false;
+	amroutine->amcanbuildparallel = false;
+	amroutine->amcaninclude = true;
+	amroutine->amusemaintenanceworkmem = false;
+	amroutine->amparallelvacuumoptions =
+		VACUUM_OPTION_PARALLEL_BULKDEL | VACUUM_OPTION_PARALLEL_CLEANUP;
+	amroutine->amkeytype = InvalidOid;
+
+	/* Set up function callbacks */
+	amroutine->ambuild = stirbuild;
+	amroutine->ambuildempty = stirbuildempty;
+	amroutine->aminsert = stirinsert;
+	amroutine->aminsertcleanup = NULL;
+	amroutine->ambulkdelete = stirbulkdelete;
+	amroutine->amvacuumcleanup = stirvacuumcleanup;
+	amroutine->amcanreturn = NULL;
+	amroutine->amcostestimate = stircostestimate;
+	amroutine->amoptions = stiroptions;
+	amroutine->amproperty = NULL;
+	amroutine->ambuildphasename = NULL;
+	amroutine->amvalidate = stirvalidate;
+	amroutine->amadjustmembers = NULL;
+	amroutine->ambeginscan = stirbeginscan;
+	amroutine->amrescan = stirrescan;
+	amroutine->amgettuple = NULL;
+	amroutine->amgetbitmap = NULL;
+	amroutine->amendscan = stirendscan;
+	amroutine->ammarkpos = NULL;
+	amroutine->amrestrpos = NULL;
+	amroutine->amestimateparallelscan = NULL;
+	amroutine->aminitparallelscan = NULL;
+	amroutine->amparallelrescan = NULL;
+
+	PG_RETURN_POINTER(amroutine);
+}
+
+/*
+ * Validates operator class for STIR index.
+ *
+ * STIR is not a real index, so validation may be skipped.
+ * But we do it just for consistency.
+ */ +bool +stirvalidate(Oid opclassoid) +{ + bool result = true; + HeapTuple classtup; + Form_pg_opclass classform; + Oid opfamilyoid; + HeapTuple familytup; + Form_pg_opfamily familyform; + char *opfamilyname; + CatCList *proclist, + *oprlist; + int i; + + /* Fetch opclass information */ + classtup = SearchSysCache1(CLAOID, ObjectIdGetDatum(opclassoid)); + if (!HeapTupleIsValid(classtup)) + elog(ERROR, "cache lookup failed for operator class %u", opclassoid); + classform = (Form_pg_opclass) GETSTRUCT(classtup); + + opfamilyoid = classform->opcfamily; + + + /* Fetch opfamily information */ + familytup = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(opfamilyoid)); + if (!HeapTupleIsValid(familytup)) + elog(ERROR, "cache lookup failed for operator family %u", opfamilyoid); + familyform = (Form_pg_opfamily) GETSTRUCT(familytup); + + opfamilyname = NameStr(familyform->opfname); + + /* Fetch all operators and support functions of the opfamily */ + oprlist = SearchSysCacheList1(AMOPSTRATEGY, ObjectIdGetDatum(opfamilyoid)); + proclist = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opfamilyoid)); + + /* Check individual operators */ + for (i = 0; i < oprlist->n_members; i++) + { + HeapTuple oprtup = &oprlist->members[i]->tuple; + Form_pg_amop oprform = (Form_pg_amop) GETSTRUCT(oprtup); + + /* Check it's allowed strategy for stir */ + if (oprform->amopstrategy < 1 || + oprform->amopstrategy > STIR_NSTRATEGIES) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("stir opfamily %s contains operator %s with invalid strategy number %d", + opfamilyname, + format_operator(oprform->amopopr), + oprform->amopstrategy))); + result = false; + } + + /* stir doesn't support ORDER BY operators */ + if (oprform->amoppurpose != AMOP_SEARCH || + OidIsValid(oprform->amopsortfamily)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("stir opfamily %s contains invalid ORDER BY specification for operator %s", + opfamilyname, + format_operator(oprform->amopopr)))); + result = false; + } + + /* Check operator signature --- same for all stir strategies */ + if (!check_amop_signature(oprform->amopopr, BOOLOID, + oprform->amoplefttype, + oprform->amoprighttype)) + { + ereport(INFO, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("stir opfamily %s contains operator %s with wrong signature", + opfamilyname, + format_operator(oprform->amopopr)))); + result = false; + } + } + + + ReleaseCatCacheList(proclist); + ReleaseCatCacheList(oprlist); + ReleaseSysCache(familytup); + ReleaseSysCache(classtup); + + return result; +} + + +/* + * Initialize metapage of a STIR index. + * The skipInserts flag determines if new inserts will be accepted or skipped. + */ +void +StirFillMetapage(Relation index, Page metaPage, bool skipInserts) +{ + StirMetaPageData *metadata; + + StirInitPage(metaPage, STIR_META); + metadata = StirPageGetMeta(metaPage); + memset(metadata, 0, sizeof(StirMetaPageData)); + metadata->magickNumber = STIR_MAGICK_NUMBER; + metadata->skipInserts = skipInserts; + ((PageHeader) metaPage)->pd_lower += sizeof(StirMetaPageData); +} + +/* + * Create and initialize the metapage for a STIR index. + * This is called during index creation. + */ +void +StirInitMetapage(Relation index, ForkNumber forknum) +{ + Buffer metaBuffer; + Page metaPage; + + Assert(!RelationNeedsWAL(index)); + /* + * Make a new page; since it is first page it should be associated with + * block number 0 (STIR_METAPAGE_BLKNO). 
No need to hold the extension + * lock because there cannot be concurrent inserters yet. + */ + metaBuffer = ReadBufferExtended(index, forknum, P_NEW, RBM_NORMAL, NULL); + START_CRIT_SECTION(); + LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE); + Assert(BufferGetBlockNumber(metaBuffer) == STIR_METAPAGE_BLKNO); + + metaPage = BufferGetPage(metaBuffer); + StirFillMetapage(index, metaPage, forknum == INIT_FORKNUM); + + MarkBufferDirty(metaBuffer); + END_CRIT_SECTION(); + UnlockReleaseBuffer(metaBuffer); +} + +/* + * Initialize any page of a stir index. + */ +void +StirInitPage(Page page, uint16 flags) +{ + StirPageOpaque opaque; + + PageInit(page, BLCKSZ, sizeof(StirPageOpaqueData)); + + opaque = StirPageGetOpaque(page); + opaque->flags = flags; + opaque->stir_page_id = STIR_PAGE_ID; +} + +/* + * Add a tuple to a STIR page. Returns false if tuple doesn't fit. + * The tuple is added to the end of the page. + */ +static bool +StirPageAddItem(Page page, StirTuple *tuple) +{ + StirTuple *itup; + StirPageOpaque opaque; + Pointer ptr; + + /* We shouldn't be pointed to an invalid page */ + Assert(!PageIsNew(page)); + + /* Does new tuple fit on the page? */ + if (StirPageGetFreeSpace(state, page) < sizeof(StirTuple)) + return false; + + /* Copy new tuple to the end of page */ + opaque = StirPageGetOpaque(page); + itup = StirPageGetTuple(page, opaque->maxoff + 1); + memcpy((Pointer) itup, (Pointer) tuple, sizeof(StirTuple)); + + /* Adjust maxoff and pd_lower */ + opaque->maxoff++; + ptr = (Pointer) StirPageGetTuple(page, opaque->maxoff + 1); + ((PageHeader) page)->pd_lower = ptr - page; + + /* Assert we didn't overrun available space */ + Assert(((PageHeader) page)->pd_lower <= ((PageHeader) page)->pd_upper); + return true; +} + +/* + * Insert a new tuple into a STIR index. + */ +bool +stirinsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo) +{ + StirTuple *itup; + MemoryContext oldCtx; + MemoryContext insertCtx; + StirMetaPageData *metaData; + Buffer buffer, + metaBuffer; + Page page; + uint16 blkNo; + + /* Create temporary context for insert operation */ + insertCtx = AllocSetContextCreate(CurrentMemoryContext, + "Stir insert temporary context", + ALLOCSET_DEFAULT_SIZES); + + oldCtx = MemoryContextSwitchTo(insertCtx); + + /* Create new tuple with heap pointer */ + itup = (StirTuple *) palloc0(sizeof(StirTuple)); + itup->heapPtr = *ht_ctid; + + Assert(!RelationNeedsWAL(index)); + metaBuffer = ReadBuffer(index, STIR_METAPAGE_BLKNO); + + for (;;) + { + LockBuffer(metaBuffer, BUFFER_LOCK_SHARE); + metaData = StirPageGetMeta(BufferGetPage(metaBuffer)); + /* Check if inserts are allowed */ + if (metaData->skipInserts) + { + UnlockReleaseBuffer(metaBuffer); + return false; + } + blkNo = metaData->lastBlkNo; + /* Don't hold metabuffer lock while doing insert */ + LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK); + + if (blkNo > 0) + { + buffer = ReadBuffer(index, blkNo); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + START_CRIT_SECTION(); + + page = BufferGetPage(buffer); + + Assert(!PageIsNew(page)); + + /* Try to add tuple to existing page */ + if (StirPageAddItem(page, itup)) + { + /* Success! 
Apply the change, clean up, and exit */
+				MarkBufferDirty(buffer);
+				END_CRIT_SECTION();
+
+				UnlockReleaseBuffer(buffer);
+				ReleaseBuffer(metaBuffer);
+				MemoryContextSwitchTo(oldCtx);
+				MemoryContextDelete(insertCtx);
+				return false;
+			}
+
+			END_CRIT_SECTION();
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/* Need to add new page - get exclusive lock on meta page */
+		LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE);
+
+		metaData = StirPageGetMeta(BufferGetPage(metaBuffer));
+		/* Check if another backend already extended the index */
+
+		if (blkNo != metaData->lastBlkNo)
+		{
+			Assert(blkNo < metaData->lastBlkNo);
+			/* Someone else inserted the new page into the index, let's try again */
+			LockBuffer(metaBuffer, BUFFER_LOCK_UNLOCK);
+			continue;
+		}
+		else
+		{
+			/* Must extend the file */
+			buffer = ExtendBufferedRel(BMR_REL(index), MAIN_FORKNUM, NULL,
+									   EB_LOCK_FIRST);
+			page = BufferGetPage(buffer);
+			START_CRIT_SECTION();
+
+			StirInitPage(page, 0);
+
+			if (!StirPageAddItem(page, itup))
+			{
+				/* We shouldn't be here since we're inserting to an empty page */
+				elog(ERROR, "could not add new stir tuple to empty page");
+			}
+
+			/* Update meta page with new last block number */
+			metaData->lastBlkNo = BufferGetBlockNumber(buffer);
+
+			MarkBufferDirty(metaBuffer);
+			MarkBufferDirty(buffer);
+
+			END_CRIT_SECTION();
+
+			UnlockReleaseBuffer(buffer);
+			UnlockReleaseBuffer(metaBuffer);
+
+			MemoryContextSwitchTo(oldCtx);
+			MemoryContextDelete(insertCtx);
+
+			return false;
+		}
+	}
+}
+
+/*
+ * STIR doesn't support scans - these functions all error out
+ */
+IndexScanDesc stirbeginscan(Relation r, int nkeys, int norderbys)
+{
+	ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not implemented", __func__)));
+}
+
+void
+stirrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+		   ScanKey orderbys, int norderbys)
+{
+	ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not implemented", __func__)));
+}
+
+void stirendscan(IndexScanDesc scan)
+{
+	ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not implemented", __func__)));
+}
+
+/*
+ * Build a STIR index - only allowed for auxiliary indexes.
+ * Just initializes the meta page without any heap scans.
+ */
+IndexBuildResult *stirbuild(Relation heap, Relation index,
+							struct IndexInfo *indexInfo)
+{
+	IndexBuildResult *result;
+
+	if (!indexInfo->ii_Auxiliary)
+		ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("STIR indexes cannot be built directly")));
+
+	StirInitMetapage(index, MAIN_FORKNUM);
+
+	result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+	result->heap_tuples = 0;
+	result->index_tuples = 0;
+	return result;
+}
+
+void stirbuildempty(Relation index)
+{
+	StirInitMetapage(index, INIT_FORKNUM);
+}
+
+IndexBulkDeleteResult *stirbulkdelete(IndexVacuumInfo *info,
+									  IndexBulkDeleteResult *stats,
+									  IndexBulkDeleteCallback callback,
+									  void *callback_state)
+{
+	Relation	index = info->index;
+	BlockNumber blkno, npages;
+	Buffer		buffer;
+	Page		page;
+
+	/* For normal VACUUM, mark to skip inserts and warn about index drop needed */
+	if (!info->validate_index)
+	{
+		StirMarkAsSkipInserts(index);
+
+		ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				errmsg("\"%s\" is not implemented, it seems like this index needs to be dropped", __func__)));
+		return NULL;
+	}
+
+	if (stats == NULL)
+		stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+
+	/*
+	 * Iterate over the pages.  We don't care about concurrently added pages,
+	 * because the index is marked as not-ready at that moment and the index is
+	 * not used for inserts.
+	 */
+	npages = RelationGetNumberOfBlocks(index);
+	for (blkno = STIR_HEAD_BLKNO; blkno < npages; blkno++)
+	{
+		StirTuple  *itup,
+				   *itupEnd;
+
+		vacuum_delay_point(false);
+
+		buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno,
+									RBM_NORMAL, info->strategy);
+
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+		page = BufferGetPage(buffer);
+
+		if (PageIsNew(page))
+		{
+			UnlockReleaseBuffer(buffer);
+			continue;
+		}
+
+		itup = StirPageGetTuple(page, FirstOffsetNumber);
+		itupEnd = StirPageGetTuple(page, OffsetNumberNext(StirPageGetMaxOffset(page)));
+		while (itup < itupEnd)
+		{
+			/* Do we have to delete this tuple? */
+			if (callback(&itup->heapPtr, callback_state))
+			{
+				ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("we never delete in stir")));
+			}
+
+			itup = StirPageGetNextTuple(itup);
+		}
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	return stats;
+}
+
+/*
+ * Mark a STIR index to skip future inserts
+ */
+void StirMarkAsSkipInserts(Relation index)
+{
+	StirMetaPageData *metaData;
+	Buffer		metaBuffer;
+	Page		metaPage;
+
+	Assert(!RelationNeedsWAL(index));
+	metaBuffer = ReadBuffer(index, STIR_METAPAGE_BLKNO);
+	LockBuffer(metaBuffer, BUFFER_LOCK_EXCLUSIVE);
+	START_CRIT_SECTION();
+
+	metaPage = BufferGetPage(metaBuffer);
+	metaData = StirPageGetMeta(metaPage);
+
+	if (!metaData->skipInserts)
+	{
+		metaData->skipInserts = true;
+		MarkBufferDirty(metaBuffer);
+	}
+	UnlockReleaseBuffer(metaBuffer);
+}
+
+IndexBulkDeleteResult *stirvacuumcleanup(IndexVacuumInfo *info,
+										 IndexBulkDeleteResult *stats)
+{
+	StirMarkAsSkipInserts(info->index);
+	ereport(WARNING, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+			errmsg("\"%s\" is not implemented, it seems like this index needs to be dropped", __func__)));
+	return NULL;
+}
+
+bytea *stiroptions(Datum reloptions, bool validate)
+{
+	return NULL;
+}
+
+void stircostestimate(PlannerInfo *root, IndexPath *path,
+					  double loop_count, Cost *indexStartupCost,
+					  Cost *indexTotalCost, Selectivity *indexSelectivity,
+					  double *indexCorrelation, double *indexPages)
+{
+	ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("\"%s\" is not implemented", __func__)));
+}
\ No newline at end of file
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index cca1dbb8e378..e9e22ec0e84b 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3433,6 +3433,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	ivinfo.message_level = DEBUG2;
 	ivinfo.num_heap_tuples = heapRelation->rd_rel->reltuples;
 	ivinfo.strategy = NULL;
+	ivinfo.validate_index = true;
 
 	/*
 	 * Encode TIDs as int8 values for the sort, rather than directly sorting
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 4fffb76e5573..38602e6a72dc 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -720,6 +720,7 @@ do_analyze_rel(Relation onerel, VacuumParams *params,
 			ivinfo.message_level = elevel;
 			ivinfo.num_heap_tuples = onerel->rd_rel->reltuples;
 			ivinfo.strategy = vac_strategy;
+			ivinfo.validate_index = false;
 
 			stats = index_vacuum_cleanup(&ivinfo, NULL);
 
diff --git a/src/backend/commands/vacuumparallel.c b/src/backend/commands/vacuumparallel.c
index 0feea1d30ec3..582db77ddc08 100644
--- a/src/backend/commands/vacuumparallel.c
+++ b/src/backend/commands/vacuumparallel.c
@@ -884,6 +884,7 @@ parallel_vacuum_process_one_index(ParallelVacuumState
*pvs, Relation indrel, ivinfo.estimated_count = pvs->shared->estimated_count; ivinfo.num_heap_tuples = pvs->shared->reltuples; ivinfo.strategy = pvs->bstrategy; + ivinfo.validate_index = false; /* Update error traceback information */ pvs->indname = pstrdup(RelationGetRelationName(indrel)); diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c index e2d9e9be41a6..e97e0943f5b7 100644 --- a/src/backend/nodes/makefuncs.c +++ b/src/backend/nodes/makefuncs.c @@ -875,6 +875,7 @@ makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid, List *expressions, /* initialize index-build state to default */ n->ii_BrokenHotChain = false; n->ii_ParallelWorkers = 0; + n->ii_Auxiliary = false; /* set up for possible use by index AM */ n->ii_Am = amoid; diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5f8..b99916edb4a9 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -73,6 +73,7 @@ typedef struct IndexVacuumInfo bool estimated_count; /* num_heap_tuples is an estimate */ int message_level; /* ereport level for progress messages */ double num_heap_tuples; /* tuples remaining in heap */ + bool validate_index; /* validating concurrently built index? */ BufferAccessStrategy strategy; /* access strategy for reads */ } IndexVacuumInfo; diff --git a/src/include/access/reloptions.h b/src/include/access/reloptions.h index dfbb4c854606..a121b4d31c9d 100644 --- a/src/include/access/reloptions.h +++ b/src/include/access/reloptions.h @@ -51,8 +51,9 @@ typedef enum relopt_kind RELOPT_KIND_VIEW = (1 << 9), RELOPT_KIND_BRIN = (1 << 10), RELOPT_KIND_PARTITIONED = (1 << 11), + RELOPT_KIND_STIR = (1 << 12), /* if you add a new kind, make sure you update "last_default" too */ - RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_PARTITIONED, + RELOPT_KIND_LAST_DEFAULT = RELOPT_KIND_STIR, /* some compilers treat enums as signed ints, so we can't use 1 << 31 */ RELOPT_KIND_MAX = (1 << 30) } relopt_kind; diff --git a/src/include/access/stir.h b/src/include/access/stir.h new file mode 100644 index 000000000000..9943c42a97e7 --- /dev/null +++ b/src/include/access/stir.h @@ -0,0 +1,117 @@ +/*------------------------------------------------------------------------- + * + * stir.h + * header file for postgres stir access method implementation. 
+ * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * src/include/access/stir.h + * + *------------------------------------------------------------------------- + */ +#ifndef _STIR_H_ +#define _STIR_H_ + +#include "amapi.h" +#include "xlog.h" +#include "generic_xlog.h" +#include "itup.h" +#include "fmgr.h" +#include "nodes/pathnodes.h" + +/* Support procedures numbers */ +#define STIR_NPROC 0 + +/* Scan strategies */ +#define STIR_NSTRATEGIES 1 + +#define STIR_OPTIONS_PROC 0 + +/* Macros for accessing stir page structures */ +#define StirPageGetOpaque(page) ((StirPageOpaque) PageGetSpecialPointer(page)) +#define StirPageGetMaxOffset(page) (StirPageGetOpaque(page)->maxoff) +#define StirPageIsMeta(page) \ + ((StirPageGetOpaque(page)->flags & STIR_META) != 0) +#define StirPageGetData(page) ((StirTuple *)PageGetContents(page)) +#define StirPageGetTuple(page, offset) \ + ((StirTuple *)(PageGetContents(page) \ + + sizeof(StirTuple) * ((offset) - 1))) +#define StirPageGetNextTuple(tuple) \ + ((StirTuple *)((Pointer)(tuple) + sizeof(StirTuple))) + + + +/* Preserved page numbers */ +#define STIR_METAPAGE_BLKNO (0) +#define STIR_HEAD_BLKNO (1) /* first data page */ + + +/* Opaque for stir pages */ +typedef struct StirPageOpaqueData +{ + OffsetNumber maxoff; /* number of index tuples on page */ + uint16 flags; /* see bit definitions below */ + uint16 unused; /* placeholder to force maxaligning of size of + * StirPageOpaqueData and to place + * stir_page_id exactly at the end of page */ + uint16 stir_page_id; /* for identification of STIR indexes */ +} StirPageOpaqueData; + +/* Stir page flags */ +#define STIR_META (1<<0) + +typedef StirPageOpaqueData *StirPageOpaque; + +#define STIR_PAGE_ID 0xFF84 + +/* Metadata of stir index */ +typedef struct StirMetaPageData +{ + uint32 magickNumber; + uint16 lastBlkNo; + bool skipInserts; /* should we just exit without any inserts */ +} StirMetaPageData; + +/* Magic number to distinguish stir pages from others */ +#define STIR_MAGICK_NUMBER (0xDBAC0DEF) + +#define StirPageGetMeta(page) ((StirMetaPageData *) PageGetContents(page)) + +typedef struct StirTuple +{ + ItemPointerData heapPtr; +} StirTuple; + +#define StirPageGetFreeSpace(state, page) \ + (BLCKSZ - MAXALIGN(SizeOfPageHeaderData) \ + - StirPageGetMaxOffset(page) * (sizeof(StirTuple)) \ + - MAXALIGN(sizeof(StirPageOpaqueData))) + +extern void StirFillMetapage(Relation index, Page metaPage, bool skipInserts); +extern void StirInitMetapage(Relation index, ForkNumber forknum); +extern void StirInitPage(Page page, uint16 flags); +extern void StirMarkAsSkipInserts(Relation index); + +/* index access method interface functions */ +extern bool stirvalidate(Oid opclassoid); +extern bool stirinsert(Relation index, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +extern IndexScanDesc stirbeginscan(Relation r, int nkeys, int norderbys); +extern void stirrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +extern void stirendscan(IndexScanDesc scan); +extern IndexBuildResult *stirbuild(Relation heap, Relation index, + struct IndexInfo *indexInfo); +extern void stirbuildempty(Relation index); +extern IndexBulkDeleteResult *stirbulkdelete(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, + void *callback_state); +extern IndexBulkDeleteResult *stirvacuumcleanup(IndexVacuumInfo *info, + 
IndexBulkDeleteResult *stats); +extern bytea *stiroptions(Datum reloptions, bool validate); + +#endif \ No newline at end of file diff --git a/src/include/catalog/pg_am.dat b/src/include/catalog/pg_am.dat index 26d15928a155..a5ecf9208ad5 100644 --- a/src/include/catalog/pg_am.dat +++ b/src/include/catalog/pg_am.dat @@ -33,5 +33,8 @@ { oid => '3580', oid_symbol => 'BRIN_AM_OID', descr => 'block range index (BRIN) access method', amname => 'brin', amhandler => 'brinhandler', amtype => 'i' }, +{ oid => '5555', oid_symbol => 'STIR_AM_OID', + descr => 'short term index replacement access method', + amname => 'stir', amhandler => 'stirhandler', amtype => 'i' }, ] diff --git a/src/include/catalog/pg_opclass.dat b/src/include/catalog/pg_opclass.dat index 4a9624802aa5..6227c5658fc4 100644 --- a/src/include/catalog/pg_opclass.dat +++ b/src/include/catalog/pg_opclass.dat @@ -488,4 +488,8 @@ # no brin opclass for the geometric types except box +# allow any types for STIR +{ opcmethod => 'stir', oid_symbol => 'ANY_STIR_OPS_OID', opcname => 'stir_ops', + opcfamily => 'stir/any_ops', opcintype => 'any'}, + ] diff --git a/src/include/catalog/pg_opfamily.dat b/src/include/catalog/pg_opfamily.dat index f7dcb96b43ce..838ad32c9328 100644 --- a/src/include/catalog/pg_opfamily.dat +++ b/src/include/catalog/pg_opfamily.dat @@ -304,5 +304,7 @@ opfmethod => 'hash', opfname => 'multirange_ops' }, { oid => '6158', opfmethod => 'gist', opfname => 'multirange_ops' }, +{ oid => '5558', + opfmethod => 'stir', opfname => 'any_ops' }, ] diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index d3d28a263fa9..198795f010fb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -935,6 +935,10 @@ proname => 'brinhandler', provolatile => 'v', prorettype => 'index_am_handler', proargtypes => 'internal', prosrc => 'brinhandler' }, +{ oid => '5556', descr => 'short term index replacement access method handler', + proname => 'stirhandler', provolatile => 'v', + prorettype => 'index_am_handler', proargtypes => 'internal', + prosrc => 'stirhandler' }, { oid => '3952', descr => 'brin: standalone scan new table pages', proname => 'brin_summarize_new_values', provolatile => 'v', proparallel => 'u', prorettype => 'int4', proargtypes => 'regclass', diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 2492282213ff..0341bb743250 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -181,12 +181,13 @@ typedef struct ExprState * BrokenHotChain did we detect any broken HOT chains? * Summarizing is it a summarizing index? * ParallelWorkers # of workers requested (excludes leader) + * Auxiliary # index-helper for concurrent build? * Am Oid of index AM * AmCache private cache area for index AM * Context memory context holding this IndexInfo * - * ii_Concurrent, ii_BrokenHotChain, and ii_ParallelWorkers are used only - * during index build; they're conventionally zeroed otherwise. + * ii_Concurrent, ii_BrokenHotChain, ii_Auxiliary and ii_ParallelWorkers + * are used only during index build; they're conventionally zeroed otherwise. 
* ---------------- */ typedef struct IndexInfo @@ -215,6 +216,7 @@ typedef struct IndexInfo bool ii_Summarizing; bool ii_WithoutOverlaps; int ii_ParallelWorkers; + bool ii_Auxiliary; Oid ii_Am; void *ii_AmCache; MemoryContext ii_Context; diff --git a/src/include/utils/index_selfuncs.h b/src/include/utils/index_selfuncs.h index 6c64db6d456c..e0d939d68573 100644 --- a/src/include/utils/index_selfuncs.h +++ b/src/include/utils/index_selfuncs.h @@ -62,6 +62,14 @@ extern void spgcostestimate(struct PlannerInfo *root, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages); +extern void stircostestimate(struct PlannerInfo *root, + struct IndexPath *path, + double loop_count, + Cost *indexStartupCost, + Cost *indexTotalCost, + Selectivity *indexSelectivity, + double *indexCorrelation, + double *indexPages); extern void gincostestimate(struct PlannerInfo *root, struct IndexPath *path, double loop_count, diff --git a/src/test/regress/expected/amutils.out b/src/test/regress/expected/amutils.out index 7ab6113c6191..92c033a20100 100644 --- a/src/test/regress/expected/amutils.out +++ b/src/test/regress/expected/amutils.out @@ -173,7 +173,13 @@ select amname, prop, pg_indexam_has_property(a.oid, prop) as p spgist | can_exclude | t spgist | can_include | t spgist | bogus | -(36 rows) + stir | can_order | f + stir | can_unique | f + stir | can_multi_col | t + stir | can_exclude | f + stir | can_include | t + stir | bogus | +(42 rows) -- -- additional checks for pg_index_column_has_property diff --git a/src/test/regress/expected/opr_sanity.out b/src/test/regress/expected/opr_sanity.out index 20bf9ea9cdf7..fc116b84a28c 100644 --- a/src/test/regress/expected/opr_sanity.out +++ b/src/test/regress/expected/opr_sanity.out @@ -2122,9 +2122,10 @@ FROM pg_opclass AS c1 WHERE NOT EXISTS(SELECT 1 FROM pg_amop AS a1 WHERE a1.amopfamily = c1.opcfamily AND binary_coercible(c1.opcintype, a1.amoplefttype)); - opcname | opcfamily ----------+----------- -(0 rows) + opcname | opcfamily +----------+----------- + stir_ops | 5558 +(1 row) -- Check that each operator listed in pg_amop has an associated opclass, -- that is one whose opcintype matches oprleft (possibly by coercion). 
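
(Editorial aside: the opr_sanity exception above is intentional — stir_ops has
no pg_amop entries because STIR indexes are never scanned. stirvalidate() is
declared in stir.h but its body is outside this excerpt; a plausible minimal
validator under that assumption is sketched below, and the patch's real
implementation may differ.)

    /*
     * Sketch only: a STIR opclass carries no operators or support
     * procedures, so there is nothing to cross-check.
     */
    bool
    stirvalidate(Oid opclassoid)
    {
        return true;
    }
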
diff --git a/src/test/regress/expected/psql.out b/src/test/regress/expected/psql.out index cf48ae6d0c2e..52dde57680da 100644 --- a/src/test/regress/expected/psql.out +++ b/src/test/regress/expected/psql.out @@ -5137,7 +5137,8 @@ List of access methods heap | Table heap2 | Table spgist | Index -(8 rows) + stir | Index +(9 rows) \dA * List of access methods @@ -5151,7 +5152,8 @@ List of access methods heap | Table heap2 | Table spgist | Index -(8 rows) + stir | Index +(9 rows) \dA h* List of access methods @@ -5176,9 +5178,9 @@ List of access methods \dA: extra argument "bar" ignored \dA+ - List of access methods - Name | Type | Handler | Description ---------+-------+----------------------+---------------------------------------- + List of access methods + Name | Type | Handler | Description +--------+-------+----------------------+-------------------------------------------- brin | Index | brinhandler | block range index (BRIN) access method btree | Index | bthandler | b-tree index access method gin | Index | ginhandler | GIN index access method @@ -5187,12 +5189,13 @@ List of access methods heap | Table | heap_tableam_handler | heap table access method heap2 | Table | heap_tableam_handler | spgist | Index | spghandler | SP-GiST index access method -(8 rows) + stir | Index | stirhandler | short term index replacement access method +(9 rows) \dA+ * - List of access methods - Name | Type | Handler | Description ---------+-------+----------------------+---------------------------------------- + List of access methods + Name | Type | Handler | Description +--------+-------+----------------------+-------------------------------------------- brin | Index | brinhandler | block range index (BRIN) access method btree | Index | bthandler | b-tree index access method gin | Index | ginhandler | GIN index access method @@ -5201,7 +5204,8 @@ List of access methods heap | Table | heap_tableam_handler | heap table access method heap2 | Table | heap_tableam_handler | spgist | Index | spghandler | SP-GiST index access method -(8 rows) + stir | Index | stirhandler | short term index replacement access method +(9 rows) \dA+ h* List of access methods From 34936673771535c98c0850db72acb412b1fab9d0 Mon Sep 17 00:00:00 2001 From: Mikhail Nikalayeu Date: Sat, 25 Jan 2025 13:33:21 +0100 Subject: [PATCH 07/12] Add Datum storage support to tuplestore Extend tuplestore to store individual Datum values: - fixed-length datatypes: store raw bytes without a length header - variable-length datatypes: include a length header and padding - by-value types: store inline This support enables usages tuplestore for non-tuple data (TIDs) in the next commit. --- src/backend/utils/sort/tuplestore.c | 260 +++++++++++++++++++++++----- src/include/utils/tuplestore.h | 33 ++-- 2 files changed, 239 insertions(+), 54 deletions(-) diff --git a/src/backend/utils/sort/tuplestore.c b/src/backend/utils/sort/tuplestore.c index c9aecab8d66c..12ae705c091a 100644 --- a/src/backend/utils/sort/tuplestore.c +++ b/src/backend/utils/sort/tuplestore.c @@ -1,16 +1,19 @@ /*------------------------------------------------------------------------- * * tuplestore.c - * Generalized routines for temporary tuple storage. + * Generalized routines for temporary storage of tuples and Datums. + * + * This module handles temporary storage of either tuples or single + * Datum values for purposes such as Materialize nodes, hashjoin batch + * files, etc. 
It is essentially a dumbed-down version of tuplesort.c;
+ * it does no sorting of tuples but can only store and regurgitate a sequence
+ * of tuples.  However, because no sort is required, it is allowed to start
+ * reading the sequence before it has all been written.
+ *
+ * This is particularly useful for cursors, because it allows random access
+ * within the already-scanned portion of a query without having to process
+ * the underlying scan to completion.
  *
- * This module handles temporary storage of tuples for purposes such
- * as Materialize nodes, hashjoin batch files, etc.  It is essentially
- * a dumbed-down version of tuplesort.c; it does no sorting of tuples
- * but can only store and regurgitate a sequence of tuples.  However,
- * because no sort is required, it is allowed to start reading the sequence
- * before it has all been written.  This is particularly useful for cursors,
- * because it allows random access within the already-scanned portion of
- * a query without having to process the underlying scan to completion.
  * Also, it is possible to support multiple independent read pointers.
  *
  * A temporary file is used to handle the data if it exceeds the
@@ -61,6 +64,8 @@
 #include "executor/executor.h"
 #include "miscadmin.h"
 #include "storage/buffile.h"
+#include "utils/datum.h"
+#include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/resowner.h"
 
@@ -115,16 +120,15 @@ struct Tuplestorestate
 	BufFile    *myfile;			/* underlying file, or NULL if none */
 	MemoryContext context;		/* memory context for holding tuples */
 	ResourceOwner resowner;		/* resowner for holding temp files */
+	Oid			datumType;		/* InvalidOid, or OID of the Datum type stored */
+	int16		datumTypeLen;	/* typlen of that type */
+	bool		datumTypeByVal;	/* is that type pass-by-value? */
 
 	/*
 	 * These function pointers decouple the routines that must know what kind
 	 * of tuple we are handling from the routines that don't need to know it.
 	 * They are set up by the tuplestore_begin_xxx routines.
 	 *
-	 * (Although tuplestore.c currently only supports heap tuples, I've copied
-	 * this part of tuplesort.c so that extension to other kinds of objects
-	 * will be easy if it's ever needed.)
-	 *
 	 * Function to copy a supplied input tuple into palloc'd space. (NB: we
 	 * assume that a single pfree() is enough to release the tuple later, so
 	 * the representation must be "flat" in one palloc chunk.) state->availMem
@@ -143,12 +147,18 @@ struct Tuplestorestate
 
 	/*
 	 * Function to read a stored tuple from tape back into memory. 'len' is
-	 * the already-read length of the stored tuple.  Create and return a
-	 * palloc'd copy, and decrease state->availMem by the amount of memory
-	 * space consumed.
+	 * the already-known (read from tape, or constant) length of the stored
+	 * tuple.  Create and return a palloc'd copy, and decrease
+	 * state->availMem by the amount of memory space consumed.
 	 */
 	void	   *(*readtup) (Tuplestorestate *state, unsigned int len);
 
+	/*
+	 * Function to get the length of a tuple from tape.  Used to provide the
+	 * 'len' argument for readtup (see above).
+	 */
+	unsigned int (*lentup) (Tuplestorestate *state, bool eofOK);
+
 	/*
 	 * This array holds pointers to tuples in memory if we are in state INMEM.
 	 * In states WRITEFILE and READFILE it's not used.
@@ -185,6 +195,7 @@ struct Tuplestorestate
 #define COPYTUP(state,tup)	((*(state)->copytup) (state, tup))
 #define WRITETUP(state,tup) ((*(state)->writetup) (state, tup))
 #define READTUP(state,len)	((*(state)->readtup) (state, len))
+#define LENTUP(state,eofOK) ((*(state)->lentup) (state, eofOK))
 #define LACKMEM(state)		((state)->availMem < 0)
 #define USEMEM(state,amt)	((state)->availMem -= (amt))
 #define FREEMEM(state,amt)	((state)->availMem += (amt))
@@ -193,9 +204,9 @@ struct Tuplestorestate
  *
  * NOTES about on-tape representation of tuples:
  *
- * We require the first "unsigned int" of a stored tuple to be the total size
- * on-tape of the tuple, including itself (so it is never zero).
- * The remainder of the stored tuple
+ * For tuples, we require the first "unsigned int" of a stored tuple to be
+ * the total on-tape size of the tuple, including itself (so it is never
+ * zero).  The remainder of the stored tuple
 * may or may not match the in-memory representation of the tuple ---
 * any conversion needed is the job of the writetup and readtup routines.
 *
@@ -206,10 +217,13 @@ struct Tuplestorestate
 * state->backward is not set, the write/read routines may omit the extra
 * length word.
 *
+ * For Datums of constant length, both "unsigned int" length words are
+ * omitted.
+ *
 * writetup is expected to write both length words as well as the tuple
 * data.  When readtup is called, the tape is positioned just after the
- * front length word; readtup must read the tuple data and advance past
- * the back length word (if present).
+ * front length word (unless it is omitted, as for constant-size Datums);
+ * readtup must read the tuple data and advance past the back length word
+ * (if present).
 *
 * The write/read routines can make use of the tuple description data
 * stored in the Tuplestorestate record, if needed.  They are also expected
@@ -241,11 +255,16 @@ static Tuplestorestate *tuplestore_begin_common(int eflags,
 static void tuplestore_puttuple_common(Tuplestorestate *state, void *tuple);
 static void dumptuples(Tuplestorestate *state);
 static void tuplestore_updatemax(Tuplestorestate *state);
-static unsigned int getlen(Tuplestorestate *state, bool eofOK);
+
+static unsigned int lentup_heap(Tuplestorestate *state, bool eofOK);
 static void *copytup_heap(Tuplestorestate *state, void *tup);
 static void writetup_heap(Tuplestorestate *state, void *tup);
 static void *readtup_heap(Tuplestorestate *state, unsigned int len);
 
+static unsigned int lentup_datum(Tuplestorestate *state, bool eofOK);
+static void *copytup_datum(Tuplestorestate *state, void *datum);
+static void writetup_datum(Tuplestorestate *state, void *datum);
+static void *readtup_datum(Tuplestorestate *state, unsigned int len);
 
 /*
  *		tuplestore_begin_xxx
@@ -268,6 +287,12 @@ tuplestore_begin_common(int eflags, bool interXact, int maxKBytes)
 	state->allowedMem = maxKBytes * (int64) 1024;
 	state->availMem = state->allowedMem;
 	state->myfile = NULL;
+
+	/* Set Datum-related fields to invalid; tuplestore_begin_datum() overrides them. */
+	state->datumType = InvalidOid;
+	state->datumTypeLen = 0;
+	state->datumTypeByVal = false;
 
 	/*
 	 * The palloc/pfree pattern for tuple memory is in a FIFO pattern.  A
@@ -345,6 +370,36 @@ tuplestore_begin_heap(bool randomAccess, bool interXact, int maxKBytes)
 	state->copytup = copytup_heap;
 	state->writetup = writetup_heap;
 	state->readtup = readtup_heap;
+	state->lentup = lentup_heap;
+
+	return state;
+}
+
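(Editorial aside, not part of the patch: a usage sketch of the new Datum-store
API, round-tripping encoded TIDs the way a later patch in this series does.
itemptr_encode()/itemptr_decode() are the existing helpers used elsewhere in
the series; work_mem, tid and the surrounding declarations are assumed
context.)

    Tuplestorestate *ts;
    Datum		val;
    bool		should_free;

    ts = tuplestore_begin_datum(INT8OID, false, false, work_mem);
    tuplestore_putdatum(ts, Int64GetDatum(itemptr_encode(&tid)));
    /* ... more TIDs ... */

    while (tuplestore_getdatum(ts, true, &should_free, &val))
    {
        ItemPointerData decoded;

        itemptr_decode(&decoded, DatumGetInt64(val));
        /* ... use decoded ... */
        if (should_free)
            pfree(DatumGetPointer(val));
    }
    tuplestore_end(ts);
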
+/*
+ * Like tuplestore_begin_heap(), but creates a store for single Datum values
+ * of the given type.
+ */
+Tuplestorestate *
+tuplestore_begin_datum(Oid datumType, bool randomAccess, bool interXact,
+					   int maxKBytes)
+{
+	Tuplestorestate *state;
+	int			eflags;
+
+	/*
+	 * This interpretation of the meaning of randomAccess is compatible with
+	 * the pre-8.3 behavior of tuplestores.
+	 */
+	eflags = randomAccess ?
+		(EXEC_FLAG_BACKWARD | EXEC_FLAG_REWIND) :
+		(EXEC_FLAG_REWIND);
+
+	state = tuplestore_begin_common(eflags, interXact, maxKBytes);
+	state->datumType = datumType;
+	get_typlenbyval(state->datumType, &state->datumTypeLen, &state->datumTypeByVal);
+
+	state->copytup = copytup_datum;
+	state->writetup = writetup_datum;
+	state->readtup = readtup_datum;
+	state->lentup = lentup_datum;
 
 	return state;
 }
@@ -776,6 +831,25 @@ tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple)
 	MemoryContextSwitchTo(oldcxt);
 }
 
+/*
+ * Like tuplestore_puttuple(), but for a single Datum.
+ */
+void
+tuplestore_putdatum(Tuplestorestate *state, Datum datum)
+{
+	MemoryContext oldcxt = MemoryContextSwitchTo(state->context);
+
+	/*
+	 * Copy the Datum.  (Must do this even in the WRITEFILE case.  Note that
+	 * COPYTUP includes USEMEM, so we needn't do that here.)
+	 */
+	datum = PointerGetDatum(COPYTUP(state, DatumGetPointer(datum)));
+
+	tuplestore_puttuple_common(state, DatumGetPointer(datum));
+
+	MemoryContextSwitchTo(oldcxt);
+}
+
 /*
  * Similar to tuplestore_puttuple(), but work from values + nulls arrays.
  * This avoids an extra tuple-construction operation.
@@ -1030,7 +1104,7 @@ tuplestore_gettuple(Tuplestorestate *state, bool forward,
 			*should_free = true;
 			if (forward)
 			{
-				if ((tuplen = getlen(state, true)) != 0)
+				if ((tuplen = LENTUP(state, true)) != 0)
 				{
 					tup = READTUP(state, tuplen);
 					return tup;
@@ -1059,7 +1133,7 @@
 					Assert(!state->truncated);
 					return NULL;
 				}
-				tuplen = getlen(state, false);
+				tuplen = LENTUP(state, false);
 
 				if (readptr->eof_reached)
 				{
@@ -1090,7 +1164,7 @@
 					Assert(!state->truncated);
 					return NULL;
 				}
-				tuplen = getlen(state, false);
+				tuplen = LENTUP(state, false);
 			}
 
 			/*
@@ -1152,6 +1226,25 @@
 	}
 }
 
+/*
+ * Like tuplestore_gettupleslot(), but for a single Datum.
+ */
+bool
+tuplestore_getdatum(Tuplestorestate *state, bool forward,
+					bool *should_free, Datum *result)
+{
+	Datum		datum;
+
+	datum = PointerGetDatum(tuplestore_gettuple(state, forward, should_free));
+
+	/*
+	 * By-value datums travel as the "tuple" pointer itself; they must never
+	 * be pfree'd, regardless of what tuplestore_gettuple() reported.
+	 */
+	if (state->datumTypeByVal)
+		*should_free = false;
+
+	if (DatumGetPointer(datum) != NULL)
+	{
+		*result = datum;
+		return true;
+	}
+	else
+	{
+		*result = PointerGetDatum(NULL);
+		return false;
+	}
+}
+
 /*
  * tuplestore_advance - exported function to adjust position without fetching
  *
@@ -1556,13 +1649,18 @@ tuplestore_in_memory(Tuplestorestate *state)
 	return (state->status == TSS_INMEM);
 }
 
-
 /*
- * Tape interface routines
+ * Routines specialized for the HeapTuple case
+ *
+ * The stored form is actually a MinimalTuple, but for largely historical
+ * reasons we allow COPYTUP to work from a HeapTuple.
  */
 
 static unsigned int
-getlen(Tuplestorestate *state, bool eofOK)
+lentup_heap(Tuplestorestate *state, bool eofOK)
 {
 	unsigned int len;
 	size_t		nbytes;
@@ -1574,17 +1672,6 @@
 	return len;
 }
 
-
-/*
- * Routines specialized for HeapTuple case
- *
- * The stored form is actually a MinimalTuple, but for largely historical
- * reasons we allow COPYTUP to work from a HeapTuple.
- *
- * Since MinimalTuple already has length in its first word, we don't need
- * to write that separately.
- */
-
 static void *
 copytup_heap(Tuplestorestate *state, void *tup)
 {
@@ -1631,3 +1718,98 @@ readtup_heap(Tuplestorestate *state, unsigned int len)
 		BufFileReadExact(state->myfile, &tuplen, sizeof(tuplen));
 	return tuple;
 }
+
+/*
+ * Routines specialized for the Datum case.
+ *
+ * Handles both fixed- and variable-length Datums:
+ * - Fixed-length: stores raw bytes without a length prefix
+ * - Variable-length: includes a length prefix (and suffix for backward scans)
+ * - By-value types are handled inline, without extra copying
+ */
+
+static unsigned int
+lentup_datum(Tuplestorestate *state, bool eofOK)
+{
+	unsigned int len;
+	size_t		nbytes;
+
+	Assert(state->datumType != InvalidOid);
+
+	/* Fixed-length types store no length word; the length is constant. */
+	if (state->datumTypeLen > 0)
+		return state->datumTypeLen;
+
+	nbytes = BufFileReadMaybeEOF(state->myfile, &len, sizeof(len), eofOK);
+	if (nbytes == 0)
+		return 0;
+	else
+		return len;
+}
+
+static void *
+copytup_datum(Tuplestorestate *state, void *datum)
+{
+	Assert(state->datumType != InvalidOid);
+
+	if (state->datumTypeByVal)
+		return datum;			/* the value itself travels as the pointer */
+	else
+	{
+		Datum		d = datumCopy(PointerGetDatum(datum),
+								  state->datumTypeByVal,
+								  state->datumTypeLen);
+
+		USEMEM(state, GetMemoryChunkSpace(DatumGetPointer(d)));
+		return DatumGetPointer(d);
+	}
+}
+
+static void
+writetup_datum(Tuplestorestate *state, void *datum)
+{
+	Assert(state->datumType != InvalidOid);
+
+	if (state->datumTypeByVal)
+	{
+		/* "datum" is the value itself; write its low datumTypeLen bytes */
+		Assert(state->datumTypeLen > 0);
+		BufFileWrite(state->myfile, &datum, state->datumTypeLen);
+	}
+	else
+	{
+		unsigned int size = state->datumTypeLen;
+
+		if (state->datumTypeLen < 0)
+		{
+			/* compute the actual size before writing the length word */
+			size = datumGetSize(PointerGetDatum(datum),
+								state->datumTypeByVal,
+								state->datumTypeLen);
+			BufFileWrite(state->myfile, &size, sizeof(size));
+		}
+
+		BufFileWrite(state->myfile, datum, size);
+
+		/* need trailing length word? */
+		if (state->backward && state->datumTypeLen < 0)
+			BufFileWrite(state->myfile, &size, sizeof(size));
+
+		FREEMEM(state, GetMemoryChunkSpace(datum));
+		pfree(datum);
+	}
+}
+
+static void *
+readtup_datum(Tuplestorestate *state, unsigned int len)
+{
+	Assert(state->datumType != InvalidOid);
+
+	if (state->datumTypeByVal)
+	{
+		Datum		datum = PointerGetDatum(NULL);
+
+		Assert(state->datumTypeLen > 0);
+		Assert(len == state->datumTypeLen);
+		BufFileReadExact(state->myfile, &datum, state->datumTypeLen);
+		return DatumGetPointer(datum);
+	}
+	else
+	{
+		/* read the payload into a fresh palloc'd chunk and return it */
+		void	   *datum = palloc(len);
+
+		BufFileReadExact(state->myfile, datum, len);
+		USEMEM(state, GetMemoryChunkSpace(datum));
+
+		/* need trailing length word? */
+		if (state->backward && state->datumTypeLen < 0)
+			BufFileReadExact(state->myfile, &len, sizeof(len));
+
+		return datum;
+	}
+}
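
(Editorial aside, for reference: the on-tape layouts writetup_datum() produces
under the conventions described above, for a forward-only store:

    by-value type, typlen N > 0:      [N data bytes]               no length words
    fixed-length by-ref, typlen N:    [N data bytes]               no length words
    varlena, typlen -1:               [uint len][len data bytes]
    varlena, backward scans enabled:  [uint len][len data bytes][uint len]

lentup_datum() mirrors this: it returns the constant typlen without touching
the file for fixed-length types, and reads the leading length word otherwise.)
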
diff --git a/src/include/utils/tuplestore.h b/src/include/utils/tuplestore.h
index 865ba7b82655..0341c47b8513 100644
--- a/src/include/utils/tuplestore.h
+++ b/src/include/utils/tuplestore.h
@@ -1,17 +1,18 @@
 /*-------------------------------------------------------------------------
  *
  * tuplestore.h
- *	  Generalized routines for temporary tuple storage.
+ *	  Generalized routines for temporary storage of tuples and Datums.
  *
- * This module handles temporary storage of tuples for purposes such
- * as Materialize nodes, hashjoin batch files, etc.  It is essentially
- * a dumbed-down version of tuplesort.c; it does no sorting of tuples
- * but can only store and regurgitate a sequence of tuples.  However,
- * because no sort is required, it is allowed to start reading the sequence
- * before it has all been written.  This is particularly useful for cursors,
- * because it allows random access within the already-scanned portion of
- * a query without having to process the underlying scan to completion.
- * Also, it is possible to support multiple independent read pointers.
+ * This module handles temporary storage of either tuples or single
+ * Datum values for purposes such as Materialize nodes, hashjoin batch
+ * files, etc.  It is essentially a dumbed-down version of tuplesort.c;
+ * it does no sorting of tuples but can only store and regurgitate a sequence
+ * of tuples.  However, because no sort is required, it is allowed to start
+ * reading the sequence before it has all been written.
+ *
+ * This is particularly useful for cursors, because it allows random access
+ * within the already-scanned portion of a query without having to process
+ * the underlying scan to completion.  Also, it is possible to support
+ * multiple independent read pointers.
  *
  * A temporary file is used to handle the data if it exceeds the
  * space limit specified by the caller.
@@ -39,14 +40,13 @@
  */
 typedef struct Tuplestorestate Tuplestorestate;
 
-/*
- * Currently we only need to store MinimalTuples, but it would be easy
- * to support the same behavior for IndexTuples and/or bare Datums.
- */
-
 extern Tuplestorestate *tuplestore_begin_heap(bool randomAccess,
 											  bool interXact,
 											  int maxKBytes);
+extern Tuplestorestate *tuplestore_begin_datum(Oid datumType,
+											   bool randomAccess,
+											   bool interXact,
+											   int maxKBytes);
 
 extern void tuplestore_set_eflags(Tuplestorestate *state, int eflags);
 
@@ -55,6 +55,7 @@ extern void tuplestore_puttupleslot(Tuplestorestate *state,
 extern void tuplestore_puttuple(Tuplestorestate *state, HeapTuple tuple);
 extern void tuplestore_putvalues(Tuplestorestate *state, TupleDesc tdesc,
 								 const Datum *values, const bool *isnull);
+extern void tuplestore_putdatum(Tuplestorestate *state, Datum datum);
 
 extern int	tuplestore_alloc_read_pointer(Tuplestorestate *state, int eflags);
 
@@ -72,6 +73,8 @@ extern bool tuplestore_in_memory(Tuplestorestate *state);
 extern bool tuplestore_gettupleslot(Tuplestorestate *state, bool forward,
 									bool copy, TupleTableSlot *slot);
+extern bool tuplestore_getdatum(Tuplestorestate *state, bool forward,
+								bool *should_free, Datum *result);
 
 extern bool tuplestore_advance(Tuplestorestate *state, bool forward);
 

From 09bc7791ad6d7fee11bc4e3bf4539f2d033ceae3 Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Tue, 31 Dec 2024 15:03:10 +0100
Subject: [PATCH 08/12] Use auxiliary indexes for concurrent index operations

Replace the second full table scan in concurrent index builds with an
auxiliary index approach:

- create a STIR auxiliary index with the same predicate (if any) as the
  main index
- use it to track tuples inserted during the first phase
- merge the auxiliary index into the main index during validation, to
  catch the new index up with any tuples missed during the first phase
- automatically drop the auxiliary index once the main index is ready

To merge the main and auxiliary indexes:

- index_bulk_delete is called for both, and the TIDs are put into
  tuplesorts
- both tuplesorts are sorted
- both tuplesorts are scanned with two pointers, looking for TIDs that
  are present in the auxiliary index but absent from the main one
- all such TIDs are put into a tuplestore
- the TIDs in the tuplestore are fetched using a read stream;
  heapam_index_validate_scan_read_stream_next uses the tuplestore to
  provide the next page to prefetch
- if a fetched tuple is alive, it is inserted into the main index

This eliminates the need for a second full table scan during
validation, improving performance especially for large tables. Affects
both CREATE INDEX CONCURRENTLY and REINDEX INDEX CONCURRENTLY
operations.
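
(Editorial aside before the diffstat: in miniature, the validation merge is an
ordered set difference. A toy model of the two-pointer walk, with plain sorted
int64 arrays standing in for the two tuplesorts and the output tuplestore:)

    /*
     * Emit every element of aux[] that is absent from mainarr[];
     * both inputs are sorted ascending and duplicate-free, as TID
     * streams from a single table are.  Returns the output count.
     */
    static int
    set_difference(const int64 *aux, int naux,
                   const int64 *mainarr, int nmain,
                   int64 *out)
    {
        int     i = 0,
                j = 0,
                n = 0;

        while (i < naux)
        {
            /* advance the main cursor until it reaches or passes aux[i] */
            while (j < nmain && mainarr[j] < aux[i])
                j++;
            /* main exhausted or overshot: aux[i] is missing from main */
            if (j >= nmain || mainarr[j] > aux[i])
                out[n++] = aux[i];
            i++;
        }
        return n;
    }
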
---
 doc/src/sgml/monitoring.sgml               |  26 +-
 doc/src/sgml/ref/create_index.sgml         |  34 +-
 doc/src/sgml/ref/reindex.sgml              |  41 +-
 src/backend/access/heap/README.HOT         |  13 +-
 src/backend/access/heap/heapam_handler.c   | 539 ++++++++++++++-------
 src/backend/catalog/index.c                | 292 +++++++++--
 src/backend/catalog/system_views.sql       |  17 +-
 src/backend/catalog/toasting.c             |   3 +-
 src/backend/commands/indexcmds.c           | 337 +++++++++++--
 src/backend/nodes/makefuncs.c              |   4 +-
 src/include/access/tableam.h               |  28 +-
 src/include/catalog/index.h                |  12 +-
 src/include/commands/progress.h            |  13 +-
 src/include/nodes/execnodes.h              |   4 +-
 src/include/nodes/makefuncs.h              |   3 +-
 src/test/regress/expected/create_index.out |  42 ++
 src/test/regress/expected/indexing.out     |   3 +-
 src/test/regress/expected/rules.out        |  17 +-
 src/test/regress/sql/create_index.sql      |  21 +
 19 files changed, 1101 insertions(+), 348 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 4265a22d4de3..8ccd69b14c25 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -6314,6 +6314,18 @@ FROM pg_stat_get_backend_idset() AS backendid;
       information for this phase.
 
+
+     waiting for writers to use auxiliary index
+
+      CREATE INDEX CONCURRENTLY or REINDEX CONCURRENTLY is waiting for
+      transactions holding write locks that can potentially see the table to
+      finish, so that all future transactions are guaranteed to insert new
+      tuples into the auxiliary index.
+      This phase is skipped when not in concurrent mode.
+      Columns lockers_total, lockers_done
+      and current_locker_pid contain the progress
+      information for this phase.
+
 
      building index
 
@@ -6354,13 +6366,12 @@ FROM pg_stat_get_backend_idset() AS backendid;
 
-     index validation: scanning table
+     index validation: merging indexes
 
-      CREATE INDEX CONCURRENTLY is scanning the table
-      to validate the index tuples collected in the previous two phases.
+      CREATE INDEX CONCURRENTLY is merging the content of the auxiliary
+      index into the target index.
       This phase is skipped when not in concurrent mode.
-      Columns blocks_total (set to the total size of the table)
-      and blocks_done contain the progress information for this phase.
+      Columns tuples_total (set to the number of tuples to be merged)
+      and tuples_done contain the progress information for this phase.
 
@@ -6377,8 +6388,9 @@ FROM pg_stat_get_backend_idset() AS backendid;
      waiting for readers before marking dead
 
-      REINDEX CONCURRENTLY is waiting for transactions
-      with read locks on the table to finish, before marking the old index dead.
+      CREATE INDEX CONCURRENTLY is waiting for transactions
+      with read locks on the table to finish, before marking the auxiliary index as dead.
+      REINDEX CONCURRENTLY is also waiting before marking the old index as dead.
       This phase is skipped when not in concurrent mode.
       Columns lockers_total, lockers_done
       and current_locker_pid contain the progress
diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 147a8f7587c7..e7a7a1607424 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -620,10 +620,10 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ]
    CONCURRENTLY option of CREATE INDEX.
When this option is used,
-   PostgreSQL must perform two scans of the table, and in
-   addition it must wait for all existing transactions that could potentially
-   modify or use the index to terminate.  Thus
-   this method requires more total work than a standard index build and takes
+   PostgreSQL must perform a table scan followed by a
+   validation phase, and in addition it must wait for all existing transactions
+   that could potentially modify or use the index to terminate.  Thus
+   this method requires more total work than a standard index build and may take
    significantly longer to complete.  However, since it allows normal
    operations to continue while the index is built, this method is useful for
    adding new indexes in a production environment.  Of course, the extra CPU
@@ -631,14 +631,14 @@
-    In a concurrent index build, the index is actually entered as an
-    invalid index into
-    the system catalogs in one transaction, then two table scans occur in
-    two more transactions.  Before each table scan, the index build must
+    In a concurrent index build, the main and auxiliary indexes are actually
+    entered as invalid indexes into
+    the system catalogs in one transaction; two more phases then occur in
+    separate transactions.  Before each phase, the index build must
     wait for existing transactions that have modified the table to terminate.
-    After the second scan, the index build must wait for any transactions
+    After the second phase, the index build must wait for any transactions
     that have a snapshot (see ) predating the second
-    scan to terminate, including transactions used by any phase of concurrent
+    phase to terminate, including transactions used by any phase of concurrent
     index builds on other tables, if the indexes involved are partial or have
     columns that are not simple column references.
     Then finally the index can be marked valid and ready for use,
@@ -651,10 +651,11 @@
     If a problem arises while scanning the table, such as a deadlock or a
     uniqueness violation in a unique index, the CREATE INDEX
-    command will fail but leave behind an invalid index.  This index
-    will be ignored for querying purposes because it might be incomplete;
-    however it will still consume update overhead.  The psql
-    \d command will report such an index as INVALID:
+    command will fail but leave behind an invalid index and its
+    associated auxiliary index.  These indexes
+    will be ignored for querying purposes because they might be incomplete;
+    however they will still consume update overhead.  The psql
+    \d command will report such indexes as INVALID:
 
 postgres=# \d tab
        Table "public.tab"
 Column |  Type   | Collation | Nullable | Default
--------+---------+-----------+----------+---------
 col    | integer |           |          |
 Indexes:
     "idx" btree (col) INVALID
+    "idx_ccaux" stir (col) INVALID
 
     The recommended recovery
-    method in such cases is to drop the index and try again to perform
-    CREATE INDEX CONCURRENTLY.  (Another possibility is
+    method in such cases is to drop these indexes and try again to perform
+    CREATE INDEX CONCURRENTLY.  (Another possibility is
     to rebuild the index with REINDEX INDEX CONCURRENTLY).
 
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index c40553971469..4ed3c969012f 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -368,9 +368,8 @@ REINDEX [ ( option [, ...] ) ] { DA
    PostgreSQL supports rebuilding indexes with minimum
    locking of writes.  This method is invoked by specifying the
    CONCURRENTLY option of REINDEX.
When this option - is used, PostgreSQL must perform two scans of the table - for each index that needs to be rebuilt and wait for termination of - all existing transactions that could potentially use the index. + is used, PostgreSQL must perform several steps to ensure data + consistency while allowing normal operations to continue. This method requires more total work than a standard index rebuild and takes significantly longer to complete as it needs to wait for unfinished transactions that might modify the index. However, since @@ -388,7 +387,7 @@ REINDEX [ ( option [, ...] ) ] { DA - A new transient index definition is added to the catalog + A new transient index definition and an auxiliary index are added to the catalog pg_index. This definition will be used to replace the old index. A SHARE UPDATE EXCLUSIVE lock at session level is taken on the indexes being reindexed as well as their @@ -398,7 +397,15 @@ REINDEX [ ( option [, ...] ) ] { DA - A first pass to build the index is done for each new index. Once the + The auxiliary index is marked as "ready for inserts", making + it visible to other sessions. This index efficiently tracks all new + tuples during the reindex process. + + + + + + The new main index is built by scanning the table. Once the index is built, its flag pg_index.indisready is switched to true to make it ready for inserts, making it visible to other sessions once the transaction that performed the build @@ -409,9 +416,9 @@ REINDEX [ ( option [, ...] ) ] { DA - Then a second pass is performed to add tuples that were added while the - first pass was running. This step is also done in a separate - transaction for each index. + A validation phase merges any missing entries from the auxiliary index + into the main index, ensuring all concurrent changes are captured. + This step is also done in a separate transaction for each index. @@ -428,7 +435,7 @@ REINDEX [ ( option [, ...] ) ] { DA - The old indexes have pg_index.indisready switched to + The old and auxiliary indexes have pg_index.indisready switched to false to prevent any new tuple insertions, after waiting for running queries that might reference the old index to complete. @@ -436,7 +443,7 @@ REINDEX [ ( option [, ...] ) ] { DA - The old indexes are dropped. The SHARE UPDATE + The old and auxiliary indexes are dropped. The SHARE UPDATE EXCLUSIVE session locks for the indexes and the table are released. @@ -447,11 +454,11 @@ REINDEX [ ( option [, ...] ) ] { DA If a problem arises while rebuilding the indexes, such as a uniqueness violation in a unique index, the REINDEX - command will fail but leave behind an invalid new index in addition to - the pre-existing one. This index will be ignored for querying purposes - because it might be incomplete; however it will still consume update + command will fail but leave behind an invalid new index and its auxiliary index in addition to + the pre-existing one. These indexes will be ignored for querying purposes + because they might be incomplete; however they will still consume update overhead. 
The psql \d command will report - such an index as INVALID: + such indexes as INVALID: postgres=# \d tab @@ -462,12 +469,14 @@ postgres=# \d tab Indexes: "idx" btree (col) "idx_ccnew" btree (col) INVALID + "idx_ccaux" stir (col) INVALID + If the index marked INVALID is suffixed - _ccnew, then it corresponds to the transient + _ccnew or _ccaux, then it corresponds to the transient or auxiliary index created during the concurrent operation, and the recommended - recovery method is to drop it using DROP INDEX, + recovery method is to drop these indexes using DROP INDEX, then attempt REINDEX CONCURRENTLY again. If the invalid index is instead suffixed _ccold, it corresponds to the original index which could not be dropped; diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT index 829dad1194ef..6f718feb6d52 100644 --- a/src/backend/access/heap/README.HOT +++ b/src/backend/access/heap/README.HOT @@ -375,6 +375,11 @@ constraint on which updates can be HOT. Other transactions must include such an index when determining HOT-safety of updates, even though they must ignore it for both insertion and searching purposes. +Also, special auxiliary index is created the same way. It marked as +"ready for inserts" without any actual table scan. Its purpose is collect +new tuples inserted into table while our target index is still "not ready +for inserts" + We must do this to avoid making incorrect index entries. For example, suppose we are building an index on column X and we make an index entry for a non-HOT tuple with X=1. Then some other backend, unaware that X is an @@ -394,10 +399,10 @@ As above, we point the index entry at the root of the HOT-update chain but we use the key value from the live tuple. We mark the index open for inserts (but still not ready for reads) then -we again wait for transactions which have the table open. Then we take -a second reference snapshot and validate the index. This searches for -tuples missing from the index, and inserts any missing ones. Again, -the index entries have to have TIDs equal to HOT-chain root TIDs, but +we again wait for transactions which have the table open. Then validate +the index. This searches for tuples missing from the index in auxiliary +index, and inserts any missing ones if them visible to reference snapshot. +Again, the index entries have to have TIDs equal to HOT-chain root TIDs, but the value to be inserted is the one from the live tuple. Then we wait until every transaction that could have a snapshot older than diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 58ffa4306e27..f592b09ec689 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -41,6 +41,7 @@ #include "storage/bufpage.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "storage/proc.h" #include "storage/procarray.h" #include "storage/smgr.h" #include "utils/builtins.h" @@ -1781,243 +1782,405 @@ heapam_index_build_range_scan(Relation heapRelation, return reltuples; } +/* + * Calculate set difference (relative complement) of main and aux + * sets. + * + * All records which are present in auxliary tuplesort but not is + * main are added to the store. + * + * In set theory notation store = aux - main or store = aux / main. 
+ *
+ * Returns the number of items added to the store.
+ */
+static int
+heapam_index_validate_tuplesort_difference(Tuplesortstate *main,
+										   Tuplesortstate *aux,
+										   Tuplestorestate *store)
+{
+	int			num = 0;
+
+	/* state variables for the merge */
+	ItemPointer indexcursor = NULL,
+				auxindexcursor = NULL;
+	ItemPointerData decoded,
+				auxdecoded;
+	bool		tuplesort_empty = false,
+				auxtuplesort_empty = false;
+
+	/* Initialize pointers. */
+	ItemPointerSetInvalid(&decoded);
+	ItemPointerSetInvalid(&auxdecoded);
+
+	/*
+	 * Main loop: we step through the auxiliary sort (aux), which holds TIDs
+	 * that must be compared to those from the main sort (main).
+	 */
+	while (!auxtuplesort_empty)
+	{
+		Datum		ts_val;
+		bool		ts_isnull;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * Attempt to fetch the next TID from the auxiliary sort.  If it's
+		 * empty, we set auxindexcursor to NULL.
+		 */
+		auxtuplesort_empty = !tuplesort_getdatum(aux, true,
+												 false, &ts_val, &ts_isnull,
+												 NULL);
+		Assert(auxtuplesort_empty || !ts_isnull);
+		if (!auxtuplesort_empty)
+		{
+			itemptr_decode(&auxdecoded, DatumGetInt64(ts_val));
+			auxindexcursor = &auxdecoded;
+		}
+		else
+		{
+			auxindexcursor = NULL;
+		}
+
+		/*
+		 * If the auxiliary sort is not yet empty, we now try to synchronize
+		 * the main sort cursor (indexcursor) with auxindexcursor.  We
+		 * advance the main sort cursor until we've reached or passed the
+		 * auxiliary TID.
+		 */
+		if (!auxtuplesort_empty)
+		{
+			/*
+			 * Move the main sort forward while:
+			 * (1) It's not exhausted (tuplesort_empty == false), and
+			 * (2) Either indexcursor is NULL (first iteration) or
+			 *     indexcursor < auxindexcursor in TID order.
+			 */
+			while (!tuplesort_empty && (indexcursor == NULL ||	/* null on first time here */
+										ItemPointerCompare(indexcursor, auxindexcursor) < 0))
+			{
+				/*
+				 * Get the next TID from the main sort.  If it's empty, we
+				 * set indexcursor to NULL.
+				 */
+				tuplesort_empty = !tuplesort_getdatum(main, true,
+													  false, &ts_val, &ts_isnull,
+													  NULL);
+				Assert(tuplesort_empty || !ts_isnull);
+
+				if (!tuplesort_empty)
+				{
+					itemptr_decode(&decoded, DatumGetInt64(ts_val));
+					indexcursor = &decoded;
+				}
+				else
+				{
+					indexcursor = NULL;
+				}
+
+				CHECK_FOR_INTERRUPTS();
+			}
+
+			/*
+			 * Now, if either:
+			 * - the main sort is empty, or
+			 * - indexcursor > auxindexcursor,
+			 *
+			 * then auxindexcursor identifies a TID that doesn't appear in
+			 * the main sort.  We likely need to insert it into the target
+			 * index if it's visible in the heap.
+			 */
+			if (tuplesort_empty || ItemPointerCompare(indexcursor, auxindexcursor) > 0)
+			{
+				tuplestore_putdatum(store, Int64GetDatum(itemptr_encode(auxindexcursor)));
+				num++;
+			}
+		}
+	}
+
+	return num;
+}
+
+typedef struct ValidateIndexScanState
+{
+	Tuplestorestate *store;
+	BlockNumber prev_block_number;
+	OffsetNumber prev_off_offset_number;
+} ValidateIndexScanState;
+
+/*
+ * This is a ReadStreamBlockNumberCB implementation which works as follows:
+ *
+ * 1) It iterates over a sorted tuplestore, where each element is an encoded
+ *    ItemPointer.
+ *
+ * 2) It returns the current BlockNumber and collects all OffsetNumbers for
+ *    that block in per_buffer_data.
+ *
+ * 3) Once the code encounters a new BlockNumber, it stops reading more
+ *    offsets and saves the OffsetNumber of the new block for the next call.
+ *
+ * 4) The list of offsets for a block is always terminated with
+ *    InvalidOffsetNumber.
+ *
+ * This function is intended to be called repeatedly, each time returning the
+ * next block and its corresponding set of offsets.
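+ *
+ * For example, if the store holds the encoded TIDs (2,1), (2,5) and (7,3),
+ * the first call returns block 2 with per_buffer_data = {1, 5,
+ * InvalidOffsetNumber} and remembers (7,3) as leftover; the second call
+ * returns block 7 with {3, InvalidOffsetNumber}; once the store is
+ * exhausted, InvalidBlockNumber is returned.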
+ */
+static BlockNumber
+heapam_index_validate_scan_read_stream_next(ReadStream *stream,
+											void *void_callback_private_data,
+											void *void_per_buffer_data)
+{
+	bool		should_free;
+	Datum		datum;
+	BlockNumber result = InvalidBlockNumber;
+	int			i = 0;
+
+	/*
+	 * Retrieve the specialized callback state and the output buffer.
+	 * callback_private_data keeps track of the previous block and offset
+	 * from a prior invocation, if any.
+	 */
+	ValidateIndexScanState *callback_private_data = void_callback_private_data;
+	OffsetNumber *per_buffer_data = void_per_buffer_data;
+
+	/*
+	 * If there is a "leftover" offset number from the previous invocation,
+	 * it means we had switched to a new block in the middle of the last
+	 * call.  We place that leftover offset number into the buffer first.
+	 */
+	if (callback_private_data->prev_off_offset_number != InvalidOffsetNumber)
+	{
+		Assert(callback_private_data->prev_block_number != InvalidBlockNumber);
+
+		/*
+		 * 'result' is the block number to return.  We set it to the block
+		 * from the previous leftover offset.
+		 */
+		result = callback_private_data->prev_block_number;
+
+		/* Place the leftover offset number in the output buffer. */
+		per_buffer_data[i++] = callback_private_data->prev_off_offset_number;
+
+		/*
+		 * Clear the leftover offset number so it won't be reused unless we
+		 * encounter another block change.
+		 */
+		callback_private_data->prev_off_offset_number = InvalidOffsetNumber;
+	}
+
+	/*
+	 * Read from the tuplestore until we either run out of tuples or we
+	 * encounter a block change.  For each tuple:
+	 *
+	 * 1) Decode its block/offset from the Datum.
+	 * 2) If it's the first time in this call (prev_block_number ==
+	 *    InvalidBlockNumber), initialize prev_block_number.
+	 * 3) If the block number matches the current block, collect the offset.
+	 * 4) If the block number differs, save that offset as leftover and break
+	 *    so that the next call can handle the new block.
+	 */
+	while (tuplestore_getdatum(callback_private_data->store, true, &should_free, &datum))
+	{
+		BlockNumber next_block_number;
+		ItemPointerData next_data;
+
+		/* Decode the datum into an ItemPointer (block + offset). */
+		itemptr_decode(&next_data, DatumGetInt64(datum));
+		next_block_number = ItemPointerGetBlockNumber(&next_data);
+
+		/*
+		 * If we haven't set a block number yet this round, initialize it
+		 * using the first tuple we read.
+		 */
+		if (callback_private_data->prev_block_number == InvalidBlockNumber)
+			callback_private_data->prev_block_number = next_block_number;
+
+		/*
+		 * Always set the result to be the "current" block number we are
+		 * filling offsets for.
+		 */
+		result = callback_private_data->prev_block_number;
+
+		/*
+		 * If this tuple is from the same block, just store its offset in our
+		 * per_buffer_data array.
+		 */
+		if (next_block_number == callback_private_data->prev_block_number)
+		{
+			per_buffer_data[i++] = ItemPointerGetOffsetNumber(&next_data);
+
+			/* Free the datum if needed. */
+			if (should_free)
+				pfree(DatumGetPointer(datum));
+		}
+		else
+		{
+			/*
+			 * The block just changed: store the offset of the new block as
+			 * leftover for the next invocation and break out.
+			 */
+			callback_private_data->prev_block_number = next_block_number;
+			callback_private_data->prev_off_offset_number = ItemPointerGetOffsetNumber(&next_data);
+
+			/* Free the datum if needed. */
+			if (should_free)
+				pfree(DatumGetPointer(datum));
+
+			/* Break to let the next call handle the new block.
*/ + break; + } + } + + /* + * Terminate the list of offsets for this block with an InvalidOffsetNumber. + */ + per_buffer_data[i] = InvalidOffsetNumber; + return result; +} + static void heapam_index_validate_scan(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo, Snapshot snapshot, - ValidateIndexState *state) + ValidateIndexState *state, + ValidateIndexState *auxState) { - TableScanDesc scan; - HeapScanDesc hscan; - HeapTuple heapTuple; Datum values[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; - ExprState *predicate; - TupleTableSlot *slot; - EState *estate; - ExprContext *econtext; - BlockNumber root_blkno = InvalidBlockNumber; - OffsetNumber root_offsets[MaxHeapTuplesPerPage]; - bool in_index[MaxHeapTuplesPerPage]; - BlockNumber previous_blkno = InvalidBlockNumber; - /* state variables for the merge */ - ItemPointer indexcursor = NULL; - ItemPointerData decoded; - bool tuplesort_empty = false; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); + + int num_to_check; + Tuplestorestate *tuples_for_check; + ValidateIndexScanState callback_private_data; + + Buffer buf; + OffsetNumber* tuples; + ReadStream *read_stream; + + /* Use 10% of memory for tuple store. */ + int store_work_mem_part = maintenance_work_mem / 10; /* - * sanity checks + * Encode TIDs as int8 values for the sort, rather than directly sorting + * item pointers. This can be significantly faster, primarily because TID + * is a pass-by-reference type on all platforms, whereas int8 is + * pass-by-value on most platforms. */ - Assert(OidIsValid(indexRelation->rd_rel->relam)); + tuples_for_check = tuplestore_begin_datum(INT8OID, false, false, store_work_mem_part); /* - * Need an EState for evaluation of index expressions and partial-index - * predicates. Also a slot to hold the current tuple. + * sanity checks */ + Assert(OidIsValid(indexRelation->rd_rel->relam)); + + num_to_check = heapam_index_validate_tuplesort_difference(state->tuplesort, + auxState->tuplesort, + tuples_for_check); + + /* It is our responsibility to close tuple sort as fast as we can */ + tuplesort_end(state->tuplesort); + tuplesort_end(auxState->tuplesort); + + state->tuplesort = auxState->tuplesort = NULL; + estate = CreateExecutorState(); econtext = GetPerTupleExprContext(estate); slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation), - &TTSOpsHeapTuple); + &TTSOpsBufferHeapTuple); /* Arrange for econtext's scan tuple to be the tuple under test */ econtext->ecxt_scantuple = slot; - /* Set up execution state for predicate, if any. */ - predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + callback_private_data.prev_block_number = InvalidBlockNumber; + callback_private_data.store = tuples_for_check; + callback_private_data.prev_off_offset_number = InvalidOffsetNumber; - /* - * Prepare for scan of the base relation. We need just those tuples - * satisfying the passed-in reference snapshot. We must disable syncscan - * here, because it's critical that we read from block zero forward to - * match the sorted TIDs. 
- */ - scan = table_beginscan_strat(heapRelation, /* relation */ - snapshot, /* snapshot */ - 0, /* number of keys */ - NULL, /* scan key */ - true, /* buffer access strategy OK */ - false, /* syncscan not OK */ - false); - hscan = (HeapScanDesc) scan; + read_stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE | READ_STREAM_USE_BATCHING, + bstrategy, + heapRelation, MAIN_FORKNUM, + heapam_index_validate_scan_read_stream_next, + &callback_private_data, + (MaxHeapTuplesPerPage + 1) * sizeof(OffsetNumber)); - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, - hscan->rs_nblocks); + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL, num_to_check); + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, 0); - /* - * Scan all tuples matching the snapshot. - */ - while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + while ((buf = read_stream_next_buffer(read_stream, (void*) &tuples)) != InvalidBuffer) { - ItemPointer heapcursor = &heapTuple->t_self; - ItemPointerData rootTuple; - OffsetNumber root_offnum; + HeapTupleData heap_tuple_data[MaxHeapTuplesPerPage]; + int i; + OffsetNumber off; + BlockNumber block_number; CHECK_FOR_INTERRUPTS(); - state->htups += 1; + LockBuffer(buf, BUFFER_LOCK_SHARE); + block_number = BufferGetBlockNumber(buf); - if ((previous_blkno == InvalidBlockNumber) || - (hscan->rs_cblock != previous_blkno)) + i = 0; + while ((off = tuples[i]) != InvalidOffsetNumber) { - pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE, - hscan->rs_cblock); - previous_blkno = hscan->rs_cblock; + ItemPointerData tid; + bool all_dead, found; + ItemPointerSet(&tid, block_number, off); + + found = heap_hot_search_buffer(&tid, heapRelation, buf, snapshot, + &heap_tuple_data[i], &all_dead, true); + if (!found) + ItemPointerSetInvalid(&heap_tuple_data[i].t_self); + i++; } + LockBuffer(buf, BUFFER_LOCK_UNLOCK); - /* - * As commented in table_index_build_scan, we should index heap-only - * tuples under the TIDs of their root tuples; so when we advance onto - * a new heap page, build a map of root item offsets on the page. - * - * This complicates merging against the tuplesort output: we will - * visit the live tuples in order by their offsets, but the root - * offsets that we need to compare against the index contents might be - * ordered differently. So we might have to "look back" within the - * tuplesort output, but only within the current page. We handle that - * by keeping a bool array in_index[] showing all the - * already-passed-over tuplesort output TIDs of the current page. We - * clear that array here, when advancing onto a new heap page. 
- */ - if (hscan->rs_cblock != root_blkno) + i = 0; + while ((off = tuples[i]) != InvalidOffsetNumber) { - Page page = BufferGetPage(hscan->rs_cbuf); - - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE); - heap_get_root_tuples(page, root_offsets); - LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK); - - memset(in_index, 0, sizeof(in_index)); - - root_blkno = hscan->rs_cblock; - } - - /* Convert actual tuple TID to root TID */ - rootTuple = *heapcursor; - root_offnum = ItemPointerGetOffsetNumber(heapcursor); + if (ItemPointerIsValid(&heap_tuple_data[i].t_self)) + { + ItemPointerData root_tid; + ItemPointerSet(&root_tid, block_number, off); - if (HeapTupleIsHeapOnly(heapTuple)) - { - root_offnum = root_offsets[root_offnum - 1]; - if (!OffsetNumberIsValid(root_offnum)) - ereport(ERROR, - (errcode(ERRCODE_DATA_CORRUPTED), - errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"", - ItemPointerGetBlockNumber(heapcursor), - ItemPointerGetOffsetNumber(heapcursor), - RelationGetRelationName(heapRelation)))); - ItemPointerSetOffsetNumber(&rootTuple, root_offnum); - } + /* Reset the per-tuple memory context for the next fetch. */ + MemoryContextReset(econtext->ecxt_per_tuple_memory); + ExecStoreBufferHeapTuple(&heap_tuple_data[i], slot, buf); - /* - * "merge" by skipping through the index tuples until we find or pass - * the current root tuple. - */ - while (!tuplesort_empty && - (!indexcursor || - ItemPointerCompare(indexcursor, &rootTuple) < 0)) - { - Datum ts_val; - bool ts_isnull; + /* Compute the key values and null flags for this tuple. */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); - if (indexcursor) - { /* - * Remember index items seen earlier on the current heap page + * Insert the tuple into the target index. */ - if (ItemPointerGetBlockNumber(indexcursor) == root_blkno) - in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true; + index_insert(indexRelation, + values, + isnull, + &root_tid, /* insert root tuple */ + heapRelation, + indexInfo->ii_Unique ? + UNIQUE_CHECK_YES : UNIQUE_CHECK_NO, + false, + indexInfo); } - tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true, - false, &ts_val, &ts_isnull, - NULL); - Assert(tuplesort_empty || !ts_isnull); - if (!tuplesort_empty) - { - itemptr_decode(&decoded, DatumGetInt64(ts_val)); - indexcursor = &decoded; - } - else - { - /* Be tidy */ - indexcursor = NULL; - } + state->htups += 1; + pgstat_progress_incr_param(PROGRESS_CREATEIDX_TUPLES_DONE, 1); + i++; } - /* - * If the tuplesort has overshot *and* we didn't see a match earlier, - * then this tuple is missing from the index, so insert it. - */ - if ((tuplesort_empty || - ItemPointerCompare(indexcursor, &rootTuple) > 0) && - !in_index[root_offnum - 1]) - { - MemoryContextReset(econtext->ecxt_per_tuple_memory); - - /* Set up for predicate or expression evaluation */ - ExecStoreHeapTuple(heapTuple, slot, false); - - /* - * In a partial index, discard tuples that don't satisfy the - * predicate. - */ - if (predicate != NULL) - { - if (!ExecQual(predicate, econtext)) - continue; - } - - /* - * For the current heap tuple, extract all the attributes we use - * in this index, and note which are null. This also performs - * evaluation of any expressions needed. - */ - FormIndexDatum(indexInfo, - slot, - estate, - values, - isnull); - - /* - * You'd think we should go ahead and build the index tuple here, - * but some index AMs want to do further processing on the data - * first. So pass the values[] and isnull[] arrays, instead. 
-			 */
-
-			/*
-			 * If the tuple is already committed dead, you might think we
-			 * could suppress uniqueness checking, but this is no longer true
-			 * in the presence of HOT, because the insert is actually a proxy
-			 * for a uniqueness check on the whole HOT-chain.  That is, the
-			 * tuple we have here could be dead because it was already
-			 * HOT-updated, and if so the updating transaction will not have
-			 * thought it should insert index entries.  The index AM will
-			 * check the whole HOT-chain and correctly detect a conflict if
-			 * there is one.
-			 */
-
-			index_insert(indexRelation,
-						 values,
-						 isnull,
-						 &rootTuple,
-						 heapRelation,
-						 indexInfo->ii_Unique ?
-						 UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
-						 false,
-						 indexInfo);
-
-			state->tups_inserted += 1;
-		}
+		ReleaseBuffer(buf);
 	}
 
-	table_endscan(scan);
-
 	ExecDropSingleTupleTableSlot(slot);
 
 	FreeExecutorState(estate);
 
+	read_stream_end(read_stream);
+	tuplestore_end(tuples_for_check);
+
 	/* These may have been pointing to the now-gone estate */
 	indexInfo->ii_ExpressionsState = NIL;
 	indexInfo->ii_PredicateState = NULL;
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index e9e22ec0e84b..6c09c6a2b676 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -715,11 +715,16 @@ UpdateIndexRelation(Oid indexoid,
 *		already exists.
 * INDEX_CREATE_PARTITIONED:
 *		create a partitioned index (table must be partitioned)
+ * INDEX_CREATE_AUXILIARY:
+ *		mark the index as an auxiliary index
 * constr_flags: flags passed to index_constraint_create
 *		(only if INDEX_CREATE_ADD_CONSTRAINT is set)
 * allow_system_table_mods: allow table to be a system catalog
 * is_internal: if true, post creation hook for new index
 * constraintId: if not NULL, receives OID of created constraint
+ * relpersistence: persistence level to use for the index.  In most cases it
+ *		should be equal to the persistence level of the table; auxiliary
+ *		indexes are the only exception here.
 *
 * Returns the OID of the created index.
 */
@@ -744,7 +749,8 @@ index_create(Relation heapRelation,
			 bits16 constr_flags,
			 bool allow_system_table_mods,
			 bool is_internal,
-			 Oid *constraintId)
+			 Oid *constraintId,
+			 char relpersistence)
 {
 	Oid			heapRelationId = RelationGetRelid(heapRelation);
 	Relation	pg_class;
@@ -755,11 +761,11 @@ index_create(Relation heapRelation,
 	bool		is_exclusion;
 	Oid			namespaceId;
 	int			i;
-	char		relpersistence;
 	bool		isprimary = (flags & INDEX_CREATE_IS_PRIMARY) != 0;
 	bool		invalid = (flags & INDEX_CREATE_INVALID) != 0;
 	bool		concurrent = (flags & INDEX_CREATE_CONCURRENT) != 0;
 	bool		partitioned = (flags & INDEX_CREATE_PARTITIONED) != 0;
+	bool		auxiliary = (flags & INDEX_CREATE_AUXILIARY) != 0;
 	char		relkind;
 	TransactionId relfrozenxid;
 	MultiXactId relminmxid;
@@ -785,7 +791,6 @@ index_create(Relation heapRelation,
 	namespaceId = RelationGetNamespace(heapRelation);
 	shared_relation = heapRelation->rd_rel->relisshared;
 	mapped_relation = RelationIsMapped(heapRelation);
-	relpersistence = heapRelation->rd_rel->relpersistence;
 
 	/*
 	 * check parameters
@@ -793,6 +798,11 @@ index_create(Relation heapRelation,
 	if (indexInfo->ii_NumIndexAttrs < 1)
 		elog(ERROR, "must index at least one column");
 
+	if (indexInfo->ii_Am == STIR_AM_OID && !auxiliary)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("user-defined indexes with STIR access method are not supported")));
+
 	if (!allow_system_table_mods &&
 		IsSystemRelation(heapRelation) &&
 		IsNormalProcessingMode())
@@ -1398,7 +1408,8 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
							  false,	/* not ready for inserts */
							  true,
							  indexRelation->rd_indam->amsummarizing,
-							  oldInfo->ii_WithoutOverlaps);
+							  oldInfo->ii_WithoutOverlaps,
+							  false);
 
 	/*
	 * Extract the list of column names and the column numbers for the new
@@ -1463,7 +1474,8 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
							  0,
							  true, /* allow table to be a system catalog? */
							  false,	/* is_internal? */
-							  NULL);
+							  NULL,
+							  heapRelation->rd_rel->relpersistence);
 
 	/* Close the relations used and clean up */
 	index_close(indexRelation, NoLock);
@@ -1473,6 +1485,155 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
 	return newIndexId;
 }
 
+/*
+ * index_concurrently_create_aux
+ *
+ * Concurrently create an auxiliary index based on the definition of the one
+ * provided by the caller.  The index is inserted into the catalogs and needs
+ * to be built later on.  This is called during concurrent index creation and
+ * concurrent reindex processing.
+ *
+ * "tablespaceOid" is the tablespace to use for this index.
+ */
+Oid
+index_concurrently_create_aux(Relation heapRelation, Oid mainIndexId,
+							  Oid tablespaceOid, const char *newName)
+{
+	Relation	indexRelation;
+	IndexInfo  *oldInfo,
+			   *newInfo;
+	Oid			newIndexId = InvalidOid;
+	HeapTuple	indexTuple;
+
+	List	   *indexColNames = NIL;
+	List	   *indexExprs = NIL;
+	List	   *indexPreds = NIL;
+
+	Oid		   *auxOpclassIds;
+	int16	   *auxColoptions;
+
+	indexRelation = index_open(mainIndexId, RowExclusiveLock);
+
+	/* The new index needs some information from the old index */
+	oldInfo = BuildIndexInfo(indexRelation);
+
+	/*
+	 * Building an auxiliary index for an index with exclusion constraints
+	 * is not supported.
+	 */
+	if (oldInfo->ii_ExclusionOps != NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("auxiliary index creation for exclusion constraints is not supported")));
+
+	/* Fetch the pg_index tuple of the main index */
+	indexTuple = SearchSysCache1(INDEXRELID, ObjectIdGetDatum(mainIndexId));
+	if (!HeapTupleIsValid(indexTuple))
+		elog(ERROR, "cache lookup failed for index %u", mainIndexId);
+
+	/*
+	 * Fetch the list of expressions and predicates directly from the
+	 * catalogs.  We cannot rely on the information from the IndexInfo of the
+	 * old index, as it has been flattened for the planner.
+	 */
+	if (oldInfo->ii_Expressions != NIL)
+	{
+		Datum		exprDatum;
+		char	   *exprString;
+
+		exprDatum = SysCacheGetAttrNotNull(INDEXRELID, indexTuple,
+										   Anum_pg_index_indexprs);
+		exprString = TextDatumGetCString(exprDatum);
+		indexExprs = (List *) stringToNode(exprString);
+		pfree(exprString);
+	}
+	if (oldInfo->ii_Predicate != NIL)
+	{
+		Datum		predDatum;
+		char	   *predString;
+
+		predDatum = SysCacheGetAttrNotNull(INDEXRELID, indexTuple,
+										   Anum_pg_index_indpred);
+		predString = TextDatumGetCString(predDatum);
+		indexPreds = (List *) stringToNode(predString);
+
+		/* Also convert to implicit-AND format */
+		indexPreds = make_ands_implicit((Expr *) indexPreds);
+		pfree(predString);
+	}
+
+	/*
+	 * Build the index information for the new index.  Note that rebuild of
+	 * indexes with exclusion constraints is not supported, hence there is no
+	 * need to fill all the ii_Exclusion* fields.
+	 */
+	newInfo = makeIndexInfo(oldInfo->ii_NumIndexAttrs,
+							oldInfo->ii_NumIndexKeyAttrs,
+							STIR_AM_OID,	/* special AM for aux indexes */
+							indexExprs,
+							indexPreds,
+							false,	/* aux indexes are not unique */
+							oldInfo->ii_NullsNotDistinct,
+							false,	/* not ready for inserts */
+							true,
+							false,	/* aux indexes are not summarizing */
+							false,	/* aux indexes are not without overlaps */
+							true /* auxiliary */);
+
+	/*
+	 * Extract the list of column names and the column numbers for the new
+	 * index information.  All this information will be used for the index
+	 * creation.
+	 */
+	for (int i = 0; i < oldInfo->ii_NumIndexAttrs; i++)
+	{
+		TupleDesc	indexTupDesc = RelationGetDescr(indexRelation);
+		Form_pg_attribute att = TupleDescAttr(indexTupDesc, i);
+
+		indexColNames = lappend(indexColNames, NameStr(att->attname));
+		newInfo->ii_IndexAttrNumbers[i] = oldInfo->ii_IndexAttrNumbers[i];
+	}
+
+	auxOpclassIds = palloc0(sizeof(Oid) * newInfo->ii_NumIndexAttrs);
+	auxColoptions = palloc0(sizeof(int16) * newInfo->ii_NumIndexAttrs);
+
+	/* Fill with "any ops" */
+	for (int i = 0; i < newInfo->ii_NumIndexAttrs; i++)
+	{
+		auxOpclassIds[i] = ANY_STIR_OPS_OID;
+		auxColoptions[i] = 0;
+	}
+
+	newIndexId = index_create(heapRelation,
+							  newName,
+							  InvalidOid,	/* indexRelationId */
+							  InvalidOid,	/* parentIndexRelid */
+							  InvalidOid,	/* parentConstraintId */
+							  InvalidRelFileNumber, /* relFileNumber */
+							  newInfo,
+							  indexColNames,
+							  STIR_AM_OID,
+							  tablespaceOid,
+							  indexRelation->rd_indcollation,
+							  auxOpclassIds,
+							  NULL,
+							  auxColoptions,
+							  NULL,
+							  (Datum) 0,
+							  INDEX_CREATE_SKIP_BUILD | INDEX_CREATE_CONCURRENT | INDEX_CREATE_AUXILIARY,
+							  0,
+							  true, /* allow table to be a system catalog? */
+							  false,	/* is_internal? */
+							  NULL,
+							  RELPERSISTENCE_UNLOGGED); /* aux indexes are unlogged */
+
+	/* Close the relations used and clean up */
+	index_close(indexRelation, NoLock);
+	ReleaseSysCache(indexTuple);
+
+	return newIndexId;
+}
+
 /*
 * index_concurrently_build
 *
@@ -2469,7 +2630,8 @@ BuildIndexInfo(Relation index)
					   indexStruct->indisready,
					   false,
					   index->rd_indam->amsummarizing,
-					   indexStruct->indisexclusion && indexStruct->indisunique);
+					   indexStruct->indisexclusion && indexStruct->indisunique,
+					   index->rd_rel->relam == STIR_AM_OID /* auxiliary iff STIR */);
 
 	/* fill in attribute numbers */
 	for (i = 0; i < numAtts; i++)
@@ -2529,7 +2691,8 @@ BuildDummyIndexInfo(Relation index)
					   indexStruct->indisready,
					   false,
					   index->rd_indam->amsummarizing,
-					   indexStruct->indisexclusion && indexStruct->indisunique);
+					   indexStruct->indisexclusion && indexStruct->indisunique,
+					   index->rd_rel->relam == STIR_AM_OID /* auxiliary iff STIR */);
 
 	/* fill in attribute numbers */
 	for (i = 0; i < numAtts; i++)
@@ -3306,12 +3469,21 @@ IndexCheckExclusion(Relation heapRelation,
 *
 * We do a concurrent index build by first inserting the catalog entry for the
 * index via index_create(), marking it not indisready and not indisvalid.
+ * Then we create a special auxiliary index in the same way; it is based on
+ * the STIR AM.
 * Then we commit our transaction and start a new one, then we wait for all
 * transactions that could have been modifying the table to terminate.  Now
- * we know that any subsequently-started transactions will see the index and
+ * we know that any subsequently-started transactions will see each index and
 * honor its constraints on HOT updates; so while existing HOT-chains might
 * be broken with respect to the index, no currently live tuple will have an
- * incompatible HOT update done to it.  We now build the index normally via
+ * incompatible HOT update done to it.
+ *
+ * Next we build the auxiliary index.  This is a fast operation without any
+ * actual table scan; the result is an empty STIR index.  We commit the
+ * transaction and again wait for all transactions that could have been
+ * modifying the table to terminate.  From that moment on, all new tuples
+ * will be inserted into the auxiliary index.
+ *
+ * We now build the index normally via
 * index_build(), while holding a weak lock that allows concurrent
 * insert/update/delete.  Also, we index only tuples that are valid
 * as of the start of the scan (see table_index_build_scan), whereas a normal
@@ -3321,18 +3493,21 @@ IndexCheckExclusion(Relation heapRelation,
 * bogus unique-index failures due to concurrent UPDATEs (we might see
 * different versions of the same row as being valid when we pass over them,
 * if we used HeapTupleSatisfiesVacuum).  This leaves us with an index that
- * does not contain any tuples added to the table while we built the index.
+ * does not contain any tuples added to the table while we built the index
+ * (but those tuples are contained in the auxiliary index).
 *
 * Furthermore, we set SO_RESET_SNAPSHOT for the scan, which causes new
 * snapshot to be set as active every so often.  The reason for that is to
 * propagate the xmin horizon forward.
 *
 * Next, we mark the index "indisready" (but still not "indisvalid") and
- * commit the second transaction and start a third.  Again we wait for all
+ * commit the third transaction and start a fourth.  Again we wait for all
 * transactions that could have been modifying the table to terminate.  Now
 * we know that any subsequently-started transactions will see the index and
- * insert their new tuples into it.  We then take a new reference snapshot
- * which is passed to validate_index().  Any tuples that are valid according
+ * insert their new tuples into it.  At the same moment we clear "indisready"
+ * for the auxiliary index, since it no longer needs to be updated.
+ *
+ * We then take a new reference snapshot; any tuples that are valid according
 * to this snap, but are not in the index, must be added to the index.
 * (Any tuples committed live after the snap will be inserted into the
 * index by their originating transaction.  Any tuples committed dead before
@@ -3340,12 +3515,14 @@ IndexCheckExclusion(Relation heapRelation,
 * that might care about them before we mark the index valid.)
 *
 * validate_index() works by first gathering all the TIDs currently in the
- * index, using a bulkdelete callback that just stores the TIDs and doesn't
+ * indexes, using a bulkdelete callback that just stores the TIDs and doesn't
 * ever say "delete it".  (This should be faster than a plain indexscan;
 * also, not all index AMs support full-index indexscan.)  Then we sort the
- * TIDs, and finally scan the table doing a "merge join" against the TID list
- * to see which tuples are missing from the index.  Thus we will ensure that
- * all tuples valid according to the reference snapshot are in the index.
+ * TIDs of both the auxiliary and the target index, and do a "merge join" of
+ * the TID lists to see which tuples from the auxiliary index are missing from
+ * the target index.  Thus we will ensure that all tuples valid according to
+ * the reference snapshot are in the index.  Note that we must perform the
+ * bulkdeletes in a particular order: auxiliary first, target last.
 *
 * Building a unique index this way is tricky: we might try to insert a
 * tuple that is already dead or is in process of being deleted, and we
@@ -3363,22 +3540,26 @@ IndexCheckExclusion(Relation heapRelation,
 * not index).  Then we mark the index "indisvalid" and commit.  Subsequent
 * transactions will be able to use it for queries.
 *
- * Doing two full table scans is a brute-force strategy.  We could try to be
- * cleverer, eg storing new tuples in a special area of the table (perhaps
- * making the table append-only by setting use_fsm).  However that would
- * add yet more locking issues.
+ * Finally, the auxiliary index is dropped concurrently.
 */
 void
-validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
+validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot)
 {
 	Relation	heapRelation,
-				indexRelation;
+				indexRelation,
+				auxIndexRelation;
 	IndexInfo  *indexInfo;
-	IndexVacuumInfo ivinfo;
-	ValidateIndexState state;
+	IndexVacuumInfo ivinfo, auxivinfo;
+	ValidateIndexState state, auxState;
 	Oid			save_userid;
 	int			save_sec_context;
 	int			save_nestlevel;
+
+	/*
+	 * Use 80% of maintenance_work_mem for sorting the target index TIDs and
+	 * 10% for the auxiliary index.
+	 *
+	 * The remaining 10% is used later for the tuplestore.
+	 */
+	int64		main_work_mem_part = (int64) maintenance_work_mem * 8 / 10;
+	int			aux_work_mem_part = maintenance_work_mem / 10;
 
 	{
 		const int	progress_index[] = {
@@ -3411,6 +3592,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	RestrictSearchPath();
 
 	indexRelation = index_open(indexId, RowExclusiveLock);
+	auxIndexRelation = index_open(auxIndexId, RowExclusiveLock);
 
 	/*
	 * Fetch info needed for index_insert.  (You might think this should be
@@ -3435,15 +3617,30 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 	ivinfo.strategy = NULL;
 	ivinfo.validate_index = true;
 
+	/*
+	 * Copy all info to the auxiliary info, changing only the index relation.
+	 */
+	auxivinfo = ivinfo;
+	auxivinfo.index = auxIndexRelation;
+
 	/*
	 * Encode TIDs as int8 values for the sort, rather than directly sorting
	 * item pointers.  This can be significantly faster, primarily because TID
	 * is a pass-by-reference type on all platforms, whereas int8 is
	 * pass-by-value on most platforms.
 	 */
+	auxState.tuplesort = tuplesort_begin_datum(INT8OID, Int8LessOperator,
+											   InvalidOid, false,
+											   aux_work_mem_part,
+											   NULL, TUPLESORT_NONE);
+	auxState.htups = auxState.itups = auxState.tups_inserted = 0;
+
+	(void) index_bulk_delete(&auxivinfo, NULL,
+							 validate_index_callback, &auxState);
+
 	state.tuplesort = tuplesort_begin_datum(INT8OID, Int8LessOperator,
 											InvalidOid, false,
-											maintenance_work_mem,
+											(int) main_work_mem_part,
 											NULL, TUPLESORT_NONE);
 	state.htups = state.itups = state.tups_inserted = 0;
 
@@ -3466,27 +3663,30 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot)
 		pgstat_progress_update_multi_param(3, progress_index, progress_vals);
 	}
 	tuplesort_performsort(state.tuplesort);
+	tuplesort_performsort(auxState.tuplesort);
 
 	/*
-	 * Now scan the heap and "merge" it with the index
+	 * Now merge the auxiliary index with the target index
 	 */
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
-								 PROGRESS_CREATEIDX_PHASE_VALIDATE_TABLESCAN);
+								 PROGRESS_CREATEIDX_PHASE_VALIDATE_IDXMERGE);
 	table_index_validate_scan(heapRelation,
							  indexRelation,
							  indexInfo,
							  snapshot,
-							  &state);
+							  &state,
+							  &auxState);
 
-	/* Done with tuplesort object */
-	tuplesort_end(state.tuplesort);
+	/* The tuplesorts were closed by table_index_validate_scan */
+	Assert(state.tuplesort == NULL && auxState.tuplesort == NULL);
 
 	/* Make sure to release resources cached in indexInfo (if needed).
*/ index_insert_cleanup(indexRelation, indexInfo); elog(DEBUG2, - "validate_index found %.0f heap tuples, %.0f index tuples; inserted %.0f missing tuples", - state.htups, state.itups, state.tups_inserted); + "validate_index fetched %.0f heap tuples, %.0f index tuples;" + " %.0f aux index tuples; inserted %.0f missing tuples", + state.htups, state.itups, auxState.itups, state.tups_inserted); /* Roll back any GUC changes executed by index functions */ AtEOXact_GUC(false, save_nestlevel); @@ -3495,6 +3695,7 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot) SetUserIdAndSecContext(save_userid, save_sec_context); /* Close rels, but keep locks */ + index_close(auxIndexRelation, NoLock); index_close(indexRelation, NoLock); table_close(heapRelation, NoLock); } @@ -3555,6 +3756,11 @@ index_set_state_flags(Oid indexId, IndexStateFlagsAction action) Assert(!indexForm->indisvalid); indexForm->indisvalid = true; break; + case INDEX_DROP_CLEAR_READY: + /* Clear indisready during a CREATE INDEX CONCURRENTLY sequence */ + Assert(!indexForm->indisvalid); + indexForm->indisready = false; + break; case INDEX_DROP_CLEAR_VALID: /* @@ -3826,6 +4032,13 @@ reindex_index(const ReindexStmt *stmt, Oid indexId, indexInfo->ii_ExclusionStrats = NULL; } + /* Auxiliary indexes are not allowed to be rebuilt */ + if (indexInfo->ii_Auxiliary) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("reindex of auxiliary index \"%s\" not supported", + RelationGetRelationName(iRel)))); + /* Suppress use of the target index while rebuilding it */ SetReindexProcessing(heapId, indexId); @@ -4068,6 +4281,7 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, { Oid indexOid = lfirst_oid(indexId); Oid indexNamespaceId = get_rel_namespace(indexOid); + Oid indexAm = get_rel_relam(indexOid); /* * Skip any invalid indexes on a TOAST table. 
These can only be @@ -4093,6 +4307,18 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags, continue; } + if (indexAm == STIR_AM_OID) + { + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("skipping reindex of auxiliary index \"%s.%s\"", + get_namespace_name(indexNamespaceId), + get_rel_name(indexOid)))); + if (flags & REINDEX_REL_SUPPRESS_INDEX_USE) + RemoveReindexPending(indexOid); + continue; + } + reindex_index(stmt, indexOid, !(flags & REINDEX_REL_CHECK_CONSTRAINTS), persistence, params); diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 08f780a2e638..b20decd1204d 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1283,16 +1283,17 @@ CREATE VIEW pg_stat_progress_create_index AS END AS command, CASE S.param10 WHEN 0 THEN 'initializing' WHEN 1 THEN 'waiting for writers before build' - WHEN 2 THEN 'building index' || + WHEN 2 THEN 'waiting for writers to use auxiliary index' + WHEN 3 THEN 'building index' || COALESCE((': ' || pg_indexam_progress_phasename(S.param9::oid, S.param11)), '') - WHEN 3 THEN 'waiting for writers before validation' - WHEN 4 THEN 'index validation: scanning index' - WHEN 5 THEN 'index validation: sorting tuples' - WHEN 6 THEN 'index validation: scanning table' - WHEN 7 THEN 'waiting for old snapshots' - WHEN 8 THEN 'waiting for readers before marking dead' - WHEN 9 THEN 'waiting for readers before dropping' + WHEN 4 THEN 'waiting for writers before validation' + WHEN 5 THEN 'index validation: scanning index' + WHEN 6 THEN 'index validation: sorting tuples' + WHEN 7 THEN 'index validation: merging indexes' + WHEN 8 THEN 'waiting for old snapshots' + WHEN 9 THEN 'waiting for readers before marking dead' + WHEN 10 THEN 'waiting for readers before dropping' END as phase, S.param4 AS lockers_total, S.param5 AS lockers_done, diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 874a8fc89adb..0ee2fd5e7dea 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -325,7 +325,8 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, BTREE_AM_OID, rel->rd_rel->reltablespace, collationIds, opclassIds, NULL, coloptions, NULL, (Datum) 0, - INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL); + INDEX_CREATE_IS_PRIMARY, 0, true, true, NULL, + toast_rel->rd_rel->relpersistence); table_close(toast_rel, NoLock); diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 991fa6ae6372..9ca11b21023a 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -182,6 +182,7 @@ CheckIndexCompatible(Oid oldId, bool isWithoutOverlaps) { bool isconstraint; + bool isauxiliary; Oid *typeIds; Oid *collationIds; Oid *opclassIds; @@ -232,6 +233,7 @@ CheckIndexCompatible(Oid oldId, amcanorder = amRoutine->amcanorder; amsummarizing = amRoutine->amsummarizing; + isauxiliary = accessMethodId == STIR_AM_OID; /* * Compute the operator classes, collations, and exclusion operators for @@ -243,7 +245,8 @@ CheckIndexCompatible(Oid oldId, */ indexInfo = makeIndexInfo(numberOfAttributes, numberOfAttributes, accessMethodId, NIL, NIL, false, false, - false, false, amsummarizing, isWithoutOverlaps); + false, false, amsummarizing, + isWithoutOverlaps, isauxiliary); typeIds = palloc_array(Oid, numberOfAttributes); collationIds = palloc_array(Oid, numberOfAttributes); opclassIds = palloc_array(Oid, numberOfAttributes); @@ -553,6 +556,7 @@ DefineIndex(Oid tableId, { bool 
concurrent;
 	char	   *indexRelationName;
+	char	   *auxIndexRelationName = NULL;
 	char	   *accessMethodName;
 	Oid		   *typeIds;
 	Oid		   *collationIds;
@@ -562,6 +566,7 @@ DefineIndex(Oid tableId,
 	Oid			namespaceId;
 	Oid			tablespaceId;
 	Oid			createdConstraintId = InvalidOid;
+	Oid			auxIndexRelationId = InvalidOid;
 	List	   *indexColNames;
 	List	   *allIndexParams;
 	Relation	rel;
@@ -583,6 +588,7 @@ DefineIndex(Oid tableId,
 	int			numberOfKeyAttributes;
 	TransactionId limitXmin;
 	ObjectAddress address;
+	ObjectAddress auxAddress;
 	LockRelId	heaprelid;
 	LOCKTAG		heaplocktag;
 	LOCKMODE	lockmode;
@@ -833,6 +839,15 @@ DefineIndex(Oid tableId,
										  stmt->excludeOpNames,
										  stmt->primary,
										  stmt->isconstraint);
+	/*
+	 * Select a name for the auxiliary index.
+	 */
+	if (concurrent)
+		auxIndexRelationName = ChooseRelationName(indexRelationName,
												  NULL,
												  "ccaux",
												  namespaceId,
												  false);
 
 	/*
	 * look up the access method, verify it can handle the requested features
	 */
@@ -928,7 +943,8 @@ DefineIndex(Oid tableId,
							  !concurrent,
							  concurrent,
							  amissummarizing,
-							  stmt->iswithoutoverlaps);
+							  stmt->iswithoutoverlaps,
+							  false);
 
 	typeIds = palloc_array(Oid, numberOfAttributes);
 	collationIds = palloc_array(Oid, numberOfAttributes);
@@ -1251,7 +1267,8 @@ DefineIndex(Oid tableId,
					 coloptions, NULL, reloptions,
					 flags, constr_flags,
					 allowSystemTableMods, !check_rights,
-					 &createdConstraintId);
+					 &createdConstraintId,
+					 rel->rd_rel->relpersistence);
 
 	ObjectAddressSet(address, RelationRelationId, indexRelationId);
 
@@ -1593,6 +1610,16 @@ DefineIndex(Oid tableId,
 		return address;
 	}
 
+	/*
+	 * In the concurrent-build case, create the auxiliary index catalog entry.
+	 */
+	if (concurrent)
+	{
+		auxIndexRelationId = index_concurrently_create_aux(rel, indexRelationId,
														   tablespaceId, auxIndexRelationName);
+		ObjectAddressSet(auxAddress, RelationRelationId, auxIndexRelationId);
+	}
+
 	AtEOXact_GUC(false, root_save_nestlevel);
 	SetUserIdAndSecContext(root_save_userid, root_save_sec_context);
 
@@ -1621,11 +1648,11 @@ DefineIndex(Oid tableId,
 	/*
	 * For a concurrent build, it's important to make the catalog entries
	 * visible to other transactions before we start to build the index. That
-	 * will prevent them from making incompatible HOT updates.  The new index
-	 * will be marked not indisready and not indisvalid, so that no one else
-	 * tries to either insert into it or use it for queries.
+	 * will prevent them from making incompatible HOT updates.  New indexes
+	 * (main and auxiliary) will be marked not indisready and not indisvalid,
+	 * so that no one else tries to either insert into them or use them for queries.
 	 *
-	 * We must commit our current transaction so that the index becomes
+	 * We must commit our current transaction so that the indexes become
	 * visible; then start another.  Note that all the data structures we just
	 * built are lost in the commit.  The only data we keep past here are the
	 * relation IDs.
@@ -1635,7 +1662,7 @@ DefineIndex(Oid tableId,
	 * cannot block, even if someone else is waiting for access, because we
	 * already have the same lock within our transaction.
	 *
-	 * Note: we don't currently bother with a session lock on the index,
+	 * Note: we don't currently bother with a session lock on either index,
	 * because there are no operations that could change its state while we
	 * hold lock on the parent table.  This might need to change later.
	 */
@@ -1674,7 +1701,7 @@ DefineIndex(Oid tableId,
	 * with the old list of indexes.  Use ShareLock to consider running
	 * transactions that hold locks that permit writing to the table.  Note we
	 * do not need to worry about xacts that open the table for writing after
-	 * this point; they will see the new index when they open it.
+	 * this point; they will see the new indexes when they open it.
	 *
	 * Note: the reason we use actual lock acquisition here, rather than just
	 * checking the ProcArray and sleeping, is that deadlock is possible if
@@ -1686,14 +1713,38 @@ DefineIndex(Oid tableId,
 
 	/*
	 * At this moment we are sure that there are no transactions with the
-	 * table open for write that don't have this new index in their list of
+	 * table open for write that don't have these new indexes in their list of
	 * indexes.  We have waited out all the existing transactions and any new
-	 * transaction will have the new index in its list, but the index is still
-	 * marked as "not-ready-for-inserts".  The index is consulted while
+	 * transaction will have both new indexes in its list, but the indexes are
+	 * still marked as "not-ready-for-inserts".  The indexes are consulted while
	 * deciding HOT-safety though.  This arrangement ensures that no new HOT
	 * chains can be created where the new tuple and the old tuple in the
	 * chain have different index keys.
	 *
+	 * Now build the auxiliary index.  It is created empty, without any actual
+	 * heap scan, and marked as "ready-for-inserts".  Its purpose is to
+	 * accumulate new tuples while the main index is being built.
+	 */
+	index_concurrently_build(tableId, auxIndexRelationId);
+
+	CommitTransactionCommand();
+	StartTransactionCommand();
+
+	/* Tell concurrent index builds to ignore us, if index qualifies */
+	if (safe_index)
+		set_indexsafe_procflags();
+
+	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
+								 PROGRESS_CREATEIDX_PHASE_WAIT_2);
+	/*
+	 * Now we need to ensure that there are no transactions that still see the
+	 * auxiliary index as "not-ready-for-inserts".
+	 */
+	WaitForLockers(heaplocktag, ShareLock, true);
+
+	/*
+	 * At this moment we can be sure that all new tuples in the table are
+	 * inserted into the auxiliary index.  Now it is time to build the target
+	 * index itself.
+	 *
	 * We build the index using all tuples that are visible using multiple
	 * refreshing snapshots.  We can be sure that any HOT updates to
	 * these tuples will be compatible with the index, since any updates made
@@ -1722,9 +1773,28 @@ DefineIndex(Oid tableId,
	 * the index marked as read-only for updates.
	 */
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
-								 PROGRESS_CREATEIDX_PHASE_WAIT_2);
+								 PROGRESS_CREATEIDX_PHASE_WAIT_3);
 	WaitForLockers(heaplocktag, ShareLock, true);
 
+	/*
+	 * Updating pg_index might involve TOAST table access, so ensure we
+	 * have a valid snapshot.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+	/*
+	 * Now the target index is marked as "ready" for all transactions, so the
+	 * auxiliary index is no longer needed.  Start its removal by clearing the
+	 * "ready" flag.
+	 */
+	index_set_state_flags(auxIndexRelationId, INDEX_DROP_CLEAR_READY);
+	PopActiveSnapshot();
+
+	CommitTransactionCommand();
+	StartTransactionCommand();
+
+	/* Tell concurrent index builds to ignore us, if index qualifies */
+	if (safe_index)
+		set_indexsafe_procflags();
+
 	/*
	 * Now take the "reference snapshot" that will be used by validate_index()
	 * to filter candidate tuples.  Beware!  There might still be snapshots in
@@ -1742,24 +1812,14 @@ DefineIndex(Oid tableId,
	 */
 	snapshot = RegisterSnapshot(GetTransactionSnapshot());
 	PushActiveSnapshot(snapshot);
-
-	/*
-	 * Scan the index and the heap, insert any missing index entries.
-	 */
-	validate_index(tableId, indexRelationId, snapshot);
 
 	/*
-	 * Drop the reference snapshot.  We must do this before waiting out other
-	 * snapshot holders, else we will deadlock against other processes also
-	 * doing CREATE INDEX CONCURRENTLY, which would see our snapshot as one
-	 * they must wait for.  But first, save the snapshot's xmin to use as
-	 * limitXmin for GetCurrentVirtualXIDs().
+	 * Merge the contents of the auxiliary and target indexes: insert any
+	 * missing index entries.
	 */
+	validate_index(tableId, indexRelationId, auxIndexRelationId, snapshot);
 
 	limitXmin = snapshot->xmin;
 
 	PopActiveSnapshot();
 	UnregisterSnapshot(snapshot);
-
 	/*
	 * The snapshot subsystem could still contain registered snapshots that
	 * are holding back our process's advertised xmin; in particular, if
@@ -1786,7 +1846,7 @@ DefineIndex(Oid tableId,
	 */
 	INJECTION_POINT("define_index_before_set_valid", NULL);
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
-								 PROGRESS_CREATEIDX_PHASE_WAIT_3);
+								 PROGRESS_CREATEIDX_PHASE_WAIT_4);
 	WaitForOlderSnapshots(limitXmin, true);
 
 	/*
@@ -1811,6 +1871,53 @@ DefineIndex(Oid tableId,
	 * to replan; so relcache flush on the index itself was sufficient.)
	 */
 	CacheInvalidateRelcacheByRelid(heaprelid.relId);
+	CommitTransactionCommand();
+	StartTransactionCommand();
+
+	/* Tell concurrent index builds to ignore us, if index qualifies */
+	if (safe_index)
+		set_indexsafe_procflags();
+
+	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
+								 PROGRESS_CREATEIDX_PHASE_WAIT_5);
+	/* Now wait for all transactions to see the auxiliary index as "not ready for inserts" */
+	WaitForLockers(heaplocktag, AccessExclusiveLock, true);
+
+	CommitTransactionCommand();
+	StartTransactionCommand();
+
+	/*
+	 * Updating pg_index might involve TOAST table access, so ensure we
+	 * have a valid snapshot.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+	/* Now it is time to mark the auxiliary index as dead */
+	index_concurrently_set_dead(tableId, auxIndexRelationId);
+	PopActiveSnapshot();
+
+	CommitTransactionCommand();
+	StartTransactionCommand();
+
+	/* Tell concurrent index builds to ignore us, if index qualifies */
+	if (safe_index)
+		set_indexsafe_procflags();
+
+	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
+								 PROGRESS_CREATEIDX_PHASE_WAIT_6);
+	/* Now wait for all transactions to ignore the auxiliary index because it is dead */
+	WaitForLockers(heaplocktag, AccessExclusiveLock, true);
+
+	CommitTransactionCommand();
+	StartTransactionCommand();
+
+	/*
+	 * Drop the auxiliary index.
+	 *
+	 * Use PERFORM_DELETION_CONCURRENT_LOCK so that index_drop() uses the
+	 * right lock level.
+	 */
+	performDeletion(&auxAddress, DROP_RESTRICT,
+					PERFORM_DELETION_CONCURRENT_LOCK | PERFORM_DELETION_INTERNAL);
 
 	/*
	 * Last thing to do is release the session-level lock on the parent table.
@@ -3531,6 +3638,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein typedef struct ReindexIndexInfo { Oid indexId; + Oid auxIndexId; Oid tableId; Oid amId; bool safe; /* for set_indexsafe_procflags */ @@ -3636,8 +3744,15 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein Oid cellOid = lfirst_oid(lc); Relation indexRelation = index_open(cellOid, ShareUpdateExclusiveLock); + IndexInfo* indexInfo = BuildDummyIndexInfo(indexRelation); - if (!indexRelation->rd_index->indisvalid) + + if (indexInfo->ii_Auxiliary) + ereport(WARNING,(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("skipping reindex of auxiliary index \"%s.%s\"", + get_namespace_name(get_rel_namespace(cellOid)), + get_rel_name(cellOid)))); + else if (!indexRelation->rd_index->indisvalid) ereport(WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("skipping reindex of invalid index \"%s.%s\"", @@ -3689,8 +3804,15 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein Oid cellOid = lfirst_oid(lc2); Relation indexRelation = index_open(cellOid, ShareUpdateExclusiveLock); + IndexInfo* indexInfo = BuildDummyIndexInfo(indexRelation); - if (!indexRelation->rd_index->indisvalid) + if (indexInfo->ii_Auxiliary) + ereport(WARNING, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("skipping reindex of auxiliary index \"%s.%s\"", + get_namespace_name(get_rel_namespace(cellOid)), + get_rel_name(cellOid)))); + else if (!indexRelation->rd_index->indisvalid) ereport(WARNING, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("skipping reindex of invalid index \"%s.%s\"", @@ -3751,6 +3873,13 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot reindex invalid index on TOAST table"))); + /* Auxiliary indexes are not allowed to be rebuilt */ + if (get_rel_relam(relationOid) == STIR_AM_OID) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("reindex of auxiliary index \"%s\" not supported", + get_rel_name(relationOid)))); + /* * Check if parent relation can be locked and if it exists, * this needs to be done at this stage as the list of indexes @@ -3854,15 +3983,18 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein foreach(lc, indexIds) { char *concurrentName; + char *auxConcurrentName; ReindexIndexInfo *idx = lfirst(lc); ReindexIndexInfo *newidx; Oid newIndexId; + Oid auxIndexId; Relation indexRel; Relation heapRel; Oid save_userid; int save_sec_context; int save_nestlevel; Relation newIndexRel; + Relation auxIndexRel; LockRelId *lockrelid; Oid tablespaceid; @@ -3913,6 +4045,11 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein "ccnew", get_rel_namespace(indexRel->rd_index->indrelid), false); + auxConcurrentName = ChooseRelationName(get_rel_name(idx->indexId), + NULL, + "ccaux", + get_rel_namespace(indexRel->rd_index->indrelid), + false); /* Choose the new tablespace, indexes of toast tables are not moved */ if (OidIsValid(params->tablespaceOid) && @@ -3926,12 +4063,17 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein idx->indexId, tablespaceid, concurrentName); + auxIndexId = index_concurrently_create_aux(heapRel, + newIndexId, + tablespaceid, + auxConcurrentName); /* * Now open the relation of the new index, a session-level lock is * also needed on it. 
	 */
 	newIndexRel = index_open(newIndexId, ShareUpdateExclusiveLock);
+	auxIndexRel = index_open(auxIndexId, ShareUpdateExclusiveLock);
 
 	/*
	 * Save the list of OIDs and locks in private context
	 */
@@ -3940,6 +4082,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 
 	newidx = palloc_object(ReindexIndexInfo);
 	newidx->indexId = newIndexId;
+	newidx->auxIndexId = auxIndexId;
 	newidx->safe = idx->safe;
 	newidx->tableId = idx->tableId;
 	newidx->amId = idx->amId;
@@ -3958,10 +4101,14 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 	lockrelid = palloc_object(LockRelId);
 	*lockrelid = newIndexRel->rd_lockInfo.lockRelId;
 	relationLocks = lappend(relationLocks, lockrelid);
+	lockrelid = palloc_object(LockRelId);
+	*lockrelid = auxIndexRel->rd_lockInfo.lockRelId;
+	relationLocks = lappend(relationLocks, lockrelid);
 
 	MemoryContextSwitchTo(oldcontext);
 
 	index_close(indexRel, NoLock);
+	index_close(auxIndexRel, NoLock);
 	index_close(newIndexRel, NoLock);
 
 	/* Roll back any GUC changes executed by index functions */
@@ -4042,13 +4189,56 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
	 * doing that, wait until no running transactions could have the table of
	 * the index open with the old list of indexes.  See "phase 2" in
	 * DefineIndex() for more details.
+	 */
+
+	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
+								 PROGRESS_CREATEIDX_PHASE_WAIT_1);
+	WaitForLockersMultiple(lockTags, ShareLock, true);
+	CommitTransactionCommand();
+
+	/*
+	 * Now build all auxiliary indexes and mark them as "ready-for-inserts".
+	 */
+	foreach(lc, newIndexIds)
+	{
+		ReindexIndexInfo *newidx = lfirst(lc);
+
+		StartTransactionCommand();
+
+		/*
+		 * Check for user-requested abort.  This is inside a transaction so as
+		 * xact.c does not issue a useless WARNING, and ensures that
+		 * session-level locks are cleaned up on abort.
+		 */
+		CHECK_FOR_INTERRUPTS();
+
+		/* Tell concurrent indexing to ignore us, if index qualifies */
+		if (newidx->safe)
+			set_indexsafe_procflags();
+
+		/*
+		 * Build the auxiliary index.  This is fast, since it involves no
+		 * actual heap scan and produces just an empty index.
+		 */
+		index_concurrently_build(newidx->tableId, newidx->auxIndexId);
+
+		CommitTransactionCommand();
+	}
+
+	StartTransactionCommand();
+
+	/*
+	 * Because we don't take a snapshot in this transaction, there's no need
+	 * to set the PROC_IN_SAFE_IC flag here.
	 */
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
-								 PROGRESS_CREATEIDX_PHASE_WAIT_1);
+								 PROGRESS_CREATEIDX_PHASE_WAIT_2);
+	/*
+	 * Wait until all auxiliary indexes are taken into account by all
+	 * transactions.
+	 */
 	WaitForLockersMultiple(lockTags, ShareLock, true);
 	CommitTransactionCommand();
 
+	/* Now it is time to perform the target index builds. */
 	foreach(lc, newIndexIds)
 	{
 		ReindexIndexInfo *newidx = lfirst(lc);
@@ -4091,6 +4281,41 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
	 * need to set the PROC_IN_SAFE_IC flag here.
	 */
 
+	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
+								 PROGRESS_CREATEIDX_PHASE_WAIT_3);
+	WaitForLockersMultiple(lockTags, ShareLock, true);
+	CommitTransactionCommand();
+
+	/*
+	 * At this moment all target indexes are marked as "ready-for-inserts", so
+	 * we are free to start the process of dropping the auxiliary indexes.
+	 */
+	foreach(lc, newIndexIds)
+	{
+		ReindexIndexInfo *newidx = lfirst(lc);
+
+		StartTransactionCommand();
+
+		/*
+		 * Check for user-requested abort.
This is inside a transaction so as + * xact.c does not issue a useless WARNING, and ensures that + * session-level locks are cleaned up on abort. + */ + CHECK_FOR_INTERRUPTS(); + + /* Tell concurrent indexing to ignore us, if index qualifies */ + if (newidx->safe) + set_indexsafe_procflags(); + + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + index_set_state_flags(newidx->auxIndexId, INDEX_DROP_CLEAR_READY); + PopActiveSnapshot(); + + CommitTransactionCommand(); + } + /* * Phase 3 of REINDEX CONCURRENTLY * @@ -4098,12 +4323,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * were created during the previous phase. See "phase 3" in DefineIndex() * for more details. */ - - pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, - PROGRESS_CREATEIDX_PHASE_WAIT_2); - WaitForLockersMultiple(lockTags, ShareLock, true); - CommitTransactionCommand(); - foreach(lc, newIndexIds) { ReindexIndexInfo *newidx = lfirst(lc); @@ -4141,7 +4360,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein progress_vals[3] = newidx->amId; pgstat_progress_update_multi_param(4, progress_index, progress_vals); - validate_index(newidx->tableId, newidx->indexId, snapshot); + validate_index(newidx->tableId, newidx->indexId, newidx->auxIndexId, snapshot); /* * We can now do away with our active snapshot, we still need to save @@ -4170,7 +4389,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * there's no need to set the PROC_IN_SAFE_IC flag here. */ pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, - PROGRESS_CREATEIDX_PHASE_WAIT_3); + PROGRESS_CREATEIDX_PHASE_WAIT_4); WaitForOlderSnapshots(limitXmin, true); CommitTransactionCommand(); @@ -4260,14 +4479,14 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* * Phase 5 of REINDEX CONCURRENTLY * - * Mark the old indexes as dead. First we must wait until no running - * transaction could be using the index for a query. See also + * Mark the old and auxiliary indexes as dead. First we must wait until no + * running transaction could be using the index for a query. See also * index_drop() for more details. */ INJECTION_POINT("reindex_relation_concurrently_before_set_dead", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, - PROGRESS_CREATEIDX_PHASE_WAIT_4); + PROGRESS_CREATEIDX_PHASE_WAIT_5); WaitForLockersMultiple(lockTags, AccessExclusiveLock, true); foreach(lc, indexIds) @@ -4292,6 +4511,28 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein PopActiveSnapshot(); } + foreach(lc, newIndexIds) + { + ReindexIndexInfo *newidx = lfirst(lc); + + /* + * Check for user-requested abort. This is inside a transaction so as + * xact.c does not issue a useless WARNING, and ensures that + * session-level locks are cleaned up on abort. + */ + CHECK_FOR_INTERRUPTS(); + + /* + * Updating pg_index might involve TOAST table access, so ensure we + * have a valid snapshot. + */ + PushActiveSnapshot(GetTransactionSnapshot()); + + index_concurrently_set_dead(newidx->tableId, newidx->auxIndexId); + + PopActiveSnapshot(); + } + /* Commit this transaction to make the updates visible. */ CommitTransactionCommand(); StartTransactionCommand(); @@ -4305,11 +4546,11 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* * Phase 6 of REINDEX CONCURRENTLY * - * Drop the old indexes. 
+	 * Drop the old and auxiliary indexes.
	 */
 
 	pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE,
-								 PROGRESS_CREATEIDX_PHASE_WAIT_5);
+								 PROGRESS_CREATEIDX_PHASE_WAIT_6);
 	WaitForLockersMultiple(lockTags, AccessExclusiveLock, true);
 
 	PushActiveSnapshot(GetTransactionSnapshot());
@@ -4329,6 +4570,18 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 		add_exact_object_address(&object, objects);
 	}
 
+	foreach(lc, newIndexIds)
+	{
+		ReindexIndexInfo *idx = lfirst(lc);
+		ObjectAddress object;
+
+		object.classId = RelationRelationId;
+		object.objectId = idx->auxIndexId;
+		object.objectSubId = 0;
+
+		add_exact_object_address(&object, objects);
+	}
+
 	/*
	 * Use PERFORM_DELETION_CONCURRENT_LOCK so that index_drop() uses the
	 * right lock level.
diff --git a/src/backend/nodes/makefuncs.c b/src/backend/nodes/makefuncs.c
index e97e0943f5b7..b556ba4817b8 100644
--- a/src/backend/nodes/makefuncs.c
+++ b/src/backend/nodes/makefuncs.c
@@ -834,7 +834,7 @@ IndexInfo *
 makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid, List *expressions,
			  List *predicates, bool unique, bool nulls_not_distinct,
			  bool isready, bool concurrent, bool summarizing,
-			  bool withoutoverlaps)
+			  bool withoutoverlaps, bool auxiliary)
 {
 	IndexInfo  *n = makeNode(IndexInfo);
 
@@ -850,6 +850,7 @@ makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid, List *expressions,
 	n->ii_Concurrent = concurrent;
 	n->ii_Summarizing = summarizing;
 	n->ii_WithoutOverlaps = withoutoverlaps;
+	n->ii_Auxiliary = auxiliary;
 
 	/* summarizing indexes cannot contain non-key attributes */
 	Assert(!summarizing || (numkeyattrs == numattrs));
@@ -875,7 +876,6 @@ makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid, List *expressions,
 	/* initialize index-build state to default */
 	n->ii_BrokenHotChain = false;
 	n->ii_ParallelWorkers = 0;
-	n->ii_Auxiliary = false;
 
 	/* set up for possible use by index AM */
 	n->ii_Am = amoid;
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index acd20dbfab86..6c43f47814d1 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -708,11 +708,12 @@ typedef struct TableAmRoutine
											TableScanDesc scan);
 
 	/* see table_index_validate_scan for reference about parameters */
-	void		(*index_validate_scan) (Relation table_rel,
-										Relation index_rel,
-										struct IndexInfo *index_info,
-										Snapshot snapshot,
-										struct ValidateIndexState *state);
+	void		(*index_validate_scan) (Relation table_rel,
+										Relation index_rel,
+										struct IndexInfo *index_info,
+										Snapshot snapshot,
+										struct ValidateIndexState *state,
+										struct ValidateIndexState *aux_state);
 
 
 /* ------------------------------------------------------------------------
@@ -1820,19 +1821,24 @@ table_index_build_range_scan(Relation table_rel,
 * table_index_validate_scan - second table scan for concurrent index build
 *
 * See validate_index() for an explanation.
+ *
+ * Note: it is the responsibility of this function to close the sortstates
+ * in both `state` and `auxstate`.
 */
 static inline void
 table_index_validate_scan(Relation table_rel,
						  Relation index_rel,
						  struct IndexInfo *index_info,
						  Snapshot snapshot,
-						  struct ValidateIndexState *state)
+						  struct ValidateIndexState *state,
+						  struct ValidateIndexState *auxstate)
 {
 	table_rel->rd_tableam->index_validate_scan(table_rel,
												index_rel,
												index_info,
												snapshot,
-												state);
+												state,
+												auxstate);
 }
 
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 4daa8bef5eea..4713f18e68d8 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -25,6 +25,7 @@ typedef enum
 {
 	INDEX_CREATE_SET_READY,
 	INDEX_CREATE_SET_VALID,
+	INDEX_DROP_CLEAR_READY,
 	INDEX_DROP_CLEAR_VALID,
 	INDEX_DROP_SET_DEAD,
 } IndexStateFlagsAction;
@@ -65,6 +66,7 @@ extern void index_check_primary_key(Relation heapRel,
 #define	INDEX_CREATE_IF_NOT_EXISTS			(1 << 4)
 #define	INDEX_CREATE_PARTITIONED			(1 << 5)
 #define INDEX_CREATE_INVALID				(1 << 6)
+#define INDEX_CREATE_AUXILIARY				(1 << 7)
 
 extern Oid	index_create(Relation heapRelation,
						 const char *indexRelationName,
@@ -86,7 +88,8 @@ extern Oid	index_create(Relation heapRelation,
						 bits16 constr_flags,
						 bool allow_system_table_mods,
						 bool is_internal,
-						 Oid *constraintId);
+						 Oid *constraintId,
+						 char relpersistence);
 
 #define	INDEX_CONSTR_CREATE_MARK_AS_PRIMARY	(1 << 0)
 #define	INDEX_CONSTR_CREATE_DEFERRABLE		(1 << 1)
@@ -100,6 +103,11 @@ extern Oid	index_concurrently_create_copy(Relation heapRelation,
										   Oid tablespaceOid,
										   const char *newName);
 
+extern Oid	index_concurrently_create_aux(Relation heapRelation,
+										  Oid mainIndexId,
+										  Oid tablespaceOid,
+										  const char *newName);
+
 extern void index_concurrently_build(Oid heapRelationId,
									 Oid indexRelationId);
 
@@ -145,7 +153,7 @@ extern void index_build(Relation heapRelation,
						bool isreindex,
						bool parallel);
 
-extern void validate_index(Oid heapId, Oid indexId, Snapshot snapshot);
+extern void validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot);
 
 extern void index_set_state_flags(Oid indexId, IndexStateFlagsAction action);
 
diff --git a/src/include/commands/progress.h b/src/include/commands/progress.h
index 7c736e7b03bc..6e14577ef9bc 100644
--- a/src/include/commands/progress.h
+++ b/src/include/commands/progress.h
@@ -94,14 +94,15 @@
 
 /* Phases of CREATE INDEX (as advertised via PROGRESS_CREATEIDX_PHASE) */
 #define PROGRESS_CREATEIDX_PHASE_WAIT_1			1
-#define PROGRESS_CREATEIDX_PHASE_BUILD			2
-#define PROGRESS_CREATEIDX_PHASE_WAIT_2			3
-#define PROGRESS_CREATEIDX_PHASE_VALIDATE_IDXSCAN	4
-#define PROGRESS_CREATEIDX_PHASE_VALIDATE_SORT	5
-#define PROGRESS_CREATEIDX_PHASE_VALIDATE_TABLESCAN	6
-#define PROGRESS_CREATEIDX_PHASE_WAIT_3			7
+#define PROGRESS_CREATEIDX_PHASE_WAIT_2			2
+#define PROGRESS_CREATEIDX_PHASE_BUILD			3
+#define PROGRESS_CREATEIDX_PHASE_WAIT_3			4
+#define PROGRESS_CREATEIDX_PHASE_VALIDATE_IDXSCAN	5
+#define PROGRESS_CREATEIDX_PHASE_VALIDATE_SORT	6
+#define PROGRESS_CREATEIDX_PHASE_VALIDATE_IDXMERGE	7
 #define PROGRESS_CREATEIDX_PHASE_WAIT_4			8
 #define PROGRESS_CREATEIDX_PHASE_WAIT_5			9
+#define PROGRESS_CREATEIDX_PHASE_WAIT_6			10
 
 /*
 * Subphases of CREATE INDEX, for index_build.
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 0341bb743250..e02fc6aa3e6f 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -186,8 +186,8 @@ typedef struct ExprState * AmCache private cache area for index AM * Context memory context holding this IndexInfo * - * ii_Concurrent, ii_BrokenHotChain, ii_Auxiliary and ii_ParallelWorkers - * are used only during index build; they're conventionally zeroed otherwise. + * ii_Concurrent, ii_BrokenHotChain, and ii_ParallelWorkers are used only + * during index build; they're conventionally zeroed otherwise. * ---------------- */ typedef struct IndexInfo diff --git a/src/include/nodes/makefuncs.h b/src/include/nodes/makefuncs.h index 5473ce9a288a..4904748f5fc2 100644 --- a/src/include/nodes/makefuncs.h +++ b/src/include/nodes/makefuncs.h @@ -99,7 +99,8 @@ extern IndexInfo *makeIndexInfo(int numattrs, int numkeyattrs, Oid amoid, List *expressions, List *predicates, bool unique, bool nulls_not_distinct, bool isready, bool concurrent, - bool summarizing, bool withoutoverlaps); + bool summarizing, bool withoutoverlaps, + bool auxiliary); extern Node *makeStringConst(char *str, int location); extern DefElem *makeDefElem(char *name, Node *arg, int location); diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 9ade7b835e69..ca74844b5c63 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -1423,6 +1423,7 @@ DETAIL: Key (f1)=(b) already exists. CREATE UNIQUE INDEX CONCURRENTLY concur_index3 ON concur_heap(f2); ERROR: could not create unique index "concur_index3" DETAIL: Key (f2)=(b) is duplicated. +DROP INDEX concur_index3_ccaux; -- test that expression indexes and partial indexes work concurrently CREATE INDEX CONCURRENTLY concur_index4 on concur_heap(f2) WHERE f1='a'; CREATE INDEX CONCURRENTLY concur_index5 on concur_heap(f2) WHERE f1='x'; @@ -3197,6 +3198,7 @@ INSERT INTO concur_reindex_tab4 VALUES (1), (1), (2); CREATE UNIQUE INDEX CONCURRENTLY concur_reindex_ind5 ON concur_reindex_tab4 (c1); ERROR: could not create unique index "concur_reindex_ind5" DETAIL: Key (c1)=(1) is duplicated. +DROP INDEX concur_reindex_ind5_ccaux; -- Reindexing concurrently this index fails with the same failure. -- The extra index created is itself invalid, and can be dropped. REINDEX INDEX CONCURRENTLY concur_reindex_ind5; @@ -3209,8 +3211,10 @@ DETAIL: Key (c1)=(1) is duplicated. c1 | integer | | | Indexes: "concur_reindex_ind5" UNIQUE, btree (c1) INVALID + "concur_reindex_ind5_ccaux" stir (c1) INVALID "concur_reindex_ind5_ccnew" UNIQUE, btree (c1) INVALID +DROP INDEX concur_reindex_ind5_ccaux; DROP INDEX concur_reindex_ind5_ccnew; -- This makes the previous failure go away, so the index can become valid. DELETE FROM concur_reindex_tab4 WHERE c1 = 1; @@ -3238,6 +3242,44 @@ Indexes: "concur_reindex_ind5" UNIQUE, btree (c1) DROP TABLE concur_reindex_tab4; +-- Check handling of auxiliary indexes +CREATE TABLE aux_index_tab5 (c1 int); +INSERT INTO aux_index_tab5 VALUES (1), (1), (2); +-- This trick creates an invalid index and auxiliary index for it +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +ERROR: could not create unique index "aux_index_ind6" +DETAIL: Key (c1)=(1) is duplicated. 
+\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | +Indexes: + "aux_index_ind6" UNIQUE, btree (c1) INVALID + "aux_index_ind6_ccaux" stir (c1) INVALID + +-- Not allowed to reindex auxiliary index +REINDEX INDEX aux_index_ind6_ccaux; +ERROR: reindex of auxiliary index "aux_index_ind6_ccaux" not supported +-- Concurrently also +REINDEX INDEX CONCURRENTLY aux_index_ind6_ccaux; +ERROR: reindex of auxiliary index "aux_index_ind6_ccaux" not supported +-- This makes the previous failure go away, so the index can become valid. +DELETE FROM concur_reindex_tab4 WHERE c1 = 1; +ERROR: relation "concur_reindex_tab4" does not exist +LINE 1: DELETE FROM concur_reindex_tab4 WHERE c1 = 1; + ^ +-- Should be skipped during reindex +REINDEX TABLE aux_index_tab5; +ERROR: could not create unique index "aux_index_ind6" +DETAIL: Key (c1)=(1) is duplicated. +-- Should be skipped during concurrent reindex +REINDEX TABLE CONCURRENTLY aux_index_tab5; +WARNING: skipping reindex of invalid index "public.aux_index_ind6" +HINT: Use DROP INDEX or REINDEX INDEX. +WARNING: skipping reindex of auxiliary index "public.aux_index_ind6_ccaux" +NOTICE: table "aux_index_tab5" has no indexes that can be reindexed concurrently +DROP TABLE aux_index_tab5; -- Check handling of indexes with expressions and predicates. The -- definitions of the rebuilt indexes should match the original -- definitions. diff --git a/src/test/regress/expected/indexing.out b/src/test/regress/expected/indexing.out index bcf1db11d731..3fecaa38850f 100644 --- a/src/test/regress/expected/indexing.out +++ b/src/test/regress/expected/indexing.out @@ -1585,10 +1585,11 @@ select indexrelid::regclass, indisvalid, --------------------------------+------------+-----------------------+------------------------------- parted_isvalid_idx | f | parted_isvalid_tab | parted_isvalid_idx_11 | f | parted_isvalid_tab_11 | parted_isvalid_tab_1_expr_idx + parted_isvalid_idx_11_ccaux | f | parted_isvalid_tab_11 | parted_isvalid_tab_12_expr_idx | t | parted_isvalid_tab_12 | parted_isvalid_tab_1_expr_idx parted_isvalid_tab_1_expr_idx | f | parted_isvalid_tab_1 | parted_isvalid_idx parted_isvalid_tab_2_expr_idx | t | parted_isvalid_tab_2 | parted_isvalid_idx -(5 rows) +(6 rows) drop table parted_isvalid_tab; -- Check state of replica indexes when attaching a partition. 
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 6cf828ca8d0d..ed6c20a495c3 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2041,14 +2041,15 @@ pg_stat_progress_create_index| SELECT s.pid, CASE s.param10 WHEN 0 THEN 'initializing'::text WHEN 1 THEN 'waiting for writers before build'::text - WHEN 2 THEN ('building index'::text || COALESCE((': '::text || pg_indexam_progress_phasename((s.param9)::oid, s.param11)), ''::text)) - WHEN 3 THEN 'waiting for writers before validation'::text - WHEN 4 THEN 'index validation: scanning index'::text - WHEN 5 THEN 'index validation: sorting tuples'::text - WHEN 6 THEN 'index validation: scanning table'::text - WHEN 7 THEN 'waiting for old snapshots'::text - WHEN 8 THEN 'waiting for readers before marking dead'::text - WHEN 9 THEN 'waiting for readers before dropping'::text + WHEN 2 THEN 'waiting for writers to use auxiliary index'::text + WHEN 3 THEN ('building index'::text || COALESCE((': '::text || pg_indexam_progress_phasename((s.param9)::oid, s.param11)), ''::text)) + WHEN 4 THEN 'waiting for writers before validation'::text + WHEN 5 THEN 'index validation: scanning index'::text + WHEN 6 THEN 'index validation: sorting tuples'::text + WHEN 7 THEN 'index validation: merging indexes'::text + WHEN 8 THEN 'waiting for old snapshots'::text + WHEN 9 THEN 'waiting for readers before marking dead'::text + WHEN 10 THEN 'waiting for readers before dropping'::text ELSE NULL::text END AS phase, s.param4 AS lockers_total, diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index e21ff426519b..2cff1ac29be8 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -499,6 +499,7 @@ CREATE UNIQUE INDEX CONCURRENTLY IF NOT EXISTS concur_index2 ON concur_heap(f1); INSERT INTO concur_heap VALUES ('b','x'); -- check if constraint is enforced properly at build time CREATE UNIQUE INDEX CONCURRENTLY concur_index3 ON concur_heap(f2); +DROP INDEX concur_index3_ccaux; -- test that expression indexes and partial indexes work concurrently CREATE INDEX CONCURRENTLY concur_index4 on concur_heap(f2) WHERE f1='a'; CREATE INDEX CONCURRENTLY concur_index5 on concur_heap(f2) WHERE f1='x'; @@ -1311,10 +1312,12 @@ CREATE TABLE concur_reindex_tab4 (c1 int); INSERT INTO concur_reindex_tab4 VALUES (1), (1), (2); -- This trick creates an invalid index. CREATE UNIQUE INDEX CONCURRENTLY concur_reindex_ind5 ON concur_reindex_tab4 (c1); +DROP INDEX concur_reindex_ind5_ccaux; -- Reindexing concurrently this index fails with the same failure. -- The extra index created is itself invalid, and can be dropped. REINDEX INDEX CONCURRENTLY concur_reindex_ind5; \d concur_reindex_tab4 +DROP INDEX concur_reindex_ind5_ccaux; DROP INDEX concur_reindex_ind5_ccnew; -- This makes the previous failure go away, so the index can become valid. 
DELETE FROM concur_reindex_tab4 WHERE c1 = 1;
@@ -1326,6 +1329,24 @@ REINDEX INDEX CONCURRENTLY concur_reindex_ind5;
 \d concur_reindex_tab4
 DROP TABLE concur_reindex_tab4;
 
+-- Check handling of auxiliary indexes
+CREATE TABLE aux_index_tab5 (c1 int);
+INSERT INTO aux_index_tab5 VALUES (1), (1), (2);
+-- This trick creates an invalid index and auxiliary index for it
+CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1);
+\d aux_index_tab5
+-- Not allowed to reindex auxiliary index
+REINDEX INDEX aux_index_ind6_ccaux;
+-- Concurrently also
+REINDEX INDEX CONCURRENTLY aux_index_ind6_ccaux;
+-- This makes the previous failure go away, so the index can become valid.
+DELETE FROM concur_reindex_tab4 WHERE c1 = 1;
+-- Should be skipped during reindex
+REINDEX TABLE aux_index_tab5;
+-- Should be skipped during concurrent reindex
+REINDEX TABLE CONCURRENTLY aux_index_tab5;
+DROP TABLE aux_index_tab5;
+
 -- Check handling of indexes with expressions and predicates.  The
 -- definitions of the rebuilt indexes should match the original
 -- definitions.

From 0fe2418cd3bb08082fec2447ad5a22814158b38e Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Tue, 31 Dec 2024 14:36:31 +0100
Subject: [PATCH 09/12] Track and drop auxiliary indexes in DROP/REINDEX

During concurrent index operations, auxiliary indexes may be left behind
as orphaned objects when errors occur (junk auxiliary indexes). This
patch improves the handling of such auxiliary indexes:

- add an auxiliaryForIndexId parameter to index_create() to track
  dependencies between main and auxiliary indexes
- automatically drop auxiliary indexes when the main index is dropped
- properly delete junk auxiliary indexes during REINDEX operations
---
 doc/src/sgml/ref/create_index.sgml         |  14 ++-
 doc/src/sgml/ref/reindex.sgml              |   8 +-
 src/backend/catalog/dependency.c           |   2 +-
 src/backend/catalog/index.c                |  64 ++++++++---
 src/backend/catalog/pg_depend.c            |  57 ++++++++++
 src/backend/catalog/toasting.c             |   2 +-
 src/backend/commands/indexcmds.c           |  35 ++++++-
 src/backend/commands/tablecmds.c           |  48 +++++++++-
 src/include/catalog/dependency.h           |   1 +
 src/include/catalog/index.h                |   1 +
 src/test/regress/expected/create_index.out | 103 +++++++++++++++++++--
 src/test/regress/sql/create_index.sql      |  57 +++++++++++-
 12 files changed, 357 insertions(+), 35 deletions(-)

diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index e7a7a1607424..298a093f5545 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -668,10 +668,16 @@ Indexes:
     "idx_ccaux" stir (col) INVALID
 
-    The recommended recovery
-    method in such cases is to drop these indexes and try again to perform
-    CREATE INDEX CONCURRENTLY.  (Another possibility is
-    to rebuild the index with REINDEX INDEX CONCURRENTLY).
+    The recommended recovery method in such cases is to drop the index with
+    DROP INDEX. The auxiliary index (suffixed with
+    ccaux) will be automatically dropped when the main
+    index is dropped. After dropping the indexes, you can try again to perform
+    CREATE INDEX CONCURRENTLY.  (Another possibility is to
+    rebuild the index with REINDEX INDEX CONCURRENTLY,
+    which will also handle cleanup of any invalid auxiliary indexes.)
+    If the only invalid index is the one suffixed ccaux,
+    the recommended recovery method is simply DROP INDEX
+    for that index.
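For illustration, with hypothetical names tbl, col and idx, the documented
recovery path looks roughly like this (a sketch of the procedure described
above, not taken from the regression tests):

    -- a failed concurrent build leaves both indexes invalid:
    --   "idx" UNIQUE, btree (col) INVALID
    --   "idx_ccaux" stir (col) INVALID
    DROP INDEX idx;     -- the dependent idx_ccaux is dropped automatically
    -- after correcting the offending data, retry:
    CREATE UNIQUE INDEX CONCURRENTLY idx ON tbl (col);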
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index 4ed3c969012f..d62791ff9c38 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -477,11 +477,15 @@ Indexes:
    _ccnew or _ccaux,
    then it corresponds to the transient or auxiliary index created during
    the concurrent operation, and the recommended recovery method is to
    drop these indexes using DROP INDEX,
-   then attempt REINDEX CONCURRENTLY again.
+   then attempt REINDEX CONCURRENTLY again. The auxiliary index
+   (suffixed with _ccaux) will be automatically dropped
+   along with its main index.
    If the invalid index is instead suffixed _ccold,
    it corresponds to the original index which could not be dropped;
    the recommended recovery method is to just drop said index, since the
-   rebuild proper has been successful.
+   rebuild proper has been successful. If the only
+   invalid index is the one suffixed ccaux, the recommended
+   recovery method is simply DROP INDEX for that index.
    A nonzero number may be appended to the suffix of the invalid index
    names to keep them unique, like _ccnew1,
    _ccold2, etc.

diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c
index 18316a3968bc..ab4c3e2fb4a5 100644
--- a/src/backend/catalog/dependency.c
+++ b/src/backend/catalog/dependency.c
@@ -286,7 +286,7 @@ performDeletion(const ObjectAddress *object,
 	 * Acquire deletion lock on the target object.  (Ideally the caller has
 	 * done this already, but many places are sloppy about it.)
 	 */
-	AcquireDeletionLock(object, 0);
+	AcquireDeletionLock(object, flags);
 
 	/*
 	 * Construct a list of objects to delete (ie, the given object plus
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 6c09c6a2b676..bf0bb79474b6 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -688,6 +688,8 @@ UpdateIndexRelation(Oid indexoid,
 *		parent index; otherwise InvalidOid.
 * parentConstraintId: if creating a constraint on a partition, the OID
 *		of the constraint in the parent; otherwise InvalidOid.
+ * auxiliaryForIndexId: if creating an auxiliary index, the OID of the main
+ *		index; otherwise InvalidOid.
 * relFileNumber: normally, pass InvalidRelFileNumber to get new storage.
 *		May be nonzero to attach an existing valid build.
 * indexInfo: same info executor uses to insert into the index
@@ -734,6 +736,7 @@ index_create(Relation heapRelation,
 			 Oid indexRelationId,
 			 Oid parentIndexRelid,
 			 Oid parentConstraintId,
+			 Oid auxiliaryForIndexId,
 			 RelFileNumber relFileNumber,
 			 IndexInfo *indexInfo,
 			 const List *indexColNames,
@@ -776,6 +779,8 @@ index_create(Relation heapRelation,
 		   ((flags & INDEX_CREATE_ADD_CONSTRAINT) != 0));
 	/* partitioned indexes must never be "built" by themselves */
 	Assert(!partitioned || (flags & INDEX_CREATE_SKIP_BUILD));
+	/* auxiliaryForIndexId and INDEX_CREATE_AUXILIARY must be given together or not at all */
+	Assert(OidIsValid(auxiliaryForIndexId) == auxiliary);
 
 	relkind = partitioned ? RELKIND_PARTITIONED_INDEX : RELKIND_INDEX;
 	is_exclusion = (indexInfo->ii_ExclusionOps != NULL);
@@ -1177,6 +1182,15 @@ index_create(Relation heapRelation,
 			recordDependencyOn(&myself, &referenced, DEPENDENCY_PARTITION_SEC);
 		}
 
+		/*
+		 * Record a dependency on the main index when creating an auxiliary index.
+		 */
+		if (OidIsValid(auxiliaryForIndexId))
+		{
+			ObjectAddressSet(referenced, RelationRelationId, auxiliaryForIndexId);
+			recordDependencyOn(&myself, &referenced, DEPENDENCY_AUTO);
+		}
+
 	/* placeholder for normal dependencies */
 	addrs = new_object_addresses();
 
@@ -1459,6 +1473,7 @@ index_concurrently_create_copy(Relation heapRelation, Oid oldIndexId,
 							  InvalidOid,	/* indexRelationId */
 							  InvalidOid,	/* parentIndexRelid */
 							  InvalidOid,	/* parentConstraintId */
+							  InvalidOid,	/* auxiliaryForIndexId */
 							  InvalidRelFileNumber, /* relFileNumber */
 							  newInfo,
 							  indexColNames,
@@ -1609,6 +1624,7 @@ index_concurrently_create_aux(Relation heapRelation, Oid mainIndexId,
 							  InvalidOid,	/* indexRelationId */
 							  InvalidOid,	/* parentIndexRelid */
 							  InvalidOid,	/* parentConstraintId */
+							  mainIndexId,	/* auxiliaryForIndexId */
 							  InvalidRelFileNumber, /* relFileNumber */
 							  newInfo,
 							  indexColNames,
@@ -3842,6 +3858,7 @@ reindex_index(const ReindexStmt *stmt, Oid indexId,
 				heapRelation;
 	Oid			heapId;
 	Oid			save_userid;
+	Oid			junkAuxIndexId;
 	int			save_sec_context;
 	int			save_nestlevel;
 	IndexInfo  *indexInfo;
@@ -3898,6 +3915,19 @@ reindex_index(const ReindexStmt *stmt, Oid indexId,
 		pgstat_progress_update_multi_param(2, progress_cols, progress_vals);
 	}
 
+	/* Check for an auxiliary index of this index; it needs to be dropped */
+	junkAuxIndexId = get_auxiliary_index(indexId);
+	if (OidIsValid(junkAuxIndexId))
+	{
+		ObjectAddress object;
+		object.classId = RelationRelationId;
+		object.objectId = junkAuxIndexId;
+		object.objectSubId = 0;
+		performDeletion(&object, DROP_RESTRICT,
+						PERFORM_DELETION_INTERNAL |
+						PERFORM_DELETION_QUIETLY);
+	}
+
 	/*
 	 * Open the target index relation and get an exclusive lock on it, to
 	 * ensure that no one else is touching this particular index.
@@ -4186,7 +4216,8 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags,
 {
 	Relation	rel;
 	Oid			toast_relid;
-	List	   *indexIds;
+	List	   *indexIds,
+			   *auxIndexIds = NIL;
 	char		persistence;
 	bool		result = false;
 	ListCell   *indexId;
@@ -4275,13 +4306,30 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags,
 	else
 		persistence = rel->rd_rel->relpersistence;
 
+	foreach(indexId, indexIds)
+	{
+		Oid			indexOid = lfirst_oid(indexId);
+		Oid			indexAm = get_rel_relam(indexOid);
+
+		/* All STIR indexes are auxiliary indexes */
+		if (indexAm == STIR_AM_OID)
+		{
+			if (flags & REINDEX_REL_SUPPRESS_INDEX_USE)
+				RemoveReindexPending(indexOid);
+			auxIndexIds = lappend_oid(auxIndexIds, indexOid);
+		}
+	}
+
 	/* Reindex all the indexes. */
 	i = 1;
 	foreach(indexId, indexIds)
 	{
 		Oid			indexOid = lfirst_oid(indexId);
 		Oid			indexNamespaceId = get_rel_namespace(indexOid);
-		Oid			indexAm = get_rel_relam(indexOid);
+
+		/* Auxiliary indexes are going to be dropped during the main index rebuild */
+		if (list_member_oid(auxIndexIds, indexOid))
+			continue;
 
 		/*
		 * Skip any invalid indexes on a TOAST table. 
These can only be
@@ -4307,18 +4355,6 @@ reindex_relation(const ReindexStmt *stmt, Oid relid, int flags,
 			continue;
 		}
 
-		if (indexAm == STIR_AM_OID)
-		{
-			ereport(WARNING,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("skipping reindex of auxiliary index \"%s.%s\"",
-							get_namespace_name(indexNamespaceId),
-							get_rel_name(indexOid))));
-			if (flags & REINDEX_REL_SUPPRESS_INDEX_USE)
-				RemoveReindexPending(indexOid);
-			continue;
-		}
-
 		reindex_index(stmt, indexOid, !(flags & REINDEX_REL_CHECK_CONSTRAINTS),
 					  persistence, params);
 
diff --git a/src/backend/catalog/pg_depend.c b/src/backend/catalog/pg_depend.c
index c8b11f887e27..1c275ef373f2 100644
--- a/src/backend/catalog/pg_depend.c
+++ b/src/backend/catalog/pg_depend.c
@@ -1035,6 +1035,63 @@ get_index_constraint(Oid indexId)
 	return constraintId;
 }
 
+/*
+ * get_auxiliary_index
+ *		Given the OID of an index, return the OID of its auxiliary
+ *		index, or InvalidOid if there is no auxiliary index.
+ */
+Oid
+get_auxiliary_index(Oid indexId)
+{
+	Oid			auxiliaryIndexOid = InvalidOid;
+	Relation	depRel;
+	ScanKeyData key[3];
+	SysScanDesc scan;
+	HeapTuple	tup;
+
+	/* Search the dependency table for the index */
+	depRel = table_open(DependRelationId, AccessShareLock);
+
+	ScanKeyInit(&key[0],
+				Anum_pg_depend_refclassid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(RelationRelationId));
+	ScanKeyInit(&key[1],
+				Anum_pg_depend_refobjid,
+				BTEqualStrategyNumber, F_OIDEQ,
+				ObjectIdGetDatum(indexId));
+	ScanKeyInit(&key[2],
+				Anum_pg_depend_refobjsubid,
+				BTEqualStrategyNumber, F_INT4EQ,
+				Int32GetDatum(0));
+
+	scan = systable_beginscan(depRel, DependReferenceIndexId, true,
+							  NULL, 3, key);
+
+	while (HeapTupleIsValid(tup = systable_getnext(scan)))
+	{
+		Form_pg_depend deprec = (Form_pg_depend) GETSTRUCT(tup);
+
+		/*
+		 * We assume that any AUTO dependency on a relation whose relkind is
+		 * RELKIND_INDEX is the auxiliary index we are looking for.
+ */ + if (deprec->classid == RelationRelationId && + (deprec->deptype == DEPENDENCY_AUTO) && + get_rel_relkind(deprec->objid) == RELKIND_INDEX) + { + auxiliaryIndexOid = deprec->objid; + break; + } + } + + systable_endscan(scan); + table_close(depRel, AccessShareLock); + + return auxiliaryIndexOid; +} + /* * get_index_ref_constraints * Given the OID of an index, return the OID of all foreign key diff --git a/src/backend/catalog/toasting.c b/src/backend/catalog/toasting.c index 0ee2fd5e7dea..0ee8cbf4ca64 100644 --- a/src/backend/catalog/toasting.c +++ b/src/backend/catalog/toasting.c @@ -319,7 +319,7 @@ create_toast_table(Relation rel, Oid toastOid, Oid toastIndexOid, coloptions[1] = 0; index_create(toast_rel, toast_idxname, toastIndexOid, InvalidOid, - InvalidOid, InvalidOid, + InvalidOid, InvalidOid, InvalidOid, indexInfo, list_make2("chunk_id", "chunk_seq"), BTREE_AM_OID, diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 9ca11b21023a..3b25068dad38 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -1260,7 +1260,7 @@ DefineIndex(Oid tableId, indexRelationId = index_create(rel, indexRelationName, indexRelationId, parentIndexId, - parentConstraintId, + parentConstraintId, InvalidOid, stmt->oldNumber, indexInfo, indexColNames, accessMethodId, tablespaceId, collationIds, opclassIds, opclassOptions, @@ -3639,6 +3639,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein { Oid indexId; Oid auxIndexId; + Oid junkAuxIndexId; Oid tableId; Oid amId; bool safe; /* for set_indexsafe_procflags */ @@ -3988,6 +3989,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein ReindexIndexInfo *newidx; Oid newIndexId; Oid auxIndexId; + Oid junkAuxIndexId; Relation indexRel; Relation heapRel; Oid save_userid; @@ -3995,6 +3997,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein int save_nestlevel; Relation newIndexRel; Relation auxIndexRel; + Relation junkAuxIndexRel; LockRelId *lockrelid; Oid tablespaceid; @@ -4068,12 +4071,17 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein tablespaceid, auxConcurrentName); + /* Search for auxiliary index for reindexed index, to drop it */ + junkAuxIndexId = get_auxiliary_index(idx->indexId); + /* * Now open the relation of the new index, a session-level lock is * also needed on it. 
 */
 	newIndexRel = index_open(newIndexId, ShareUpdateExclusiveLock);
 	auxIndexRel = index_open(auxIndexId, ShareUpdateExclusiveLock);
+	if (OidIsValid(junkAuxIndexId))
+		junkAuxIndexRel = index_open(junkAuxIndexId, ShareUpdateExclusiveLock);
 
 	/*
 	 * Save the list of OIDs and locks in private context
@@ -4083,6 +4091,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 		newidx = palloc_object(ReindexIndexInfo);
 		newidx->indexId = newIndexId;
 		newidx->auxIndexId = auxIndexId;
+		newidx->junkAuxIndexId = junkAuxIndexId;
 		newidx->safe = idx->safe;
 		newidx->tableId = idx->tableId;
 		newidx->amId = idx->amId;
@@ -4104,10 +4113,18 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 		lockrelid = palloc_object(LockRelId);
 		*lockrelid = auxIndexRel->rd_lockInfo.lockRelId;
 		relationLocks = lappend(relationLocks, lockrelid);
+		if (OidIsValid(junkAuxIndexId))
+		{
+			lockrelid = palloc_object(LockRelId);
+			*lockrelid = junkAuxIndexRel->rd_lockInfo.lockRelId;
+			relationLocks = lappend(relationLocks, lockrelid);
+		}
 
 		MemoryContextSwitchTo(oldcontext);
 
 		index_close(indexRel, NoLock);
+		if (OidIsValid(junkAuxIndexId))
+			index_close(junkAuxIndexRel, NoLock);
 		index_close(auxIndexRel, NoLock);
 		index_close(newIndexRel, NoLock);
 
@@ -4288,7 +4305,8 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 	/*
 	 * At this moment all target indexes are marked as "ready-to-insert". So,
-	 * we are free to start process of dropping auxiliary indexes.
+	 * we are free to start the process of dropping auxiliary indexes,
+	 * including the junk indexes detected earlier.
 	 */
 	foreach(lc, newIndexIds)
 	{
@@ -4311,6 +4329,9 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 		 */
 		PushActiveSnapshot(GetTransactionSnapshot());
 		index_set_state_flags(newidx->auxIndexId, INDEX_DROP_CLEAR_READY);
+		/* Ensure the junk index is marked as non-ready too */
+		if (OidIsValid(newidx->junkAuxIndexId))
+			index_set_state_flags(newidx->junkAuxIndexId, INDEX_DROP_CLEAR_READY);
 		PopActiveSnapshot();
 		CommitTransactionCommand();
@@ -4529,6 +4550,8 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 		PushActiveSnapshot(GetTransactionSnapshot());
 		index_concurrently_set_dead(newidx->tableId, newidx->auxIndexId);
+		if (OidIsValid(newidx->junkAuxIndexId))
+			index_concurrently_set_dead(newidx->tableId, newidx->junkAuxIndexId);
 		PopActiveSnapshot();
 	}
@@ -4580,6 +4603,14 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein
 		object.objectSubId = 0;
 
 		add_exact_object_address(&object, objects);
+
+		if (OidIsValid(idx->junkAuxIndexId))
+		{
+			object.objectId = idx->junkAuxIndexId;
+			object.objectSubId = 0;
+			add_exact_object_address(&object, objects);
+		}
 	}
 
 	/*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index acf11e83c04e..f11461b4941a 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -1532,6 +1532,8 @@ RemoveRelations(DropStmt *drop)
 	ListCell   *cell;
 	int			flags = 0;
 	LOCKMODE	lockmode = AccessExclusiveLock;
+	MemoryContext private_context,
+				oldcontext;
 
 	/* DROP CONCURRENTLY uses a weaker lock, and has some restrictions */
 	if (drop->concurrent)
@@ -1592,9 +1594,20 @@ RemoveRelations(DropStmt *drop)
 			relkind = 0;		/* keep compiler quiet */
 			break;
 	}
+	/*
+	 * Create a memory context that will survive forced transaction commits we
+	 * may need to do below (in case of concurrent index drop).
+	 * Since it is a child of PortalContext, it will go away eventually even if
+	 * we suffer an error; there's no need for special abort cleanup logic.
+	 */
+	private_context = AllocSetContextCreate(PortalContext,
+											"RemoveRelations",
+											ALLOCSET_SMALL_SIZES);
+	oldcontext = MemoryContextSwitchTo(private_context);
 
 	/* Lock and validate each relation; build a list of object addresses */
 	objects = new_object_addresses();
+	MemoryContextSwitchTo(oldcontext);
 
 	foreach(cell, drop->objects)
 	{
@@ -1646,6 +1659,34 @@ RemoveRelations(DropStmt *drop)
 			flags |= PERFORM_DELETION_CONCURRENTLY;
 		}
 
+		/*
+		 * A concurrent index drop must be the first command in its
+		 * transaction. But if there is a junk auxiliary index, we want to
+		 * drop it too (also concurrently). In that case, perform a silent
+		 * internal deletion of the auxiliary index and restore the
+		 * transaction state. It is fine to do this inside the loop because
+		 * drop->objects contains only a single element.
+		 */
+		if ((flags & PERFORM_DELETION_CONCURRENTLY) != 0 &&
+			state.actual_relkind == RELKIND_INDEX)
+		{
+			Oid			junkAuxIndexOid = get_auxiliary_index(relOid);
+			if (OidIsValid(junkAuxIndexOid))
+			{
+				ObjectAddress object;
+				object.classId = RelationRelationId;
+				object.objectId = junkAuxIndexOid;
+				object.objectSubId = 0;
+				performDeletion(&object, DROP_RESTRICT,
+								PERFORM_DELETION_CONCURRENTLY |
+								PERFORM_DELETION_INTERNAL |
+								PERFORM_DELETION_QUIETLY);
+				CommitTransactionCommand();
+				StartTransactionCommand();
+				PushActiveSnapshot(GetTransactionSnapshot());
+				PreventInTransactionBlock(true, "DROP INDEX CONCURRENTLY");
+			}
+		}
+
 		/*
 		 * Concurrent index drop cannot be used with partitioned indexes,
 		 * either.
@@ -1674,12 +1715,17 @@ RemoveRelations(DropStmt *drop)
 		obj.objectId = relOid;
 		obj.objectSubId = 0;
 
+		oldcontext = MemoryContextSwitchTo(private_context);
 		add_exact_object_address(&obj, objects);
+		MemoryContextSwitchTo(oldcontext);
 	}
 
+	/* Deletion may involve multiple commits, so switch to the long-lived context */
+	oldcontext = MemoryContextSwitchTo(private_context);
 	performMultipleDeletions(objects, drop->behavior, flags);
+	MemoryContextSwitchTo(oldcontext);
 
-	free_object_addresses(objects);
+	MemoryContextDelete(private_context);
 }
 
 /*
diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h
index 0ea7ccf52430..02bcf5e93150 100644
--- a/src/include/catalog/dependency.h
+++ b/src/include/catalog/dependency.h
@@ -180,6 +180,7 @@ extern List *getOwnedSequences(Oid relid);
 extern Oid	getIdentitySequence(Relation rel, AttrNumber attnum,
 								bool missing_ok);
 extern Oid	get_index_constraint(Oid indexId);
+extern Oid	get_auxiliary_index(Oid indexId);
 
 extern List *get_index_ref_constraints(Oid indexId);
 
diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h
index 4713f18e68d8..53b2b13efc39 100644
--- a/src/include/catalog/index.h
+++ b/src/include/catalog/index.h
@@ -73,6 +73,7 @@ extern Oid	index_create(Relation heapRelation,
 						 Oid indexRelationId,
 						 Oid parentIndexRelid,
 						 Oid parentConstraintId,
+						 Oid auxiliaryForIndexId,
 						 RelFileNumber relFileNumber,
 						 IndexInfo *indexInfo,
 						 const List *indexColNames,
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index ca74844b5c63..aca6ec57ad76 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -3265,20 +3265,109 @@ ERROR:  reindex of auxiliary index "aux_index_ind6_ccaux" not supported
 REINDEX INDEX CONCURRENTLY aux_index_ind6_ccaux;
 ERROR:  reindex of auxiliary index "aux_index_ind6_ccaux" not 
supported -- This makes the previous failure go away, so the index can become valid. -DELETE FROM concur_reindex_tab4 WHERE c1 = 1; -ERROR: relation "concur_reindex_tab4" does not exist -LINE 1: DELETE FROM concur_reindex_tab4 WHERE c1 = 1; - ^ --- Should be skipped during reindex -REINDEX TABLE aux_index_tab5; +DELETE FROM aux_index_tab5 WHERE c1 = 1; +-- Should be skipped during reindex and dropped +REINDEX INDEX aux_index_ind6; +-- Make sure aux index is dropped +\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | +Indexes: + "aux_index_ind6" UNIQUE, btree (c1) + +DROP INDEX aux_index_ind6; +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); ERROR: could not create unique index "aux_index_ind6" DETAIL: Key (c1)=(1) is duplicated. --- Should be skipped during concurrent reindex +-- This makes the previous failure go away, so the index can become valid. +DELETE FROM aux_index_tab5 WHERE c1 = 1; +-- Should be skipped during reindex REINDEX TABLE CONCURRENTLY aux_index_tab5; WARNING: skipping reindex of invalid index "public.aux_index_ind6" HINT: Use DROP INDEX or REINDEX INDEX. WARNING: skipping reindex of auxiliary index "public.aux_index_ind6_ccaux" NOTICE: table "aux_index_tab5" has no indexes that can be reindexed concurrently +-- Make sure it is still exists +\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | +Indexes: + "aux_index_ind6" UNIQUE, btree (c1) INVALID + "aux_index_ind6_ccaux" stir (c1) INVALID + +-- Should be skipped during reindex and dropped +REINDEX TABLE aux_index_tab5; +-- Make sure aux index is dropped +\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | +Indexes: + "aux_index_ind6" UNIQUE, btree (c1) + +DROP INDEX aux_index_ind6; +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +ERROR: could not create unique index "aux_index_ind6" +DETAIL: Key (c1)=(1) is duplicated. +-- This makes the previous failure go away, so the index can become valid. +DELETE FROM aux_index_tab5 WHERE c1 = 1; +-- Should be skipped during reindex and dropped +REINDEX INDEX CONCURRENTLY aux_index_ind6; +-- Make sure aux index is dropped +\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | +Indexes: + "aux_index_ind6" UNIQUE, btree (c1) + +DROP INDEX aux_index_ind6; +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +ERROR: could not create unique index "aux_index_ind6" +DETAIL: Key (c1)=(1) is duplicated. +-- This makes the previous failure go away, so the index can become valid. 
+DELETE FROM aux_index_tab5 WHERE c1 = 1; +-- Drop main index CONCURRENTLY +DROP INDEX CONCURRENTLY aux_index_ind6; +-- Make sure auxiliary index dropped too +\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + +DROP INDEX aux_index_ind6; +ERROR: index "aux_index_ind6" does not exist +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +ERROR: could not create unique index "aux_index_ind6" +DETAIL: Key (c1)=(1) is duplicated. +-- Drop main index +DROP INDEX aux_index_ind6; +-- Make sure auxiliary index dropped too +\d aux_index_tab5 + Table "public.aux_index_tab5" + Column | Type | Collation | Nullable | Default +--------+---------+-----------+----------+--------- + c1 | integer | | | + DROP TABLE aux_index_tab5; -- Check handling of indexes with expressions and predicates. The -- definitions of the rebuilt indexes should match the original diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql index 2cff1ac29be8..e1464eaa67cf 100644 --- a/src/test/regress/sql/create_index.sql +++ b/src/test/regress/sql/create_index.sql @@ -1340,11 +1340,62 @@ REINDEX INDEX aux_index_ind6_ccaux; -- Concurrently also REINDEX INDEX CONCURRENTLY aux_index_ind6_ccaux; -- This makes the previous failure go away, so the index can become valid. -DELETE FROM concur_reindex_tab4 WHERE c1 = 1; +DELETE FROM aux_index_tab5 WHERE c1 = 1; +-- Should be skipped during reindex and dropped +REINDEX INDEX aux_index_ind6; +-- Make sure aux index is dropped +\d aux_index_tab5 +DROP INDEX aux_index_ind6; + +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +-- This makes the previous failure go away, so the index can become valid. +DELETE FROM aux_index_tab5 WHERE c1 = 1; -- Should be skipped during reindex -REINDEX TABLE aux_index_tab5; --- Should be skipped during concurrent reindex REINDEX TABLE CONCURRENTLY aux_index_tab5; +-- Make sure it is still exists +\d aux_index_tab5 +-- Should be skipped during reindex and dropped +REINDEX TABLE aux_index_tab5; +-- Make sure aux index is dropped +\d aux_index_tab5 +DROP INDEX aux_index_ind6; + +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +-- This makes the previous failure go away, so the index can become valid. +DELETE FROM aux_index_tab5 WHERE c1 = 1; +-- Should be skipped during reindex and dropped +REINDEX INDEX CONCURRENTLY aux_index_ind6; +-- Make sure aux index is dropped +\d aux_index_tab5 +DROP INDEX aux_index_ind6; + +-- Insert duplicates again +INSERT INTO aux_index_tab5 VALUES (1), (1); +-- Create invalid index again +CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1); +-- This makes the previous failure go away, so the index can become valid. 
+DELETE FROM aux_index_tab5 WHERE c1 = 1;
+-- Drop main index CONCURRENTLY
+DROP INDEX CONCURRENTLY aux_index_ind6;
+-- Make sure auxiliary index dropped too
+\d aux_index_tab5
+DROP INDEX aux_index_ind6;
+
+-- Insert duplicates again
+INSERT INTO aux_index_tab5 VALUES (1), (1);
+-- Create invalid index again
+CREATE UNIQUE INDEX CONCURRENTLY aux_index_ind6 ON aux_index_tab5 (c1);
+-- Drop main index
+DROP INDEX aux_index_ind6;
+-- Make sure auxiliary index dropped too
+\d aux_index_tab5
+
 DROP TABLE aux_index_tab5;
 
 -- Check handling of indexes with expressions and predicates.  The

From 2b08aff27d2d97b97eb32231e91e62bb169ef5ae Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Mon, 30 Dec 2024 16:37:12 +0100
Subject: [PATCH 10/12] Optimize auxiliary index handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skip unnecessary computations for auxiliary indexes by:
- in the index-insert path, detect auxiliary indexes and bypass Datum
  value computation
- set indexUnchanged=false for auxiliary indexes to avoid redundant
  checks

These optimizations reduce overhead during concurrent index operations.
---
 src/backend/catalog/index.c         | 11 +++++++++++
 src/backend/executor/execIndexing.c | 11 +++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index bf0bb79474b6..d1b96703bbc9 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -2932,6 +2932,17 @@ FormIndexDatum(IndexInfo *indexInfo,
 	ListCell   *indexpr_item;
 	int			i;
 
+	/* An auxiliary index does not need any values to be computed */
+	if (unlikely(indexInfo->ii_Auxiliary))
+	{
+		for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++)
+		{
+			values[i] = PointerGetDatum(NULL);
+			isnull[i] = true;
+		}
+		return;
+	}
+
 	if (indexInfo->ii_Expressions != NIL &&
 		indexInfo->ii_ExpressionsState == NIL)
 	{
diff --git a/src/backend/executor/execIndexing.c b/src/backend/executor/execIndexing.c
index 499cba145dd4..c8b51e2725cf 100644
--- a/src/backend/executor/execIndexing.c
+++ b/src/backend/executor/execIndexing.c
@@ -440,11 +440,14 @@ ExecInsertIndexTuples(ResultRelInfo *resultRelInfo,
 		 * There's definitely going to be an index_insert() call for this
 		 * index.  If we're being called as part of an UPDATE statement,
 		 * consider if the 'indexUnchanged' = true hint should be passed.
+		 *
+		 * For an auxiliary index, always pass false as an optimization.
 		 */
-		indexUnchanged = update && index_unchanged_by_update(resultRelInfo,
-															 estate,
-															 indexInfo,
-															 indexRelation);
+		indexUnchanged = update && likely(!indexInfo->ii_Auxiliary) &&
+			index_unchanged_by_update(resultRelInfo,
+									  estate,
+									  indexInfo,
+									  indexRelation);
 
 		satisfiesConstraint =
 			index_insert(indexRelation, /* index relation */

From fd4d927858aa162e98b5e59ccde509704f7f890f Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Mon, 21 Apr 2025 14:18:32 +0200
Subject: [PATCH 11/12] Refresh snapshot periodically during index validation

Enhance the validation phase of concurrently built indexes by
periodically refreshing snapshots rather than using a single reference
snapshot. This addresses issues with xmin propagation during
long-running validations.

The validation now takes a fresh snapshot every few pages, allowing the
xmin horizon to advance. This restores the behavior of commit
d9d076222f5b, which was reverted in commit e28bb8851969. The new
STIR-based approach no longer depends on a single reference snapshot.
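One way to observe the effect is to watch the builder's advertised xmin from
another session while a large build runs (a sketch; pg_stat_activity is
standard, but the exact refresh cadence depends on the page counter used by
the validation scan):

    -- run repeatedly during CREATE INDEX CONCURRENTLY; backend_xmin
    -- should now advance periodically instead of staying pinned
    SELECT pid, backend_xmin, state
    FROM pg_stat_activity
    WHERE query LIKE 'CREATE INDEX CONCURRENTLY%';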
---
 doc/src/sgml/ref/create_index.sgml            | 11 ++-
 doc/src/sgml/ref/reindex.sgml                 | 11 ++-
 src/backend/access/heap/README.HOT            |  4 +-
 src/backend/access/heap/heapam_handler.c      | 77 ++++++++++++++++---
 src/backend/access/nbtree/nbtsort.c           |  2 +-
 src/backend/access/spgist/spgvacuum.c         | 12 ++-
 src/backend/catalog/index.c                   | 42 +++++++---
 src/backend/commands/indexcmds.c              | 50 ++----------
 src/include/access/tableam.h                  |  7 +-
 src/include/access/transam.h                  | 15 ++++
 src/include/catalog/index.h                   |  2 +-
 .../expected/cic_reset_snapshots.out          | 28 +++++++
 .../sql/cic_reset_snapshots.sql               |  1 +
 13 files changed, 179 insertions(+), 83 deletions(-)

diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml
index 298a093f5545..6220a80474f1 100644
--- a/doc/src/sgml/ref/create_index.sgml
+++ b/doc/src/sgml/ref/create_index.sgml
@@ -881,9 +881,14 @@ Indexes:
 
-   Like any long-running transaction, CREATE INDEX on a
-   table can affect which tuples can be removed by concurrent
-   VACUUM on any other table.
+   Due to the improved implementation using periodically refreshed snapshots
+   and auxiliary indexes, concurrent index builds have minimal impact on
+   concurrent VACUUM operations. The system automatically
+   advances its internal transaction horizon during the build process, allowing
+   VACUUM to remove dead tuples on other tables without
+   having to wait for the entire index build to complete. Only during the very
+   brief periods when snapshots are being refreshed might there be any
+   temporary effect on concurrent VACUUM operations.
 
diff --git a/doc/src/sgml/ref/reindex.sgml b/doc/src/sgml/ref/reindex.sgml
index d62791ff9c38..60f4d0d680f0 100644
--- a/doc/src/sgml/ref/reindex.sgml
+++ b/doc/src/sgml/ref/reindex.sgml
@@ -502,10 +502,13 @@ Indexes:
 
-   Like any long-running transaction, REINDEX on a table
-   can affect which tuples can be removed by concurrent
-   VACUUM on any other table.
-  
+   REINDEX CONCURRENTLY has minimal
+   impact on which tuples can be removed by concurrent VACUUM
+   operations on other tables. This is achieved through periodic snapshot
+   refreshes and the use of auxiliary indexes during the rebuild process,
+   allowing the system to advance its transaction horizon regularly rather than
+   maintaining a single long-running snapshot.
+  
 
   
   REINDEX SYSTEM does not support
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
index 6f718feb6d52..d41609c97cd6 100644
--- a/src/backend/access/heap/README.HOT
+++ b/src/backend/access/heap/README.HOT
@@ -401,12 +401,12 @@ use the key value from the live tuple.
 
 We mark the index open for inserts (but still not ready for reads) then
 we again wait for transactions which have the table open.  Then validate
 the index.  This searches for tuples missing from the index in auxiliary
-index, and inserts any missing ones if them visible to reference snapshot.
+index, and inserts any missing ones if they are visible to a fresh snapshot.
 Again, the index entries have to have TIDs equal to HOT-chain root TIDs,
 but the value to be inserted is the one from the live tuple.
 
 Then we wait until every transaction that could have a snapshot older than
-the second reference snapshot is finished.  This ensures that nobody is
+the latest used snapshot is finished. 
This ensures that nobody is
 alive any longer who could need to see any tuples that might be missing from
 the index, as well as ensuring that no one can see any inconsistent rows in a
 broken HOT chain (the first condition is stronger than the
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index f592b09ec689..7c90650f58fc 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -2034,23 +2034,26 @@ heapam_index_validate_scan_read_stream_next(
 	return result;
 }
 
-static void
+static TransactionId
 heapam_index_validate_scan(Relation heapRelation,
 						   Relation indexRelation,
 						   IndexInfo *indexInfo,
-						   Snapshot snapshot,
 						   ValidateIndexState *state,
 						   ValidateIndexState *auxState)
 {
+	TransactionId limitXmin;
+
 	Datum		values[INDEX_MAX_KEYS];
 	bool		isnull[INDEX_MAX_KEYS];
+	Snapshot	snapshot;
 	TupleTableSlot *slot;
 	EState	   *estate;
 	ExprContext *econtext;
 	BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 
-	int			num_to_check;
+	int			num_to_check,
+				page_read_counter = 1; /* set to 1 to skip snapshot reset at start */
 	Tuplestorestate *tuples_for_check;
 	ValidateIndexScanState callback_private_data;
 
@@ -2061,14 +2064,16 @@ heapam_index_validate_scan(Relation heapRelation,
 	/* Use 10% of memory for tuple store. */
 	int			store_work_mem_part = maintenance_work_mem / 10;
 
-	/*
-	 * Encode TIDs as int8 values for the sort, rather than directly sorting
-	 * item pointers.  This can be significantly faster, primarily because TID
-	 * is a pass-by-reference type on all platforms, whereas int8 is
-	 * pass-by-value on most platforms.
-	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
+
 	tuples_for_check = tuplestore_begin_datum(INT8OID, false, false,
 											  store_work_mem_part);
+	PopActiveSnapshot();
+	InvalidateCatalogSnapshot();
+
+	Assert(!HaveRegisteredOrActiveSnapshot());
+	Assert(!TransactionIdIsValid(MyProc->xmin));
+
 	/*
 	 * sanity checks
 	 */
@@ -2084,6 +2089,29 @@ heapam_index_validate_scan(Relation heapRelation,
 
 	state->tuplesort = auxState->tuplesort = NULL;
 
+	/*
+	 * Now take the first snapshot that will be used to filter candidate
+	 * tuples.  We are going to replace it with a newer snapshot every so
+	 * often to propagate the xmin horizon.
+	 *
+	 * Beware!  There might still be snapshots in use that treat some
+	 * transaction as in-progress that our temporary snapshot treats as
+	 * committed.
+	 *
+	 * If such a recently-committed transaction deleted tuples in the table,
+	 * we will not include them in the index; yet those transactions which
+	 * see the deleting one as still-in-progress will expect such tuples to
+	 * be there once we mark the index as valid.
+	 *
+	 * We solve this by waiting for all endangered transactions to exit before
+	 * we mark the index as valid; that is why limitXmin is returned.
+	 *
+	 * We also set ActiveSnapshot to this snap, since functions in indexes may
+	 * need a snapshot.
+	 */
+	snapshot = RegisterSnapshot(GetLatestSnapshot());
+	PushActiveSnapshot(snapshot);
+	limitXmin = snapshot->xmin;
+
 	estate = CreateExecutorState();
 	econtext = GetPerTupleExprContext(estate);
 	slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
@@ -2117,6 +2145,7 @@ heapam_index_validate_scan(Relation heapRelation,
 		LockBuffer(buf, BUFFER_LOCK_SHARE);
 
 		block_number = BufferGetBlockNumber(buf);
+		page_read_counter++;
 
 		i = 0;
 		while ((off = tuples[i]) != InvalidOffsetNumber)
@@ -2172,6 +2201,20 @@ heapam_index_validate_scan(Relation heapRelation,
 		}
 
 		ReleaseBuffer(buf);
+
+#define VALIDATE_INDEX_RESET_SNAPSHOT_EACH_N_PAGE 4096
+		if (page_read_counter % VALIDATE_INDEX_RESET_SNAPSHOT_EACH_N_PAGE == 0)
+		{
+			PopActiveSnapshot();
+			UnregisterSnapshot(snapshot);
+			/* to make sure we propagate xmin */
+			InvalidateCatalogSnapshot();
+			Assert(!TransactionIdIsValid(MyProc->xmin));
+
+			snapshot = RegisterSnapshot(GetLatestSnapshot());
+			PushActiveSnapshot(snapshot);
+			/* xmin should not go backwards, but just in case */
+			limitXmin = TransactionIdNewer(limitXmin, snapshot->xmin);
+		}
 	}
 
 	ExecDropSingleTupleTableSlot(slot);
@@ -2181,9 +2224,25 @@ heapam_index_validate_scan(Relation heapRelation,
 	read_stream_end(read_stream);
 	tuplestore_end(tuples_for_check);
 
+	/*
+	 * Drop the latest snapshot.  We must do this before waiting out other
+	 * snapshot holders, else we will deadlock against other processes also
+	 * doing CREATE INDEX CONCURRENTLY, which would see our snapshot as one
+	 * they must wait for.
+	 */
+	PopActiveSnapshot();
+	UnregisterSnapshot(snapshot);
+	InvalidateCatalogSnapshot();
+	Assert(MyProc->xmin == InvalidTransactionId);
+#if USE_INJECTION_POINTS
+	if (MyProc->xid == InvalidTransactionId)
+		INJECTION_POINT("heapam_index_validate_scan_no_xid", NULL);
+#endif
 
 	/* These may have been pointing to the now-gone estate */
 	indexInfo->ii_ExpressionsState = NIL;
 	indexInfo->ii_PredicateState = NULL;
+
+	return limitXmin;
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 08a3cb283482..250d9d59b9ab 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -444,7 +444,7 @@ _bt_spools_heapscan(Relation heap, Relation index, BTBuildState *buildstate,
 	 * dead tuples) won't get very full, so we give it only work_mem.
 	 *
 	 * In case of concurrent build dead tuples are not need to be put into index
-	 * since we wait for all snapshots older than reference snapshot during the
+	 * since we wait for all snapshots older than latest snapshot during the
 	 * validation phase.
 	 */
 	if (indexInfo->ii_Unique && !indexInfo->ii_Concurrent)
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
index 2678f7ab7829..968a8f7725c4 100644
--- a/src/backend/access/spgist/spgvacuum.c
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -191,14 +191,16 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer,
 			 * Add target TID to pending list if the redirection could have
 			 * happened since VACUUM started.  (If xid is invalid, assume it
 			 * must have happened before VACUUM started, since REINDEX
-			 * CONCURRENTLY locks out VACUUM.)
+			 * CONCURRENTLY locks out VACUUM; if myXmin is invalid, this is a
+			 * validation scan.)
 			 *
 			 * Note: we could make a tighter test by seeing if the xid is
 			 * "running" according to the active snapshot; but snapmgr.c
 			 * doesn't currently export a suitable API, and it's not entirely
 			 * clear that a tighter test is worth the cycles anyway.
 */
-			if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
+			if (!TransactionIdIsValid(bds->myXmin) ||
+				TransactionIdFollowsOrEquals(dt->xid, bds->myXmin))
 				spgAddPendingTID(bds, &dt->pointer);
 		}
 		else
@@ -808,7 +810,6 @@ spgvacuumscan(spgBulkDeleteState *bds)
 	/* Finish setting up spgBulkDeleteState */
 	initSpGistState(&bds->spgstate, index);
 	bds->pendingList = NULL;
-	bds->myXmin = GetActiveSnapshot()->xmin;
 	bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO;
 
 	/*
@@ -959,6 +960,10 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	bds.stats = stats;
 	bds.callback = callback;
 	bds.callback_state = callback_state;
+	if (info->validate_index)
+		bds.myXmin = InvalidTransactionId;
+	else
+		bds.myXmin = GetActiveSnapshot()->xmin;
 
 	spgvacuumscan(&bds);
 
@@ -999,6 +1004,7 @@ spgvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	bds.stats = stats;
 	bds.callback = dummy_callback;
 	bds.callback_state = NULL;
+	bds.myXmin = GetActiveSnapshot()->xmin;
 
 	spgvacuumscan(&bds);
 }
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index d1b96703bbc9..e707b012f415 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -3534,8 +3534,9 @@ IndexCheckExclusion(Relation heapRelation,
 * insert their new tuples into it. At the same moment we clear "indisready" for
 * auxiliary index, since it is no more required to be updated.
 *
- * We then take a new reference snapshot, any tuples that are valid according
- * to this snap, but are not in the index, must be added to the index.
+ * We then take a new snapshot; any tuples that are valid according to this
+ * snap but are not in the index must be added to the index.  To propagate
+ * xmin, we replace that snapshot every so often.
 * (Any tuples committed live after the snap will be inserted into the
 * index by their originating transaction.  Any tuples committed dead before
 * the snap need not be indexed, because we will wait out all transactions
@@ -3548,7 +3549,7 @@ IndexCheckExclusion(Relation heapRelation,
 * TIDs of both auxiliary and target indexes, and doing a "merge join" against
 * the TID lists to see which tuples from auxiliary index are missing from the
 * target index. Thus we will ensure that all tuples valid according to the
-* reference snapshot are in the index. Notice we need to do bulkdelete in the
+* latest snapshot are in the index. Notice we need to do bulkdelete in the
 * particular order: auxiliary first, target last.
 *
 * Building a unique index this way is tricky: we might try to insert a
@@ -3569,13 +3570,14 @@ IndexCheckExclusion(Relation heapRelation,
 *
 * Also, some actions to concurrent drop the auxiliary index are performed.
 */
-void
-validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot)
+TransactionId
+validate_index(Oid heapId, Oid indexId, Oid auxIndexId)
 {
 	Relation	heapRelation,
 				indexRelation,
 				auxIndexRelation;
 	IndexInfo  *indexInfo;
+	TransactionId limitXmin;
 	IndexVacuumInfo ivinfo,
 				auxivinfo;
 	ValidateIndexState state,
 				auxState;
 	Oid			save_userid;
@@ -3625,8 +3627,12 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId)
 	 * Fetch info needed for index_insert.  (You might think this should be
 	 * passed in from DefineIndex, but its copy is long gone due to having
 	 * been built in a previous transaction.)
+	 *
+	 * We might need a snapshot for index expressions or predicates.
*/ + PushActiveSnapshot(GetTransactionSnapshot()); indexInfo = BuildIndexInfo(indexRelation); + PopActiveSnapshot(); /* mark build is concurrent just for consistency */ indexInfo->ii_Concurrent = true; @@ -3662,6 +3668,9 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) NULL, TUPLESORT_NONE); auxState.htups = auxState.itups = auxState.tups_inserted = 0; + /* tuplesort_begin_datum may require catalog snapshot */ + InvalidateCatalogSnapshot(); + (void) index_bulk_delete(&auxivinfo, NULL, validate_index_callback, &auxState); @@ -3671,6 +3680,9 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) NULL, TUPLESORT_NONE); state.htups = state.itups = state.tups_inserted = 0; + /* tuplesort_begin_datum may require catalog snapshot */ + InvalidateCatalogSnapshot(); + /* ambulkdelete updates progress metrics */ (void) index_bulk_delete(&ivinfo, NULL, validate_index_callback, &state); @@ -3690,19 +3702,24 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) pgstat_progress_update_multi_param(3, progress_index, progress_vals); } tuplesort_performsort(state.tuplesort); + /* tuplesort_performsort may require catalog snapshot */ + InvalidateCatalogSnapshot(); + tuplesort_performsort(auxState.tuplesort); + /* tuplesort_performsort may require catalog snapshot */ + InvalidateCatalogSnapshot(); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* * Now merge both indexes */ pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_VALIDATE_IDXMERGE); - table_index_validate_scan(heapRelation, - indexRelation, - indexInfo, - snapshot, - &state, - &auxState); + limitXmin = table_index_validate_scan(heapRelation, + indexRelation, + indexInfo, + &state, + &auxState); /* Tuple sort closed by table_index_validate_scan */ Assert(state.tuplesort == NULL && auxState.tuplesort == NULL); @@ -3725,6 +3742,9 @@ validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot) index_close(auxIndexRelation, NoLock); index_close(indexRelation, NoLock); table_close(heapRelation, NoLock); + + Assert(!TransactionIdIsValid(MyProc->xmin)); + return limitXmin; } /* diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 3b25068dad38..3290f4ed120e 100644 --- a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -592,7 +592,6 @@ DefineIndex(Oid tableId, LockRelId heaprelid; LOCKTAG heaplocktag; LOCKMODE lockmode; - Snapshot snapshot; Oid root_save_userid; int root_save_sec_context; int root_save_nestlevel; @@ -1794,32 +1793,11 @@ DefineIndex(Oid tableId, /* Tell concurrent index builds to ignore us, if index qualifies */ if (safe_index) set_indexsafe_procflags(); - - /* - * Now take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. Beware! There might still be snapshots in - * use that treat some transaction as in-progress that our reference - * snapshot treats as committed. If such a recently-committed transaction - * deleted tuples in the table, we will not include them in the index; yet - * those transactions which see the deleting one as still-in-progress will - * expect such tuples to be there once we mark the index as valid. - * - * We solve this by waiting for all endangered transactions to exit before - * we mark the index as valid. - * - * We also set ActiveSnapshot to this snap, since functions in indexes may - * need a snapshot. 
- */ - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); /* * Merge content of auxiliary and target indexes - insert any missing index entries. */ - validate_index(tableId, indexRelationId, auxIndexRelationId, snapshot); - limitXmin = snapshot->xmin; + limitXmin = validate_index(tableId, indexRelationId, auxIndexRelationId); - PopActiveSnapshot(); - UnregisterSnapshot(snapshot); /* * The snapshot subsystem could still contain registered snapshots that * are holding back our process's advertised xmin; in particular, if @@ -1841,8 +1819,8 @@ DefineIndex(Oid tableId, /* * The index is now valid in the sense that it contains all currently * interesting tuples. But since it might not contain tuples deleted just - * before the reference snap was taken, we have to wait out any - * transactions that might have older snapshots. + * before the last snapshot during validating was taken, we have to wait + * out any transactions that might have older snapshots. */ INJECTION_POINT("define_index_before_set_valid", NULL); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, @@ -4348,7 +4326,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein { ReindexIndexInfo *newidx = lfirst(lc); TransactionId limitXmin; - Snapshot snapshot; StartTransactionCommand(); @@ -4363,13 +4340,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein if (newidx->safe) set_indexsafe_procflags(); - /* - * Take the "reference snapshot" that will be used by validate_index() - * to filter candidate tuples. - */ - snapshot = RegisterSnapshot(GetTransactionSnapshot()); - PushActiveSnapshot(snapshot); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4381,16 +4351,8 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein progress_vals[3] = newidx->amId; pgstat_progress_update_multi_param(4, progress_index, progress_vals); - validate_index(newidx->tableId, newidx->indexId, newidx->auxIndexId, snapshot); - - /* - * We can now do away with our active snapshot, we still need to save - * the xmin limit to wait for older snapshots. - */ - limitXmin = snapshot->xmin; - - PopActiveSnapshot(); - UnregisterSnapshot(snapshot); + limitXmin = validate_index(newidx->tableId, newidx->indexId, newidx->auxIndexId); + Assert(!TransactionIdIsValid(MyProc->xmin)); /* * To ensure no deadlocks, we must commit and start yet another @@ -4403,7 +4365,7 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein /* * The index is now valid in the sense that it contains all currently * interesting tuples. But since it might not contain tuples deleted - * just before the reference snap was taken, we have to wait out any + * just before the latest snap was taken, we have to wait out any * transactions that might have older snapshots. 
* * Because we don't take a snapshot or Xid in this transaction, diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index 6c43f47814d1..d38a69610356 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -708,10 +708,9 @@ typedef struct TableAmRoutine TableScanDesc scan); /* see table_index_validate_scan for reference about parameters */ - void (*index_validate_scan) (Relation table_rel, + TransactionId (*index_validate_scan) (Relation table_rel, Relation index_rel, struct IndexInfo *index_info, - Snapshot snapshot, struct ValidateIndexState *state, struct ValidateIndexState *aux_state); @@ -1825,18 +1824,16 @@ table_index_build_range_scan(Relation table_rel, * Note: it is responsibility of that function to close sortstates in * both `state` and `auxstate`. */ -static inline void +static inline TransactionId table_index_validate_scan(Relation table_rel, Relation index_rel, struct IndexInfo *index_info, - Snapshot snapshot, struct ValidateIndexState *state, struct ValidateIndexState *auxstate) { return table_rel->rd_tableam->index_validate_scan(table_rel, index_rel, index_info, - snapshot, state, auxstate); } diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 7d82cd2eb562..15e345c7a193 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -355,6 +355,21 @@ NormalTransactionIdOlder(TransactionId a, TransactionId b) return b; } +/* return the newer of the two IDs */ +static inline TransactionId +TransactionIdNewer(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdFollows(a, b)) + return a; + return b; +} + /* return the newer of the two IDs */ static inline FullTransactionId FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 53b2b13efc39..8fe0acc1e6b7 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -154,7 +154,7 @@ extern void index_build(Relation heapRelation, bool isreindex, bool parallel); -extern void validate_index(Oid heapId, Oid indexId, Oid auxIndexId, Snapshot snapshot); +extern TransactionId validate_index(Oid heapId, Oid indexId, Oid auxIndexId); extern void index_set_state_flags(Oid indexId, IndexStateFlagsAction action); diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out index 9f03fa3033ce..780313f477b7 100644 --- a/src/test/modules/injection_points/expected/cic_reset_snapshots.out +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -23,6 +23,12 @@ SELECT injection_points_attach('table_parallelscan_initialize', 'notice'); (1 row) +SELECT injection_points_attach('heapam_index_validate_scan_no_xid', 'notice'); + injection_points_attach +------------------------- + +(1 row) + CREATE SCHEMA cic_reset_snap; CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); @@ -43,30 +49,38 @@ ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY 
cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots @@ -76,9 +90,11 @@ DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; -- The same in parallel mode ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); @@ -91,23 +107,31 @@ SELECT injection_points_detach('heap_reset_scan_snapshot_effective'); CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point 
heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); NOTICE: notice triggered for injection point table_parallelscan_initialize @@ -116,13 +140,17 @@ NOTICE: notice triggered for injection point table_parallelscan_initialize DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i DESC NULLS LAST); NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; NOTICE: notice triggered for injection point table_parallelscan_initialize +NOTICE: notice triggered for injection point heapam_index_validate_scan_no_xid DROP INDEX CONCURRENTLY cic_reset_snap.idx; DROP SCHEMA cic_reset_snap CASCADE; NOTICE: drop cascades to 3 other objects diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql index 2941aa7ae389..249d1061adae 100644 --- a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -4,6 +4,7 @@ SELECT injection_points_set_local(); SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); SELECT 
injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice');
 SELECT injection_points_attach('table_parallelscan_initialize', 'notice');
+SELECT injection_points_attach('heapam_index_validate_scan_no_xid', 'notice');
 
 CREATE SCHEMA cic_reset_snap;
 CREATE TABLE cic_reset_snap.tbl(i int primary key, j int);

From 52f91f2b43291c5911e2e8bb290b57d35fbed456 Mon Sep 17 00:00:00 2001
From: Mikhail Nikalayeu
Date: Tue, 31 Dec 2024 14:24:48 +0100
Subject: [PATCH 12/12] Remove PROC_IN_SAFE_IC optimization

This optimization allowed processes running CREATE INDEX CONCURRENTLY or
REINDEX CONCURRENTLY on indexes without expressions and without predicates
to be ignored by other concurrent index builds while they wait out older
snapshots.

With the new snapshot handling approach that periodically refreshes
snapshots, this optimization is no longer necessary.

The change simplifies concurrent index build code by:
- removing the PROC_IN_SAFE_IC process status flag
- eliminating set_indexsafe_procflags() calls and related logic
- removing the PROC_IN_SAFE_IC special case from GetCurrentVirtualXIDs() callers
- removing related test cases and injection points
---
 src/backend/access/brin/brin.c                |   6 +-
 src/backend/access/gin/gininsert.c            |   6 +-
 src/backend/access/nbtree/nbtsort.c           |   6 +-
 src/backend/commands/indexcmds.c              | 142 +-----------------
 src/include/storage/proc.h                    |   8 +-
 src/test/modules/injection_points/Makefile    |   2 +-
 .../expected/reindex_conc.out                 |  51 -------
 src/test/modules/injection_points/meson.build |   1 -
 .../injection_points/sql/reindex_conc.sql     |  28 ----
 9 files changed, 13 insertions(+), 237 deletions(-)
 delete mode 100644 src/test/modules/injection_points/expected/reindex_conc.out
 delete mode 100644 src/test/modules/injection_points/sql/reindex_conc.sql
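The practical effect of the removal is easiest to see in WaitForOlderSnapshots():
the ignore mask passed to GetCurrentVirtualXIDs() loses PROC_IN_SAFE_IC, so
backends running CREATE INDEX CONCURRENTLY or REINDEX CONCURRENTLY are now
waited out like any other snapshot holder. A minimal sketch of the resulting
call; the declarations are repeated here only for context, and the
authoritative change is in the indexcmds.c hunks below:

	VirtualTransactionId *old_snapshots;
	int			n_old_snapshots;

	/* PROC_IN_SAFE_IC is gone from the ignore mask */
	old_snapshots = GetCurrentVirtualXIDs(limitXmin, true, false,
										  PROC_IS_AUTOVACUUM | PROC_IN_VACUUM,
										  &n_old_snapshots);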
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 947dc79b138d..a59f84a42514 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -2893,11 +2893,9 @@ _brin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 	int			sortmem;
 
 	/*
-	 * The only possible status flag that can be set to the parallel worker is
-	 * PROC_IN_SAFE_IC.
+	 * A parallel worker should not have any status flags set.
 	 */
-	Assert((MyProc->statusFlags == 0) ||
-		   (MyProc->statusFlags == PROC_IN_SAFE_IC));
+	Assert(MyProc->statusFlags == 0);
 
 	/* Set debug_query_string for individual workers first */
 	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c
index 629f6d5f2c0a..df79b5850f90 100644
--- a/src/backend/access/gin/gininsert.c
+++ b/src/backend/access/gin/gininsert.c
@@ -2106,11 +2106,9 @@ _gin_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 	int			sortmem;
 
 	/*
-	 * The only possible status flag that can be set to the parallel worker is
-	 * PROC_IN_SAFE_IC.
+	 * A parallel worker should not have any status flags set.
 	 */
-	Assert((MyProc->statusFlags == 0) ||
-		   (MyProc->statusFlags == PROC_IN_SAFE_IC));
+	Assert(MyProc->statusFlags == 0);
 
 	/* Set debug_query_string for individual workers first */
 	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index 250d9d59b9ab..f80379618b2a 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1910,11 +1910,9 @@ _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc)
 #endif							/* BTREE_BUILD_STATS */
 
 	/*
-	 * The only possible status flag that can be set to the parallel worker is
-	 * PROC_IN_SAFE_IC.
+	 * A parallel worker should not have any status flags set.
 	 */
-	Assert((MyProc->statusFlags == 0) ||
-		   (MyProc->statusFlags == PROC_IN_SAFE_IC));
+	Assert(MyProc->statusFlags == 0);
 
 	/* Set debug_query_string for individual workers first */
 	sharedquery = shm_toc_lookup(toc, PARALLEL_KEY_QUERY_TEXT, true);
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 3290f4ed120e..d36a71fd03d3 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -115,7 +115,6 @@ static bool ReindexRelationConcurrently(const ReindexStmt *stmt,
 											Oid relationOid,
 											const ReindexParams *params);
 static void update_relispartition(Oid relationId, bool newval);
-static inline void set_indexsafe_procflags(void);
 
 /*
  * callback argument type for RangeVarCallbackForReindexIndex()
@@ -418,10 +417,7 @@ CompareOpclassOptions(const Datum *opts1, const Datum *opts2, int natts)
  * lazy VACUUMs, because they won't be fazed by missing index entries
  * either.  (Manual ANALYZEs, however, can't be excluded because they
  * might be within transactions that are going to do arbitrary operations
- * later.)  Processes running CREATE INDEX CONCURRENTLY or REINDEX CONCURRENTLY
- * on indexes that are neither expressional nor partial are also safe to
- * ignore, since we know that those processes won't examine any data
- * outside the table they're indexing.
+ * later.)
  *
  * Also, GetCurrentVirtualXIDs never reports our own vxid, so we need not
  * check for that.
@@ -442,8 +438,7 @@ WaitForOlderSnapshots(TransactionId limitXmin, bool progress)
 	VirtualTransactionId *old_snapshots;
 
 	old_snapshots = GetCurrentVirtualXIDs(limitXmin, true, false,
-										  PROC_IS_AUTOVACUUM | PROC_IN_VACUUM
-										  | PROC_IN_SAFE_IC,
+										  PROC_IS_AUTOVACUUM | PROC_IN_VACUUM,
 										  &n_old_snapshots);
 	if (progress)
 		pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, n_old_snapshots);
@@ -463,8 +458,7 @@ WaitForOlderSnapshots(TransactionId limitXmin, bool progress)
 
 			newer_snapshots = GetCurrentVirtualXIDs(limitXmin,
 													true, false,
-													PROC_IS_AUTOVACUUM | PROC_IN_VACUUM
-													| PROC_IN_SAFE_IC,
+													PROC_IS_AUTOVACUUM | PROC_IN_VACUUM,
 													&n_newer_snapshots);
 			for (j = i; j < n_old_snapshots; j++)
 			{
@@ -578,7 +572,6 @@ DefineIndex(Oid tableId,
 	amoptions_function amoptions;
 	bool		exclusion;
 	bool		partitioned;
-	bool		safe_index;
 	Datum		reloptions;
 	int16	   *coloptions;
 	IndexInfo  *indexInfo;
@@ -1181,10 +1174,6 @@ DefineIndex(Oid tableId,
 		}
 	}
 
-	/* Is index safe for others to ignore?  See set_indexsafe_procflags() */
-	safe_index = indexInfo->ii_Expressions == NIL &&
-		indexInfo->ii_Predicate == NIL;
-
 	/*
 	 * Report index creation if appropriate (delay this till after most of the
 	 * error checks)
@@ -1671,10 +1660,6 @@ DefineIndex(Oid tableId,
 	CommitTransactionCommand();
 	StartTransactionCommand();
 
-	/* Tell concurrent index builds to ignore us, if index qualifies */
-	if (safe_index)
-		set_indexsafe_procflags();
-
 	/*
 	 * The index is now visible, so we can report the OID.  While on it,
 	 * include the report for the beginning of phase 2.
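With set_indexsafe_procflags() gone from the phase transitions, the waiting
logic rests entirely on the TransactionId now returned by validate_index()
(see the index.h and tableam.h hunks in the preceding patch). A hedged sketch
of the assumed caller shape, not code from this patch; indexRelationId and
auxIndexRelationId are placeholder names for whatever variables the build
actually uses:

	TransactionId limitXmin;

	/* assumed: validate_index() reports the XID horizon to wait out */
	limitXmin = validate_index(tableId, indexRelationId, auxIndexRelationId);

	/* wait out transactions that might hold snapshots older than that */
	if (TransactionIdIsValid(limitXmin))
		WaitForOlderSnapshots(limitXmin, true);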
@@ -1729,9 +1714,6 @@ DefineIndex(Oid tableId, CommitTransactionCommand(); StartTransactionCommand(); - /* Tell concurrent index builds to ignore us, if index qualifies */ - if (safe_index) - set_indexsafe_procflags(); pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_2); /* @@ -1761,10 +1743,6 @@ DefineIndex(Oid tableId, CommitTransactionCommand(); StartTransactionCommand(); - /* Tell concurrent index builds to ignore us, if index qualifies */ - if (safe_index) - set_indexsafe_procflags(); - /* * Phase 3 of concurrent index build * @@ -1790,9 +1768,7 @@ DefineIndex(Oid tableId, CommitTransactionCommand(); StartTransactionCommand(); - /* Tell concurrent index builds to ignore us, if index qualifies */ - if (safe_index) - set_indexsafe_procflags(); + /* * Merge content of auxiliary and target indexes - insert any missing index entries. */ @@ -1809,9 +1785,6 @@ DefineIndex(Oid tableId, CommitTransactionCommand(); StartTransactionCommand(); - /* Tell concurrent index builds to ignore us, if index qualifies */ - if (safe_index) - set_indexsafe_procflags(); /* We should now definitely not be advertising any xmin. */ Assert(MyProc->xmin == InvalidTransactionId); @@ -1852,10 +1825,6 @@ DefineIndex(Oid tableId, CommitTransactionCommand(); StartTransactionCommand(); - /* Tell concurrent index builds to ignore us, if index qualifies */ - if (safe_index) - set_indexsafe_procflags(); - pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_5); /* Now wait for all transaction to see auxiliary as "non-ready for inserts" */ @@ -1876,10 +1845,6 @@ DefineIndex(Oid tableId, CommitTransactionCommand(); StartTransactionCommand(); - /* Tell concurrent index builds to ignore us, if index qualifies */ - if (safe_index) - set_indexsafe_procflags(); - pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_6); /* Now wait for all transaction to ignore auxiliary because it is dead */ @@ -3620,7 +3585,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein Oid junkAuxIndexId; Oid tableId; Oid amId; - bool safe; /* for set_indexsafe_procflags */ } ReindexIndexInfo; List *heapRelationIds = NIL; List *indexIds = NIL; @@ -3994,17 +3958,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein save_nestlevel = NewGUCNestLevel(); RestrictSearchPath(); - /* determine safety of this index for set_indexsafe_procflags */ - idx->safe = (RelationGetIndexExpressions(indexRel) == NIL && - RelationGetIndexPredicate(indexRel) == NIL); - -#ifdef USE_INJECTION_POINTS - if (idx->safe) - INJECTION_POINT("reindex-conc-index-safe", NULL); - else - INJECTION_POINT("reindex-conc-index-not-safe", NULL); -#endif - idx->tableId = RelationGetRelid(heapRel); idx->amId = indexRel->rd_rel->relam; @@ -4070,7 +4023,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein newidx->indexId = newIndexId; newidx->auxIndexId = auxIndexId; newidx->junkAuxIndexId = junkAuxIndexId; - newidx->safe = idx->safe; newidx->tableId = idx->tableId; newidx->amId = idx->amId; @@ -4171,11 +4123,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein CommitTransactionCommand(); StartTransactionCommand(); - /* - * Because we don't take a snapshot in this transaction, there's no need - * to set the PROC_IN_SAFE_IC flag here. 
- */ - /* * Phase 2 of REINDEX CONCURRENTLY * @@ -4207,10 +4154,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein */ CHECK_FOR_INTERRUPTS(); - /* Tell concurrent indexing to ignore us, if index qualifies */ - if (newidx->safe) - set_indexsafe_procflags(); - /* Build auxiliary index, it is fast - without any actual heap scan, just an empty index. */ index_concurrently_build(newidx->tableId, newidx->auxIndexId); @@ -4219,11 +4162,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein StartTransactionCommand(); - /* - * Because we don't take a snapshot in this transaction, there's no need - * to set the PROC_IN_SAFE_IC flag here. - */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_2); /* @@ -4248,10 +4186,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein */ CHECK_FOR_INTERRUPTS(); - /* Tell concurrent indexing to ignore us, if index qualifies */ - if (newidx->safe) - set_indexsafe_procflags(); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4271,11 +4205,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein StartTransactionCommand(); - /* - * Because we don't take a snapshot or Xid in this transaction, there's no - * need to set the PROC_IN_SAFE_IC flag here. - */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_3); WaitForLockersMultiple(lockTags, ShareLock, true); @@ -4297,10 +4226,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein */ CHECK_FOR_INTERRUPTS(); - /* Tell concurrent indexing to ignore us, if index qualifies */ - if (newidx->safe) - set_indexsafe_procflags(); - /* * Updating pg_index might involve TOAST table access, so ensure we * have a valid snapshot. @@ -4336,10 +4261,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein */ CHECK_FOR_INTERRUPTS(); - /* Tell concurrent indexing to ignore us, if index qualifies */ - if (newidx->safe) - set_indexsafe_procflags(); - /* * Update progress for the index to build, with the correct parent * table involved. @@ -4367,9 +4288,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein * interesting tuples. But since it might not contain tuples deleted * just before the latest snap was taken, we have to wait out any * transactions that might have older snapshots. - * - * Because we don't take a snapshot or Xid in this transaction, - * there's no need to set the PROC_IN_SAFE_IC flag here. */ pgstat_progress_update_param(PROGRESS_CREATEIDX_PHASE, PROGRESS_CREATEIDX_PHASE_WAIT_4); @@ -4391,13 +4309,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein INJECTION_POINT("reindex_relation_concurrently_before_swap", NULL); StartTransactionCommand(); - /* - * Because this transaction only does catalog manipulations and doesn't do - * any index operations, we can set the PROC_IN_SAFE_IC flag here - * unconditionally. 
- */ - set_indexsafe_procflags(); - forboth(lc, indexIds, lc2, newIndexIds) { ReindexIndexInfo *oldidx = lfirst(lc); @@ -4453,12 +4364,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein CommitTransactionCommand(); StartTransactionCommand(); - /* - * While we could set PROC_IN_SAFE_IC if all indexes qualified, there's no - * real need for that, because we only acquire an Xid after the wait is - * done, and that lasts for a very short period. - */ - /* * Phase 5 of REINDEX CONCURRENTLY * @@ -4522,12 +4427,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const Rein CommitTransactionCommand(); StartTransactionCommand(); - /* - * While we could set PROC_IN_SAFE_IC if all indexes qualified, there's no - * real need for that, because we only acquire an Xid after the wait is - * done, and that lasts for a very short period. - */ - /* * Phase 6 of REINDEX CONCURRENTLY * @@ -4795,36 +4694,3 @@ update_relispartition(Oid relationId, bool newval) table_close(classRel, RowExclusiveLock); } -/* - * Set the PROC_IN_SAFE_IC flag in MyProc->statusFlags. - * - * When doing concurrent index builds, we can set this flag - * to tell other processes concurrently running CREATE - * INDEX CONCURRENTLY or REINDEX CONCURRENTLY to ignore us when - * doing their waits for concurrent snapshots. On one hand it - * avoids pointlessly waiting for a process that's not interesting - * anyway; but more importantly it avoids deadlocks in some cases. - * - * This can be done safely only for indexes that don't execute any - * expressions that could access other tables, so index must not be - * expressional nor partial. Caller is responsible for only calling - * this routine when that assumption holds true. - * - * (The flag is reset automatically at transaction end, so it must be - * set for each transaction.) - */ -static inline void -set_indexsafe_procflags(void) -{ - /* - * This should only be called before installing xid or xmin in MyProc; - * otherwise, concurrent processes could see an Xmin that moves backwards. - */ - Assert(MyProc->xid == InvalidTransactionId && - MyProc->xmin == InvalidTransactionId); - - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - MyProc->statusFlags |= PROC_IN_SAFE_IC; - ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; - LWLockRelease(ProcArrayLock); -} diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 9f9b3fcfbf1d..5e07466c7371 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -56,10 +56,6 @@ struct XidCache */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ -#define PROC_IN_SAFE_IC 0x04 /* currently running CREATE INDEX - * CONCURRENTLY or REINDEX - * CONCURRENTLY on non-expressional, - * non-partial index */ #define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ #define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical * decoding outside xact */ @@ -69,13 +65,13 @@ struct XidCache /* flags reset at EOXact */ #define PROC_VACUUM_STATE_MASK \ - (PROC_IN_VACUUM | PROC_IN_SAFE_IC | PROC_VACUUM_FOR_WRAPAROUND) + (PROC_IN_VACUUM | PROC_VACUUM_FOR_WRAPAROUND) /* * Xmin-related flags. Make sure any flags that affect how the process' Xmin * value is interpreted by VACUUM are included here. 
*/ -#define PROC_XMIN_FLAGS (PROC_IN_VACUUM | PROC_IN_SAFE_IC) +#define PROC_XMIN_FLAGS (PROC_IN_VACUUM) /* * We allow a limited number of "weak" relation locks (AccessShareLock, diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index 19d26408c2a4..82acf3006bdf 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -11,7 +11,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points hashagg reindex_conc cic_reset_snapshots +REGRESS = injection_points hashagg cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic inplace syscache-update-pruned diff --git a/src/test/modules/injection_points/expected/reindex_conc.out b/src/test/modules/injection_points/expected/reindex_conc.out deleted file mode 100644 index db8de4bbe85c..000000000000 --- a/src/test/modules/injection_points/expected/reindex_conc.out +++ /dev/null @@ -1,51 +0,0 @@ --- Tests for REINDEX CONCURRENTLY -CREATE EXTENSION injection_points; --- Check safety of indexes with predicates and expressions. -SELECT injection_points_set_local(); - injection_points_set_local ----------------------------- - -(1 row) - -SELECT injection_points_attach('reindex-conc-index-safe', 'notice'); - injection_points_attach -------------------------- - -(1 row) - -SELECT injection_points_attach('reindex-conc-index-not-safe', 'notice'); - injection_points_attach -------------------------- - -(1 row) - -CREATE SCHEMA reindex_inj; -CREATE TABLE reindex_inj.tbl(i int primary key, updated_at timestamp); -CREATE UNIQUE INDEX ind_simple ON reindex_inj.tbl(i); -CREATE UNIQUE INDEX ind_expr ON reindex_inj.tbl(ABS(i)); -CREATE UNIQUE INDEX ind_pred ON reindex_inj.tbl(i) WHERE mod(i, 2) = 0; -CREATE UNIQUE INDEX ind_expr_pred ON reindex_inj.tbl(abs(i)) WHERE mod(i, 2) = 0; -REINDEX INDEX CONCURRENTLY reindex_inj.ind_simple; -NOTICE: notice triggered for injection point reindex-conc-index-safe -REINDEX INDEX CONCURRENTLY reindex_inj.ind_expr; -NOTICE: notice triggered for injection point reindex-conc-index-not-safe -REINDEX INDEX CONCURRENTLY reindex_inj.ind_pred; -NOTICE: notice triggered for injection point reindex-conc-index-not-safe -REINDEX INDEX CONCURRENTLY reindex_inj.ind_expr_pred; -NOTICE: notice triggered for injection point reindex-conc-index-not-safe --- Cleanup -SELECT injection_points_detach('reindex-conc-index-safe'); - injection_points_detach -------------------------- - -(1 row) - -SELECT injection_points_detach('reindex-conc-index-not-safe'); - injection_points_detach -------------------------- - -(1 row) - -DROP TABLE reindex_inj.tbl; -DROP SCHEMA reindex_inj; -DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index 8476bfe72a7f..bddf22df3ac0 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -36,7 +36,6 @@ tests += { 'sql': [ 'injection_points', 'hashagg', - 'reindex_conc', 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], diff --git a/src/test/modules/injection_points/sql/reindex_conc.sql b/src/test/modules/injection_points/sql/reindex_conc.sql deleted file mode 100644 index 6cf211e6d5dd..000000000000 --- a/src/test/modules/injection_points/sql/reindex_conc.sql +++ /dev/null @@ -1,28 +0,0 @@ --- Tests for 
REINDEX CONCURRENTLY -CREATE EXTENSION injection_points; - --- Check safety of indexes with predicates and expressions. -SELECT injection_points_set_local(); -SELECT injection_points_attach('reindex-conc-index-safe', 'notice'); -SELECT injection_points_attach('reindex-conc-index-not-safe', 'notice'); - -CREATE SCHEMA reindex_inj; -CREATE TABLE reindex_inj.tbl(i int primary key, updated_at timestamp); - -CREATE UNIQUE INDEX ind_simple ON reindex_inj.tbl(i); -CREATE UNIQUE INDEX ind_expr ON reindex_inj.tbl(ABS(i)); -CREATE UNIQUE INDEX ind_pred ON reindex_inj.tbl(i) WHERE mod(i, 2) = 0; -CREATE UNIQUE INDEX ind_expr_pred ON reindex_inj.tbl(abs(i)) WHERE mod(i, 2) = 0; - -REINDEX INDEX CONCURRENTLY reindex_inj.ind_simple; -REINDEX INDEX CONCURRENTLY reindex_inj.ind_expr; -REINDEX INDEX CONCURRENTLY reindex_inj.ind_pred; -REINDEX INDEX CONCURRENTLY reindex_inj.ind_expr_pred; - --- Cleanup -SELECT injection_points_detach('reindex-conc-index-safe'); -SELECT injection_points_detach('reindex-conc-index-not-safe'); -DROP TABLE reindex_inj.tbl; -DROP SCHEMA reindex_inj; - -DROP EXTENSION injection_points;
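For completeness, the semantics of the TransactionIdNewer() helper introduced
in src/include/access/transam.h earlier in the series can be illustrated by
folding a few arbitrary XIDs into the newest valid one. This toy fragment is
illustrative only and appears nowhere in the patches:

	TransactionId xids[] = {InvalidTransactionId, 100, 250, 175};
	TransactionId newest = InvalidTransactionId;
	int			i;

	for (i = 0; i < (int) lengthof(xids); i++)
		newest = TransactionIdNewer(newest, xids[i]);

	/* invalid inputs are skipped; TransactionIdFollows() picks 250 */
	Assert(newest == 250);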