
Commit d5cf2f4

Author: Commitfest Bot (committed)

[CF 4971] v20 - [CREATE|RE] INDEX CONCURRENTLY with single heap scan and short-term resetting snapshots

This branch was automatically generated by a robot using patches from an email thread registered at:
https://commitfest.postgresql.org/patch/4971

The branch will be overwritten each time a new patch version is posted to the thread, and also periodically to check for bitrot caused by changes on the master branch.

Patch(es): https://www.postgresql.org/message-id/CADzfLwW5bDWSxjHK7mqX8Lewki3+5FBydBC+nVcxg4xMGKscyw@mail.gmail.com
Author(s): Michail Nikolaev, Mihail Nikalayeu
2 parents 3c4d755 + 16c4fa4 commit d5cf2f4
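For orientation, these are the commands this patch series targets. A minimal usage sketch (table and index names are illustrative, not taken from the patch):

    -- Build a new index without blocking concurrent writes to the table.
    CREATE INDEX CONCURRENTLY idx ON tbl (i);

    -- Rebuild an existing index the same way, for example after a failed concurrent build.
    REINDEX INDEX CONCURRENTLY idx;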


85 files changed: +3953 / -929 lines

contrib/amcheck/meson.build

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ tests += {
       't/003_cic_2pc.pl',
       't/004_verify_nbtree_unique.pl',
       't/005_pitr.pl',
+      't/006_cic_bt_index_parent_check.pl',
     ],
   },
 }
contrib/amcheck/t/006_cic_bt_index_parent_check.pl

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+# Test bt_index_parent_check with index created with CREATE INDEX CONCURRENTLY
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+use Test::More;
+
+my ($node, $result);
+
+#
+# Test set-up
+#
+$node = PostgreSQL::Test::Cluster->new('CIC_bt_index_parent_check_test');
+$node->init;
+$node->start;
+$node->safe_psql('postgres', q(CREATE EXTENSION amcheck));
+$node->safe_psql('postgres', q(CREATE TABLE tbl(i int primary key)));
+# Insert two rows into index
+$node->safe_psql('postgres', q(INSERT INTO tbl SELECT i FROM generate_series(1, 2) s(i);));
+
+# start background transaction
+my $in_progress_h = $node->background_psql('postgres');
+$in_progress_h->query_safe(q(BEGIN; SELECT pg_current_xact_id();));
+
+# delete one row from table, while background transaction is in progress
+$node->safe_psql('postgres', q(DELETE FROM tbl WHERE i = 1;));
+# create index concurrently, which will skip the deleted row
+$node->safe_psql('postgres', q(CREATE INDEX CONCURRENTLY idx ON tbl(i);));
+
+# check index using bt_index_parent_check
+$result = $node->psql('postgres', q(SELECT bt_index_parent_check('idx', heapallindexed => true)));
+is($result, '0', 'bt_index_parent_check for CIC after removed row');
+
+$in_progress_h->quit;
+done_testing();
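The scenario exercised by this test can also be reproduced by hand in psql. A rough sketch, assuming the amcheck extension and the tbl table from the test above already exist, with the two sessions on separate connections:

    -- Session 1: keep a transaction open so its snapshot predates the delete.
    BEGIN;
    SELECT pg_current_xact_id();

    -- Session 2: delete a row, build the index concurrently, then verify that
    -- every heap tuple visible to the scan is present in the new index.
    DELETE FROM tbl WHERE i = 1;
    CREATE INDEX CONCURRENTLY idx ON tbl (i);
    SELECT bt_index_parent_check('idx', heapallindexed => true);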

contrib/amcheck/verify_nbtree.c

Lines changed: 31 additions & 40 deletions
@@ -382,7 +382,6 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
     BTMetaPageData *metad;
     uint32          previouslevel;
     BtreeLevel      current;
-    Snapshot        snapshot = SnapshotAny;
 
     if (!readonly)
         elog(DEBUG1, "verifying consistency of tree structure for index \"%s\"",
@@ -433,38 +432,35 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
         state->heaptuplespresent = 0;
 
         /*
-         * Register our own snapshot in !readonly case, rather than asking
+         * Register our own snapshot for heapallindexed, rather than asking
          * table_index_build_scan() to do this for us later. This needs to
          * happen before index fingerprinting begins, so we can later be
          * certain that index fingerprinting should have reached all tuples
          * returned by table_index_build_scan().
         */
-        if (!state->readonly)
-        {
-            snapshot = RegisterSnapshot(GetTransactionSnapshot());
+        state->snapshot = RegisterSnapshot(GetTransactionSnapshot());
 
-            /*
-             * GetTransactionSnapshot() always acquires a new MVCC snapshot in
-             * READ COMMITTED mode. A new snapshot is guaranteed to have all
-             * the entries it requires in the index.
-             *
-             * We must defend against the possibility that an old xact
-             * snapshot was returned at higher isolation levels when that
-             * snapshot is not safe for index scans of the target index. This
-             * is possible when the snapshot sees tuples that are before the
-             * index's indcheckxmin horizon. Throwing an error here should be
-             * very rare. It doesn't seem worth using a secondary snapshot to
-             * avoid this.
-             */
-            if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
-                !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
-                                       snapshot->xmin))
-                ereport(ERROR,
-                        (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
-                         errmsg("index \"%s\" cannot be verified using transaction snapshot",
-                                RelationGetRelationName(rel))));
-        }
-    }
+        /*
+         * GetTransactionSnapshot() always acquires a new MVCC snapshot in
+         * READ COMMITTED mode. A new snapshot is guaranteed to have all
+         * the entries it requires in the index.
+         *
+         * We must defend against the possibility that an old xact
+         * snapshot was returned at higher isolation levels when that
+         * snapshot is not safe for index scans of the target index. This
+         * is possible when the snapshot sees tuples that are before the
+         * index's indcheckxmin horizon. Throwing an error here should be
+         * very rare. It doesn't seem worth using a secondary snapshot to
+         * avoid this.
+         */
+        if (IsolationUsesXactSnapshot() && rel->rd_index->indcheckxmin &&
+            !TransactionIdPrecedes(HeapTupleHeaderGetXmin(rel->rd_indextuple->t_data),
+                                   state->snapshot->xmin))
+            ereport(ERROR,
+                    (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+                     errmsg("index \"%s\" cannot be verified using transaction snapshot",
+                            RelationGetRelationName(rel))));
+    }
 
     /*
      * We need a snapshot to check the uniqueness of the index. For better
@@ -476,9 +472,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
         state->indexinfo = BuildIndexInfo(state->rel);
         if (state->indexinfo->ii_Unique)
         {
-            if (snapshot != SnapshotAny)
-                state->snapshot = snapshot;
-            else
+            if (state->snapshot == InvalidSnapshot)
                 state->snapshot = RegisterSnapshot(GetTransactionSnapshot());
         }
     }
@@ -555,21 +549,21 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
         /*
          * Create our own scan for table_index_build_scan(), rather than
          * getting it to do so for us. This is required so that we can
-         * actually use the MVCC snapshot registered earlier in !readonly
-         * case.
+         * actually use the MVCC snapshot registered earlier.
          *
          * Note that table_index_build_scan() calls heap_endscan() for us.
          */
         scan = table_beginscan_strat(state->heaprel, /* relation */
-                                     snapshot,  /* snapshot */
+                                     state->snapshot,  /* snapshot */
                                      0, /* number of keys */
                                      NULL,  /* scan key */
                                      true,  /* buffer access strategy OK */
-                                     true); /* syncscan OK? */
+                                     true,  /* syncscan OK? */
+                                     false);
 
         /*
          * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY
-         * behaves in !readonly case.
+         * behaves.
          *
          * It's okay that we don't actually use the same lock strength for the
         * heap relation as any other ii_Concurrent caller would in !readonly
@@ -578,7 +572,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
          * that needs to be sure that there was no concurrent recycling of
          * TIDs.
          */
-        indexinfo->ii_Concurrent = !state->readonly;
+        indexinfo->ii_Concurrent = true;
 
         /*
          * Don't wait for uncommitted tuple xact commit/abort when index is a
@@ -602,14 +596,11 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace,
                         state->heaptuplespresent, RelationGetRelationName(heaprel),
                         100.0 * bloom_prop_bits_set(state->filter))));
 
-        if (snapshot != SnapshotAny)
-            UnregisterSnapshot(snapshot);
-
         bloom_free(state->filter);
     }
 
     /* Be tidy: */
-    if (snapshot == SnapshotAny && state->snapshot != InvalidSnapshot)
+    if (state->snapshot != InvalidSnapshot)
         UnregisterSnapshot(state->snapshot);
     MemoryContextDelete(state->targetcontext);
 }

contrib/pgstattuple/pgstattuple.c

Lines changed: 4 additions & 1 deletion
@@ -285,6 +285,9 @@ pgstat_relation(Relation rel, FunctionCallInfo fcinfo)
             case SPGIST_AM_OID:
                 err = "spgist index";
                 break;
+            case STIR_AM_OID:
+                err = "stir index";
+                break;
             case BRIN_AM_OID:
                 err = "brin index";
                 break;
@@ -335,7 +338,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo)
                  errmsg("only heap AM is supported")));
 
     /* Disable syncscan because we assume we scan from block zero upwards */
-    scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false);
+    scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false);
     hscan = (HeapScanDesc) scan;
 
     InitDirtySnapshot(SnapshotDirty);
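The new STIR_AM_OID arm only extends the error path for unsupported access methods. A hedged sketch of what it covers, using a hypothetical auxiliary index name (the exact message is whatever pgstattuple emits for unsupported index AMs):

    -- Passing the auxiliary STIR index created during CREATE INDEX CONCURRENTLY
    -- is expected to be rejected with a "stir index"-style unsupported-AM error.
    SELECT * FROM pgstattuple('idx_ccaux');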

doc/src/sgml/monitoring.sgml

Lines changed: 19 additions & 7 deletions
@@ -6314,6 +6314,18 @@ FROM pg_stat_get_backend_idset() AS backendid;
       information for this phase.
      </entry>
     </row>
+    <row>
+     <entry><literal>waiting for writers to use auxiliary index</literal></entry>
+     <entry>
+      <command>CREATE INDEX CONCURRENTLY</command> or <command>REINDEX CONCURRENTLY</command> is waiting for transactions
+      with write locks that can potentially see the table to finish, to ensure that future transactions use the
+      auxiliary index for new tuples.
+      This phase is skipped when not in concurrent mode.
+      Columns <structname>lockers_total</structname>, <structname>lockers_done</structname>
+      and <structname>current_locker_pid</structname> contain the progress
+      information for this phase.
+     </entry>
+    </row>
     <row>
      <entry><literal>building index</literal></entry>
      <entry>
@@ -6354,13 +6366,12 @@ FROM pg_stat_get_backend_idset() AS backendid;
      </entry>
     </row>
     <row>
-     <entry><literal>index validation: scanning table</literal></entry>
+     <entry><literal>index validation: merging indexes</literal></entry>
      <entry>
-      <command>CREATE INDEX CONCURRENTLY</command> is scanning the table
-      to validate the index tuples collected in the previous two phases.
+      <command>CREATE INDEX CONCURRENTLY</command> is merging the content of the auxiliary index with the target index.
       This phase is skipped when not in concurrent mode.
-      Columns <structname>blocks_total</structname> (set to the total size of the table)
-      and <structname>blocks_done</structname> contain the progress information for this phase.
+      Columns <structname>tuples_total</structname> (set to the number of tuples to be merged)
+      and <structname>tuples_done</structname> contain the progress information for this phase.
      </entry>
     </row>
     <row>
@@ -6377,8 +6388,9 @@ FROM pg_stat_get_backend_idset() AS backendid;
     <row>
      <entry><literal>waiting for readers before marking dead</literal></entry>
      <entry>
-      <command>REINDEX CONCURRENTLY</command> is waiting for transactions
-      with read locks on the table to finish, before marking the old index dead.
+      <command>CREATE INDEX CONCURRENTLY</command> is waiting for transactions
+      with read locks on the table to finish, before marking the auxiliary index as dead.
+      <command>REINDEX CONCURRENTLY</command> also waits here before marking the old index as dead.
      This phase is skipped when not in concurrent mode.
      Columns <structname>lockers_total</structname>, <structname>lockers_done</structname>
      and <structname>current_locker_pid</structname> contain the progress
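The phases documented above are reported through the existing pg_stat_progress_create_index view; a query such as the following (standard PostgreSQL, nothing patch-specific beyond the new phase names) can be used to watch a concurrent build move through them:

    SELECT pid, command, phase,
           lockers_done, lockers_total, current_locker_pid,
           tuples_done, tuples_total
    FROM pg_stat_progress_create_index;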

doc/src/sgml/ref/create_index.sgml

Lines changed: 34 additions & 21 deletions
@@ -620,25 +620,25 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
    out writes.  This method is invoked by specifying the
    <literal>CONCURRENTLY</literal> option of <command>CREATE INDEX</command>.
    When this option is used,
-   <productname>PostgreSQL</productname> must perform two scans of the table, and in
-   addition it must wait for all existing transactions that could potentially
-   modify or use the index to terminate.  Thus
-   this method requires more total work than a standard index build and takes
+   <productname>PostgreSQL</productname> must perform a table scan followed by
+   a validation phase, and in addition it must wait for all existing transactions
+   that could potentially modify or use the index to terminate.  Thus
+   this method requires more total work than a standard index build and may take
    significantly longer to complete.  However, since it allows normal
    operations to continue while the index is built, this method is useful for
    adding new indexes in a production environment.  Of course, the extra CPU
    and I/O load imposed by the index creation might slow other operations.
   </para>
 
   <para>
-   In a concurrent index build, the index is actually entered as an
-   <quote>invalid</quote> index into
-   the system catalogs in one transaction, then two table scans occur in
-   two more transactions.  Before each table scan, the index build must
+   In a concurrent index build, the main and auxiliary indexes are actually
+   entered as <quote>invalid</quote> indexes into
+   the system catalogs in one transaction, then two phases occur in
+   multiple transactions.  Before each phase, the index build must
    wait for existing transactions that have modified the table to terminate.
-   After the second scan, the index build must wait for any transactions
+   After the second phase, the index build must wait for any transactions
    that have a snapshot (see <xref linkend="mvcc"/>) predating the second
-   scan to terminate, including transactions used by any phase of concurrent
+   phase to terminate, including transactions used by any phase of concurrent
    index builds on other tables, if the indexes involved are partial or have
    columns that are not simple column references.
    Then finally the index can be marked <quote>valid</quote> and ready for use,
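As a side note on the catalog state described in this hunk, index validity can be inspected directly; a small sketch, with the table name tab taken from the example further down:

    -- indisvalid remains false until the concurrent build completes successfully.
    SELECT indexrelid::regclass AS index_name, indisvalid, indisready
    FROM pg_index
    WHERE indrelid = 'tab'::regclass;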
@@ -651,10 +651,11 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] [ [ IF NOT EXISTS ] <replaceable class=
   <para>
    If a problem arises while scanning the table, such as a deadlock or a
    uniqueness violation in a unique index, the <command>CREATE INDEX</command>
-   command will fail but leave behind an <quote>invalid</quote> index. This index
-   will be ignored for querying purposes because it might be incomplete;
-   however it will still consume update overhead. The <application>psql</application>
-   <command>\d</command> command will report such an index as <literal>INVALID</literal>:
+   command will fail but leave behind an <quote>invalid</quote> index and its
+   associated auxiliary index.  These indexes
+   will be ignored for querying purposes because they might be incomplete;
+   however they will still consume update overhead.  The <application>psql</application>
+   <command>\d</command> command will report such indexes as <literal>INVALID</literal>:
 
 <programlisting>
 postgres=# \d tab
@@ -664,12 +665,19 @@ postgres=# \d tab
  col    | integer |           |          |
 Indexes:
     "idx" btree (col) INVALID
+    "idx_ccaux" stir (col) INVALID
 </programlisting>
 
-   The recommended recovery
-   method in such cases is to drop the index and try again to perform
-   <command>CREATE INDEX CONCURRENTLY</command>.  (Another possibility is
-   to rebuild the index with <command>REINDEX INDEX CONCURRENTLY</command>).
+   The recommended recovery method in such cases is to drop the index with
+   <command>DROP INDEX</command>.  The auxiliary index (suffixed with
+   <literal>ccaux</literal>) will be automatically dropped when the main
+   index is dropped.  After dropping the indexes, you can try again to perform
+   <command>CREATE INDEX CONCURRENTLY</command>.  (Another possibility is to
+   rebuild the index with <command>REINDEX INDEX CONCURRENTLY</command>,
+   which will also handle cleanup of any invalid auxiliary indexes.)
+   If the only invalid index is one suffixed <literal>ccaux</literal>, the
+   recommended recovery method is just <literal>DROP INDEX</literal>
+   for that index.
   </para>
 
   <para>
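A sketch of the recovery options this paragraph describes, using the idx and idx_ccaux names from the example output above:

    -- Option 1: drop the invalid index (the auxiliary idx_ccaux is dropped with it),
    -- then retry the concurrent build.
    DROP INDEX idx;
    CREATE INDEX CONCURRENTLY idx ON tab (col);

    -- Option 2: repair in place; this also cleans up the invalid auxiliary index.
    REINDEX INDEX CONCURRENTLY idx;

    -- If only the ccaux index is invalid, dropping it alone is sufficient.
    DROP INDEX idx_ccaux;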
@@ -873,9 +881,14 @@ Indexes:
   </para>
 
   <para>
-   Like any long-running transaction, <command>CREATE INDEX</command> on a
-   table can affect which tuples can be removed by concurrent
-   <command>VACUUM</command> on any other table.
+   Due to the improved implementation using periodically refreshed snapshots and
+   auxiliary indexes, concurrent index builds have minimal impact on concurrent
+   <command>VACUUM</command> operations.  The system automatically advances its
+   internal transaction horizon during the build process, allowing
+   <command>VACUUM</command> to remove dead tuples on other tables without
+   having to wait for the entire index build to complete.  Only during very brief
+   periods when snapshots are being refreshed might there be any temporary effect
+   on concurrent <command>VACUUM</command> operations.
   </para>
 
   <para>
