@@ -145,6 +145,9 @@ static void bt_check_every_level(Relation rel, Relation heaprel,
145
145
bool rootdescend );
146
146
static BtreeLevel bt_check_level_from_leftmost (BtreeCheckState * state ,
147
147
BtreeLevel level );
148
+ static void bt_recheck_sibling_links (BtreeCheckState * state ,
149
+ BlockNumber btpo_prev_from_target ,
150
+ BlockNumber leftcurrent );
148
151
static void bt_target_page_check (BtreeCheckState * state );
149
152
static BTScanInsert bt_right_page_check_scankey (BtreeCheckState * state );
150
153
static void bt_child_check (BtreeCheckState * state , BTScanInsert targetkey ,
@@ -787,17 +790,9 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
787
790
*/
788
791
}
789
792
790
- /*
791
- * readonly mode can only ever land on live pages and half-dead pages,
792
- * so sibling pointers should always be in mutual agreement
793
- */
794
- if (state -> readonly && opaque -> btpo_prev != leftcurrent )
795
- ereport (ERROR ,
796
- (errcode (ERRCODE_INDEX_CORRUPTED ),
797
- errmsg ("left link/right link pair in index \"%s\" not in agreement" ,
798
- RelationGetRelationName (state -> rel )),
799
- errdetail_internal ("Block=%u left block=%u left link from block=%u." ,
800
- current , leftcurrent , opaque -> btpo_prev )));
793
+ /* Sibling links should be in mutual agreement */
794
+ if (opaque -> btpo_prev != leftcurrent )
795
+ bt_recheck_sibling_links (state , opaque -> btpo_prev , leftcurrent );
801
796
802
797
/* Check level, which must be valid for non-ignorable page */
803
798
if (level .level != opaque -> btpo .level )
@@ -877,6 +872,140 @@ bt_check_level_from_leftmost(BtreeCheckState *state, BtreeLevel level)
877
872
return nextleveldown ;
878
873
}
879
874
875
+ /*
876
+ * Raise an error when target page's left link does not point back to the
877
+ * previous target page, called leftcurrent here. The leftcurrent page's
878
+ * right link was followed to get to the current target page, and we expect
879
+ * mutual agreement among leftcurrent and the current target page. Make sure
880
+ * that this condition has definitely been violated in the !readonly case,
881
+ * where concurrent page splits are something that we need to deal with.
882
+ *
883
+ * Cross-page inconsistencies involving pages that don't agree about being
884
+ * siblings are known to be a particularly good indicator of corruption
885
+ * involving partial writes/lost updates. The bt_right_page_check_scankey
886
+ * check also provides a way of detecting cross-page inconsistencies for
887
+ * !readonly callers, but it can only detect sibling pages that have an
888
+ * out-of-order keyspace, which can't catch many of the problems that we
889
+ * expect to catch here.
890
+ *
891
+ * The classic example of the kind of inconsistency that we can only catch
892
+ * with this check (when in !readonly mode) involves three sibling pages that
893
+ * were affected by a faulty page split at some point in the past. The
894
+ * effects of the split are reflected in the original page and its new right
895
+ * sibling page, with a lack of any accompanying changes for the _original_
896
+ * right sibling page. The original right sibling page's left link fails to
897
+ * point to the new right sibling page (its left link still points to the
898
+ * original page), even though the first phase of a page split is supposed to
899
+ * work as a single atomic action. This subtle inconsistency will probably
900
+ * only break backwards scans in practice.
901
+ *
902
+ * Note that this is the only place where amcheck will "couple" buffer locks
903
+ * (and only for !readonly callers). In general we prefer to avoid more
904
+ * thorough cross-page checks in !readonly mode, but it seems worth the
905
+ * complexity here. Also, the performance overhead of performing lock
906
+ * coupling here is negligible in practice. Control only reaches here with a
907
+ * non-corrupt index when there is a concurrent page split at the instant
908
+ * caller crossed over to target page from leftcurrent page.
909
+ */
910
+ static void
911
+ bt_recheck_sibling_links (BtreeCheckState * state ,
912
+ BlockNumber btpo_prev_from_target ,
913
+ BlockNumber leftcurrent )
914
+ {
915
+ if (!state -> readonly )
916
+ {
917
+ Buffer lbuf ;
918
+ Buffer newtargetbuf ;
919
+ Page page ;
920
+ BTPageOpaque opaque ;
921
+ BlockNumber newtargetblock ;
922
+
923
+ /* Couple locks in the usual order for nbtree: Left to right */
924
+ lbuf = ReadBufferExtended (state -> rel , MAIN_FORKNUM , leftcurrent ,
925
+ RBM_NORMAL , state -> checkstrategy );
926
+ LockBuffer (lbuf , BT_READ );
927
+ _bt_checkpage (state -> rel , lbuf );
928
+ page = BufferGetPage (lbuf );
929
+ opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
930
+ if (P_ISDELETED (opaque ))
931
+ {
932
+ /*
933
+ * Cannot reason about concurrently deleted page -- the left link
934
+ * in the page to the right is expected to point to some other
935
+ * page to the left (not leftcurrent page).
936
+ *
937
+ * Note that we deliberately don't give up with a half-dead page.
938
+ */
939
+ UnlockReleaseBuffer (lbuf );
940
+ return ;
941
+ }
942
+
943
+ newtargetblock = opaque -> btpo_next ;
944
+ /* Avoid self-deadlock when newtargetblock == leftcurrent */
945
+ if (newtargetblock != leftcurrent )
946
+ {
947
+ newtargetbuf = ReadBufferExtended (state -> rel , MAIN_FORKNUM ,
948
+ newtargetblock , RBM_NORMAL ,
949
+ state -> checkstrategy );
950
+ LockBuffer (newtargetbuf , BT_READ );
951
+ _bt_checkpage (state -> rel , newtargetbuf );
952
+ page = BufferGetPage (newtargetbuf );
953
+ opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
954
+ /* btpo_prev_from_target may have changed; update it */
955
+ btpo_prev_from_target = opaque -> btpo_prev ;
956
+ }
957
+ else
958
+ {
959
+ /*
960
+ * leftcurrent right sibling points back to leftcurrent block.
961
+ * Index is corrupt. Easiest way to handle this is to pretend
962
+ * that we actually read from a distinct page that has an invalid
963
+ * block number in its btpo_prev.
964
+ */
965
+ newtargetbuf = InvalidBuffer ;
966
+ btpo_prev_from_target = InvalidBlockNumber ;
967
+ }
968
+
969
+ /*
970
+ * No need to check P_ISDELETED here, since new target block cannot be
971
+ * marked deleted as long as we hold a lock on lbuf
972
+ */
973
+ if (BufferIsValid (newtargetbuf ))
974
+ UnlockReleaseBuffer (newtargetbuf );
975
+ UnlockReleaseBuffer (lbuf );
976
+
977
+ if (btpo_prev_from_target == leftcurrent )
978
+ {
979
+ /* Report split in left sibling, not target (or new target) */
980
+ ereport (DEBUG1 ,
981
+ (errcode (ERRCODE_INTERNAL_ERROR ),
982
+ errmsg ("harmless concurrent page split detected in index \"%s\"" ,
983
+ RelationGetRelationName (state -> rel )),
984
+ errdetail_internal ("Block=%u new right sibling=%u original right sibling=%u." ,
985
+ leftcurrent , newtargetblock ,
986
+ state -> targetblock )));
987
+ return ;
988
+ }
989
+
990
+ /*
991
+ * Index is corrupt. Make sure that we report correct target page.
992
+ *
993
+ * This could have changed in cases where there was a concurrent page
994
+ * split, as well as index corruption (at least in theory). Note that
995
+ * btpo_prev_from_target was already updated above.
996
+ */
997
+ state -> targetblock = newtargetblock ;
998
+ }
999
+
1000
+ ereport (ERROR ,
1001
+ (errcode (ERRCODE_INDEX_CORRUPTED ),
1002
+ errmsg ("left link/right link pair in index \"%s\" not in agreement" ,
1003
+ RelationGetRelationName (state -> rel )),
1004
+ errdetail_internal ("Block=%u left block=%u left link from block=%u." ,
1005
+ state -> targetblock , leftcurrent ,
1006
+ btpo_prev_from_target )));
1007
+ }
1008
+
880
1009
/*
881
1010
* Function performs the following checks on target page, or pages ancillary to
882
1011
* target page:
@@ -1965,18 +2094,14 @@ bt_child_check(BtreeCheckState *state, BTScanInsert targetkey,
1965
2094
* downlink, which was concurrently physically removed in target/parent as
1966
2095
* part of deletion's first phase.)
1967
2096
*
1968
- * Note that while the cross-page-same-level last item check uses a trick
1969
- * that allows it to perform verification for !readonly callers, a similar
1970
- * trick seems difficult here. The trick that that other check uses is,
1971
- * in essence, to lock down race conditions to those that occur due to
1972
- * concurrent page deletion of the target; that's a race that can be
1973
- * reliably detected before actually reporting corruption.
1974
- *
1975
- * On the other hand, we'd need to lock down race conditions involving
1976
- * deletion of child's left page, for long enough to read the child page
1977
- * into memory (in other words, a scheme with concurrently held buffer
1978
- * locks on both child and left-of-child pages). That's unacceptable for
1979
- * amcheck functions on general principle, though.
2097
+ * While we use various techniques elsewhere to perform cross-page
2098
+ * verification for !readonly callers, a similar trick seems difficult
2099
+ * here. The tricks used by bt_recheck_sibling_links and by
2100
+ * bt_right_page_check_scankey both involve verification of a same-level,
2101
+ * cross-sibling invariant. Cross-level invariants are far more squishy,
2102
+ * though. The nbtree REDO routines do not actually couple buffer locks
2103
+ * across levels during page splits, so making any cross-level check work
2104
+ * reliably in !readonly mode may be impossible.
1980
2105
*/
1981
2106
Assert (state -> readonly );
1982
2107
@@ -2785,6 +2910,8 @@ invariant_l_nontarget_offset(BtreeCheckState *state, BTScanInsert key,
2785
2910
* There is never an attempt to get a consistent view of multiple pages using
2786
2911
* multiple concurrent buffer locks; in general, we only acquire a single pin
2787
2912
* and buffer lock at a time, which is often all that the nbtree code requires.
2913
+ * (Actually, bt_recheck_sibling_links couples buffer locks, which is the only
2914
+ * exception to this general rule.)
2788
2915
*
2789
2916
* Operating on a copy of the page is useful because it prevents control
2790
2917
* getting stuck in an uninterruptible state when an underlying operator class
0 commit comments