@@ -1315,20 +1315,87 @@ _bt_xid_horizon(Relation rel, Relation heapRel, Page page,
1315
1315
}
1316
1316
1317
1317
/*
1318
- * Returns true, if the given block has the half-dead flag set.
1318
+ * Check that leftsib page (the btpo_prev of target page) is not marked with
1319
+ * INCOMPLETE_SPLIT flag.
1320
+ *
1321
+ * Returning true indicates that page flag is set in leftsib (which is
1322
+ * definitely still the left sibling of target). When that happens, the
1323
+ * target doesn't have a downlink in parent, and the page deletion algorithm
1324
+ * isn't prepared to handle that. Deletion of the target page (or the whole
1325
+ * subtree that contains the target page) cannot take place.
1326
+ */
1327
+ static bool
1328
+ _bt_leftsib_splitflag (Relation rel , BlockNumber leftsib , BlockNumber target )
1329
+ {
1330
+ Buffer buf ;
1331
+ Page page ;
1332
+ BTPageOpaque opaque ;
1333
+ bool result ;
1334
+
1335
+ /* Easy case: No left sibling */
1336
+ if (leftsib == P_NONE )
1337
+ return false;
1338
+
1339
+ buf = _bt_getbuf (rel , leftsib , BT_READ );
1340
+ page = BufferGetPage (buf );
1341
+ opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
1342
+
1343
+ /*
1344
+ * If the left sibling was concurrently split, so that its next-pointer
1345
+ * doesn't point to the current page anymore, the split that created
1346
+ * target must be completed. Caller can reasonably expect that there will
1347
+ * be a downlink to the target page that it can relocate using its stack.
1348
+ * (We don't allow splitting an incompletely split page again until the
1349
+ * previous split has been completed.)
1350
+ */
1351
+ result = (opaque -> btpo_next == target && P_INCOMPLETE_SPLIT (opaque ));
1352
+ _bt_relbuf (rel , buf );
1353
+
1354
+ return result ;
1355
+ }
1356
+
1357
+ /*
1358
+ * Check that leafrightsib page (the btpo_next of target leaf page) is not
1359
+ * marked with ISHALFDEAD flag.
1360
+ *
1361
+ * Returning true indicates that page flag is set in leafrightsib, so page
1362
+ * deletion cannot go ahead. Our caller is not prepared to deal with the case
1363
+ * where the parent page does not have a pivot tuple whose downlink points to
1364
+ * leafrightsib (due to an earlier interrupted VACUUM operation). It doesn't
1365
+ * seem worth going to the trouble of teaching our caller to deal with it.
1366
+ * The situation will be resolved after VACUUM finishes the deletion of the
1367
+ * half-dead page (when a future VACUUM operation reaches the target page
1368
+ * again).
1369
+ *
1370
+ * _bt_leftsib_splitflag() is called for both leaf pages and internal pages.
1371
+ * _bt_rightsib_halfdeadflag() is only called for leaf pages, though. This is
1372
+ * okay because of the restriction on deleting pages that are the rightmost
1373
+ * page of their parent (i.e. that such deletions can only take place when the
1374
+ * entire subtree must be deleted). The leaf level check made here will apply
1375
+ * to a right "cousin" leaf page rather than a simple right sibling leaf page
1376
+ * in cases where caller actually goes on to attempt deleting pages that are
1377
+ * above the leaf page. The right cousin leaf page is representative of the
1378
+ * left edge of the subtree to the right of the to-be-deleted subtree as a
1379
+ * whole, which is exactly the condition that our caller cares about.
1380
+ * (Besides, internal pages are never marked half-dead, so it isn't even
1381
+ * possible to _directly_ assess if an internal page is part of some other
1382
+ * to-be-deleted subtree.)
1319
1383
*/
1320
1384
static bool
1321
- _bt_is_page_halfdead (Relation rel , BlockNumber blk )
1385
+ _bt_rightsib_halfdeadflag (Relation rel , BlockNumber leafrightsib )
1322
1386
{
1323
1387
Buffer buf ;
1324
1388
Page page ;
1325
1389
BTPageOpaque opaque ;
1326
1390
bool result ;
1327
1391
1328
- buf = _bt_getbuf (rel , blk , BT_READ );
1392
+ Assert (leafrightsib != P_NONE );
1393
+
1394
+ buf = _bt_getbuf (rel , leafrightsib , BT_READ );
1329
1395
page = BufferGetPage (buf );
1330
1396
opaque = (BTPageOpaque ) PageGetSpecialPointer (page );
1331
1397
1398
+ Assert (P_ISLEAF (opaque ) && !P_ISDELETED (opaque ));
1332
1399
result = P_ISHALFDEAD (opaque );
1333
1400
_bt_relbuf (rel , buf );
1334
1401
@@ -1374,7 +1441,6 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1374
1441
Buffer pbuf ;
1375
1442
Page page ;
1376
1443
BTPageOpaque opaque ;
1377
- BlockNumber leftsib ;
1378
1444
1379
1445
/*
1380
1446
* Locate the downlink of "child" in the parent, updating the stack entry
@@ -1399,11 +1465,14 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1399
1465
* If the target is the rightmost child of its parent, then we can't
1400
1466
* delete, unless it's also the only child.
1401
1467
*/
1468
+ Assert (poffset <= maxoff );
1402
1469
if (poffset >= maxoff )
1403
1470
{
1404
1471
/* It's rightmost child... */
1405
1472
if (poffset == P_FIRSTDATAKEY (opaque ))
1406
1473
{
1474
+ BlockNumber leftsibparent ;
1475
+
1407
1476
/*
1408
1477
* It's only child, so safe if parent would itself be removable.
1409
1478
* We have to check the parent itself, and then recurse to test
@@ -1418,41 +1487,16 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
1418
1487
1419
1488
* target = parent ;
1420
1489
* rightsib = opaque -> btpo_next ;
1421
- leftsib = opaque -> btpo_prev ;
1490
+ leftsibparent = opaque -> btpo_prev ;
1422
1491
1423
1492
_bt_relbuf (rel , pbuf );
1424
1493
1425
1494
/*
1426
- * Like in _bt_pagedel, check that the left sibling is not marked
1427
- * with INCOMPLETE_SPLIT flag. That would mean that there is no
1428
- * downlink to the page to be deleted, and the page deletion
1429
- * algorithm isn't prepared to handle that.
1495
+ * Check that the left sibling of parent (if any) is not marked
1496
+ * with INCOMPLETE_SPLIT flag before proceeding.
1430
1497
*/
1431
- if (leftsib != P_NONE )
1432
- {
1433
- Buffer lbuf ;
1434
- Page lpage ;
1435
- BTPageOpaque lopaque ;
1436
-
1437
- lbuf = _bt_getbuf (rel , leftsib , BT_READ );
1438
- lpage = BufferGetPage (lbuf );
1439
- lopaque = (BTPageOpaque ) PageGetSpecialPointer (lpage );
1440
-
1441
- /*
1442
- * If the left sibling was concurrently split, so that its
1443
- * next-pointer doesn't point to the current page anymore, the
1444
- * split that created the current page must be completed. (We
1445
- * don't allow splitting an incompletely split page again
1446
- * until the previous split has been completed)
1447
- */
1448
- if (lopaque -> btpo_next == parent &&
1449
- P_INCOMPLETE_SPLIT (lopaque ))
1450
- {
1451
- _bt_relbuf (rel , lbuf );
1452
- return false;
1453
- }
1454
- _bt_relbuf (rel , lbuf );
1455
- }
1498
+ if (_bt_leftsib_splitflag (rel , leftsibparent , parent ))
1499
+ return false;
1456
1500
1457
1501
return _bt_lock_branch_parent (rel , parent , stack -> bts_parent ,
1458
1502
topparent , topoff , target , rightsib );
@@ -1525,7 +1569,9 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1525
1569
*
1526
1570
* Also, when "stack" is not NULL, we have already checked that the
1527
1571
* current page is not the right half of an incomplete split, i.e. the
1528
- * left sibling does not have its INCOMPLETE_SPLIT flag set.
1572
+ * left sibling does not have its INCOMPLETE_SPLIT flag set, including
1573
+ * when the current target page is to the right of caller's initial page
1574
+ * (the scanblkno page).
1529
1575
*/
1530
1576
BTStack stack = NULL ;
1531
1577
@@ -1589,11 +1635,12 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1589
1635
* The INCOMPLETE_SPLIT flag on the page tells us if the page is the
1590
1636
* left half of an incomplete split, but ensuring that it's not the
1591
1637
* right half is more complicated. For that, we have to check that
1592
- * the left sibling doesn't have its INCOMPLETE_SPLIT flag set. On
1593
- * the first iteration, we temporarily release the lock on the current
1594
- * page, and check the left sibling and also construct a search stack
1595
- * to. On subsequent iterations, we know we stepped right from a page
1596
- * that passed these tests, so it's OK.
1638
+ * the left sibling doesn't have its INCOMPLETE_SPLIT flag set using
1639
+ * _bt_leftsib_splitflag(). On the first iteration, we temporarily
1640
+ * release the lock on scanblkno/leafbuf, check the left sibling, and
1641
+ * construct a search stack to scanblkno. On subsequent iterations,
1642
+ * we know we stepped right from a page that passed these tests, so
1643
+ * it's OK.
1597
1644
*/
1598
1645
if (P_RIGHTMOST (opaque ) || P_ISROOT (opaque ) ||
1599
1646
P_FIRSTDATAKEY (opaque ) <= PageGetMaxOffsetNumber (page ) ||
@@ -1628,13 +1675,14 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1628
1675
BTScanInsert itup_key ;
1629
1676
ItemId itemid ;
1630
1677
IndexTuple targetkey ;
1678
+ BlockNumber leftsib , target ;
1631
1679
Buffer lbuf ;
1632
- BlockNumber leftsib ;
1633
1680
1634
1681
itemid = PageGetItemId (page , P_HIKEY );
1635
1682
targetkey = CopyIndexTuple ((IndexTuple ) PageGetItem (page , itemid ));
1636
1683
1637
1684
leftsib = opaque -> btpo_prev ;
1685
+ target = BufferGetBlockNumber (leafbuf );
1638
1686
1639
1687
/*
1640
1688
* To avoid deadlocks, we'd better drop the leaf page lock
@@ -1643,43 +1691,22 @@ _bt_pagedel(Relation rel, Buffer leafbuf, TransactionId *oldestBtpoXact)
1643
1691
LockBuffer (leafbuf , BUFFER_LOCK_UNLOCK );
1644
1692
1645
1693
/*
1646
- * Fetch the left sibling, to check that it's not marked with
1647
- * INCOMPLETE_SPLIT flag. That would mean that the page
1648
- * to-be-deleted doesn't have a downlink, and the page
1649
- * deletion algorithm isn't prepared to handle that.
1694
+ * Check that the left sibling of leafbuf (if any) is not
1695
+ * marked with INCOMPLETE_SPLIT flag before proceeding.
1650
1696
*/
1651
- if (leftsib != P_NONE )
1697
+ Assert (target == scanblkno );
1698
+ if (_bt_leftsib_splitflag (rel , leftsib , target ))
1652
1699
{
1653
- BTPageOpaque lopaque ;
1654
- Page lpage ;
1655
-
1656
- lbuf = _bt_getbuf (rel , leftsib , BT_READ );
1657
- lpage = BufferGetPage (lbuf );
1658
- lopaque = (BTPageOpaque ) PageGetSpecialPointer (lpage );
1659
-
1660
- /*
1661
- * If the left sibling is split again by another backend,
1662
- * after we released the lock, we know that the first
1663
- * split must have finished, because we don't allow an
1664
- * incompletely-split page to be split again. So we don't
1665
- * need to walk right here.
1666
- */
1667
- if (lopaque -> btpo_next == BufferGetBlockNumber (leafbuf ) &&
1668
- P_INCOMPLETE_SPLIT (lopaque ))
1669
- {
1670
- ReleaseBuffer (leafbuf );
1671
- _bt_relbuf (rel , lbuf );
1672
- return ndeleted ;
1673
- }
1674
- _bt_relbuf (rel , lbuf );
1700
+ ReleaseBuffer (leafbuf );
1701
+ return ndeleted ;
1675
1702
}
1676
1703
1677
1704
/* we need an insertion scan key for the search, so build one */
1678
1705
itup_key = _bt_mkscankey (rel , targetkey );
1679
1706
/* find the leftmost leaf page with matching pivot/high key */
1680
1707
itup_key -> pivotsearch = true;
1681
1708
stack = _bt_search (rel , itup_key , & lbuf , BT_READ , NULL );
1682
- /* don 't need a lock or second pin on the page */
1709
+ /* won't need a second lock or pin on leafbuf */
1683
1710
_bt_relbuf (rel , lbuf );
1684
1711
1685
1712
/*
@@ -1804,12 +1831,11 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1804
1831
* Before attempting to lock the parent page, check that the right sibling
1805
1832
* is not in half-dead state. A half-dead right sibling would have no
1806
1833
* downlink in the parent, which would be highly confusing later when we
1807
- * delete the downlink that follows the current page's downlink. (I
1808
- * believe the deletion would work correctly, but it would fail the
1809
- * cross-check we make that the following downlink points to the right
1810
- * sibling of the delete page.)
1834
+ * delete the downlink that follows the leafbuf page's downlink. It would
1835
+ * fail the "right sibling of target page is also the next child in parent
1836
+ * page" cross-check below.
1811
1837
*/
1812
- if (_bt_is_page_halfdead (rel , leafrightsib ))
1838
+ if (_bt_rightsib_halfdeadflag (rel , leafrightsib ))
1813
1839
{
1814
1840
elog (DEBUG1 , "could not delete page %u because its right sibling %u is half-dead" ,
1815
1841
leafblkno , leafrightsib );
@@ -1822,16 +1848,6 @@ _bt_mark_page_halfdead(Relation rel, Buffer leafbuf, BTStack stack)
1822
1848
* be deleted too, and the same condition applies recursively to it. We
1823
1849
* have to check this condition all the way up before trying to delete,
1824
1850
* and lock the final parent of the to-be-deleted subtree.
1825
- *
1826
- * However, we won't need to repeat the above _bt_is_page_halfdead() check
1827
- * for parent/ancestor pages because of the rightmost restriction. The
1828
- * leaf check will apply to a right "cousin" leaf page rather than a
1829
- * simple right sibling leaf page in cases where we actually go on to
1830
- * perform internal page deletion. The right cousin leaf page is
1831
- * representative of the left edge of the subtree to the right of the
1832
- * to-be-deleted subtree as a whole. (Besides, internal pages are never
1833
- * marked half-dead, so it isn't even possible to directly assess if an
1834
- * internal page is part of some other to-be-deleted subtree.)
1835
1851
*/
1836
1852
rightsib = leafrightsib ;
1837
1853
target = leafblkno ;
0 commit comments